def update_graph(entity_type, sql_filename): """ Performs a complete update of the database graph information, adding jweight, entropy and occurrence data from the sql file generated by complete_mining. This will remove ALL previous graph data. """ # Import SQL statements if entity_type == Idea: table = "idea_graph_edges" elif entity_type == Thinker: table = "thinker_graph_edges" else: table = "idea_thinker_graph_edges" connection = Session.connection() print "deleting old graph information ..." connection.execute(""" TRUNCATE TABLE %(table)s; """ % {'filename' : sql_filename, 'table' : table }) print "inserting new graph information" connection.execute(""" SET foreign_key_checks=0; LOCK TABLES %(table)s WRITE; LOAD DATA INFILE '%(filename)s' INTO TABLE %(table)s FIELDS TERMINATED BY '::' (ante_id, cons_id, confidence, jweight, weight, occurs_in); UNLOCK TABLES; SET foreign_key_checks=1; """ % {'filename' : sql_filename, 'table' : table }) Session.close()
def update_graph(entity_type, sql_filename): # Import SQL statements if entity_type == Idea: table = "idea_graph_edges" elif entity_type == Thinker: table = "thinker_graph_edges" else: table = "idea_thinker_graph_edges" connection = Session.connection() print "deleting old graph information ..." connection.execute(""" TRUNCATE TABLE %(table)s; """ % {'filename' : sql_filename, 'table' : table }) print "inserting new graph information" connection.execute(""" SET foreign_key_checks=0; LOCK TABLES %(table)s WRITE; LOAD DATA INFILE '%(filename)s' INTO TABLE %(table)s FIELDS TERMINATED BY '::' (ante_id, cons_id, confidence, jweight, weight, occurs_in); UNLOCK TABLES; SET foreign_key_checks=1; """ % {'filename' : sql_filename, 'table' : table }) Session.close()
def process_articles(entity_type=Entity, output_filename='output-all.txt', corpus_root='corpus/'): terms = select_terms(entity_type) Session.expunge_all() Session.close() articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir!=None) articles = articles.filter(Entity.sep_dir!='') articles = articles.distinct().all() articles = [a[0] for a in articles] # parallel processing of articles p = Pool() args = [(title, terms, entity_type, None, corpus_root) for title in articles] doc_lines = p.map(process_wrapper, args) p.close() #serial processing for tests ''' doc_lines = [] for title in articles: lines = process_article(title, terms, entity_type, None, corpus_root) doc_lines.append(lines) ''' # write graph output to file print output_filename with open(output_filename, 'w') as f: for lines in doc_lines: f.writelines(lines)
def process_articles(entity_type=Entity, output_filename='output-all.txt', corpus_root='corpus/'): terms = select_terms(entity_type) Session.expunge_all() Session.close() articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir != None) articles = articles.filter(Entity.sep_dir != '') articles = articles.distinct().all() articles = [a[0] for a in articles] # parallel processing of articles p = Pool() args = [(title, terms, entity_type, None, corpus_root) for title in articles] doc_lines = p.map(process_wrapper, args) p.close() #serial processing for tests ''' doc_lines = [] for title in articles: lines = process_article(title, terms, entity_type, None, corpus_root) doc_lines.append(lines) ''' # write graph output to file print output_filename with open(output_filename, 'w') as f: for lines in doc_lines: f.writelines(lines)
def complete_mining(entity_type=Idea, filename='graph.txt', root='./', corpus_root='corpus/', update_entropy=False, update_occurrences=False, update_db=False): occur_filename = os.path.abspath(root + "occurrences.txt") graph_filename = os.path.abspath(root + "graph-" + filename) edge_filename = os.path.abspath(root + "edge-" + filename) sql_filename = os.path.abspath(root + "sql-" + filename) doc_terms = doc_terms_list() if update_occurrences: print "processing articles..." process_articles(entity_type, occur_filename, corpus_root=corpus_root) print "filtering occurrences..." filter_apriori_input(occur_filename, graph_filename, entity_type, doc_terms) print "running apriori miner..." dm.apriori(graph_filename, edge_filename) print "processing edges..." edges = dm.process_edges(graph_filename, edge_filename, occur_filename, doc_terms) ents = dm.calculate_node_entropy(edges) edges = dm.calculate_edge_weight(edges, ents) print "creating sql files..." with open(sql_filename, 'w') as f: for edge, props in edges.iteritems(): ante, cons = edge row = "%s::%s" % edge row += ("::%(confidence)s::%(jweight)s::%(weight)s" "::%(occurs_in)s\n" % props) f.write(row) if update_entropy: print "updating term entropy..." for term_id, entropy in ents.iteritems(): term = Session.query(Idea).get(term_id) if term: term.entropy = entropy Session.flush() Session.commit() Session.close() if update_db: print "updating the database..." update_graph(entity_type, sql_filename)
def filter_apriori_input(occur_filename, output_filename, entity_type=Idea,
                         doc_terms=None):
    """
    Convert a raw occurrence file into apriori-ready input and write it
    to `output_filename`.
    """
    #select terms
    selected = select_terms(entity_type)

    # detach the term objects before further processing
    Session.expunge_all()
    Session.close()

    prepared = dm.prepare_apriori_input(occur_filename, selected, doc_terms)
    with open(output_filename, 'w') as out:
        out.writelines(prepared)
def complete_mining(entity_type=Idea, filename='graph.txt', root='./', corpus_root='corpus/', update_entropy=False, update_occurrences=False, update_db=False): occur_filename = os.path.abspath(root + "occurrences.txt") graph_filename = os.path.abspath(root + "graph-" + filename) edge_filename = os.path.abspath(root + "edge-" + filename) sql_filename = os.path.abspath(root + "sql-" + filename) doc_terms = doc_terms_list() if update_occurrences: print "processing articles..." process_articles(entity_type, occur_filename, corpus_root=corpus_root) print "filtering occurrences..." filter_apriori_input( occur_filename, graph_filename, entity_type, doc_terms) print "running apriori miner..." dm.apriori(graph_filename, edge_filename) print "processing edges..." edges = dm.process_edges( graph_filename, edge_filename, occur_filename, doc_terms) ents = dm.calculate_node_entropy(edges) edges = dm.calculate_edge_weight(edges, ents) print "creating sql files..." with open(sql_filename, 'w') as f: for edge, props in edges.iteritems(): ante,cons = edge row = "%s::%s" % edge row += ("::%(confidence)s::%(jweight)s::%(weight)s" "::%(occurs_in)s\n" % props) f.write(row) if update_entropy: print "updating term entropy..." for term_id, entropy in ents.iteritems(): term = Session.query(Idea).get(term_id) if term: term.entropy = entropy Session.flush() Session.commit() Session.close() if update_db: print "updating the database..." update_graph(entity_type, sql_filename)
def process_articles(entity_type=Entity, output_filename='output-all.txt', corpus_root='corpus/'): terms = select_terms(entity_type) Session.expunge_all() Session.close() # fix search patterns for term in terms: newpatterns = [] for pattern in term.searchpatterns: if '(' in pattern and ')' in pattern: pattern = pattern.replace('( ', '(\\b') pattern = pattern.replace(' )', '\\b)') else: pattern = '\\b%s\\b' % pattern.strip() newpatterns.append(pattern) term.searchpatterns = newpatterns articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir!=None) articles = articles.filter(Entity.sep_dir!='') articles = articles.distinct().all() articles = [a[0] for a in articles] # parallel processing of articles p = Pool() args = [(title, terms, entity_type, None, corpus_root) for title in articles] doc_lines = p.map(process_wrapper, args) p.close() #serial processing for tests ''' doc_lines = [] for title in articles: lines = process_article(title, terms, entity_type, None, corpus_root) doc_lines.append(lines) ''' # write graph output to file print output_filename with open(output_filename, 'w') as f: for lines in doc_lines: f.writelines(lines)
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    """
    Run the occurrence miner over every entity with a non-empty sep_dir
    and write the collected lines to `output_filename`.
    """
    terms = select_terms(entity_type)

    # detach everything before the parallel step
    Session.expunge_all()
    Session.close()

    # NOTE(review): this variant hands whole entity objects to the
    # workers, unlike the variants that extract sep_dir strings —
    # confirm process_wrapper accepts entities here.
    articles = Session.query(entity_type).filter(
        entity_type.sep_dir != '').all()

    # parallel processing of articles
    workers = Pool()
    jobs = [(title, terms, entity_type, None, corpus_root)
            for title in articles]
    doc_lines = workers.map(process_wrapper, jobs)
    workers.close()

    # write graph output to file
    with open(output_filename, 'w') as out:
        for lines in doc_lines:
            out.writelines(lines)
def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    """
    Mine occurrences for every entity whose sep_dir is non-empty and
    write the resulting lines to `output_filename`.
    """
    terms = select_terms(entity_type)
    Session.expunge_all()
    Session.close()

    candidates = (Session.query(entity_type)
                  .filter(entity_type.sep_dir != '')
                  .all())

    # hand each article to a pool worker
    pool = Pool()
    doc_lines = pool.map(
        process_wrapper,
        [(title, terms, entity_type, None, corpus_root)
         for title in candidates])
    pool.close()

    # dump everything to the output file
    with open(output_filename, 'w') as sink:
        for lines in doc_lines:
            sink.writelines(lines)
def update_graph(entity_type, sql_filename): """ Performs a complete update of the database graph information, adding jweight, entropy and occurrence data from the sql file generated by complete_mining. This will remove ALL previous graph data. """ # Import SQL statements if entity_type == Idea: table = "idea_graph_edges" elif entity_type == Thinker: table = "thinker_graph_edges" else: table = "idea_thinker_graph_edges" connection = Session.connection() print "deleting old graph information ..." connection.execute(""" TRUNCATE TABLE %(table)s; """ % { 'filename': sql_filename, 'table': table }) print "inserting new graph information" connection.execute(""" SET foreign_key_checks=0; LOCK TABLES %(table)s WRITE; LOAD DATA LOCAL INFILE '%(filename)s' INTO TABLE %(table)s FIELDS TERMINATED BY '::' (ante_id, cons_id, confidence, jweight, weight, occurs_in); UNLOCK TABLES; SET foreign_key_checks=1; """ % { 'filename': sql_filename, 'table': table }) Session.close()
def complete_mining(entity_type=Idea, filename='graph.txt', root='./', corpus_root='corpus/', update_entropy=False): occur_filename = os.path.abspath(root + "graph-" + filename) edge_filename = os.path.abspath(root + "edge-" + filename) sql_filename = os.path.abspath(root + "sql-" + filename) print "processing articles..." process_articles(entity_type, occur_filename, corpus_root=corpus_root) print "running apriori miner..." dm.apriori(occur_filename, edge_filename) print "processing edges..." edges = dm.process_edges(occur_filename, edge_filename) ents = dm.calculate_node_entropy(edges) edges = dm.calculate_edge_weight(edges, ents) print "creating sql files..." with open(sql_filename, 'w') as f: for edge, props in edges.iteritems(): ante, cons = edge row = "%s::%s" % edge row += "::%(confidence)s::%(jweight)s::%(weight)s\n" % props f.write(row) print "updating term entropy..." if update_entropy: for term_id, entropy in ents.iteritems(): term = Session.query(Idea).get(term_id) if term: term.entropy = entropy Session.flush() Session.commit() Session.close() # Import SQL statements if entity_type == Idea: table = "idea_graph_edges" elif entity_type == Thinker: table = "thinker_graph_edges" else: table = "idea_thinker_graph_edges" connection = Session.connection() print "deleting old graph information ..." connection.execute(""" DELETE FROM %(table)s; """ % { 'filename': sql_filename, 'table': table }) print "inserting new graph information" connection.execute(""" SET foreign_key_checks=0; LOAD DATA INFILE '%(filename)s' INTO TABLE %(table)s FIELDS TERMINATED BY '::' (ante_id, cons_id, confidence, jweight, weight); SET foreign_key_checks=1; """ % { 'filename': sql_filename, 'table': table }) Session.close()
def complete_mining(entity_type=Idea, filename='graph.txt', root='./', corpus_root='corpus/', update_entropy=False): occur_filename = os.path.abspath(root + "graph-" + filename) edge_filename = os.path.abspath(root + "edge-" + filename) sql_filename = os.path.abspath(root + "sql-" + filename) print "processing articles..." process_articles(entity_type, occur_filename, corpus_root=corpus_root) print "running apriori miner..." dm.apriori(occur_filename, edge_filename) print "processing edges..." edges = dm.process_edges(occur_filename, edge_filename) ents = dm.calculate_node_entropy(edges) edges = dm.calculate_edge_weight(edges, ents) print "creating sql files..." with open(sql_filename, 'w') as f: for edge, props in edges.iteritems(): ante,cons = edge row = "%s::%s" % edge row += "::%(confidence)s::%(jweight)s::%(weight)s\n" % props f.write(row) print "updating term entropy..." if update_entropy: for term_id, entropy in ents.iteritems(): term = Session.query(Idea).get(term_id) if term: term.entropy = entropy Session.flush() Session.commit() Session.close() # Import SQL statements if entity_type == Idea: table = "idea_graph_edges" elif entity_type == Thinker: table = "thinker_graph_edges" else: table = "idea_thinker_graph_edges" connection = Session.connection() print "deleting old graph information ..." connection.execute(""" DELETE FROM %(table)s; """ % {'filename' : sql_filename, 'table' : table }) print "inserting new graph information" connection.execute(""" SET foreign_key_checks=0; LOAD DATA INFILE '%(filename)s' INTO TABLE %(table)s FIELDS TERMINATED BY '::' (ante_id, cons_id, confidence, jweight, weight); SET foreign_key_checks=1; """ % {'filename' : sql_filename, 'table' : table }) Session.close()