def find_keywords(self):
    """Assign a keyword to every node of every tree in ``self.trees``.

    Two passes are made:

    1. Each root is labelled. The keywords already given to earlier roots
       are passed to ``calc.CalcFatherv2`` as its second argument.
    2. For each root, its descendants are visited breadth-first (children
       in ``children_nodes`` order). Each tree starts from a private copy
       of the root keyword set, so keywords picked inside one tree are not
       passed on to the other trees.

    For every node the keyword contents of the papers in ``node.id_set`` are
    appended to ``node.keyword_list``, ``node.keyword`` is set, and a
    progress line is printed. Nodes that end up with an empty keyword are
    flagged with a ``@@@@`` line that shows their candidate list.

    NOTE(review): ``calc.CalcFatherv2`` is assumed to use its second argument
    as a set of keywords to avoid -- confirm in CalcFather.
    """
    # Local import: a plain deque is enough for a single-threaded BFS and
    # avoids depending on the synchronised Queue module.
    from collections import deque

    query = ("select content from keyword inner join keyword_paper_relation "
             "on keyword.id = kid where pid in (%s)")

    def label(node, taken):
        """Fetch the node's candidate keywords, pick one, and report it."""
        # Ids are interpolated as-is, exactly as before; they are presumably
        # numeric paper ids -- verify against whatever fills id_set.
        pid_str = ",".join(str(pid) for pid in node.id_set)
        # "where pid in ()" is not valid SQL, so skip the query when the
        # node has no paper ids at all.
        if pid_str:
            for row in self.database.executeSQL(query % pid_str):
                node.keyword_list.append(row[0])
        node.keyword = calc.CalcFatherv2(node.keyword_list, taken)
        # The chosen keyword (even an empty one) is recorded as taken.
        taken.add(node.keyword)
        if node.keyword == '':
            print('%s %s' % ('@@@@', node.keyword_list))
        else:
            print('%s %s %s' % (node.keyword, node.level, node.group_id))

    # Pass 1: roots share a single set of taken keywords.
    root_set = set()
    for root in self.trees:
        label(root, root_set)

    # Pass 2: breadth-first over each tree's descendants.
    for root in self.trees:
        is_used_keywords = set(root_set)  # per-tree copy
        pending = deque(root.children_nodes)
        while pending:
            node = pending.popleft()
            label(node, is_used_keywords)
            pending.extend(node.children_nodes)

    print('find keywords done.')
def start():
    """Run the full pipeline once, from database connect to database close.

    Steps, in order: ``connectDB``, ``calc.init_similarity``,
    ``createSourceFile``, ``callBGLL``, ``getAllResult``,
    ``clearHierarchyRelation``, then one ``updateHierarchyRelation`` call per
    entry of ``result_paper_cate[total_level]``.

    ``closeDB`` is always called once ``connectDB`` has succeeded, even when
    one of the intermediate steps raises; the exception still propagates to
    the caller.
    """
    connectDB()
    try:
        calc.init_similarity()
        createSourceFile()
        callBGLL()
        getAllResult()
        clearHierarchyRelation()

        # group: the id of each tree
        for group, pid_set in result_paper_cate[total_level].items():
            print("======================level:%s====================" % group)
            # Each tree gets its own, initially empty, selection set.
            is_selected = set()
            # -1 is passed as the fourth argument for these top-level calls;
            # presumably it means "no father" -- confirm in
            # updateHierarchyRelation.
            updateHierarchyRelation(group, pid_set, total_level, -1, is_selected)
    finally:
        # Previously skipped whenever a step above raised.
        closeDB()
def __init__(self):
    """Build the BGLL helper and the database handle, then init similarity.

    ``output_path``, ``is_weighted`` and the four ``db_*`` settings are read
    from ``self`` before anything is assigned here, so they must already be
    available on the instance (e.g. as class attributes).
    """
    bgll_args = (self.output_path, self.is_weighted)
    self.BGLL = BGLL(*bgll_args)

    db_args = (self.db_host, self.db_user, self.db_pass, self.db_name)
    self.database = Database(*db_args)

    calc.init_similarity()
import os

import MySQLdb

import CalcFather as calc

# --- database settings ---
db_host = "localhost"
db_user = "******"
db_pass = "******"
db_name = "papernet"

# Module-wide connection and cursor, opened as soon as the script runs.
conn = MySQLdb.connect(db_host, db_user, db_pass, db_name)
cursor = conn.cursor()

calc.init_similarity()

# Read paper_paper_relation from the database; the edges are meant to be
# written to paper.txt. The two maps translate ids in both directions.
map1 = {}  # from old to new
map2 = {}  # from new to old

file_path = '/home/cowx/workspace/BGLL/paper/paper.txt'
f = open(file_path, 'w+')

sql = "select pid1, pid2 from paper_paper_relation order by pid1, pid2"
cursor.execute(sql)
result = cursor.fetchall()

# Every paper id that appears on either side of a relation (the node set).
pid_set = set()
for row in result:
    pid_set.update((row[0], row[1]))
try: sql = 'insert into keyword_hierarchy_relation(group_id, father, child) values(%d, %d, %d)' % ( int(group), father, kid) executeSQL(sql) except Exception, e: if str(e).find('1062') != -1: #duplicate key sql = 'update keyword_hierarchy_relation set father = %d where group_id = %d and child = %d' % ( father, int(group), kid) executeSQL(sql) else: print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' print sql print e print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' content = calc.CalcFatherv2(content_list, is_selected) sql = 'select id from keyword where content = "%s"' % content try: result = executeSQL(sql) is_selected.add(content) except Exception, e: print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' print sql print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' print "%s %d %d" % (content, father, level) # if content == "": # print "[pids:%s content:%s]" % (pid_set, content_list) if level >= 1 and content != "":