def save_root_categories_to_db(root_categories):
    """Persist root-category names into the `root_categories` table.

    Creates the table if it does not exist, then inserts each category name
    that is not already present.

    Args:
        root_categories: iterable of sequences whose first element is the
            category name, e.g. [('Physics',), ('Biology',)].
    """
    db, cursor = init_db()
    table_name = 'root_categories'
    if not is_tbl_exists(db, cursor, table_name):
        cursor.execute('CREATE TABLE ' + table_name +
                       ' (id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, '
                       ' name VARCHAR(100) NOT NULL)')
    for root_category in root_categories:
        if not is_root_category_in_db(db, cursor, root_category[0]):
            try:
                cursor.execute("""INSERT INTO root_categories (name) VALUES (%s)""",
                               (root_category[0],))
                # BUG fix: inserts were never committed, so with MySQLdb's
                # default autocommit=off they were silently lost.
                db.commit()
            except Exception as e:  # py2-only `except Exception, e` modernized
                db.rollback()
                print(repr(e))
    # BUG fix: connection was leaked; siblings in this file call close_db.
    close_db(db, cursor)
def invalidate_stopwords_from_keyphrases(stopwords):
    """Mark every ngram whose name is a stopword as invalid (is_valid=0).

    Args:
        stopwords: iterable of stopword strings to invalidate.

    Raises:
        Exception: if the `ngrams` table is missing, or an UPDATE fails
            (the failing statement is rolled back first).
    """
    db, cursor = init_db()
    if not is_tbl_exists(db, cursor, 'ngrams'):
        raise Exception('Table "ngrams" does not exist!')
    for stopword in stopwords:
        try:
            # BUG fix: `(stopword)` was a bare string, not a 1-tuple; MySQLdb
            # iterates a string per-character, breaking parameter substitution
            # for any stopword longer than one character.
            cursor.execute("""UPDATE ngrams SET is_valid=0 WHERE name=%s""",
                           (stopword,))
            db.commit()
        except Exception:  # was a bare except: — narrowed so SystemExit etc. pass through
            db.rollback()
            raise Exception('Error in updating table "ngrams"')
    close_db(db, cursor)
def save_keyphrases_to_table(keyphrases):
    """Insert new keyphrases into the `ngrams` table.

    Creates the table if missing, then saves each keyphrase that does not
    already have a row (existence checked by name lookup).

    Args:
        keyphrases: iterable of keyphrase strings.
    """
    db, cursor = init_db()
    if not is_tbl_exists(db, cursor, 'ngrams'):
        cursor.execute('CREATE TABLE ngrams '
                       '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, '
                       'name varchar(255) NOT NULL UNIQUE, '
                       'n INT DEFAULT 0, '
                       'freq INT DEFAULT 0, '
                       'is_valid int(1) DEFAULT 1)')
    for k in keyphrases:
        ngram_id = get_ngram_id_by_name(db, cursor, k)
        if ngram_id is None:  # idiom fix: was `== None`
            save_one_keyphrase_to_table(db, cursor, k)
    close_db(db, cursor)
def gen_author_keyphrase(trie):
    """Build per-author keyphrase frequency data and store it in `personal_keywords`.

    Ex: assuming sean's author cluster is 203, sean is good at nuclear
    (id = 2111) and oxygen (id = 76), then:
    author_keyphrase[203] = {2111: 3, 76: 5}, where 3 and 5 are the
    appearance frequencies of nuclear and oxygen in his publications.

    Args:
        trie: keyphrase-matching trie consumed by gen_term_ctr.
    """
    batch_save_size = 100  # flush accumulated counts every N author clusters
    db, cursor = init_db()
    if not is_tbl_exists(db, cursor, 'personal_keywords'):
        cursor.execute('CREATE TABLE personal_keywords ('
                       'id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, '
                       'person_cluster int, '
                       'ngram_id int, '
                       'year int, '
                       'count int, '
                       'log_cite_prod_count float)')
    author_clusters = get_author_clusters()
    author_keyphrase = {}
    sys.stdout.write("Generating author_keyphrase\n")
    n_author_clusters = len(author_clusters)
    for i, author_cluster in enumerate(author_clusters):
        sys.stdout.write("\r%d / %d" % (i+1, n_author_clusters))
        # BUG fix: the parameter was passed as `(author_cluster)` — a bare
        # value, not a 1-tuple — which breaks MySQLdb parameter substitution
        # for string-typed cluster ids.
        cursor.execute("""SELECT authors.cluster, papers.title, """ +
                       """papers.abstract, papers.ncites FROM authors, papers WHERE """ +
                       """authors.paper_cluster = papers.cluster and authors.cluster = %s""",
                       (author_cluster,))
        rows = cursor.fetchall()
        for r in rows:
            author_cluster = r[0]
            # ' >>> ' separates title from abstract so phrases cannot
            # accidentally span the two fields.
            contents = r[1].lower() + ' >>> ' if r[1] is not None else ''
            if r[2] is not None:
                contents += r[2].lower()
            ncites = r[3]
            term_ctr = gen_term_ctr(contents, trie)
            upd_author_keyphrase(author_keyphrase, author_cluster, term_ctr, ncites)
        if (i+1) % batch_save_size == 0:
            save_author_keyphrase_to_table(db, cursor, author_keyphrase)
            author_keyphrase = {}
    # Flush the final partial batch.
    save_author_keyphrase_to_table(db, cursor, author_keyphrase)
    sys.stdout.write("\nCreating indexes...\n")
    cursor.execute('ALTER TABLE personal_keywords ADD INDEX (person_cluster), '
                   'ADD INDEX (ngram_id), ADD INDEX (year)')
    close_db(db, cursor)
def gen_keyphrase_info(trie):
    """Accumulate keyphrase frequency and co-occurrence statistics.

    Scans the title + abstract of every row in `papers`, counts keyphrase
    occurrences and pairwise co-occurrences with the given trie, and flushes
    the counters to the database in batches. Creates the `ngram_relations`
    table on first use and recomputes normalized co-occurrence at the end.

    Args:
        trie: keyphrase-matching trie consumed by the inc_* helpers.
    """
    db, cursor = init_db()
    if not is_tbl_exists(db, cursor, 'ngram_relations'):
        cursor.execute('CREATE TABLE ngram_relations '
                       '(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, '
                       'src_id INT NOT NULL, '
                       'tar_id INT NOT NULL, '
                       'co_occur INT DEFAULT 0, '
                       'co_occur_norm FLOAT DEFAULT 0, '
                       'is_valid INT(1) DEFAULT 1, '
                       'UNIQUE src_tar_idx (src_id, tar_id))')

    batch_save_size = 100
    term_counts = defaultdict(int)
    pair_counts = defaultdict(lambda: defaultdict(int))

    cursor.execute("""SELECT title, abstract FROM papers""")
    rows = cursor.fetchall()
    total = len(rows)
    sys.stdout.write('Generating keyphrase information\n')

    for done, (title, abstract) in enumerate(rows, 1):
        sys.stdout.write("\r%d / %d" % (done, total))
        # ' >>> ' keeps title and abstract from running together.
        text = '' if title is None else title.lower() + ' >>> '
        if abstract is not None:
            text += abstract.lower()
        inc_keyphrase_ctr(term_counts, text, trie)
        inc_keyphrase_relation_ctr(pair_counts, text, trie)
        # Periodically persist and reset the in-memory counters to bound memory.
        if done % batch_save_size == 0:
            upd_keyphrase_ctr_to_table(db, cursor, term_counts)
            upd_keyphrase_relation_ctr_to_table(db, cursor, pair_counts)
            term_counts = defaultdict(int)
            pair_counts = defaultdict(lambda: defaultdict(int))

    # Flush whatever is left from the final partial batch.
    upd_keyphrase_ctr_to_table(db, cursor, term_counts)
    upd_keyphrase_relation_ctr_to_table(db, cursor, pair_counts)
    upd_co_occur_norm()
    sys.stdout.write("\n")
    close_db(db, cursor)
def check_required_tables(db, cursor):
    """Verify that every table this module depends on exists.

    Args:
        db: open database connection.
        cursor: cursor on that connection.

    Raises:
        Exception: naming the first missing table.
    """
    required_tbls = ['authors', 'papers', 'ngrams']
    for tbl in required_tbls:
        if not is_tbl_exists(db, cursor, tbl):
            # BUG fix: message previously said "in the specified table";
            # a table lives in a database, not in another table.
            raise Exception('Table "' + tbl + '" does not exist in the specified database')