def build_search_index():
    start = time()
    # Drop an existing search index before rebuilding it
    try:
        search_idx = db.nodes.indexes.get('search_idx')
        with db.transaction:
            search_idx.delete()
    except ValueError:
        pass
    search_idx = db.nodes.indexes.create('search_idx', type='fulltext')
    # Index paper nodes by title
    for batch_count, batch in enumerate(
            group_generator(PAPER.type.incoming, 1000)):
        print "Building search index. Processing paper %d. Elapsed time %d sec." % (
            batch_count * 1000, time() - start)
        with db.transaction:
            for paper_rel in batch:
                paper_node = paper_rel.startNode
                search_idx['title'][paper_node['title']] = paper_node
    # Index author nodes by name
    for batch_count, batch in enumerate(
            group_generator(AUTHOR.type.incoming, 1000)):
        print "Building search index. Processing author %d. Elapsed time %d sec." % (
            batch_count * 1000, time() - start)
        with db.transaction:
            for author_rel in batch:
                author_node = author_rel.startNode
                name = author_node['name'].replace(',', ' ')
                search_idx['author'][name] = author_node

def write_cite_rank(iterations=4):
    start = time()
    # damping factor
    d = 0.85
    # initial value
    cr_0 = 1.
    for iteration in range(iterations):
        for batch_count, batch in enumerate(
                group_generator(PAPER.type.incoming, 1000)):
            print "Calculating CiteRank. Iteration %d. Processed %d papers in %d sec." % (
                iteration, batch_count * 1000, time() - start)
            with db.transaction:
                for paper_rel in batch:
                    paper_node = paper_rel.startNode
                    cr = (1 - d)
                    for cite_rel in paper_node.ref.incoming:
                        cite_node = cite_rel.startNode
                        try:
                            cr_node = cite_node['c_cite_rank']
                        except KeyError:
                            cr_node = cr_0
                        cr += d * cr_node / cite_node['c_reference_count']
                    paper_node['c_cite_rank'] = cr

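# Illustration only (not part of the pipeline): the same damped, in-place
# update rule as write_cite_rank() above, applied to a tiny in-memory citation
# graph of plain dicts instead of graph-db nodes. `cites` maps each paper to
# the list of papers it references.
def toy_cite_rank(cites, iterations=4, d=0.85, cr_0=1.0):
    papers = set(cites) | set(p for refs in cites.values() for p in refs)
    rank = dict.fromkeys(papers, cr_0)
    out_deg = dict((p, len(cites.get(p, []))) for p in papers)
    for _ in range(iterations):
        for p in papers:
            citing = [q for q in papers if p in cites.get(q, [])]
            rank[p] = (1 - d) + d * sum(rank[q] / out_deg[q] for q in citing)
    return rank

# Example: toy_cite_rank({'a': ['b', 'c'], 'b': ['c']}) ranks 'c' highest,
# since it is cited by both 'a' and 'b'.
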
def unmatched_reference_fill_db(match_file=match_file, db=db):
    """
    Finds lines in match_file where the target cannot be found, and adds the
    corresponding reference strings to the 'unknown_references' list of the
    source node.
    """
    in_iter = open(match_file)  # line format examples at the end of the file
    start = time()
    rel_count = 0
    last_id = ''
    ref_buffer = []
    for batch_count, batch in enumerate(group_generator(in_iter, 10000)):
        sys.stderr.write(
            'Processing reference %d x 10000. Elapsed time %d sec. Relations created %d. \n'
            % (batch_count, time() - start, rel_count))
        with db.transaction:
            for line in batch:
                try:
                    source_id, ref_string, target_id = line.rstrip().split('|')
                except ValueError:
                    print 'Skipped line: not enough separators "|". ', line[:20]
                    continue
                if not target_id == '':
                    # Lookup target_id in the index
                    target_node = None
                    for target_node in source_idx['id']['arxiv:' + target_id]:
                        break
                    if target_node:
                        # skip if the target exists
                        last_id = source_id
                        continue
                # target node does not exist here
                if source_id == last_id:
                    # still on the same source node: buffer the reference
                    ref_buffer.append(ref_string)
                    continue
                # new source_id: look up the previous source node
                for last_node in source_idx['id']['arxiv:' + last_id]:
                    break
                else:
                    print "Skipped line: source id not found. ", last_id
                    last_id = source_id
                    continue
                # previous node exists: write the buffered references to it
                last_node['unknown_references'] += ref_buffer
                last_id = source_id
                ref_buffer = [ref_string]

def json_export(json_dir=json_dir, pkl_dir=pkl_dir):
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    for i, batch in enumerate(
            group_generator(get_meta_from_pkl(pkl_dir), 10000)):
        fh = open(json_dir + 'META_%03d.json' % i, 'w')
        json.dump(batch, fh)
        fh.close()

def reference_fill_db(match_file=match_file, db=db):
    """
    Reads references from match_file and creates the corresponding links in db.
    """
    in_iter = open(match_file)  # line format examples at the end of the file
    start = time()
    rel_count = 0
    for batch_count, batch in enumerate(group_generator(in_iter, 10000)):
        sys.stderr.write(
            'Processing reference %d x 10000. Elapsed time %d sec. Relations created %d. \n'
            % (batch_count, time() - start, rel_count))
        with db.transaction:
            for line in batch:
                try:
                    source_id, ref_string, target_id = line.rstrip().split('|')
                except ValueError:
                    print 'Skipped line: not enough separators "|". ', line[:20]
                    continue
                # The .single property is broken, so loop over the hits (there
                # should be exactly one); equivalent to:
                # source_node = source_idx['id']['arxiv:' + source_id].single
                for source_node in source_idx['id']['arxiv:' + source_id]:
                    break
                else:
                    print "Skipped line: source id not found. ", line[:20]
                    continue
                if not target_id == '':
                    # Lookup target_id in the index
                    target_node = None
                    for target_node in source_idx['id']['arxiv:' + target_id]:
                        break
                    if target_node:
                        # create the reference relation
                        source_node.ref(target_node, ref_string=ref_string,
                                        label='Reference')
                        rel_count += 1
                        continue

def write_caches():
    start = time()
    for i, batch in enumerate(group_generator(PAPER.type.incoming, 1000)):
        print "Filling citation and author caches. %d papers processed in %d sec." % (
            i * 1000, time() - start)
        with db.transaction:
            for paper_rel in batch:
                paper_node = paper_rel.startNode
                # count outgoing references and incoming citations
                ref_count = 0
                for ref in paper_node.ref.outgoing:
                    ref_count += 1
                cite_count = 0
                for ref in paper_node.ref.incoming:
                    cite_count += 1
                paper_node['c_reference_count'] = ref_count
                paper_node['c_citation_count'] = cite_count
                paper_node['c_authors'] = ' and '.join(
                    [a_rel.endNode['name'] for a_rel in paper_node.author])

def meta_fill_db(db=db, limit=-1):
    #
    # Create Paper Nodes
    #
    start = time()
    chunk_size = 1000
    for batch_count, batch in enumerate(
            group_generator(get_json_from_dir(meta_json_dir, limit=limit),
                            chunk_size)):
        print 'Processing metadata record %d. Time elapsed: %d sec.' % (
            batch_count * chunk_size, time() - start)
        with db.transaction:
            for rec_id, meta_dict in batch:
                # create a new paper node
                paper_node = db.node(
                    label='paper_node arxiv:' + rec_id,
                    title=meta_dict['title'][0],
                    abstract=meta_dict['description'][0],
                    unknown_references=[''],
                    date=meta_dict['date'][0],
                    source_url=meta_dict['identifier'][0],  # check that this really works
                    source_id='arxiv:' + rec_id,
                    # arxiv_meta_dict=[x for k, v in meta_dict.items()
                    #                  for x in (k, "|".join(v))],
                )
                # add a relation paper_node --[type]--> PAPER
                paper_node.type(PAPER)
                # register in the source_id index
                source_idx['id'][paper_node['source_id']] = paper_node
                for author_name in meta_dict['creator']:
                    # create or fetch an author node
                    author_node = add_get_author(author_name)
                    # create a relation paper_node --[author]--> author_node
                    paper_node.author(author_node)
        print 'closing transaction'

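# add_get_author() is called by meta_fill_db() but not defined in this file.
# A minimal sketch, assuming it deduplicates authors by exact name through an
# in-memory dict and links every new author node to the AUTHOR root node (the
# cache dict, the node properties and the label format are assumptions):
_author_cache = {}

def add_get_author(author_name):
    author_node = _author_cache.get(author_name)
    if author_node is None:
        # must run inside an open db.transaction, as it does in meta_fill_db()
        author_node = db.node(name=author_name,
                              label='author_node ' + author_name)
        # relation author_node --[type]--> AUTHOR, mirroring paper_node.type(PAPER)
        author_node.type(AUTHOR)
        _author_cache[author_name] = author_node
    return author_node
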
def fill_meta_table(db_file, pkl_dir=pkl_dir, max_batch=-1):
    """
    Creates a table 'meta' in db_file and inserts the records found in the
    pkl files in pkl_dir.
    Schema: arxiv_id, author, title, abstract, info, subject, year
    """
    # Initialize db
    con = lite.connect(db_file)
    # Create/Reset table
    with con:
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS meta")
        cur.execute(
            "CREATE TABLE meta(arxiv_id TEXT, author TEXT, title TEXT, "
            "abstract TEXT, info TEXT, subject TEXT, year INT)")

    def prepare_meta_row(rec_id, meta_dict):
        authors = ' and '.join(meta_dict['creator'])
        title = meta_dict['title'][0]
        abstract = meta_dict['description'][0]
        info = ', '.join(meta_dict['identifier'][1:])
        date = meta_dict['date'][0]
        subject = ', '.join(meta_dict['subject'])
        year = date[0:4]
        return map(cleanup_rec,
                   [rec_id, authors, title, abstract, info, subject, year])

    rows = (prepare_meta_row(rec_id, meta_dict)
            for rec_id, meta_dict in get_meta_from_pkl(pkl_dir))
    # Write rows, 10,000 per transaction
    for batch_count, batch in enumerate(group_generator(rows, 10000)):
        if batch_count == max_batch:
            break
        if DEBUG:
            print "Writing meta row ", batch_count * 10000
        with con:
            cur.executemany("INSERT INTO meta VALUES(?, ?, ?, ?, ?, ?, ?)",
                            batch)

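# cleanup_rec() is used by fill_meta_table() and fill_author_table() but is
# defined elsewhere. A minimal sketch, assuming it only normalises whitespace
# and decodes byte strings so sqlite3 receives unicode text:
def cleanup_rec(field):
    if isinstance(field, str):
        field = field.decode('utf-8', 'ignore')
    return ' '.join(field.split())
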
def fill_author_table(db_file, pkl_dir=pkl_dir, max_batch=-1):
    """
    Creates a lookup table with schema: author, year, arxiv_id, title.
    Every element of each 'creator' value in meta_dict gives one row of the
    ayit table.
    """
    # Initialize db
    con = lite.connect(db_file)
    # Create/Reset table
    with con:
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS ayit_lookup")
        cur.execute(
            "CREATE TABLE ayit_lookup(author TEXT, year INT, arxiv_id TEXT, title TEXT)"
        )

    def prepare_ayit_row(rec_id, meta_dict):
        title = meta_dict['title'][0]
        date = meta_dict['date'][0]
        year = date[0:4]
        for author in meta_dict['creator']:
            author = author.split(',')[0]
            yield map(cleanup_rec, [author, year, rec_id, title])

    # chain the generators returned by prepare_ayit_row for all records
    rows = (row for rec_id, meta_dict in get_meta_from_pkl(pkl_dir)
            for row in prepare_ayit_row(rec_id, meta_dict))
    # Write rows, 10,000 per transaction
    for batch_count, batch in enumerate(group_generator(rows, 10000)):
        if batch_count == max_batch:
            break
        if DEBUG:
            print "Writing author info", batch_count * 10000
        with con:
            cur.executemany("INSERT INTO ayit_lookup VALUES(?, ?, ?, ?)", batch)

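# Hedged usage sketch (not part of the original module): how the ayit_lookup
# table built above might be queried to resolve a reference by first-author
# surname and year; the function name is illustrative only.
def lookup_papers(con, author, year):
    cur = con.cursor()
    cur.execute(
        "SELECT arxiv_id, title FROM ayit_lookup WHERE author = ? AND year = ?",
        (author, year))
    return cur.fetchall()
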
from MetaRead import get_meta_from_pkl
import sys
sys.path.append('../tools')
from shared import group_generator
import json

# Dump the pickled arXiv metadata into JSON files, 10,000 records per file
for i, batch in enumerate(
        group_generator(get_meta_from_pkl('../DATA/META/PKL/'), 10000)):
    fh = open('../DATA/META/JSON/META_%03d.json' % i, 'w')
    json.dump(batch, fh)
    fh.close()

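# group_generator() comes from ../tools/shared.py and is not shown in this
# file. A minimal sketch, assuming it simply yields lists of at most `size`
# items drawn from `iterable` until the iterable is exhausted:
from itertools import islice

def group_generator(iterable, size):
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            break
        yield batch
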