def build_search_index():
    start = time()
    try:
        search_idx = db.nodes.indexes.get('search_idx')
        with db.transaction:
            search_idx.delete()
    except ValueError:
        pass
    
    search_idx = db.nodes.indexes.create('search_idx', type='fulltext')

    # Loop through papers
    for batch_count, batch in enumerate(group_generator(PAPER.type.incoming, 1000)):
        print "Building search index. Processing paper %d. Elapsed time %d sec," % ( batch_count*1000, time()-start)
        with db.transaction:
            for paper_rel in batch:
                paper_node = paper_rel.startNode
                search_idx['title'][paper_node['title']] = paper_node

    # Loop through authors
    for batch_count, batch in enumerate(group_generator(AUTHOR.type.incoming, 1000)):
        print "Building search index. Processing author %d. Elapsed time %d sec." % (batch_count * 1000, time() - start)
        with db.transaction:
            for author_rel in batch:
                author_node = author_rel.startNode
                name = author_node['name'].replace(',', ' ')
                search_idx['author'][name] = author_node
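
# `group_generator` is imported from the shared tools module ('../tools/shared.py',
# see the script at the end of this listing) and is not shown here. A minimal
# sketch of the assumed behaviour -- yielding an iterable in fixed-size chunks so
# that each chunk can be written in a single transaction -- is given below; the
# real implementation may differ.
from itertools import islice

def group_generator_sketch(iterable, chunk_size):
    """Yield lists of at most chunk_size items until the iterable is exhausted."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            break
        yield chunk
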
def write_cite_rank(iterations=4):
    start = time()
    # damping factor
    d = 0.85
    # initial value
    cr_0 = 1.

    for iteration in range(iterations):
        for batch_count, batch in enumerate(
                group_generator(PAPER.type.incoming, 1000)):
            print "Calculating CiteRank. Iteration %d. Processed %d papers in %d sec." % (
                iteration, batch_count * 1000, time() - start)
            with db.transaction:
                for paper_rel in batch:
                    paper_node = paper_rel.startNode

                    cr = (1 - d)
                    for cite_rel in paper_node.ref.incoming:
                        cite_node = cite_rel.startNode
                        try:
                            cr_node = cite_node['c_cite_rank']
                        except KeyError:
                            cr_node = cr_0
                        cr += d * cr_node / cite_node['c_reference_count']

                    paper_node['c_cite_rank'] = cr
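
# The loop in write_cite_rank above applies a PageRank-style update: a paper's
# rank is (1 - d) plus d times the sum, over all citing papers, of their rank
# divided by their reference count. A small self-contained illustration on plain
# dictionaries (toy data, not taken from the database; the real computation runs
# over the neo4j nodes as shown above):
def toy_cite_rank(citations, d=0.85, iterations=4):
    """citations maps each paper to the list of papers it cites (its outgoing references)."""
    papers = set(citations) | set(p for refs in citations.values() for p in refs)
    rank = dict((p, 1.0) for p in papers)
    for _ in range(iterations):
        new_rank = {}
        for p in papers:
            citing = [q for q in citations if p in citations[q]]
            new_rank[p] = (1 - d) + d * sum(rank[q] / len(citations[q]) for q in citing)
        rank = new_rank
    return rank

# Example: 'c' is cited by both 'a' and 'b', so it ends up with the highest rank.
# toy_cite_rank({'a': ['c'], 'b': ['c'], 'c': []})
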
def unmatched_reference_fill_db(match_file=match_file, db=db):
    """
    Findes lines in match_file, where the target cannot be found, and adds
    the corresponding reference strings to 'unknown_references' list in the source.
    """

    in_iter = open(match_file)
    # line format examples at the end of the file

    start = time()
    rel_count = 0
    last_id = ''
    ref_buffer = []
    for batch_count, batch in enumerate(group_generator(in_iter, 10000)):
        sys.stderr.write(
            'Processing reference %d x 10000. Elapsed time %d sec. Relations created %d. \n'
            % (batch_count, time() - start, rel_count))
        with db.transaction:
            for line in batch:
                try:
                    source_id, ref_string, target_id = line.rstrip().split('|')
                except ValueError:
                    print 'Skipped line: not enough separators "|". ', line[:20]
                    continue

                if not target_id == '':
                    # Lookup target_id in the index
                    target_node = None
                    for target_node in source_idx['id']['arxiv:' + target_id]:
                        break
                    if target_node:
                        # skip if target exists
                        last_id = source_id
                        continue

                # target node does not exist here #

                if source_id == last_id:  # on same node?
                    ref_buffer.append(ref_string)
                    continue

                # new source_id #

                # lookup source node
                for last_node in source_idx['id']['arxiv:' + last_id]:
                    break
                else:
                    print "Skipped line: source id not forund. ", last_id
                    last_id = source_id
                    continue
                # last node exists #

                # write old references to last node
                last_node['unknown_references'] += ref_buffer
                last_id = source_id
                ref_buffer = [ref_string]
def json_export(json_dir=json_dir, pkl_dir=pkl_dir):
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)

    for i, batch in enumerate(
            group_generator(get_meta_from_pkl(pkl_dir), 10000)):
        fh = open(json_dir + 'META_%03d.json' % i, 'w')
        json.dump(batch, fh)
        fh.close()
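
# `get_meta_from_pkl` (imported from MetaRead) is assumed to yield
# (rec_id, meta_dict) pairs, where meta_dict maps Dublin Core style fields
# ('title', 'creator', 'description', 'date', 'identifier', 'subject') to lists
# of strings -- hence the meta_dict['title'][0] style lookups elsewhere in this
# listing. A rough sketch under that assumption (the pickle file layout below is
# a guess, not the original MetaRead implementation):
import os
import pickle

def get_meta_from_pkl_sketch(pkl_dir):
    for fname in sorted(os.listdir(pkl_dir)):
        if not fname.endswith('.pkl'):
            continue
        with open(os.path.join(pkl_dir, fname), 'rb') as fh:
            records = pickle.load(fh)
        for rec_id, meta_dict in records:
            yield rec_id, meta_dict
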
def reference_fill_db(match_file=match_file, db=db):
    """
    Reads references from match_file and creates corresponding links in db
    """

    in_iter = open(match_file)
    # line format examples at the end of the file

    start = time()
    rel_count = 0
    for batch_count, batch in enumerate(group_generator(in_iter, 10000)):
        sys.stderr.write(
            'Processing reference %d x 10000. Elapsed time %d sec. Relations created %d. \n'
            % (batch_count, time() - start, rel_count))
        with db.transaction:
            for line in batch:
                try:
                    source_id, ref_string, target_id = line.rstrip().split('|')
                except ValueError:
                    print 'Skipped line: not enough separators "|". ', line[:20]
                    continue

                # .single property is broken! Have to loop through results (which should be a single one)
                # equivalent to: source_node = source_idx['id']['arxiv:' + source_id].single
                for source_node in source_idx['id']['arxiv:' + source_id]:
                    break
                else:
                    print "Skipped line: source id not forund. ", line[:20]
                    continue

                if not target_id == '':
                    # Lookup target_id in the index
                    target_node = None
                    for target_node in source_idx['id']['arxiv:' + target_id]:
                        break

                    if target_node:
                        # create reference relation
                        source_node.ref(target_node,
                                        ref_string=ref_string,
                                        label='Reference')
                        rel_count += 1
                        continue
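
# Both reference_fill_db and unmatched_reference_fill_db read match_file, whose
# "line format examples at the end of the file" are not reproduced in this
# listing. From the parsing above, each line presumably carries three
# '|'-separated fields: citing arXiv id, raw reference string, and the matched
# arXiv id of the target (empty when no match was found). Made-up illustration:
#
#   hep-th/9711200|J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231|hep-th/9802150
#   hep-th/9711200|Some reference string that could not be matched|
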
def write_caches():
    start = time()
    for i, batch in enumerate(group_generator(PAPER.type.incoming, 1000)):
        print "Filling citaion and author buffers. %d papers processed in %d sec." % (i*1000, time()-start)
        with db.transaction:
            for paper_rel in batch:
                paper_node = paper_rel.startNode

                ref_count = 0
                for ref in paper_node.ref.outgoing:
                    ref_count += 1 

                cite_count = 0
                for ref in paper_node.ref.incoming:
                    cite_count += 1 

                paper_node['c_reference_count'] = ref_count
                paper_node['c_citation_count'] = cite_count
                paper_node['c_authors'] = ' and '.join([ a_rel.endNode['name'] for a_rel in paper_node.author ])
def meta_fill_db(db=db, limit=-1):
    #
    # Create Paper Nodes
    #
    start = time()
    chunk_size = 1000
    for batch_count, batch in enumerate(
            group_generator(get_json_from_dir(meta_json_dir, limit=limit),
                            chunk_size)):
        print 'Processing metadata record %d. Time elapsed: %d sec.' % (
            batch_count * chunk_size, time() - start)
        with db.transaction:
            for rec_id, meta_dict in batch:
                # create a new node
                paper_node = db.node(
                    label='paper_node arxiv:' + rec_id,
                    title=meta_dict['title'][0],
                    abstract=meta_dict['description'][0],
                    unknown_references=[''],
                    date=meta_dict['date'][0],
                    source_url=meta_dict['identifier'][0],  # Check if really works?
                    source_id='arxiv:' + rec_id,
                    # arxiv_meta_dict = [ x for k,v in meta_dict.items() for x in (k, "|".join(v)) ],
                )

                # add a relation paper_node --[type]--> PAPER
                paper_node.type(PAPER)

                # register in source_id index
                source_idx['id'][paper_node['source_id']] = paper_node

                for author_name in meta_dict['creator']:
                    # create an author name node
                    author_node = add_get_author(author_name)

                    # create a relation paper_node --[author]--> author_node
                    paper_node.author(author_node)
            print 'closing transaction'
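
# `add_get_author` is called in meta_fill_db above but defined elsewhere. A
# minimal sketch of the assumed behaviour -- return the existing author node if
# one is registered under this name, otherwise create it, attach it to the
# AUTHOR root node and register it -- using only API patterns that appear in
# this listing (the index name `author_idx` is an assumption):
def add_get_author_sketch(author_name):
    for author_node in author_idx['name'][author_name]:
        return author_node                       # author already exists
    author_node = db.node(name=author_name, label='author_node ' + author_name)
    author_node.type(AUTHOR)                     # relation author_node --[type]--> AUTHOR
    author_idx['name'][author_name] = author_node
    return author_node
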
def fill_meta_table(db_file, pkl_dir=pkl_dir, max_batch=-1):
    """ 
    Creates a table meta in db_file and inserts records from pkl_files in pkl_dir
    Schema: arxiv_id, author, title, abstract, info, subject, year
    """

    # Initialize db
    con = lite.connect(db_file)

    # Create/Reset table
    with con:
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS meta")
        cur.execute(
            "CREATE TABLE meta(arxiv_id TEXT, author TEXT, title TEXT, abstract TXT, info TEXT, subject TEXT, year INT)"
        )

    def prepare_meta_row(rec_id, meta_dict):
        authors = ' and '.join(meta_dict['creator'])
        title = meta_dict['title'][0]
        abstract = meta_dict['description'][0]
        info = ', '.join(meta_dict['identifier'][1:])
        date = meta_dict['date'][0]
        subject = ', '.join(meta_dict['subject'])
        year = date[0:4]

        return map(cleanup_rec,
                   [rec_id, authors, title, abstract, info, subject, year])

    rows = (prepare_meta_row(rec_id, meta_dict)
            for rec_id, meta_dict in get_meta_from_pkl(pkl_dir))

    # Write rows, 10.000 per transaction
    for batch_count, batch in enumerate(group_generator(rows, 10000)):
        if batch_count == max_batch: break
        if DEBUG: print "Writing meta row ", batch_count * 10000
        with con:
            cur.executemany("INSERT INTO meta VALUES(?, ?, ?, ?, ?, ?, ?)",
                            batch)
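
# `cleanup_rec` is applied to every field before the SQLite insert but is not
# defined in this listing. A plausible sketch -- collapsing newlines and runs of
# whitespace in the metadata strings -- is shown below; this is an assumption,
# and the real helper may normalise more aggressively:
def cleanup_rec_sketch(value):
    if isinstance(value, basestring):            # Python 2, matching the rest of this code
        return ' '.join(value.split())
    return value
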
def fill_author_table(db_file, pkl_dir=pkl_dir, max_batch=-1):
    """ 
    Creates a lookup table with schema: author, year, arxiv_id, title
    All elements from each 'creator' value in meta_dict give rows of the ayit table.
    """

    # Initialize db
    con = lite.connect(db_file)

    # Write meta table
    with con:
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS ayit_lookup")
        cur.execute(
            "CREATE TABLE ayit_lookup(author TEXT, year INT, arxiv_id TEXT, title TEXT)"
        )

    def prepare_ayit_row(rec_id, meta_dict):
        title = meta_dict['title'][0]
        date = meta_dict['date'][0]
        year = date[0:4]

        for author in meta_dict['creator']:
            author = author.split(',')[0]
            yield map(cleanup_rec, [author, year, rec_id, title])

    # take union/chain of all generators returned by prepare_ayit_row
    rows = (row for rec_id, meta_dict in get_meta_from_pkl(pkl_dir)
            for row in prepare_ayit_row(rec_id, meta_dict))

    # Write rows, 10.000 per transaction
    for batch_count, batch in enumerate(group_generator(rows, 10000)):
        if batch_count == max_batch: break
        if DEBUG: print "Writing author info", batch_count * 10000
        with con:
            cur.executemany("INSERT INTO ayit_lookup VALUES(?, ?, ?, ?)",
                            batch)
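
# The ayit_lookup table built above is presumably used to match reference
# strings to papers by author surname and publication year. A small usage
# sketch (the db_file path in the example call is made up):
import sqlite3 as lite

def find_candidates(db_file, author_surname, year):
    """Return (arxiv_id, title) rows for a given author surname and year."""
    con = lite.connect(db_file)
    with con:
        cur = con.cursor()
        cur.execute(
            "SELECT arxiv_id, title FROM ayit_lookup WHERE author = ? AND year = ?",
            (author_surname, year))
        return cur.fetchall()

# Example: find_candidates('meta.db', 'Maldacena', 1997)
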
from MetaRead import get_meta_from_pkl

import sys
sys.path.append('../tools')
from shared import group_generator

import json

for i, batch in enumerate(group_generator(get_meta_from_pkl('../DATA/META/PKL/'), 10000)):
    fh = open('../DATA/META/JSON/META_%03d.json' % i, 'w')
    json.dump(batch, fh)
    fh.close()