Example #1
import os
import pickle
from collections import defaultdict

def set_author_dict(limit=-1, cache_file='../DATA/py_author_cache.pkl'):
    global authors_by_id

    # reuse the cached dict if it has already been built
    if os.path.exists(cache_file):
        print("Reading from cache")
        with open(cache_file, 'rb') as cf:  # pickle needs binary mode
            authors_by_id = pickle.load(cf)
        return

    # cache file not present: build the author dict from the metadata JSON
    authors_by_id = defaultdict(set)
    for rec_id, meta_dict in get_json_from_dir(META_DIR, limit=limit):
        authors_by_id[rec_id].update(meta_dict['creator'])

    # cache the author dict for subsequent runs
    with open(cache_file, 'wb') as cf:
        pickle.dump(authors_by_id, cf)
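
A minimal usage sketch, assuming get_json_from_dir and META_DIR are defined elsewhere in the project (neither is shown on this page) and that get_json_from_dir yields (record_id, metadata_dict) pairs:

set_author_dict(limit=1000)          # first run: builds the dict and writes the cache
set_author_dict()                    # later runs: loads straight from the pickle cache

# inspect one record's author set (record ids depend on the dataset)
some_id = next(iter(authors_by_id))
print(some_id, authors_by_id[some_id])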
Example #2
from time import time

def meta_fill_db(db=db, limit=-1):
    #
    # Create Paper Nodes
    #
    start = time()
    chunk_size = 1000
    for batch_count, batch in enumerate(
            group_generator(get_json_from_dir(meta_json_dir, limit=limit),
                            chunk_size)):
        print('Processing metadata record %d. Time elapsed: %d sec.' % (
            batch_count * chunk_size, time() - start))
        with db.transaction:
            for rec_id, meta_dict in batch:
                # create a new paper node from the record's metadata
                paper_node = db.node(
                    label='paper_node arxiv:' + rec_id,
                    title=meta_dict['title'][0],
                    abstract=meta_dict['description'][0],
                    unknown_references=[''],
                    date=meta_dict['date'][0],
                    source_url=meta_dict['identifier'][0],  # TODO: verify this field really holds the URL
                    source_id='arxiv:' + rec_id,
                    # arxiv_meta_dict = [ x for k,v in meta_dict.items() for x in (k, "|".join(v)) ],
                )

                # add a relation paper_node --[type]--> PAPER
                paper_node.type(PAPER)

                # register in the source_id index for later lookups
                source_idx['id'][paper_node['source_id']] = paper_node

                for author_name in meta_dict['creator']:
                    # create (or fetch) an author name node
                    author_node = add_get_author(author_name)

                    # create a relation paper_node --[author]--> author_node
                    paper_node.author(author_node)
            print('closing transaction')
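
group_generator is not defined on this page; a hypothetical reimplementation compatible with the call above (name and signature taken from the code, behavior assumed) would chunk the record stream into lists so that each batch maps onto a single transaction:

from itertools import islice

def group_generator(iterable, chunk_size):
    # assumed behavior: yield successive lists of up to chunk_size items
    it = iter(iterable)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            return
        yield chunk

Committing roughly 1000 records per db.transaction keeps every write transaction bounded instead of accumulating all nodes in one huge transaction, which is the usual pattern with embedded graph-database bindings like the ones this code appears to use.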