def generate_cited_by_updates():
    """Yield Solr atomic-update docs setting ``cited_by`` and ``cited_by_count``.

    Scans PaperReferences ordered by PaperReferenceId, so every citation of a
    given paper arrives contiguously.  Up to 10,000 papers are accumulated in
    memory before being flushed as update documents.
    """
    batch = collections.defaultdict(list)
    c = conn.cursor()
    query = 'select PaperReferenceId, PaperId from PaperReferences order by PaperReferenceId'
    ips = ItemsPerSecondBar('citations', max=row_count('PaperReferences'))
    for paperid, citation in c.execute(query):
        # Flush only once a *new* paper id shows up: since rows are ordered,
        # that guarantees every id already in the batch has a complete group.
        if len(batch) == 10_000 and paperid not in batch:
            yield from _cited_by_docs(batch)
            batch = collections.defaultdict(list)
        batch[paperid].append(citation)
        ips.next()
    # BUG FIX: the final (partial) batch used to be dropped on loop exit,
    # losing up to 10,000 papers' citation data.
    yield from _cited_by_docs(batch)
    ips.finish()

def _cited_by_docs(batch):
    """Convert {paper_id: [citing ids]} into Solr 'set' update documents."""
    for pid, cites in batch.items():
        yield {
            'id': pid,
            'cited_by': {
                'set': cites
            },
            'cited_by_count': {
                'set': len(cites)
            }
        }
def generate_author_affiliation_updates():
    """Yield Solr atomic updates setting author/affiliation lists per paper.

    Rows are ordered by PaperId, so each paper's authors are contiguous;
    a group is emitted whenever the PaperId changes.
    """
    c = conn.cursor()
    authors = []
    affiliations = []
    query = 'select PaperId, OriginalAuthor, OriginalAffiliation ' \
            'from PaperAuthorAffiliations order by PaperId, AuthorSequenceNumber'
    prev_pid = None
    ips = ItemsPerSecondBar('author/affiliations',
                            max=row_count('PaperAuthorAffiliations'))
    for paperid, author, affiliation in c.execute(query):
        if prev_pid is None:
            prev_pid = paperid
        elif prev_pid != paperid:
            yield {
                'id': prev_pid,
                'author': {
                    'set': authors
                },
                'affiliation': {
                    'set': affiliations
                }
            }
            authors = []
            affiliations = []
            prev_pid = paperid
        authors.append(author)
        affiliations.append(affiliation)
        ips.next()
    # BUG FIX: the final paper's group was never yielded after the loop.
    if prev_pid is not None:
        yield {
            'id': prev_pid,
            'author': {
                'set': authors
            },
            'affiliation': {
                'set': affiliations
            }
        }
    ips.finish()
def generate_journal_updates():
    """Yield a Solr 'set' update attaching the journal display name to each paper."""
    cursor = conn.cursor()
    sql = 'select PaperId, J.DisplayName from Papers P inner join Journals J on P.JournalId = J.JournalId'
    progress = ItemsPerSecondBar('journals')
    for pid, journal_name in cursor.execute(sql):
        yield {'id': pid, 'journal': {'set': journal_name}}
        progress.next()
    progress.finish()
def generate_papers():
    """Yield one dict per Papers row, keyed by the renamed field names.

    `paper_field_names` maps source column name -> output field name; the
    select list and the dict keys are built from the same mapping so they
    stay aligned positionally.
    """
    cursor = row_conn.cursor()
    columns = ', '.join(paper_field_names)
    renamed = list(paper_field_names.values())
    progress = ItemsPerSecondBar('papers', max=row_count('Papers'))
    for row in cursor.execute(f'select {columns} from Papers'):
        yield dict(zip(renamed, row))
        progress.next()
    progress.finish()
# Example #5 (artifact of the page this file was scraped from)
def merge_paper_data():
    """Merge per-paper partial records from several feeder processes.

    Each feeder pushes (paper_id, partial_dict) tuples, ordered by paper id,
    onto its queue, ending with a 'STOP' sentinel.  A k-way merge + groupby
    joins them into one dict per paper; numeric journal/conference ids are
    then resolved to display names, empty fields stripped, and the finished
    record yielded.
    """
    conn = make_connection()

    # Lookup tables for resolving ids to human-readable names.
    journals = dict(
        conn.execute('select JournalId, DisplayName from Journals'))
    conference_series = dict(
        conn.execute(
            'select ConferenceSeriesId, DisplayName from ConferenceSeries'))
    conference_instances = dict(
        conn.execute(
            'select ConferenceInstanceId, DisplayName from ConferenceInstances'
        ))
    print('assembler started, looping')
    feeder_creators = [
        make_paper_feed_proc, make_author_affiliation_feed_proc,
        make_citation_feed_proc, make_references_feed_proc, make_url_feed_proc
    ]
    procs = []
    queues = []
    iters = []
    for creator in feeder_creators:
        q, p = creator()
        procs.append(p)
        queues.append(q)
        iters.append(iter(q.get, 'STOP'))
    print('iterating')
    # NOTE: previously assigned to a local named `row_count`, shadowing the
    # module-level row_count() helper used by the other generators.
    paper_total = count_papers()
    ips = ItemsPerSecondBar('Merging', max=paper_total)
    for pid, group in groupby(merge(*iters, key=lambda x: x[0]),
                              key=lambda x: x[0]):
        _, paper = next(group)
        for _, remaining in group:
            paper.update(remaining)

        # Resolve numeric ids to display names.  pop(..., None) tolerates
        # records where a feeder supplied no value for the column (the old
        # bare pop raised KeyError in that case).
        cii = paper.pop('conferenceinstanceid', None)
        if isinstance(cii, int):
            paper['conferenceinstance'] = conference_instances[cii]
        csi = paper.pop('conferenceseriesid', None)
        if isinstance(csi, int):
            paper['conferenceseries'] = conference_series[csi]
        ji = paper.pop('journalid', None)
        if isinstance(ji, int):
            paper['journal'] = journals[ji]
        strip_empty_fields(paper)
        yield paper
        ips.next()
    ips.finish()
def generate_url_updates():
    """Yield Solr 'set' updates with each paper's list of source URLs.

    Rows are ordered by PaperId, so a paper's URLs are contiguous; a group
    is emitted whenever the PaperId changes.
    """
    c = conn.cursor()
    query = 'select PaperId, SourceUrl from PaperUrls order by PaperId'
    prev_pid = None
    urls = []
    ips = ItemsPerSecondBar('urls', max=row_count('PaperUrls'))
    for paperid, url in c.execute(query):
        if prev_pid is None:
            prev_pid = paperid
        elif prev_pid != paperid:
            yield {'id': prev_pid, 'urls': {'set': urls}}
            urls = []
            prev_pid = paperid
        urls.append(url)
        ips.next()
    # BUG FIX: the last paper's URL group was never yielded after the loop.
    if prev_pid is not None:
        yield {'id': prev_pid, 'urls': {'set': urls}}
    ips.finish()
# Example #7 (artifact of the page this file was scraped from)
def yield_parallel():
    """Fan paper assembly out across PROC_COUNT worker processes.

    A feeder process fills the input queue, workers assemble records onto
    the output queue, and each finished item is yielded here as it arrives
    (until the 'STOP' sentinel is received).
    """
    in_q = mp.JoinableQueue(maxsize=10_000)
    out_q = mp.JoinableQueue(maxsize=10_000)
    total = count_papers()
    workers = [
        mp.Process(target=assemble, args=(in_q, out_q))
        for _ in range(PROC_COUNT)
    ]
    for worker in workers:
        worker.start()

    feeder = mp.Process(target=feed, args=(in_q, ))
    feeder.start()
    progress = ItemsPerSecondBar('Merging', max=total)
    for item in iter(out_q.get, 'STOP'):
        yield item
        progress.next()
    progress.finish()
def generate_references_updates():
    """Yield Solr atomic updates setting ``references``/``references_count``.

    Rows are ordered by PaperId, so a paper's outgoing references are
    contiguous; a group is emitted whenever the PaperId changes.
    """
    c = conn.cursor()
    query = 'select PaperId, PaperReferenceId from PaperReferences order by PaperId'
    prev_pid = None
    refs = []
    ips = ItemsPerSecondBar('references', max=row_count('PaperReferences'))
    for paperid, ref in c.execute(query):
        if prev_pid is None:
            prev_pid = paperid
        elif prev_pid != paperid:
            yield {
                'id': prev_pid,
                'references': {
                    'set': refs
                },
                'references_count': {
                    'set': len(refs)
                }
            }
            refs = []
            prev_pid = paperid
        refs.append(ref)
        ips.next()
    # BUG FIX: the last paper's reference group was never yielded.
    if prev_pid is not None:
        yield {
            'id': prev_pid,
            'references': {
                'set': refs
            },
            'references_count': {
                'set': len(refs)
            }
        }
    ips.finish()