def generate_cited_by_updates():
    # PaperReferences is ordered by PaperReferenceId, so all citations of a
    # paper arrive together; buffer up to 10,000 papers before flushing.
    c = conn.cursor()
    query = 'select PaperReferenceId, PaperId from PaperReferences order by PaperReferenceId'
    dd = collections.defaultdict(list)
    ips = ItemsPerSecondBar('citations', max=row_count('PaperReferences'))
    for paperid, citation in c.execute(query):
        if len(dd) == 10_000 and paperid not in dd:
            for k, v in dd.items():
                yield {
                    'id': k,
                    'cited_by': {'set': v},
                    'cited_by_count': {'set': len(v)}
                }
            dd = collections.defaultdict(list)
        dd[paperid].append(citation)
        ips.next()
    # Flush whatever is still buffered after the last row.
    for k, v in dd.items():
        yield {
            'id': k,
            'cited_by': {'set': v},
            'cited_by_count': {'set': len(v)}
        }
    ips.finish()
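# None of the generators in this module are self-contained: they lean on a
# module-level SQLite connection, a second row_conn used by generate_papers(),
# a row_count() helper for the progress-bar maxima, and an ItemsPerSecondBar
# progress class defined elsewhere. A minimal sketch of that assumed context;
# the database path and exact definitions are guesses, not the project's code:
import collections
import sqlite3

conn = sqlite3.connect('mag.db')      # hypothetical path to the MAG database
row_conn = sqlite3.connect('mag.db')  # separate connection for row streaming


def row_count(table):
    # Table names come from trusted call sites only, so the f-string is safe.
    return conn.execute(f'select count(*) from {table}').fetchone()[0]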
def generate_author_affiliation_updates():
    c = conn.cursor()
    query = 'select PaperId, OriginalAuthor, OriginalAffiliation ' \
            'from PaperAuthorAffiliations order by PaperId, AuthorSequenceNumber'
    authors = []
    affiliations = []
    prev_pid = None
    ips = ItemsPerSecondBar('author/affiliations', max=row_count('PaperAuthorAffiliations'))
    for paperid, author, affiliation in c.execute(query):
        if prev_pid is None:
            prev_pid = paperid
        elif prev_pid != paperid:
            yield {
                'id': prev_pid,
                'author': {'set': authors},
                'affiliation': {'set': affiliations}
            }
            authors = []
            affiliations = []
            prev_pid = paperid
        authors.append(author)
        affiliations.append(affiliation)
        ips.next()
    # Emit the group for the final paper, which the loop never reaches.
    if prev_pid is not None:
        yield {
            'id': prev_pid,
            'author': {'set': authors},
            'affiliation': {'set': affiliations}
        }
    ips.finish()
def generate_journal_updates():
    c = conn.cursor()
    query = 'select PaperId, J.DisplayName from Papers P inner join Journals J on P.JournalId = J.JournalId'
    ips = ItemsPerSecondBar('journals')
    for paperid, journal in c.execute(query):
        yield {'id': paperid, 'journal': {'set': journal}}
        ips.next()
    ips.finish()
def generate_papers():
    c = row_conn.cursor()
    fields = ', '.join(paper_field_names)
    new_names = list(paper_field_names.values())
    ips = ItemsPerSecondBar('papers', max=row_count('Papers'))
    for paper in c.execute(f'select {fields} from Papers'):
        # Rename SQL columns to their output field names as each row streams by.
        yield dict(zip(new_names, paper))
        ips.next()
    ips.finish()
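# paper_field_names is defined elsewhere; it is assumed to map Papers columns
# to output field names, including the lowercase id fields that
# merge_paper_data() below pops and resolves. A truncated, illustrative guess:
paper_field_names = {
    'PaperId': 'id',
    'OriginalTitle': 'title',
    'Year': 'year',
    'Doi': 'doi',
    'JournalId': 'journalid',
    'ConferenceSeriesId': 'conferenceseriesid',
    'ConferenceInstanceId': 'conferenceinstanceid',
}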
def merge_paper_data():
    conn = make_connection()
    journals = dict(conn.execute('select JournalId, DisplayName from Journals'))
    conference_series = dict(
        conn.execute('select ConferenceSeriesId, DisplayName from ConferenceSeries'))
    conference_instances = dict(
        conn.execute('select ConferenceInstanceId, DisplayName from ConferenceInstances'))
    print('assembler started, looping')

    # One feeder process per source table; each queue is drained until its
    # 'STOP' sentinel appears.
    feeder_creators = [
        make_paper_feed_proc, make_author_affiliation_feed_proc,
        make_citation_feed_proc, make_references_feed_proc, make_url_feed_proc
    ]
    procs = []
    queues = []
    iters = []
    for creator in feeder_creators:
        q, p = creator()
        procs.append(p)
        queues.append(q)
        iters.append(iter(q.get, 'STOP'))

    print('iterating')
    ips = ItemsPerSecondBar('Merging', max=count_papers())
    # Every feeder yields (paper_id, partial_doc) pairs sorted by paper_id, so
    # merge + groupby lines up all the partials for one paper at a time.
    for pid, group in groupby(merge(*iters, key=lambda x: x[0]), key=lambda x: x[0]):
        _, paper = next(group)
        for _, remaining in group:
            paper.update(remaining)
        # Replace the raw venue ids with the display names looked up above.
        cii = paper.pop('conferenceinstanceid')
        if type(cii) is int:
            paper['conferenceinstance'] = conference_instances[cii]
        csi = paper.pop('conferenceseriesid')
        if type(csi) is int:
            paper['conferenceseries'] = conference_series[csi]
        ji = paper.pop('journalid')
        if type(ji) is int:
            paper['journal'] = journals[ji]
        strip_empty_fields(paper)
        yield paper
        ips.next()
    ips.finish()
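# The make_*_feed_proc helpers are not shown in this section. A hypothetical
# sketch of the pattern merge_paper_data() relies on: each wraps one source
# generator in its own process and streams (paper_id, partial_doc) pairs,
# already sorted by paper_id, into a bounded queue, closing with the 'STOP'
# sentinel that iter(q.get, 'STOP') watches for. make_feed_proc and its
# argument convention are assumptions, not the project's actual code.
def make_feed_proc(pair_generator_fn):
    q = mp.Queue(maxsize=10_000)

    def run():
        for paper_id, partial in pair_generator_fn():
            q.put((paper_id, partial))
        q.put('STOP')

    p = mp.Process(target=run)  # relies on the fork start method
    p.start()
    return q, p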
def generate_url_updates():
    c = conn.cursor()
    query = 'select PaperId, SourceUrl from PaperUrls order by PaperId'
    prev_pid = None
    urls = []
    ips = ItemsPerSecondBar('urls', max=row_count('PaperUrls'))
    for paperid, url in c.execute(query):
        if prev_pid is None:
            prev_pid = paperid
        elif prev_pid != paperid:
            yield {'id': prev_pid, 'urls': {'set': urls}}
            urls = []
            prev_pid = paperid
        urls.append(url)
        ips.next()
    # Emit the final paper's URLs, which the loop never flushes.
    if prev_pid is not None:
        yield {'id': prev_pid, 'urls': {'set': urls}}
    ips.finish()
def yield_parallel():
    input_queue = mp.JoinableQueue(maxsize=10_000)
    output_queue = mp.JoinableQueue(maxsize=10_000)
    # Workers assemble documents from input_queue; a single feeder fills it.
    assemblers = []
    for _ in range(PROC_COUNT):
        proc = mp.Process(target=assemble, args=(input_queue, output_queue))
        proc.start()
        assemblers.append(proc)
    feeder = mp.Process(target=feed, args=(input_queue, ))
    feeder.start()
    ips = ItemsPerSecondBar('Merging', max=count_papers())
    for jsonl in iter(output_queue.get, 'STOP'):
        yield jsonl
        ips.next()
    ips.finish()
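# assemble() and feed() are defined elsewhere in the project; the sketch below
# shows one plausible contract matching yield_parallel() above, assuming the
# fork start method so the module-level Value is shared. The sentinel
# accounting (exactly one 'STOP' must reach the consumer) is a guess at how
# the real code behaves, not a description of it.
import json

_live_assemblers = mp.Value('i', PROC_COUNT)  # hypothetical shared counter


def feed(input_queue):
    # Producer: stream merged papers, then one sentinel per assembler.
    for paper in merge_paper_data():
        input_queue.put(paper)
    for _ in range(PROC_COUNT):
        input_queue.put('STOP')


def assemble(input_queue, output_queue):
    # Worker: serialise each paper to a JSON line; the last worker to finish
    # forwards a single 'STOP' so yield_parallel() knows the stream is done.
    for paper in iter(input_queue.get, 'STOP'):
        output_queue.put(json.dumps(paper))
    with _live_assemblers.get_lock():
        _live_assemblers.value -= 1
        if _live_assemblers.value == 0:
            output_queue.put('STOP')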
def generate_references_updates():
    c = conn.cursor()
    query = 'select PaperId, PaperReferenceId from PaperReferences order by PaperId'
    prev_pid = None
    refs = []
    ips = ItemsPerSecondBar('references', max=row_count('PaperReferences'))
    for paperid, ref in c.execute(query):
        if prev_pid is None:
            prev_pid = paperid
        elif prev_pid != paperid:
            yield {
                'id': prev_pid,
                'references': {'set': refs},
                'references_count': {'set': len(refs)}
            }
            refs = []
            prev_pid = paperid
        refs.append(ref)
        ips.next()
    # Emit the final paper's references, which the loop never flushes.
    if prev_pid is not None:
        yield {
            'id': prev_pid,
            'references': {'set': refs},
            'references_count': {'set': len(refs)}
        }
    ips.finish()
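# The {'field': {'set': value}} documents produced by the generators above
# look like Solr's atomic-update syntax. A hypothetical driver that posts
# batches to Solr's JSON update endpoint; the URL, core name, and batch size
# are illustrative, not the project's actual configuration.
import itertools

import requests

SOLR_UPDATE_URL = 'http://localhost:8983/solr/mag/update'


def push_updates(generate_updates, batch_size=1000):
    docs = generate_updates()
    while True:
        batch = list(itertools.islice(docs, batch_size))
        if not batch:
            break
        requests.post(SOLR_UPDATE_URL, json=batch).raise_for_status()
    # Make the updates visible once everything is in.
    requests.post(SOLR_UPDATE_URL, params={'commit': 'true'}, json=[]).raise_for_status()


# e.g. push_updates(generate_references_updates)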