def generate_url_updates():
    """Yield one JSON update document per paper id found in PAPERURLS_FILE.

    Each input line is split on tabs into exactly three fields: the id, a
    field that is ignored, and the URL (whitespace-stripped). Consecutive
    lines that share an id are collected into a single document of the form
    ``{"id": <id>, "urls": {"set": [<url>, ...]}}``.

    The grouping only merges *adjacent* lines, so the input is assumed to be
    ordered by id. The ``{"set": ...}`` shape presumably targets a
    Solr-style atomic update -- confirm against the consumer.

    Yields:
        str: a JSON-encoded update document for each run of equal ids,
        including the final run in the file.

    Raises:
        ValueError: if a line does not contain exactly three tab-separated
            fields (from the tuple unpacking).
    """
    current_id = None
    urls = []
    for line in read_gzip_lines(PAPERURLS_FILE, _ENCODING):
        next_id, _, url = line.split('\t')
        if next_id != current_id:
            # A new id begins: emit whatever was collected for the old one.
            # The emptiness check skips the very first line, where there is
            # no previous group yet.
            if urls:
                yield json.dumps({'id': current_id, 'urls': {'set': urls}})
            current_id = next_id
            urls = []
        urls.append(url.strip())
    # Flush the final group; without this the last id in the file would
    # never be emitted because no "next id" arrives to trigger it.
    if urls:
        yield json.dumps({'id': current_id, 'urls': {'set': urls}})
def generate_reference_updates():
    """Yield one JSON update document per paper id found in REFERENCES_FILE.

    Each input line is split on tabs into exactly two fields: the citing
    paper id and a reference (whitespace-stripped). Consecutive lines that
    share a paper id are collected into a single document of the form
    ``{"PaperId": <id>, "References": {"set": [<ref>, ...]}}``.

    The grouping only merges *adjacent* lines, so the input is assumed to be
    ordered by paper id. The ``{"set": ...}`` shape presumably targets a
    Solr-style atomic update -- confirm against the consumer.

    Yields:
        str: a JSON-encoded update document for each run of equal paper
        ids, including the final run in the file.

    Raises:
        ValueError: if a line does not contain exactly two tab-separated
            fields (from the tuple unpacking).
    """
    current_id = None
    references = []
    for line in read_gzip_lines(REFERENCES_FILE, _ENCODING):
        next_id, ref = line.split('\t')
        if next_id != current_id:
            # A new paper begins: emit the references gathered so far.
            # Nothing has been gathered yet on the very first line.
            if references:
                yield json.dumps({
                    'PaperId': current_id,
                    'References': {'set': references},
                })
            current_id = next_id
            references = []
        references.append(ref.strip())
    # Flush the final group; without this the last paper in the file would
    # never be emitted because no "next id" arrives to trigger it.
    if references:
        yield json.dumps({
            'PaperId': current_id,
            'References': {'set': references},
        })
def generate_paper_author_affiliations():
    """Yield ``(paper_id, author_ids)`` for each paper in PAPER_AUTHOR_FILE.

    Records come from ``generate_json_dict`` applied to the gzip lines and
    are turned into ``PaperAuthorAffiliation`` objects. Consecutive records
    with the same ``PaperId`` are grouped; within a group the author ids are
    ordered by integer ``AuthorSequenceNumber``. Records in one group that
    share a sequence number overwrite each other, so only the last one is
    kept.

    The grouping only merges *adjacent* records, so the input is assumed to
    be ordered by paper id.

    A progress counter is printed to stdout every 10,000 records, with a
    line break every 100,000.

    Yields:
        tuple: ``(paper_id, [author_id, ...])`` for each run of equal paper
        ids, including the final run in the file.
    """
    current_paper_id = None
    rels = {}
    records = generate_json_dict(
        PaperAuthorAffiliations,
        read_gzip_lines(PAPER_AUTHOR_FILE, _ENCODING))
    for idx, thing in enumerate(records):
        # Lightweight progress output for long-running imports.
        if idx % 10_000 == 0:
            print(f'{idx}', end=' ')
        if idx % 100_000 == 0:
            print()
        rel = PaperAuthorAffiliation(**thing)
        if rel.PaperId != current_paper_id:
            # A new paper begins: emit the authors of the previous one.
            # Nothing has been gathered yet on the very first record.
            if rels:
                yield current_paper_id, [
                    rels[seq].AuthorId for seq in sorted(rels)
                ]
            current_paper_id = rel.PaperId
            rels = {}
        # Key by the numeric sequence number so sorting is numeric rather
        # than lexicographic.
        rels[int(rel.AuthorSequenceNumber)] = rel
    # Flush the final group; without this the last paper in the file would
    # never be emitted because no "next paper" arrives to trigger it.
    if rels:
        yield current_paper_id, [
            rels[seq].AuthorId for seq in sorted(rels)
        ]
def read_all():
    """Yield the lines of every file returned by ``file_list()``, in order.

    Each file is read through ``read_gzip_lines`` with ``_ENCODING``; the
    files are consumed one after another.

    NOTE(review): a second ``read_all`` definition follows this one in the
    visible source. If both live in the same module, the later definition
    replaces this one -- confirm which is intended.
    """
    for path in file_list():
        lines = read_gzip_lines(path, _ENCODING)
        yield from lines
def read_all():
    """Yield the lines of ``_DATA`` as read by ``read_gzip_lines``.

    NOTE(review): another ``read_all`` definition precedes this one in the
    visible source. If both live in the same module, this later definition
    is the one that takes effect -- confirm which is intended.
    """
    lines = read_gzip_lines(_DATA, _ENCODING)
    yield from lines
def generate_denormailzed_papers():
    """Yield the lines of DENORM_PAPER_FILE, read with ``_ENCODING``."""
    lines = read_gzip_lines(DENORM_PAPER_FILE, _ENCODING)
    yield from lines
def generate_conference_instance_updates():
    """Yield the lines of CONF_INST_UPDATE_FILE, read with ``_ENCODING``."""
    lines = read_gzip_lines(CONF_INST_UPDATE_FILE, _ENCODING)
    yield from lines
def generate_conference_series_updates():
    """Yield the lines of CONF_SER_UPDATE_FILE, read with ``_ENCODING``."""
    lines = read_gzip_lines(CONF_SER_UPDATE_FILE, _ENCODING)
    yield from lines
def generate_journal_updates():
    """Yield the lines of JOURNAL_UPDATE_FILE, read with ``_ENCODING``."""
    lines = read_gzip_lines(JOURNAL_UPDATE_FILE, _ENCODING)
    yield from lines
def generate_author_updates():
    """Yield the lines of AUTHOR_UPDATE_FILE, read with ``_ENCODING``."""
    lines = read_gzip_lines(AUTHOR_UPDATE_FILE, _ENCODING)
    yield from lines
def generate_conference_series():
    """Yield what ``generate_json_string`` produces for CONF_SERIES_FILE.

    The gzip lines are read with ``_ENCODING`` and handed, together with
    ``ConferenceSeries``, to ``generate_json_string``. Presumably that
    yields one JSON string per record -- confirm against the helper.
    """
    raw_lines = read_gzip_lines(CONF_SERIES_FILE, _ENCODING)
    documents = generate_json_string(ConferenceSeries, raw_lines)
    yield from documents
def generate_conference_instances():
    """Yield what ``generate_json_string`` produces for CONF_INST_FILE.

    The gzip lines are read with ``_ENCODING`` and handed, together with
    ``ConferenceInstances``, to ``generate_json_string``. Presumably that
    yields one JSON string per record -- confirm against the helper.
    """
    raw_lines = read_gzip_lines(CONF_INST_FILE, _ENCODING)
    documents = generate_json_string(ConferenceInstances, raw_lines)
    yield from documents
def generate_journals():
    """Yield what ``generate_json_string`` produces for JOURNALS_FILE.

    The gzip lines are read with ``_ENCODING`` and handed, together with
    ``Journals``, to ``generate_json_string``. Presumably that yields one
    JSON string per record -- confirm against the helper.
    """
    raw_lines = read_gzip_lines(JOURNALS_FILE, _ENCODING)
    documents = generate_json_string(Journals, raw_lines)
    yield from documents
def generate_papers():
    """Yield what ``generate_json_string`` produces for PAPERS_FILE.

    The gzip lines are read with ``_ENCODING`` and handed, together with
    ``Papers``, to ``generate_json_string``. Presumably that yields one
    JSON string per record -- confirm against the helper.
    """
    raw_lines = read_gzip_lines(PAPERS_FILE, _ENCODING)
    documents = generate_json_string(Papers, raw_lines)
    yield from documents
def generate_authors():
    """Yield what ``generate_json_string`` produces for AUTHORS_FILE.

    The gzip lines are read with ``_ENCODING`` and handed, together with
    ``Authors``, to ``generate_json_string``. Presumably that yields one
    JSON string per record -- confirm against the helper.
    """
    raw_lines = read_gzip_lines(AUTHORS_FILE, _ENCODING)
    documents = generate_json_string(Authors, raw_lines)
    yield from documents
def read_gzip_lines_utf8(path):
    """Yield the lines of the gzip file at ``path`` using ``_ENCODING``.

    Thin wrapper around ``read_gzip_lines`` that fixes the ``encoding``
    argument to the module-level ``_ENCODING``.

    NOTE(review): the function name implies UTF-8, but the encoding actually
    used is whatever ``_ENCODING`` is set to -- confirm they agree.
    """
    lines = read_gzip_lines(path, encoding=_ENCODING)
    yield from lines