def build_test_corpus(source_file, target_file):
    # Remove any stale corpus database before rebuilding it.
    try:
        os.unlink(target_file)
    except OSError:
        pass

    # Write 100 randomly generated documents as JSON lines.
    with open(source_file, 'w') as tf:
        for i in range(100):
            json.dump(
                {
                    FieldNames.TITLE: ' '.join(random.sample(WORDS, 10)),
                    FieldNames.ABSTRACT: ' '.join(random.sample(WORDS, 1000)),
                    FieldNames.AUTHORS: [],
                    FieldNames.OUT_CITATIONS: [
                        str(x) for x in random.sample(range(100), 2)
                    ],
                    FieldNames.IN_CITATION_COUNT: len(
                        [str(x) for x in random.sample(range(100), 2)]
                    ),
                    FieldNames.KEY_PHRASES: random.sample(WORDS, 3),
                    FieldNames.YEAR: 2011,
                    FieldNames.PAPER_ID: str(i),
                    FieldNames.VENUE: 'v-{}'.format(random.randint(1, 5))
                },
                tf
            )
            tf.write('\n')

    Corpus.build(target_file, source_file)
def _verify(db_filename, corpus_json):
    try:
        Corpus.build(db_filename=db_filename, source_json=corpus_json)
    except Exception as e:
        logging.critical(
            "Failed to build corpus {} for file {}".format(db_filename, corpus_json)
        )
        print(e)
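# A minimal usage sketch, not part of the original module: exercising the two
# helpers above from a test. The pytest 'tmpdir' fixture and the file names
# below are illustrative assumptions.
def test_build_and_verify_corpus(tmpdir):
    source_json = str(tmpdir.join('test_corpus.json'))   # hypothetical path
    target_db = str(tmpdir.join('test_corpus.sqlite'))   # hypothetical path
    build_test_corpus(source_json, target_db)
    # Rebuild the same JSON into a second database via _verify, which logs on failure.
    _verify(db_filename=str(tmpdir.join('verify.sqlite')), corpus_json=source_json)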
def main(self, args): logging.info("Reading Open Corpus file from: {}".format( self.input_path)) logging.info("Writing json file to: {}".format(self.output_path)) dp = DatasetPaths() assert os.path.exists(self.input_path) assert not os.path.exists(self.output_path) assert not os.path.exists(dp.get_pkl_path('oc')) with open(self.output_path, 'w') as f: for obj in tqdm.tqdm(file_util.read_json_lines(self.input_path)): if 'year' not in obj: continue translated_obj = { FieldNames.PAPER_ID: obj['id'], FieldNames.TITLE_RAW: obj['title'], FieldNames.ABSTRACT_RAW: obj['paperAbstract'], FieldNames.AUTHORS: [a['name'] for a in obj['authors']], FieldNames.IN_CITATION_COUNT: len(obj['inCitations']), FieldNames.KEY_PHRASES: obj['keyPhrases'], FieldNames.OUT_CITATIONS: obj['outCitations'], FieldNames.URLS: obj['pdfUrls'], FieldNames.S2_URL: obj['s2Url'], FieldNames.VENUE: obj['venue'], FieldNames.YEAR: obj['year'], FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])), FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract'])) } f.write(json.dumps(translated_obj)) f.write("\n") f.close() oc_corpus = Corpus.build(dp.get_db_path('oc'), dp.get_json_path('oc')) pickle.dump(oc_corpus, open(dp.get_pkl_path('oc')))
s = 0  # counter used to cap the converted sample
with open(output_path, 'w') as f:
    for obj in tqdm.tqdm(file_util.read_json_lines(input_path)):
        if 'year' not in obj:
            continue
        translated_obj = {
            FieldNames.PAPER_ID: obj['id'],
            FieldNames.TITLE_RAW: obj['title'],
            FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            FieldNames.IN_CITATION_COUNT: 0,
            FieldNames.KEY_PHRASES: obj['keyPhrases'],
            FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS: obj['pdfUrls'],
            FieldNames.S2_URL: obj['s2Url'],
            FieldNames.VENUE: obj['venue'],
            FieldNames.YEAR: obj['year'],
            FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT: ' '.join(
                global_tokenizer(obj['paperAbstract'])
            )
        }
        s += 1
        # Stop after a small sample of records.
        if s == 10:
            break
        f.write(json.dumps(translated_obj))
        f.write("\n")

oc_corpus = Corpus.build(dp.get_db_path('oc'), [output_path])
with open(output_pkl_path, 'wb') as f:
    pickle.dump(oc_corpus, f, -1)  # -1 selects the highest pickle protocol
def main(self, args):
    if self.dataset_name == 'dblp':
        input_path = DatasetPaths.DBLP_GOLD_DIR
        output_path = DatasetPaths.DBLP_CORPUS_JSON
    elif self.dataset_name == 'pubmed':
        input_path = DatasetPaths.PUBMED_GOLD_DIR
        output_path = DatasetPaths.PUBMED_CORPUS_JSON
    else:
        assert False

    logging.info("Reading Gold data from {}".format(input_path))
    logging.info("Writing corpus to {}".format(output_path))
    assert os.path.exists(input_path)
    assert not os.path.exists(output_path)

    papers_file = os.path.join(input_path, "papers.txt")
    abstracts_file = os.path.join(input_path, "abstracts.txt")
    keyphrases_file = os.path.join(input_path, "paper_keyphrases.txt")
    citations_file = os.path.join(input_path, "paper_paper.txt")
    authors_file = os.path.join(input_path, "paper_author.txt")
    venues_file = os.path.join(input_path, "paper_venue.txt")

    paper_titles = {}
    paper_years = {}
    paper_abstracts = {}
    paper_keyphrases = {}
    paper_citations = {}
    paper_in_citations = {}
    paper_authors = {}
    paper_venues = {}

    # Papers with an empty abstract are flagged as bad and excluded later.
    bad_ids = set()
    for line in file_util.read_lines(abstracts_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if len(parts) == 2:
            paper_abstracts[paper_id] = parts[1]
        else:
            paper_abstracts[paper_id] = ""

        if paper_abstracts[paper_id] == "":
            bad_ids.add(paper_id)

    for line in file_util.read_lines(papers_file):
        parts = line.split('\t')
        paper_id = int(parts[0])
        paper_years[paper_id] = int(parts[2])
        paper_titles[paper_id] = parts[3]

    for line in file_util.read_lines(keyphrases_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if paper_id not in paper_keyphrases:
            paper_keyphrases[paper_id] = []
        for kp in parts[1:]:
            kp = kp.strip()
            if len(kp) > 0:
                # Drop the fixed 4-character suffix from each keyphrase.
                paper_keyphrases[paper_id].append(kp[:-4])

    # Build both outgoing and incoming citation maps, skipping bad papers.
    for line in file_util.read_lines(citations_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if paper_id not in paper_citations:
            paper_citations[paper_id] = []
        c = int(parts[1])
        if c in bad_ids:
            continue
        paper_citations[paper_id].append(str(c))
        if c not in paper_in_citations:
            paper_in_citations[c] = []
        if paper_id not in paper_in_citations:
            paper_in_citations[paper_id] = []

        paper_in_citations[c].append(paper_id)

    for line in file_util.read_lines(authors_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if paper_id not in paper_authors:
            paper_authors[paper_id] = []

        paper_authors[paper_id].append(parts[1])

    for line in file_util.read_lines(venues_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        paper_venues[paper_id] = parts[1]

    # Print one paper as a quick sanity check of the parsed data.
    test_paper_id = 13
    print("==== Test Paper Details ====")
    print(paper_titles[test_paper_id])
    print(paper_years[test_paper_id])
    print(paper_abstracts[test_paper_id])
    print(paper_keyphrases[test_paper_id])
    print(paper_citations[test_paper_id])
    print(paper_in_citations[test_paper_id])
    print(paper_authors[test_paper_id])
    print(paper_venues[test_paper_id])
    print("==== Test Paper Details ====")

    def _print_len(x, name=''):
of {} = {}".format(name, len(x))) _print_len(paper_titles, 'Titles') _print_len(paper_years, 'Years') _print_len(paper_abstracts, 'Abstracts') _print_len(paper_keyphrases, 'KeyPhrases') _print_len(paper_citations, 'Paper Citations') _print_len(paper_in_citations, 'Paper In citations') _print_len(paper_authors, ' Authors') _print_len(paper_venues, ' Venues') logging.info("Skipped {} papers due to insufficient data".format( len(bad_ids))) corpus = {} for id, title in tqdm.tqdm(paper_titles.items()): if id in bad_ids: continue doc = document_from_dict({ FieldNames.PAPER_ID: str(id), FieldNames.TITLE: ' '.join(global_tokenizer(title)), FieldNames.ABSTRACT: ' '.join(global_tokenizer(paper_abstracts[id])), FieldNames.OUT_CITATIONS: paper_citations.get(id, []), FieldNames.YEAR: paper_years[id], FieldNames.AUTHORS: paper_authors.get(id, []), FieldNames.KEY_PHRASES: paper_keyphrases[id], FieldNames.OUT_CITATION_COUNT: len(paper_citations.get(id, [])), FieldNames.IN_CITATION_COUNT: len(paper_in_citations.get(id, [])), FieldNames.VENUE: paper_venues.get(id, ''), FieldNames.TITLE_RAW: title, FieldNames.ABSTRACT_RAW: paper_abstracts[id] }) corpus[id] = doc with open(output_path, 'w') as f: for _, doc in corpus.items(): doc_json = dict_from_document(doc) f.write(json.dumps(doc_json)) f.write("\n") dp = DatasetPaths() Corpus.build(dp.get_db_path(self.dataset_name), dp.get_json_path(self.dataset_name))
def end_to_end_training(model_options: ModelOptions, dataset_type, models_dir, models_ann_dir=None):
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print("Have to build the database! This may take a while, but should only happen once.")
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making featurizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers
        )
        featurizer.fit(corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None
    )

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME), overwrite=True
    )

    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME), overwrite=True
        )

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model
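# A hedged driver sketch (assumption, not from the original source): invoking
# end_to_end_training for the 'dblp' dataset. Constructing ModelOptions with
# its defaults and the output directory below are illustrative placeholders.
if __name__ == '__main__':
    opts = ModelOptions()  # assumes default hyperparameters are usable as-is
    end_to_end_training(
        model_options=opts,
        dataset_type='dblp',
        models_dir='/tmp/citeomatic_dblp_model',  # hypothetical output directory
    )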