def main(self, args):
    """Print summary statistics for the corpus chosen by ``self.dataset_type``.

    The corpus is loaded from a pickle when ``self.dataset_type == 'oc'`` and
    from the database path otherwise. The report covers the document count,
    the number of unique authors, key phrases and venues, the number of
    documents with at least one key phrase, the mean and standard deviation
    of in/out citation counts, the size and year span of the training,
    validation and testing splits, and the ten most common authors.

    Args:
        args: Not used by this method.
    """
    dp = DatasetPaths()
    if self.dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
    else:
        corpus = Corpus.load(dp.get_db_path(self.dataset_type))

    def print_split_summary(label, doc_ids):
        # np.min / np.max raise ValueError on an empty sequence, so an empty
        # split is reported with "n/a" bounds instead of aborting the report.
        split_years = [corpus[doc_id].year for doc_id in doc_ids]
        if split_years:
            first_year, last_year = np.min(split_years), np.max(split_years)
        else:
            first_year = last_year = "n/a"
        print("No. of {} examples = {} ({} to {})".format(
            label, len(doc_ids), first_year, last_year))

    authors = Counter()
    key_phrases = Counter()
    venues = Counter()
    num_docs_with_kp = 0
    in_citations_counts = []
    out_citations_counts = []

    for doc in corpus:
        authors.update(doc.authors)
        key_phrases.update(doc.key_phrases)
        if doc.key_phrases:
            num_docs_with_kp += 1
        in_citations_counts.append(doc.in_citation_count)
        out_citations_counts.append(doc.out_citation_count)
        # Wrapped in a list so the venue is counted as one item rather than
        # being iterated element by element.
        venues.update([doc.venue])

    print("No. of documents = {}".format(len(corpus)))
    print("Unique number of authors = {}".format(len(authors)))
    print("Unique number of key phrases = {}".format(len(key_phrases)))
    print("Unique number of venues = {}".format(len(venues)))
    print("No. of docs with key phrases = {}".format(num_docs_with_kp))
    print("Average in citations = {} (+/- {})".format(
        np.mean(in_citations_counts), np.std(in_citations_counts)))
    print("Average out citations = {} (+/- {})".format(
        np.mean(out_citations_counts), np.std(out_citations_counts)))
    print_split_summary("training", corpus.train_ids)
    print_split_summary("validation", corpus.valid_ids)
    print_split_summary("testing", corpus.test_ids)
    print(authors.most_common(10))
def main(self, args):
    """Evaluate candidate selection plus ranking and print the results.

    Loads the corpus for ``self.dataset_type``, builds the citation ranker
    named by ``self.ranker_type`` ('none' or 'neural'), then runs
    ``eval_text_model`` once per candidate count with a candidate selector
    of kind ``self.candidate_selector_type`` ('bm25', 'ann' or 'oracle').
    Finally prints all results as JSON, followed by the candidate count
    with the highest ``self.metric + "_1"`` value and that value.

    Args:
        args: Not used by this method.
    """
    paths = DatasetPaths()
    if self.dataset_type == 'oc':
        corpus = Corpus.load_pkl(paths.get_pkl_path(self.dataset_type))
    else:
        corpus = Corpus.load(paths.get_db_path(self.dataset_type))

    # Build the ranker that re-orders the selected candidates.
    if self.ranker_type == 'none':
        citation_ranker = NoneRanker()
    elif self.ranker_type == 'neural':
        assert self.citation_ranker_dir is not None
        ranker_featurizer, ranker_models = model_from_directory(
            self.citation_ranker_dir, on_cpu=True)
        citation_ranker = Ranker(
            corpus=corpus,
            featurizer=ranker_featurizer,
            citation_ranker=ranker_models['citeomatic'],
            num_candidates_to_rank=100)
    else:
        assert False

    results_by_count = {}

    # An explicit candidate count overrides the per-dataset sweep.
    if self.num_candidates is not None:
        candidate_counts = [self.num_candidates]
    elif self.dataset_type == 'oc':
        candidate_counts = [100]
    else:
        candidate_counts = [1, 5, 10, 15, 25, 50, 75, 100]

    def build_selector(count):
        # A fresh selector is created for every candidate count.
        if self.candidate_selector_type == 'bm25':
            return BM25CandidateSelector(
                corpus,
                paths.get_bm25_index_path(self.dataset_type),
                count,
                False)
        if self.candidate_selector_type == 'ann':
            assert self.paper_embedder_dir is not None
            embedder_featurizer, embedder_models = model_from_directory(
                self.paper_embedder_dir, on_cpu=True)
            return self._make_ann_candidate_selector(
                corpus=corpus,
                featurizer=embedder_featurizer,
                embedding_model=embedder_models['embedding'],
                num_candidates=count)
        if self.candidate_selector_type == 'oracle':
            return OracleCandidateSelector(corpus)
        assert False

    for count in candidate_counts:
        selector = build_selector(count)
        results_by_count[count] = eval_text_model(
            corpus,
            selector,
            citation_ranker,
            papers_source=self.split,
            n_eval=self.n_eval)

    # Keep the candidate count whose metric strictly beats the running best;
    # best_k stays at -1 when no score exceeds 0.0.
    best_k = -1
    best_metric = 0.0
    metric_key = self.metric + "_1"
    for count, outcome in results_by_count.items():
        score = outcome[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]
        if best_metric < score:
            best_k = count
            best_metric = score

    print(json.dumps(results_by_count, indent=4, sort_keys=True))
    print(best_k)
    print(best_metric)
def end_to_end_training(model_options: ModelOptions, dataset_type, models_dir, models_ann_dir=None):
    """Load the corpus, fit or load a featurizer, train, and save to ``models_dir``.

    Args:
        model_options: Training options. Updated in place with the
            featurizer's ``n_authors``, ``n_venues``, ``n_keyphrases`` and
            ``n_features``; when ``use_pretrained`` is set, ``dense_dim`` is
            replaced by ``dense_dim_pretrained``.
        dataset_type: Dataset identifier passed to ``DatasetPaths``; the
            value ``'oc'`` loads the corpus from a pickle instead of the DB.
        models_dir: Directory that receives the featurizer pickle, the model
            weights and the options JSON. Created if it does not exist.
        models_ann_dir: Forwarded unchanged to ``train_text_model``.

    Returns:
        The tuple ``(corpus, featurizer, model_options, citeomatic_model,
        embedding_model)``. ``embedding_model`` may be ``None``, in which
        case no embedding weights are written.
    """
    # step 1: make the directory
    # exist_ok=True replaces a separate existence check, which could race
    # with another process creating the same directory in between.
    os.makedirs(models_dir, exist_ok=True)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'
    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        # NOTE(review): a cached featurizer is reused as-is; its settings are
        # not compared with the current model_options - confirm that stale
        # caches cannot occur in practice.
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME), overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME), overwrite=True)

    # Persist the (updated) options alongside the weights.
    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model