Пример #1
0
    def main(self, args):
        dp = DatasetPaths()
        if self.dataset_type == 'oc':
            corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
        else:
            corpus = Corpus.load(dp.get_db_path(self.dataset_type))

        authors = Counter()
        key_phrases = Counter()
        years = Counter()
        venues = Counter()
        num_docs_with_kp = 0

        in_citations_counts = []
        out_citations_counts = []
        for doc in corpus:
            authors.update(doc.authors)
            key_phrases.update(doc.key_phrases)
            if len(doc.key_phrases) > 0:
                num_docs_with_kp += 1
            in_citations_counts.append(doc.in_citation_count)
            out_citations_counts.append(doc.out_citation_count)
            years.update([doc.year])
            venues.update([doc.venue])

        training_years = [corpus[doc_id].year for doc_id in corpus.train_ids]
        validation_years = [corpus[doc_id].year for doc_id in corpus.valid_ids]
        testing_years = [corpus[doc_id].year for doc_id in corpus.test_ids]

        print("No. of documents = {}".format(len(corpus)))
        print("Unique number of authors = {}".format(len(authors)))
        print("Unique number of key phrases = {}".format(len(key_phrases)))
        print("Unique number of venues = {}".format(len(venues)))
        print("No. of docs with key phrases = {}".format(num_docs_with_kp))
        print("Average in citations = {} (+/- {})".format(np.mean(in_citations_counts),
                                                          np.std(in_citations_counts)))
        print("Average out citations = {} (+/- {})".format(np.mean(out_citations_counts),
                                                           np.std(out_citations_counts)))
        print("No. of training examples = {} ({} to {})".format(len(corpus.train_ids),
                                                                np.min(training_years),
                                                                np.max(training_years)))
        print("No. of validation examples = {} ({} to {})".format(len(corpus.valid_ids),
                                                                  np.min(validation_years),
                                                                  np.max(validation_years)))
        print("No. of testing examples = {} ({} to {})".format(len(corpus.test_ids),
                                                               np.min(testing_years),
                                                               np.max(testing_years)))
        print(authors.most_common(10))
Пример #2
0
    def main(self, args):
        dp = DatasetPaths()
        if self.dataset_type == 'oc':
            corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
        else:
            corpus = Corpus.load(dp.get_db_path(self.dataset_type))

        if self.ranker_type == 'none':
            citation_ranker = NoneRanker()
        elif self.ranker_type == 'neural':
            assert self.citation_ranker_dir is not None
            ranker_featurizer, ranker_models = model_from_directory(
                self.citation_ranker_dir, on_cpu=True)
            citation_ranker = Ranker(
                corpus=corpus,
                featurizer=ranker_featurizer,
                citation_ranker=ranker_models['citeomatic'],
                num_candidates_to_rank=100)
        else:
            assert False

        candidate_results_map = {}
        if self.num_candidates is None:
            if self.dataset_type == 'oc':
                num_candidates_list = [100]
            else:
                num_candidates_list = [1, 5, 10, 15, 25, 50, 75, 100]
        else:
            num_candidates_list = [self.num_candidates]

        for num_candidates in num_candidates_list:

            if self.candidate_selector_type == 'bm25':
                index_path = dp.get_bm25_index_path(self.dataset_type)
                candidate_selector = BM25CandidateSelector(
                    corpus, index_path, num_candidates, False)
            elif self.candidate_selector_type == 'ann':
                assert self.paper_embedder_dir is not None
                featurizer, models = model_from_directory(
                    self.paper_embedder_dir, on_cpu=True)
                candidate_selector = self._make_ann_candidate_selector(
                    corpus=corpus,
                    featurizer=featurizer,
                    embedding_model=models['embedding'],
                    num_candidates=num_candidates)
            elif self.candidate_selector_type == 'oracle':
                candidate_selector = OracleCandidateSelector(corpus)
            else:
                assert False

            results = eval_text_model(corpus,
                                      candidate_selector,
                                      citation_ranker,
                                      papers_source=self.split,
                                      n_eval=self.n_eval)
            candidate_results_map[num_candidates] = results

        best_k = -1
        best_metric = 0.0
        metric_key = self.metric + "_1"
        for k, v in candidate_results_map.items():
            if best_metric < v[metric_key][EVAL_DATASET_KEYS[
                    self.dataset_type]]:
                best_k = k
                best_metric = v[metric_key][EVAL_DATASET_KEYS[
                    self.dataset_type]]

        print(json.dumps(candidate_results_map, indent=4, sort_keys=True))
        print(best_k)
        print(best_metric)
Пример #3
0
def end_to_end_training(model_options: ModelOptions,
                        dataset_type,
                        models_dir,
                        models_ann_dir=None):
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(corpus,
                       is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(os.path.join(models_dir,
                                               dp.CITEOMATIC_WEIGHTS_FILENAME),
                                  overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(os.path.join(
            models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
                                     overwrite=True)

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model