Exemplo n.º 1
0
def build_test_corpus(source_file, target_file):
    """Write a random 100-document JSON-lines corpus and build a Corpus from it.

    Args:
        source_file: Path of the JSON-lines file that is (over)written with
            the synthetic documents, one JSON object per line.
        target_file: Path passed to ``Corpus.build`` as its first argument.
            Any file already at this path is removed first.
    """
    # Start from a clean slate. Only filesystem failures (typically the file
    # not existing yet) are expected here; a bare ``except`` would also hide
    # KeyboardInterrupt/SystemExit and genuine programming errors.
    try:
        os.unlink(target_file)
    except OSError:
        pass

    with open(source_file, 'w') as tf:
        for i in range(100):
            json.dump(
                {
                    FieldNames.TITLE:
                    ' '.join(random.sample(WORDS, 10)),
                    FieldNames.ABSTRACT:
                    ' '.join(random.sample(WORDS, 1000)),
                    FieldNames.AUTHORS: [],
                    # Two distinct ids drawn from the 0..99 paper-id range.
                    FieldNames.OUT_CITATIONS:
                    [str(x) for x in random.sample(range(100), 2)],
                    # Always evaluates to 2; the sample call is kept so the
                    # sequence of random draws is unchanged.
                    FieldNames.IN_CITATION_COUNT:
                    len([str(x) for x in random.sample(range(100), 2)]),
                    FieldNames.KEY_PHRASES:
                    random.sample(WORDS, 3),
                    FieldNames.YEAR:
                    2011,
                    FieldNames.PAPER_ID:
                    str(i),
                    FieldNames.VENUE:
                    'v-{}'.format(random.randint(1, 5))
                }, tf)
            tf.write('\n')

    Corpus.build(target_file, source_file)
Exemplo n.º 2
0
 def _verify(db_filename, corpus_json):
     """Attempt to build the corpus database; report a failure instead of raising.

     Any exception from ``Corpus.build`` is logged at CRITICAL level and the
     exception itself is printed; nothing is re-raised.
     """
     try:
         Corpus.build(db_filename=db_filename, source_json=corpus_json)
     except Exception as err:
         message = "Failed to build corpus {} for file {}".format(
             db_filename, corpus_json)
         logging.critical(message)
         print(err)
Exemplo n.º 3
0
    def main(self, args):
        """Load the corpus for ``self.dataset_type`` and print summary statistics.

        ``args`` is not used by this method.
        """
        paths = DatasetPaths()
        if self.dataset_type != 'oc':
            corpus = Corpus.load(paths.get_db_path(self.dataset_type))
        else:
            corpus = Corpus.load_pkl(paths.get_pkl_path(self.dataset_type))

        # Frequency tables and per-document citation counts, filled in a
        # single pass over the corpus.
        author_counts = Counter()
        key_phrase_counts = Counter()
        year_counts = Counter()
        venue_counts = Counter()
        docs_with_key_phrases = 0
        in_counts = []
        out_counts = []

        for doc in corpus:
            author_counts.update(doc.authors)
            key_phrase_counts.update(doc.key_phrases)
            if len(doc.key_phrases) > 0:
                docs_with_key_phrases += 1
            in_counts.append(doc.in_citation_count)
            out_counts.append(doc.out_citation_count)
            year_counts[doc.year] += 1
            venue_counts[doc.venue] += 1

        # Publication years of each split, used to report the year range.
        train_years = [corpus[doc_id].year for doc_id in corpus.train_ids]
        valid_years = [corpus[doc_id].year for doc_id in corpus.valid_ids]
        test_years = [corpus[doc_id].year for doc_id in corpus.test_ids]

        print("No. of documents = {}".format(len(corpus)))
        print("Unique number of authors = {}".format(len(author_counts)))
        print("Unique number of key phrases = {}".format(
            len(key_phrase_counts)))
        print("Unique number of venues = {}".format(len(venue_counts)))
        print("No. of docs with key phrases = {}".format(
            docs_with_key_phrases))

        in_mean, in_std = np.mean(in_counts), np.std(in_counts)
        print("Average in citations = {} (+/- {})".format(in_mean, in_std))

        out_mean, out_std = np.mean(out_counts), np.std(out_counts)
        print("Average out citations = {} (+/- {})".format(out_mean, out_std))

        n_train = len(corpus.train_ids)
        first_year, last_year = np.min(train_years), np.max(train_years)
        print("No. of training examples = {} ({} to {})".format(
            n_train, first_year, last_year))

        n_valid = len(corpus.valid_ids)
        first_year, last_year = np.min(valid_years), np.max(valid_years)
        print("No. of validation examples = {} ({} to {})".format(
            n_valid, first_year, last_year))

        n_test = len(corpus.test_ids)
        first_year, last_year = np.min(test_years), np.max(test_years)
        print("No. of testing examples = {} ({} to {})".format(
            n_test, first_year, last_year))

        print(author_counts.most_common(10))
Exemplo n.º 4
0
def test_corpus_iterator():
    """Iterating a corpus must yield every document id listed in ``all_ids``."""
    corpus = Corpus.load('/tmp/foo.sqlite')
    seen_ids = {doc.id for doc in corpus}
    shared_ids = seen_ids & set(corpus.all_ids)
    assert len(shared_ids) == corpus.n_docs
Exemplo n.º 5
0
def test_featurizer_and_data_gen():
    """Check ``Featurizer`` fitting and the batches/examples a ``DataGenerator`` emits.

    Builds the synthetic corpus, fits a featurizer on it, then verifies the
    triplet generator's batch size, label set and padding, the structure of
    a listwise example, and the label set when variable margins are off.
    """
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')
    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    dg = features.DataGenerator(corpus, featurizer)
    gen = dg.triplet_generator(paper_ids=corpus.all_ids,
                               candidate_ids=corpus.all_ids,
                               batch_size=128,
                               neg_to_pos_ratio=5)

    # make sure we can get features: pull ten batches; the assertions below
    # inspect the last one drawn
    for i in range(10):
        print(i)
        X, y = next(gen)

    # correct batch size (at least the requested 128 labels)
    assert len(y) >= 128
    # positives, hard negatives, easy negatives -> three distinct label values
    assert len(np.unique(y)) == 3
    # correct padding: text inputs are as wide as the featurizer's max lengths
    assert X['query-abstract-txt'].shape[1] == featurizer.max_abstract_len
    assert X['query-title-txt'].shape[1] == featurizer.max_title_len
    # no new words: every indexed word comes from the WORDS vocabulary
    assert set(featurizer.word_indexer.word_to_index.keys()).difference(
        WORDS) == set()

    # One listwise example: a query document, its candidate documents and
    # one label per candidate.
    q, ex, labels = next(dg._listwise_examples(corpus.all_ids, corpus.all_ids))

    # query id should not be in candidates
    assert q.id not in [i.id for i in ex]

    # pos ids should be out_citations (candidates carrying the maximum label)
    pos_docs = [i.id for i, j in zip(ex, labels) if j == np.max(labels)]
    assert set(pos_docs) == set(q.out_citations)

    # neg ids should be NOT out_citations
    neg_docs = [i.id for i, j in zip(ex, labels) if j < np.max(labels)]
    assert np.all([i not in neg_docs for i in q.out_citations])

    # test variable margin off: only two distinct label values are expected
    dg = features.DataGenerator(corpus, featurizer, use_variable_margin=False)
    gen = dg.triplet_generator(paper_ids=corpus.all_ids,
                               candidate_ids=corpus.all_ids,
                               batch_size=128,
                               neg_to_pos_ratio=5)

    X, y = next(gen)
    print(dg.margins_offset_dict)
    assert len(np.unique(y)) == 2
Exemplo n.º 6
0
def _gold_citations(doc_id: str, corpus: Corpus, min_citations: int,
                    candidate_ids_pool: set):
    gold_citations_1 = set(corpus.get_citations(doc_id))

    if doc_id in gold_citations_1:
        gold_citations_1.remove(doc_id)

    citations_of_citations = []
    for c in gold_citations_1:
        citations_of_citations.extend(corpus.get_citations(c))

    gold_citations_2 = set(citations_of_citations).union(gold_citations_1)

    if doc_id in gold_citations_2:
        gold_citations_2.remove(doc_id)

    gold_citations_1.intersection_update(candidate_ids_pool)
    gold_citations_2.intersection_update(candidate_ids_pool)

    if len(gold_citations_1) < min_citations:
        return [], []

    return gold_citations_1, gold_citations_2
    def main(self, args):
        """Translate the raw Open Corpus dump to project JSON, build and pickle the corpus.

        Reads JSON lines from ``self.input_path``, writes one translated JSON
        document per line to ``self.output_path`` (records without a ``year``
        are dropped), builds the corpus database and pickles the resulting
        corpus object to the 'oc' pickle path.

        Args:
            args: Not used by this method.

        Raises:
            AssertionError: If the input file is missing, or the output JSON
                or pickle file already exists.
        """
        logging.info("Reading Open Corpus file from: {}".format(
            self.input_path))
        logging.info("Writing json file to: {}".format(self.output_path))

        dp = DatasetPaths()

        assert os.path.exists(self.input_path)
        assert not os.path.exists(self.output_path)
        assert not os.path.exists(dp.get_pkl_path('oc'))

        with open(self.output_path, 'w') as f:
            for obj in tqdm.tqdm(file_util.read_json_lines(self.input_path)):
                if 'year' not in obj:
                    continue
                translated_obj = {
                    FieldNames.PAPER_ID:
                    obj['id'],
                    FieldNames.TITLE_RAW:
                    obj['title'],
                    FieldNames.ABSTRACT_RAW:
                    obj['paperAbstract'],
                    FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
                    FieldNames.IN_CITATION_COUNT:
                    len(obj['inCitations']),
                    FieldNames.KEY_PHRASES:
                    obj['keyPhrases'],
                    FieldNames.OUT_CITATIONS:
                    obj['outCitations'],
                    FieldNames.URLS:
                    obj['pdfUrls'],
                    FieldNames.S2_URL:
                    obj['s2Url'],
                    FieldNames.VENUE:
                    obj['venue'],
                    FieldNames.YEAR:
                    obj['year'],
                    # Tokenized, space-joined variants of the raw text fields.
                    FieldNames.TITLE:
                    ' '.join(global_tokenizer(obj['title'])),
                    FieldNames.ABSTRACT:
                    ' '.join(global_tokenizer(obj['paperAbstract']))
                }
                f.write(json.dumps(translated_obj))
                f.write("\n")
        # The ``with`` block has already closed the file, so no explicit
        # close is needed here.

        # NOTE(review): the JSON was written to self.output_path but the
        # build reads dp.get_json_path('oc') -- presumably the same file;
        # confirm against the caller.
        oc_corpus = Corpus.build(dp.get_db_path('oc'), dp.get_json_path('oc'))

        # pickle needs a writable binary handle. The previous code opened
        # the path in the default read/text mode, which fails because the
        # file was asserted not to exist, and it never closed the handle.
        with open(dp.get_pkl_path('oc'), 'wb') as pkl_file:
            pickle.dump(oc_corpus, pkl_file)
Exemplo n.º 8
0
    def setUpClass(cls):
        """Build the shared test corpus, fitted featurizer and model options once."""
        build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
        test_corpus = Corpus.load('/tmp/foo.sqlite')

        # Default model options; the featurizer borrows its length limits.
        model_opts = ModelOptions()

        fitted = Featurizer(max_title_len=model_opts.max_title_len,
                            max_abstract_len=model_opts.max_abstract_len)
        fitted.fit(test_corpus, max_df_frac=1.0)

        # Copy the vocabulary sizes learned by the featurizer into the options.
        for size_name in ('n_features', 'n_authors', 'n_venues',
                          'n_keyphrases'):
            setattr(model_opts, size_name, getattr(fitted, size_name))

        cls.corpus = test_corpus
        cls.featurizer = fitted
        cls.options = model_opts
Exemplo n.º 9
0
def test_data_isolation():
    """Check that train/valid/test splits are disjoint and candidates respect them.

    First verifies that the three id splits do not overlap, then draws one
    listwise example per scenario and checks which splits its candidate
    documents come from.
    """
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    # The three splits must be pairwise disjoint.
    assert len(set(corpus.train_ids).intersection(set(corpus.valid_ids))) == 0
    assert len(set(corpus.train_ids).intersection(set(corpus.test_ids))) == 0
    assert len(set(corpus.valid_ids).intersection(set(corpus.test_ids))) == 0

    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)
    dg = features.DataGenerator(corpus, featurizer)

    # Training queries with no explicit candidate pool: no validation or
    # test documents may appear among the examples.
    # NOTE(review): presumably the candidate pool defaults to the query ids
    # when omitted -- confirm against DataGenerator._listwise_examples.
    query, examples, labels = next(dg._listwise_examples(corpus.train_ids))
    examples_ids = [doc.id for doc in examples]

    assert len(set(examples_ids).intersection(set(corpus.valid_ids))) == 0
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    # Validation queries with a valid+train candidate pool: training
    # documents are expected among the examples, test documents are not.
    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(paper_ids=corpus.valid_ids,
                              candidate_ids=corpus.valid_ids +
                              corpus.train_ids))
    examples_ids = [doc.id for doc in examples]

    assert len(set(examples_ids).intersection(set(corpus.train_ids))) > 0
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    # Test queries with a valid+train candidate pool: no test documents may
    # appear among the examples.
    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(paper_ids=corpus.test_ids,
                              candidate_ids=corpus.valid_ids +
                              corpus.train_ids))
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    # Test queries with all three splits in the candidate pool.
    # NOTE(review): examples_ids is computed below but never asserted on --
    # this scenario currently only checks that generation does not raise.
    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(paper_ids=corpus.test_ids,
                              candidate_ids=corpus.valid_ids +
                              corpus.train_ids + corpus.test_ids))
    examples_ids = [doc.id for doc in examples]
Exemplo n.º 10
0
# Translate raw input records into the project's JSON-lines format, then
# build the 'oc' corpus from that file and pickle the resulting object.
with open(output_path, 'w') as f:
    for obj in tqdm.tqdm(file_util.read_json_lines(input_path)):
        # Records without a publication year are dropped.
        if 'year' not in obj:
            continue
        translated_obj = {
            FieldNames.PAPER_ID: obj['id'],
            FieldNames.TITLE_RAW: obj['title'],
            FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            # In-citation count is hard-coded to 0 here rather than being
            # derived from the record.
            FieldNames.IN_CITATION_COUNT: 0,
            FieldNames.KEY_PHRASES: obj['keyPhrases'],
            FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS: obj['pdfUrls'],
            FieldNames.S2_URL: obj['s2Url'],
            FieldNames.VENUE: obj['venue'],
            FieldNames.YEAR: obj['year'],
            # Tokenized, space-joined variants of the raw text fields.
            FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT:
            ' '.join(global_tokenizer(obj['paperAbstract']))
        }
        # NOTE(review): `s` is not initialised anywhere in this snippet --
        # presumably set to 0 earlier in the file; confirm. The check runs
        # before the write, so the record that brings `s` to 10 is not
        # written.
        s += 1
        if s == 10:
            break
        f.write(json.dumps(translated_obj))
        f.write("\n")
# Redundant: the `with` block above has already closed `f`.
f.close()

# Corpus.build is given the JSON path wrapped in a one-element list here.
oc_corpus = Corpus.build(dp.get_db_path('oc'), [output_path])
with open(output_pkl_path, 'wb') as f:
    # Protocol -1 selects the highest pickle protocol available.
    pickle.dump(oc_corpus, f, -1)
Exemplo n.º 11
0
            FieldNames.PAPER_ID:
            obj['id'],
            FieldNames.TITLE_RAW:
            obj['title'],
            FieldNames.ABSTRACT_RAW:
            obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            #FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
            FieldNames.KEY_PHRASES:
            obj['keyPhrases'],
            #FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS:
            obj['pdfUrls'],
            FieldNames.S2_URL:
            obj['s2Url'],
            FieldNames.VENUE:
            obj['venue'],
            FieldNames.YEAR:
            obj['year'],
            FieldNames.TITLE:
            ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT:
            ' '.join(global_tokenizer(obj['paperAbstract']))
        }
        f.write(json.dumps(translated_obj))
        f.write("\n")
f.close()

# Load the already-built 'oc' corpus database and pickle the corpus object.
oc_corpus = Corpus.load(dp.get_db_path('oc'))
# A context manager guarantees the pickle file is flushed and closed; the
# bare open() call previously left the handle to the garbage collector.
with open(output_pkl_path, 'wb') as pkl_file:
    pickle.dump(oc_corpus, pkl_file)
    def main(self, args):
        """Join the DBLP/PubMed gold files into a JSON-lines corpus and build its DB.

        Reads the tab-separated gold files (papers, abstracts, key phrases,
        citations, authors, venues) from the dataset's gold directory, joins
        them per paper id, writes one JSON document per line to the dataset's
        corpus JSON path and finally calls ``Corpus.build``. Papers whose
        abstract is empty are skipped, and citations pointing at them are
        dropped.

        Args:
            args: Not used by this method.

        Raises:
            AssertionError: If ``self.dataset_name`` is neither 'dblp' nor
                'pubmed', the gold directory is missing, or the output JSON
                already exists.
        """
        if self.dataset_name == 'dblp':
            input_path = DatasetPaths.DBLP_GOLD_DIR
            output_path = DatasetPaths.DBLP_CORPUS_JSON
        elif self.dataset_name == 'pubmed':
            input_path = DatasetPaths.PUBMED_GOLD_DIR
            output_path = DatasetPaths.PUBMED_CORPUS_JSON
        else:
            assert False

        logging.info("Reading Gold data from {}".format(input_path))
        logging.info("Writing corpus to {}".format(output_path))
        assert os.path.exists(input_path)
        assert not os.path.exists(output_path)

        papers_file = os.path.join(input_path, "papers.txt")
        abstracts_file = os.path.join(input_path, "abstracts.txt")
        keyphrases_file = os.path.join(input_path, "paper_keyphrases.txt")
        citations_file = os.path.join(input_path, "paper_paper.txt")
        authors_file = os.path.join(input_path, "paper_author.txt")
        venues_file = os.path.join(input_path, "paper_venue.txt")

        # Per-paper lookup tables, all keyed by the integer paper id.
        paper_titles = {}
        paper_years = {}
        paper_abstracts = {}
        paper_keyphrases = {}
        paper_citations = {}
        paper_in_citations = {}
        paper_authors = {}
        paper_venues = {}

        # Papers with an empty abstract; they are excluded from the corpus
        # and from other papers' out-citations.
        bad_ids = set()
        for line in file_util.read_lines(abstracts_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            # Anything other than exactly "<id>\t<abstract>" counts as empty.
            if len(parts) == 2:
                paper_abstracts[paper_id] = parts[1]
            else:
                paper_abstracts[paper_id] = ""

            if paper_abstracts[paper_id] == "":
                bad_ids.add(paper_id)

        for line in file_util.read_lines(papers_file):
            parts = line.split('\t')
            paper_id = int(parts[0])
            # Column 2 is read as the year, column 3 as the title; column 1
            # is not used here.
            paper_years[paper_id] = int(parts[2])
            paper_titles[paper_id] = parts[3]

        for line in file_util.read_lines(keyphrases_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            phrases = paper_keyphrases.setdefault(paper_id, [])
            for kp in parts[1:]:
                kp = kp.strip()
                if len(kp) > 0:
                    # Drops the last four characters of every key phrase --
                    # presumably a fixed suffix in the gold file; TODO
                    # confirm.
                    phrases.append(kp[:-4])

        for line in file_util.read_lines(citations_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            out_citations = paper_citations.setdefault(paper_id, [])
            cited_id = int(parts[1])
            if cited_id in bad_ids:
                continue
            # Out-citations are stored as strings, in-citations as ints.
            out_citations.append(str(cited_id))

            paper_in_citations.setdefault(cited_id, []).append(paper_id)
            # The citing paper also gets an (initially empty) entry.
            paper_in_citations.setdefault(paper_id, [])

        for line in file_util.read_lines(authors_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            paper_authors.setdefault(paper_id, []).append(parts[1])

        for line in file_util.read_lines(venues_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            paper_venues[paper_id] = parts[1]

        # Debug dump of one hard-coded paper. ``.get`` prints None for a
        # table that lacks the id instead of aborting the whole conversion
        # with a KeyError.
        test_paper_id = 13
        print("==== Test Paper Details ====")
        for table in (paper_titles, paper_years, paper_abstracts,
                      paper_keyphrases, paper_citations, paper_in_citations,
                      paper_authors, paper_venues):
            print(table.get(test_paper_id))
        print("==== Test Paper Details ====")

        def _print_len(x, name=''):
            print("No. of {} = {}".format(name, len(x)))

        _print_len(paper_titles, 'Titles')
        _print_len(paper_years, 'Years')
        _print_len(paper_abstracts, 'Abstracts')
        _print_len(paper_keyphrases, 'KeyPhrases')
        _print_len(paper_citations, 'Paper Citations')
        _print_len(paper_in_citations, 'Paper In citations')
        _print_len(paper_authors, ' Authors')
        _print_len(paper_venues, ' Venues')

        logging.info("Skipped {} papers due to insufficient data".format(
            len(bad_ids)))

        corpus = {}
        for paper_id, title in tqdm.tqdm(paper_titles.items()):
            if paper_id in bad_ids:
                continue
            # NOTE(review): a paper with no row at all in the abstracts file
            # is not in bad_ids and raises KeyError on the abstract lookups
            # below -- confirm whether such papers should be skipped.
            doc = document_from_dict({
                FieldNames.PAPER_ID:
                str(paper_id),
                FieldNames.TITLE:
                ' '.join(global_tokenizer(title)),
                FieldNames.ABSTRACT:
                ' '.join(global_tokenizer(paper_abstracts[paper_id])),
                FieldNames.OUT_CITATIONS:
                paper_citations.get(paper_id, []),
                FieldNames.YEAR:
                paper_years[paper_id],
                FieldNames.AUTHORS:
                paper_authors.get(paper_id, []),
                # Papers without a key-phrase row get an empty list, like
                # the other optional fields, instead of raising KeyError.
                FieldNames.KEY_PHRASES:
                paper_keyphrases.get(paper_id, []),
                FieldNames.OUT_CITATION_COUNT:
                len(paper_citations.get(paper_id, [])),
                FieldNames.IN_CITATION_COUNT:
                len(paper_in_citations.get(paper_id, [])),
                FieldNames.VENUE:
                paper_venues.get(paper_id, ''),
                FieldNames.TITLE_RAW:
                title,
                FieldNames.ABSTRACT_RAW:
                paper_abstracts[paper_id]
            })
            corpus[paper_id] = doc

        with open(output_path, 'w') as f:
            for doc in corpus.values():
                doc_json = dict_from_document(doc)
                f.write(json.dumps(doc_json))
                f.write("\n")

        # NOTE(review): the JSON was written to ``output_path`` but the
        # build reads dp.get_json_path(...) -- presumably the same file;
        # confirm.
        dp = DatasetPaths()
        Corpus.build(dp.get_db_path(self.dataset_name),
                     dp.get_json_path(self.dataset_name))
Exemplo n.º 13
0
    def main(self, args):
        """Evaluate a candidate selector plus ranker over several candidate counts.

        Loads the corpus for ``self.dataset_type``, builds the ranker chosen
        by ``self.ranker_type``, runs ``eval_text_model`` once per candidate
        count and prints all results followed by the best count and its
        metric value. ``args`` is not used by this method.
        """
        paths = DatasetPaths()
        if self.dataset_type == 'oc':
            corpus = Corpus.load_pkl(paths.get_pkl_path(self.dataset_type))
        else:
            corpus = Corpus.load(paths.get_db_path(self.dataset_type))

        # Ranker applied on top of the selected candidates.
        if self.ranker_type == 'none':
            citation_ranker = NoneRanker()
        elif self.ranker_type == 'neural':
            assert self.citation_ranker_dir is not None
            ranker_featurizer, ranker_models = model_from_directory(
                self.citation_ranker_dir, on_cpu=True)
            citation_ranker = Ranker(
                corpus=corpus,
                featurizer=ranker_featurizer,
                citation_ranker=ranker_models['citeomatic'],
                num_candidates_to_rank=100)
        else:
            assert False

        # Candidate counts to sweep: an explicit value wins, otherwise a
        # dataset-dependent default list is used.
        if self.num_candidates is not None:
            candidate_counts = [self.num_candidates]
        elif self.dataset_type == 'oc':
            candidate_counts = [100]
        else:
            candidate_counts = [1, 5, 10, 15, 25, 50, 75, 100]

        results_by_count = {}
        for count in candidate_counts:

            if self.candidate_selector_type == 'bm25':
                bm25_index = paths.get_bm25_index_path(self.dataset_type)
                selector = BM25CandidateSelector(
                    corpus, bm25_index, count, False)
            elif self.candidate_selector_type == 'ann':
                assert self.paper_embedder_dir is not None
                embed_featurizer, embed_models = model_from_directory(
                    self.paper_embedder_dir, on_cpu=True)
                selector = self._make_ann_candidate_selector(
                    corpus=corpus,
                    featurizer=embed_featurizer,
                    embedding_model=embed_models['embedding'],
                    num_candidates=count)
            elif self.candidate_selector_type == 'oracle':
                selector = OracleCandidateSelector(corpus)
            else:
                assert False

            results_by_count[count] = eval_text_model(
                corpus,
                selector,
                citation_ranker,
                papers_source=self.split,
                n_eval=self.n_eval)

        # Keep the candidate count whose metric is strictly the highest.
        best_k = -1
        best_metric = 0.0
        metric_key = self.metric + "_1"
        for count, results in results_by_count.items():
            score = results[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]
            if best_metric < score:
                best_k = count
                best_metric = score

        print(json.dumps(results_by_count, indent=4, sort_keys=True))
        print(best_k)
        print(best_metric)
Exemplo n.º 14
0
def end_to_end_training(model_options: ModelOptions,
                        dataset_type,
                        models_dir,
                        models_ann_dir=None):
    """Train a text model end to end and persist its artifacts in ``models_dir``.

    Args:
        model_options: Options object; its vocabulary sizes (and, when
            ``use_pretrained`` is set, ``dense_dim``) are updated in place.
        dataset_type: Dataset key used to resolve the DB/JSON/pickle paths.
        models_dir: Directory for the featurizer pickle, model weights and
            options JSON; created if it does not exist.
        models_ann_dir: Passed through to ``train_text_model``.

    Returns:
        ``(corpus, featurizer, model_options, citeomatic_model,
        embedding_model)``.
    """
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB, building it first if it is missing
    print("Loading corpus db...")
    paths = DatasetPaths()
    db_path = paths.get_db_path(dataset_type)
    json_path = paths.get_json_path(dataset_type)
    if not os.path.isfile(db_path):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_path, json_path)

    if dataset_type != 'oc':
        corpus = Corpus.load(db_path, model_options.train_frac)
    else:
        corpus = Corpus.load_pkl(paths.get_pkl_path(dataset_type))

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    if model_options.use_pretrained:
        prefix = 'pretrained_'
    else:
        prefix = 'corpus_fit_'
    featurizer_path = os.path.join(models_dir,
                                   prefix + paths.FEATURIZER_FILENAME)

    if not os.path.isfile(featurizer_path):
        # No cached featurizer yet: fit one on the corpus and cache it.
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(
            corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_path, featurizer)
    else:
        featurizer = file_util.read_pickle(featurizer_path)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model weights and the options used to train it
    citeomatic_weights = os.path.join(models_dir,
                                      paths.CITEOMATIC_WEIGHTS_FILENAME)
    citeomatic_model.save_weights(citeomatic_weights, overwrite=True)

    if embedding_model is not None:
        embedding_weights = os.path.join(models_dir,
                                         paths.EMBEDDING_WEIGHTS_FILENAME)
        embedding_model.save_weights(embedding_weights, overwrite=True)

    options_path = os.path.join(models_dir, paths.OPTIONS_FILENAME)
    file_util.write_json(options_path, model_options.to_json())

    return corpus, featurizer, model_options, citeomatic_model, embedding_model