Пример #1
0
 def test_dont_remove_gene(self):
     # A sentence containing the alphanumeric gene identifier "WK320" is
     # expected to come back from clean_text exactly as it went in.
     sentence = "the gene WK320 is associated with"
     cleaned = preprocessing.clean_text(sentence)
     self.assertEqual(cleaned, sentence)
Пример #2
0
 def test_remove_letter_reference(self):
     # Bracketed single-letter references ("[a]", "[B]") are expected to be
     # dropped while the surrounding spaces are kept, hence the double
     # spaces in the expected string.
     raw = "from [a] we see that [B] should"
     expected = "from  we see that  should"
     self.assertEqual(preprocessing.clean_text(raw), expected)
Пример #3
0
 def test_remove_number_comma(self):
     # A trailing comma-separated run of digits ("1,2,3") is expected to be
     # removed, with no trailing whitespace left in the result.
     cleaned = preprocessing.clean_text(
         "the numbers range from 1,2,3"
     )
     self.assertEqual(cleaned, "the numbers range from")
Пример #4
0
 def test_remove_number(self):
     # A trailing decimal percentage ("22.7%") is expected to be removed,
     # with no trailing whitespace left in the result.
     raw = "the result increased by 22.7%"
     expected = "the result increased by"
     actual = preprocessing.clean_text(raw)
     self.assertEqual(actual, expected)
Пример #5
0
 def test_remove_figure(self):
     # A trailing figure reference ("Figure A") is expected to be removed,
     # leaving only the preceding word.
     cleaned = preprocessing.clean_text("see Figure A")
     expected = "see"
     self.assertEqual(cleaned, expected)
Пример #6
0
 def test_remove_fig(self):
     # An abbreviated, lower-case figure reference ("fig. 1") in the middle
     # of a sentence is expected to be removed while the spaces on both
     # sides are kept, hence the double space in the expected string.
     raw = "From fig. 1 we see"
     self.assertEqual(
         preprocessing.clean_text(raw),
         "From  we see",
     )
Пример #7
0
if __name__ == '__main__':
    from src.data import Corpus, get_data, get_vocabulary, get_labels
    from src.preprocessing import clean_text, build_adj_matrix
    from src.models.gcn import GCN

    seed = 0
    val_split = 0.1

    vocab = get_vocabulary('data/20ng-vocabulary.txt')
    labels = get_labels('data/20ng-labels.txt')
    corpus = get_data('data/train-20news.txt', labels)
    test_corpus = get_data('data/test-20news.txt', labels)

    # Mask out unknown words
    clean_text(corpus, vocab)
    clean_text(test_corpus, vocab)

    # Split validation set
    corpus.shuffle(seed)
    len_train = int(len(corpus) * (1 - val_split))
    train_corpus = Corpus(corpus[:len_train])
    val_corpus = Corpus(corpus[len_train:])

    num_documents = len(train_corpus) + len(val_corpus) + len(test_corpus)
    train_adj_matrix = build_adj_matrix(train_corpus,
                                        vocab,
                                        num_documents,
                                        doc_offset=0)
    val_adj_matrix = build_adj_matrix(val_corpus,
                                      vocab,