def test_dont_remove_gene(self):
    # A gene identifier mixing letters and digits ("WK320") must come back
    # from clean_text untouched, so the input doubles as the expected output.
    sentence = "the gene WK320 is associated with"
    self.assertEqual(preprocessing.clean_text(sentence), sentence)
def test_remove_letter_reference(self):
    # Bracketed single-letter references ("[a]", "[B]") are expected to be
    # stripped, leaving single spaces between the remaining words.
    raw_text = "from [a] we see that [B] should"
    expected = "from we see that should"
    cleaned = preprocessing.clean_text(raw_text)
    self.assertEqual(cleaned, expected)
def test_remove_number_comma(self):
    # A comma-separated run of digits ("1,2,3") is expected to vanish
    # entirely, with no trailing whitespace left behind.
    raw_text = "the numbers range from 1,2,3"
    without_numbers = preprocessing.clean_text(raw_text)
    self.assertEqual(without_numbers, "the numbers range from")
def test_remove_number(self):
    # A decimal percentage ("22.7%") is expected to be removed as a whole,
    # including the percent sign and the space that preceded it.
    expected = "the result increased by"
    self.assertEqual(
        preprocessing.clean_text("the result increased by 22.7%"),
        expected,
    )
def test_remove_figure(self):
    # A spelled-out figure reference ("Figure A") is expected to be dropped
    # together with its label, leaving only the preceding word.
    cleaned = preprocessing.clean_text(
        "see Figure A"
    )
    self.assertEqual(cleaned, "see")
def test_remove_fig(self):
    # The abbreviated figure reference ("fig. 1") is expected to be removed
    # from the middle of a sentence without leaving a double space.
    raw_text = "From fig. 1 we see"
    expected = "From we see"
    self.assertEqual(preprocessing.clean_text(raw_text), expected)
if __name__ == '__main__': from src.data import Corpus, get_data, get_vocabulary, get_labels from src.preprocessing import clean_text, build_adj_matrix from src.models.gcn import GCN seed = 0 val_split = 0.1 vocab = get_vocabulary('data/20ng-vocabulary.txt') labels = get_labels('data/20ng-labels.txt') corpus = get_data('data/train-20news.txt', labels) test_corpus = get_data('data/test-20news.txt', labels) # Mask out unknown words clean_text(corpus, vocab) clean_text(test_corpus, vocab) # Split validation set corpus.shuffle(seed) len_train = int(len(corpus) * (1 - val_split)) train_corpus = Corpus(corpus[:len_train]) val_corpus = Corpus(corpus[len_train:]) num_documents = len(train_corpus) + len(val_corpus) + len(test_corpus) train_adj_matrix = build_adj_matrix(train_corpus, vocab, num_documents, doc_offset=0) val_adj_matrix = build_adj_matrix(val_corpus, vocab,