def test_get_embeddings(self):
    """The SENNA embedding row stored for "!" matches the published 50-d vector.

    Builds a one-word vocabulary, materializes the embedding matrix, and
    compares the row addressed via ``word2index`` against the reference
    values with ``np.allclose``.
    """
    vocab = SennaVocab()
    vocab.add("!")
    matrix = vocab.get_embeddings()
    reference = """ -1.03682 1.77856 -0.693547 1.5948 1.5799 0.859243 1.15221 -0.976317 0.745304 -0.494589 0.308086 0.25239 -0.1976 1.26203 0.813864 -0.940734 -0.215163 0.11645 0.525697 1.95766 0.394232 1.27717 0.710788 -0.389351 0.161775 -0.106038 1.14148 0.607948 0.189781 -1.06022 0.280702 0.0251156 -0.198067 2.33027 0.408584 0.350751 -0.351293 1.77318 -0.723457 -0.13806 -1.47247 0.541779 -2.57005 -0.227714 -0.817816 -0.552209 0.360149 -0.10278 -0.36428 -0.64853 """
    expected = np.array(list(map(float, reference.split())))
    row = matrix[vocab.word2index["!"]]
    self.assertTrue(np.allclose(expected, row))
def test_get_embeddings(self):
    """The SENNA embedding row for "!" (looked up via ``vocab["!"]``) matches the reference vector.

    NOTE(review): this method shares its name with another
    ``test_get_embeddings`` in the same class; Python keeps only the
    last definition, so one of the two never runs — consider renaming.
    """
    vocab = SennaVocab()
    vocab.add("!")
    E = vocab.get_embeddings()
    tokens = """ -1.03682 1.77856 -0.693547 1.5948 1.5799 0.859243 1.15221 -0.976317 0.745304 -0.494589 0.308086 0.25239 -0.1976 1.26203 0.813864 -0.940734 -0.215163 0.11645 0.525697 1.95766 0.394232 1.27717 0.710788 -0.389351 0.161775 -0.106038 1.14148 0.607948 0.189781 -1.06022 0.280702 0.0251156 -0.198067 2.33027 0.408584 0.350751 -0.351293 1.77318 -0.723457 -0.13806 -1.47247 0.541779 -2.57005 -0.227714 -0.817816 -0.552209 0.360149 -0.10278 -0.36428 -0.64853 """.split()
    expected = np.array([float(tok) for tok in tokens])
    self.assertTrue(np.allclose(expected, E[vocab["!"]]))
train_file = os.path.join(mydir, 'SemEval2010_task8_training', 'TRAIN_FILE.TXT') test_file = os.path.join(mydir, 'SemEval2010_task8_testing_keys', 'TEST_FILE_FULL.TXT') logging.basicConfig(level=logging.INFO) logging.info('starting preprocessing') if os.path.isfile('train.json') and os.path.isfile('test.json') and not args['--force']: logging.info('train.json and test.json already exists. Skipping proprocessing.') else: nlp = English() with open('train.json', 'wb') as f: json.dump(parse_file(train_file, nlp), f, indent=2) with open('test.json', 'wb') as f: json.dump(parse_file(test_file, nlp), f, indent=2) logging.info('starting numericalization') word_vocab = SennaVocab() rel_vocab = Vocab() with open('train.json') as f: train = json.load(f) with open('test.json') as f: test = json.load(f) numericalize(train, word_vocab, rel_vocab, add=True) word_vocab = word_vocab.prune_rares(cutoff=2) word_vocab = word_vocab.sort_by_decreasing_count() rel_vocab = rel_vocab.sort_by_decreasing_count() train = numericalize(train, word_vocab, rel_vocab, add=False) test = numericalize(test, word_vocab, rel_vocab, add=False) with open('vocab.pkl', 'wb') as f: