def testVocab(self):
    """Test word2vec vocabulary building.

    Builds the vocabulary from ``LeeCorpus`` twice (once with ``min_count=1``,
    once without an explicit ``min_count``) and checks the vocabulary size and
    the binary code stored for the word 'the'.  For ``min_count=1`` the summed
    word counts must equal the total number of tokens in the corpus.  Finally
    checks that constructing a model from input that leaves no vocabulary
    raises ``RuntimeError``.
    """
    corpus = LeeCorpus()
    total_words = sum(len(sentence) for sentence in corpus)

    # try vocab building explicitly, using all words
    model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
    model.build_vocab(corpus)
    self.assertEqual(len(model.vocab), 6981)
    # with min_count=1, we're not throwing away anything,
    # so make sure the word counts add up to be the entire corpus
    self.assertEqual(sum(v.count for v in model.vocab.values()), total_words)
    # make sure the binary codes are correct; the comparison result must be
    # asserted, otherwise this check can never fail
    self.assertTrue(numpy.allclose(model.vocab['the'].code, [1, 1, 0, 0]))

    # test building vocab with default params
    model = word2vec.Word2Vec(hs=1, negative=0)
    model.build_vocab(corpus)
    self.assertEqual(len(model.vocab), 1750)
    self.assertTrue(numpy.allclose(model.vocab['the'].code, [1, 1, 1, 0]))

    # no input => "RuntimeError: you must first build vocabulary before training the model"
    self.assertRaises(RuntimeError, word2vec.Word2Vec, [])

    # input not empty, but rather completely filtered out
    self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words + 1)
def testTrainingCbow(self):
    """Test CBOW word2vec training.

    Builds the vocabulary and trains in two explicit steps, checks the shapes
    of ``syn0``/``syn1``, verifies that querying ``most_similar`` by word and
    by vector gives the same result, and finally checks that building and
    training in one constructor call produces an equal model.
    """
    # build vocabulary, don't train yet
    model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0)
    model.build_vocab(sentences)
    expected_shape = (len(model.vocab), 2)
    self.assertEqual(model.syn0.shape, expected_shape)
    self.assertEqual(model.syn1.shape, expected_shape)

    model.train(sentences)
    # NOTE: the top-ranked word itself is not asserted; only the consistency
    # between the two query styles is checked below.
    sims = model.most_similar('graph', topn=10)

    # test querying for "most similar" by vector
    graph_vector = model.syn0norm[model.vocab['graph'].index]
    sims2 = model.most_similar(positive=[graph_vector], topn=11)
    sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
    self.assertEqual(sims, sims2)

    # build vocab and train in one step; must be the same as above
    model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0)
    self.models_equal(model, model2)
def testRuleWithMinCount(self):
    """Test that returning RULE_DEFAULT from trim_rule triggers min_count.

    Trains on ``sentences`` plus one extra sentence holding a word that occurs
    a single time, with ``min_count=2`` and the module-level ``_rule``.
    """
    # NOTE(review): presumably `_rule` discards "human" and returns
    # RULE_DEFAULT for everything else -- confirm against its definition.
    model = word2vec.Word2Vec(sentences + [["occurs_only_once"]], min_count=2, trim_rule=_rule)
    self.assertNotIn("human", model.vocab)
    self.assertNotIn("occurs_only_once", model.vocab)
    self.assertIn("interface", model.vocab)
def test_sg_neg(self):
    """Test skipgram w/ negative sampling.

    Constructs the model without a corpus and hands it to ``model_sanity``.
    """
    params = dict(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2)
    self.model_sanity(word2vec.Word2Vec(**params))
def test_sg_hs(self):
    """Test skipgram w/ hierarchical softmax.

    Constructs the model without a corpus and hands it to ``model_sanity``.
    """
    params = dict(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2)
    self.model_sanity(word2vec.Word2Vec(**params))
def testLargeMmap(self):
    """Test storing the model with separately saved arrays and loading it back.

    The model is loaded twice: once normally and once with ``mmap='r'``; both
    results must compare equal to the model that was saved.
    """
    trained = word2vec.Word2Vec(sentences, min_count=1)

    # test storing the internal arrays into separate files
    trained.save(testfile(), sep_limit=0)
    reloaded = word2vec.Word2Vec.load(testfile())
    self.models_equal(trained, reloaded)

    # make sure mmaping the arrays back works, too
    mmapped = word2vec.Word2Vec.load(testfile(), mmap='r')
    self.models_equal(trained, mmapped)
def testParallel(self):
    """Test word2vec parallel training.

    Trains on a repeated ``LeeCorpus`` with 2 and with 4 workers and checks
    that a similarity query on the trained model yields results.  Skipped
    (returns early) when ``word2vec.FAST_VERSION`` is negative.
    """
    if word2vec.FAST_VERSION < 0:
        # don't test the plain NumPy version for parallelism (too slow)
        return

    corpus = utils.RepeatCorpus(LeeCorpus(), 10000)

    for workers in (2, 4):
        model = word2vec.Word2Vec(corpus, workers=workers)
        sims = model.most_similar('israeli')
        # Only the presence of results is checked: a previously unused
        # `sims` meant this test could not fail on a broken query result.
        self.assertTrue(sims)
def testPersistenceWord2VecFormatWithVocab(self):
    """Test storing/loading the entire model and vocabulary in word2vec format.

    After a binary round trip through ``save_word2vec_format`` /
    ``load_word2vec_format`` with a vocabulary file, the count recorded for
    'human' must be unchanged.
    """
    original = word2vec.Word2Vec(sentences, min_count=1)
    original.init_sims()

    vocab_path = os.path.join(tempfile.gettempdir(), 'par2vec_word2vec.vocab')
    original.save_word2vec_format(testfile(), vocab_path, binary=True)
    restored = word2vec.Word2Vec.load_word2vec_format(testfile(), vocab_path, binary=True)

    expected_count = original.vocab['human'].count
    self.assertEqual(expected_count, restored.vocab['human'].count)
def testScoring(self):
    """Test word2vec scoring.

    Only checks that ``score`` returns one entry per input sentence.
    """
    params = dict(size=2, min_count=1, hs=1, negative=0)
    scorer = word2vec.Word2Vec(sentences, **params)

    # just score and make sure they exist
    sentence_count = len(sentences)
    sentence_scores = scorer.score(sentences, sentence_count)
    self.assertEqual(len(sentence_scores), len(sentences))
def testSimilarities(self):
    """Test similarity and n_similarity methods.

    Checks that a word set compared with itself (in a different order) has
    similarity ~1.0, and that ``n_similarity`` on single-word lists agrees
    with ``similarity`` on the same two words.
    """
    # The model is trained using CBOW
    model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
    model.build_vocab(sentences)
    model.train(sentences)

    # Previously this only asserted that the value was truthy (non-zero).
    # Assumes n_similarity is the cosine similarity between the averaged
    # vectors of both word lists, so identical sets must score ~1.0.
    same_set_similarity = model.n_similarity(['graph', 'trees'], ['trees', 'graph'])
    self.assertTrue(numpy.allclose(same_set_similarity, 1.0))

    self.assertEqual(
        model.n_similarity(['graph'], ['trees']),
        model.similarity('graph', 'trees'))
def test_cbow_neg_multi_n02(self):
    """Test constructing a CBOW model w/ negative sampling and a list-valued size.

    The parameters select CBOW (``sg=0``) with negative sampling (``hs=0``,
    ``negative=15``), not hierarchical softmax.  The test only verifies that
    the constructor accepts these arguments without raising.
    """
    # NOTE(review): unlike the sibling CBOW tests, the model is not passed to
    # self.model_sanity -- confirm whether that check supports size=[75, 100].
    word2vec.Word2Vec(
        sg=0, size=[75, 100], cbow_mean=1, alpha=0.05, window=8, hs=0,
        negative=15, min_count=5, iter=20, workers=1, batch_words=1000)
def test_cbow_hs(self):
    """Test CBOW w/ hierarchical softmax.

    Constructs the model without a corpus and hands it to ``model_sanity``.
    """
    params = dict(
        sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0,
        min_count=5, iter=10, workers=2, batch_words=1000)
    self.model_sanity(word2vec.Word2Vec(**params))
def test_cbow_neg(self):
    """Test CBOW w/ negative sampling.

    Constructs the model without a corpus and hands it to ``model_sanity``.
    """
    params = dict(
        sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
        min_count=5, iter=10, workers=2, sample=0)
    self.model_sanity(word2vec.Word2Vec(**params))
def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
    """Test word2vec-format persistence chained with `save` and `load`.

    The model and its vocabulary are written in binary word2vec format, read
    back, stored again via `save`, and reloaded via `load`; the count recorded
    for 'human' must survive the whole chain.
    """
    original = word2vec.Word2Vec(sentences, min_count=1)
    original.init_sims()

    # first hop: word2vec format with a separate vocabulary file
    vocab_path = os.path.join(tempfile.gettempdir(), 'par2vec_word2vec.vocab')
    original.save_word2vec_format(testfile(), vocab_path, binary=True)
    from_w2v_format = word2vec.Word2Vec.load_word2vec_format(testfile(), vocab_path, binary=True)

    # second hop: standard save/load
    from_w2v_format.save(testfile())
    restored = word2vec.Word2Vec.load(testfile())

    expected_count = original.vocab['human'].count
    self.assertEqual(expected_count, restored.vocab['human'].count)
def testPersistenceWord2VecFormat(self):
    """Test storing/loading the entire model in word2vec format.

    Loads the binary file twice: once calling ``init_sims(replace=False)`` and
    once with ``replace=True``, and compares the vector for 'human' against
    the source model's vector and its ``syn0norm`` row respectively.
    """
    source_model = word2vec.Word2Vec(sentences, min_count=1)
    source_model.init_sims()
    source_model.save_word2vec_format(testfile(), binary=True)

    # replace=False: the loaded vector must match the source vector
    raw_copy = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
    raw_copy.init_sims(replace=False)
    raw_matches = numpy.allclose(source_model['human'], raw_copy['human'])
    self.assertTrue(raw_matches)

    # replace=True: the loaded vector must differ from the source vector but
    # match the corresponding row of the source model's syn0norm
    normed_copy = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
    normed_copy.init_sims(replace=True)
    self.assertFalse(numpy.allclose(source_model['human'], normed_copy['human']))
    human_row = source_model.vocab['human'].index
    self.assertTrue(numpy.allclose(source_model.syn0norm[human_row], normed_copy['human']))
def testPersistenceWord2VecFormatNonBinary(self):
    """Test storing/loading the entire model in word2vec non-binary format.

    Same checks as the binary variant, but with explicit absolute tolerances
    on the vector comparisons.
    """
    source_model = word2vec.Word2Vec(sentences, min_count=1)
    source_model.init_sims()
    source_model.save_word2vec_format(testfile(), binary=False)

    # init_sims(False): the loaded vector must match the source vector
    raw_copy = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
    raw_copy.init_sims(False)
    raw_matches = numpy.allclose(source_model['human'], raw_copy['human'], atol=1e-6)
    self.assertTrue(raw_matches)

    # init_sims(True): the loaded vector must differ from the source vector
    # but match the corresponding row of the source model's syn0norm
    normed_copy = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
    normed_copy.init_sims(True)
    self.assertFalse(numpy.allclose(source_model['human'], normed_copy['human'], atol=1e-6))
    human_row = source_model.vocab['human'].index
    self.assertTrue(
        numpy.allclose(source_model.syn0norm[human_row], normed_copy['human'], atol=1e-4))
def testLocking(self):
    """Test word2vec training doesn't change locked vectors.

    For both values of ``sg``, sets ``syn0_lockf[0]`` to 0.0 before training
    and checks that row 0 of ``syn0`` is untouched while row 1 has changed.
    """
    corpus = LeeCorpus()

    for sg in (0, 1):  # test both cbow and sg
        # build vocabulary, don't train yet
        model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5)
        model.build_vocab(corpus)

        # remember two vectors
        locked_before = numpy.copy(model.syn0[0])
        unlocked_before = numpy.copy(model.syn0[1])

        # lock the vector in slot 0 against change
        model.syn0_lockf[0] = 0.0

        model.train(corpus)

        unlocked_unchanged = (unlocked_before == model.syn0[1]).all()
        self.assertFalse(unlocked_unchanged)  # unlocked vector should vary
        locked_unchanged = (locked_before == model.syn0[0]).all()
        self.assertTrue(locked_unchanged)  # locked vector should not vary
def testLambdaRule(self):
    """Test that lambda trim_rule works.

    The rule discards the word "human" and defers to the default behaviour
    for every other word, so "human" must be absent from the vocabulary.
    """
    # Kept as a lambda on purpose: a lambda rule is exactly what is under test.
    rule = lambda word, count, min_count: utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT
    model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
    self.assertNotIn("human", model.vocab)
def testRule(self):
    """Test applying vocab trim_rule to build_vocab instead of constructor.

    Passes the module-level ``_rule`` to ``build_vocab`` and checks that
    "human" does not end up in the vocabulary.
    """
    model = word2vec.Word2Vec(min_count=1)
    model.build_vocab(sentences, trim_rule=_rule)
    self.assertNotIn("human", model.vocab)
def testPersistenceWithConstructorRule(self):
    """Test save/load round trip of a model built with a constructor trim_rule.

    The model is created with ``trim_rule=_rule``, saved, loaded again and
    compared with the original via ``models_equal``.
    """
    trimmed = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
    trimmed.save(testfile())
    reloaded = word2vec.Word2Vec.load(testfile())
    self.models_equal(trimmed, reloaded)
def testPersistence(self):
    """Test that a saved model compares equal to itself after loading."""
    trained = word2vec.Word2Vec(sentences, min_count=1)
    trained.save(testfile())
    reloaded = word2vec.Word2Vec.load(testfile())
    self.models_equal(trained, reloaded)
def testRNG(self):
    """Test word2vec results identical with identical RNG seed.

    Two models are built from the same sentences with the same seed and a
    single worker, then compared via ``models_equal``.
    """
    settings = dict(min_count=2, seed=42, workers=1)
    first = word2vec.Word2Vec(sentences, **settings)
    second = word2vec.Word2Vec(sentences, **settings)
    self.models_equal(first, second)