Пример #1
0
def create_fse_model(sentences):
    """Train a SIF model on the ``"sentence"`` field of the given records.

    Every item of ``sentences`` is indexed with the ``"sentence"`` key. The
    extracted values are wrapped in an ``IndexedList`` and used to train a
    ``SIF`` model built on whatever ``load_fasttext_model()`` returns.

    Returns:
        A ``(model, idx_sentences)`` tuple: the trained ``SIF`` instance and
        the ``IndexedList`` it was trained on.
    """
    texts = []
    for record in sentences:
        texts.append(record["sentence"])
    print("SIF create indexes for embeddings")
    model = SIF(load_fasttext_model())
    idx_sentences = IndexedList(texts)
    model.train(idx_sentences)
    return model, idx_sentences
Пример #2
0
    def test_broken_vocab(self):
        """Training must raise ``RuntimeError`` when all vocab counts are NaN."""
        w2v = Word2Vec(min_count=1, size=DIM)
        # Read the corpus inside a context manager so the file handle is
        # closed deterministically instead of being leaked until GC.
        with open(CORPUS, "r") as corpus_file:
            tokenized_lines = [line.split() for line in corpus_file]
        w2v.build_vocab(tokenized_lines)
        # Corrupt every vocabulary entry's count.
        for k in w2v.wv.vocab:
            w2v.wv.vocab[k].count = np.nan

        model = SIF(w2v)
        with self.assertRaises(RuntimeError):
            model.train(self.sentences)
Пример #3
0
    def test_save_issue(self):
        """A saved and re-loaded model keeps ``svd_res`` and can be queried."""
        model = SIF(W2V)
        model.train(self.sentences)

        p = Path("fse/test/test_data/test_emb.model")
        try:
            model.save(str(p))
            model = SIF.load(str(p))
        finally:
            # Remove the artifact even when save/load raises, so a failing
            # run does not leave a stale model file behind.
            if p.exists():
                p.unlink()

        self.assertEqual(2, len(model.svd_res))
        # Smoke check: the similarity query must run on the re-loaded model.
        model.sv.similar_by_sentence("test sentence".split(), model=model)
Пример #4
0
 def test_parameter_sanity(self):
     """Each invalid configuration below is expected to raise ValueError."""
     # Case 1: negative alpha.
     with self.assertRaises(ValueError):
         SIF(W2V, alpha=-1)._check_parameter_sanity()
     # Case 2: negative number of components.
     with self.assertRaises(ValueError):
         SIF(W2V, components=-1)._check_parameter_sanity()
     # Case 3: every word weight replaced by 3 (ones + 2).
     with self.assertRaises(ValueError):
         heavy = SIF(W2V)
         heavy.word_weights = np.ones_like(heavy.word_weights) + 2
         heavy._check_parameter_sanity()
Пример #5
0
 def setUp(self):
     """Build the corpus reader and SIF model fixtures used by the tests."""
     corpus_reader = IndexedLineDocument(CORPUS)
     self.sentences = corpus_reader
     sif_model = SIF(W2V, lang_freq="en")
     self.model = sif_model
Пример #6
0
class TestSIFFunctions(unittest.TestCase):
    """Unit tests for the ``SIF`` model's sanity checks, hooks and training."""

    def setUp(self):
        """Build the corpus reader and SIF model fixtures used by the tests."""
        self.sentences = IndexedLineDocument(CORPUS)
        self.model = SIF(W2V, lang_freq="en")

    def test_parameter_sanity(self):
        """Each invalid configuration below is expected to raise ValueError."""
        # Negative alpha.
        with self.assertRaises(ValueError):
            m = SIF(W2V, alpha=-1)
            m._check_parameter_sanity()
        # Negative number of components.
        with self.assertRaises(ValueError):
            m = SIF(W2V, components=-1)
            m._check_parameter_sanity()
        # Every word weight replaced by 3 (ones + 2).
        with self.assertRaises(ValueError):
            m = SIF(W2V)
            m.word_weights = np.ones_like(m.word_weights) + 2
            m._check_parameter_sanity()

    def test_pre_train_calls(self):
        """Smoke test: ``_pre_train_calls`` runs on the fixture model."""
        self.model._pre_train_calls()

    def test_post_train_calls(self):
        """All-ones vectors are expected to be ~0 after ``_post_train_calls``."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.assertTrue(np.allclose(self.model.sv.vectors, 0, atol=1e-5))

    def test_post_train_calls_no_removal(self):
        """With ``components = 0`` the all-ones vectors must stay ~1."""
        self.model.components = 0
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5))

    def test_post_inference_calls(self):
        """``_post_inference_calls`` must bring an all-ones ``output`` to ~0."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()

        output = np.ones((200, 10), dtype=np.float32)
        # The assertion below inspects ``output`` itself, so the call is
        # expected to modify the array in place.
        self.model._post_inference_calls(output=output)
        self.assertTrue(np.allclose(output, 0, atol=1e-5))

    def test_post_inference_calls_no_svd(self):
        """With ``svd_res`` set to None the inference hook must raise."""
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model.svd_res = None
        with self.assertRaises(RuntimeError):
            self.model._post_inference_calls(output=None)

    def test_post_inference_calls_no_removal(self):
        """With ``components = 0`` both hooks must leave the vectors at ~1."""
        self.model.components = 0
        self.model.sv.vectors = np.ones((200, 10), dtype=np.float32)
        self.model._post_train_calls()
        self.model._post_inference_calls(output=None)
        self.assertTrue(np.allclose(self.model.sv.vectors, 1, atol=1e-5))

    def test_dtype_sanity_word_weights(self):
        """Integer-typed ``word_weights`` must fail the dtype check."""
        self.model.word_weights = np.ones_like(self.model.word_weights,
                                               dtype=int)
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_dtype_sanity_svd_vals(self):
        """An integer array as first ``svd_res`` item must fail the check."""
        self.model.svd_res = (
            np.ones_like(self.model.word_weights, dtype=int),
            np.array(0, dtype=np.float32),
        )
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_dtype_sanity_svd_vecs(self):
        """An integer array as second ``svd_res`` item must fail the check."""
        self.model.svd_res = (
            np.array(0, dtype=np.float32),
            np.ones_like(self.model.word_weights, dtype=int),
        )
        with self.assertRaises(TypeError):
            self.model._check_dtype_santiy()

    def test_compute_sif_weights(self):
        """Check the weight computed for ``"Good"``: ``alpha / (alpha + pw)``."""
        w = "Good"
        # NOTE(review): pw looks like count / corpus size (about
        # 21 / 1095661426) for the "en" frequency list - confirm.
        pw = 1.916650481770269e-08
        alpha = self.model.alpha
        sif = alpha / (alpha + pw)

        idx = self.model.wv.vocab[w].index
        self.model._compute_sif_weights()
        self.assertTrue(np.allclose(self.model.word_weights[idx], sif))

    def test_train(self):
        """Training returns ``(100, 1450)`` and yields finite vectors."""
        output = self.model.train(self.sentences)
        self.assertEqual((100, 1450), output)
        self.assertTrue(np.isfinite(self.model.sv.vectors).all())
        self.assertEqual(2, len(self.model.svd_res))

    def test_save_issue(self):
        """A saved and re-loaded model keeps ``svd_res`` and can be queried."""
        model = SIF(W2V)
        model.train(self.sentences)

        p = Path("fse/test/test_data/test_emb.model")
        try:
            model.save(str(p))
            model = SIF.load(str(p))
        finally:
            # Remove the artifact even when save/load raises, so a failing
            # run does not leave a stale model file behind.
            if p.exists():
                p.unlink()

        self.assertEqual(2, len(model.svd_res))
        # Smoke check: the similarity query must run on the re-loaded model.
        model.sv.similar_by_sentence("test sentence".split(), model=model)

    def test_broken_vocab(self):
        """Training must raise ``RuntimeError`` when all vocab counts are NaN."""
        w2v = Word2Vec(min_count=1, size=DIM)
        # Read the corpus inside a context manager so the file handle is
        # closed deterministically instead of being leaked until GC.
        with open(CORPUS, "r") as corpus_file:
            tokenized_lines = [line.split() for line in corpus_file]
        w2v.build_vocab(tokenized_lines)
        # Corrupt every vocabulary entry's count.
        for k in w2v.wv.vocab:
            w2v.wv.vocab[k].count = np.nan

        model = SIF(w2v)
        with self.assertRaises(RuntimeError):
            model.train(self.sentences)
class TestSIFFunctions(unittest.TestCase):
    def setUp(self):
        """Build the corpus reader and SIF model fixtures used by the tests."""
        corpus_reader = IndexedLineDocument(CORPUS)
        self.sentences = corpus_reader
        sif_model = SIF(W2V, lang_freq="en")
        self.model = sif_model

    def test_parameter_sanity(self):
        """Each invalid configuration below is expected to raise ValueError."""
        # Case 1: negative alpha.
        with self.assertRaises(ValueError):
            SIF(W2V, alpha=-1)._check_parameter_sanity()
        # Case 2: negative number of components.
        with self.assertRaises(ValueError):
            SIF(W2V, components=-1)._check_parameter_sanity()
        # Case 3: every word weight replaced by 3 (ones + 2).
        with self.assertRaises(ValueError):
            heavy = SIF(W2V)
            heavy.word_weights = np.ones_like(heavy.word_weights) + 2
            heavy._check_parameter_sanity()

    def test_pre_train_calls(self):
        """Smoke test: ``_pre_train_calls`` runs on the fixture model."""
        sif = self.model
        sif._pre_train_calls()

    def test_post_train_calls(self):
        """All-ones vectors are expected to be ~0 after ``_post_train_calls``."""
        sif = self.model
        sif.sv.vectors = np.ones((200, 10), dtype=np.float32)
        sif._post_train_calls()
        vectors_are_zero = np.allclose(sif.sv.vectors, 0, atol=1e-5)
        self.assertTrue(vectors_are_zero)

    def test_post_train_calls_no_removal(self):
        """With ``components = 0`` the all-ones vectors must stay ~1."""
        sif = self.model
        sif.components = 0
        sif.sv.vectors = np.ones((200, 10), dtype=np.float32)
        sif._post_train_calls()
        vectors_unchanged = np.allclose(sif.sv.vectors, 1, atol=1e-5)
        self.assertTrue(vectors_unchanged)

    def test_post_inference_calls(self):
        """``_post_inference_calls`` must bring an all-ones ``output`` to ~0."""
        sif = self.model
        sif.sv.vectors = np.ones((200, 10), dtype=np.float32)
        sif._post_train_calls()

        inferred = np.ones((200, 10), dtype=np.float32)
        # The assertion inspects ``inferred`` itself, so the call is expected
        # to modify the array in place.
        sif._post_inference_calls(output=inferred)
        output_is_zero = np.allclose(inferred, 0, atol=1e-5)
        self.assertTrue(output_is_zero)

    def test_post_inference_calls_no_svd(self):
        """With ``svd_res`` set to None the inference hook must raise."""
        sif = self.model
        sif.sv.vectors = np.ones((200, 10), dtype=np.float32)
        sif.svd_res = None
        with self.assertRaises(RuntimeError):
            sif._post_inference_calls(output=None)

    def test_post_inference_calls_no_removal(self):
        """With ``components = 0`` both hooks must leave the vectors at ~1."""
        sif = self.model
        sif.components = 0
        sif.sv.vectors = np.ones((200, 10), dtype=np.float32)
        sif._post_train_calls()
        sif._post_inference_calls(output=None)
        vectors_unchanged = np.allclose(sif.sv.vectors, 1, atol=1e-5)
        self.assertTrue(vectors_unchanged)

    def test_dtype_sanity_word_weights(self):
        """Integer-typed ``word_weights`` must fail the dtype check."""
        sif = self.model
        int_weights = np.ones_like(sif.word_weights, dtype=int)
        sif.word_weights = int_weights
        with self.assertRaises(TypeError):
            sif._check_dtype_santiy()

    def test_dtype_sanity_svd_vals(self):
        """An integer array as first ``svd_res`` item must fail the check."""
        sif = self.model
        int_values = np.ones_like(sif.word_weights, dtype=int)
        float_vectors = np.array(0, dtype=np.float32)
        sif.svd_res = (int_values, float_vectors)
        with self.assertRaises(TypeError):
            sif._check_dtype_santiy()

    def test_dtype_sanity_svd_vecs(self):
        """An integer array as second ``svd_res`` item must fail the check."""
        sif = self.model
        float_values = np.array(0, dtype=np.float32)
        int_vectors = np.ones_like(sif.word_weights, dtype=int)
        sif.svd_res = (float_values, int_vectors)
        with self.assertRaises(TypeError):
            sif._check_dtype_santiy()

    def test_compute_sif_weights(self):
        """Check the weight computed for ``"Good"``: ``alpha / (alpha + pw)``."""
        w = "Good"
        # NOTE(review): pw looks like count / corpus size (about
        # 21 / 1095661426) for the "en" frequency list - confirm.
        pw = 1.916650481770269e-08
        alpha = self.model.alpha
        sif = alpha / (alpha + pw)

        idx = self.model.wv.vocab[w].index
        self.model._compute_sif_weights()
        self.assertTrue(np.allclose(self.model.word_weights[idx], sif))

    def test_train(self):
        output = self.model.train(self.sentences)
        self.assertEqual((100, 1450), output)
        self.assertTrue(np.isfinite(self.model.sv.vectors).all())