def test_SIF_works(self):
    """
    Check that components_to_remove is accepted by grouped embedding merging.

    The pipeline is run once with ``normalize=True`` and once with
    ``normalize=False``; in both results every entry must have a length
    equal to the model's ``vector_size``.
    """
    def run_pipeline(normalize):
        # Fresh fixtures are built for every run, exactly one call each.
        return embedding.groupedEmbedding(
            given_tdocs(),
            given_tgroups(),
            model=given_incomplete_keyedVectors(),
            weights=given_normalized_tweights(),
            word2SentenceMerge=mergingmethods.sumMerge,
            sentence2GroupMerge=mergingmethods.avgMerge,
            sentence2GroupKwargs={
                "components_to_remove": 1,
                "normalize": normalize,
            },
            verbose=False)

    # The normalized run is only there to integration test normalization
    normalized = run_pipeline(True)
    plain = run_pipeline(False)

    expected_dim = given_incomplete_keyedVectors().vector_size
    for group in plain:
        self.assertEqual(len(plain[group]), expected_dim)
        self.assertEqual(len(normalized[group]), expected_dim)
def test_given_oov_applies_correctly(self):
    """
    Test that OOV words apply equally and don't break the pipeline.

    Uses the incomplete keyed-vectors fixture and checks that merging with
    ``np.mean`` over axis 0 at both stages matches merging with
    ``mergingmethods.avgMerge`` at both stages.
    """
    docs = given_tdocs()
    groups = given_tgroups()
    vectors = given_incomplete_keyedVectors()

    # Arguments shared by both pipeline runs
    common = {"model": vectors, "weights": None, "verbose": False}
    axis0 = {"axis": 0}

    via_numpy = embedding.groupedEmbedding(
        docs,
        groups,
        word2SentenceMerge=np.mean,
        word2SentenceKwargs=axis0,
        sentence2GroupMerge=np.mean,
        sentence2GroupKwargs={"axis": 0},
        **common)

    via_avg_merge = embedding.groupedEmbedding(
        docs,
        groups,
        word2SentenceMerge=mergingmethods.avgMerge,
        sentence2GroupMerge=mergingmethods.avgMerge,
        **common)

    for group in via_avg_merge:
        np.testing.assert_array_almost_equal(
            via_numpy[group], via_avg_merge[group])
def test_given_unique_pooling_applies_correctly(self):
    """
    Test that a function from mergingmethods.py matches its tautological
    equivalent when word-to-sentence merging is 'unique'.

    The group-level merge is done once with ``np.mean`` over axis 0 and once
    with ``mergingmethods.avgMerge``; both results must agree per key.
    """
    docs = given_tdocs()
    groups = given_tgroups()
    vectors = given_mock_keyedVectors()

    def embed(**group_merge_args):
        # Only the sentence-to-group merge differs between the two runs.
        return embedding.groupedEmbedding(
            docs,
            groups,
            model=vectors,
            word2SentenceMerge='unique',
            verbose=False,
            **group_merge_args)

    via_numpy = embed(
        sentence2GroupMerge=np.mean,
        sentence2GroupKwargs={"axis": 0})
    via_avg_merge = embed(sentence2GroupMerge=mergingmethods.avgMerge)

    for group in via_avg_merge:
        np.testing.assert_array_almost_equal(
            via_numpy[group], via_avg_merge[group])
def test_given_list_input_input_untouched(self):
    """
    Test that list inputs are still lists after embedding, and that the
    result has one entry per distinct group label.
    """
    tdocs = given_tdocs()
    tgroups = given_tgroups()
    w2vec = given_mock_keyedVectors()
    gembeddings = embedding.groupedEmbedding(
        tdocs, tgroups, model=w2vec, verbose=False)
    # One assertIsInstance per input so a failure names the offending object.
    # NOTE(review): only the container types are checked here, not that the
    # contents are unmodified.
    self.assertIsInstance(tdocs, list, msg="input should be unchanged")
    self.assertIsInstance(tgroups, list, msg="input should be unchanged")
    self.assertEqual(
        len(gembeddings),
        len(set(tgroups)),
        msg="groupe dict should be # of categories")
def test_given_weights_tautological_works(self):
    """
    Test that weights work normally in a tautological check.

    Runs the pipeline with normalized weights and ``sumMerge`` at both
    stages, then checks that each resulting entry has a length equal to the
    model's ``vector_size``.
    """
    summed = embedding.groupedEmbedding(
        given_tdocs(),
        given_tgroups(),
        model=given_incomplete_keyedVectors(),
        weights=given_normalized_tweights(),
        word2SentenceMerge=mergingmethods.sumMerge,
        sentence2GroupMerge=mergingmethods.sumMerge,
        verbose=False)

    expected_dim = given_incomplete_keyedVectors().vector_size
    for group in summed:
        self.assertEqual(len(summed[group]), expected_dim)
def test_given_weight_embedding_mismatch_raises(self):
    """
    Test that truncated (mismatched) weights make groupedEmbedding raise.

    The test passes when the call raises ValueError, or IndexError, which
    can happen on the edge case of a dropped word.
    """
    bad_weights = given_normalized_tweights()
    bad_weights[0] = bad_weights[0][:-1]
    # NOTE(review): entry 1 is built from the already-shortened entry 0,
    # not from entry 1 itself -- confirm this is intended.
    bad_weights[1] = bad_weights[0][:-2]
    # assertRaises accepts a tuple, so no manual IndexError -> ValueError
    # translation (or throwaway result variable) is needed.
    with self.assertRaises((ValueError, IndexError)):
        embedding.groupedEmbedding(
            given_tdocs(),
            given_tgroups(),
            model=given_incomplete_keyedVectors(),
            weights=bad_weights,
            word2SentenceMerge=mergingmethods.sumMerge,
            sentence2GroupMerge=mergingmethods.sumMerge,
            verbose=False)