def test_vectorize_should_ignore_unused_words (self): text = 'legal ultra' vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.2d.txt')) mock = mock_open() mock.return_value.__iter__.return_value = iter([ 'legal 1 2', 'nothing 12 13', 'ultra 4 5']) with patch('lib.embedding_vectorizer.open', mock): result = vectorizer.transform([text]) mock.assert_called_with('glove.2d.txt') self.assertEqual([[2.5, 3.5]], result.tolist())
def test_vectorize_sentence (self): text = 'abobrinha pepino mamao' vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.txt')) mock = mock_open() mock.return_value.__iter__.return_value = iter([ 'abobrinha 1 2 3', 'pepino 4 5 6', 'mamao 7 8 9']) with patch('lib.embedding_vectorizer.open', mock): result = vectorizer.transform([text]) mock.assert_called_with('glove.txt') self.assertEqual([[4, 5, 6]], result.tolist())
def get_extractor(extractor_name, ngram_range, embedding_file=None): if extractor_name == 'tfidf': return TfidfVectorizer(ngram_range=(1, int(ngram_range))), chi2, { 'feature_selection__k': [100, 300, 500, 'all'] } elif extractor_name == 'embeddings_glove': return AverageEmbeddingVectorizer( GloveLoader(embedding_file)), f_classif, { 'feature_selection__k': ['all'] } else: return AverageEmbeddingVectorizer( SELoader(embedding_file)), f_classif, { 'feature_selection__k': ['all'] }
def test_sentences_with_no_word_in_index_with_3d (self): corpus = ['natal sem nenhuma palavra', 'outra sentenca'] vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.3d.txt')) mock = mock_open() mock.return_value.__iter__.return_value = iter([ 'outra 1 1 1', 'sentenca 1 1 1', 'ultra 4 5 6']) with patch('lib.embedding_vectorizer.open', mock): result = vectorizer.transform(corpus) mock.assert_called_with('glove.3d.txt') self.assertEqual([ [0.0, 0.0, 0.0], [1.0, 1.0, 1.0]], result.tolist())
def test_vectorize_should_ignore_words_not_in_embeddings (self): corpus = ['nova alternativa de jogo', 'alternativa terceira alternativa legal', 'outra sentenca'] vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.2d.txt')) mock = mock_open() mock.return_value.__iter__.return_value = iter([ 'nova 3 9', 'nada 3 10', 'de 9 1', 'jogo 12 2', 'terceira 4 11', 'outra 13 9', 'sentenca 1 1', 'legal 1 2', 'nothing 12 13', 'ultra 4 5']) with patch('lib.embedding_vectorizer.open', mock): result = vectorizer.transform(corpus) mock.assert_called_with('glove.2d.txt') self.assertEqual([ [8.0, 4.0], [2.5, 6.5], [7.0, 5.0]], result.tolist())
def test_vectorize_should_run_on_multiple_sentences (self): corpus = ['nova alternativa de jogo', 'terceira alternativa legal', 'outra sentenca'] vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.2d.txt')) mock = mock_open() mock.return_value.__iter__.return_value = iter([ 'nova 3 9', 'alternativa 13 11', 'nada 3 10', 'de 9 1', 'jogo 13 1', 'terceira 4 11', 'outra 13 9', 'sentenca 1 1', 'legal 1 2', 'nothing 12 13', 'ultra 4 5']) with patch('lib.embedding_vectorizer.open', mock): result = vectorizer.transform(corpus) mock.assert_called_with('glove.2d.txt') self.assertAlmostEqual([ [9.5, 5.5], [6, 8], [7, 5]], result.tolist())
def test_vectorize_implements_fit_transform_interface (self): corpus = ['nova alternativa de jogo', 'alternativa terceira alternativa legal', 'outra sentenca'] vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.2d.txt')) vectorizer.fit = Mock(return_value=vectorizer) mock = mock_open() mock.return_value.__iter__.return_value = iter([ 'nova 3 9', 'nada 3 10', 'de 9 1', 'jogo 12 2', 'terceira 4 11', 'outra 13 9', 'sentenca 1 1', 'legal 1 2', 'nothing 12 13', 'ultra 4 5']) y_stub = [] with patch('lib.embedding_vectorizer.open', mock): result = vectorizer.fit_transform(corpus, y_stub) mock.assert_called_with('glove.2d.txt') self.assertEqual([ [8.0, 4.0], [2.5, 6.5], [7.0, 5.0]], result.tolist()) vectorizer.fit.assert_called_once_with(corpus, y_stub)
def test_vectorize_implements_fit_interface (self): X_stub, y_stub = [], [] vectorizer = AverageEmbeddingVectorizer( GloveLoader('glove.2d.txt')) result = vectorizer.fit(X_stub, y_stub) self.assertEqual(vectorizer, result)