def _embedd_tokens(
    tokens: Collection[List[str]], language: str, progress_callback: Callable
) -> Tuple[np.ndarray, np.ndarray, Dict[str, Set[int]]]:
    """
    Embed documents and words and create a mapping dictionary between
    words and documents
    """
    # extract words
    word2doc = defaultdict(set)
    for i, doc_tokens in enumerate(tokens):
        for t in doc_tokens:
            word2doc[t].add(i)
    words = list(word2doc.keys())

    # TODO: currently embedders report success; unify them to report progress as a float
    ticks = iter(np.linspace(0, 1, len(tokens) + len(words)))

    def emb_cb(success: bool):
        if success:
            progress_callback(next(ticks))

    # embed documents
    embedder = DocumentEmbedder(language=language)
    # tokens is transformed to a list in case it is an np.ndarray
    doc_embs = np.array(embedder.transform(list(tokens), emb_cb))
    # embed words
    word_embs = np.array(embedder.transform([[w] for w in words], emb_cb))

    return doc_embs, word_embs, word2doc
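# A minimal usage sketch of _embedd_tokens, not part of the original module:
# the toy token lists and the print-based progress callback are illustrative
# assumptions, and the call requires the embedding server to be reachable.
tokens = [["human", "machine", "interface"], ["survey", "user", "opinion"]]

def report(progress: float) -> None:
    # toy progress callback: print the fraction of work completed
    print(f"{progress:.0%} done")

doc_embs, word_embs, word2doc = _embedd_tokens(tokens, "en", report)
print(doc_embs.shape)     # (2, embedding_dim): one row per document
print(word_embs.shape)    # (6, embedding_dim): one row per distinct word
print(word2doc["human"])  # {0}: indices of documents containing the word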
def run_pretrained_embedder(
    corpus: Corpus,
    language: str,
    aggregator: str,
    state: TaskState
) -> Tuple[Corpus, Corpus]:
    """Runs DocumentEmbedder.

    Parameters
    ----------
    corpus : Corpus
        Corpus on which transform is performed.
    language : str
        ISO 639-1 (two-letter) code of the desired language.
    aggregator : str
        Aggregator which creates a document embedding (single vector) from
        word embeddings (multiple vectors). Allowed values are mean, sum,
        max, min.
    state : TaskState
        State object.

    Returns
    -------
    Corpus
        New corpus with additional features.
    Corpus
        Corpus of documents that were skipped (not embedded).
    """
    embedder = DocumentEmbedder(language=language, aggregator=aggregator)

    ticks = iter(np.linspace(0., 100., len(corpus)))

    def advance(success=True):
        if state.is_interruption_requested():
            embedder.set_cancelled()
        if success:
            state.set_progress_value(next(ticks))

    new_corpus, skipped_corpus = embedder(corpus, processed_callback=advance)
    return new_corpus, skipped_corpus
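# run_pretrained_embedder only calls two TaskState methods
# (is_interruption_requested, set_progress_value), so a stand-in object is
# enough to run it outside a widget. This stub and the call below are an
# illustrative sketch, not part of Orange; the Corpus import path is assumed.
from orangecontrib.text import Corpus  # assumed import path

class _DummyState:
    def is_interruption_requested(self) -> bool:
        return False  # never request cancellation in this sketch

    def set_progress_value(self, value: float) -> None:
        print(f"progress: {value:.1f}%")

corpus = Corpus.from_file("deerwester")
new_corpus, skipped_corpus = run_pretrained_embedder(
    corpus, language="en", aggregator="mean", state=_DummyState()
)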
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)

    cb_part = len(corpus) / (len(corpus) + len(words))

    document_embeddings, skipped = emb.transform(
        corpus, wrap_callback(callback, 0, cb_part)
    )
    assert skipped is None

    words = [[w] for w in words]
    word_embeddings = np.array(
        emb.transform(words, wrap_callback(callback, cb_part, 1 - cb_part))
    )
    return cosine_similarity(document_embeddings.X, word_embeddings)
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    ticks = iter(np.linspace(0, 0.8, len(corpus) + len(words)))

    # TODO: currently embedders report success; unify them to report progress as a float
    def emb_cb(success: bool):
        if success:
            callback(next(ticks))

    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)
    document_embeddings, skipped = emb(corpus, emb_cb)
    assert skipped is None
    word_embeddings = np.array(emb([[w] for w in words], emb_cb))
    return cosine_similarity(document_embeddings.X, word_embeddings)
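# The matrix returned by _embedding_similarity has one row per document and
# one column per query word. A self-contained sketch with made-up
# 3-dimensional embeddings illustrates the shape and value range.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

doc_vectors = np.array([[1.0, 0.0, 0.0],
                        [0.0, 1.0, 1.0]])
word_vectors = np.array([[1.0, 0.0, 0.0],
                         [0.0, 1.0, 0.0],
                         [0.0, 0.0, 1.0]])

sims = cosine_similarity(doc_vectors, word_vectors)
print(sims.shape)      # (2, 3): one score per (document, word) pair
print(sims.round(2))   # [[1.   0.   0.  ]
                       #  [0.   0.71 0.71]]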
class DocumentEmbedderTest(unittest.TestCase):

    def setUp(self):
        self.embedder = DocumentEmbedder()  # default params
        self.corpus = Corpus.from_file('deerwester')

    def tearDown(self):
        self.embedder.clear_cache()

    @patch(PATCH_METHOD)
    def test_with_empty_corpus(self, mock):
        self.assertEqual(len(self.embedder(self.corpus[:0])), 0)
        mock.request.assert_not_called()
        mock.get_response.assert_not_called()
        self.assertEqual(self.embedder._embedder._cache._cache_dict, dict())

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_subset(self):
        res = self.embedder(self.corpus[[0]])
        assert_array_equal(res.X, [[0.3, 1]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_shapes(self):
        res = self.embedder(self.corpus)
        self.assertEqual(res.X.shape, (len(self.corpus), 2))
        self.assertEqual(len(res.domain), len(self.corpus.domain) + 2)

    @patch(PATCH_METHOD, make_dummy_post(b''))
    def test_empty_response(self):
        with self.assertWarns(RuntimeWarning):
            res = self.embedder(self.corpus[[0]])
        self.assertEqual(res.X.shape, (0, 0))
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'str'))
    def test_invalid_response(self):
        with self.assertWarns(RuntimeWarning):
            res = self.embedder(self.corpus[[0]])
        self.assertEqual(res.X.shape, (0, 0))
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}'))
    def test_invalid_json_key(self):
        with self.assertWarns(RuntimeWarning):
            res = self.embedder(self.corpus[[0]])
        self.assertEqual(res.X.shape, (0, 0))
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_persistent_caching(self):
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder(self.corpus[[0]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder.clear_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_languages(self):
        embedder = DocumentEmbedder(language='sl')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(language='sl')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.clear_cache()
        self.embedder.clear_cache()

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_aggregators(self):
        embedder = DocumentEmbedder(aggregator='max')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(aggregator='min')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_with_statement(self):
        with self.embedder as embedder:
            res = embedder(self.corpus[[0]])
            assert_array_equal(res.X, [[0.3, 1]])

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cancel(self):
        self.assertFalse(self.embedder._embedder._cancelled)
        self.embedder._embedder._cancelled = True
        with self.assertRaises(Exception):
            self.embedder(self.corpus[[0]])

    @patch(PATCH_METHOD, side_effect=OSError)
    def test_connection_error(self, _):
        embedder = DocumentEmbedder()
        with self.assertRaises(ConnectionError):
            embedder(self.corpus[[0]])

    def test_invalid_parameters(self):
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(language='eng')
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(aggregator='average')

    def test_invalid_corpus_type(self):
        with self.assertRaises(ValueError):
            self.embedder(self.corpus[0])
class DocumentEmbedderTest(unittest.TestCase):

    def setUp(self):
        self.embedder = DocumentEmbedder()  # default params
        self.corpus = Corpus.from_file('deerwester')

    def tearDown(self):
        self.embedder.clear_cache()

    @patch(PATCH_METHOD)
    def test_with_empty_corpus(self, mock):
        self.assertIsNone(self.embedder.transform(self.corpus[:0])[0])
        self.assertIsNone(self.embedder.transform(self.corpus[:0])[1])
        mock.request.assert_not_called()
        mock.get_response.assert_not_called()
        self.assertEqual(self.embedder._embedder._cache._cache_dict, dict())

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_subset(self):
        res, skipped = self.embedder.transform(self.corpus[[0]])
        assert_array_equal(res.X, [[0.3, 1]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.assertIsNone(skipped)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_success_shapes(self):
        res, skipped = self.embedder.transform(self.corpus)
        self.assertEqual(res.X.shape, (len(self.corpus), 2))
        self.assertEqual(
            len(res.domain.variables), len(self.corpus.domain.variables) + 2
        )
        self.assertIsNone(skipped)

    @patch(PATCH_METHOD, make_dummy_post(b''))
    def test_empty_response(self):
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'str'))
    def test_invalid_response(self):
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embeddings": [0.3, 1]}'))
    def test_invalid_json_key(self):
        with self.assertWarns(RuntimeWarning):
            res, skipped = self.embedder.transform(self.corpus[[0]])
        self.assertIsNone(res)
        self.assertEqual(len(skipped), 1)
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_persistent_caching(self):
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder.transform(self.corpus[[0]])
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 1)
        self.embedder.clear_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_languages(self):
        embedder = DocumentEmbedder(language='sl')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        self.embedder = DocumentEmbedder()
        self.assertEqual(len(self.embedder._embedder._cache._cache_dict), 0)
        self.embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(language='sl')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.clear_cache()
        self.embedder.clear_cache()

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cache_for_different_aggregators(self):
        embedder = DocumentEmbedder(aggregator='max')
        embedder.clear_cache()
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 0)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder._embedder._cache.persist_cache()

        embedder = DocumentEmbedder(aggregator='min')
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 1)
        embedder.transform(self.corpus[[0]])
        self.assertEqual(len(embedder._embedder._cache._cache_dict), 2)

    @patch(PATCH_METHOD, make_dummy_post(b'{"embedding": [0.3, 1]}'))
    def test_cancel(self):
        self.assertFalse(self.embedder._embedder._cancelled)
        self.embedder._embedder._cancelled = True
        with self.assertRaises(Exception):
            self.embedder.transform(self.corpus[[0]])

    @patch(PATCH_METHOD, side_effect=OSError)
    def test_connection_error(self, _):
        embedder = DocumentEmbedder()
        with self.assertRaises(ConnectionError):
            embedder.transform(self.corpus[[0]])

    def test_invalid_parameters(self):
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(language='eng')
        with self.assertRaises(ValueError):
            self.embedder = DocumentEmbedder(aggregator='average')

    def test_remove_temporary_proxy_solution(self):
        """
        When it starts to fail:
        - remove this test
        - remove the temporary implementation of the get_proxy() function
          in text.__init__
        - set the minimum required Orange version to 3.33
        """
        import Orange

        self.assertGreater("3.34.0", Orange.__version__)
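# The tests above patch PATCH_METHOD with make_dummy_post(...), neither of
# which is shown in this section. A plausible sketch, assuming the embedder
# issues its requests through httpx.AsyncClient.post; the patch target, the
# response wrapper, and the helper's signature are assumptions, not the
# project's actual definitions.
import asyncio
from collections import namedtuple

PATCH_METHOD = "httpx.AsyncClient.post"  # assumed patch target

DummyResponse = namedtuple("DummyResponse", ["content"])

def make_dummy_post(response_content: bytes, sleep: float = 0):
    """Return an async stand-in for the patched post method that always
    answers with `response_content` as the response body."""
    async def dummy_post(url, *args, **kwargs):
        await asyncio.sleep(sleep)
        return DummyResponse(content=response_content)
    return dummy_post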