def test_apply(self): transformed_vtcorp = self.transformer._apply(self.vtcorp) self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary')) transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label) text_data_name = os.path.join(self.data_root, self.loader.layout.corpus_dir, transformed_names[0]) text_obj_name = os.path.join(self.data_root, self.loader.layout.corpus_dir, transformed_names[2]) MmCorpus.serialize(text_data_name, transformed_vtcorp) transformed_vtcorp.save(text_obj_name) self.assertTrue(self.loader.has_text_corpora(self.transformation_label)) self.temporary_files.extend([ os.path.join(self.data_root, self.loader.layout.corpus_dir, transformed_name) for transformed_name in transformed_names]) transformed_vtcorp = TransformedCorpus.load(text_obj_name) self.assertIsInstance(transformed_vtcorp, TransformedCorpus) self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus) self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary')) print 'Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary) self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
def test_indexing(self):
    """Exercise integer, slice, and fancy indexing on the corpus class,
    including input validation and indexing through TransformedCorpus."""
    fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
    corpus = self.corpus_class(fname)
    docs = list(corpus)

    # integer indexing — both plain int and numpy integer must work
    for idx, doc in enumerate(docs):
        self.assertEqual(doc, corpus[idx])
        self.assertEqual(doc, corpus[np.int64(idx)])

    # slice indexing must mirror list-slicing semantics exactly
    self.assertEqual(docs, list(corpus[:]))
    self.assertEqual(docs[0:], list(corpus[0:]))
    self.assertEqual(docs[0:-1], list(corpus[0:-1]))
    self.assertEqual(docs[2:4], list(corpus[2:4]))
    self.assertEqual(docs[::2], list(corpus[::2]))
    self.assertEqual(docs[::-1], list(corpus[::-1]))

    # make sure sliced corpora can be iterated over multiple times
    c = corpus[:]
    self.assertEqual(docs, list(c))
    self.assertEqual(docs, list(c))
    self.assertEqual(len(docs), len(corpus))
    self.assertEqual(len(docs), len(corpus[:]))
    self.assertEqual(len(docs[::2]), len(corpus[::2]))

    def _get_slice(corpus, slice_):
        # assertRaises for python 2.6 takes a callable
        return corpus[slice_]

    # make sure proper input validation for sliced corpora is done
    self.assertRaises(ValueError, _get_slice, corpus, {1})
    self.assertRaises(ValueError, _get_slice, corpus, 1.0)

    # check sliced corpora that use fancy indexing
    c = corpus[[1, 3, 4]]
    self.assertEqual([d for i, d in enumerate(docs) if i in [1, 3, 4]], list(c))
    self.assertEqual([d for i, d in enumerate(docs) if i in [1, 3, 4]], list(c))
    self.assertEqual(len(corpus[[0, 1, -1]]), 3)
    self.assertEqual(len(corpus[np.asarray([0, 1, -1])]), 3)

    # check that TransformedCorpus supports indexing when the underlying
    # corpus does, and throws an error otherwise
    corpus_ = TransformedCorpus(DummyTransformer(), corpus)
    if hasattr(corpus, 'index') and corpus.index is not None:
        # DummyTransformer adds 1 to every value — verify it propagates
        # through single-item and fancy indexing alike.
        self.assertEqual(corpus_[0][0][1], docs[0][0][1] + 1)
        self.assertRaises(ValueError, _get_slice, corpus_, {1})
        transformed_docs = [
            val + 1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]
        ]
        self.assertEqual(
            transformed_docs,
            list(v for doc in corpus_[[1, 3, 4]] for _, v in doc))
        self.assertEqual(3, len(corpus_[[1, 3, 4]]))
    else:
        # no index on the underlying corpus: all indexing must raise
        self.assertRaises(RuntimeError, _get_slice, corpus_, [1, 3, 4])
        self.assertRaises(RuntimeError, _get_slice, corpus_, {1})
        self.assertRaises(RuntimeError, _get_slice, corpus_, 1.0)
def _apply(self, corpus, chunksize=None):
    """Apply transformation in :func:`__getitem__` to the entire corpus.

    Does this by returning gensim's :class:`TransformedCorpus` object
    that applies the transformation over the entire corpus. This is
    essentially a generalization of gensim's VocabTransform class with
    added facilities for backward feature mapping.

    :type corpus: gensim.interfaces.CorpusABC
    :param corpus: The corpus to transform.

    :param chunksize: Optional chunk size forwarded to
        :class:`TransformedCorpus`.

    :returns: A :class:`TransformedCorpus` wrapping ``corpus``.
    """
    # NOTE(review): the original body contained a deep-copy /
    # dictionary-filtering branch for text corpora, but an unconditional
    # `return TransformedCorpus(...)` placed before it made that branch
    # unreachable dead code; it has been removed. The reachable behavior
    # (always wrap in TransformedCorpus) is unchanged and is what the
    # accompanying test_apply asserts (`transformed.corpus.dictionary`,
    # `transformed.obj.orig2transformed`).
    if not isinstance(corpus, TextCorpus):
        logging.warn('Frequency-based transformer applied on non-text'
                     ' corpus; returning TransformedCorpus.')
    return TransformedCorpus(self, corpus, chunksize)
def _apply(self, corpus, chunksize=None, **kwargs):
    """Apply the transformation to a whole corpus and get the result as
    another corpus.

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Corpus in sparse Gensim bag-of-words format.
    chunksize : int, optional
        If provided, more effective processing (in chunks) will be
        performed.

    Returns
    -------
    :class:`~gensim.interfaces.TransformedCorpus`
        Transformed corpus.

    """
    wrapped = TransformedCorpus(self, corpus, chunksize, **kwargs)
    return wrapped
def _apply(self, corpus, chunksize=None, **kwargs):
    """Apply the transformation to a whole corpus and get the result as
    another corpus.

    Parameters
    ----------
    corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
        Training corpus. Can be either iterable of documents, which are
        lists of `(word_id, word_count)`, or a sparse csc matrix of BOWs
        for each document. If not specified, the model is left
        uninitialized (presumably, to be trained later with
        `self.train()`).
    chunksize : int, optional
        If provided, more effective processing (in chunks) will be
        performed.

    Returns
    -------
    :class:`~gensim.interfaces.TransformedCorpus`
        Transformed corpus.

    """
    result = TransformedCorpus(self, corpus, chunksize, **kwargs)
    return result
def load_tfidf_corpus(self):
    """Deserialize and return the corpus stored at ``self.tfidf_corpus``
    as a :class:`TransformedCorpus`."""
    saved_path = self.tfidf_corpus
    return TransformedCorpus.load(saved_path)