def fit_transform(self, raw_documents, y=None, vector_source=None,
                  stats_hdf_file=None, cv_fold=-1,
                  train_time_extractor=None, decode_time_extractor=None):
    """
    Fit the vectorizer on the training documents and return the
    document-term matrix together with the learnt vocabulary.

    :param raw_documents: iterable of training documents
    :param y: ignored; present for sklearn pipeline compatibility
    :param vector_source: thesaurus/vector source passed to the token handler
    :param stats_hdf_file: name of the HDF file stats are written to;
        remembered so decode-time stats go to the same file
    :param cv_fold: index of the current cross-validation fold
    :param train_time_extractor: feature extractor used at train time
    :param decode_time_extractor: feature extractor used at decode time
    :return: tuple of (sparse document-term matrix, fitted vocabulary dict)
    :raises ValueError: if max_df/min_df resolve to contradictory counts
    """
    self.cv_fold = cv_fold
    self.feature_extractor = train_time_extractor
    self.decode_time_extractor = decode_time_extractor
    self.thesaurus = vector_source
    self.handler = get_token_handler(self.train_token_handler,
                                     self.k,
                                     self.sim_compressor,
                                     self.thesaurus)
    # remember where stats were requested to go so that decode time can
    # record to the same name; a separate recorder (prefix 'ev') is built
    # later for the test data
    self.stats_hdf_file_ = stats_hdf_file
    self.stats = get_stats_recorder(self.debug_level, stats_hdf_file,
                                    'tr', cv_fold, self.k)

    # ---- BEGIN modified copy of super().fit_transform ----
    # unlike the sklearn original, this version tolerates an empty vocabulary
    self._validate_vocabulary()
    max_df, min_df = self.max_df, self.min_df
    max_features = self.max_features

    vocabulary, doc_term_mat = self._count_vocab(raw_documents,
                                                 self.fixed_vocabulary_)
    doc_term_mat = doc_term_mat.tocsc()
    if self.binary:
        doc_term_mat.data.fill(1)

    if not self.fixed_vocabulary_:
        if vocabulary:
            doc_term_mat = self._sort_features(doc_term_mat, vocabulary)

            n_docs = doc_term_mat.shape[0]
            # fractional df thresholds are relative to the corpus size
            if isinstance(max_df, numbers.Integral):
                max_doc_count = max_df
            else:
                max_doc_count = int(round(max_df * n_docs))
            if isinstance(min_df, numbers.Integral):
                min_doc_count = min_df
            else:
                min_doc_count = int(round(min_df * n_docs))
            if max_doc_count < min_doc_count:
                raise ValueError(
                    "max_df corresponds to < documents than min_df")
            doc_term_mat, self.stop_words_ = \
                self._limit_features(doc_term_mat, vocabulary,
                                     max_doc_count, min_doc_count,
                                     max_features)
        # NOTE(review): placed at this level to mirror sklearn's
        # fit_transform; the flattened source is ambiguous here — confirm
        self.vocabulary_ = vocabulary
    # ---- END modified copy of super().fit_transform ----

    # report the LRU-cache hit rate of the thesaurus NN lookup, if it has one
    if self.thesaurus:
        nn_fn = getattr(self.thesaurus, 'get_nearest_neighbours', None)
        if nn_fn is not None and hasattr(nn_fn, 'cache_info'):
            logging.info('NN cache info: %s', nn_fn.cache_info())

    logging.info('Matrix shape is %r after vectorization', doc_term_mat.shape)
    return doc_term_mat, self.vocabulary_
def transform(self, raw_documents):
    """
    Vectorise unseen (test) documents with the previously fitted vocabulary.

    :param raw_documents: iterable of documents to encode
    :return: tuple of (sparse document-term matrix, fitted vocabulary dict)
    :raises ValueError: if the vectorizer was never fitted or the fitted
        vocabulary is empty
    """
    # switch to the decode-time feature extractor for the test data
    self.feature_extractor = self.decode_time_extractor

    if not hasattr(self, 'vocabulary_'):
        self._check_vocabulary()
    if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
        raise ValueError("Vocabulary wasn't fitted or is empty!")

    # test-set stats are recorded separately, under the 'ev' prefix
    self.stats = get_stats_recorder(self.debug_level, self.stats_hdf_file_,
                                    'ev', self.cv_fold, self.k)

    if self.random_neighbour_thesaurus:
        # a bit of a hack and a waste of effort: a real thesaurus has
        # already been loaded by this point
        logging.info('Building random neighbour vector source with vocabulary of size %d', len(self.vocabulary_))
        self.thesaurus.k = self.k
        self.thesaurus.vocab = list(self.vocabulary_.keys())

    self.handler = get_token_handler(self.decode_token_handler, self.k,
                                     self.sim_compressor, self.thesaurus)

    # todo can't populate the vector source at this stage of the pipeline:
    # feature selection may still change the vocabulary, and populating now
    # would attempt to compose features that FS has not yet removed
    # if self.thesaurus:
    #     logging.info('Populating vector source %s prior to transform', self.thesaurus)
    #     self.thesaurus.populate_vector_space(self.vocabulary_.keys())

    # ---- BEGIN modified copy of super().transform ----
    # unlike the sklearn original, this version tolerates an empty vocabulary
    _, doc_term_mat = self._count_vocab(raw_documents, fixed_vocab=True)
    if self.binary:
        doc_term_mat.data.fill(1)
    # ---- END modified copy of super().transform ----

    # report the LRU-cache hit rate of the thesaurus NN lookup, if it has one
    if self.thesaurus:
        nn_fn = getattr(self.thesaurus, 'get_nearest_neighbours', None)
        if nn_fn is not None and hasattr(nn_fn, 'cache_info'):
            logging.info('NN cache info: %s', nn_fn.cache_info())

    return doc_term_mat, self.vocabulary_