import re

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer


class HashTfidfVectoriser:

    def __init__(self, n_features):
        self.hashing_vectoriser = HashingVectorizer(n_features=n_features,
                                                    alternate_sign=False)
        self.tfidf_transformer = TfidfTransformer()
        self.words_by_hashes_dict = {}
        self.last_data = None

    def words_by_hash(self, hash):
        # Look up which words were hashed to the given column index.
        return self.words_by_hashes_dict[hash]

    def fit_transform(self, data):
        # Keep a copy of the raw documents, then strip digits without
        # mutating the caller's list.
        self.last_data = data[:]
        data = [re.sub(r"\d+", "", doc) for doc in data]

        # Rebuild the hash -> words mapping so collisions can be inspected.
        # Each single-word "document" yields exactly one non-zero entry,
        # so the CSR `indices` array lines up with `unique_words`.
        self.words_by_hashes_dict = {}
        words_list = self.hashing_vectoriser.build_analyzer()("\n".join(data))
        unique_words = list(set(words_list))
        hashes = self.hashing_vectoriser.transform(unique_words).indices
        for w, h in zip(unique_words, hashes):
            self.words_by_hashes_dict.setdefault(h, []).append(w)

        # HashingVectorizer is stateless, so fit_transform == transform here;
        # only the TF-IDF transformer actually learns anything (the IDF).
        return self.tfidf_transformer.fit_transform(
            self.hashing_vectoriser.fit_transform(data))
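# A minimal usage sketch of the class above; the two-document toy corpus is
# made up for illustration. The hash -> words mapping lets us recover which
# words landed in a given TF-IDF column.
vectoriser = HashTfidfVectoriser(n_features=2 ** 10)
tfidf = vectoriser.fit_transform(['the cat sat on the mat', 'the dog sat'])
used_column = tfidf.indices[0]                  # a column that is in use
print(vectoriser.words_by_hash(used_column))    # words hashed to that column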
def cleaner_str(s):
    # Reuse the HashingVectorizer analyzer as a simple text cleaner: it
    # lowercases, tokenizes and drops English stop words, and we then
    # rejoin the surviving tokens into a single string.
    cleaner = HashingVectorizer(decode_error='ignore', analyzer='word',
                                ngram_range=(1, 1), stop_words='english')
    c = cleaner.build_analyzer()
    return " ".join(c(s))
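# A short, hypothetical usage example of cleaner_str (the input string is
# made up for illustration): stop words such as "the", "are" and "on" are
# dropped and the remaining tokens are lowercased.
print(cleaner_str("The cats are sitting on the mat."))
# expected output along the lines of: 'cats sitting mat'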
# This mapping is completely stateless and the dimensionality of the output space is explicitly fixed in advance (here we use a modulo `2 ** 20`, which means roughly 1M dimensions). This makes it possible to work around the limitations of the vocabulary-based vectorizers, both for parallelizability and for online / out-of-core learning.

# %% [markdown] {"deletable": true, "editable": true}
# The `HashingVectorizer` class is an alternative to the `CountVectorizer` (or the `TfidfVectorizer` class with `use_idf=False`) that internally uses the murmurhash hash function:

# %% {"deletable": true, "editable": true}
from sklearn.feature_extraction.text import HashingVectorizer

h_vectorizer = HashingVectorizer(encoding='latin-1')
h_vectorizer

# %% [markdown] {"deletable": true, "editable": true}
# It shares the same "preprocessor", "tokenizer" and "analyzer" infrastructure:

# %% {"deletable": true, "editable": true}
analyzer = h_vectorizer.build_analyzer()
analyzer('This is a test sentence.')

# %% [markdown] {"deletable": true, "editable": true}
# We can vectorize our datasets into a scipy sparse matrix exactly as we would have done with the `CountVectorizer` or `TfidfVectorizer`, except that we can directly call the `transform` method: there is no need to `fit`, as `HashingVectorizer` is a stateless transformer:

# %% {"deletable": true, "editable": true}
docs_train, y_train = train['data'], train['target']
docs_valid, y_valid = test['data'][:12500], test['target'][:12500]
docs_test, y_test = test['data'][12500:], test['target'][12500:]

# %% [markdown] {"deletable": true, "editable": true}
# The dimension of the output is fixed ahead of time to `n_features=2 ** 20` by default (nearly 1M features) to minimize the rate of collisions on most classification problems while keeping the linear models reasonably sized (1M weights in the `coef_` attribute):

# %% {"deletable": true, "editable": true}
h_vectorizer.transform(docs_train)
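# %% [markdown] {"deletable": true, "editable": true}
# As a quick sanity check of the statelessness claim (the toy corpus below is made up for illustration), two independently constructed `HashingVectorizer` instances map the same documents to exactly the same columns, without any `fit` step or shared vocabulary:

# %% {"deletable": true, "editable": true}
toy_docs = ['the cat sat on the mat', 'the dog sat on the log']
a = HashingVectorizer(encoding='latin-1').transform(toy_docs)
b = HashingVectorizer(encoding='latin-1').transform(toy_docs)
(a != b).nnz == 0  # True: the two sparse matrices are identical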