-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_feature_vect.py
578 lines (523 loc) · 23.1 KB
/
my_feature_vect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
# so far basically just copied from
#https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/feature_extraction/text.py#L511
# the idea is to use this as a stub to create our own feature extractor
from __future__ import unicode_literals
import array
from collections import Mapping, defaultdict
import numbers
from operator import itemgetter
import re
import unicodedata
import numpy as np
import scipy.sparse as sp
import six
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import VectorizerMixin
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.hashing import FeatureHasher
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.utils import deprecated
from sklearn.utils.fixes import frombuffer_empty, bincount
from sklearn.utils.validation import check_is_fitted
def _document_frequency(X):
"""Count the number of non-zero values for each feature in sparse X."""
if sp.isspmatrix_csr(X):
return bincount(X.indices, minlength=X.shape[1])
else:
return np.diff(sp.csc_matrix(X, copy=False).indptr)
class StanceVectorizer(BaseEstimator, VectorizerMixin):
def __init__(self, input='content', encoding='utf-8',
decode_error='strict', strip_accents=None,
lowercase=True, preprocessor=None, tokenizer=None,
stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1), analyzer='word',
max_df=1.0, min_df=1, max_features=None,
vocabulary=None, binary=False, dtype=np.int64):
self.input = input
self.encoding = encoding
self.decode_error = decode_error
self.strip_accents = strip_accents
self.preprocessor = preprocessor
self.tokenizer = tokenizer
self.analyzer = analyzer
self.lowercase = lowercase
self.token_pattern = token_pattern
self.stop_words = stop_words
self.max_df = max_df
self.min_df = min_df
if max_df < 0 or min_df < 0:
raise ValueError("negative value for max_df or min_df")
self.max_features = max_features
if max_features is not None:
if (not isinstance(max_features, numbers.Integral) or
max_features <= 0):
raise ValueError(
"max_features=%r, neither a positive integer nor None"
% max_features)
self.ngram_range = ngram_range
self.vocabulary = vocabulary
self.binary = binary
self.dtype = dtype
def fit(self, raw_documents, y=None):
"""Learn a vocabulary dictionary of all tokens in the raw documents.
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
Returns
-------
self
"""
self.fit_transform(raw_documents)
return self
def fit_transform(self, raw_documents, y=None):
"""Learn the vocabulary dictionary and return term-document matrix.
Each document is a tuple of headline,body
This is equivalent to fit followed by transform, but more efficiently
implemented.
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
Returns
-------
X : array, [n_samples, n_features]
Document-term matrix.
"""
vocabulary, X = self._extract_features(raw_documents)
self.vocabulary_ = vocabulary
return X
def transform(self, raw_documents):
"""Transform documents to document-term matrix.
Extract token counts out of raw text documents using the vocabulary
fitted with fit or the one provided to the constructor.
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
Returns
-------
X : sparse matrix, [n_samples, n_features]
Document-term matrix.
"""
if isinstance(raw_documents, six.string_types):
raise ValueError(
"Iterable over raw text documents expected, "
"string object received.")
if not hasattr(self, 'vocabulary_'):
self._validate_vocabulary()
self._check_vocabulary()
# use the same matrix-building strategy as fit_transform
_, X = self._extract_features(raw_documents)
if self.binary:
X.data.fill(1)
return X
def _extract_features(self,raw_documents):
j_indices = []
indptr = _make_int_array()
values = _make_int_array()
indptr.append(0)
vocabulary = defaultdict(int)
for doc in raw_documents:
feature_counter = {}
#for feature,feature_idx in contains_keywords(doc).items():
for feature,feature_idx in self.body_words_in_headline(doc).items():
vocabulary[feature]+=1
try:
if feature_idx not in feature_counter:
feature_counter[feature_idx] = 1
else:
feature_counter[feature_idx] += 1
except KeyError:
# Ignore out-of-vocabulary items for fixed_vocab=True
continue
j_indices.extend(feature_counter.keys())
values.extend(feature_counter.values())
indptr.append(len(j_indices))
j_indices = np.asarray(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
values = frombuffer_empty(values, dtype=np.intc)
X = sp.csr_matrix((values, j_indices, indptr),
shape=(len(indptr) - 1, len(vocabulary)),
dtype=self.dtype)
X.sort_indices()
return vocabulary, X
def get_feature_names(self):
"""Array mapping from feature integer indices to feature name"""
self._check_vocabulary()
return [t for t, i in sorted(six.iteritems(self.vocabulary_),
key=itemgetter(1))]
def body_words_in_headline(self,doc):
""" For each word in the headline, create a feature. The value of the
feature will be the number of times that word appears in the body"""
features = defaultdict(int)
analyze = self.build_analyzer()
headline_tokens=analyze(doc[0])
body_tokens=analyze(doc[1])
#headline_token_counts=defaultdict(int)
body_token_counts=defaultdict(int)
for token in body_tokens:
body_token_counts[token]+=1
for token in headline_tokens:
if token in body_token_counts:
features[token] +=1
return features
class CountVectorizer(BaseEstimator, VectorizerMixin):
"""Convert a collection of text documents to a matrix of token counts
This implementation produces a sparse representation of the counts using
scipy.sparse.coo_matrix.
If you do not provide an a-priori dictionary and you do not use an analyzer
that does some kind of feature selection then the number of features will
be equal to the vocabulary size found by analyzing the data.
Read more in the :ref:`User Guide <text_feature_extraction>`.
Parameters
----------
input : string {'filename', 'file', 'content'}
If 'filename', the sequence passed as an argument to fit is
expected to be a list of filenames that need reading to fetch
the raw content to analyze.
If 'file', the sequence items must have a 'read' method (file-like
object) that is called to fetch the bytes in memory.
Otherwise the input is expected to be the sequence strings or
bytes items are expected to be analyzed directly.
encoding : string, 'utf-8' by default.
If bytes or files are given to analyze, this encoding is used to
decode.
decode_error : {'strict', 'ignore', 'replace'}
Instruction on what to do if a byte sequence is given to analyze that
contains characters not of the given `encoding`. By default, it is
'strict', meaning that a UnicodeDecodeError will be raised. Other
values are 'ignore' and 'replace'.
strip_accents : {'ascii', 'unicode', None}
Remove accents during the preprocessing step.
'ascii' is a fast method that only works on characters that have
an direct ASCII mapping.
'unicode' is a slightly slower method that works on any characters.
None (default) does nothing.
analyzer : string, {'word', 'char', 'char_wb'} or callable
Whether the feature should be made of word or character n-grams.
Option 'char_wb' creates character n-grams only from text inside
word boundaries.
If a callable is passed it is used to extract the sequence of features
out of the raw, unprocessed input.
preprocessor : callable or None (default)
Override the preprocessing (string transformation) stage while
preserving the tokenizing and n-grams generation steps.
tokenizer : callable or None (default)
Override the string tokenization step while preserving the
preprocessing and n-grams generation steps.
Only applies if ``analyzer == 'word'``.
ngram_range : tuple (min_n, max_n)
The lower and upper boundary of the range of n-values for different
n-grams to be extracted. All values of n such that min_n <= n <= max_n
will be used.
stop_words : string {'english'}, list, or None (default)
If 'english', a built-in stop word list for English is used.
If a list, that list is assumed to contain stop words, all of which
will be removed from the resulting tokens.
Only applies if ``analyzer == 'word'``.
If None, no stop words will be used. max_df can be set to a value
in the range [0.7, 1.0) to automatically detect and filter stop
words based on intra corpus document frequency of terms.
lowercase : boolean, True by default
Convert all characters to lowercase before tokenizing.
token_pattern : string
Regular expression denoting what constitutes a "token", only used
if ``analyzer == 'word'``. The default regexp select tokens of 2
or more alphanumeric characters (punctuation is completely ignored
and always treated as a token separator).
max_df : float in range [0.0, 1.0] or int, default=1.0
When building the vocabulary ignore terms that have a document
frequency strictly higher than the given threshold (corpus-specific
stop words).
If float, the parameter represents a proportion of documents, integer
absolute counts.
This parameter is ignored if vocabulary is not None.
min_df : float in range [0.0, 1.0] or int, default=1
When building the vocabulary ignore terms that have a document
frequency strictly lower than the given threshold. This value is also
called cut-off in the literature.
If float, the parameter represents a proportion of documents, integer
absolute counts.
This parameter is ignored if vocabulary is not None.
max_features : int or None, default=None
If not None, build a vocabulary that only consider the top
max_features ordered by term frequency across the corpus.
This parameter is ignored if vocabulary is not None.
vocabulary : Mapping or iterable, optional
Either a Mapping (e.g., a dict) where keys are terms and values are
indices in the feature matrix, or an iterable over terms. If not
given, a vocabulary is determined from the input documents. Indices
in the mapping should not be repeated and should not have any gap
between 0 and the largest index.
binary : boolean, default=False
If True, all non zero counts are set to 1. This is useful for discrete
probabilistic models that model binary events rather than integer
counts.
dtype : type, optional
Type of the matrix returned by fit_transform() or transform().
Attributes
----------
vocabulary_ : dict
A mapping of terms to feature indices.
stop_words_ : set
Terms that were ignored because they either:
- occurred in too many documents (`max_df`)
- occurred in too few documents (`min_df`)
- were cut off by feature selection (`max_features`).
This is only available if no vocabulary was given.
See also
--------
HashingVectorizer, TfidfVectorizer
Notes
-----
The ``stop_words_`` attribute can get large and increase the model size
when pickling. This attribute is provided only for introspection and can
be safely removed using delattr or set to None before pickling.
"""
def __init__(self, input='content', encoding='utf-8',
decode_error='strict', strip_accents=None,
lowercase=True, preprocessor=None, tokenizer=None,
stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1), analyzer='word',
max_df=1.0, min_df=1, max_features=None,
vocabulary=None, binary=False, dtype=np.int64):
self.input = input
self.encoding = encoding
self.decode_error = decode_error
self.strip_accents = strip_accents
self.preprocessor = preprocessor
self.tokenizer = tokenizer
self.analyzer = analyzer
self.lowercase = lowercase
self.token_pattern = token_pattern
self.stop_words = stop_words
self.max_df = max_df
self.min_df = min_df
if max_df < 0 or min_df < 0:
raise ValueError("negative value for max_df or min_df")
self.max_features = max_features
if max_features is not None:
if (not isinstance(max_features, numbers.Integral) or
max_features <= 0):
raise ValueError(
"max_features=%r, neither a positive integer nor None"
% max_features)
self.ngram_range = ngram_range
self.vocabulary = vocabulary
self.binary = binary
self.dtype = dtype
def _sort_features(self, X, vocabulary):
"""Sort features by name
Returns a reordered matrix and modifies the vocabulary in place
"""
sorted_features = sorted(six.iteritems(vocabulary))
map_index = np.empty(len(sorted_features), dtype=np.int32)
for new_val, (term, old_val) in enumerate(sorted_features):
vocabulary[term] = new_val
map_index[old_val] = new_val
X.indices = map_index.take(X.indices, mode='clip')
return X
def _limit_features(self, X, vocabulary, high=None, low=None,
limit=None):
"""Remove too rare or too common features.
Prune features that are non zero in more samples than high or less
documents than low, modifying the vocabulary, and restricting it to
at most the limit most frequent.
This does not prune samples with zero features.
"""
if high is None and low is None and limit is None:
return X, set()
# Calculate a mask based on document frequencies
dfs = _document_frequency(X)
tfs = np.asarray(X.sum(axis=0)).ravel()
mask = np.ones(len(dfs), dtype=bool)
if high is not None:
mask &= dfs <= high
if low is not None:
mask &= dfs >= low
if limit is not None and mask.sum() > limit:
mask_inds = (-tfs[mask]).argsort()[:limit]
new_mask = np.zeros(len(dfs), dtype=bool)
new_mask[np.where(mask)[0][mask_inds]] = True
mask = new_mask
new_indices = np.cumsum(mask) - 1 # maps old indices to new
removed_terms = set()
for term, old_index in list(six.iteritems(vocabulary)):
if mask[old_index]:
vocabulary[term] = new_indices[old_index]
else:
del vocabulary[term]
removed_terms.add(term)
kept_indices = np.where(mask)[0]
if len(kept_indices) == 0:
raise ValueError("After pruning, no terms remain. Try a lower"
" min_df or a higher max_df.")
return X[:, kept_indices], removed_terms
def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
"""
if fixed_vocab:
vocabulary = self.vocabulary_
else:
# Add a new value when a new vocabulary item is seen
vocabulary = defaultdict()
vocabulary.default_factory = vocabulary.__len__
analyze = self.build_analyzer()
j_indices = []
indptr = _make_int_array()
values = _make_int_array()
indptr.append(0)
for doc in raw_documents:
feature_counter = {}
for feature in analyze(doc):
try:
feature_idx = vocabulary[feature]
if feature_idx not in feature_counter:
feature_counter[feature_idx] = 1
else:
feature_counter[feature_idx] += 1
except KeyError:
# Ignore out-of-vocabulary items for fixed_vocab=True
continue
j_indices.extend(feature_counter.keys())
values.extend(feature_counter.values())
indptr.append(len(j_indices))
if not fixed_vocab:
# disable defaultdict behaviour
vocabulary = dict(vocabulary)
if not vocabulary:
raise ValueError("empty vocabulary; perhaps the documents only"
" contain stop words")
j_indices = np.asarray(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
values = frombuffer_empty(values, dtype=np.intc)
X = sp.csr_matrix((values, j_indices, indptr),
shape=(len(indptr) - 1, len(vocabulary)),
dtype=self.dtype)
X.sort_indices()
return vocabulary, X
def fit(self, raw_documents, y=None):
"""Learn a vocabulary dictionary of all tokens in the raw documents.
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
Returns
-------
self
"""
self.fit_transform(raw_documents)
return self
def fit_transform(self, raw_documents, y=None):
"""Learn the vocabulary dictionary and return term-document matrix.
This is equivalent to fit followed by transform, but more efficiently
implemented.
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
Returns
-------
X : array, [n_samples, n_features]
Document-term matrix.
"""
# We intentionally don't call the transform method to make
# fit_transform overridable without unwanted side effects in
# TfidfVectorizer.
if isinstance(raw_documents, six.string_types):
raise ValueError(
"Iterable over raw text documents expected, "
"string object received.")
self._validate_vocabulary()
max_df = self.max_df
min_df = self.min_df
max_features = self.max_features
vocabulary, X = self._count_vocab(raw_documents,
self.fixed_vocabulary_)
if self.binary:
X.data.fill(1)
if not self.fixed_vocabulary_:
X = self._sort_features(X, vocabulary)
n_doc = X.shape[0]
max_doc_count = (max_df
if isinstance(max_df, numbers.Integral)
else max_df * n_doc)
min_doc_count = (min_df
if isinstance(min_df, numbers.Integral)
else min_df * n_doc)
if max_doc_count < min_doc_count:
raise ValueError(
"max_df corresponds to < documents than min_df")
X, self.stop_words_ = self._limit_features(X, vocabulary,
max_doc_count,
min_doc_count,
max_features)
self.vocabulary_ = vocabulary
return X
def transform(self, raw_documents):
"""Transform documents to document-term matrix.
Extract token counts out of raw text documents using the vocabulary
fitted with fit or the one provided to the constructor.
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
Returns
-------
X : sparse matrix, [n_samples, n_features]
Document-term matrix.
"""
if isinstance(raw_documents, six.string_types):
raise ValueError(
"Iterable over raw text documents expected, "
"string object received.")
if not hasattr(self, 'vocabulary_'):
self._validate_vocabulary()
self._check_vocabulary()
# use the same matrix-building strategy as fit_transform
_, X = self._count_vocab(raw_documents, fixed_vocab=True)
if self.binary:
X.data.fill(1)
return X
def inverse_transform(self, X):
"""Return terms per document with nonzero entries in X.
Parameters
----------
X : {array, sparse matrix}, shape = [n_samples, n_features]
Returns
-------
X_inv : list of arrays, len = n_samples
List of arrays of terms.
"""
self._check_vocabulary()
if sp.issparse(X):
# We need CSR format for fast row manipulations.
X = X.tocsr()
else:
# We need to convert X to a matrix, so that the indexing
# returns 2D objects
X = np.asmatrix(X)
n_samples = X.shape[0]
terms = np.array(list(self.vocabulary_.keys()))
indices = np.array(list(self.vocabulary_.values()))
inverse_vocabulary = terms[np.argsort(indices)]
return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel()
for i in range(n_samples)]
def get_feature_names(self):
"""Array mapping from feature integer indices to feature name"""
self._check_vocabulary()
return [t for t, i in sorted(six.iteritems(self.vocabulary_),
key=itemgetter(1))]
def _make_int_array():
"""Construct an array.array of a type suitable for scipy.sparse indices."""
return array.array(str("i"))
def contains_keywords(doc):
features={}
keywords = set(['good', 'bad', 'ugly'])
for keyword in keywords:
if keyword in doc:
features[keyword] = 1
else:
features[keyword] = 0
return features