def validate_vocabulary(self):
    vocabulary = self.vocabulary
    if vocabulary is not None:
        if isinstance(vocabulary, set):
            vocabulary = sorted(vocabulary)
        if not isinstance(vocabulary, Mapping):
            vocab = {}
            for i, t in enumerate(vocabulary):
                if vocab.setdefault(t, i) != i:
                    msg = "Duplicate term in vocabulary: %r" % t
                    raise ValueError(msg)
            vocabulary = vocab
        else:
            indices = set(six.itervalues(vocabulary))
            if len(indices) != len(vocabulary):
                raise ValueError("Vocabulary contains repeated indices.")
            for i in xrange(len(vocabulary)):
                if i not in indices:
                    msg = ("Vocabulary of size %d doesn't contain index "
                           "%d." % (len(vocabulary), i))
                    raise ValueError(msg)
        if not vocabulary:
            raise ValueError("empty vocabulary passed to fit")
        self.fixed_vocabulary_ = True
        self.vocabulary_ = dict(vocabulary)
    else:
        self.fixed_vocabulary_ = False
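# A minimal, self-contained sketch of the normalization the check above
# applies to a user-supplied iterable vocabulary (the helper name is
# illustrative, not part of the class): terms are enumerated into a
# term -> index mapping, and a repeated term is rejected because
# setdefault() returns the index already stored for it.
def _build_vocab(terms):
    vocab = {}
    for i, t in enumerate(terms):
        if vocab.setdefault(t, i) != i:
            raise ValueError("Duplicate term in vocabulary: %r" % t)
    return vocab

print(_build_vocab(["foo", "bar", "baz"]))  # {'foo': 0, 'bar': 1, 'baz': 2}
# _build_vocab(["foo", "bar", "foo"])       # would raise ValueError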
def _term_counts_to_matrix(self, n_doc, i_indices, j_indices, values):
    """Construct COO matrix from indices and values.

    i_indices and j_indices should be constructed with _make_int_array.
    """
    # array("i") corresponds to np.intc, which is also what scipy.sparse
    # wants for indices, so they won't be copied by the coo_matrix ctor.
    # The length check works around a bug in old NumPy versions:
    # http://projects.scipy.org/numpy/ticket/1943
    if len(i_indices) > 0:
        i_indices = np.frombuffer(i_indices, dtype=np.intc)
    if len(j_indices) > 0:
        j_indices = np.frombuffer(j_indices, dtype=np.intc)
    if self.dtype == np.intc and len(values) > 0:
        values = np.frombuffer(values, dtype=np.intc)
    else:
        # In Python 3.2, SciPy 0.10.1, the coo_matrix ctor won't accept an
        # array.array.
        values = np.asarray(values, dtype=self.dtype)

    shape = (n_doc, max(six.itervalues(self.vocabulary_)) + 1)
    spmatrix = sp.coo_matrix((values, (i_indices, j_indices)),
                             shape=shape, dtype=self.dtype)
    if self.binary:
        spmatrix.data.fill(1)
    return spmatrix
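# Hedged illustration of the zero-copy conversion relied on above: an
# array("i") (which is presumably what _make_int_array returns) has the same
# memory layout as np.intc, so np.frombuffer wraps the existing buffer
# instead of copying it. The variable names below are made up for the demo.
from array import array

import numpy as np

counts = array("i", [1, 0, 2, 3])
view = np.frombuffer(counts, dtype=np.intc)  # shares the buffer, no copy
print(view)        # [1 0 2 3]
print(view.dtype)  # int32 (np.intc) on most platforms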
def topological_sort(deps):
    '''
    Topologically sort a DAG, represented by a dict of child => set of
    parents. The dependency dict is destroyed during operation.

    Uses the Kahn algorithm: http://en.wikipedia.org/wiki/Topological_sorting
    Not a particularly good implementation, but we're just running it on
    tiny graphs.
    '''
    order = []
    available = set()

    def _move_available():
        to_delete = []
        for n, parents in iteritems(deps):
            if not parents:
                available.add(n)
                to_delete.append(n)
        for n in to_delete:
            del deps[n]

    _move_available()
    while available:
        n = available.pop()
        order.append(n)
        for parents in itervalues(deps):
            parents.discard(n)
        _move_available()

    # any nodes left still have unsatisfied parents, so the graph has a cycle
    if deps:
        raise ValueError("dependency cycle found")
    return order
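# Quick sanity check of the sort above on a toy graph (child -> set of
# parents), assuming topological_sort is importable from this module. "a" has
# no parents, "b" and "c" depend on "a", and "d" depends on both, so "a" must
# come first and "d" last; note the input dict is consumed by the call.
toy_deps = {"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}}
order = topological_sort(toy_deps)
print(order)  # e.g. ['a', 'c', 'b', 'd'] -- the 'b'/'c' order is unspecified
assert order[0] == "a" and order[-1] == "d"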
def __init__(self, input='content', encoding='utf-8',
             decode_error='strict', strip_accents=None,
             lowercase=True, preprocessor=None, tokenizer=None,
             stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
             ngram_range=(1, 1), analyzer='word',
             max_df=1.0, min_df=1, max_features=None,
             vocabulary=None, binary=False, dtype=np.int64):
    self.input = input
    self.encoding = encoding
    self.decode_error = decode_error
    self.strip_accents = strip_accents
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer
    self.analyzer = analyzer
    self.lowercase = lowercase
    self.token_pattern = token_pattern
    self.stop_words = stop_words
    self.max_df = max_df
    self.min_df = min_df
    if max_df < 0 or min_df < 0:
        raise ValueError("negative value for max_df or min_df")
    self.max_features = max_features
    if max_features is not None:
        if (not isinstance(max_features, numbers.Integral) or
                max_features <= 0):
            raise ValueError(
                "max_features=%r, neither a positive integer nor None"
                % max_features)
    self.ngram_range = ngram_range
    if vocabulary is not None:
        if not isinstance(vocabulary, Mapping):
            vocab = {}
            for i, t in enumerate(vocabulary):
                if vocab.setdefault(t, i) != i:
                    msg = "Duplicate term in vocabulary: %r" % t
                    raise ValueError(msg)
            vocabulary = vocab
        else:
            indices = set(six.itervalues(vocabulary))
            if len(indices) != len(vocabulary):
                raise ValueError("Vocabulary contains repeated indices.")
            for i in xrange(len(vocabulary)):
                if i not in indices:
                    msg = ("Vocabulary of size %d doesn't contain index "
                           "%d." % (len(vocabulary), i))
                    raise ValueError(msg)
        if not vocabulary:
            raise ValueError("empty vocabulary passed to fit")
        self.fixed_vocabulary = True
        self.vocabulary_ = dict(vocabulary)
    else:
        self.fixed_vocabulary = False
    self.binary = binary
    self.dtype = dtype
def _term_count_dicts_to_matrix(self, term_count_dicts):
    i_indices = []
    j_indices = []
    values = []
    vocabulary = self.vocabulary_

    for i, term_count_dict in enumerate(term_count_dicts):
        for term, count in six.iteritems(term_count_dict):
            j = vocabulary.get(term)
            if j is not None:
                i_indices.append(i)
                j_indices.append(j)
                values.append(count)
        # free memory as we go
        term_count_dict.clear()

    shape = (i + 1, max(six.itervalues(vocabulary)) + 1)
    spmatrix = sp.coo_matrix((values, (i_indices, j_indices)),
                             shape=shape, dtype=self.dtype)
    if self.binary:
        spmatrix.data.fill(1)
    return spmatrix
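# Minimal sketch of the COO construction used above, outside the class: each
# (row, col, value) triplet is one non-zero count, rows are document indices
# and columns are vocabulary indices. The toy data below is made up.
import numpy as np
import scipy.sparse as sp

i_indices = [0, 0, 1]   # document ids
j_indices = [0, 2, 1]   # feature (term) ids
values = [3, 1, 2]      # term counts
X = sp.coo_matrix((values, (i_indices, j_indices)), shape=(2, 3),
                  dtype=np.int64)
print(X.toarray())
# [[3 0 1]
#  [0 2 0]]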
def __init__(self, input='content', encoding='utf-8', charset=None,
             decode_error='strict', charset_error=None,
             strip_accents=None, lowercase=True,
             preprocessor=None, tokenizer=None, stop_words=None,
             token_pattern=r"(?u)\b\w\w+\b",
             ngram_range=(1, 1), analyzer='word',
             max_df=1.0, min_df=1, max_features=None,
             vocabulary=None, binary=False, dtype=np.int64):
    self.input = input
    self.encoding = encoding
    self.decode_error = decode_error
    if charset is not None:
        warnings.warn("The charset parameter is deprecated as of version "
                      "0.14 and will be removed in 0.16. Use encoding "
                      "instead.", DeprecationWarning)
        self.encoding = charset
    if charset_error is not None:
        warnings.warn("The charset_error parameter is deprecated as of "
                      "version 0.14 and will be removed in 0.16. Use "
                      "decode_error instead.", DeprecationWarning)
        self.decode_error = charset_error
    self.strip_accents = strip_accents
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer
    self.analyzer = analyzer
    self.lowercase = lowercase
    self.token_pattern = token_pattern
    self.stop_words = stop_words
    self.max_df = max_df
    self.min_df = min_df
    if max_df < 0 or min_df < 0:
        raise ValueError("negative value for max_df or min_df")
    self.max_features = max_features
    if max_features is not None:
        if (not isinstance(max_features, numbers.Integral) or
                max_features <= 0):
            raise ValueError(
                "max_features=%r, neither a positive integer nor None"
                % max_features)
    self.ngram_range = ngram_range
    if vocabulary is not None:
        if not isinstance(vocabulary, Mapping):
            vocabulary = dict((t, i) for i, t in enumerate(vocabulary))
        if not vocabulary:
            raise ValueError("empty vocabulary passed to fit")
        indices = set(six.itervalues(vocabulary))
        if len(indices) != len(vocabulary):
            raise ValueError("Vocabulary contains repeated indices.")
        for i in xrange(len(vocabulary)):
            if i not in indices:
                msg = "Vocabulary of size %d doesn't contain index %d."
                raise ValueError(msg % (len(vocabulary), i))
        self.fixed_vocabulary = True
        self.vocabulary_ = dict(vocabulary)
    else:
        self.fixed_vocabulary = False
    self.binary = binary
    self.dtype = dtype
def __init__(self, input='content', encoding='utf-8', charset=None,
             decode_error='strict', charset_error=None,
             strip_accents=None, lowercase=True,
             preprocessor=None, tokenizer=None, stop_words=None,
             token_pattern=r"(?u)\b\w\w+\b",
             ngram_range=(1, 1), analyzer='word',
             max_df=1.0, min_df=1, max_features=None,
             vocabulary=None, binary=False, dtype=np.int64):
    self.input = input
    self.encoding = encoding
    self.decode_error = decode_error
    if charset is not None:
        warnings.warn(
            "The charset parameter is deprecated as of version "
            "0.14 and will be removed in 0.16. Use encoding "
            "instead.", DeprecationWarning)
        self.encoding = charset
    if charset_error is not None:
        warnings.warn(
            "The charset_error parameter is deprecated as of "
            "version 0.14 and will be removed in 0.16. Use "
            "decode_error instead.", DeprecationWarning)
        self.decode_error = charset_error
    self.strip_accents = strip_accents
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer
    self.analyzer = analyzer
    self.lowercase = lowercase
    self.token_pattern = token_pattern
    self.stop_words = stop_words
    self.max_df = max_df
    self.min_df = min_df
    if max_df < 0 or min_df < 0:
        raise ValueError("negative value for max_df or min_df")
    self.max_features = max_features
    if max_features is not None:
        if (not isinstance(max_features, numbers.Integral) or
                max_features <= 0):
            raise ValueError(
                "max_features=%r, neither a positive integer nor None"
                % max_features)
    self.ngram_range = ngram_range
    if vocabulary is not None:
        if not isinstance(vocabulary, Mapping):
            vocab = {}
            for i, t in enumerate(vocabulary):
                if vocab.setdefault(t, i) != i:
                    msg = "Duplicate term in vocabulary: %r" % t
                    raise ValueError(msg)
            vocabulary = vocab
        else:
            indices = set(six.itervalues(vocabulary))
            if len(indices) != len(vocabulary):
                raise ValueError("Vocabulary contains repeated indices.")
            for i in xrange(len(vocabulary)):
                if i not in indices:
                    msg = ("Vocabulary of size %d doesn't contain index "
                           "%d." % (len(vocabulary), i))
                    raise ValueError(msg)
        if not vocabulary:
            raise ValueError("empty vocabulary passed to fit")
        self.fixed_vocabulary = True
        self.vocabulary_ = dict(vocabulary)
    else:
        self.fixed_vocabulary = False
    self.binary = binary
    self.dtype = dtype
def fit_transform(self, raw_documents, y=None):
    """Learn the vocabulary dictionary and return the count vectors.

    This is more efficient than calling fit followed by transform.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    vectors : array, [n_samples, n_features]
    """
    # We intentionally don't call the transform method to make
    # fit_transform overridable without unwanted side effects in
    # TfidfVectorizer.
    fixed_vocab = self.fixed_vocabulary
    if fixed_vocab:
        vocab = self.vocabulary_
        vocab_max_ind = max(six.itervalues(self.vocabulary_)) + 1
    else:
        vocab = {}
        vocab_max_ind = 0

    # Result of document conversion to term count arrays.
    row_ind = _make_int_array()
    col_ind = _make_int_array()
    feature_values = _make_int_array()
    term_counts = Counter()

    # term counts across entire corpus (count each term maximum once per
    # document)
    document_counts = Counter()

    analyze = self.build_analyzer()

    for n_doc, doc in enumerate(raw_documents):
        term_count_current = Counter(analyze(doc))
        term_counts.update(term_count_current)

        if not fixed_vocab:
            for term in six.iterkeys(term_count_current):
                if term not in vocab:
                    vocab[term] = vocab_max_ind
                    vocab_max_ind += 1

        document_counts.update(six.iterkeys(term_count_current))

        for term, count in six.iteritems(term_count_current):
            if term in vocab:
                row_ind.append(n_doc)
                col_ind.append(vocab[term])
                feature_values.append(count)
    n_doc += 1

    if fixed_vocab:
        # XXX max_df, min_df and max_features have no effect
        # with a fixed vocabulary.
        i_indices = row_ind
        j_indices = col_ind
        values = feature_values
    else:
        max_features = self.max_features
        max_df = self.max_df
        min_df = self.min_df

        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)

        # filter out stop words: terms that occur in almost all documents
        if max_doc_count < n_doc or min_doc_count > 1:
            stop_words = set(t for t, dc in six.iteritems(document_counts)
                             if not min_doc_count <= dc <= max_doc_count)
        else:
            stop_words = set()

        # list the terms that should be part of the vocabulary
        if max_features is None:
            terms = set(term_counts) - stop_words
        else:
            # extract the most frequent terms for the vocabulary
            terms = set()
            for t, tc in term_counts.most_common():
                if t not in stop_words:
                    terms.add(t)
                if len(terms) >= max_features:
                    break

        # store the learned stop words to make it easier to debug the value
        # of max_df
        self.stop_words_ = stop_words

        # free memory
        term_counts.clear()
        document_counts.clear()

        # store map from term name to feature integer index: we sort the
        # terms to have reproducible outcome for the vocabulary structure:
        # otherwise the mapping from feature name to indices might depend
        # on the memory layout of the machine. Furthermore sorted terms
        # might make it possible to perform binary search in the feature
        # names array.
        terms = sorted(terms)

        # reorder term indices
        reorder_indices = dict((vocab[term], i)
                               for i, term in enumerate(terms))
        self.vocabulary_ = dict(((t, i) for i, t in enumerate(terms)))

        # create term count arrays with new vocabulary structure
        i_indices = _make_int_array()
        j_indices = _make_int_array()
        values = _make_int_array()

        for i, col in enumerate(col_ind):
            if col in reorder_indices:
                i_indices.append(row_ind[i])
                j_indices.append(reorder_indices[col_ind[i]])
                values.append(feature_values[i])

        # free memory
        del reorder_indices
        del row_ind
        del col_ind
        del feature_values

    if not vocab:
        msg = "Empty vocabulary; "
        if fixed_vocab:
            msg += "%r passed to constructor." % vocab
        else:
            msg += "perhaps your documents contain stop words only?"
        raise ValueError(msg)

    # the term_counts and document_counts might be useful statistics; are
    # we really sure we want to drop them? They take some memory but
    # can be useful for corpus introspection
    return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
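# Hedged usage sketch: assuming this fit_transform belongs to a
# CountVectorizer-style class, as the surrounding code suggests, learning the
# vocabulary and building the document-term matrix in one pass looks like the
# public scikit-learn API below (shown with the released CountVectorizer).
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the cat sat on the mat"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)             # sparse matrix, one row per doc
print(sorted(vectorizer.vocabulary_.keys()))   # ['cat', 'mat', 'on', 'sat', 'the']
print(X.shape)                                 # (2, 5)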
def _parse_specs(specs, Ks):
    '''
    Set up the different functions we need to call.

    Returns:
        - a dict mapping base estimator functions to _FuncInfo objects.
          If the function needs_alpha, then the alphas attribute is an array
          of alpha values and pos is a corresponding array of indices.
          Otherwise, alphas is None and pos is a list containing a single
          index. Indices are >= 0 if they correspond to something in a spec,
          and negative if they're just used for a meta estimator but not
          directly requested.
        - an OrderedDict mapping functions to _MetaFuncInfo objects.
          alphas and pos are like for _FuncInfo; deps is a list of indices
          which should be passed to the estimator. Note that these might be
          other meta functions; this list is guaranteed to be in an order
          such that all dependencies are resolved before calling that
          function. If no such order is possible, raise ValueError.
        - the number of meta-only results

    # TODO: update doctests for _parse_specs

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3])},
     OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 3)

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9', 'l2'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3]),
      <function linear at 0x10954f758>: _FuncInfo(alphas=None, pos=[-4])
     }, OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function l2 at 0x10954fde8>,
            _MetaFuncInfo(alphas=None, pos=[3], deps=[-4])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 4)

    >>> _parse_specs(['renyi:.8', 'hellinger', 'renyi:.9', 'l2', 'linear'])
    ({<function alpha_div at 0x10954f848>:
        _FuncInfo(alphas=[0.8, 0.5, 0.9], pos=[-1, -2, -3]),
      <function linear at 0x10954f758>: _FuncInfo(alphas=None, pos=[4])
     }, OrderedDict([
        (<function hellinger at 0x10954fc80>,
            _MetaFuncInfo(alphas=None, pos=[1], deps=[array(-2)])),
        (<function l2 at 0x10954fde8>,
            _MetaFuncInfo(alphas=None, pos=[3], deps=[4])),
        (<function renyi at 0x10954fcf8>,
            _MetaFuncInfo(alphas=[0.8, 0.9], pos=[0, 2], deps=[-1, -3]))
     ]), 3)
    '''
    funcs = {}
    metas = {}
    meta_deps = defaultdict(set)

    def add_func(func, alpha=None, pos=None):
        needs_alpha = getattr(func, 'needs_alpha', False)
        is_meta = hasattr(func, 'needs_results')

        d = metas if is_meta else funcs
        if func not in d:
            if needs_alpha:
                args = {'alphas': [alpha], 'pos': [pos]}
            else:
                args = {'alphas': None, 'pos': [pos]}

            if not is_meta:
                d[func] = _FuncInfo(**args)
            else:
                d[func] = _MetaFuncInfo(deps=[], **args)
                for req in func.needs_results:
                    if callable(req.alpha):
                        req_alpha = req.alpha(alpha)
                    else:
                        req_alpha = req.alpha
                    add_func(req.func, alpha=req_alpha)
                    meta_deps[func].add(req.func)
                    meta_deps[req.func]  # make sure required func is in there
        else:
            # already have an entry for the func
            # need to give it this pos, if it's not None
            # and also make sure that the alpha is present
            info = d[func]
            if not needs_alpha:
                if pos is not None:
                    if info.pos != [None]:
                        msg = "{} passed more than once"
                        raise ValueError(msg.format(func_name))
                    info.pos[0] = pos
            else:  # needs alpha
                try:
                    idx = info.alphas.index(alpha)
                except ValueError:
                    # this is a new alpha value we haven't seen yet
                    info.alphas.append(alpha)
                    info.pos.append(pos)
                    if is_meta:
                        for req in func.needs_results:
                            if callable(req.alpha):
                                req_alpha = req.alpha(alpha)
                            else:
                                req_alpha = req.alpha
                            add_func(req.func, alpha=req_alpha)
                else:
                    # repeated alpha value
                    if pos is not None:
                        if info.pos[idx] is not None:
                            msg = "{} with alpha {} passed more than once"
                            raise ValueError(msg.format(func_name, alpha))
                        info.pos[idx] = pos

    # add functions for each spec
    for i, spec in enumerate(specs):
        func_name, alpha = (spec.split(':', 1) + [None])[:2]
        if alpha is not None:
            alpha = float(alpha)

        try:
            func = func_mapping[func_name]
        except KeyError:
            msg = "'{}' is not a known function type"
            raise ValueError(msg.format(func_name))

        needs_alpha = getattr(func, 'needs_alpha', False)
        if needs_alpha and alpha is None:
            msg = "{} needs alpha but not passed in spec '{}'"
            raise ValueError(msg.format(func_name, spec))
        elif not needs_alpha and alpha is not None:
            msg = "{} doesn't need alpha but is passed in spec '{}'"
            raise ValueError(msg.format(func_name, spec))

        add_func(func, alpha, i)

    # number things that are dependencies only
    meta_counter = itertools.count(-1, step=-1)
    for info in itertools.chain(itervalues(funcs), itervalues(metas)):
        for i, pos in enumerate(info.pos):
            if pos is None:
                info.pos[i] = next(meta_counter)

    # fill in the dependencies for metas
    for func, info in iteritems(metas):
        deps = info.deps
        assert deps == []

        for req in func.needs_results:
            f = req.func
            req_info = (metas if hasattr(f, 'needs_results') else funcs)[f]
            if req.alpha is not None:
                if callable(req.alpha):
                    req_alpha = req.alpha(info.alphas)
                else:
                    req_alpha = req.alpha

                find_alpha = np.vectorize(req_info.alphas.index, otypes=[int])
                pos = np.asarray(req_info.pos)[find_alpha(req_alpha)]
                if np.isscalar(pos):
                    deps.append(pos[()])
                else:
                    deps.extend(pos)
            else:
                pos, = req_info.pos
                deps.append(pos)

    # topological sort of metas
    meta_order = topological_sort(meta_deps)
    metas_ordered = OrderedDict(
        (f, metas[f]) for f in meta_order if hasattr(f, 'needs_results'))

    return funcs, metas_ordered, -next(meta_counter) - 1
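# Small, self-contained sketch of the spec-string parsing done at the top of
# the loop above: each spec is "name" or "name:alpha", with alpha converted
# to float when present. Only the parsing is reproduced here; func_mapping
# and the _FuncInfo bookkeeping are not, and parse_spec is an illustrative
# name, not part of the module.
def parse_spec(spec):
    func_name, alpha = (spec.split(':', 1) + [None])[:2]
    if alpha is not None:
        alpha = float(alpha)
    return func_name, alpha

print(parse_spec('renyi:.8'))   # ('renyi', 0.8)
print(parse_spec('hellinger'))  # ('hellinger', None)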