Пример #1
0
def _set_up_funcs(funcs, metas_ordered, Ks, dim, X_ns=None, Y_ns=None):
    """Bind the shared arguments into every function of both mappings.

    Each key of ``funcs`` and ``metas_ordered`` is replaced by a callable
    that already has ``Ks`` and ``dim`` bound (preceded by ``info.alphas``
    when the function is flagged ``needs_alpha``). Functions exposing a
    ``chooser_fn`` attribute are instead replaced by whatever that chooser
    returns when called with the same arguments plus ``X_ns`` and ``Y_ns``.

    Returns
    -------
    rep_funcs : dict
        Replacement callable -> original info, built from ``funcs``.
    rep_metas_ordered : OrderedDict
        Replacement callable -> original info, built from ``metas_ordered``
        in its iteration order.
    """
    def replace_func(func, info):
        bound_args = (Ks, dim)
        if getattr(func, 'needs_alpha', False):
            bound_args = (info.alphas,) + bound_args

        if not hasattr(func, 'chooser_fn'):
            replacement = partial(func, *bound_args)
        else:
            bound_args = bound_args + (X_ns, Y_ns)
            chooser_gives_ks = (
                getattr(func, 'needs_all_ks', False)
                and getattr(func.chooser_fn, 'returns_ks', False))
            if chooser_gives_ks:
                # the chooser also reports which K it needs
                replacement, K = func.chooser_fn(*bound_args)
                replacement.K_needed = K
            else:
                replacement = func.chooser_fn(*bound_args)

        # carry every non-dunder, non-``func_`` attribute over to the
        # replacement so flags set on the original remain visible
        for attr_name in dir(func):
            if attr_name.startswith(('__', 'func_')):
                continue
            setattr(replacement, attr_name, getattr(func, attr_name))
        return replacement

    rep_funcs = {replace_func(f, info): info for f, info in iteritems(funcs)}

    rep_metas_ordered = OrderedDict()
    for f, info in iteritems(metas_ordered):
        rep_metas_ordered[replace_func(f, info)] = info

    return rep_funcs, rep_metas_ordered
Пример #2
0
def _clone_h2o_obj(estimator, ignore=False, **kwargs):
    """Clone ``estimator``, carrying over the H2O parms of a final step.

    The estimator is cloned with ``clone`` and every keyword argument is
    set as an attribute on the clone. If the original is an ``H2OPipeline``
    whose last step is an ``H2OEstimator``, each entry of that step's
    ``_parms`` is passed through ``_kv_str`` and stored in the ``_parms``
    of the clone's last step.

    ``ignore`` is accepted but not used by this function.
    """
    est = clone(estimator)

    # attribute overrides requested by the caller
    for attr, val in six.iteritems(kwargs):
        setattr(est, attr, val)

    if not isinstance(estimator, H2OPipeline):
        return est

    original_last = estimator.steps[-1][1]
    if not isinstance(original_last, H2OEstimator):
        return est

    cloned_last = est.steps[-1][1]
    for raw_key, raw_val in six.iteritems(original_last._parms):
        key, val = _kv_str(raw_key, raw_val)
        cloned_last._parms[key] = val

    return est
Пример #3
0
def _clone_h2o_obj(estimator, ignore=False, **kwargs):
    """Clone ``estimator``, carrying over the H2O parms of a final step.

    The estimator is cloned with ``clone`` and every keyword argument is
    set as an attribute on the clone. If the original is an ``H2OPipeline``
    whose last step is an ``H2OEstimator``, each entry of that step's
    ``_parms`` is passed through ``_kv_str`` and stored in the ``_parms``
    of the clone's last step.

    ``ignore`` is accepted but not used by this function.
    """
    est = clone(estimator)

    # attribute overrides requested by the caller
    for attr, val in six.iteritems(kwargs):
        setattr(est, attr, val)

    if not isinstance(estimator, H2OPipeline):
        return est

    original_last = estimator.steps[-1][1]
    if not isinstance(original_last, H2OEstimator):
        return est

    cloned_last = est.steps[-1][1]
    for raw_key, raw_val in six.iteritems(original_last._parms):
        key, val = _kv_str(raw_key, raw_val)
        cloned_last._parms[key] = val

    return est
Пример #4
0
def _set_up_funcs(funcs, metas_ordered, Ks, dim, X_ns=None, Y_ns=None):
    """Bind the shared arguments into every function of both mappings.

    Each key of ``funcs`` and ``metas_ordered`` is replaced by a callable
    that already has ``Ks`` and ``dim`` bound (preceded by ``info.alphas``
    when the function is flagged ``needs_alpha``). Functions exposing a
    ``chooser_fn`` attribute are instead replaced by whatever that chooser
    returns when called with the same arguments plus ``X_ns`` and ``Y_ns``.

    Returns
    -------
    rep_funcs : dict
        Replacement callable -> original info, built from ``funcs``.
    rep_metas_ordered : OrderedDict
        Replacement callable -> original info, built from ``metas_ordered``
        in its iteration order.
    """
    def replace_func(func, info):
        bound_args = (Ks, dim)
        if getattr(func, 'needs_alpha', False):
            bound_args = (info.alphas,) + bound_args

        if not hasattr(func, 'chooser_fn'):
            replacement = partial(func, *bound_args)
        else:
            bound_args = bound_args + (X_ns, Y_ns)
            chooser_gives_ks = (
                getattr(func, 'needs_all_ks', False)
                and getattr(func.chooser_fn, 'returns_ks', False))
            if chooser_gives_ks:
                # the chooser also reports which K it needs
                replacement, K = func.chooser_fn(*bound_args)
                replacement.K_needed = K
            else:
                replacement = func.chooser_fn(*bound_args)

        # carry every non-dunder, non-``func_`` attribute over to the
        # replacement so flags set on the original remain visible
        for attr_name in dir(func):
            if attr_name.startswith(('__', 'func_')):
                continue
            setattr(replacement, attr_name, getattr(func, attr_name))
        return replacement

    rep_funcs = {replace_func(f, info): info for f, info in iteritems(funcs)}

    rep_metas_ordered = OrderedDict()
    for f, info in iteritems(metas_ordered):
        rep_metas_ordered[replace_func(f, info)] = info

    return rep_funcs, rep_metas_ordered
 def get_params(self, deep=True):
     """Return this estimator's parameters.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_classifiers``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(MajorityVoteClassifier, self).get_params(deep=False)

     params = self.named_classifiers.copy()
     for clf_name, clf in six.iteritems(self.named_classifiers):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
Пример #6
0
 def get_params(self, deep=True):
     """Return this estimator's parameters.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_clfs``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(EnsembleClassifier, self).get_params(deep=False)

     params = self.named_clfs.copy()
     for clf_name, clf in six.iteritems(self.named_clfs):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
 def get_params(self, deep=True):
     """Return this estimator's parameters.

     Parameters
     ----------
     deep : bool, optional (default=True)
         If false, return the parent class's shallow parameters. If true,
         return a copy of ``self.named_classifiers`` extended with one
         ``'<name>__<param>'`` entry for every parameter each member
         reports through its own ``get_params(deep=True)``.

     Returns
     -------
     out : dict
     """
     if not deep:
         return super(MajorityVoteClassifier, self).get_params(deep=False)

     out = self.named_classifiers.copy()
     for name, step in self.named_classifiers.items():
         # Ask the member classifier for its parameters; asking ``self``
         # here would re-enter this method and recurse without end.
         for key, value in step.get_params(deep=True).items():
             out['%s__%s' % (name, key)] = value
     return out
Пример #8
0
 def get_params(self, deep=True):
     """Return this pipeline's parameters.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_steps``
     extended with one ``'<step>__<param>'`` entry for every parameter each
     step reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(Pipeline, self).get_params(deep=False)

     params = self.named_steps.copy()
     for step_name, estimator in six.iteritems(self.named_steps):
         nested = estimator.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (step_name, param_name)] = param_value
     return params
 def get_params(self, deep=True):
     """Get classifier parameter names for GridSearch.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_classifiers``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(MajorityVoteClassifier, self).get_params(deep=False)

     params = self.named_classifiers.copy()
     for clf_name, clf in six.iteritems(self.named_classifiers):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['{0}__{1}'.format(clf_name, param_name)] = param_value
     return params
Пример #10
0
 def get_params(self, deep=True):
     """Return estimator parameter names for GridSearch support.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_clfs``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(EnsembleClassifier, self).get_params(deep=False)

     params = self.named_clfs.copy()
     for clf_name, clf in six.iteritems(self.named_clfs):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
 def get_params(self, deep=True):
     """Get classifier parameter names for GridSearch.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_classifiers``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(FWLS_Classifier, self).get_params(deep=False)

     params = self.named_classifiers.copy()
     for clf_name, clf in six.iteritems(self.named_classifiers):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
Пример #12
0
 def get_params(self, deep=True):
     """Return estimator parameter names for GridSearch support.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_clfs``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(EnsembleVoteClassifier, self).get_params(deep=False)

     params = self.named_clfs.copy()
     for clf_name, clf in six.iteritems(self.named_clfs):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
Пример #13
0
 def get_params(self, deep=True):
     """Get classifier parameter names for GridSearch.

     Parameters
     ----------
     deep : bool, optional (default=True)
         If false, return the parent class's shallow parameters. If true,
         return a copy of ``self.named_classifirs`` extended with one
         ``'<name>__<param>'`` entry for every parameter each member
         reports through ``get_params(deep=True)``.

     Returns
     -------
     out : dict
     """
     if not deep:
         return super(MajorityVoteClassifier, self).get_params(deep=False)

     # NOTE(review): the attribute is spelled ``named_classifirs`` here;
     # confirm it matches the name assigned in the constructor.
     out = self.named_classifirs.copy()
     for name, step in self.named_classifirs.items():
         # ``get_params`` (not ``get_parms``) is the estimator method; the
         # double underscore is the separator nested-parameter routing uses.
         for key, value in step.get_params(deep=True).items():
             out['%s__%s' % (name, key)] = value
     return out
 def get_params(self, deep=True):
     """Get classifier parameter names for GridSearch.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_classifiers``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(SLS_Classifier, self).get_params(deep=False)

     params = self.named_classifiers.copy()
     for clf_name, clf in six.iteritems(self.named_classifiers):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
Пример #15
0
 def get_params(self, deep=True):
     """Return estimator parameter names for GridSearch support.

     The parent class's shallow parameters are always the starting point.
     With ``deep`` true they are extended with a copy of
     ``self.named_estimators`` and with one ``'<name>__<param>'`` entry
     for every parameter each named estimator reports through
     ``get_params(deep=True)``.
     """
     params = super(HybridFeatureVotingClassifier, self).get_params(deep=False)
     if not deep:
         return params

     params.update(self.named_estimators.copy())
     for est_name, est in six.iteritems(self.named_estimators):
         nested = est.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (est_name, param_name)] = param_value
     return params
Пример #16
0
    def get_params(self, deep=True):
        """Get classifier parameter names for GridSearch.

        Parameters
        ----------
        deep : bool, optional (default=True)
            If false, return the parent class's shallow parameters. If
            true, return a copy of ``self.named_classifiers`` extended with
            one ``'<name>__<param>'`` entry for every parameter each member
            reports through ``get_params(deep=True)``.

        Returns
        -------
        out : dict
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)

        out = self.named_classifiers.copy()
        for name, step in self.named_classifiers.items():
            for key, value in step.get_params(deep=True).items():
                # Nested parameters are addressed as '<name>__<param>';
                # a space-separated key cannot be routed back to a member.
                out['%s__%s' % (name, key)] = value
        return out
Пример #17
0
    def get_params(self, deep=True):
        """Return this pipeline's parameters.

        With ``deep`` false, the parent class's shallow parameters are
        returned. Otherwise the result starts as a copy of
        ``self.named_steps``, gains one ``'<step>__<param>'`` entry for
        every parameter each step reports through ``get_params(deep=True)``,
        and is finally updated with the parent class's shallow parameters.
        """
        if not deep:
            return super(SparkPipeline, self).get_params(deep=False)

        params = self.named_steps.copy()
        for step_name, estimator in six.iteritems(self.named_steps):
            nested = estimator.get_params(deep=True)
            for param_name, param_value in six.iteritems(nested):
                params['%s__%s' % (step_name, param_name)] = param_value

        # applied last, so shallow parameters win on any key collision
        params.update(super(SparkPipeline, self).get_params(deep=False))
        return params
Пример #18
0
	def get_params(self, deep=True):
		"""Return estimator parameter names for GridSearch support.

		The parent class's shallow parameters are always the starting point.
		With ``deep`` true they are extended with a copy of
		``self.named_estimators`` and with one ``'<name>__<param>'`` entry
		for every parameter each named estimator reports through
		``get_params(deep=True)``.
		"""
		params = super(MultiLabelVotingClassifier, self).get_params(deep=False)
		if not deep:
			return params

		params.update(self.named_estimators.copy())
		for est_name, est in six.iteritems(self.named_estimators):
			nested = est.get_params(deep=True)
			for param_name, param_value in six.iteritems(nested):
				params['%s__%s' % (est_name, param_name)] = param_value
		return params
Пример #19
0
    def get_params(self, deep=True):
        """Get classifier parameter names for GridSearch.

        With ``deep`` false, the parent class's shallow parameters are
        returned. Otherwise the result is a copy of
        ``self.named_classifiers`` extended with one ``'<name>__<param>'``
        entry for every parameter each member reports through
        ``get_params(deep=True)``.
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)

        params = self.named_classifiers.copy()
        for clf_name, clf in six.iteritems(self.named_classifiers):
            nested = clf.get_params(deep=True)
            for param_name, param_value in six.iteritems(nested):
                params["%s__%s" % (clf_name, param_name)] = param_value
        return params
Пример #20
0
 def get_params(self, deep=True):
     """Get the classifier parameter names when running GridSearch.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.name_classifiers``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(MajorityVoteClassifier, self).get_params(deep=False)

     # Build a dictionary whose keys are
     # "<classifier name>__<parameter name>" and whose values are the
     # parameter values.
     # NOTE(review): the attribute is spelled ``name_classifiers`` here;
     # confirm it matches the name assigned in the constructor.
     params = self.name_classifiers.copy()
     for clf_name, clf in six.iteritems(self.name_classifiers):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
Пример #21
0
 def get_params(self, deep=True):
     """Get classifier parameter names for GridSearch.

     Useful for hyperparameter tuning with grid search: the deep result
     gives access to the parameters of the individual classifiers in the
     ensemble.

     With ``deep`` false, the parent class's shallow parameters are
     returned. Otherwise the result is a copy of ``self.named_classifiers``
     extended with one ``'<name>__<param>'`` entry for every parameter each
     member reports through ``get_params(deep=True)``.
     """
     if not deep:
         return super(MajorityVoteClassifier, self).get_params(deep=False)

     params = self.named_classifiers.copy()
     for clf_name, clf in six.iteritems(self.named_classifiers):
         nested = clf.get_params(deep=True)
         for param_name, param_value in six.iteritems(nested):
             params['%s__%s' % (clf_name, param_name)] = param_value
     return params
    def get_params(self, deep=False):
        """Return estimator parameter names for GridSearch support.

        Parameters
        ----------
        deep : bool, optional (default=False)
            Only ``deep=False`` is supported; the parent class's shallow
            parameters are returned.

        Returns
        -------
        params : dict
            Result of the parent class's ``get_params(deep=False)``.

        Raises
        ------
        NotImplementedError
            If ``deep`` is true.
        """
        if deep:
            # TODO: implement `named_estimators`; once it exists, merge it
            # and each estimator's '<name>__<param>' entries into the
            # shallow parameters instead of raising.
            raise NotImplementedError("`deep` attribute not yet supported.")

        return super(StackingRegressor, self).get_params(deep=False)
Пример #23
0
    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support.

        With ``deep`` false, the parent class's shallow parameters are
        returned. Otherwise the result is a copy of
        ``self.named_classifiers``, then updated with a copy of
        ``self.named_meta_classifier``; after each of the two mappings is
        merged, one ``'<name>__<param>'`` entry is added for every
        parameter its members report through ``get_params(deep=True)``.
        """
        if not deep:
            return super(StackingCVClassifier, self).get_params(deep=False)

        params = self.named_classifiers.copy()
        for clf_name, clf in six.iteritems(self.named_classifiers):
            nested = clf.get_params(deep=True)
            for param_name, param_value in six.iteritems(nested):
                params['%s__%s' % (clf_name, param_name)] = param_value

        params.update(self.named_meta_classifier.copy())
        for meta_name, meta in six.iteritems(self.named_meta_classifier):
            nested = meta.get_params(deep=True)
            for param_name, param_value in six.iteritems(nested):
                params['%s__%s' % (meta_name, param_name)] = param_value
        return params
Пример #24
0
    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support.

        With ``deep`` false, the parent class's shallow parameters are
        returned. Otherwise the result is a copy of
        ``self.named_classifiers``, then updated with a copy of
        ``self.named_meta_classifier``; after each of the two mappings is
        merged, one ``'<name>__<param>'`` entry is added for every
        parameter its members report through ``get_params(deep=True)``.
        """
        if not deep:
            return super(StackingCVClassifier, self).get_params(deep=False)

        params = self.named_classifiers.copy()
        for clf_name, clf in six.iteritems(self.named_classifiers):
            nested = clf.get_params(deep=True)
            for param_name, param_value in six.iteritems(nested):
                params['%s__%s' % (clf_name, param_name)] = param_value

        params.update(self.named_meta_classifier.copy())
        for meta_name, meta in six.iteritems(self.named_meta_classifier):
            nested = meta.get_params(deep=True)
            for param_name, param_value in six.iteritems(nested):
                params['%s__%s' % (meta_name, param_name)] = param_value
        return params
Пример #25
0
    def get_feature_names(self):
        """Return the vocabulary terms ordered by their integer index.

        Raises
        ------
        ValueError
            If ``vocabulary_`` is missing or empty.
        """
        has_vocabulary = (hasattr(self, 'vocabulary_')
                          and len(self.vocabulary_) != 0)
        if not has_vocabulary:
            raise ValueError("Vocabulary wasn't fitted or is empty!")

        # order the (term, index) pairs by index, then keep only the terms
        by_index = sorted(six.iteritems(self.vocabulary_), key=itemgetter(1))
        return [term for term, _ in by_index]
Пример #26
0
    def mypp(params, offset=0, printer=repr):
        """Render a parameter mapping as a wrapped, justified ``k=v`` list.

        Parameters
        ----------
        params : dict
            Parameter names mapped to values; entries are emitted in sorted
            order.
        offset : int, optional (default=0)
            Seeds the running line length and sets the indentation of
            continuation lines (``1 + offset // 2`` spaces).
        printer : callable, optional (default=repr)
            Renders every value whose type is not exactly ``float``.

        Returns
        -------
        lines : str
            The joined representation with trailing spaces stripped from
            every line.
        """
        # Temporarily tighten numpy's print options; the ``finally`` makes
        # sure the caller's options are restored even if ``printer`` raises.
        options = np.get_printoptions()
        np.set_printoptions(precision=5, threshold=64, edgeitems=2)
        try:
            params_list = []
            this_line_length = offset
            line_sep = ',\n' + (1 + offset // 2) * ' '
            for i, (k, v) in enumerate(sorted(params.items())):
                if type(v) is float:
                    # use str for representing floating point numbers
                    # this way we get consistent representation across
                    # architectures and versions.
                    this_repr = '%s=%s' % (k, str(v))
                else:
                    # use repr of the rest
                    this_repr = '%s=%s' % (k, printer(v))
                # the threshold is so large that truncation effectively
                # never triggers for realistic values
                if len(this_repr) > 500000000:
                    this_repr = this_repr[:300] + '...' + this_repr[-100:]
                if i > 0:
                    if (this_line_length + len(this_repr) >= 75
                            or '\n' in this_repr):
                        params_list.append(line_sep)
                        this_line_length = len(line_sep)
                    else:
                        params_list.append(', ')
                        this_line_length += 2
                params_list.append(this_repr)
                this_line_length += len(this_repr)
        finally:
            np.set_printoptions(**options)

        lines = ''.join(params_list)
        # Strip trailing space to avoid nightmare in doctests
        lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))
        return lines
Пример #27
0
    def transform(self, raw_documents):
        """Extract token counts out of raw text documents using the vocabulary
        fitted with fit or the one provided in the constructor.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.
            It is consumed exactly once; an empty iterable results in a
            document count of zero being passed on.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
            Whatever ``self._term_counts_to_matrix`` builds from the
            collected (row, column, count) triples.

        Raises
        ------
        ValueError
            If ``vocabulary_`` is missing or empty.
        """
        if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
            raise ValueError("Vocabulary wasn't fitted or is empty!")

        # raw_documents can be an iterable so we don't know its size in
        # advance

        # result of document conversion to term count arrays
        i_indices = _make_int_array()
        j_indices = _make_int_array()
        values = _make_int_array()

        analyze = self.build_analyzer()
        # Start at -1 so an empty iterable gives a document count of 0
        # below instead of leaving ``n_doc`` unbound after the loop.
        n_doc = -1
        for n_doc, doc in enumerate(raw_documents):
            term_counts = Counter(analyze(doc))

            for term, count in six.iteritems(term_counts):
                # terms outside the fitted vocabulary are ignored
                if term in self.vocabulary_:
                    i_indices.append(n_doc)
                    j_indices.append(self.vocabulary_[term])
                    values.append(count)
        # convert the last zero-based index into a document count
        n_doc += 1
        return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
Пример #28
0
def _pprint(params, offset=0, printer=repr):
    """
    From: https://github.com/scikit-learn/scikit-learn/blob/51a765a/sklearn/base.py
    With line 142-143 removed
    """
    # Do a multi-line justified repr:
    options = np.get_printoptions()
    np.set_printoptions(precision=5, threshold=64, edgeitems=2)
    params_list = list()
    this_line_length = offset
    line_sep = ',\n' + (1 + offset // 2) * ' '
    for i, (k, v) in enumerate(sorted(six.iteritems(params))):
        if type(v) is float:
            # use str for representing floating point numbers
            # this way we get consistent representation across
            # architectures and versions.
            this_repr = '%s=%s' % (k, str(v))
        else:
            # use repr of the rest
            this_repr = '%s=%s' % (k, printer(v))
        if i > 0:
            if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr):
                params_list.append(line_sep)
                this_line_length = len(line_sep)
            else:
                params_list.append(', ')
                this_line_length += 2
        params_list.append(this_repr)
        this_line_length += len(this_repr)

    np.set_printoptions(**options)
    lines = ''.join(params_list)
    # Strip trailing space to avoid nightmare in doctests
    lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))
    return lines
Пример #29
0
def _new_base_estimator(est, clonable_kwargs):
    """Build a fresh H2O estimator of the requested kind.

    When the grid searches are pickled, the estimator has to be dropped
    out. When it is loaded back in, a new one has to be reinstated: the
    fit is predicated on being able to clone a base estimator, so there
    must be an estimator to clone and fit.

    Parameters
    ----------

    est : str
        The type of model to build; one of ``'dl'``, ``'gbm'``, ``'glm'``,
        ``'nb'`` or ``'rf'``. Any other key raises ``KeyError``.

    clonable_kwargs : dict
        Parameters to install on the new estimator. Each pair is passed
        through ``_kv_str`` and stored in the estimator's ``_parms``.

    Returns
    -------

    estimator : H2OEstimator
        The newly created base estimator
    """
    # 'glrm' (H2OGeneralizedLowRankEstimator) and 'km' (H2OKMeansEstimator)
    # are intentionally not enabled.
    estimator_classes = {
        'dl': H2ODeepLearningEstimator,
        'gbm': H2OGradientBoostingEstimator,
        'glm': H2OGeneralizedLinearEstimator,
        'nb': H2ONaiveBayesEstimator,
        'rf': H2ORandomForestEstimator,
    }

    estimator = estimator_classes[est]()
    for raw_key, raw_val in six.iteritems(clonable_kwargs):
        key, val = _kv_str(raw_key, raw_val)
        estimator._parms[key] = val

    return estimator
Пример #30
0
def test_type_of_target():
    """Check ``type_of_target`` on valid, invalid and legacy inputs.

    Every example in ``EXAMPLES`` must be classified as its group; the
    non-array-like examples, the legacy multilabel sequences and a pandas
    ``SparseSeries`` must each raise ``ValueError`` with the expected
    message. The ``SparseSeries`` check is skipped when it cannot be
    imported from pandas.
    """
    for group, group_examples in iteritems(EXAMPLES):
        for example in group_examples:
            assert_equal(type_of_target(example),
                         group,
                         msg=('type_of_target(%r) should be %r, got %r' %
                              (example, group, type_of_target(example))))

    for example in NON_ARRAY_LIKE_EXAMPLES:
        # raw string: the backslashes are regex escapes, not string escapes
        msg_regex = r'Expected array-like \(array or non-string sequence\).*'
        assert_raises_regex(ValueError, msg_regex, type_of_target, example)

    for example in MULTILABEL_SEQUENCES:
        msg = ('You appear to be using a legacy multi-label data '
               'representation. Sequence of sequences are no longer supported;'
               ' use a binary array or sparse matrix instead.')
        assert_raises_regex(ValueError, msg, type_of_target, example)

    try:
        from pandas import SparseSeries
    except ImportError:
        # Without SparseSeries there is nothing left to check; falling
        # through would hit a NameError on the line below.
        return
    y = SparseSeries([1, 0, 0, 1, 0])
    msg = "y cannot be class 'SparseSeries'."
    assert_raises_regex(ValueError, msg, type_of_target, y)
Пример #31
0
    def get_feature_names(self):
        """Return the vocabulary terms ordered by their integer index.

        Raises
        ------
        ValueError
            If ``vocabulary_`` is missing or empty.
        """
        has_vocabulary = (hasattr(self, 'vocabulary_')
                          and len(self.vocabulary_) != 0)
        if not has_vocabulary:
            raise ValueError("Vocabulary wasn't fitted or is empty!")

        # order the (term, index) pairs by index, then keep only the terms
        by_index = sorted(six.iteritems(self.vocabulary_), key=itemgetter(1))
        return [term for term, _ in by_index]
Пример #32
0
    def _fit(self, X, y=None, **fit_params):
        """Fit every step but the last, timing each one.

        ``fit_params`` keys of the form ``'<step>__<param>'`` are routed to
        the named step. Each non-final step that is not ``None`` is fitted
        and applied to the running data; the elapsed wall-clock time of
        every non-final step (including ``None`` ones) is reported to
        ``self.pipeline_info.add_preprocessor_timing``.

        Returns
        -------
        Xt : the data after all non-final steps
        final_params : dict
            Fit parameters routed to the last step, or ``{}`` when
            ``self._final_estimator`` is ``None``.
        """
        # self._validate_steps()

        fit_params_steps = {
            name: {} for name, step in self.steps if step is not None
        }
        for pname, pval in six.iteritems(fit_params):
            step_name, param = pname.split('__', 1)
            fit_params_steps[step_name][param] = pval

        Xt = X
        for name, transform in self.steps[:-1]:
            started = time.time()

            if transform is not None:
                if hasattr(transform, "fit_transform"):
                    Xt = transform.fit_transform(
                        Xt, y, **fit_params_steps[name])
                else:
                    fitted = transform.fit(Xt, y, **fit_params_steps[name])
                    Xt = fitted.transform(Xt)

            elapsed = time.time() - started
            self.pipeline_info.add_preprocessor_timing(name, elapsed)

        if self._final_estimator is None:
            return Xt, {}

        final_name = self.steps[-1][0]
        return Xt, fit_params_steps[final_name]
Пример #33
0
def test_type_of_target():
    """Check ``type_of_target`` on valid, invalid and legacy inputs.

    Every example in ``EXAMPLES`` must be classified as its group; the
    non-array-like examples, the legacy multilabel sequences and a pandas
    ``SparseSeries`` must each raise ``ValueError`` with the expected
    message. The test is skipped at the ``SparseSeries`` stage when it
    cannot be imported from pandas.
    """
    for group, group_examples in iteritems(EXAMPLES):
        for example in group_examples:
            assert_equal(type_of_target(example), group,
                         msg=('type_of_target(%r) should be %r, got %r'
                              % (example, group, type_of_target(example))))

    for example in NON_ARRAY_LIKE_EXAMPLES:
        # raw string: the backslashes are regex escapes, not string escapes
        msg_regex = r'Expected array-like \(array or non-string sequence\).*'
        assert_raises_regex(ValueError, msg_regex, type_of_target, example)

    for example in MULTILABEL_SEQUENCES:
        msg = ('You appear to be using a legacy multi-label data '
               'representation. Sequence of sequences are no longer supported;'
               ' use a binary array or sparse matrix instead.')
        assert_raises_regex(ValueError, msg, type_of_target, example)

    try:
        from pandas import SparseSeries
    except ImportError:
        raise SkipTest("Pandas not found")

    y = SparseSeries([1, 0, 0, 1, 0])
    msg = "y cannot be class 'SparseSeries'."
    assert_raises_regex(ValueError, msg, type_of_target, y)
Пример #34
0
    def transform(self, X):
        """Apply the schema normalization.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. It is first passed through
            ``check_dataframe`` and the result is what gets modified
            (presumably a copy of the input -- confirm against
            ``check_dataframe``).

        Returns
        -------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The validated frame with every column named in ``self.schema``
            cast to its mapped dtype.
        """
        check_is_fitted(self, "fit_cols_")
        X, _ = check_dataframe(X, cols=self.cols)

        # the columns seen at fit time must be present in this frame
        validate_test_set_columns(self.fit_cols_, X.columns)

        # cast each schema column to its target dtype
        for col_name, dtype in six.iteritems(self.schema):
            X[col_name] = X[col_name].astype(dtype)

        return X  # DataFrame
Пример #35
0
    def _fit_transform(self, X, y=None, **fit_params):
        """Fit every step in sequence, feeding each the previous output.

        ``fit_params`` keys of the form ``'<step>__<param>'`` are routed to
        the named step. Each non-``None`` step is fitted through a
        memory-cached ``_fit_transform_one`` and ``self.steps`` is updated
        in place with the fitted transformer that call returns.

        Returns
        -------
        Xt : the data after the last non-``None`` step
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()

        # Setup the memory
        memory = check_memory(self.memory)
        fit_transform_one_cached = memory.cache(_fit_transform_one)

        fit_params_steps = {
            name: {} for name, step in self.steps if step is not None
        }
        for pname, pval in six.iteritems(fit_params):
            step_name, param = pname.split('__', 1)
            fit_params_steps[step_name][param] = pval

        Xt = X
        for step_idx, (name, transformer) in enumerate(self.steps):
            if transformer is None:
                continue

            caching_disabled = (hasattr(memory, 'cachedir')
                                and memory.cachedir is None)
            if caching_disabled:
                # we do not clone when caching is disabled to preserve
                # backward compatibility
                candidate = transformer
            else:
                candidate = clone(transformer)

            # Fit or load from cache the current transformer
            Xt, fitted_transformer = fit_transform_one_cached(
                candidate, None, Xt, y, **fit_params_steps[name])

            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.steps[step_idx] = (name, fitted_transformer)

        return Xt
Пример #36
0
def get_grid_results_table(search):
    """Get the grid results from a fit ``RandomizedSearchCV``.

    Parameters
    ----------
    search : RandomizedSearchCV
        The pre-fit grid search. Only its ``cv_results_`` mapping is read.

    Returns
    -------
    res : pd.DataFrame
        One column per timing/score statistic (``mean_fit_time``,
        ``std_fit_time``, ``mean_score_time``, ``std_score_time``,
        ``mean_test_score``, ``std_test_score``) plus one column per
        searched parameter, named after the parameter with the leading
        ``param_`` removed.
    """
    # the search results
    res = search.cv_results_

    # unpack the dict
    dct = {
        k: res[k]
        for k in ('mean_fit_time', 'std_fit_time', 'mean_score_time',
                  'std_score_time', 'mean_test_score', 'std_test_score')
    }

    prefix = "param_"
    for k, v in res.items():
        if k.startswith(prefix):
            # Strip only the leading prefix: splitting on it would also cut
            # parameter names that contain "param_" again further in
            # (e.g. "param_clf__param_grid").
            dct[k[len(prefix):]] = v.data

    return pd.DataFrame.from_dict(dct)
Пример #37
0
    def restrict(self, support, indices=False):
        """Restrict the features to those in support.

        Rebuilds ``vocabulary_`` and ``feature_names_`` in place so that
        only the selected features remain, renumbered from zero in the
        order they appear in ``support``.

        Parameters
        ----------
        support : array-like
            Boolean mask or list of indices (as returned by the get_support
            member of feature selectors).
        indices : boolean, optional
            Whether support is a list of indices.

        Returns
        -------
        self
        """
        if not indices:
            # turn the mask into the positions of its truthy entries
            support = np.where(support)[0]

        old_names = self.feature_names_
        restricted = {}
        for position in support:
            restricted[old_names[position]] = len(restricted)

        self.vocabulary_ = restricted
        by_index = sorted(six.iteritems(restricted), key=itemgetter(1))
        self.feature_names_ = [name for name, _ in by_index]

        return self
Пример #38
0
    def set_params(self, **params):
        """Set parameters on this estimator and on its two GMMs.

        Keys of the form ``'<prefix>__<name>'`` are collected per GMM
        according to whether the prefix contains ``'gmm_0'`` or ``'gmm_1'``;
        plain keys must be among ``self.get_params(deep=True)`` and are set
        directly as attributes. Both GMMs then have ``set_params`` called
        with a single ``params`` keyword.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            For a prefixed key matching neither GMM, or a plain key that is
            not a valid parameter.
        """
        if not params:
            # Simple optimisation to gain speed (inspect is slow)
            return self

        gmm_0_param_dict = {}
        gmm_1_param_dict = {}

        valid_params = self.get_params(deep=True)
        for key, value in six.iteritems(params):
            parts = key.split('__', 1)

            if len(parts) == 1:
                # simple objects case
                if key not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s'
                                     % (key, self.__class__.__name__))
                setattr(self, key, value)
                continue

            # combined key name
            prefix, name = parts
            if 'gmm_0' in prefix:
                gmm_0_param_dict[name] = value
            elif 'gmm_1' in prefix:
                gmm_1_param_dict[name] = value
            else:
                raise ValueError('Invalid parameter %s for estimator %s'
                                 % (key, self.__class__.__name__))

        self.gmm_0.set_params(params=gmm_0_param_dict)
        # NOTE(review): gmm_1 receives gmm_0's parameters and
        # ``gmm_1_param_dict`` is collected but never used. This looks like
        # it may be a copy/paste slip -- confirm whether it is intentional.
        self.gmm_1.set_params(params=gmm_0_param_dict)

        return self
Пример #39
0
    def fit(self, X, y=None):
        """Learn a list of feature name -> indices mappings.

        String-valued features are expanded to
        ``'<name><separator><value>'`` and the (name, value) pair behind
        each expanded name is remembered in ``self._onehot_dict``.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).
        y : (ignored)

        Returns
        -------
        self
        """
        # collect all the possible feature names
        seen = set()
        for sample in X:
            for feat, val in six.iteritems(sample):
                if not isinstance(val, six.string_types):
                    seen.add(feat)
                    continue

                expanded = "%s%s%s" % (feat, self.separator, val)
                if expanded not in self._onehot_dict:
                    self._onehot_dict[expanded] = [feat, val]
                seen.add(expanded)

        # sort the feature names to define the mapping
        ordered = sorted(seen)
        self.vocabulary_ = {feat: idx for idx, feat in enumerate(ordered)}
        self.feature_names_ = ordered

        return self
Пример #40
0
    def _remove_highandlow(self, cscmatrix, feature_to_pos, high, low):
        """Remove too rare or too common features.

        A column is kept when its number of stored entries (the difference
        between consecutive ``indptr`` values) lies between ``low`` and
        ``high`` inclusive; every other column is pruned.

        This does not prune samples with zero features.

        Returns
        -------
        matrix : ``cscmatrix`` restricted to the kept columns
        feature_to_pos : dict
            The input mapping limited to kept columns, renumbered to the
            new column positions.
        removed_indices : set
            Original indices of the pruned columns.
        """
        indptr = cscmatrix.indptr

        kept_indices = []
        removed_indices = set()
        for colptr in xrange(len(indptr) - 1):
            n_stored = indptr[colptr + 1] - indptr[colptr]
            if low <= n_stored <= high:
                kept_indices.append(colptr)
            else:
                removed_indices.add(colptr)

        # old column index -> new column index
        new_mapping = {old: new for new, old in enumerate(kept_indices)}
        feature_to_pos = {
            feature: new_mapping[old]
            for feature, old in six.iteritems(feature_to_pos)
            if old in new_mapping
        }

        return cscmatrix[:, kept_indices], feature_to_pos, removed_indices
Пример #41
0
    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``
            The Pandas frame to transform. It is first passed through
            ``validate_is_pd`` and the result is what gets modified
            (presumably a copy of the input -- confirm against
            ``validate_is_pd``).


        Returns
        -------

        X : Pandas ``DataFrame`` or array
            The validated frame with every column named in ``sq_nms_``
            divided by its stored norm; returned as a frame when
            ``self.as_df`` is truthy, otherwise via ``as_matrix()``.
        """
        check_is_fitted(self, 'sq_nms_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)

        # scale by norms
        for col_name, norm in six.iteritems(self.sq_nms_):
            X[col_name] /= norm

        if self.as_df:
            return X
        return X.as_matrix()
Пример #42
0
    def restrict(self, support, indices=False):
        """Restrict the features to those in support using feature selection.

        This function modifies the estimator in-place. It is a no-op once
        ``has_been_restricted`` has been set by a previous call.

        Parameters
        ----------
        support : array-like
            Mask, or list of indices when ``indices`` is true.
        indices : boolean, optional
            Whether support is a list of indices.

        Returns
        -------
        self
        """
        if self.has_been_restricted == True:
            return self

        if not indices:
            # turn the mask into the positions of its truthy entries
            support = np.where(support)[0]

        old_names = self.feature_names_
        restricted = {}
        for position in support:
            restricted[old_names[position]] = len(restricted)

        self.vocabulary_ = restricted
        by_index = sorted(six.iteritems(restricted), key=itemgetter(1))
        self.feature_names_ = [name for name, _ in by_index]

        self.has_been_restricted = True

        return self
Пример #43
0
def test_paired_distances():
    # paired_distances must agree with every registered paired metric.
    rng = np.random.RandomState(0)
    # Two independent draws of the same shape, so that Y != X.
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((5, 4))
    for name, paired_func in iteritems(PAIRED_DISTANCES):
        expected = paired_distances(X, Y, metric=name)
        # Calling the metric function directly must give the same values,
        # for dense input first and then for CSR input.
        assert_array_almost_equal(expected, paired_func(X, Y))
        assert_array_almost_equal(
            expected, paired_func(csr_matrix(X), csr_matrix(Y)))
        if name not in PAIRWISE_DISTANCE_FUNCTIONS:
            continue
        # The diagonal of the full pairwise matrix must match as well.
        full = PAIRWISE_DISTANCE_FUNCTIONS[name](X, Y)
        assert_array_almost_equal(np.diag(full), expected)

    # A callable metric must match the built-in metric of the same meaning.
    def manhattan(x, y):
        return np.abs(x - y).sum(axis=0)

    assert_array_almost_equal(
        paired_distances(X, Y, metric='manhattan'),
        paired_distances(X, Y, metric=manhattan))

    # X and Y with different numbers of rows must raise a ValueError.
    shorter = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, shorter)
Пример #44
0
 def set_params(self, **params):
     """Set the parameters of this solver.

     Keys of the form ``<name>__<sub_name>`` are forwarded to the
     ``set_params`` method of the object stored under ``<name>`` in
     ``self.get_params(deep=True)``; every other key is set as an
     attribute on ``self``.

     :raises ValueError: if a key, or the ``<name>`` prefix of a nested
         key, is not among the names in ``self.get_params(deep=True)``.
     :returns self : self
         Returns self
     """
     if not params:
         # Simple optimization to gain speed(inspect is slow)
         return self
     valid_params = self.get_params(deep=True)
     # ``params`` is the plain ``**kwargs`` dict, so ``.items()`` is enough;
     # this avoids the function-local import of ``sklearn.externals.six``,
     # which newer scikit-learn releases no longer ship.
     for key, value in params.items():
         name, delim, sub_name = key.partition('__')
         if delim:
             # nested objects case
             if name not in valid_params:
                 raise ValueError('Invalid parameter %s for estimator %s. '
                                  'Check the list of available parameters '
                                  'with `estimator.get_params().keys()`.' %
                                  (name, self))
             sub_object = valid_params[name]
             sub_object.set_params(**{sub_name: value})
         else:
             # simple objects case
             if key not in valid_params:
                 raise ValueError('Invalid parameter %s for estimator %s. '
                                  'Check the list of available parameters '
                                  'with `estimator.get_params().keys()`.' %
                                  (key, self.__class__.__name__))
             setattr(self, key, value)
     return self
Пример #45
0
    def fit(self, X, y=None):
        """Build the feature name -> column index vocabulary from ``X``.

        A feature whose value is a string is expanded to
        ``<name><separator><value>``; the ``[name, value]`` pair behind
        each expanded name is recorded in ``self._onehot_dict`` the first
        time that name is seen.

        Parameters
        ----------
        X : iterable over Mappings
            Mappings from feature names (arbitrary Python objects) to
            feature values (strings or convertible to dtype).
        y : (ignored)

        Returns
        -------
        self
        """
        # gather every distinct (possibly expanded) feature name
        seen = set()
        for sample in X:
            for name, value in six.iteritems(sample):
                if not isinstance(value, six.string_types):
                    seen.add(name)
                    continue
                expanded = "%s%s%s" % (name, self.separator, value)
                if expanded not in self._onehot_dict:
                    self._onehot_dict[expanded] = [name, value]
                seen.add(expanded)

        # the sorted order of the names defines their column indices
        ordered = sorted(seen)
        self.vocabulary_ = dict(zip(ordered, range(len(ordered))))
        self.feature_names_ = ordered

        return self
def most_common(d):
    """Return the ``(key, value)`` items of ``d``, highest value first.

    Meant for a ``defaultdict(int)`` of counts; behaves like
    ``Counter.most_common`` (Python >= 2.7) called without an argument.
    """
    pairs = list(six.iteritems(d))
    pairs.sort(key=operator.itemgetter(1), reverse=True)
    return pairs
Пример #47
0
    def __init__(self, cols=None, as_df=True, trans_col_name=None, **kwargs):
        """Create the wrapped estimator and mirror its parameters on ``self``.

        ``cols`` and ``as_df`` are handed to the parent constructor. The
        type stored in the subclass' ``_cls`` attribute is instantiated
        with no arguments, ``kwargs`` are applied to it via ``set_params``,
        and every parameter it reported before that call is also set as an
        attribute on ``self`` (taking the value from ``kwargs`` when given).
        """
        super(_SelectiveTransformerWrapper, self).__init__(
            cols=cols, as_df=as_df)

        # subclasses must declare the wrapped type as a STATIC attribute
        try:
            wrapped_type = self._cls
        except AttributeError:
            raise DeveloperError("_SelectiveTransformerWrapper subclasses "
                                 "must contain a static _cls attribute that "
                                 "maps to a sklearn type!")

        # build the estimator with its defaults and remember those defaults
        estimator = wrapped_type()
        self.estimator_ = estimator
        defaults = estimator.get_params(deep=True)

        # push the caller's overrides into the estimator itself ...
        estimator.set_params(**kwargs)

        # ... and expose every parameter on this object too, so the class
        # behaves like sklearn in grid search
        for name, default in six.iteritems(defaults):
            setattr(self, name, kwargs.get(name, default))

        self.trans_col_name = trans_col_name
Пример #48
0
def test_paired_distances():
    """ Check paired_distances against each registered paired metric. """
    rng = np.random.RandomState(0)
    # Two independent draws of the same shape, so that Y != X.
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((5, 4))
    for name, paired_func in iteritems(PAIRED_DISTANCES):
        by_name = paired_distances(X, Y, metric=name)
        direct = paired_func(X, Y)
        assert_array_almost_equal(by_name, direct)
        if name not in PAIRWISE_DISTANCE_FUNCTIONS:
            continue
        # The diagonal of the pairwise_distances result must give the
        # same values.
        diagonal = np.diag(PAIRWISE_DISTANCE_FUNCTIONS[name](X, Y))
        assert_array_almost_equal(diagonal, by_name)

    # A callable metric must match the equivalent named metric.
    def manhattan(x, y):
        return np.abs(x - y).sum(axis=0)

    named = paired_distances(X, Y, metric='manhattan')
    from_callable = paired_distances(X, Y, metric=manhattan)
    assert_array_almost_equal(named, from_callable)

    # X and Y with different numbers of rows must raise a ValueError.
    shorter = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, shorter)
Пример #49
0
 def _move_available():
     # Move every entry of ``deps`` that has no parents into ``available``.
     # The keys are collected first so ``deps`` is not modified while it is
     # being iterated.
     ready = [node for node, parents in iteritems(deps) if not parents]
     for node in ready:
         available.add(node)
     for node in ready:
         del deps[node]
Пример #50
0
 def mapper(X, separator=self.separator):
     """Add the set of feature names found in ``X`` to ``accum``.

     ``X`` is an iterable of mappings from feature name to value. A feature
     whose value is a string is named ``<name><separator><value>``; any
     other feature keeps its own name. ``separator`` defaults to the value
     of ``self.separator`` at the time ``mapper`` is defined.
     """
     feature_names = set()
     for x in X:
         for f, v in six.iteritems(x):
             if isinstance(v, six.string_types):
                 # Use the ``separator`` argument: the original body read
                 # ``self.separator`` here and silently ignored it.
                 f = "%s%s%s" % (f, separator, v)
             feature_names.add(f)
     accum.add(feature_names)
 def update(self, other):
     """Add the counts for the elements of ``other`` to this object.

     An instance of this same class contributes its stored counts; any
     other iterable contributes one count per element it yields.
     """
     if not isinstance(other, self.__class__):
         for item in other:
             self[item] += 1
         return
     for item, count in six.iteritems(other):
         self[item] += count
Пример #52
0
def test_is_multilabel():
    # Only groups whose name starts with "multilabel" hold examples that
    # is_multilabel should accept.
    for group, group_examples in iteritems(EXAMPLES):
        expect_multilabel = group.startswith("multilabel")
        check = assert_true if expect_multilabel else assert_false
        expected = "True" if expect_multilabel else "False"
        for example in group_examples:
            check(is_multilabel(example),
                  msg="is_multilabel(%r) should be %s" % (example, expected))
Пример #53
0
 def get_params(self, deep=True):
     """Return the parameters of this object as a dict.

     With ``deep`` false the parent implementation is used. Otherwise the
     result maps each name in ``transformer_list`` to its transformer and
     adds each transformer's own deep parameters under
     ``<name>__<parameter>`` keys.
     """
     if not deep:
         return super(FeatureUnion, self).get_params(deep=False)

     out = dict(self.transformer_list)
     for name, trans in self.transformer_list:
         out.update(
             ('%s__%s' % (name, key), value)
             for key, value in iteritems(trans.get_params(deep=True)))
     return out
Пример #54
0
def test_is_multilabel():
    # is_multilabel must hold exactly for the groups named 'multilabel...'.
    for group, group_examples in iteritems(EXAMPLES):
        checker, expected = ((assert_true, 'True')
                             if group.startswith('multilabel')
                             else (assert_false, 'False'))
        for example in group_examples:
            checker(
                is_multilabel(example),
                msg='is_multilabel(%r) should be %s' % (example, expected))
Пример #55
0
def test_type_of_target():
    # Each example must be classified as the group it is filed under.
    for expected_type, examples in iteritems(EXAMPLES):
        for example in examples:
            assert_equal(
                type_of_target(example), expected_type,
                msg='type_of_target(%r) should be %r, got %r' % (
                    example, expected_type, type_of_target(example)))

    # Inputs that are not array-like must be rejected with a ValueError.
    for bad_input in NON_ARRAY_LIKE_EXAMPLES:
        assert_raises(ValueError, type_of_target, bad_input)
Пример #56
0
 def set_params(self, **params):
     """Set the given parameters as attributes on this object.

     Keys without ``__`` are assigned with ``setattr``. Keys containing
     ``__`` are only reported through ``print`` and are not applied.

     :returns self : self
     """
     if not params:
         return self
     # ``params`` is the plain ``**kwargs`` dict, so ``.items()`` suffices.
     for key, value in params.items():
         split = key.split('__', 1)
         if len(split) > 1:
             print("length is greter than one ", split, value)
         else:
             print("length is one ", split, value)
             setattr(self, key, value)
     # Return ``self`` on every path, as the empty-``params`` branch already
     # does, so that calls can be chained.
     return self
Пример #57
0
def test_is_label_indicator_matrix():
    # Only the 'multilabel-indicator' group holds label indicator matrices.
    for group, group_examples in iteritems(EXAMPLES):
        is_indicator_group = group == 'multilabel-indicator'
        check = assert_true if is_indicator_group else assert_false
        expected = 'True' if is_indicator_group else 'False'
        for example in group_examples:
            check(
                is_label_indicator_matrix(example),
                msg='is_label_indicator_matrix(%r) should be %s' % (
                    example, expected))
Пример #58
0
def test_is_sequence_of_sequences():
    # Only the 'multilabel-sequences' group holds sequences of sequences.
    for group, group_examples in iteritems(EXAMPLES):
        checker, expected = ((assert_true, 'True')
                             if group == 'multilabel-sequences'
                             else (assert_false, 'False'))
        for example in group_examples:
            checker(
                is_sequence_of_sequences(example),
                msg='is_sequence_of_sequences(%r) should be %s' % (
                    example, expected))