Example #1
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        X, y = indexable(X, y)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        base_estimator = clone(self.estimator)

        # build the lazy search over the parameter grid, then evaluate the
        # whole dask graph with a single compute() call
        best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                               self.scorer_, self.fit_params, self.iid)
        best = best.compute()

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if isinstance(base_estimator, Pipeline):
            # convert the lazy Pipeline wrapper back to a plain sklearn Pipeline
            base_estimator = base_estimator.to_sklearn().compute()

        if self.refit:
            # refit the best parameter setting on the entire dataset
            best_estimator = base_estimator.set_params(**best.parameters)
            if y is not None:
                self.best_estimator_ = best_estimator.fit(
                    X, y, **self.fit_params)
            else:
                self.best_estimator_ = best_estimator.fit(X, **self.fit_params)
        return self
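Usage sketch (not from the original source): this `_fit` is the internal entry point of a dask-backed grid-search class; `DaskGridSearchCV` below is a hypothetical name for such a wrapper whose public `fit` delegates to `_fit`.

from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
# DaskGridSearchCV is hypothetical; any GridSearchCV-like class built on this
# _fit would be driven the same way
search = DaskGridSearchCV(SVC(), param_grid={'C': [0.1, 1.0, 10.0]}, cv=3)
search.fit(iris.data, iris.target)
print(search.best_params_, search.best_score_)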
Example #2
import copy

import numpy as np
import scipy.sparse as sp
from sklearn import cross_validation
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# NOTE: find_index_for_dynamic_feature is a project-local helper (not shown here).

def dynamic_cross_val_predict(estimator, fv, esa_feature_list, unigram_feature_list,
                              dynamic_X, y=None, cv=None, verbose=0, fit_params=None):
    print("dynamic predict cross val with %s" % (esa_feature_list + unigram_feature_list))


    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))

    preds_blocks = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X at every CV step
        for i in range(len(fv)):  # each i indexes one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec for this fold
            for feature in esa_feature_list:
                # update the i-th feature dict with the ESA feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])
            for feature in unigram_feature_list:
                # update the i-th feature dict with the unigram feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        preds_blocks.append(
            cross_validation._fit_and_predict(cross_validation.clone(estimator),
                                              X, y, train, test, verbose,
                                              fit_params))

        cross_val_step += 1

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not cross_validation._check_is_partition(locs, cross_validation._num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
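A minimal sketch (not from the original source) of the inverse-permutation step above: `locs` records where each fold-ordered prediction belongs in the original `X`, and `inv_locs` turns that into a gather index.

import numpy as np

locs = np.array([2, 0, 3, 1])        # original index of each fold-ordered prediction
preds = np.array([20, 0, 30, 10])    # predictions in fold order
inv_locs = np.empty(len(locs), dtype=int)
inv_locs[locs] = np.arange(len(locs))
print(preds[inv_locs])               # -> [ 0 10 20 30], back in sample order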
Example #3
def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list,
                            dynamic_X, y=None, scoring=None, cv=None, verbose=0,
                            fit_params=None):
    # Same assumed imports and project-local helpers as dynamic_cross_val_predict above.
    print("dynamic cross val with %s" % (esa_feature_list + unigram_feature_list))
    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X= tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)
    scores = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X at every CV step
        for i in range(len(fv)):  # each i indexes one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec for this fold
            for feature in esa_feature_list:
                # update the i-th feature dict with the ESA feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])
            for feature in unigram_feature_list:
                # update the i-th feature dict with the unigram feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(
            cross_validation.clone(estimator), X, y, scorer,
            train, test, verbose, None, fit_params))  # parameters=None

        cross_val_step += 1


    return np.array(scores)[:, 0]
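A quick sketch (not from the original source) of why the `[:, 0]` slice works: in this generation of scikit-learn, `_fit_and_score` returns `[test_score, n_test_samples, scoring_time]` per fold.

import numpy as np

scores = [[0.80, 25, 0.01],    # fold 1: (test_score, n_test_samples, scoring_time)
          [0.90, 25, 0.02]]    # fold 2
print(np.array(scores)[:, 0])  # -> [0.8 0.9]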
Example #4
import warnings
from itertools import chain

from sklearn.utils import check_array, indexable, safe_indexing
from sklearn.utils.validation import _num_samples

# NOTE: DateShuffleSplit is a project-specific date-aware splitter (not shown here).

def train_test_split(dates, *arrays, **options):
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    dtype = options.pop('dtype', None)
    if dtype is not None:
        warnings.warn("dtype option is ignored and will be removed in 0.18.",
                      DeprecationWarning)

    allow_nd = options.pop('allow_nd', None)
    allow_lists = options.pop('allow_lists', None)

    if allow_lists is not None:
        warnings.warn("The allow_lists option is deprecated and will be "
                      "assumed True in 0.18 and removed.", DeprecationWarning)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))
    if allow_nd is not None:
        warnings.warn("The allow_nd option is deprecated and will be "
                      "assumed True in 0.18 and removed.", DeprecationWarning)
    if allow_lists is False or allow_nd is False:
        arrays = [check_array(x, 'csr', allow_nd=allow_nd,
                              force_all_finite=False, ensure_2d=False)
                  if x is not None else x
                  for x in arrays]

    if test_size is None and train_size is None:
        test_size = 0.25
    arrays = indexable(*arrays)
    assert len(dates) == _num_samples(arrays[0]), "There should be as many dates as input samples."
    cv = DateShuffleSplit(dates, test_size=test_size, random_state=random_state)
    train, test = next(iter(cv))
    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))
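Usage sketch (not from the original source): since `DateShuffleSplit` is project-specific, this only illustrates the calling convention, which mirrors sklearn's `train_test_split` plus a leading `dates` array with one date per sample.

import numpy as np

dates = np.arange('2015-01', '2015-05', dtype='datetime64[M]')  # one date per sample
X = np.arange(8).reshape(4, 2)
y = np.array([0, 1, 0, 1])

X_train, X_test, y_train, y_test = train_test_split(dates, X, y, test_size=0.25)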
Example #5
# Assumed imports for this snippet (sklearn 0.16-era private cross_validation helpers):
import numpy as np
from pyfm import pylibfm
from sklearn.base import is_classifier
from sklearn.cross_validation import (_check_cv, _check_is_partition,
                                      _fit_and_predict)
from sklearn.externals.joblib import Parallel, delayed
from sklearn.utils import indexable

def _cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1,
                       verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """Generate cross-validated estimates for each input data point

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example, a list or an array of at least 2d.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    cv : cross-validation generator or int, optional, default: None
        A cross-validation generator to use. If int, determines
        the number of folds in StratifiedKFold if y is binary
        or multiclass and estimator is a classifier, or the number
        of folds in KFold otherwise. If None, it is equivalent to cv=3.
        This generator must include all elements in the test set exactly once.
        Otherwise, a ValueError is raised.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    preds : ndarray
        This is the result of calling 'predict'
    """
    X, y = indexable(X, y)

    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
    # A fresh pylibfm.FM() is built for every fold (the passed-in `estimator`
    # only feeds the classifier check above), keeping the folds independent.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    preds_blocks = parallel(delayed(_fit_and_predict)(pylibfm.FM(), X, y,
                                                      train, test, verbose,
                                                      fit_params)
                            for train, test in cv)
    p = np.concatenate([p for p, _ in preds_blocks])
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not _check_is_partition(locs, X.shape[0]):
        raise ValueError('cross_val_predict only works for partitions')
    preds = p.copy()
    preds[locs] = p
    return preds
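Usage sketch (not from the original source): the `estimator` argument only drives the classifier check here, because a fresh `pylibfm.FM()` is created for every fold. pyFM works on sparse feature matrices; the binary targets below are an assumption for FM's default classification task.

import numpy as np
from pyfm import pylibfm
from scipy import sparse

X = sparse.csr_matrix(np.random.rand(40, 8))
y = np.random.randint(0, 2, size=40).astype(np.float64)  # assumed 0/1 targets
preds = _cross_val_predict(pylibfm.FM(), X, y, cv=4)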
Example #6
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y = indexable(X, y)

        cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

        base_estimator = clone(self.estimator)
        out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_, train,
                              test, self.verbose, parameters, self.fit_params,
                              return_parameters=True,
                              error_score=self.error_score)
               for parameters in parameter_iterable
               for train, test in cv]
        # Wrap the list of lazy per-fold results into a single dask value;
        # one compute() call then evaluates the whole search graph at once.
        self._dask_value = value(out)

        out, = compute(value(out))
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
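A minimal numeric sketch (not from the original source) of the `iid` branch above: with `iid=True` each fold's score is weighted by its test-set size, otherwise every fold counts equally.

fold_scores = [0.80, 0.90]
fold_sizes = [10, 30]

iid_score = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / float(sum(fold_sizes))
plain_score = sum(fold_scores) / float(len(fold_scores))
print(iid_score)    # 0.875 -- pulled toward the larger fold
print(plain_score)  # 0.85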