Example #1
    def _fit(self, ds, cv_attr=None):
        """General method to fit data"""
                   
        self.scoring, _ = _check_multimetric_scoring(self.estimator, scoring=self.scoring)
        
        X, y = get_ds_data(ds)
        y = LabelEncoder().fit_transform(y)
        indices = self._get_permutation_indices(len(y))
                
        values = []
        
        groups = None
        if cv_attr is not None:
            groups = LabelEncoder().fit_transform(ds.sa[cv_attr].value)
        
        
        for idx in indices:
            
            y_ = y[idx]

            scores = cross_validate(self.estimator, X, y_, groups,
                                  self.scoring, self.cv, self.n_jobs,
                                  self.verbose, return_estimator=True, return_splits=True)
            
            values.append(scores)
        
        return values
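
The method above depends on project-specific helpers (get_ds_data, _get_permutation_indices) and a custom cross_validate, so it is not runnable on its own. As a point of reference, a minimal self-contained sketch of the same pattern with a stock scikit-learn estimator could look like the following; it assumes scikit-learn <= 0.23, where _check_multimetric_scoring returns a (scorers, is_multimetric) tuple (0.24+ returns only the dict), and the import falls back for releases older than 0.22.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
try:  # the scorer module was made private in scikit-learn 0.22
    from sklearn.metrics._scorer import _check_multimetric_scoring
except ImportError:
    from sklearn.metrics.scorer import _check_multimetric_scoring

rng = np.random.RandomState(0)
X, y = rng.rand(40, 3), rng.randint(0, 2, size=40)

estimator = LogisticRegression()
# Normalize the requested metrics into a {name: scorer} dict.
scorers, is_multimetric = _check_multimetric_scoring(
    estimator, scoring=['accuracy', 'balanced_accuracy'])
assert is_multimetric

# The normalized dict can be passed straight to cross_validate.
results = cross_validate(estimator, X, y, scoring=scorers, cv=5)
print(sorted(results))  # e.g. ['fit_time', 'score_time', 'test_accuracy', 'test_balanced_accuracy']
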
Example #2
def test_check_scoring_and_check_multimetric_scoring():
    check_scoring_validator_for_single_metric_usecases(check_scoring)
    # To make sure the check_scoring is correctly applied to the constituent
    # scorers
    check_scoring_validator_for_single_metric_usecases(
        check_multimetric_scoring_single_metric_wrapper)

    # For multiple metric use cases
    # Make sure it works for the valid cases
    for scoring in (('accuracy', ), ['precision'], {
            'acc': 'accuracy',
            'precision': 'precision'
    }, ('accuracy', 'precision'), ['precision', 'accuracy'], {
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score)
    }):
        estimator = LinearSVC(random_state=0)
        estimator.fit([[1], [2], [3]], [1, 1, 0])

        scorers, is_multi = _check_multimetric_scoring(estimator, scoring)
        assert_true(is_multi)
        assert_true(isinstance(scorers, dict))
        assert_equal(sorted(scorers.keys()), sorted(list(scoring)))
        assert_true(
            all([
                isinstance(scorer, _PredictScorer)
                for scorer in list(scorers.values())
            ]))

        if 'acc' in scoring:
            assert_almost_equal(
                scorers['acc'](estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.)
        if 'accuracy' in scoring:
            assert_almost_equal(
                scorers['accuracy'](estimator, [[1], [2], [3]], [1, 0, 0]),
                2. / 3.)
        if 'precision' in scoring:
            assert_almost_equal(
                scorers['precision'](estimator, [[1], [2], [3]], [1, 0, 0]),
                0.5)

    estimator = EstimatorWithFitAndPredict()
    estimator.fit([[1]], [1])

    # Make sure it raises errors when scoring parameter is not valid.
    # More weird corner cases are tested at test_validation.py
    error_message_regexp = ".*must be unique strings.*"
    for scoring in (
        (
            make_scorer(precision_score),  # Tuple of callables
            make_scorer(accuracy_score)),
        [5],
        (make_scorer(precision_score), ),
        (),
        ('f1', 'f1')):
        assert_raises_regexp(ValueError,
                             error_message_regexp,
                             _check_multimetric_scoring,
                             estimator,
                             scoring=scoring)
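
For a quick illustration of the two return shapes this test exercises (again assuming a pre-0.24 scikit-learn, matching the two-value unpacking used above): a single metric is wrapped under the key 'score' and flagged as non-multimetric, while a list, tuple, or dict of metrics is returned name by name and flagged as multimetric.

from sklearn.svm import LinearSVC
try:
    from sklearn.metrics._scorer import _check_multimetric_scoring
except ImportError:
    from sklearn.metrics.scorer import _check_multimetric_scoring

est = LinearSVC(random_state=0)

scorers, is_multi = _check_multimetric_scoring(est, scoring='accuracy')
print(is_multi, sorted(scorers))   # False ['score']

scorers, is_multi = _check_multimetric_scoring(est, scoring=['accuracy', 'precision'])
print(is_multi, sorted(scorers))   # True ['accuracy', 'precision']
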
Example #3
def _skl_check_scorers(scoring, refit):

    scorers, multimetric_ = _check_multimetric_scoring(
        GenSVM(), scoring=scoring
    )
    if multimetric_:
        if refit is not False and (
            not isinstance(refit, six.string_types)
            or
            # This will work for both dict / list (tuple)
            refit not in scorers
        ):
            raise ValueError(
                "For multi-metric scoring, the parameter "
                "refit must be set to a scorer key "
                "to refit an estimator with the best "
                "parameter setting on the whole data and "
                "make the best_* attributes "
                "available for that metric. If this is not "
                "needed, refit should be set to False "
                "explicitly. %r was passed." % refit
            )
        else:
            refit_metric = refit
    else:
        refit_metric = "score"

    return scorers, multimetric_, refit_metric
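
A short sketch of what that refit validation means in practice, with LogisticRegression standing in for GenSVM and plain str in place of six.string_types (so this is an adaptation, not the project's code): with multi-metric scoring, refit must be False or the key of one of the requested scorers.

from sklearn.linear_model import LogisticRegression
try:
    from sklearn.metrics._scorer import _check_multimetric_scoring
except ImportError:
    from sklearn.metrics.scorer import _check_multimetric_scoring

scoring = {'acc': 'accuracy', 'f1': 'f1'}
scorers, multimetric_ = _check_multimetric_scoring(LogisticRegression(), scoring=scoring)

for refit in ('acc', 'f1', True, 'recall'):
    ok = not multimetric_ or refit is False or (
        isinstance(refit, str) and refit in scorers)
    print(refit, '->', 'valid refit key' if ok else 'would raise ValueError')
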
Example #4
def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count,
                                              expected_predict_proba_count,
                                              expected_decision_func_count):
    X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0])

    mock_est = Mock()
    fit_func = Mock(return_value=mock_est)
    predict_func = Mock(return_value=y)

    pos_proba = np.random.rand(X.shape[0])
    proba = np.c_[1 - pos_proba, pos_proba]
    predict_proba_func = Mock(return_value=proba)
    decision_function_func = Mock(return_value=pos_proba)

    mock_est.fit = fit_func
    mock_est.predict = predict_func
    mock_est.predict_proba = predict_proba_func
    mock_est.decision_function = decision_function_func

    scorer_dict, _ = _check_multimetric_scoring(LogisticRegression(), scorers)
    multi_scorer = _MultimetricScorer(**scorer_dict)
    results = multi_scorer(mock_est, X, y)

    assert set(scorers) == set(results)  # compare dict keys

    assert predict_func.call_count == expected_predict_count
    assert predict_proba_func.call_count == expected_predict_proba_count
    assert decision_function_func.call_count == expected_decision_func_count
Example #5
    def fit(self, ds, cv_attr='chunks'):
        """
        Fit the searchlight

        """
        
        A = get_seeds(ds, self.radius)
        
        estimator = self.estimator
            
        self.scoring, _ = _check_multimetric_scoring(estimator, scoring=self.scoring)
        
        X, y = get_ds_data(ds)
        y = LabelEncoder().fit_transform(y)
        groups = LabelEncoder().fit_transform(ds.sa[cv_attr].value)
        
        
        values = []
        indices = self._get_permutation_indices(len(y))
        
        for idx in indices:
            y_ = y[idx]        
            scores = search_light(X, y_, estimator, A, groups,
                                  self.scoring, self.cv, self.n_jobs,
                                  self.verbose)
            
            values.append(scores)
        
        self.scores = values
        
        self._info = self._store_ds_info(ds, cv_attr=cv_attr)

        return self
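
Both this fit and the _fit in example #1 rely on a project-specific _get_permutation_indices helper that is not shown. A plausible minimal sketch (an assumption about the project, not its actual code) would return the identity ordering first, so the first score is computed on the unpermuted labels, followed by random shuffles for a permutation null distribution:

import numpy as np

def _get_permutation_indices(n_samples, n_permutations=100, seed=0):
    # Hypothetical helper: identity ordering first, then random permutations
    # of the label vector for a permutation test.
    rng = np.random.RandomState(seed)
    indices = [np.arange(n_samples)]
    indices += [rng.permutation(n_samples) for _ in range(n_permutations)]
    return indices
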
Example #6
def fit_and_save(estimator,
                 X,
                 y=None,
                 groups=None,
                 scoring=None,
                 cv=None,
                 n_jobs=1,
                 verbose=0,
                 fit_params=None,
                 pre_dispatch='2*n_jobs',
                 return_train_score=True,
                 parameters=dict(),
                 uuid='',
                 url='http://127.0.0.1:8000'):

    import json, requests, numpy
    from sklearn.model_selection._validation import cross_validate

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    _base_scores = [0. for _ in range(cv.get_n_splits(X, y, groups))]

    cv_score = {}
    cv_score.update(
        {'train_%s' % s: numpy.array(_base_scores)
         for s in scorers})
    cv_score.update(
        {'test_%s' % s: numpy.array(_base_scores)
         for s in scorers})
    cv_score.update({'fit_time': _base_scores, 'score_time': _base_scores})

    try:
        cv_score = cross_validate(estimator, X, y, groups, scorers, cv, n_jobs,
                                  verbose, fit_params, pre_dispatch,
                                  return_train_score)
        error = None
    except Exception as e:
        error = '{}: {}'.format(type(e).__name__, str(e))

    try:
        for k, v in cv_score.items():
            if isinstance(v, numpy.ndarray):
                cv_score[k] = v.tolist()
        response = requests.post('{url}/grids/{uuid}/results'.format(
            url=url, uuid=uuid),
                                 data={
                                     'gridsearch': uuid,
                                     'params': json.dumps(parameters),
                                     'errors': error,
                                     'cv_data': json.dumps(cv_score)
                                 })

    except requests.exceptions.ConnectionError:
        response = None

    return response
Example #7
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        """
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        score_function = partial(
            cross_val_score, X=X, y=y, groups=groups, scoring=self.scoring,
            cv=cv, n_jobs=self.n_jobs, verbose=self.verbose,
            fit_params=fit_params)
        self.f = partial(
            _fit_score, mdl=self.estimator, param_names=self.param_names,
            score_function=score_function)

        self.objective = SingleObjective(
            self.f, self.batch_size, self.objective_name)
        self._init_design_chooser()

        self.run_optimization(max_iter=self.max_iter, verbosity=self.verbosity)

        self.best_index_ = self.Y.argmin()
        self.best_params_ = dict(zip(self.param_names,
                                     10 ** self.X[self.best_index_]))
        self.best_score_ = self.Y[self.Y.argmin()]

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        if self.refit:
            self.best_estimator_ = clone(self.estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        return self
Example #8
def cross_validate(estimator, X, mixed_y=None, groups=None, scoring=None, cv=None,
                   n_jobs=1, verbose=0, fit_params=None,
                   pre_dispatch='2*n_jobs', return_train_score="warn"):
    """Evaluate metric(s) by cross-validation and also record fit/score times."""

    # TODO: wrapper patch, key hard coding?
    _y = mixed_y['classifier'] if isinstance(mixed_y, dict) else mixed_y

    X, y, groups = indexable(X, _y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, mixed_y, scorers, train, test, verbose, None,
            fit_params, return_train_score=return_train_score,
            return_times=True)
        for train, test in cv.split(X, y, groups))

    if return_train_score:
        train_scores, test_scores, fit_times, score_times = zip(*scores)
        train_scores = _aggregate_score_dicts(train_scores)
    else:
        test_scores, fit_times, score_times = zip(*scores)
    test_scores = _aggregate_score_dicts(test_scores)

    # TODO: replace by a dict in 0.21
    ret = DeprecationDict() if return_train_score == 'warn' else {}
    ret['fit_time'] = np.array(fit_times)
    ret['score_time'] = np.array(score_times)

    for name in scorers:
        ret['test_%s' % name] = np.array(test_scores[name])
        if return_train_score:
            key = 'train_%s' % name
            ret[key] = np.array(train_scores[name])
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training scores, '
                    'please set return_train_score=True').format(key)
                # warn on key access
                ret.add_warning(key, message, FutureWarning)

    return ret
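
For context, _aggregate_score_dicts, the private scikit-learn helper used above, simply turns a per-fold list of score dicts into a dict of per-fold arrays; a minimal illustration:

from sklearn.model_selection._validation import _aggregate_score_dicts

per_fold = [{'acc': 0.90, 'f1': 0.80},
            {'acc': 0.70, 'f1': 0.60}]
print(_aggregate_score_dicts(per_fold))
# {'acc': array([0.9, 0.7]), 'f1': array([0.8, 0.6])}
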
Example #9
def check_multimetric_scoring_single_metric_wrapper(*args, **kwargs):
    # This wraps the _check_multimetric_scoring to take in single metric
    # scoring parameter so we can run the tests that we will run for
    # check_scoring, for check_multimetric_scoring too for single-metric
    # usecases
    scorers, is_multi = _check_multimetric_scoring(*args, **kwargs)
    # For all single metric use cases, it should register as not multimetric
    assert_false(is_multi)
    if args[0] is not None:
        assert scorers is not None
        names, scorers = zip(*scorers.items())
        assert_equal(len(scorers), 1)
        assert_equal(names[0], 'score')
        scorers = scorers[0]
    return scorers
Example #10
def check_multimetric_scoring_single_metric_wrapper(*args, **kwargs):
    # This wraps the _check_multimetric_scoring to take in single metric
    # scoring parameter so we can run the tests that we will run for
    # check_scoring, for check_multimetric_scoring too for single-metric
    # usecases
    scorers, is_multi = _check_multimetric_scoring(*args, **kwargs)
    # For all single metric use cases, it should register as not multimetric
    assert_false(is_multi)
    if args[0] is not None:
        assert_true(scorers is not None)
        names, scorers = zip(*scorers.items())
        assert_equal(len(scorers), 1)
        assert_equal(names[0], 'score')
        scorers = scorers[0]
    return scorers
Example #11
def check_multimetric_scoring_single_metric_wrapper(*args, **kwargs):
    # This wraps the _check_multimetric_scoring to take in
    # single metric scoring parameter so we can run the tests
    # that we will run for check_scoring, for check_multimetric_scoring
    # too for single-metric usecases

    scorers, is_multi = _check_multimetric_scoring(*args, **kwargs)
    # For all single metric use cases, it should register as not multimetric
    assert not is_multi
    if args[0] is not None:
        assert scorers is not None
        names, scorers = zip(*scorers.items())
        assert len(scorers) == 1
        assert names[0] == 'score'
        scorers = scorers[0]
    return scorers
Example #12
def test_check_scoring_and_check_multimetric_scoring():
    check_scoring_validator_for_single_metric_usecases(check_scoring)
    # To make sure the check_scoring is correctly applied to the constituent
    # scorers
    check_scoring_validator_for_single_metric_usecases(
        check_multimetric_scoring_single_metric_wrapper)

    # For multiple metric use cases
    # Make sure it works for the valid cases
    for scoring in (('accuracy',), ['precision'],
                    {'acc': 'accuracy', 'precision': 'precision'},
                    ('accuracy', 'precision'), ['precision', 'accuracy'],
                    {'accuracy': make_scorer(accuracy_score),
                     'precision': make_scorer(precision_score)}):
        estimator = LinearSVC(random_state=0)
        estimator.fit([[1], [2], [3]], [1, 1, 0])

        scorers, is_multi = _check_multimetric_scoring(estimator, scoring)
        assert_true(is_multi)
        assert_true(isinstance(scorers, dict))
        assert_equal(sorted(scorers.keys()), sorted(list(scoring)))
        assert_true(all([isinstance(scorer, _PredictScorer)
                         for scorer in list(scorers.values())]))

        if 'acc' in scoring:
            assert_almost_equal(scorers['acc'](
                estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.)
        if 'accuracy' in scoring:
            assert_almost_equal(scorers['accuracy'](
                estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.)
        if 'precision' in scoring:
            assert_almost_equal(scorers['precision'](
                estimator, [[1], [2], [3]], [1, 0, 0]), 0.5)

    estimator = EstimatorWithFitAndPredict()
    estimator.fit([[1]], [1])

    # Make sure it raises errors when scoring parameter is not valid.
    # More weird corner cases are tested at test_validation.py
    error_message_regexp = ".*must be unique strings.*"
    for scoring in ((make_scorer(precision_score),  # Tuple of callables
                     make_scorer(accuracy_score)), [5],
                    (make_scorer(precision_score),), (), ('f1', 'f1')):
        assert_raises_regexp(ValueError, error_message_regexp,
                             _check_multimetric_scoring, estimator,
                             scoring=scoring)
Example #13
def test_multimetric_scorer_calls_method_once_regressor_threshold():
    predict_called_cnt = 0

    class MockDecisionTreeRegressor(DecisionTreeRegressor):
        def predict(self, X):
            nonlocal predict_called_cnt
            predict_called_cnt += 1
            return super().predict(X)

    X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0])

    # no decision function
    clf = MockDecisionTreeRegressor()
    clf.fit(X, y)

    scorers = {'neg_mse': 'neg_mean_squared_error', 'r2': 'roc_auc'}
    scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
    scorer = _MultimetricScorer(**scorer_dict)
    scorer(clf, X, y)

    assert predict_called_cnt == 1
Example #14
def test_multimetric_scorer_calls_method_once_classifier_no_decision():
    predict_proba_call_cnt = 0

    class MockKNeighborsClassifier(KNeighborsClassifier):
        def predict_proba(self, X):
            nonlocal predict_proba_call_cnt
            predict_proba_call_cnt += 1
            return super().predict_proba(X)

    X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0])

    # no decision function
    clf = MockKNeighborsClassifier(n_neighbors=1)
    clf.fit(X, y)

    scorers = ['roc_auc', 'neg_log_loss']
    scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
    scorer = _MultimetricScorer(**scorer_dict)
    scorer(clf, X, y)

    assert predict_proba_call_cnt == 1
Example #15
def test_multimetric_scorer_sanity_check():
    # scoring dictionary returned is the same as calling each scorer separately
    scorers = {'a1': 'accuracy', 'a2': 'accuracy',
               'll1': 'neg_log_loss', 'll2': 'neg_log_loss',
               'ra1': 'roc_auc', 'ra2': 'roc_auc'}

    X, y = make_classification(random_state=0)

    clf = DecisionTreeClassifier()
    clf.fit(X, y)

    scorer_dict, _ = _check_multimetric_scoring(clf, scorers)
    multi_scorer = _MultimetricScorer(**scorer_dict)

    result = multi_scorer(clf, X, y)

    seperate_scores = {
        name: get_scorer(name)(clf, X, y)
        for name in ['accuracy', 'neg_log_loss', 'roc_auc']}

    for key, value in result.items():
        score_name = scorers[key]
        assert_allclose(value, seperate_scores[score_name])
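
Examples #4 and #13 to #15 construct _MultimetricScorer with **scorer_dict, which matches older scikit-learn releases; newer releases take a single scorers dict argument instead (the exact cutover version is not pinned here, so treat the details as assumptions). A defensive sketch that works either way, and also tolerates the change in _check_multimetric_scoring's return type, assuming scikit-learn >= 0.22 so both private names live in sklearn.metrics._scorer:

import inspect
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics._scorer import _check_multimetric_scoring, _MultimetricScorer

X, y = make_classification(random_state=0)
clf = DecisionTreeClassifier(random_state=0).fit(X, y)

checked = _check_multimetric_scoring(clf, {'acc': 'accuracy', 'll': 'neg_log_loss'})
scorer_dict = checked[0] if isinstance(checked, tuple) else checked  # old vs. new return

# Older releases declare __init__(self, **scorers); newer ones take a `scorers` dict.
params = inspect.signature(_MultimetricScorer).parameters.values()
if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
    multi_scorer = _MultimetricScorer(**scorer_dict)
else:
    multi_scorer = _MultimetricScorer(scorers=scorer_dict)

print(multi_scorer(clf, X, y))  # {'acc': ..., 'll': ...}
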
Example #16
File: cv.py  Project: lixinyuu/sklearn-pso
    def fit(self, X, y=None, groups=None, **fit_params):
        X, y, groups = indexable(X, y, groups)
        self.best_estimator_ = None
        self.best_mem_score_ = float("-inf")
        self.best_mem_params_ = None
        base_estimator = clone(self.estimator)
        cv_orig = check_cv(self.cv,
                           y,
                           classifier=is_classifier(self.estimator))
        n_splits = cv_orig.get_n_splits(X, y, groups)
        self.cv = cv_orig
        self.scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)
        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in self.scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed." %
                                 self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'
        results = self._fit(X, y, groups)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = self.scorers if self.multimetric_ else self.scorers[
            'score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
Example #17
def main(
    inputs,
    infile_estimator,
    infile1,
    infile2,
    outfile_result,
    outfile_object=None,
    outfile_weights=None,
    outfile_y_true=None,
    outfile_y_preds=None,
    groups=None,
    ref_seq=None,
    intervals=None,
    targets=None,
    fasta_path=None,
):
    """
    Parameter
    ---------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    outfile_weights : str, optional
        File path to save deep learning model weights

    outfile_y_true : str, optional
        File path to target values for prediction

    outfile_y_preds : str, optional
        File path to save predicted values

    groups : str
        File path to dataset containing groups labels

    ref_seq : str
        File path to dataset containing genome sequence file

    intervals : str
        File path to dataset containing interval file

    targets : str
        File path to dataset compressed target bed file

    fasta_path : str
        File path to dataset containing fasta file
    """
    warnings.simplefilter("ignore")

    with open(inputs, "r") as param_handler:
        params = json.load(param_handler)

    #  load estimator
    with open(infile_estimator, "rb") as estimator_handler:
        estimator = load_model(estimator_handler)

    estimator = clean_params(estimator)

    # swap hyperparameter
    swapping = params["experiment_schemes"]["hyperparams_swapping"]
    swap_params = _eval_swap_params(swapping)
    estimator.set_params(**swap_params)

    estimator_params = estimator.get_params()

    # store read dataframe object
    loaded_df = {}

    input_type = params["input_options"]["selected_input"]
    # tabular input
    if input_type == "tabular":
        header = "infer" if params["input_options"]["header1"] else None
        column_option = params["input_options"]["column_selector_options_1"][
            "selected_column_selector_option"
        ]
        if column_option in [
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
        ]:
            c = params["input_options"]["column_selector_options_1"]["col1"]
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    # sparse input
    elif input_type == "sparse":
        X = mmread(open(infile1, "r"))

    # fasta_file input
    elif input_type == "seq_fasta":
        pyfaidx = get_module("pyfaidx")
        sequences = pyfaidx.Fasta(fasta_path)
        n_seqs = len(sequences.keys())
        X = np.arange(n_seqs)[:, np.newaxis]
        for param in estimator_params.keys():
            if param.endswith("fasta_path"):
                estimator.set_params(**{param: fasta_path})
                break
        else:
            raise ValueError(
                "The selected estimator doesn't support "
                "fasta file input! Please consider using "
                "KerasGBatchClassifier with "
                "FastaDNABatchGenerator/FastaProteinBatchGenerator "
                "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
                "in pipeline!"
            )

    elif input_type == "refseq_and_interval":
        path_params = {
            "data_batch_generator__ref_genome_path": ref_seq,
            "data_batch_generator__intervals_path": intervals,
            "data_batch_generator__target_path": targets,
        }
        estimator.set_params(**path_params)
        n_intervals = sum(1 for line in open(intervals))
        X = np.arange(n_intervals)[:, np.newaxis]

    # Get target y
    header = "infer" if params["input_options"]["header2"] else None
    column_option = params["input_options"]["column_selector_options_2"][
        "selected_column_selector_option2"
    ]
    if column_option in [
        "by_index_number",
        "all_but_by_index_number",
        "by_header_name",
        "all_but_by_header_name",
    ]:
        c = params["input_options"]["column_selector_options_2"]["col2"]
    else:
        c = None

    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        infile2 = loaded_df[df_key]
    else:
        infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
        loaded_df[df_key] = infile2

    y = read_columns(
        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
    )
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()
    if input_type == "refseq_and_interval":
        estimator.set_params(data_batch_generator__features=y.ravel().tolist())
        y = None
    # end y

    # load groups
    if groups:
        groups_selector = (
            params["experiment_schemes"]["test_split"]["split_algos"]
        ).pop("groups_selector")

        header = "infer" if groups_selector["header_g"] else None
        column_option = groups_selector["column_selector_options_g"][
            "selected_column_selector_option_g"
        ]
        if column_option in [
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
        ]:
            c = groups_selector["column_selector_options_g"]["col_g"]
        else:
            c = None

        df_key = groups + repr(header)
        if df_key in loaded_df:
            groups = loaded_df[df_key]

        groups = read_columns(
            groups,
            c=c,
            c_option=column_option,
            sep="\t",
            header=header,
            parse_dates=True,
        )
        groups = groups.ravel()

    # del loaded_df
    del loaded_df

    # cache iraps_core fits could increase search speed significantly
    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    main_est = get_main_estimator(estimator)
    if main_est.__class__.__name__ == "IRAPSClassifier":
        main_est.set_params(memory=memory)

    # handle scorer, convert to scorer dict
    scoring = params["experiment_schemes"]["metrics"]["scoring"]
    if scoring is not None:
        # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
        # Check if secondary_scoring is specified
        secondary_scoring = scoring.get("secondary_scoring", None)
        if secondary_scoring is not None:
            # If secondary_scoring is specified, convert the list into a comma separated string
            scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])

    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # handle test (first) split
    test_split_options = params["experiment_schemes"]["test_split"]["split_algos"]

    if test_split_options["shuffle"] == "group":
        test_split_options["labels"] = groups
    if test_split_options["shuffle"] == "stratified":
        if y is not None:
            test_split_options["labels"] = y
        else:
            raise ValueError(
                "Stratified shuffle split is not " "applicable on empty target values!"
            )

    (
        X_train,
        X_test,
        y_train,
        y_test,
        groups_train,
        _groups_test,
    ) = train_test_split_none(X, y, groups, **test_split_options)

    exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]

    # handle validation (second) split
    if exp_scheme == "train_val_test":
        val_split_options = params["experiment_schemes"]["val_split"]["split_algos"]

        if val_split_options["shuffle"] == "group":
            val_split_options["labels"] = groups_train
        if val_split_options["shuffle"] == "stratified":
            if y_train is not None:
                val_split_options["labels"] = y_train
            else:
                raise ValueError(
                    "Stratified shuffle split is not "
                    "applicable on empty target values!"
                )

        (
            X_train,
            X_val,
            y_train,
            y_val,
            groups_train,
            _groups_val,
        ) = train_test_split_none(X_train, y_train, groups_train, **val_split_options)

    # train and eval
    if hasattr(estimator, "validation_data"):
        if exp_scheme == "train_val_test":
            estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
        else:
            estimator.fit(X_train, y_train, validation_data=(X_test, y_test))
    else:
        estimator.fit(X_train, y_train)

    if hasattr(estimator, "evaluate"):
        steps = estimator.prediction_steps
        batch_size = estimator.batch_size
        generator = estimator.data_generator_.flow(
            X_test, y=y_test, batch_size=batch_size
        )
        predictions, y_true = _predict_generator(
            estimator.model_, generator, steps=steps
        )
        scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)

    else:
        if hasattr(estimator, "predict_proba"):
            predictions = estimator.predict_proba(X_test)
        else:
            predictions = estimator.predict(X_test)

        y_true = y_test
        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
    if outfile_y_true:
        try:
            pd.DataFrame(y_true).to_csv(outfile_y_true, sep="\t", index=False)
            pd.DataFrame(predictions).astype(np.float32).to_csv(
                outfile_y_preds,
                sep="\t",
                index=False,
                float_format="%g",
                chunksize=10000,
            )
        except Exception as e:
            print("Error in saving predictions: %s" % e)

    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        main_est = estimator
        if isinstance(estimator, Pipeline):
            main_est = estimator.steps[-1][-1]

        if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
            if outfile_weights:
                main_est.save_weights(outfile_weights)
            del main_est.model_
            del main_est.fit_params
            del main_est.model_class_
            if getattr(main_est, "validation_data", None):
                del main_est.validation_data
            if getattr(main_est, "data_generator_", None):
                del main_est.data_generator_

        with open(outfile_object, "wb") as output_handler:
            pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
Example #18
    def fit(self, X, y=None, groups=None, **fit_params):
        """
        fit: Run fit with all sets of parameters. Periodically serialize the ``cv_results`` dictionary after fitting
            every ``self.cv_results_save_freq`` number of models.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" `cv`
            instance (e.g., `GroupKFold`).
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv,
                      y,
                      classifier=is_classifier(estimator=estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            raise NotImplementedError(
                'Multimetric scoring is not yet implemented for overridden sequential-based fit method.'
            )
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        # parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)

        results = []
        all_candidate_params = []
        all_out = []

        def evaluate_candidates(candidate_params):
            if isinstance(candidate_params, dict):
                candidate_params = list(candidate_params)
            n_candidates = len(candidate_params)

            if self.verbose > 0:
                print(
                    "Fitting {0} folds for each of {1} remaining candidates, totalling {2} fits"
                    .format(n_splits, n_candidates, n_candidates * n_splits))

            # print('list(cv.split(X, y, groups)): %s' % list(cv.split(X, y, groups)))
            # print('list(product(candidate_params, cv.split(X, y, groups))): %s' % list(product(candidate_params, cv.split(X, y, groups))))

            fold_num = 0
            for parameters, (train, test) in product(candidate_params,
                                                     cv.split(X, y, groups)):
                print('product index/fold number: %d' % fold_num)
                # print('\tparams: %s' % parameters)
                # print('\ttrain: %s' % train)
                # print('\ttest: %s' % test)
                out = _fit_and_score(estimator=clone(base_estimator),
                                     X=X,
                                     y=y,
                                     train=train,
                                     test=test,
                                     parameters=parameters,
                                     **fit_and_score_kwargs)
                print('\tout: %s' % out)

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)
                # nonlocal keyword is exactly what it sounds like, uses the outer function scope: w3schools.com/python/ref_keyword_nonlocal.asp
                nonlocal results
                # results = self._format_results(all_candidate_params, scorers, n_splits, all_out)
                result = self._format_result(candidate_param=parameters,
                                             scorer=scorers,
                                             n_splits=n_splits,
                                             out=out)
                results.append(result)
                self.cv_results.append(result)
                # Just finished training a model, should cv_results be saved?
                if fold_num % self.cv_results_save_freq == 0:
                    self._save_cv_results()
                fold_num += 1
            return self.cv_results

        self._run_search(evaluate_candidates)

        if self.refit or not self.multimetric_:
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, (int, np.integer)):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(self.cv_results)):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = 0
                self.best_score_ = self.cv_results[0]['test_score']
                for i, cv_result in enumerate(self.cv_results):
                    if cv_result['test_score'] >= self.best_score_:
                        self.best_index_ = i
                        self.best_score_ = cv_result['test_score']
            self.best_params_ = self.cv_results[self.best_index_]['params']

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']
        self.n_splits_ = n_splits
        return self
Example #19
    def fit(self, X, y=None, groups=None, type="Classifier", **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        if self.fit_params is not None:
            warnings.warn(
                '"fit_params" as a constructor argument was '
                'deprecated in version 0.19 and will be removed '
                'in version 0.21. Pass fit parameters to the '
                '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn(
                    'Ignoring fit_params passed as a constructor '
                    'argument in favor of keyword arguments to '
                    'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        #estimator = self.estimator

        if type == "Classification":
            from keras.wrappers.scikit_learn import KerasClassifier
            estimator = KerasClassifier(build_fn=self.estimator, verbose=0)
        else:
            from keras.wrappers.scikit_learn import KerasRegressor
            estimator = KerasRegressor(build_fn=self.estimator, verbose=0)
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            clone(estimator), scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = self.estimator
        pre_dispatch = self.pre_dispatch
        # One of the main changes is instead of using the _fit_and_score from sklearn.model_selection._validation
        # We use a modified one (_fit_and_score_keras) that clears the session after each iteration
        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(_fit_and_score_keras2)(
                    base_estimator,
                    X,
                    y,
                    scorers,
                    train,
                    test,
                    self.verbose,
                    parameters,
                    fit_params=fit_params,
                    return_train_score=self.return_train_score,
                    return_n_test_samples=True,
                    return_times=True,
                    return_parameters=False,
                    error_score=self.error_score,
                    type=type)  # Passing the session (Keras backend) argument
                for parameters, (
                    train,
                    test) in product(candidate_params, cv.split(X, y, groups)))

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name,
                   test_scores[scorer_name],
                   splits=True,
                   rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                _store('train_%s' % scorer_name,
                       train_scores[scorer_name],
                       splits=True)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" %
                                       refit_metric][self.best_index_]

        if self.refit:
            from keras import backend as K
            import tensorflow as tf
            tf.logging.set_verbosity(
                tf.logging.ERROR
            )  # This is useful to avoid the info log of tensorflow
            # The next 4 lines are for avoiding tensorflow to allocate all the GPU memory
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            K.set_session(sess)

            self.best_estimator_ = clone(estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
Example #20
    def fit(self, X, y=None, dy=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        if self.fit_params is not None:
            warnings.warn('"fit_params" as a constructor argument was '
                          'deprecated in version 0.19 and will be removed '
                          'in version 0.21. Pass fit parameters to the '
                          '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn('Ignoring fit_params passed as a constructor '
                              'argument in favor of keyword arguments to '
                              'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results_container = [{}]
        with parallel:
            all_candidate_params = []
            all_out = []

            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(delayed(_fit_and_score)(clone(base_estimator),
                                                       X, y, dy,
                                                       train=train, test=test,
                                                       parameters=parameters,
                                                       **fit_and_score_kwargs)
                               for parameters, (train, test)
                               in product(candidate_params,
                                          cv.split(X, y, groups)))

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                # XXX: When we drop Python 2 support, we can use nonlocal
                # instead of results_container
                results_container[0] = self._format_results(
                    all_candidate_params, scorers, n_splits, all_out)
                return results_container[0]

            self._run_search(evaluate_candidates)

        results = results_container[0]

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = results["params"][self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            refit_start_time = time.time()
            if dy is not None:
                self.best_estimator_.fit(X, y, dy, **fit_params)
            elif y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
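
For reference, a minimal sketch of the multi-metric/refit contract that the method above implements, expressed through the public GridSearchCV API (standard scikit-learn, not the custom search class in this example; the 'balanced_accuracy' scorer assumes scikit-learn >= 0.20): refit must name one scorer key, and cv_results_ then carries mean_test_<name>/rank_test_<name> columns for every scorer.

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
search = GridSearchCV(
    SVC(),
    param_grid={'C': [0.1, 1, 10]},
    scoring={'acc': 'accuracy', 'bacc': 'balanced_accuracy'},
    refit='acc',   # with multi-metric scoring, refit must be a scorer key (or False)
    cv=3,
)
search.fit(X, y)
# best_index_, best_params_ and best_score_ are taken from the refit metric
print(search.best_params_, round(search.best_score_, 3))
print(sorted(k for k in search.cv_results_ if k.startswith('rank_test_')))
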
Example #21
def main(inputs, infile_estimator, infile1, infile2,
         outfile_result, outfile_object=None,
         outfile_weights=None, groups=None,
         ref_seq=None, intervals=None, targets=None,
         fasta_path=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    outfile_weights : str, optional
        File path to save deep learning model weights

    groups : str
        File path to dataset containing group labels

    ref_seq : str
        File path to dataset containing genome sequence file

    intervals : str
        File path to dataset containing interval file

    targets : str
        File path to dataset containing compressed target bed file

    fasta_path : str
        File path to dataset containing fasta file
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    #  load estimator
    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    # swap hyperparameter
    swapping = params['experiment_schemes']['hyperparams_swapping']
    swap_params = _eval_swap_params(swapping)
    estimator.set_params(**swap_params)

    estimator_params = estimator.get_params()

    # store read dataframe object
    loaded_df = {}

    input_type = params['input_options']['selected_input']
    # tabular input
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep='\t', header=header,
                         parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    # sparse input
    elif input_type == 'sparse':
        X = mmread(open(infile1, 'r'))

    # fasta_file input
    elif input_type == 'seq_fasta':
        pyfaidx = get_module('pyfaidx')
        sequences = pyfaidx.Fasta(fasta_path)
        n_seqs = len(sequences.keys())
        X = np.arange(n_seqs)[:, np.newaxis]
        for param in estimator_params.keys():
            if param.endswith('fasta_path'):
                estimator.set_params(
                    **{param: fasta_path})
                break
        else:
            raise ValueError(
                "The selected estimator doesn't support "
                "fasta file input! Please consider using "
                "KerasGBatchClassifier with "
                "FastaDNABatchGenerator/FastaProteinBatchGenerator "
                "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
                "in pipeline!")

    elif input_type == 'refseq_and_interval':
        path_params = {
            'data_batch_generator__ref_genome_path': ref_seq,
            'data_batch_generator__intervals_path': intervals,
            'data_batch_generator__target_path': targets
        }
        estimator.set_params(**path_params)
        n_intervals = sum(1 for line in open(intervals))
        X = np.arange(n_intervals)[:, np.newaxis]

    # Get target y
    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in ['by_index_number', 'all_but_by_index_number',
                         'by_header_name', 'all_but_by_header_name']:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None

    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        infile2 = loaded_df[df_key]
    else:
        infile2 = pd.read_csv(infile2, sep='\t',
                              header=header, parse_dates=True)
        loaded_df[df_key] = infile2

    y = read_columns(
            infile2,
            c=c,
            c_option=column_option,
            sep='\t',
            header=header,
            parse_dates=True)
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()
    if input_type == 'refseq_and_interval':
        estimator.set_params(
            data_batch_generator__features=y.ravel().tolist())
        y = None
    # end y

    # load groups
    if groups:
        groups_selector = (params['experiment_schemes']['test_split']
                                 ['split_algos']).pop('groups_selector')

        header = 'infer' if groups_selector['header_g'] else None
        column_option = \
            (groups_selector['column_selector_options_g']
                            ['selected_column_selector_option_g'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = groups_selector['column_selector_options_g']['col_g']
        else:
            c = None

        df_key = groups + repr(header)
        if df_key in loaded_df:
            groups = loaded_df[df_key]

        groups = read_columns(
                groups,
                c=c,
                c_option=column_option,
                sep='\t',
                header=header,
                parse_dates=True)
        groups = groups.ravel()

    # free memory used by the cached dataframes
    del loaded_df

    # handle memory
    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    # caching iraps_core fits can increase search speed significantly
    if estimator.__class__.__name__ == 'IRAPSClassifier':
        estimator.set_params(memory=memory)
    else:
        # For iraps buried in pipeline
        new_params = {}
        for p, v in estimator_params.items():
            if p.endswith('memory'):
                # for case of `__irapsclassifier__memory`
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    # caching iraps_core fits can increase search
                    # speed significantly
                    new_params[p] = memory
                # for security reasons, we don't want memory to be
                # modified unexpectedly
                elif v:
                    new_params[p] = None
            # handle n_jobs
            elif p.endswith('n_jobs'):
                # For now, 1 CPU is suggested for irapsclassifier
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    new_params[p] = 1
                else:
                    new_params[p] = N_JOBS
            # for security reasons, the allowed callback types are limited
            elif p.endswith('callbacks'):
                for cb in v:
                    cb_type = cb['callback_selection']['callback_type']
                    if cb_type not in ALLOWED_CALLBACKS:
                        raise ValueError(
                            "Prohibited callback type: %s!" % cb_type)

        estimator.set_params(**new_params)

    # handle scorer, convert to scorer dict
    scoring = params['experiment_schemes']['metrics']['scoring']
    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # handle test (first) split
    test_split_options = (params['experiment_schemes']
                                ['test_split']['split_algos'])

    if test_split_options['shuffle'] == 'group':
        test_split_options['labels'] = groups
    if test_split_options['shuffle'] == 'stratified':
        if y is not None:
            test_split_options['labels'] = y
        else:
            raise ValueError("Stratified shuffle split is not "
                             "applicable on empty target values!")

    X_train, X_test, y_train, y_test, groups_train, groups_test = \
        train_test_split_none(X, y, groups, **test_split_options)

    exp_scheme = params['experiment_schemes']['selected_exp_scheme']

    # handle validation (second) split
    if exp_scheme == 'train_val_test':
        val_split_options = (params['experiment_schemes']
                                   ['val_split']['split_algos'])

        if val_split_options['shuffle'] == 'group':
            val_split_options['labels'] = groups_train
        if val_split_options['shuffle'] == 'stratified':
            if y_train is not None:
                val_split_options['labels'] = y_train
            else:
                raise ValueError("Stratified shuffle split is not "
                                 "applicable on empty target values!")

        X_train, X_val, y_train, y_val, groups_train, groups_val = \
            train_test_split_none(X_train, y_train, groups_train,
                                  **val_split_options)

    # train and eval
    if hasattr(estimator, 'validation_data'):
        if exp_scheme == 'train_val_test':
            estimator.fit(X_train, y_train,
                          validation_data=(X_val, y_val))
        else:
            estimator.fit(X_train, y_train,
                          validation_data=(X_test, y_test))
    else:
        estimator.fit(X_train, y_train)

    if hasattr(estimator, 'evaluate'):
        scores = estimator.evaluate(X_test, y_test=y_test,
                                    scorer=scorer,
                                    is_multimetric=True)
    else:
        scores = _score(estimator, X_test, y_test, scorer,
                        is_multimetric=True)
    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_result, sep='\t',
              header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        main_est = estimator
        if isinstance(estimator, pipeline.Pipeline):
            main_est = estimator.steps[-1][-1]

        if hasattr(main_est, 'model_') \
                and hasattr(main_est, 'save_weights'):
            if outfile_weights:
                main_est.save_weights(outfile_weights)
            del main_est.model_
            del main_est.fit_params
            del main_est.model_class_
            del main_est.validation_data
            if getattr(main_est, 'data_generator_', None):
                del main_est.data_generator_

        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(estimator, output_handler,
                        pickle.HIGHEST_PROTOCOL)
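
The memory handling in the script above uses only standard joblib/scikit-learn features; below is a minimal sketch of that caching idea under stated assumptions (CACHE_DIR and IRAPSClassifier belong to the surrounding Galaxy project and are replaced here by a temporary directory and a plain Pipeline).

import tempfile

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

cache_dir = tempfile.mkdtemp()            # stand-in for CACHE_DIR
memory = joblib.Memory(location=cache_dir, verbose=0)

# transformers of a cached Pipeline are fitted once per parameter setting
# and re-read from disk on later fits, which is what speeds up a search
pipe = Pipeline(
    [('scale', StandardScaler()), ('clf', LogisticRegression())],
    memory=memory,
)
print(pipe.get_params()['memory'] is memory)   # True
memory.clear(warn=False)
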
Example #22
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        if _HAS_MULTIPLE_METRICS:
            from sklearn.metrics.scorer import _check_multimetric_scoring
            scorer, multimetric = _check_multimetric_scoring(
                estimator, scoring=self.scoring)
            if not multimetric:
                scorer = scorer['score']
            self.multimetric_ = multimetric

            if self.multimetric_:
                if self.refit is not False and (
                        not isinstance(self.refit, str) or
                        # This will work for both dict / list (tuple)
                        self.refit not in scorer):
                    raise ValueError(
                        "For multi-metric scoring, the parameter "
                        "refit must be set to a scorer key "
                        "to refit an estimator with the best "
                        "parameter setting on the whole data and "
                        "make the best_* attributes "
                        "available for that metric. If this is not "
                        "needed, refit should be set to False "
                        "explicitly. %r was passed." % self.refit)
        else:
            scorer = check_scoring(estimator, scoring=self.scoring)
            multimetric = False

        self.scorer_ = scorer

        error_score = self.error_score
        if not (isinstance(error_score, numbers.Number)
                or error_score == 'raise'):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value.")

        dsk, keys, n_splits = build_graph(
            estimator,
            self.cv,
            self.scorer_,
            list(self._get_param_iterator()),
            X,
            y,
            groups,
            fit_params,
            iid=self.iid,
            refit=self.refit,
            error_score=error_score,
            return_train_score=self.return_train_score,
            cache_cv=self.cache_cv,
            multimetric=multimetric)
        self.dask_graph_ = dsk
        self.n_splits_ = n_splits

        n_jobs = _normalize_n_jobs(self.n_jobs)
        scheduler = _normalize_scheduler(self.scheduler, n_jobs)

        out = scheduler(dsk, keys, num_workers=n_jobs)

        results = handle_deprecated_train_score(out[0],
                                                self.return_train_score)
        self.cv_results_ = results

        if self.refit:
            if _HAS_MULTIPLE_METRICS and self.multimetric_:
                key = self.refit
            else:
                key = 'score'
            self.best_index_ = np.flatnonzero(
                results["rank_test_{}".format(key)] == 1)[0]

            self.best_estimator_ = out[1]

        return self
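
A minimal sketch of the private helper imported above, assuming an older scikit-learn (roughly 0.19-0.23) where _check_multimetric_scoring returns a (scorers, is_multimetric) pair; from 0.24 on it moved and returns only the scorer dict, so this snippet is version-dependent.

from sklearn.linear_model import LogisticRegression

try:
    # scikit-learn <= 0.21
    from sklearn.metrics.scorer import _check_multimetric_scoring
except ImportError:
    # scikit-learn 0.22 / 0.23
    from sklearn.metrics._scorer import _check_multimetric_scoring

estimator = LogisticRegression()

scorers, is_multi = _check_multimetric_scoring(
    estimator, scoring=['accuracy', 'f1_macro'])
print(is_multi)            # True
print(sorted(scorers))     # ['accuracy', 'f1_macro']

scorers, is_multi = _check_multimetric_scoring(estimator, scoring='accuracy')
print(is_multi)            # False; a single metric is wrapped under the 'score' key
print(sorted(scorers))     # ['score']
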
Example #23
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed." %
                                 self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'
        self.refit_metric = refit_metric

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results = {}
        with parallel:
            all_candidate_params = []
            all_out = []
            all_more_results = defaultdict(list)

            def evaluate_candidates(candidate_params,
                                    X,
                                    y,
                                    groups,
                                    more_results=None):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(
                    delayed(_fit_and_score)(clone(base_estimator),
                                            X,
                                            y,
                                            train=train,
                                            test=test,
                                            parameters=parameters,
                                            **fit_and_score_kwargs)
                    for parameters, (train, test) in product(
                        candidate_params, cv.split(X, y, groups)))

                if len(out) < 1:
                    raise ValueError('No fits were performed. '
                                     'Was the CV iterator empty? '
                                     'Were there no candidates?')
                elif len(out) != n_candidates * n_splits:
                    raise ValueError('cv.split and cv.get_n_splits returned '
                                     'inconsistent results. Expected {} '
                                     'splits, got {}'.format(
                                         n_splits,
                                         len(out) // n_candidates))

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)
                if more_results is not None:
                    for key, value in more_results.items():
                        all_more_results[key].extend(value)

                nonlocal results
                results = self._format_results(all_candidate_params, scorers,
                                               n_splits, all_out,
                                               all_more_results)

                return results

            self._run_search(evaluate_candidates, X, y, groups)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, (int, np.integer)):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
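
The callable-refit branch above mirrors the public API: in scikit-learn 0.20+ refit may be a callable that receives cv_results_ and returns the integer index of the chosen candidate (best_score_ is then left unset). A minimal sketch with GridSearchCV:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


def pick_smallest_good_c(cv_results):
    """Among candidates within 0.01 of the best mean test score, pick the smallest C."""
    means = cv_results['mean_test_score']
    close = np.flatnonzero(means >= means.max() - 0.01)
    cs = np.asarray(cv_results['param_C'], dtype=float)
    return int(close[np.argmin(cs[close])])


X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {'C': [0.1, 1, 10, 100]}, cv=3,
                      refit=pick_smallest_good_c)
search.fit(X, y)
print(search.best_index_, search.best_params_)   # best_score_ is not set for callable refit
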
Example #24
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        if _HAS_MULTIPLE_METRICS:
            from sklearn.metrics.scorer import _check_multimetric_scoring
            scorer, multimetric = _check_multimetric_scoring(estimator,
                                                             scoring=self.scoring)
            if not multimetric:
                scorer = scorer['score']
            self.multimetric_ = multimetric

            if self.multimetric_:
                if self.refit is not False and (
                        not isinstance(self.refit, str) or
                        # This will work for both dict / list (tuple)
                        self.refit not in scorer):
                    raise ValueError("For multi-metric scoring, the parameter "
                                     "refit must be set to a scorer key "
                                     "to refit an estimator with the best "
                                     "parameter setting on the whole data and "
                                     "make the best_* attributes "
                                     "available for that metric. If this is not "
                                     "needed, refit should be set to False "
                                     "explicitly. %r was passed." % self.refit)
        else:
            scorer = check_scoring(estimator, scoring=self.scoring)
            multimetric = False

        self.scorer_ = scorer

        error_score = self.error_score
        if not (isinstance(error_score, numbers.Number) or
                error_score == 'raise'):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value.")

        dsk, keys, n_splits = build_graph(estimator, self.cv, self.scorer_,
                                          list(self._get_param_iterator()),
                                          X, y, groups, fit_params,
                                          iid=self.iid,
                                          refit=self.refit,
                                          error_score=error_score,
                                          return_train_score=self.return_train_score,
                                          cache_cv=self.cache_cv,
                                          multimetric=multimetric)
        self.dask_graph_ = dsk
        self.n_splits_ = n_splits

        n_jobs = _normalize_n_jobs(self.n_jobs)
        scheduler = _normalize_scheduler(self.scheduler, n_jobs)

        out = scheduler(dsk, keys, num_workers=n_jobs)

        results = handle_deprecated_train_score(out[0], self.return_train_score)
        self.cv_results_ = results

        if self.refit:
            if _HAS_MULTIPLE_METRICS and self.multimetric_:
                key = self.refit
            else:
                key = 'score'
            self.best_index_ = np.flatnonzero(
                results["rank_test_{}".format(key)] == 1)[0]

            self.best_estimator_ = out[1]

        return self
Example #25
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        """
        if self.fit_params is not None:
            warnings.warn('"fit_params" as a constructor argument was '
                          'deprecated in version 0.19 and will be removed '
                          'in version 0.21. Pass fit parameters to the '
                          '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn('Ignoring fit_params passed as a constructor '
                              'argument in favor of keyword arguments to '
                              'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        # X, y, groups = indexable(X, y, groups)
        if groups is not None:
            raise NotImplementedError("groups are not supported")

        # n_splits = cv.get_n_splits(X, y, groups)
        n_splits = min(cv.get_n_splits(X_.transpose(1, 2, 0), y_, None)
                       for X_, y_ in zip(X, y))

        def generate_index(X_list, y_list):
            split = [cv.split(X.transpose(1, 2, 0), y)
                     for X, y in zip(X_list, y_list)]
            for i in range(n_splits):
                yield zip(*[next(s) for s in split])

        generate_index_iter = generate_index(X, y)

        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, scorers, train,
                                  test, self.verbose, parameters,
                                  fit_params=fit_params,
                                  return_train_score=self.return_train_score,
                                  return_n_test_samples=True,
                                  return_times=True, return_parameters=False,
                                  error_score=self.error_score,
                                  return_estimator=True, return_idx=True)
          for parameters, (train, test) in product(
            candidate_params, generate_index_iter))

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time, estimators, train_idxs, test_idxs) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time, estimators, train_idxs, test_idxs) = zip(*out)

        # test_score_dicts and train_score_dicts are lists of dictionaries and
        # we make them into a dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # TODO: replace by a dict in 0.21
        results = (DeprecationDict() if self.return_train_score == 'warn'
                   else {})

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """Store the scores/times to the cv_results_."""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        results['estimators'] = estimators
        results['train_index'] = train_idxs
        results['test_index'] = test_idxs

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample_counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)

                if self.return_train_score == 'warn':
                    for key in set(results.keys()) - prev_keys:
                        message = (
                            'You are accessing a training score ({!r}), '
                            'which will not be available by default '
                            'any more in 0.21. If you need training scores, '
                            'please set return_train_score=True').format(key)
                        # warn on key access
                        results.add_warning(key, message, FutureWarning)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
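
A small illustration of what the _store() closure above computes from the per-split scores: the same mean/std/rank convention used by cv_results_, with rank 1 assigned to the best mean (the numbers below are made up for the example).

import numpy as np
from scipy.stats import rankdata

scores = np.array([[0.80, 0.82, 0.79, 0.81],    # candidate 0
                   [0.90, 0.88, 0.91, 0.89],    # candidate 1
                   [0.85, 0.84, 0.86, 0.85]])   # candidate 2; shape (n_candidates, n_splits)

means = np.average(scores, axis=1)
stds = np.sqrt(np.average((scores - means[:, np.newaxis]) ** 2, axis=1))
ranks = np.asarray(rankdata(-means, method='min'), dtype=np.int32)

print(means.round(3))   # [0.805 0.895 0.85 ]
print(ranks)            # [3 1 2]
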
Example #26
def repeated_cross_validate(estimator,
                            X,
                            y=None,
                            groups=None,
                            scoring=None,
                            cv=None,
                            n_jobs=1,
                            n_reps=1,
                            verbose=0,
                            fit_params=None,
                            pre_dispatch='2*n_jobs',
                            return_train_score="warn"):
    if len(cv) != n_reps:
        raise ValueError(
            "n_reps = {} but {} cross-validators were given.".format(
                n_reps, len(cv)))

    n_folds = np.unique(
        [cross_validator.get_n_splits() for cross_validator in cv])
    if len(n_folds) != 1:
        raise ValueError(
            "Cross-validators do not agree on the number of folds: {}".format(
                n_folds))
    n_folds = n_folds[0]
    """Evaluate metric(s) by cross-validation and also record fit/score times.

        Read more in the :ref:`User Guide <multimetric_cross_validation>`.

        Parameters
        ----------
        estimator : estimator object implementing 'fit'
            The object to use to fit the data.

        X : array-like
            The data to fit. Can be for example a list, or an array.

        y : array-like, optional, default: None
            The target variable to try to predict in the case of
            supervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        scoring : string, callable, list/tuple, dict or None, default: None
            A single string (see :ref:`scoring_parameter`) or a callable
            (see :ref:`scoring`) to evaluate the predictions on the test set.

            For evaluating multiple metrics, either give a list of (unique) strings
            or a dict with names as keys and callables as values.

            NOTE that when using custom scorers, each scorer should return a single
            value. Metric functions returning a list/array of values can be wrapped
            into multiple scorers that return one value each.

            See :ref:`multimetric_grid_search` for an example.

            If None, the estimator's default scorer (if available) is used.

        cv : array-like, a collection of cross-validation generators, with length n_reps

            Refer :ref:`User Guide <cross_validation>` for the various
            cross-validation strategies that can be used here.

        n_jobs : integer, optional
            The number of CPUs to use to do the computation. -1 means
            'all CPUs'.

        verbose : integer, optional
            The verbosity level.

        fit_params : dict, optional
            Parameters to pass to the fit method of the estimator.

        pre_dispatch : int, or string, optional
            Controls the number of jobs that get dispatched during parallel
            execution. Reducing this number can be useful to avoid an
            explosion of memory consumption when more jobs get dispatched
            than CPUs can process. This parameter can be:

                - None, in which case all the jobs are immediately
                  created and spawned. Use this for lightweight and
                  fast-running jobs, to avoid delays due to on-demand
                  spawning of the jobs

                - An int, giving the exact number of total jobs that are
                  spawned

                - A string, giving an expression as a function of n_jobs,
                  as in '2*n_jobs'

        return_train_score : boolean, optional
            Whether to include train scores.

            Current default is ``'warn'``, which behaves as ``True`` in addition
            to raising a warning when a training score is looked up.
            That default will be changed to ``False`` in 0.21.
            Computing training scores is used to get insights on how different
            parameter settings impact the overfitting/underfitting trade-off.
            However computing the scores on the training set can be computationally
            expensive and is not strictly required to select the parameters that
            yield the best generalization performance.

        Returns
        -------
        repeated_scores : dict of per-repetition scores dicts, of length n_reps
    """
    X, y, groups = indexable(X, y, groups)

    # cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    # ---------------------- My Hack ----------------------- #
    # 1) Set parameter `error_score=-1` to `_fit_and_score`  #
    # 2) Created an argument `return_estimator` to           #
    #    `_fit_and_score`                                    #
    # ------------------------------------------------------ #
    tasks = [[
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorers,
                                train,
                                test,
                                verbose,
                                None,
                                fit_params,
                                return_train_score=return_train_score,
                                return_times=True,
                                return_estimator=True,
                                error_score=-1)
        for train, test in cross_validator.split(X, y, groups)
    ] for cross_validator in cv]
    # Flatten this list of lists into a simple list
    tasks = itertools.chain.from_iterable(tasks)
    scores = parallel(tasks)

    if return_train_score:
        train_scores, test_scores, fit_times, score_times, estimators = zip(
            *scores)
        train_scores = _aggregate_score_dicts(train_scores)
    else:
        test_scores, fit_times, score_times, estimators = zip(*scores)
    test_scores = _aggregate_score_dicts(test_scores)

    # TODO: replace by a dict in 0.21
    ret = DeprecationDict() if return_train_score == 'warn' else {}
    ret['fit_time'] = np.array(fit_times)
    ret['score_time'] = np.array(score_times)
    ret['estimator'] = list(estimators)

    for name in scorers:
        ret['test_%s' % name] = np.array(test_scores[name])
        if return_train_score:
            key = 'train_%s' % name
            ret[key] = np.array(train_scores[name])
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training scores, '
                    'please set return_train_score=True').format(key)
                # warn on key access
                ret.add_warning(key, message, FutureWarning)
    """
    Now `ret` is a dictionary whose values are all sequences of length `n_folds * n_reps`.
    Split it into `n_reps` sub-dictionaries whose values are of length `n_folds`
    """
    rep_rets = list(_split_dict(ret, chunk_size=n_folds))

    assert len(rep_rets) == n_reps

    for i in range(0, n_reps):
        rep_rets[i]["cross_validator"] = cv[i]

    result = dict(zip(range(0, n_reps), rep_rets))

    return result
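
A usage sketch for repeated_cross_validate() as defined above; note it depends on the hacked _fit_and_score and the _split_dict helper from the same module, so it only runs inside that codebase.

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = load_breast_cancer(return_X_y=True)

# one independently shuffled 5-fold splitter per repetition
cv_list = [KFold(n_splits=5, shuffle=True, random_state=seed) for seed in range(3)]

results = repeated_cross_validate(
    LogisticRegression(max_iter=1000), X, y,
    scoring=['accuracy', 'roc_auc'],
    cv=cv_list, n_reps=3,
    return_train_score=False)

print(sorted(results.keys()))              # [0, 1, 2] -> one entry per repetition
print(results[0]['test_accuracy'].shape)   # (5,) -> one score per fold
print(results[0]['cross_validator'])       # the splitter used for repetition 0
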
Example #27
def cross_validate(estimator,
                   X,
                   y=None,
                   groups=None,
                   scoring=None,
                   cv=None,
                   n_jobs=1,
                   verbose=0,
                   fit_params=None,
                   pre_dispatch='2*n_jobs',
                   return_train_score="warn"):
    """Evaluate metric(s) by cross-validation and also record fit/score times.

    Read more in the :ref:`User Guide <multimetric_cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be for example a list, or an array.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    scoring : string, callable, list/tuple, dict or None, default: None
        A single string (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's default scorer (if available) is used.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    return_train_score : boolean, optional
        Whether to include train scores.

        Current default is ``'warn'``, which behaves as ``True`` in addition
        to raising a warning when a training score is looked up.
        That default will be changed to ``False`` in 0.21.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

    Returns
    -------
    scores : dict of float arrays of shape=(n_splits,)
        Array of results of the estimator for each run of the cross validation.

        A dict of arrays containing the score/time arrays for each scorer is
        returned. The possible keys for this ``dict`` are:

            ``test_score``
                The score array for test scores on each cv split.
            ``train_score``
                The score array for train scores on each cv split.
                This is available only if ``return_train_score`` parameter
                is ``True``.
            ``fit_time``
                The time for fitting the estimator on the train
                set for each cv split.
            ``score_time``
                The time for scoring the estimator on the test set for each
                cv split. (Note: time for scoring on the train set is not
                included even if ``return_train_score`` is set to ``True``.)
            ``estimator``
                A list of estimator objects, one for each training dataset.

    Examples
    --------
    >>> from sklearn import datasets, linear_model
    >>> from sklearn.model_selection import cross_validate
    >>> from sklearn.metrics.scorer import make_scorer
    >>> from sklearn.metrics import confusion_matrix
    >>> from sklearn.svm import LinearSVC
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()

    Single metric evaluation using ``cross_validate``

    >>> cv_results = cross_validate(lasso, X, y, return_train_score=False)
    >>> sorted(cv_results.keys())                         # doctest: +ELLIPSIS
    ['fit_time', 'score_time', 'test_score']
    >>> cv_results['test_score']    # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    array([ 0.33...,  0.08...,  0.03...])

    Multiple metric evaluation using ``cross_validate``
    (please refer the ``scoring`` parameter doc for more information)

    >>> scores = cross_validate(lasso, X, y,
    ...                         scoring=('r2', 'neg_mean_squared_error'))
    >>> print(scores['test_neg_mean_squared_error'])      # doctest: +ELLIPSIS
    [-3635.5... -3573.3... -6114.7...]
    >>> print(scores['train_r2'])                         # doctest: +ELLIPSIS
    [ 0.28...  0.39...  0.22...]

    See Also
    ---------
    :func:`sklearn.model_selection.cross_val_score`:
        Run cross-validation for single metric evaluation.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    # ---------------------- My Hack ----------------------- #
    # 1) Set parameter `error_score=-1` to `_fit_and_score`  #
    # 2) Created an argument `return_estimator` to           #
    #    `_fit_and_score`                                    #
    # ------------------------------------------------------ #
    scores = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorers,
                                train,
                                test,
                                verbose,
                                None,
                                fit_params,
                                return_train_score=return_train_score,
                                return_times=True,
                                return_estimator=True,
                                error_score=-1)
        for train, test in cv.split(X, y, groups))

    if return_train_score:
        train_scores, test_scores, fit_times, score_times, estimators = zip(
            *scores)
        train_scores = _aggregate_score_dicts(train_scores)
    else:
        test_scores, fit_times, score_times, estimators = zip(*scores)
    test_scores = _aggregate_score_dicts(test_scores)

    # TODO: replace by a dict in 0.21
    ret = DeprecationDict() if return_train_score == 'warn' else {}
    ret['fit_time'] = np.array(fit_times)
    ret['score_time'] = np.array(score_times)
    ret['estimator'] = list(estimators)

    for name in scorers:
        ret['test_%s' % name] = np.array(test_scores[name])
        if return_train_score:
            key = 'train_%s' % name
            ret[key] = np.array(train_scores[name])
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training scores, '
                    'please set return_train_score=True').format(key)
                # warn on key access
                ret.add_warning(key, message, FutureWarning)

    ret['cross_validator'] = cv

    return ret
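
For comparison, a minimal sketch of the same capability through the stock API: since scikit-learn 0.20 the public cross_validate accepts return_estimator=True itself, which is essentially what the hack above back-ports.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = load_iris(return_X_y=True)
cv_results = cross_validate(LogisticRegression(max_iter=200), X, y,
                            cv=3, scoring=['accuracy'],
                            return_estimator=True)

print(len(cv_results['estimator']))   # 3 fitted clones, one per split
print(cv_results['test_accuracy'])    # one accuracy value per split
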
Example #28
    def fit(self, X, y=None, groups=None, **fit_params):
        # fit_params = {}
        # print("fit is being called")
        """Run fit with all sets of parameters.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        if self.fit_params is not None:
            warnings.warn(
                '"fit_params" as a constructor argument was '
                'deprecated in version 0.19 and will be removed '
                'in version 0.21. Pass fit parameters to the '
                '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn(
                    'Ignoring fit_params passed as a constructor '
                    'argument in favor of keyword arguments to '
                    'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        # cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        # results = results_container[0]

        for pg in self.param_grid:

            base_estimator = clone(self.estimator)
            results = self.search_on_grid(X, y, groups, fit_params, scorers,
                                          base_estimator, pg)

            # print(results['parameters'])

            # For multi-metric evaluation, store the best_index_, best_params_ and
            # best_score_ iff refit is one of the scorer names
            # In single metric evaluation, refit_metric is "score"
            if self.refit or not self.multimetric_:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                # self.best_params_ = results["params"][self.best_index_]
                self.update_best_params(results["params"][self.best_index_])

                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]

            if self.refit:
                self.best_estimator_ = clone(base_estimator).set_params(
                    **self.best_params_)
                refit_start_time = time.time()
                if y is not None:
                    self.best_estimator_.fit(X, y, **fit_params)
                else:
                    self.best_estimator_.fit(X, **fit_params)
                refit_end_time = time.time()
                self.refit_time_ = refit_end_time - refit_start_time

            # Store the only scorer not as a dict for single metric evaluation
            self.scorer_ = scorers if self.multimetric_ else scorers['score']

            self.cv_results_ = results

        return self
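A minimal sketch of the refit rule this fit method enforces, using only the public GridSearchCV API (it assumes a standard scikit-learn install; the toy data and parameter grid are illustrative only): with a multi-metric scoring dict, refit must name one of the scorer keys, or be set to False.

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
search = GridSearchCV(
    LinearSVC(random_state=0),
    param_grid={"C": [0.1, 1.0]},
    scoring={"acc": "accuracy", "prec": "precision"},
    refit="acc",   # must be a scorer key (or False) when scoring is multi-metric
    cv=3,
)
search.fit(X, y)
print(search.best_params_, search.cv_results_["mean_test_prec"][search.best_index_])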
예제 #29
0
def main(
    inputs,
    infile_estimator,
    outfile_eval,
    infile_weights=None,
    infile1=None,
    infile2=None,
):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to trained estimator input

    outfile_eval : str
        File path to save the evaluation results, tabular

    infile_weights : str
        File path to weights input

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values
    """
    warnings.filterwarnings("ignore")

    with open(inputs, "r") as param_handler:
        params = json.load(param_handler)

    X_test, y_test = _get_X_y(params, infile1, infile2)

    # load model
    with open(infile_estimator, "rb") as est_handler:
        estimator = load_model(est_handler)

    main_est = estimator
    if isinstance(estimator, Pipeline):
        main_est = estimator.steps[-1][-1]
    if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
        if not infile_weights or infile_weights == "None":
            raise ValueError("The selected model skeleton asks for weights, "
                             "but no dataset for weights was provided!")
        main_est.load_weights(infile_weights)

    # handle scorer, convert to scorer dict
    # Check if scoring is specified
    scoring = params["scoring"]
    if scoring is not None:
        # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
        # Check if secondary_scoring is specified
        secondary_scoring = scoring.get("secondary_scoring", None)
        if secondary_scoring is not None:
            # If secondary_scoring is specified, convert the list into a comma-separated string
            scoring["secondary_scoring"] = ",".join(
                scoring["secondary_scoring"])

    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    if hasattr(estimator, "evaluate"):
        scores = estimator.evaluate(X_test,
                                    y_test=y_test,
                                    scorer=scorer,
                                    is_multimetric=True)
    else:
        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)

    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_eval, sep="\t", header=True, index=False)
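A hedged sketch of the same multi-metric evaluation using only public scikit-learn API, with get_scorer standing in for the private _check_multimetric_scoring/_score helpers above (the iris data and LogisticRegression are illustrative): build a scorer dict, apply each scorer to the fitted estimator on the test split, and wrap each score in a one-element list exactly as the output loop does.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    *load_iris(return_X_y=True), random_state=0)
estimator = LogisticRegression(max_iter=1000).fit(X_train, y_train)

scorers = {name: get_scorer(name) for name in ("accuracy", "f1_macro")}
scores = {name: [scorer(estimator, X_test, y_test)]
          for name, scorer in scorers.items()}
df = pd.DataFrame(scores)
print(df[sorted(df.columns)])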
예제 #30
0
            def cross_validate(estimator,
                               X,
                               y=None,
                               groups=None,
                               scoring=None,
                               cv='warn',
                               n_jobs=None,
                               verbose=0,
                               fit_params=None,
                               pre_dispatch='2*n_jobs',
                               return_train_score=False,
                               return_estimator=False,
                               error_score='raise-deprecating'):

                X, y, groups = indexable(X, y, groups)

                cv = check_cv(cv, y, classifier=is_classifier(estimator))
                scorers, _ = _check_multimetric_scoring(estimator,
                                                        scoring=scoring)

                def _score(estimator,
                           X_test,
                           y_test,
                           scorer,
                           is_multimetric=False):

                    if is_multimetric:
                        return _multimetric_score(estimator, X_test, y_test,
                                                  scorer)
                    else:
                        if y_test is None:
                            score = scorer(estimator, X_test)
                        else:
                            score = scorer(estimator, X_test, y_test)

                        if hasattr(score, 'item'):
                            try:
                                # e.g. unwrap memmapped scalars
                                score = score.item()
                            except ValueError:
                                # non-scalar?
                                pass

                        if not isinstance(score, numbers.Number):
                            raise ValueError(
                                "scoring must return a number, got %s (%s) "
                                "instead. (scorer=%r)" %
                                (str(score), type(score), scorer))

                    return score

                def _multimetric_score(estimator, X_test, y_test, scorers):
                    """Return a dict of score for multimetric scoring."""
                    scores = {}

                    for name, scorer in scorers.items():
                        if y_test is None:
                            score = scorer(estimator, X_test)
                        else:
                            score = scorer(estimator, X_test, y_test)

                        if hasattr(score, 'item'):
                            try:
                                # e.g. unwrap memmapped scalars
                                score = score.item()
                            except ValueError:
                                # non-scalar?
                                pass
                        scores[name] = score

                        if not isinstance(score, numbers.Number):
                            raise ValueError(
                                "scoring must return a number, got %s (%s) "
                                "instead. (scorer=%s)" %
                                (str(score), type(score), name))
                    return scores

                def _aggregate_score_dicts(scores):

                    out = {}
                    for key in scores[0]:
                        out[key] = np.asarray([score[key] for score in scores])
                    return out

                def _fit_and_score(estimator,
                                   X,
                                   y,
                                   scorer,
                                   train,
                                   test,
                                   verbose,
                                   parameters,
                                   fit_params,
                                   return_train_score=False,
                                   return_parameters=False,
                                   return_n_test_samples=False,
                                   return_times=False,
                                   return_estimator=False,
                                   error_score='raise-deprecating'):

                    start_time = time.time()

                    if verbose > 1:
                        if parameters is None:
                            msg = ''
                        else:
                            msg = '%s' % (', '.join(
                                '%s=%s' % (k, v)
                                for k, v in parameters.items()))
                        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

                    # Adjust length of sample weights
                    fit_params = fit_params if fit_params is not None else {}
                    fit_params = dict([(k, _index_param_value(X, v, train))
                                       for k, v in fit_params.items()])

                    train_scores = {}
                    if parameters is not None:
                        estimator.set_params(**parameters)

                    X_train, y_train = _safe_split(estimator, X, y, train)
                    X_test, y_test = _safe_split(estimator, X, y, test, train)

                    is_multimetric = not callable(scorer)
                    n_scorers = len(scorer.keys()) if is_multimetric else 1

                    try:
                        #########################################
                        ############ FIT CALLED HERE ############
                        #########################################
                        if y_train is None:
                            estimator.fit(X_train, **fit_params)
                        else:
                            estimator.fit(X_train, y_train, **fit_params)
                        #########################################
                    except Exception as e:
                        # Note fit time as time until error
                        fit_time = time.time() - start_time
                        score_time = 0.0
                        if error_score == 'raise':
                            raise
                        elif error_score == 'raise-deprecating':
                            warnings.warn(
                                "From version 0.22, errors during fit will result "
                                "in a cross validation score of NaN by default. Use "
                                "error_score='raise' if you want an exception "
                                "raised or error_score=np.nan to adopt the "
                                "behavior from version 0.22.", FutureWarning)
                            raise
                        elif isinstance(error_score, numbers.Number):
                            if is_multimetric:
                                test_scores = dict(
                                    zip(scorer.keys(), [
                                        error_score,
                                    ] * n_scorers))
                                if return_train_score:
                                    train_scores = dict(
                                        zip(scorer.keys(), [
                                            error_score,
                                        ] * n_scorers))
                            else:
                                test_scores = error_score
                                if return_train_score:
                                    train_scores = error_score
                            warnings.warn(
                                "Estimator fit failed. The score on this train-test"
                                " partition for these parameters will be set to %f. "
                                "Details: \n%s" %
                                (error_score, format_exception_only(
                                    type(e), e)[0]), FitFailedWarning)
                        else:
                            raise ValueError(
                                "error_score must be the string 'raise' or a"
                                " numeric value. (Hint: if using 'raise', please"
                                " make sure that it has been spelled correctly.)"
                            )

                    else:
                        fit_time = time.time() - start_time
                        # _score will return dict if is_multimetric is True
                        test_scores = _score(estimator, X_test, y_test, scorer,
                                             is_multimetric)
                        score_time = time.time() - start_time - fit_time
                        if return_train_score:
                            train_scores = _score(estimator, X_train, y_train,
                                                  scorer, is_multimetric)

                    if verbose > 2:
                        if is_multimetric:
                            for scorer_name, score in test_scores.items():
                                msg += ", %s=%s" % (scorer_name, score)
                        else:
                            msg += ", score=%s" % test_scores
                    if verbose > 1:
                        total_time = score_time + fit_time
                        end_msg = "%s, total=%s" % (
                            msg, logger.short_format_time(total_time))
                        print("[CV] %s %s" %
                              ((64 - len(end_msg)) * '.', end_msg))

                    ret = [train_scores, test_scores
                           ] if return_train_score else [test_scores]

                    if return_n_test_samples:
                        ret.append(_num_samples(X_test))
                    if return_times:
                        ret.extend([fit_time, score_time])
                    if return_parameters:
                        ret.append(parameters)
                    if return_estimator:
                        ret.append(estimator)

                    return ret

                if not context:
                    parallel = Parallel(n_jobs=n_jobs,
                                        verbose=verbose,
                                        pre_dispatch=pre_dispatch)
                else:
                    parallel = cls.Parallel()

                # We clone the estimator to make sure that all the folds are
                # independent, and that it is pickle-able.
                scores = parallel(
                    delayed(_fit_and_score)(
                        clone(estimator),
                        X,
                        y,
                        scorers,
                        train,
                        test,
                        verbose,
                        None,
                        fit_params,
                        return_train_score=return_train_score,
                        return_times=True,
                        return_estimator=return_estimator,
                        error_score=error_score)
                    for train, test in cv.split(X, y, groups))

                zipped_scores = list(zip(*scores))
                if return_train_score:
                    train_scores = zipped_scores.pop(0)
                    train_scores = _aggregate_score_dicts(train_scores)
                if return_estimator:
                    fitted_estimators = zipped_scores.pop()
                test_scores, fit_times, score_times = zipped_scores
                test_scores = _aggregate_score_dicts(test_scores)

                ret = {}
                ret['fit_time'] = np.array(fit_times)
                ret['score_time'] = np.array(score_times)

                if return_estimator:
                    ret['estimator'] = fitted_estimators

                for name in scorers:
                    ret['test_%s' % name] = np.array(test_scores[name])
                    if return_train_score:
                        key = 'train_%s' % name
                        ret[key] = np.array(train_scores[name])

                return ret
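A hedged sketch of the result layout that the cross_validate above assembles, shown with the public sklearn.model_selection.cross_validate (exact keys can differ slightly between scikit-learn versions):

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
res = cross_validate(DecisionTreeClassifier(random_state=0), X, y,
                     scoring=("accuracy", "f1_macro"), cv=5,
                     return_train_score=True)
print(sorted(res))
# e.g. ['fit_time', 'score_time', 'test_accuracy', 'test_f1_macro',
#       'train_accuracy', 'train_f1_macro']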
예제 #31
0
def main(inputs,
         infile_estimator,
         infile1,
         infile2,
         outfile_result,
         outfile_object=None,
         outfile_weights=None,
         outfile_y_true=None,
         outfile_y_preds=None,
         groups=None,
         ref_seq=None,
         intervals=None,
         targets=None,
         fasta_path=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    outfile_weights : str, optional
        File path to save deep learning model weights

    outfile_y_true : str, optional
        File path to target values for prediction

    outfile_y_preds : str, optional
        File path to save the predicted values

    groups : str
        File path to dataset containing groups labels

    ref_seq : str
        File path to dataset containing genome sequence file

    intervals : str
        File path to dataset containing interval file

    targets : str
        File path to dataset containing compressed target bed file

    fasta_path : str
        File path to dataset containing fasta file
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    #  load estimator
    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    estimator = clean_params(estimator)

    # swap hyperparameter
    swapping = params['experiment_schemes']['hyperparams_swapping']
    swap_params = _eval_swap_params(swapping)
    estimator.set_params(**swap_params)

    estimator_params = estimator.get_params()

    # store read dataframe object
    loaded_df = {}

    input_type = params['input_options']['selected_input']
    # tabular input
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in [
                'by_index_number', 'all_but_by_index_number', 'by_header_name',
                'all_but_by_header_name'
        ]:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    # sparse input
    elif input_type == 'sparse':
        X = mmread(open(infile1, 'r'))

    # fasta_file input
    elif input_type == 'seq_fasta':
        pyfaidx = get_module('pyfaidx')
        sequences = pyfaidx.Fasta(fasta_path)
        n_seqs = len(sequences.keys())
        X = np.arange(n_seqs)[:, np.newaxis]
        for param in estimator_params.keys():
            if param.endswith('fasta_path'):
                estimator.set_params(**{param: fasta_path})
                break
        else:
            raise ValueError(
                "The selected estimator doesn't support "
                "fasta file input! Please consider using "
                "KerasGBatchClassifier with "
                "FastaDNABatchGenerator/FastaProteinBatchGenerator "
                "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
                "in pipeline!")

    elif input_type == 'refseq_and_interval':
        path_params = {
            'data_batch_generator__ref_genome_path': ref_seq,
            'data_batch_generator__intervals_path': intervals,
            'data_batch_generator__target_path': targets
        }
        estimator.set_params(**path_params)
        n_intervals = sum(1 for line in open(intervals))
        X = np.arange(n_intervals)[:, np.newaxis]

    # Get target y
    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in [
            'by_index_number', 'all_but_by_index_number', 'by_header_name',
            'all_but_by_header_name'
    ]:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None

    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        infile2 = loaded_df[df_key]
    else:
        infile2 = pd.read_csv(infile2,
                              sep='\t',
                              header=header,
                              parse_dates=True)
        loaded_df[df_key] = infile2

    y = read_columns(infile2,
                     c=c,
                     c_option=column_option,
                     sep='\t',
                     header=header,
                     parse_dates=True)
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()
    if input_type == 'refseq_and_interval':
        estimator.set_params(data_batch_generator__features=y.ravel().tolist())
        y = None
    # end y

    # load groups
    if groups:
        groups_selector = (params['experiment_schemes']['test_split']
                           ['split_algos']).pop('groups_selector')

        header = 'infer' if groups_selector['header_g'] else None
        column_option = \
            (groups_selector['column_selector_options_g']
                            ['selected_column_selector_option_g'])
        if column_option in [
                'by_index_number', 'all_but_by_index_number', 'by_header_name',
                'all_but_by_header_name'
        ]:
            c = groups_selector['column_selector_options_g']['col_g']
        else:
            c = None

        df_key = groups + repr(header)
        if df_key in loaded_df:
            groups = loaded_df[df_key]

        groups = read_columns(groups,
                              c=c,
                              c_option=column_option,
                              sep='\t',
                              header=header,
                              parse_dates=True)
        groups = groups.ravel()

    # free cached dataframes
    del loaded_df

    # cache iraps_core fits could increase search speed significantly
    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    main_est = get_main_estimator(estimator)
    if main_est.__class__.__name__ == 'IRAPSClassifier':
        main_est.set_params(memory=memory)

    # handle scorer, convert to scorer dict
    scoring = params['experiment_schemes']['metrics']['scoring']
    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # handle test (first) split
    test_split_options = (
        params['experiment_schemes']['test_split']['split_algos'])

    if test_split_options['shuffle'] == 'group':
        test_split_options['labels'] = groups
    if test_split_options['shuffle'] == 'stratified':
        if y is not None:
            test_split_options['labels'] = y
        else:
            raise ValueError("Stratified shuffle split is not "
                             "applicable on empty target values!")

    X_train, X_test, y_train, y_test, groups_train, groups_test = \
        train_test_split_none(X, y, groups, **test_split_options)

    exp_scheme = params['experiment_schemes']['selected_exp_scheme']

    # handle validation (second) split
    if exp_scheme == 'train_val_test':
        val_split_options = (
            params['experiment_schemes']['val_split']['split_algos'])

        if val_split_options['shuffle'] == 'group':
            val_split_options['labels'] = groups_train
        if val_split_options['shuffle'] == 'stratified':
            if y_train is not None:
                val_split_options['labels'] = y_train
            else:
                raise ValueError("Stratified shuffle split is not "
                                 "applicable on empty target values!")

        X_train, X_val, y_train, y_val, groups_train, groups_val = \
            train_test_split_none(X_train, y_train, groups_train,
                                  **val_split_options)

    # train and eval
    if hasattr(estimator, 'config') and hasattr(estimator, 'model_type'):
        if exp_scheme == 'train_val_test':
            estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
        else:
            estimator.fit(X_train, y_train, validation_data=(X_test, y_test))
    else:
        estimator.fit(X_train, y_train)

    if isinstance(estimator, KerasGBatchClassifier):
        scores = {}
        steps = estimator.prediction_steps
        batch_size = estimator.batch_size
        data_generator = estimator.data_generator_

        scores, predictions, y_true = _evaluate_keras_and_sklearn_scores(
            estimator,
            data_generator,
            X_test,
            y=y_test,
            sk_scoring=scoring,
            steps=steps,
            batch_size=batch_size,
            return_predictions=bool(outfile_y_true))

    else:
        scores = {}
        if hasattr(estimator, 'model_') \
                and hasattr(estimator.model_, 'metrics_names'):
            batch_size = estimator.batch_size
            score_results = estimator.model_.evaluate(X_test,
                                                      y=y_test,
                                                      batch_size=batch_size,
                                                      verbose=0)
            metrics_names = estimator.model_.metrics_names
            if not isinstance(metrics_names, list):
                scores[metrics_names] = score_results
            else:
                scores = dict(zip(metrics_names, score_results))

        if hasattr(estimator, 'predict_proba'):
            predictions = estimator.predict_proba(X_test)
        else:
            predictions = estimator.predict(X_test)

        y_true = y_test
        sk_scores = _score(estimator,
                           X_test,
                           y_test,
                           scorer,
                           is_multimetric=True)
        scores.update(sk_scores)

    # handle output
    if outfile_y_true:
        try:
            pd.DataFrame(y_true).to_csv(outfile_y_true, sep='\t', index=False)
            pd.DataFrame(predictions).astype(np.float32).to_csv(
                outfile_y_preds,
                sep='\t',
                index=False,
                float_format='%g',
                chunksize=10000)
        except Exception as e:
            print("Error in saving predictions: %s" % e)
    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        main_est = estimator
        if isinstance(estimator, Pipeline):
            main_est = estimator.steps[-1][-1]

        if hasattr(main_est, 'model_') \
                and hasattr(main_est, 'save_weights'):
            if outfile_weights:
                main_est.save_weights(outfile_weights)
            del main_est.model_
            del main_est.fit_params
            del main_est.model_class_
            main_est.callbacks = []
            if getattr(main_est, 'data_generator_', None):
                del main_est.data_generator_

        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
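A hedged sketch of the 'stratified' branch of the first (test) split above, using plain sklearn.model_selection.train_test_split; train_test_split_none is a project-specific helper that is not reproduced here, and the tiny arrays are illustrative only.

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)
print(y_test)   # one sample of each class: the class proportions are preserved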
예제 #32
0
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                not isinstance(self.refit, six.string_types)
                # This will work for both dict / list (tuple)
                or self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                "refit must be set to a scorer key to refit an estimator with "
                "the best parameter setting on the whole data and make the "
                "best_* attributes available for that metric. If this is not "
                "needed, refit should be set to False explicitly. %r was "
                "passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)

        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test)
            for parameters in candidate_params
            for train, test in list(cv.split(X, y, groups))]

        # Because the original python code expects a certain order for the
        # elements, we need to respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid,
                                             len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        verbose = self.verbose
        error_score = self.error_score
        return_train_score = self.return_train_score

        def fun(tup):
            # DO NOT REFERENCE TO `self` ANYWHERE IN THIS FUNCTION.
            # IT WILL CAUSE A SPARK-5063 ERROR.
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = _fit_and_score(local_estimator, local_X, local_y, scorers,
                train, test, verbose, parameters, fit_params=fit_params,
                return_train_score=return_train_score,
                return_n_test_samples=True, return_times=True,
                error_score=error_score)
            return (index, res)

        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]
        X_bc.unpersist()
        y_bc.unpersist()

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts,
             fit_time, score_time) = zip(*out)

        else:
            (test_score_dicts, test_sample_counts,
             fit_time, score_time) = zip(*out)

        if self.verbose > 2:
            print('test_sample_counts: {}'.format(test_sample_counts))
            print('fit_time: {}'.format(fit_time))
            print('score_time: {}'.format(score_time))

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.verbose > 1:
            print('TEST')
            print(test_scores)

        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)
            if self.verbose > 1:
                print('TRAIN')
                print(train_scores)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = (np.array(array, dtype=np.float64)
                       .reshape(n_candidates, n_splits))

            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']
        self.cv_results_ = results
        self.n_splits_ = n_splits

        if self.verbose > 1:
            print(self.scorer_)
            print(self.cv_results_)
            print(self.n_splits_)
        return self
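A hedged sketch of the per-metric aggregation that _store performs above (illustrative numbers only): reshape the flat per-fit scores to (n_candidates, n_splits), take the mean and standard deviation across splits, and rank the candidates so that rank 1 marks the best mean score.

import numpy as np
from scipy.stats import rankdata

n_candidates, n_splits = 2, 3
flat_scores = [0.80, 0.90, 0.85,      # candidate 0, splits 0..2
               0.70, 0.75, 0.72]      # candidate 1, splits 0..2
array = np.array(flat_scores, dtype=np.float64).reshape(n_candidates, n_splits)
means = np.average(array, axis=1)
stds = np.sqrt(np.average((array - means[:, np.newaxis]) ** 2, axis=1))
ranks = np.asarray(rankdata(-means, method='min'), dtype=np.int32)
print(means, stds, ranks)             # ranks -> [1 2]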
예제 #33
0
def _evaluate_keras_and_sklearn_scores(estimator,
                                       data_generator,
                                       X,
                                       y=None,
                                       sk_scoring=None,
                                       steps=None,
                                       batch_size=32,
                                       return_predictions=False):
    """Output scores for both keras and sklearn metrics.

    Parameters
    -----------
    estimator : object
        Fitted `galaxy_ml.keras_galaxy_models.KerasGBatchClassifier`.
    data_generator : object
        From `galaxy_ml.preprocessors.ImageDataFrameBatchGenerator`.
    X : 2-D array
        Contains indices of images that need to be evaluated.
    y : 1-D array or None
        Target values.
    sk_scoring : dict
        Galaxy tool input parameters.
    steps : integer or None
        Evaluation/prediction steps before stop.
    batch_size : integer
        Number of samples in a batch
    return_predictions : bool, default is False
        Whether to return predictions and true labels.
    """
    scores = {}

    generator = data_generator.flow(X, y=y, batch_size=batch_size)
    # keras metrics evaluation
    # handle scorer, convert to scorer dict
    generator.reset()
    score_results = estimator.model_.evaluate_generator(generator, steps=steps)
    metrics_names = estimator.model_.metrics_names
    if not isinstance(metrics_names, list):
        scores[metrics_names] = score_results
    else:
        scores = dict(zip(metrics_names, score_results))

    if sk_scoring['primary_scoring'] == 'default' and\
            not return_predictions:
        return scores, None, None

    generator.reset()
    predictions, y_true = _predict_generator(estimator.model_,
                                             generator,
                                             steps=steps)

    # for sklearn metrics
    if sk_scoring['primary_scoring'] != 'default':
        scorer = get_scoring(sk_scoring)
        scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
        sk_scores = gen_compute_scores(y_true,
                                       predictions,
                                       scorer,
                                       is_multimetric=True)
        scores.update(sk_scores)

    if return_predictions:
        return scores, predictions, y_true
    else:
        return scores, None, None
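A hedged sketch of the 'sklearn metrics on generator predictions' idea above; gen_compute_scores is a project-specific helper, so standard metric functions stand in for it and the arrays are illustrative: threshold the predicted probabilities into labels, then score labels and probabilities with the appropriate metrics.

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

y_true = np.array([0, 1, 1, 0, 1])
proba = np.array([0.2, 0.8, 0.6, 0.4, 0.9])   # e.g. positive-class probabilities from a generator
y_pred = (proba > 0.5).astype(int)
print(accuracy_score(y_true, y_pred), roc_auc_score(y_true, proba))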
예제 #34
0
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        """
        if self.fit_params is not None:
            warnings.warn(
                '"fit_params" as a constructor argument was '
                'deprecated in version 0.19 and will be removed '
                'in version 0.21. Pass fit parameters to the '
                '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn(
                    'Ignoring fit_params passed as a constructor '
                    'argument in favor of keyword arguments to '
                    'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        # X, y, groups = indexable(X, y, groups)
        if groups is not None:
            raise NotImplementedError("groups are not supported")

        # n_splits = cv.get_n_splits(X, y, groups)
        n_splits = min(
            cv.get_n_splits(X_.transpose(1, 2, 0), y_, None)
            for X_, y_ in zip(X, y))

        def generate_index(X_list, y_list):
            split = [
                cv.split(X.transpose(1, 2, 0), y)
                for X, y in zip(X_list, y_list)
            ]
            for i in range(n_splits):
                yield zip(*[next(s) for s in split])

        generate_index_iter = generate_index(X, y)

        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(delayed(_fit_and_score)(
                clone(base_estimator),
                X,
                y,
                scorers,
                train,
                test,
                self.verbose,
                parameters,
                fit_params=fit_params,
                return_train_score=self.return_train_score,
                return_n_test_samples=True,
                return_times=True,
                return_parameters=False,
                error_score=self.error_score,
                return_estimator=True,
                return_idx=True) for parameters, (
                    train,
                    test) in product(candidate_params, generate_index_iter))

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time, estimators, train_idxs, test_idxs) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time, score_time,
             estimators, train_idxs, test_idxs) = zip(*out)

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # TODO: replace by a dict in 0.21
        results = (DeprecationDict()
                   if self.return_train_score == 'warn' else {})

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """Store the scores/times to the cv_results_."""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        results['estimators'] = estimators
        results['train_index'] = train_idxs
        results['test_index'] = test_idxs

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name,
                   test_scores[scorer_name],
                   splits=True,
                   rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name,
                       train_scores[scorer_name],
                       splits=True)

                if self.return_train_score == 'warn':
                    for key in set(results.keys()) - prev_keys:
                        message = (
                            'You are accessing a training score ({!r}), '
                            'which will not be available by default '
                            'any more in 0.21. If you need training scores, '
                            'please set return_train_score=True').format(key)
                        # warn on key access
                        results.add_warning(key, message, FutureWarning)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" %
                                       refit_metric][self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
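A hedged sketch of the generate_index idea used in the fit above (toy datasets only): create one splitter per dataset and advance them in lockstep, so each yielded item bundles the fold-i train/test indices of every dataset.

import numpy as np
from sklearn.model_selection import KFold

datasets = [np.arange(12), np.arange(9)]
cv = KFold(n_splits=3)
splitters = [cv.split(d) for d in datasets]
for fold in range(3):
    trains, tests = zip(*[next(s) for s in splitters])
    print(fold, [len(t) for t in trains], [len(t) for t in tests])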
예제 #35
0
    def _fit(self, X, y=None, target_col=None):
        """Fit estimator.

        Requires the target to be specified either as a separate 1d array or
        Series y (in scikit-learn fashion) or as a column of the dataframe X
        given by target_col.
        If y is specified, X is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional.
            Target. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of target if included in X.
        """
        X, y = _validate_Xyt(X, y, target_col)
        types = detect_types(X)
        self.feature_names_ = X.columns
        self.types_ = types

        y, self.scoring_ = self._preprocess_target(y)
        self.log_ = []

        # reimplement cross-validation so we only do preprocessing once
        # This could/should be solved with dask?
        if isinstance(self, RegressorMixin):
            # this is how inheritance works, right?
            cv = KFold(n_splits=5)
        elif isinstance(self, ClassifierMixin):
            cv = StratifiedKFold(n_splits=5)
        data_preproc = []
        for i, (train, test) in enumerate(cv.split(X, y)):
            # maybe do two levels of preprocessing
            # to search over treatment of categorical variables etc
            # Also filter?
            verbose = self.verbose if i == 0 else 0
            sp = EasyPreprocessor(verbose=verbose, types=types)
            X_train = sp.fit_transform(X.iloc[train], y.iloc[train])
            X_test = sp.transform(X.iloc[test])
            data_preproc.append((X_train, X_test, y.iloc[train], y.iloc[test]))

        estimators = self._get_estimators()
        rank_scoring = self._rank_scoring
        self.current_best_ = {rank_scoring: -np.inf}
        for est in estimators:
            set_random_state(est, self.random_state)
            scorers, _ = _check_multimetric_scoring(est, self.scoring_)
            scores = self._evaluate_one(est, data_preproc, scorers)
            # make scoring configurable
            if scores[rank_scoring] > self.current_best_[rank_scoring]:
                if self.verbose:
                    print("=== new best {} (using {}):".format(
                        scores.name, rank_scoring))
                    print(_format_scores(scores))
                    print()

                self.current_best_ = scores
                best_est = est
        if self.verbose:
            print("\nBest model:\n{}\nBest Scores:\n{}".format(
                nice_repr(best_est), _format_scores(self.current_best_)))
        if self.refit:
            self.est_ = make_pipeline(EasyPreprocessor(), best_est)
            self.est_.fit(X, y)
        return self
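A hedged sketch of the 'preprocess once per fold' loop above, with StandardScaler standing in for the project-specific EasyPreprocessor (not reproduced here) and iris as illustrative data:

from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
data_preproc = []
for train, test in StratifiedKFold(n_splits=5).split(X, y):
    scaler = StandardScaler().fit(X[train])
    data_preproc.append((scaler.transform(X[train]), scaler.transform(X[test]),
                         y[train], y[test]))
print(len(data_preproc), data_preproc[0][0].shape)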
예제 #36
0
def monkeypatch_fit(self, X, y=None, groups=None, **fit_params):
    if self.fit_params is not None:
        warnings.warn('"fit_params" as a constructor argument was '
                      'deprecated in version 0.19 and will be removed '
                      'in version 0.21. Pass fit parameters to the '
                      '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn('Ignoring fit_params passed as a constructor '
                          'argument in favor of keyword arguments to '
                          'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    # Regenerate parameter iterable for each fit
    candidate_params = list(self._get_param_iterator())
    n_candidates = len(candidate_params)
    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # ===================================================================
    # BEGIN MONKEYPATCH MODIFICATION
    # ===================================================================

    parallel_cv = cv.split(X, y, groups)

    if type(self.pipeline_split_idx) == int and isinstance(base_estimator,
                                                           Pipeline):
        split_idx = self.pipeline_split_idx

        pre_pipe_steps = base_estimator.steps[:split_idx]
        new_pipe_steps = base_estimator.steps[split_idx:]
        memory = base_estimator.memory

        pre_pipe = Pipeline(pre_pipe_steps, memory)

        if len(new_pipe_steps) == 1:
            est_name, base_estimator = new_pipe_steps[0]
        else:
            est_name = None
            base_estimator = Pipeline(new_pipe_steps, memory)

        fit_params_pre_pipe = {}
        steps_pre_pipe = [tup[0] for tup in pre_pipe_steps]
        fit_param_keys = fit_params.keys()

        for pname in fit_param_keys:
            step, param = pname.split('__', 1)

            if step in steps_pre_pipe:
                fit_params_pre_pipe[pname] = fit_params.pop(pname)
            elif step == est_name:
                fit_params[param] = fit_params.pop(pname)

        if est_name is not None:
            for dic in candidate_params:
                for k in dic:
                    step, param = k.split('__', 1)

                    if step == est_name:
                        dic.update({param: dic.pop(k)})

        try:
            X = pre_pipe.fit_transform(X, **fit_params_pre_pipe)
        except TypeError:
            raise RuntimeError('Pipeline before pipeline_split_idx requires '
                               'fitting to y. Please initialize with an '
                               'earlier index.')

    if self.transform_before_grid and isinstance(base_estimator, Pipeline):
        pipe = base_estimator
        est_name, base_estimator = pipe.steps.pop()
        X_cv, y_cv, parallel_cv = [], [], []
        sample_count = 0

        fit_params_est = {}
        fit_param_keys = fit_params.keys()

        for pname in fit_param_keys:
            step, param = pname.split('__', 1)
            if step == est_name:
                fit_params_est[param] = fit_params.pop(pname)

        for dic in candidate_params:
            for k in list(dic):  # iterate over a copy; dic is mutated below
                step, param = k.split('__', 1)

                if step == est_name:
                    dic.update({param: dic.pop(k)})

        for (train, test) in cv.split(X, y, groups):
            X_train = X.iloc[train] if isinstance(X, pd.DataFrame) else X[train]

            if y is not None:
                # Index y positionally whether it is a pandas Series or an array
                y_train = (y.iloc[train] if isinstance(y, pd.Series)
                           else y[train])
                pipe.fit(X_train, y_train, **fit_params)
                y_cv.append(y)
            else:
                pipe.fit(X_train, **fit_params)

            X_cv.append(pipe.transform(X))

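            # Shift this fold's indices by the number of rows already stacked
            # so they point into the copy of X appended just above.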
            train = train + sample_count
            test = test + sample_count
            sample_count += len(train)
            sample_count += len(test)

            parallel_cv.append((train, test))

        if isinstance(X, pd.DataFrame):
            X = pd.concat(tuple(X_cv))
        else:
            X = np.vstack(tuple(X_cv))

        if y is not None:
            if isinstance(y, pd.Series):
                y = pd.concat(tuple(y_cv))
            else:
                y = np.hstack(tuple(y_cv))

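            # sample_weight is per-row, so repeat it once per stacked copy of y.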
            if 'sample_weight' in fit_params_est:
                samp_weight = fit_params_est['sample_weight']
                fit_params_est['sample_weight'] = np.tile(samp_weight,
                                                          len(y_cv))

        fit_params = fit_params_est

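    # NOTE: monkeypatch_fit_and_score is defined elsewhere; judging from this
    # call it appears to mirror the argument order of sklearn's private
    # _fit_and_score helper.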
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(delayed(monkeypatch_fit_and_score)(
        clone(base_estimator), X, y, scorers, train, test,
        self.verbose, parameters,
        fit_params=fit_params,
        return_train_score=self.return_train_score,
        return_n_test_samples=True,
        return_times=True, return_parameters=False,
        error_score=self.error_score)
      for parameters, (train, test) in product(candidate_params,
                                               parallel_cv))

    # ===================================================================
    # END MONKEYPATCH MODIFICATION
    # ===================================================================

    # If one chooses to see train scores, "out" will contain train score info
    if self.return_train_score:
        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)
    else:
        (test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)

    # test_score_dicts and train_score_dicts are lists of dictionaries and
    # we make them into a dict of lists
    test_scores = _aggregate_score_dicts(test_score_dicts)
    if self.return_train_score:
        train_scores = _aggregate_score_dicts(train_score_dicts)

    # TODO: replace by a dict in 0.21
    results = (DeprecationDict() if self.return_train_score == 'warn'
               else {})

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # `out` is ordered candidate-major (all splits of a candidate are
        # contiguous), so reshape to `n_candidates` rows and `n_splits` cols.
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                # Uses closure to alter the results
                results["split%d_%s"
                        % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array -
                                         array_means[:, np.newaxis]) ** 2,
                                        axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    _store('fit_time', fit_time)
    _store('score_time', score_time)
    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurrence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)
    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    # NOTE: test_sample_counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=int)
    for scorer_name in scorers.keys():
        # Compute the (weighted) mean and std for test scores alone
        _store('test_%s' % scorer_name, test_scores[scorer_name],
               splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            prev_keys = set(results.keys())
            _store('train_%s' % scorer_name, train_scores[scorer_name],
                   splits=True)

            if self.return_train_score == 'warn':
                for key in set(results.keys()) - prev_keys:
                    message = (
                        'You are accessing a training score ({!r}), '
                        'which will not be available by default '
                        'any more in 0.21. If you need training scores, '
                        'please set return_train_score=True').format(key)
                    # warn on key access
                    results.add_warning(key, message, FutureWarning)

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = candidate_params[self.best_index_]
        self.best_score_ = results["mean_test_%s" % refit_metric][
            self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits

    return self
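
# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the original snippet). It assumes the
# patched fit above is attached to GridSearchCV, or a similar BaseSearchCV
# subclass, by surrounding code that is not shown here, and that
# pipeline_split_idx / transform_before_grid are plain attributes read by
# that fit. Names marked "hypothetical" below are illustrative only.
# ---------------------------------------------------------------------------
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

pipe = Pipeline([('scale', StandardScaler()),
                 ('pca', PCA(n_components=10)),
                 ('svc', SVC())])

search = GridSearchCV(pipe, {'svc__C': [0.1, 1.0, 10.0]}, cv=3)

# Attributes consumed by the patched fit above.
search.pipeline_split_idx = 2        # fit 'scale' and 'pca' once, search over 'svc'
search.transform_before_grid = False

# GridSearchCV.fit = patched_fit     # hypothetical: how the patch is attached
# search.fit(X, y)                   # would dispatch to the code above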