Example #1
def _fe_fit_and_score(estimator,
                      X,
                      y,
                      scorers,
                      train,
                      test,
                      verbose,
                      parameters,
                      fit_params,
                      return_train_score=False,
                      return_parameters=False,
                      fold_specific_X_extractor=None):
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    X_train, y_train = _safe_split(estimator, X, y, train)
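    # passing the train indices when extracting the test fold lets
    # _safe_split keep precomputed kernel matrices rectangular
    # (n_test x n_train), matching the fitted estimator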
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if fold_specific_X_extractor:
        # extend by fold-specific features
        X_train_additional = fold_specific_X_extractor(train)
        if X_train_additional is not None:
            X_train = np.concatenate([X_train, X_train_additional], axis=1)
    #
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    #
    test_scores = [
        _score(estimator, X_test, y_test, scorer) for scorer in scorers
    ]
    if return_train_score:
        train_scores = [
            _score(estimator, X_train, y_train, scorer) for scorer in scorers
        ]
    #
    if verbose > 2:
        msg += ", scores=%s" % test_scores
    if verbose > 1:
        print("[CV] %s %s" % ((64 - len(msg)) * '.', msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_parameters:
        ret.append(parameters)
    return ret
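For context, a minimal sketch of what the fold_specific_X_extractor callable above might look like. Everything in it is illustrative: fold_features stands in for a hypothetical precomputed array of extra per-sample features aligned row-wise with X.

import numpy as np

# Hypothetical lookup table of extra features, aligned row-wise with X.
fold_features = np.random.rand(100, 3)

def fold_specific_X_extractor(indices):
    # Return the extra feature columns for the given fold indices,
    # or None if no fold-specific features are available.
    return fold_features[indices]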
Example #2
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if num_samples(train) == 0 or num_samples(test) == 0:
        raise RuntimeError(
            'Cross validation error in fit_estimator. The total data set '
            'contains %d elements, which were split into a training set '
            'of %d elements and a test set of %d elements. Unfortunately, '
            'you can\'t have a %s set with 0 elements.' % (
                num_samples(X), num_samples(train), num_samples(test),
                'training' if num_samples(train) == 0 else 'test'))

    # adjust length of sample weights
    n_samples = num_samples(X)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    msmbuilder_api = is_msmbuilder_estimator(estimator)
    n_samples_test = num_samples(X_test, is_nested=msmbuilder_api)
    n_samples_train = num_samples(X_train, is_nested=msmbuilder_api)
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
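The fit-parameter slicing idiom above is worth seeing in isolation. In this standalone sketch (all names are local to the snippet), parameters whose length matches the full dataset, such as sample_weight, are indexed down to the training fold, while everything else passes through unchanged.

import numpy as np

n_samples = 6
train = np.array([0, 2, 4])
fit_params = {'sample_weight': np.ones(n_samples), 'verbose': True}
sliced = {k: (np.asarray(v)[train]
              if hasattr(v, '__len__') and len(v) == n_samples else v)
          for k, v in fit_params.items()}
print(sliced)  # sample_weight now has length 3, verbose is untouched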
Example #3
def _rfe_train_test(rfe, estimator, X, y, train, test, scorer):
    """
    Return the score for a fit across one fold.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # the fold is fitted on the training split both times; the lambda only
    # controls which split each elimination step is scored on
    test_score = rfe._fit(
        X_train, y_train, lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_

    train_score = rfe._fit(
        X_train, y_train, lambda estimator, features: _score(
            estimator, X_train[:, features], y_train, scorer)).scores_

    return train_score, test_score
Example #4
def _fit_and_score_grid(estimator,
                        X,
                        y,
                        scorer,
                        train,
                        test,
                        grid,
                        fit_params,
                        error_score='raise'):
    '''Fit the estimator once on the training fold, then score it on the
    test fold for every parameter setting in grid; parameters are set
    without refitting.'''

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            scores = [error_score] * len(grid)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        origParams = estimator.get_params()
        scores = [
            _score(estimator.set_params(**params), X_test, y_test, scorer)
            for params in grid
        ]
        estimator.set_params(**origParams)

    return scores
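A hedged usage sketch for the helper above, assuming it and its sklearn imports are in scope. The estimator and grid are illustrative; because set_params is called after fitting without a refit, the grid should only vary parameters that affect prediction, such as n_neighbors here, which k-nearest-neighbors reads at query time.

import numpy as np
from sklearn.metrics import get_scorer
from sklearn.neighbors import KNeighborsClassifier

X = np.random.rand(30, 2)
y = np.random.randint(0, 2, 30)
train, test = np.arange(20), np.arange(20, 30)
grid = [{'n_neighbors': 1}, {'n_neighbors': 3}, {'n_neighbors': 5}]
scores = _fit_and_score_grid(KNeighborsClassifier(), X, y,
                             get_scorer('accuracy'), train, test,
                             grid, fit_params={})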
Example #5
    def evaluate(self, scoring='accuracy'):
        from sklearn.model_selection._validation import _score

        sample_x = np.array(
            [tuple(x) for x in self._Xtest[self._ftnames].values])
        return _score(self._estimator, sample_x,
                      self._Xtest.rate.values.ravel().tolist(),
                      check_scoring(self._estimator, scoring=scoring))
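All of these snippets lean on the private helper sklearn.model_selection._validation._score. A minimal direct call looks like this; since this is a private API, the exact signature differs across sklearn releases (some take an is_multimetric flag, newer ones accept an error_score argument), so treat it as a sketch.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection._validation import _score

X, y = make_classification(random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X, y)
# check_scoring turns a scoring string into a scorer(estimator, X, y) callable
scorer = check_scoring(clf, scoring='accuracy')
print(_score(clf, X, y, scorer))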
Example #6
def _multi_time_fit(random_state, baf, X, y, scorer):
    """"""
    baf_i = clone(baf)
    baf_i.random_state = random_state
    baf_i.refit = True
    baf_i._fit(X, y)
    return baf_i.support_, _score(baf_i.estimator_, baf_i.transform(X, ), y,
                                  scorer), baf_i.score_
Example #7
    def evaluate(self, dataset, pipelines):
        if not self.is_valid(dataset):
            raise AssertionError("Dataset is not appropriate for evaluation")
        # This is a bit awkward, but we need to check whether at least one
        # pipeline has to be run before loading the data. If at least one
        # pipeline needs to be run, we have to load all the data.
        # We might need better granularity if we query the DB.
        run_pipes = {}
        for subject in dataset.subject_list:
            run_pipes.update(
                self.results.not_yet_computed(pipelines, dataset, subject))
        if len(run_pipes) != 0:

            # get the data
            X, y, metadata = self.paradigm.get_data(dataset)

            # encode labels
            le = LabelEncoder()
            y = le.fit_transform(y)

            # extract metadata
            groups = metadata.subject.values
            sessions = metadata.session.values

            scorer = get_scorer(self.paradigm.scoring)

            # perform leave one subject out CV
            cv = LeaveOneGroupOut()
            for train, test in cv.split(X, y, groups):

                subject = groups[test[0]]
                # now we can check if this subject has results
                run_pipes = self.results.not_yet_computed(
                    pipelines, dataset, subject)

                # iterate over pipelines
                for name, clf in run_pipes.items():
                    t_start = time()
                    model = deepcopy(clf).fit(X[train], y[train])
                    duration = time() - t_start

                    # we eval on each session
                    for session in np.unique(sessions[test]):
                        ix = sessions[test] == session
                        score = _score(model, X[test[ix]], y[test[ix]], scorer)

                        res = {
                            "time": duration,
                            "dataset": dataset,
                            "subject": subject,
                            "session": session,
                            "score": score,
                            "n_samples": len(train),
                            "n_channels": X.shape[1],
                            "pipeline": name,
                        }

                        yield res
Example #8
def _baf_single_fit(train, test, baf, estimator, X, y, scorer, random_state):
    """"""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    baf_i = clone(baf)
    baf_i.random_state = random_state
    baf_i._fit(X_train, y_train)
    return baf_i.support_, _score(baf_i.estimator_, baf_i.transform(X_test, ),
                                  y_test, scorer), baf_i.score_
Example #9
def _rfa_single_fit(rfa, estimator, X, y, train, test, scorer):
    """
    Return the score for a fit across one fold.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfa._fit(
        X_train, y_train, lambda estimator, features:
        _score(estimator, X_test[:, features], y_test, scorer)).scores_
Example #10
    def _score(self, X, y, scoring=None, clf=None):
        from sklearn.model_selection._validation import _score

        if scoring is None:
            scoring = self._scorer

        if clf is None:
            clf = self._estimator

        return _score(clf, X, y, check_scoring(clf, scoring=scoring))
Example #11
 def _scoring(self, net, X_test, y_test):
     """Resolve scoring and apply it to data."""
     scorer = check_scoring(net, self.scoring)
     scores = _score(
         estimator=net,
         X_test=X_test,
         y_test=y_test,
         scorer=scorer,
         is_multimetric=False,
     )
     return scores
Example #12
 def _scoring(self, net, X_test, y_test):
     """Resolve scoring and apply it to data. Use cached prediction
     instead of running inference again, if available."""
     scorer = check_scoring(net, self.scoring)
     scores = _score(
         estimator=net,
         X_test=X_test,
         y_test=y_test,
         scorer=scorer,
         is_multimetric=False,
     )
     return scores
Example #13
 def score_explicit(self, clf, X_train, y_train, X_test, y_test):
     scorer = get_scorer(self.paradigm.scoring)
     t_start = time()
     try:
         model = clf.fit(X_train, y_train)
         score = _score(model, X_test, y_test, scorer)
     except ValueError as e:
         if self.error_score == "raise":
             raise e
         score = self.error_score
     duration = time() - t_start
     return score, duration
Example #14
 def _scoring(self, net, X_test, y_test):
     """Resolve scoring and apply it to data. Use cached prediction
     instead of running inference again, if available."""
     scorer = check_scoring(net, self.scoring_)
     scores = _score(
         estimator=net,
         X_test=X_test,
         y_test=y_test,
         scorer=scorer,
         is_multimetric=False,
     )
     return scores
Example #15
def _incremental_fit_estimator(estimator, X, y, classes, train, test,
                               train_sizes, scorer, verbose):
    """Train estimator on training subsets incrementally and compute scores."""
    train_scores, test_scores = [], []
    partitions = zip(train_sizes, np.split(train, train_sizes)[:-1])
    for n_train_samples, partial_train in partitions:
        train_subset = train[:n_train_samples]

        # NOTE: wrapper patch
        X_train, y_train = _patch_split(estimator, X, y, train_subset)
        X_partial_train, y_partial_train = _patch_split(estimator, X, y,
                                                        partial_train)
        X_test, y_test = _patch_split(estimator, X, y, test, train_subset)

        if y_partial_train is None:
            estimator.partial_fit(X_partial_train, classes=classes)
        else:
            estimator.partial_fit(X_partial_train, y_partial_train,
                                  classes=classes)
        train_scores.append(_score(estimator, X_train, y_train, scorer))
        test_scores.append(_score(estimator, X_test, y_test, scorer))
    return np.array((train_scores, test_scores)).T
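The partitioning logic above is compact; here is a standalone sketch of how np.split turns the cumulative train_sizes into the incremental chunks fed to partial_fit.

import numpy as np

train = np.arange(10)
train_sizes = np.array([3, 6, 10])
# np.split on cumulative boundaries yields one chunk per increment;
# the [:-1] drops the empty tail beyond the last boundary.
chunks = np.split(train, train_sizes)[:-1]
print([c.tolist() for c in chunks])
# [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]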
Example #16
def _wrap_score(y_pred, y_true, scorers, is_multimetric):
    start_time = time.time()
    results = {}
    # we use -1 to signify missing predictions because numpy has no integer NaN
    if np.any(y_pred < 0):
        if is_multimetric:
            for name in scorers:
                results[name] = np.nan
        else:
            results["score"] = np.nan
    else:
        estimator = _MockEstimator(y_pred)
        results = _score(estimator, None, y_true, scorers, is_multimetric)
    score_time = time.time() - start_time
    return results, score_time
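The _MockEstimator used above is not shown in this snippet. A hypothetical definition consistent with the wrapper would simply replay the precomputed predictions so that a standard scorer can consume them:

class _MockEstimator:
    # Hypothetical sketch: ignores X and returns the stored predictions.
    def __init__(self, y_pred):
        self.y_pred = y_pred

    def predict(self, X):
        return self.y_pred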
Example #17
def main(inputs, infile_estimator, outfile_eval, infile1=None, infile2=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to trained estimator input

    outfile_eval : str
        File path to save the evaluation results, tabular

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values
    """
    warnings.filterwarnings('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    X_test, y_test = _get_X_y(params, infile1, infile2)

    # load model
    estimator = load_model_from_h5(infile_estimator)
    estimator = clean_params(estimator)

    # handle scorer, convert to scorer dict
    scoring = params['scoring']
    scorer = get_scoring(scoring)
    if not isinstance(scorer, (dict, list)):
        scorer = [scoring['primary_scoring']]
    scorer = _check_multimetric_scoring(estimator, scoring=scorer)

    if hasattr(estimator, 'evaluate'):
        scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer)
    else:
        scores = _score(estimator, X_test, y_test, scorer)

    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_eval, sep='\t', header=True, index=False)
Example #18
 def score_explicit(self, clf, X_train, y_train, X_test, y_test):
     if not self.mne_labels:
         # convert labels if array, keep them if epochs and mne_labels is set
         le = LabelEncoder()
         y_train = le.fit_transform(y_train)
         y_test = le.transform(y_test)
     scorer = get_scorer(self.paradigm.scoring)
     t_start = time()
     try:
         model = clf.fit(X_train, y_train)
         score = _score(model, X_test, y_test, scorer)
     except ValueError as e:
         if self.error_score == "raise":
             raise e
         score = self.error_score
     duration = time() - t_start
     return score, duration
Example #19
 def test_score(self, X, y, cv, scoring):
     '''Return test scores of the estimator.
     '''
     # test scores
     data_splits = _split_cv(X, y=y, cv=cv, random_state=self.seed)
     scorer = self._get_scorer(scoring)
     is_multimetric = not callable(scorer)
     scores = []
     for item in data_splits:
         x0 = item[0][1]
         y0 = item[1][1]
         scores.append(
             _validation._score(self.estimator, x0, y0, scorer,
                                is_multimetric))
     scores = pd.DataFrame(scores).reset_index(drop=True)
     return scores
Example #20
    def _score_on_validation_sets(
        self,
        estimator: BaseEstimator,
        datasets: Dict[str, Tuple[pd.DataFrame, pd.Series]],
    ) -> Dict[str, Dict[str, Any]]:
        results = defaultdict(dict)

        if not datasets:
            return results

        if callable(self.scoring):
            scorers = self.scoring
        elif self.scoring is None or isinstance(self.scoring, str):
            scorers = check_scoring(estimator, self.scoring)
        else:
            scorers = _check_multimetric_scoring(estimator, self.scoring)
        for key, X_y_tuple in datasets.items():
            X_test, y_test = X_y_tuple
            start_time = time()
            try:
                test_scores = _score(estimator, X_test, y_test, scorers)
            except Exception:
                if isinstance(scorers, dict):
                    test_scores = {k: np.nan for k in scorers}
                else:
                    test_scores = np.nan
                warnings.warn(
                    f"Scoring on validation set {key} failed. The score(s) for "
                    f"this set will be set to nan. Details: \n"
                    f"{format_exc()}",
                    UserWarning,
                )
            score_time = time() - start_time
            results[key]["score_time"] = score_time
            if not isinstance(test_scores, dict):
                test_scores = {"score": test_scores}

            for name in test_scores:
                results[key][f"test_{name}"] = test_scores[name]
        return results
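For orientation, a hedged sketch of the input the method above expects; the key and data are illustrative. Each entry of the returned results dict then carries 'score_time' plus one 'test_<name>' entry per scorer.

import pandas as pd

# Mapping of validation-set name -> (features, target).
datasets = {
    'holdout': (pd.DataFrame({'x': [0.1, 0.9, 0.4]}), pd.Series([0, 1, 0])),
}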
Example #21
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """
    Return the score for a fit across one fold.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

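    # NOTE: the SMOTE keywords below (ratio, k, m, kind, ...) and the
    # fit_sample method come from an old imbalanced-learn release; current
    # versions use sampling_strategy and fit_resample instead.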
    oversampler = SMOTE(ratio='minority',
                        random_state=None,
                        k=None,
                        k_neighbors=5,
                        m=None,
                        m_neighbors=10,
                        out_step=0.5,
                        kind='regular',
                        svm_estimator=None,
                        n_jobs=1)
    X_train, y_train = oversampler.fit_sample(X_train, y_train)

    return rfe._fit(
        X_train, y_train, lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_
Example #22
def nested_fit_and_score(estimator,
                         X,
                         y,
                         scorer,
                         train,
                         test,
                         verbose=1,
                         parameters=None,
                         fit_params=None,
                         return_train_score=False,
                         return_times=False,
                         error_score='raise'):
    """

    """
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info(
            'CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
            len(X_train),
            len(X_train) - sum(y_train), sum(y_train), len(X_test),
            len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            # also assign the auxiliary accuracy score so the verbose
            # logging below never hits an undefined name
            acc_score = error_score
            if return_train_score:
                train_score = error_score
            LOG.warning(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning(
                'Test set has no positive labels, scoring has been skipped '
                'in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {'test': {'score': test_score, 'accuracy': acc_score}}

    if return_train_score:
        ret['train'] = {'score': train_score}

    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
Example #23
def _model_fit_and_score(estimator_str,
                         X,
                         y,
                         scorer,
                         train,
                         test,
                         verbose,
                         parameters,
                         fit_params,
                         return_train_score=False,
                         return_parameters=False,
                         return_n_test_samples=False,
                         return_times=False,
                         error_score='raise'):
    """

    """
    if verbose > 1:
        msg = '[CV model=%s]' % estimator_str.upper()
        if parameters is not None:
            msg += ' %s' % (', '.join('%s=%s' % (k, v)
                                      for k, v in parameters.items()))
        LOG.info("%s %s", msg, (89 - len(msg)) * '.')

    estimator = _clf_build(estimator_str)

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        scorer = check_scoring(estimator, scoring=scorer)
        test_score = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info(end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append((estimator_str, parameters))
    return ret
Example #24
def main(inputs, infile_estimator, infile1, infile2,
         outfile_result, outfile_object=None,
         outfile_weights=None, groups=None,
         ref_seq=None, intervals=None, targets=None,
         fasta_path=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    outfile_weights : str, optional
        File path to save deep learning model weights

    groups : str
        File path to dataset containing groups labels

    ref_seq : str
        File path to dataset containing genome sequence file

    intervals : str
        File path to dataset containing interval file

    targets : str
        File path to dataset compressed target bed file

    fasta_path : str
        File path to dataset containing fasta file
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    #  load estimator
    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    # swap hyperparameter
    swapping = params['experiment_schemes']['hyperparams_swapping']
    swap_params = _eval_swap_params(swapping)
    estimator.set_params(**swap_params)

    estimator_params = estimator.get_params()

    # store read dataframe object
    loaded_df = {}

    input_type = params['input_options']['selected_input']
    # tabular input
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep='\t', header=header,
                         parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    # sparse input
    elif input_type == 'sparse':
        X = mmread(open(infile1, 'r'))

    # fasta_file input
    elif input_type == 'seq_fasta':
        pyfaidx = get_module('pyfaidx')
        sequences = pyfaidx.Fasta(fasta_path)
        n_seqs = len(sequences.keys())
        X = np.arange(n_seqs)[:, np.newaxis]
        for param in estimator_params.keys():
            if param.endswith('fasta_path'):
                estimator.set_params(
                    **{param: fasta_path})
                break
        else:
            raise ValueError(
                "The selected estimator doesn't support "
                "fasta file input! Please consider using "
                "KerasGBatchClassifier with "
                "FastaDNABatchGenerator/FastaProteinBatchGenerator "
                "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
                "in pipeline!")

    elif input_type == 'refseq_and_interval':
        path_params = {
            'data_batch_generator__ref_genome_path': ref_seq,
            'data_batch_generator__intervals_path': intervals,
            'data_batch_generator__target_path': targets
        }
        estimator.set_params(**path_params)
        n_intervals = sum(1 for line in open(intervals))
        X = np.arange(n_intervals)[:, np.newaxis]

    # Get target y
    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in ['by_index_number', 'all_but_by_index_number',
                         'by_header_name', 'all_but_by_header_name']:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None

    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        infile2 = loaded_df[df_key]
    else:
        infile2 = pd.read_csv(infile2, sep='\t',
                              header=header, parse_dates=True)
        loaded_df[df_key] = infile2

    y = read_columns(
            infile2,
            c=c,
            c_option=column_option,
            sep='\t',
            header=header,
            parse_dates=True)
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()
    if input_type == 'refseq_and_interval':
        estimator.set_params(
            data_batch_generator__features=y.ravel().tolist())
        y = None
    # end y

    # load groups
    if groups:
        groups_selector = (params['experiment_schemes']['test_split']
                                 ['split_algos']).pop('groups_selector')

        header = 'infer' if groups_selector['header_g'] else None
        column_option = \
            (groups_selector['column_selector_options_g']
                            ['selected_column_selector_option_g'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = groups_selector['column_selector_options_g']['col_g']
        else:
            c = None

        df_key = groups + repr(header)
        if df_key in loaded_df:
            groups = loaded_df[df_key]

        groups = read_columns(
                groups,
                c=c,
                c_option=column_option,
                sep='\t',
                header=header,
                parse_dates=True)
        groups = groups.ravel()

    del loaded_df

    # handle memory
    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    # caching iraps_core fits could increase search speed significantly
    if estimator.__class__.__name__ == 'IRAPSClassifier':
        estimator.set_params(memory=memory)
    else:
        # For iraps buried in pipeline
        new_params = {}
        for p, v in estimator_params.items():
            if p.endswith('memory'):
                # for case of `__irapsclassifier__memory`
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    # caching iraps_core fits could increase search
                    # speed significantly
                    new_params[p] = memory
                # security reason, we don't want memory being
                # modified unexpectedly
                elif v:
                    new_params[p] = None
            # handle n_jobs
            elif p.endswith('n_jobs'):
                # For now, 1 CPU is suggested for irapsclassifier
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    new_params[p] = 1
                else:
                    new_params[p] = N_JOBS
            # for security reason, types of callback are limited
            elif p.endswith('callbacks'):
                for cb in v:
                    cb_type = cb['callback_selection']['callback_type']
                    if cb_type not in ALLOWED_CALLBACKS:
                        raise ValueError(
                            "Prohibited callback type: %s!" % cb_type)

        estimator.set_params(**new_params)

    # handle scorer, convert to scorer dict
    scoring = params['experiment_schemes']['metrics']['scoring']
    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # handle test (first) split
    test_split_options = (params['experiment_schemes']
                                ['test_split']['split_algos'])

    if test_split_options['shuffle'] == 'group':
        test_split_options['labels'] = groups
    if test_split_options['shuffle'] == 'stratified':
        if y is not None:
            test_split_options['labels'] = y
        else:
            raise ValueError("Stratified shuffle split is not "
                             "applicable on empty target values!")

    X_train, X_test, y_train, y_test, groups_train, groups_test = \
        train_test_split_none(X, y, groups, **test_split_options)

    exp_scheme = params['experiment_schemes']['selected_exp_scheme']

    # handle validation (second) split
    if exp_scheme == 'train_val_test':
        val_split_options = (params['experiment_schemes']
                                   ['val_split']['split_algos'])

        if val_split_options['shuffle'] == 'group':
            val_split_options['labels'] = groups_train
        if val_split_options['shuffle'] == 'stratified':
            if y_train is not None:
                val_split_options['labels'] = y_train
            else:
                raise ValueError("Stratified shuffle split is not "
                                 "applicable on empty target values!")

        X_train, X_val, y_train, y_val, groups_train, groups_val = \
            train_test_split_none(X_train, y_train, groups_train,
                                  **val_split_options)

    # train and eval
    if hasattr(estimator, 'validation_data'):
        if exp_scheme == 'train_val_test':
            estimator.fit(X_train, y_train,
                          validation_data=(X_val, y_val))
        else:
            estimator.fit(X_train, y_train,
                          validation_data=(X_test, y_test))
    else:
        estimator.fit(X_train, y_train)

    if hasattr(estimator, 'evaluate'):
        scores = estimator.evaluate(X_test, y_test=y_test,
                                    scorer=scorer,
                                    is_multimetric=True)
    else:
        scores = _score(estimator, X_test, y_test, scorer,
                        is_multimetric=True)
    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_result, sep='\t',
              header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        main_est = estimator
        if isinstance(estimator, pipeline.Pipeline):
            main_est = estimator.steps[-1][-1]

        if hasattr(main_est, 'model_') \
                and hasattr(main_est, 'save_weights'):
            if outfile_weights:
                main_est.save_weights(outfile_weights)
            del main_est.model_
            del main_est.fit_params
            del main_est.model_class_
            del main_est.validation_data
            if getattr(main_est, 'data_generator_', None):
                del main_est.data_generator_

        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(estimator, output_handler,
                        pickle.HIGHEST_PROTOCOL)
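The output handling at the end of the example above is a recurring pattern in these snippets; a standalone sketch:

import pandas as pd

# Wrap each scalar score in a list so the dict becomes a one-row
# DataFrame, then sort the columns for a stable tabular layout.
scores = {'f1_macro': 0.81, 'accuracy': 0.90}
df = pd.DataFrame({k: [v] for k, v in scores.items()})
df = df[sorted(df.columns)]
df.to_csv('scores.tsv', sep='\t', header=True, index=False)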
Example #25
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   return_n_test_samples=False,
                   return_times=False,
                   return_estimator=False,
                   split_progress=None,
                   candidate_progress=None,
                   error_score=np.nan,
                   online_train_val_split=False):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape (n_samples, n_features)
        The data to fit.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The target variable to try to predict in the case of
        supervised learning.
    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.
        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.
        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.
    train : array-like of shape (n_train_samples,)
        Indices of training samples.
    test : array-like of shape (n_test_samples,)
        Indices of test samples.
    verbose : int
        The verbosity level.
    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    return_train_score : bool, default=False
        Compute and return score on training set.
    return_parameters : bool, default=False
        Return the parameters that have been used for the estimator.
    split_progress : {list, tuple} of int, default=None
        A list or tuple of format (<current_split_id>, <total_num_of_splits>).
    candidate_progress : {list, tuple} of int, default=None
        A list or tuple of format
        (<current_candidate_id>, <total_number_of_candidates>).
    return_n_test_samples : bool, default=False
        Whether to return the ``n_test_samples``.
    return_times : bool, default=False
        Whether to return the fit/score times.
    return_estimator : bool, default=False
        Whether to return the fitted estimator.
    Returns
    -------
    result : dict with the following attributes
        train_scores : dict of scorer name -> float
            Score on training set (for all the scorers),
            returned only if `return_train_score` is `True`.
        test_scores : dict of scorer name -> float
            Score on testing set (for all the scorers).
        n_test_samples : int
            Number of test samples.
        fit_time : float
            Time spent for fitting in seconds.
        score_time : float
            Time spent for scoring in seconds.
        parameters : dict or None
            The parameters that have been evaluated.
        estimator : estimator object
            The fitted estimator.
        fit_failed : bool
            Whether the estimator failed to fit.
    """
    if not isinstance(error_score, numbers.Number) and error_score != 'raise':
        raise ValueError(
            "error_score must be the string 'raise' or a numeric value. "
            "(Hint: if using 'raise', please make sure that it has been "
            "spelled correctly.)")

    progress_msg = ""
    if verbose > 2:
        if split_progress is not None:
            progress_msg = f" {split_progress[0]+1}/{split_progress[1]}"
        if candidate_progress and verbose > 9:
            progress_msg += (f"; {candidate_progress[0]+1}/"
                             f"{candidate_progress[1]}")

    if verbose > 1:
        if parameters is None:
            params_msg = ''
        else:
            sorted_keys = sorted(parameters)  # Ensure deterministic o/p
            params_msg = (', '.join(f'{k}={parameters[k]}'
                                    for k in sorted_keys))
    if verbose > 9:
        start_msg = f"[CV{progress_msg}] START {params_msg}"
        print(f"{start_msg}{(80 - len(start_msg)) * '.'}")

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    if online_train_val_split:
        # inject the train and test data into the corresponding Subset selectors
        set_train_estim = False
        set_test_estim = False
        for estim in estimator:
            if set_train_estim and set_test_estim:
                break
            if isinstance(estim, TrainSubset):
                estim.date_range = [train]
                set_train_estim = True

            if isinstance(estim, TestSubset):
                estim.date_range = [test]
                set_test_estim = True
            if isinstance(estim, CVSubset) and isinstance(
                    estim, EvalOnlyWrapper):
                estim.set_range(test)
                set_test_estim = True
            if isinstance(estim, CVSubset) and isinstance(
                    estim, TrainOnlyWrapper):
                estim.set_range(train)
                set_train_estim = True

        if not set_train_estim or not set_test_estim:
            raise ValueError(
                "when specifying online learning a KeepTrain and KeepTest have to be in the pipeline"
            )
    else:
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)

    result = {}
    try:
        if online_train_val_split:
            estimator = estimator.train()
            estimator.fit(X, y, **fit_params)
        else:
            if y_train is None:
                estimator.fit(X_train, **fit_params)
            else:
                estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        result["fit_failed"] = True
        y_sample_len = len(test)
    else:
        result["fit_failed"] = False

        fit_time = time.time() - start_time

        estimator.eval()
        if online_train_val_split:
            # select estimator without the classifier and transform x and y
            # to retrieve y_test
            _, y_prime = estimator[:-1].transform(X, y)
            if isinstance(y_prime, pd.DataFrame) and len(y_prime.columns) == 1:
                y_prime = y_prime.T.values.squeeze()

            y_sample_len = len(y_prime)
            test_scores = _score(estimator, X, y_prime, scorer)
        else:
            test_scores = _score(estimator, X_test, y_test, scorer,
                                 error_score)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            if online_train_val_split:
                estimator.train()

                _, y_prime = estimator[:-1].transform(X, y)
                if isinstance(y_prime, pd.DataFrame) and len(
                        y_prime.columns) == 1:
                    y_prime = y_prime.T.values.squeeze()
                train_scores = _score(estimator, X, y_prime, scorer)

                estimator.eval()
            else:
                train_scores = _score(estimator, X_train, y_train, scorer,
                                      error_score)

    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = f"[CV{progress_msg}] END "
        result_msg = params_msg + (";" if params_msg else "")
        if verbose > 2 and isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                result_msg += f" {scorer_name}: ("
                if return_train_score:
                    scorer_scores = train_scores[scorer_name]
                    result_msg += f"train={scorer_scores:.3f}, "
                result_msg += f"test={test_scores[scorer_name]:.3f})"
        result_msg += f" total time={logger.short_format_time(total_time)}"

        # Right align the result_msg
        end_msg += "." * (80 - len(end_msg) - len(result_msg))
        end_msg += result_msg
        print(end_msg)

    result["test_scores"] = test_scores
    if return_train_score:
        result["train_scores"] = train_scores
    if return_n_test_samples:
        if online_train_val_split:
            result["n_test_samples"] = y_sample_len
        else:
            result["n_test_samples"] = _num_samples(X_test)
    if return_times:
        result["fit_time"] = fit_time
        result["score_time"] = score_time
    if return_parameters:
        result["parameters"] = parameters
    if return_estimator:
        result["estimator"] = estimator
    return result
Example #26
def fit_and_score_te_oracle(estimator,
                            X,
                            y,
                            w,
                            p,
                            t,
                            scorer,
                            train,
                            test,
                            parameters=None,
                            fit_params=None,
                            return_train_score=False,
                            return_parameters=False,
                            return_times=False,
                            return_estimator=False,
                            error_score=np.nan,
                            return_test_score_only=False):
    """Fit estimator and compute scores for a given dataset split, using oracle knowledge of
    treatment effects. Based on sklearn.model_selection._validation _fit_and_score, adapted to
    allow more inputs (treatments and treatment effects)

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape (n_samples, n_features)
        The features to fit to.
    y : array-like of shape (n_samples,)
        The outcome variable.
    w : array-like of shape (n_samples,)
        The treatment indicator.
    p : array-like of shape (n_samples,)
        The treatment propensity.
    t : array-like of shape (n_samples,)
        The true treatment effect to evaluate against.
    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.
        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.
        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.
    train : array-like of shape (n_train_samples,)
        Indices of training samples.
    test : array-like of shape (n_test_samples,)
        Indices of test samples.
    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    return_train_score : bool, default=False
        Compute and return score on training set.
    return_parameters : bool, default=False
        Return the parameters that have been used for the estimator.
    return_times : bool, default=False
        Whether to return the fit/score times.
    return_estimator : bool, default=False
        Whether to return the fitted estimator.
    return_test_score_only : bool, default=False
        Whether to return only the test score.

    Returns
    -------
    train_scores : dict of scorer name -> float
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.
    test_scores : float or dict of scorer name -> float
        If ``return_test_score_only`` is True and ``scorer`` is a single
        string, only the test score is returned. Otherwise, the score(s)
        on the testing set (for all the scorers).
    n_test_samples : int
        Number of test samples.
    fit_time : float
        Time spent for fitting in seconds.
    score_time : float
        Time spent for scoring in seconds.
    parameters : dict or None
        The parameters that have been evaluated.
    estimator : estimator object
        The fitted estimator
    """
    if not isinstance(estimator, BaseTEModel):
        raise ValueError("This method works only for BaseTEModel")

    scorers, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # Adjust length of sample weights (if any)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    train_scores = {}
    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    X_train, y_train, w_train, p_train, t_train = _safe_split_te(
        X, y, w, p, t, train)
    X_test, y_test, w_test, p_test, t_test = _safe_split_te(
        X, y, w, p, t, test)

    try:
        estimator.fit(X_train, y_train, w_train, p_train, **fit_params)

    except Exception:
        if return_test_score_only:
            if error_score == 'raise':
                raise
            else:
                return np.nan
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        try:
            test_scores = _score(estimator, X_test, t_test, scorers)
        except Exception:
            if return_test_score_only:
                if error_score == 'raise':
                    raise
                else:
                    return np.nan
            # Re-raise for full-result requests: otherwise the exception
            # would be silently swallowed and test_scores left undefined.
            raise

        score_time = time.time() - start_time - fit_time

        if return_test_score_only:
        if isinstance(scorer, str):
                return test_scores['score']
            else:
                return test_scores

        if return_train_score:
            train_scores = _score(estimator, X_train, t_train, scorers)

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    return ret
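
A minimal usage sketch for fit_and_score_te_oracle, assuming a hypothetical BaseTEModel subclass named MyTEModel and synthetic data:

# Usage sketch (MyTEModel is a hypothetical BaseTEModel subclass).
import numpy as np

rng = np.random.RandomState(0)
n = 100
X = rng.randn(n, 5)              # covariates
y = rng.randn(n)                 # observed outcomes
w = rng.randint(0, 2, n)         # binary treatment indicator
p = np.full(n, 0.5)              # known treatment propensities
t = rng.randn(n)                 # oracle treatment effects
train, test = np.arange(80), np.arange(80, n)

score = fit_and_score_te_oracle(MyTEModel(), X, y, w, p, t,
                                scorer='neg_mean_squared_error',
                                train=train, test=test,
                                error_score=np.nan,
                                return_test_score_only=True)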
Exemplo n.º 31
0
def _fit_and_score_keras2(method,
                          X,
                          y,
                          scorer,
                          train,
                          test,
                          verbose,
                          parameters,
                          fit_params,
                          type="Classification",
                          return_train_score=False,
                          return_parameters=False,
                          return_n_test_samples=False,
                          return_times=False,
                          error_score='raise'):
    """Fit estimator and compute scores for a given dataset split for KerasClassifier and KerasRegressor.

    Parameters
    ----------
    method : callable
        Function that builds and returns a compiled Keras model; it is
        passed as ``build_fn`` to the Keras scikit-learn wrapper.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return the parameters that have been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    type : str, optional, default: "Classification"
        Whether to wrap ``method`` in a ``KerasClassifier`` or a
        ``KerasRegressor``.

    Note: a fresh Keras backend session is created before fitting and
    cleared with ``K.clear_session()`` after the classifier or regressor
    has been trained and scored on the split. This is mainly required to
    avoid possible Out Of Memory errors caused by TensorFlow not
    deallocating GPU memory after each iteration of the cross validation.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    from keras import backend as K
    import tensorflow as tf
    tf.logging.set_verbosity(
        tf.logging.ERROR)  # suppress TensorFlow's info-level logging
    # The next four lines keep TensorFlow from allocating all of the GPU
    # memory up front
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    test_scores = {}
    train_scores = {}
    estimator = None
    if type == "Classification":
        from keras.wrappers.scikit_learn import KerasClassifier
        estimator = KerasClassifier(build_fn=method, verbose=0)
    else:
        from keras.wrappers.scikit_learn import KerasRegressor
        estimator = KerasRegressor(build_fn=method, verbose=0)

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(
                    zip(scorer.keys(), [
                        error_score,
                    ] * n_scorers))
                if return_train_score:
                    train_scores = dict(
                        zip(scorer.keys(), [
                            error_score,
                        ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    # Drop the estimator and clear the Keras session so TensorFlow
    # releases the graph (and GPU memory) before the next fold
    del estimator
    K.clear_session()
    # Call the garbage collector repeatedly to make sure the estimator
    # is actually freed from memory
    for i in range(15):
        gc.collect()
    return ret
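
The session handling above follows the usual TF1-era recipe: open a fresh session with allow_growth before fitting, then clear it so the next fold starts from a clean graph. A standalone sketch of that pattern (TF1 API; build_model is hypothetical):

# Per-fold Keras/TF1 session hygiene, mirroring the helper above.
import gc
from keras import backend as K
import tensorflow as tf

def run_one_fold(build_model, X_train, y_train):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True   # allocate GPU memory lazily
    K.set_session(tf.Session(config=config))
    try:
        model = build_model()                # build_model is hypothetical
        model.fit(X_train, y_train, verbose=0)
    finally:
        K.clear_session()                    # release graph and GPU memory
        gc.collect()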
Exemplo n.º 32
0
def main(inputs,
         infile_estimator,
         infile1,
         infile2,
         outfile_result,
         outfile_object=None,
         outfile_weights=None,
         outfile_y_true=None,
         outfile_y_preds=None,
         groups=None,
         ref_seq=None,
         intervals=None,
         targets=None,
         fasta_path=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    outfile_weights : str, optional
        File path to save deep learning model weights

    outfile_y_true : str, optional
        File path to save the true target values used for prediction

    outfile_y_preds : str, optional
        File path to save the model predictions

    groups : str
        File path to dataset containing groups labels

    ref_seq : str
        File path to dataset containing genome sequence file

    intervals : str
        File path to dataset containing interval file

    targets : str
        File path to dataset compressed target bed file

    fasta_path : str
        File path to dataset containing fasta file
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    # load estimator
    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    estimator = clean_params(estimator)

    # swap hyperparameter
    swapping = params['experiment_schemes']['hyperparams_swapping']
    swap_params = _eval_swap_params(swapping)
    estimator.set_params(**swap_params)

    estimator_params = estimator.get_params()

    # store read dataframe object
    loaded_df = {}

    input_type = params['input_options']['selected_input']
    # tabular input
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in [
                'by_index_number', 'all_but_by_index_number', 'by_header_name',
                'all_but_by_header_name'
        ]:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    # sparse input
    elif input_type == 'sparse':
        with open(infile1, 'r') as f:
            X = mmread(f)

    # fasta_file input
    elif input_type == 'seq_fasta':
        pyfaidx = get_module('pyfaidx')
        sequences = pyfaidx.Fasta(fasta_path)
        n_seqs = len(sequences.keys())
        X = np.arange(n_seqs)[:, np.newaxis]
        for param in estimator_params.keys():
            if param.endswith('fasta_path'):
                estimator.set_params(**{param: fasta_path})
                break
        else:
            raise ValueError(
                "The selected estimator doesn't support "
                "fasta file input! Please consider using "
                "KerasGBatchClassifier with "
                "FastaDNABatchGenerator/FastaProteinBatchGenerator "
                "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
                "in pipeline!")

    elif input_type == 'refseq_and_interval':
        path_params = {
            'data_batch_generator__ref_genome_path': ref_seq,
            'data_batch_generator__intervals_path': intervals,
            'data_batch_generator__target_path': targets
        }
        estimator.set_params(**path_params)
        with open(intervals) as f:
            n_intervals = sum(1 for line in f)
        X = np.arange(n_intervals)[:, np.newaxis]

    # Get target y
    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in [
            'by_index_number', 'all_but_by_index_number', 'by_header_name',
            'all_but_by_header_name'
    ]:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None

    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        infile2 = loaded_df[df_key]
    else:
        infile2 = pd.read_csv(infile2,
                              sep='\t',
                              header=header,
                              parse_dates=True)
        loaded_df[df_key] = infile2

    y = read_columns(infile2,
                     c=c,
                     c_option=column_option,
                     sep='\t',
                     header=header,
                     parse_dates=True)
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()
    if input_type == 'refseq_and_interval':
        estimator.set_params(data_batch_generator__features=y.ravel().tolist())
        y = None
    # end y

    # load groups
    if groups:
        groups_selector = (params['experiment_schemes']['test_split']
                           ['split_algos']).pop('groups_selector')

        header = 'infer' if groups_selector['header_g'] else None
        column_option = \
            (groups_selector['column_selector_options_g']
                            ['selected_column_selector_option_g'])
        if column_option in [
                'by_index_number', 'all_but_by_index_number', 'by_header_name',
                'all_but_by_header_name'
        ]:
            c = groups_selector['column_selector_options_g']['col_g']
        else:
            c = None

        df_key = groups + repr(header)
        if df_key in loaded_df:
            groups = loaded_df[df_key]

        groups = read_columns(groups,
                              c=c,
                              c_option=column_option,
                              sep='\t',
                              header=header,
                              parse_dates=True)
        groups = groups.ravel()

    # free the cached dataframes
    del loaded_df

    # caching iraps_core fits can speed up the search significantly
    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    main_est = get_main_estimator(estimator)
    if main_est.__class__.__name__ == 'IRAPSClassifier':
        main_est.set_params(memory=memory)

    # handle scorer, convert to scorer dict
    scoring = params['experiment_schemes']['metrics']['scoring']
    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # handle test (first) split
    test_split_options = (
        params['experiment_schemes']['test_split']['split_algos'])

    if test_split_options['shuffle'] == 'group':
        test_split_options['labels'] = groups
    if test_split_options['shuffle'] == 'stratified':
        if y is not None:
            test_split_options['labels'] = y
        else:
            raise ValueError("Stratified shuffle split is not "
                             "applicable on empty target values!")

    X_train, X_test, y_train, y_test, groups_train, groups_test = \
        train_test_split_none(X, y, groups, **test_split_options)

    exp_scheme = params['experiment_schemes']['selected_exp_scheme']

    # handle validation (second) split
    if exp_scheme == 'train_val_test':
        val_split_options = (
            params['experiment_schemes']['val_split']['split_algos'])

        if val_split_options['shuffle'] == 'group':
            val_split_options['labels'] = groups_train
        if val_split_options['shuffle'] == 'stratified':
            if y_train is not None:
                val_split_options['labels'] = y_train
            else:
                raise ValueError("Stratified shuffle split is not "
                                 "applicable on empty target values!")

        X_train, X_val, y_train, y_val, groups_train, groups_val = \
            train_test_split_none(X_train, y_train, groups_train,
                                  **val_split_options)

    # train and eval
    if hasattr(estimator, 'config') and hasattr(estimator, 'model_type'):
        if exp_scheme == 'train_val_test':
            estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
        else:
            estimator.fit(X_train, y_train, validation_data=(X_test, y_test))
    else:
        estimator.fit(X_train, y_train)

    if isinstance(estimator, KerasGBatchClassifier):
        steps = estimator.prediction_steps
        batch_size = estimator.batch_size
        data_generator = estimator.data_generator_

        scores, predictions, y_true = _evaluate_keras_and_sklearn_scores(
            estimator,
            data_generator,
            X_test,
            y=y_test,
            sk_scoring=scorer,  # the sklearn scorer dict built above
            steps=steps,
            batch_size=batch_size,
            return_predictions=bool(outfile_y_true))

    else:
        scores = {}
        if hasattr(estimator, 'model_') \
                and hasattr(estimator.model_, 'metrics_names'):
            batch_size = estimator.batch_size
            score_results = estimator.model_.evaluate(X_test,
                                                      y=y_test,
                                                      batch_size=batch_size,
                                                      verbose=0)
            metrics_names = estimator.model_.metrics_names
            if not isinstance(metrics_names, list):
                scores[metrics_names] = score_results
            else:
                scores = dict(zip(metrics_names, score_results))

        if hasattr(estimator, 'predict_proba'):
            predictions = estimator.predict_proba(X_test)
        else:
            predictions = estimator.predict(X_test)

        y_true = y_test
        sk_scores = _score(estimator,
                           X_test,
                           y_test,
                           scorer,
                           is_multimetric=True)
        scores.update(sk_scores)

    # handle output
    if outfile_y_true:
        try:
            pd.DataFrame(y_true).to_csv(outfile_y_true, sep='\t', index=False)
            pd.DataFrame(predictions).astype(np.float32).to_csv(
                outfile_y_preds,
                sep='\t',
                index=False,
                float_format='%g',
                chunksize=10000)
        except Exception as e:
            print("Error in saving predictions: %s" % e)
    # save scores
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        main_est = estimator
        if isinstance(estimator, Pipeline):
            main_est = estimator.steps[-1][-1]

        if hasattr(main_est, 'model_') \
                and hasattr(main_est, 'save_weights'):
            if outfile_weights:
                main_est.save_weights(outfile_weights)
            del main_est.model_
            del main_est.fit_params
            del main_est.model_class_
            main_est.callbacks = []
            if getattr(main_est, 'data_generator_', None):
                del main_est.data_generator_

        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
Exemplo n.º 33
0
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfe._fit(
        X_train, y_train, lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_
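
_rfe_single_fit scores one CV split of a recursive feature elimination: the lambda passed to RFE._fit re-scores the surviving feature subset on the held-out fold at every elimination step. A usage sketch (it relies on scikit-learn's private RFE._fit hook, so it is version-sensitive):

# Sketch: per-step held-out scores for one CV split of an RFE.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring

X, y = make_classification(n_samples=60, n_features=8, random_state=0)
train, test = np.arange(40), np.arange(40, 60)
est = LogisticRegression(max_iter=1000)
rfe = RFE(est, n_features_to_select=2)
scorer = check_scoring(est, scoring='accuracy')
step_scores = _rfe_single_fit(rfe, est, X, y, train, test, scorer)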
Exemplo n.º 34
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise',
                   return_estimator=False, return_idx=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return the parameters that have been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                      for k, v in fit_params.items()])

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    # split the data separately for each patient
    X_train, y_train, X_test, y_test = _safe_split_multi(
        estimator, X, y, train, test)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                   [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                        [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    if return_idx:
        ret.extend([train, test])
    return ret
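
The scorer argument in these helpers is either a single callable or a dict of named callables; with a dict, is_multimetric is True and scores come back as {name: float} mappings. A minimal sketch of building both forms with scikit-learn's public make_scorer:

# Both scorer forms accepted by the _fit_and_score helpers above.
from sklearn.metrics import make_scorer, accuracy_score, f1_score

single_scorer = make_scorer(accuracy_score)   # scores returned as plain floats

multi_scorer = {                              # scores returned as name -> float
    'acc': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
}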
Exemplo n.º 35
0
def _do_train_test_split_val(searcher,
                             X,
                             y,
                             params,
                             error_score='raise',
                             primary_scoring=None,
                             groups=None,
                             outfile=None):
    """ do train test split, searchCV validates on the train and then use
    the best_estimator_ to evaluate on the test

    Returns
    --------
    Fitted SearchCV object
    """
    train_test_split = try_get_attr('galaxy_ml.model_validations',
                                    'train_test_split')
    split_options = params['outer_split']

    # splits
    if split_options['shuffle'] == 'stratified':
        split_options['labels'] = y
        X, X_test, y, y_test = train_test_split(X, y, **split_options)
    elif split_options['shuffle'] == 'group':
        if groups is None:
            raise ValueError("No group based CV option was choosen for "
                             "group shuffle!")
        split_options['labels'] = groups
        if y is None:
            X, X_test, groups, _ =\
                train_test_split(X, groups, **split_options)
        else:
            X, X_test, y, y_test, groups, _ =\
                train_test_split(X, y, groups, **split_options)
    else:
        if split_options['shuffle'] == 'None':
            split_options['shuffle'] = None
        X, X_test, y, y_test =\
            train_test_split(X, y, **split_options)

    if error_score == 'raise':
        searcher.fit(X, y, groups=groups)
    else:
        warnings.simplefilter('always', FitFailedWarning)
        with warnings.catch_warnings(record=True) as w:
            try:
                searcher.fit(X, y, groups=groups)
            except ValueError:
                pass
            for warning in w:
                print(repr(warning.message))

    scorer_ = searcher.scorer_
    if isinstance(scorer_, collections.abc.Mapping):
        is_multimetric = True
    else:
        is_multimetric = False

    best_estimator_ = searcher.best_estimator_

    # TODO: handle deep learning models nested inside a Pipeline
    if best_estimator_.__class__.__name__ == 'KerasGBatchClassifier':
        test_score = best_estimator_.evaluate(X_test,
                                              scorer=scorer_,
                                              is_multimetric=is_multimetric)
    else:
        test_score = _score(best_estimator_,
                            X_test,
                            y_test,
                            scorer_,
                            is_multimetric=is_multimetric)

    if not is_multimetric:
        test_score = {primary_scoring: test_score}
    for key, value in test_score.items():
        test_score[key] = [value]
    result_df = pd.DataFrame(test_score)
    result_df.to_csv(path_or_buf=outfile, sep='\t', header=True, index=False)

    return searcher
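
A hypothetical invocation of _do_train_test_split_val: the params['outer_split'] options are forwarded to galaxy_ml's train_test_split, so the option keys below are assumptions:

# Hypothetical call; searcher, X, y come from the surrounding pipeline.
import numpy as np

params = {'outer_split': {'shuffle': 'stratified',
                          'test_size': 0.25,
                          'random_state': 0}}
searcher = _do_train_test_split_val(searcher, X, y, params,
                                    error_score=np.nan,
                                    primary_scoring='accuracy',
                                    outfile='test_scores.tsv')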
Exemplo n.º 36
0
def _fit_and_score_multisignal(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise', logger=logger):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return the parameters that have been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        logger.info("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                      for k, v in fit_params.items()])

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split_multisignal(estimator, X, y, train)
    X_test, y_test = _safe_split_multisignal(estimator, X, y, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                   [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                        [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            logger.warning("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, short_format_time(total_time))
        logger.info("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
def main(inputs, infile_estimator, infile1, infile2,
         outfile_result, outfile_object=None, groups=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    groups : str
        File path to dataset containing groups labels
    """

    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)
    if groups:
        (params['search_schemes']['options']['cv_selector']
         ['groups_selector']['infile_g']) = groups

    params_builder = params['search_schemes']['search_params_builder']

    input_type = params['input_options']['selected_input']
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None
        X = read_columns(
                infile1,
                c=c,
                c_option=column_option,
                sep='\t',
                header=header,
                parse_dates=True).astype(float)
    else:
        with open(infile1, 'r') as f:
            X = mmread(f)

    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in ['by_index_number', 'all_but_by_index_number',
                         'by_header_name', 'all_but_by_header_name']:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None
    y = read_columns(
            infile2,
            c=c,
            c_option=column_option,
            sep='\t',
            header=header,
            parse_dates=True)
    y = y.ravel()

    optimizer = params['search_schemes']['selected_search_scheme']
    optimizer = getattr(model_selection, optimizer)

    options = params['search_schemes']['options']

    splitter, groups = get_cv(options.pop('cv_selector'))
    options['cv'] = splitter
    options['n_jobs'] = N_JOBS
    primary_scoring = options['scoring']['primary_scoring']
    options['scoring'] = get_scoring(options['scoring'])
    if options['error_score']:
        options['error_score'] = 'raise'
    else:
        options['error_score'] = np.NaN
    if options['refit'] and isinstance(options['scoring'], dict):
        options['refit'] = primary_scoring
    if 'pre_dispatch' in options and options['pre_dispatch'] == '':
        options['pre_dispatch'] = None

    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    # caching iraps_core fits can speed up the search significantly
    if estimator.__class__.__name__ == 'IRAPSClassifier':
        estimator.set_params(memory=memory)
    else:
        for p, v in estimator.get_params().items():
            if p.endswith('memory'):
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    # caching iraps_core fits can speed up the search
                    # significantly
                    new_params = {p: memory}
                    estimator.set_params(**new_params)
                elif v:
                    new_params = {p: None}
                    estimator.set_params(**new_params)
            elif p.endswith('n_jobs'):
                new_params = {p: 1}
                estimator.set_params(**new_params)

    param_grid = _eval_search_params(params_builder)
    searcher = optimizer(estimator, param_grid, **options)

    # do train_test_split
    do_train_test_split = params['train_test_split'].pop('do_split')
    if do_train_test_split == 'yes':
        # make sure refit is chosen
        if not options['refit']:
            raise ValueError("Refit must be `True` for shuffle splitting!")
        split_options = params['train_test_split']

        # splits
        if split_options['shuffle'] == 'stratified':
            split_options['labels'] = y
            X, X_test, y, y_test = train_test_split(X, y, **split_options)
        elif split_options['shuffle'] == 'group':
            if groups is None:
                raise ValueError("No group based CV option was "
                                 "chosen for group shuffle!")
            split_options['labels'] = groups
            X, X_test, y, y_test, groups, _ =\
                train_test_split(X, y, groups, **split_options)
        else:
            if split_options['shuffle'] == 'None':
                split_options['shuffle'] = None
            X, X_test, y, y_test =\
                train_test_split(X, y, **split_options)
    # end train_test_split

    if options['error_score'] == 'raise':
        searcher.fit(X, y, groups=groups)
    else:
        warnings.simplefilter('always', FitFailedWarning)
        with warnings.catch_warnings(record=True) as w:
            try:
                searcher.fit(X, y, groups=groups)
            except ValueError:
                pass
            for warning in w:
                print(repr(warning.message))

    if do_train_test_split == 'no':
        # save results
        cv_results = pandas.DataFrame(searcher.cv_results_)
        cv_results = cv_results[sorted(cv_results.columns)]
        cv_results.to_csv(path_or_buf=outfile_result, sep='\t',
                          header=True, index=False)

    # output test result using best_estimator_
    else:
        best_estimator_ = searcher.best_estimator_
        if isinstance(options['scoring'], collections.abc.Mapping):
            is_multimetric = True
        else:
            is_multimetric = False

        test_score = _score(best_estimator_, X_test,
                            y_test, options['scoring'],
                            is_multimetric=is_multimetric)
        if not is_multimetric:
            test_score = {primary_scoring: test_score}
        for key, value in test_score.items():
            test_score[key] = [value]
        result_df = pandas.DataFrame(test_score)
        result_df.to_csv(path_or_buf=outfile_result, sep='\t',
                         header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(searcher, output_handler, pickle.HIGHEST_PROTOCOL)