Example #1
    def fit_transform(self, Xs, y=None, groups=None, **fit_params):
        is_classifier = utils.is_classifier(self.multichannel_predictor)
        if y is not None and is_classifier:
            self.classes_, y = np.unique(y, return_inverse=True)

        self.fit(Xs, y, **fit_params)
        transform_method = self.get_transform_method()

        # if internal cv training is disabled, transform with the fitted model
        if (self.internal_cv is None
                or (type(self.internal_cv) == int and self.internal_cv < 2)):
            y_transform = np.array(getattr(self.model, transform_method)(Xs))
        # internal cv training is enabled
        else:
            split_results = cross_val_predict(
                self.multichannel_predictor,
                Xs,
                y,
                groups=groups,
                predict_method=None,
                transform_method=transform_method,
                score_method=self.score_method,
                cv=self.internal_cv,
                combine_splits=True,
                n_processes=self.cv_processes,
                fit_params=fit_params)

            y_transform = split_results['transform']['y_pred']
            y_score = split_results['score']['y_pred']
            is_binary = is_classifier and len(self.classes_) == 2
            score_method = split_results['score']['method']
            self.score_ = score_predictions(y, y_score, score_method,
                                            self.scorer, is_classifier,
                                            is_binary)

        # convert predictions to transformed matrix:
        X_t = y_transform
        if len(X_t.shape) == 1:
            X_t = X_t.reshape(-1, 1)

        # drop the redundant prob output from binary classifiers:
        elif (len(X_t.shape) == 2 and X_t.shape[1] == 2
              and utils.is_classifier(self.model)):
            X_t = X_t[:, 1].reshape(-1, 1)

        Xs_t = [None for X in Xs]
        Xs_t[0] = X_t

        return Xs_t
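
The internal-CV branch above exists so that the meta-features produced by fit_transform come from models that never saw the samples they predict, which keeps target information from leaking into downstream meta-predictors. A minimal single-channel sketch of the same idea, using scikit-learn's cross_val_predict (base_clf and the dataset are illustrative, not part of pipecaster):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
base_clf = LogisticRegression(max_iter=1000)

# out-of-fold probabilities: each sample is predicted by a model fit on the
# other folds, so they can safely be used as meta-features for stacking
meta_features = cross_val_predict(base_clf, X, y, cv=5,
                                  method='predict_proba')

# for a binary problem the two probability columns sum to 1, so only the
# second column is kept as the transformed feature, as in fit_transform above
X_t = meta_features[:, 1].reshape(-1, 1)
print(X_t.shape)  # (200, 1)
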
Example #2
 def predict_with_method(self, Xs, method_name):
     if not hasattr(self, 'model'):
         raise FitError('prediction attempted before call to fit()')
     prediction_method = getattr(self.model, method_name)
     predictions = prediction_method(Xs)
     if utils.is_classifier(self) and method_name == 'predict':
         predictions = self.classes_[predictions]
     return predictions
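
The self.classes_[predictions] lookup at the end undoes the integer encoding that fit() applies with np.unique(y, return_inverse=True). A small stand-alone illustration of that round trip (labels and codes are made up):

import numpy as np

y = np.array(['cat', 'dog', 'dog', 'bird'])

# fit(): store the sorted unique labels and train on their integer codes
classes_, y_encoded = np.unique(y, return_inverse=True)
print(classes_)   # ['bird' 'cat' 'dog']
print(y_encoded)  # [1 2 2 0]

# predict(): the model outputs integer codes, which are mapped back to the
# original labels by indexing into classes_
predicted_codes = np.array([2, 0, 1])
print(classes_[predicted_codes])  # ['dog' 'bird' 'cat']
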
Example #3
 def transform(self, X):
     if hasattr(self, 'model'):
         transformer = getattr(self.model, self.get_transform_method())
         X_t = transformer(X)
         # convert output array to output matrix:
         if len(X_t.shape) == 1:
             X_t = X_t.reshape(-1, 1)
         # drop redundant prob output from binary classifiers:
         elif (len(X_t.shape) == 2 and X_t.shape[1] == 2
               and utils.is_classifier(self.model)):
             X_t = X_t[:, 1].reshape(-1, 1)
         return X_t
     else:
         raise utils.FitError('transform called before model fitting')
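
The elif branch drops one of the two predict_proba columns because a binary classifier's class probabilities sum to one, so the second column already carries all of the information. A toy illustration (the probability values are made up):

import numpy as np

# typical predict_proba output for a binary classifier: rows sum to 1
proba = np.array([[0.9, 0.1],
                  [0.2, 0.8],
                  [0.4, 0.6]])

# keeping only the positive-class column loses no information and halves the
# number of meta-features passed to the next layer
X_t = proba[:, 1].reshape(-1, 1)
print(X_t.ravel())  # [0.1 0.8 0.6]
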
Example #4
    def fit(self, Xs, y=None, **fit_params):
        self.model = utils.get_clone(self.multichannel_predictor)
        if y is None:
            self.model.fit(Xs, **fit_params)
        else:
            if utils.is_classifier(self.model):
                self.classes_, y = np.unique(y, return_inverse=True)
            self.model.fit(Xs, y, **fit_params)

        self._set_estimator_type(self.model)
        self._remove_predictor_interface()
        self._add_model_interface(self.model, Xs)

        return self
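
fit() clones the configured predictor before training so the wrapper's constructor argument is never mutated and can be reused, in line with scikit-learn's convention that fitted state lives only on the trained clone. A minimal sketch of the same pattern with sklearn.base.clone (utils.get_clone is assumed to behave similarly):

from sklearn.base import clone
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=50, n_features=4, random_state=0)

template = Ridge(alpha=1.0)        # unfitted, reusable configuration
model = clone(template).fit(X, y)  # fitted state lives only on the clone

print(hasattr(template, 'coef_'))  # False: the template is untouched
print(hasattr(model, 'coef_'))     # True
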
Example #5
    def predict_with_method(self, X, method_name):
        if not hasattr(self, 'model'):
            raise utils.FitError('prediction attempted before model fitting')
        if hasattr(self.model, method_name):
            predict_method = getattr(self.model, method_name)
            predictions = predict_method(X)
        else:
            raise NameError(
                'prediction method {} not found in {} attributes'.format(
                    method_name, self.model))
        if utils.is_classifier(self) and method_name == 'predict':
            predictions = self.classes_[predictions]

        return predictions
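
predict_with_method dispatches by name with getattr, so a single wrapper method can serve predict, predict_proba, decision_function, and so on, depending on what the underlying model exposes. A stand-alone sketch of that dispatch pattern (call_prediction_method is an illustrative helper, not part of pipecaster):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

def call_prediction_method(model, X, method_name):
    # mirror the wrapper: fail loudly if the model lacks the requested method
    if not hasattr(model, method_name):
        raise NameError('prediction method {} not found in {} attributes'
                        .format(method_name, model))
    return getattr(model, method_name)(X)

X, y = make_classification(n_samples=100, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X, y)
print(call_prediction_method(clf, X, 'predict_proba').shape)      # (100, 2)

svm = SVC().fit(X, y)  # no predict_proba unless probability=True
print(call_prediction_method(svm, X, 'decision_function').shape)  # (100,)
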
Example #6
    def transform(self, Xs):
        if not hasattr(self, 'model'):
            raise FitError('transform attempted before call to fit()')
        transformer = getattr(self.model, self.get_transform_method())
        predictions = np.array(transformer(Xs))

        # convert output array to output matrix:
        if len(predictions.shape) == 1:
            predictions = predictions.reshape(-1, 1)
        # drop the redundant prob output from binary classifiers:
        elif (len(predictions.shape) == 2 and predictions.shape[1] == 2
              and utils.is_classifier(self.model)):
            predictions = predictions[:, 1].reshape(-1, 1)

        Xs_t = [predictions if i == 0 else None for i, X in enumerate(Xs)]
        return Xs_t
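
The returned list preserves the pipeline's channel count: the prediction matrix occupies the first channel and the remaining channels become None placeholders for downstream layers. A trivial illustration of that output layout (the arrays are made up):

import numpy as np

Xs = [np.ones((5, 3)), np.ones((5, 8)), np.ones((5, 2))]  # 3 input channels
predictions = np.arange(5).reshape(-1, 1)                 # (n_samples, 1)

Xs_t = [predictions if i == 0 else None for i, X in enumerate(Xs)]
print([None if X is None else X.shape for X in Xs_t])
# [(5, 1), None, None]
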
Example #7
    def fit(self, X, y=None, **fit_params):
        self.model = utils.get_clone(self.predictor)
        is_classifier = utils.is_classifier(self.predictor)
        if y is None:
            try:
                self.model.fit(X, **fit_params)
            except TypeError:
                # fall back if the estimator's fit() does not accept fit_params
                self.model.fit(X)
        else:
            if is_classifier:
                self.classes_, y = np.unique(y, return_inverse=True)
            try:
                self.model.fit(X, y, **fit_params)
            except TypeError:
                # fall back if the estimator's fit() does not accept fit_params
                self.model.fit(X, y)
        self._set_estimator_type(self.model)
        self._remove_predictor_interface()
        self._add_model_interface(self.model, X)

        return self
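
The try/except around fit() exists because not every estimator accepts auxiliary keyword arguments: when fit_params contains a keyword the estimator's fit() does not recognize, Python raises TypeError and the wrapper retries without the extras. A small stand-alone sketch (MinimalEstimator is a made-up class for illustration):

import numpy as np

class MinimalEstimator:
    """Toy estimator whose fit() takes no auxiliary keyword arguments."""
    def fit(self, X, y):
        self.mean_ = np.mean(y)
        return self

X, y = np.ones((10, 2)), np.arange(10)
fit_params = {'sample_weight': np.ones(10)}

model = MinimalEstimator()
try:
    model.fit(X, y, **fit_params)
except TypeError:
    # the estimator does not understand sample_weight, so retry without it
    model.fit(X, y)
print(model.mean_)  # 4.5
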
Example #8
def cross_val_score(predictor,
                    Xs,
                    y=None,
                    groups=None,
                    score_method='predict',
                    scorer='auto',
                    cv=3,
                    n_processes=1,
                    **fit_params):
    """
    Analog of the scikit-learn cross_val_score function that supports both
    single and multichannel cross validation.

    Parameters
    ----------
    predictor : estimator/predictor instance
        Classifier or regressor that implements the scikit-learn estimator and
        predictor interfaces.
    Xs : list
        List of feature matrices and None placeholders.
    y : list/array, default=None
        Optional targets for supervised ML.
    groups : list/array, default=None
        Group labels for the samples used while splitting the dataset into
        train/test sets. When groups are provided and cv is an integer,
        GroupKFold is used as the splitter.
    score_method : str, default='predict'
        Name of method called to make predictions for performance scoring. If
        'auto', methods are attempted in the order defined in
        config.score_method_precedence.
        Default: predict_proba->predict_log_proba->decision_function->predict.
    scorer : {callable, 'auto'}, default='auto'
        - Function calculating performance scores.
        - If 'auto':
            - explained_variance_score for regressors with predict()
            - roc_auc_score for classifiers with {predict_proba,
              predict_log_proba, decision_function}
            - balanced_accuracy_score for classifiers with only predict()
        - If callable: A scorer that returns a scalar figure of merit score
          with signature: score = scorer(y_true, y_pred).
    cv : int or callable, default=3
        - Set the cross validation method:
        - If int > 1: Use StratifiedKFold(n_splits=cv) for classifiers or
          KFold(n_splits=cv) for regressors.
        - If None: Use 5 splits with the default split generator.
        - If callable: Assumes a splitter with a scikit-learn KFold-like
          interface.
    n_processes : int or 'max', default=1
        - If 1: Run all split computations in a single process.
        - If 'max': Run splits in multiple processes, using all
          available CPUs.
        - If int > 1: Run splits in multiple processes, using up to
          n_processes number of CPUs.
    fit_params : dict, default={}
        Auxiliary parameters sent to pipe fit_transform and fit methods.

    Returns
    -------
    list
        List of scalar figure of merit scores, one for each cross validation
        split.

    Examples
    --------
    ::

        import pipecaster as pc
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.svm import SVC

        Xs, y, X_types = pc.make_multi_input_classification(n_informative_Xs=3,
                                                            n_random_Xs=7)
        clf = pc.MultichannelPipeline(n_channels=10)
        clf.add_layer(pc.ChannelEnsemble(GradientBoostingClassifier(), SVC()))

        pc.cross_val_score(clf, Xs, y)
        # output: [0.7647058823529411, 0.8455882352941176, 0.8180147058823529]
    """

    is_classifier = utils.is_classifier(predictor)
    is_binary = False
    if is_classifier and y is not None:
        classes_, y = np.unique(y, return_inverse=True)
        if len(classes_) == 2:
            is_binary = True

    split_results = cross_val_predict(predictor,
                                      Xs,
                                      y,
                                      groups,
                                      predict_method=None,
                                      transform_method=None,
                                      score_method=score_method,
                                      cv=cv,
                                      combine_splits=False,
                                      n_processes=n_processes,
                                      fit_params=fit_params)

    # score the out-of-fold predictions from each split
    score_method = split_results['score']['method']
    scores = [
        score_predictions(y[idx], yp, score_method, scorer, is_classifier,
                          is_binary) for yp, idx in
        zip(split_results['score']['y_pred'], split_results['indices'])
    ]

    return scores
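
The scorer, score_method, cv, and n_processes parameters documented above can be combined in one call. A hedged usage sketch that builds on the docstring's own example objects and assumes the parameters behave as documented:

import pipecaster as pc
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import SVC

Xs, y, X_types = pc.make_multi_input_classification(n_informative_Xs=3,
                                                    n_random_Xs=7)
clf = pc.MultichannelPipeline(n_channels=10)
clf.add_layer(pc.ChannelEnsemble(GradientBoostingClassifier(), SVC()))

# score hard predictions with a custom scalar scorer, splits run in parallel
scores = pc.cross_val_score(clf, Xs, y, score_method='predict',
                            scorer=balanced_accuracy_score, cv=3,
                            n_processes='max')
print(len(scores))  # one score per cross validation split
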
Example #9
def _fit_predict_split(predictor,
                       Xs,
                       y,
                       train_indices,
                       test_indices,
                       predict_method='predict',
                       transform_method=None,
                       score_method=None,
                       fit_params=None):
    """
        Clone, fit, and predict with a single channel or multichannel
        predictor.
        """
    is_classifier = utils.is_classifier(predictor)
    model = utils.get_clone(predictor)
    fit_params = {} if fit_params is None else fit_params

    if utils.is_multichannel(model):
        X_trains = [X[train_indices] if X is not None else None for X in Xs]
        model.fit(X_trains, y[train_indices], **fit_params)
    else:
        model.fit(Xs[train_indices], y[train_indices], **fit_params)

    split_predictions = {}

    if predict_method is not None:
        split_predictions['predict'] = {}
        if predict_method == 'auto' and is_classifier:
            prediction_made = False
            for m in config.predict_method_precedence:
                try:
                    y_pred = _predict(model, Xs, test_indices, m)
                    if y_pred is not None:
                        prediction_made = True
                except Exception:
                    pass
                if prediction_made:
                    split_predictions['predict']['method'] = m
                    break
            if not prediction_made:
                raise AttributeError('failed to auto-detect prediction '
                                     'method')
        elif predict_method == 'auto' and not is_classifier:
            y_pred = _predict(model, Xs, test_indices, 'predict')
            split_predictions['predict']['method'] = 'predict'
        else:
            y_pred = _predict(model, Xs, test_indices, predict_method)
            split_predictions['predict']['method'] = predict_method

        split_predictions['predict']['y_pred'] = y_pred

    if transform_method is not None:
        split_predictions['transform'] = {}
        if transform_method == 'auto' and is_classifier:
            prediction_made = False
            for m in config.transform_method_precedence:
                try:
                    y_pred = _predict(model, Xs, test_indices, m)
                    if y_pred is not None:
                        prediction_made = True
                except Exception:
                    pass
                if prediction_made:
                    split_predictions['transform']['method'] = m
                    break
            if not prediction_made:
                raise AttributeError('failed to auto-detect transform '
                                     'method')
        elif transform_method == 'auto' and not is_classifier:
            y_pred = _predict(model, Xs, test_indices, 'predict')
            split_predictions['transform']['method'] = 'predict'
        else:
            y_pred = _predict(model, Xs, test_indices, transform_method)
            split_predictions['transform']['method'] = transform_method

        split_predictions['transform']['y_pred'] = y_pred

    if score_method is not None:
        split_predictions['score'] = {}
        if score_method == 'auto' and is_classifier:
            prediction_made = False
            for m in config.score_method_precedence:
                try:
                    y_pred = _predict(model, Xs, test_indices, m)
                    if y_pred is not None:
                        prediction_made = True
                except Exception:
                    pass
                if prediction_made:
                    split_predictions['score']['method'] = m
                    break
            if not prediction_made:
                raise AttributeError('failed to auto-detect score method')
        elif score_method == 'auto' and not is_classifier:
            y_pred = _predict(model, Xs, test_indices, 'predict')
            split_predictions['score']['method'] = 'predict'
        else:
            y_pred = _predict(model, Xs, test_indices, score_method)
            split_predictions['score']['method'] = score_method

        split_predictions['score']['y_pred'] = y_pred

    return split_predictions, test_indices
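
All three 'auto' branches implement the same precedence idea: walk a configured list of prediction method names and use the first one the fitted model actually supports (the real code calls each method and catches failures; the sketch below simplifies that to a hasattr check). The precedence list shown is the one quoted for score_method='auto' in the docstrings above, and pick_method is an illustrative helper:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

score_method_precedence = ['predict_proba', 'predict_log_proba',
                           'decision_function', 'predict']

def pick_method(model, precedence):
    # return the first prediction method name the model exposes
    for method_name in precedence:
        if hasattr(model, method_name):
            return method_name
    raise AttributeError('failed to auto-detect prediction method')

X, y = make_classification(n_samples=100, random_state=0)

print(pick_method(LogisticRegression(max_iter=1000).fit(X, y),
                  score_method_precedence))  # predict_proba
print(pick_method(LinearSVC().fit(X, y),
                  score_method_precedence))  # decision_function
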
Example #10
def cross_val_predict(predictor,
                      Xs,
                      y=None,
                      groups=None,
                      predict_method='predict',
                      transform_method=None,
                      score_method=None,
                      cv=None,
                      combine_splits=True,
                      n_processes=1,
                      fit_params=None):
    """
    Analog of the scikit-learn cross_val_predict function that supports both
    single and multichannel cross validation.

    Parameters
    ----------
    predictor : estimator/predictor instance
        Classifier or regressor that implements the scikit-learn estimator and
        predictor interfaces.
    Xs : list
        List of feature matrices and None placeholders.
    y : list/array, default=None
        Optional targets for supervised ML.
    groups : list/array, default=None
        Group labels for the samples used while splitting the dataset into
        train/test sets. When groups are provided and cv is an integer,
        GroupKFold is used as the splitter.
    predict_method : str, default='predict'
        - Name of the method used for predicting.
        - If 'auto' :
            - If classifier : method picked using
              config.predict_method_precedence order (default:
              predict->predict_proba->predict_log_proba->decision_function).
            - If regressor : 'predict'
    transform_method : str, default=None
        - Name of the prediction method to call when transforming (e.g. when
          outputting meta-features).
        - If 'auto' :
            - If classifier : method picked using
              config.transform_method_precedence order (default:
              predict_proba->predict_log_proba->decision_function->predict).
            - If regressor : 'predict'
    score_method : str, default=None
        - Name of prediction method used when scoring predictor performance.
        - If 'auto' :
            - If classifier : method picked using
              config.score_method_precedence order (default:
              predict_proba->predict_log_proba->decision_function->predict.
            - If regressor : 'predict'
    cv : int or callable, default=None
        - Set the cross validation method:
        - If int > 1: Use StratifiedKFold(n_splits=cv) for classifiers or
          KFold(n_splits=cv) for regressors.
        - If None: Use 5 splits with the default split generator.
        - If callable: Assumes a splitter with a scikit-learn KFold-like
          interface.
    combine_splits : bool, default=True
        - If True: Concatenate results for splits into a single array.
        - If False: Return results for separate splits.
    n_processes : int or 'max', default=1
        - If 1: Run all split computations in a single process.
        - If 'max': Run splits in multiple processes, using all
          available CPUs.
        - If int > 1: Run splits in multiple processes, using up to
          n_processes number of CPUs.
    fit_params : dict, default=None
        Auxiliary parameters sent to pipe fit_transform and fit methods.

    Returns
    -------
    dict
        - If combine_splits is True :
            {'predict': {'y_pred': ..., 'method': ...}, 'transform': {...},
            'score': {...}}
            Where each 'y_pred' is an np.array of the out-of-fold predictions
            combined into the original sample order, and 'method' is the name
            of the prediction method used.  There will not be dict entries for
            prediction method parameters set to None (e.g. no 'transform' key
            when transform_method=None).
        - If combine_splits is False :
            Same structure, except each 'y_pred' is a list of prediction
            arrays, one per split, and an additional 'indices' entry holds the
            test sample indices for each split.

    Examples
    --------
    ::

        import pipecaster as pc
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.svm import SVC

        Xs, y, X_types = pc.make_multi_input_classification(n_informative_Xs=3,
                                                            n_random_Xs=7)
        clf = pc.MultichannelPipeline(n_channels=10)
        clf.add_layer(pc.ChannelEnsemble(GradientBoostingClassifier(), SVC()))

        predictions = pc.cross_val_predict(clf, Xs, y)
        predictions['predict']['y_pred']
        # output: [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...]
        y
        # output: [1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...]
    """

    is_classifier = utils.is_classifier(predictor)
    if is_classifier and y is not None:
        classes_, y = np.unique(y, return_inverse=True)

    cv = 5 if cv is None else cv

    if isinstance(cv, int):
        if groups is not None:
            cv = GroupKFold(n_splits=cv)
        else:
            if utils.is_classifier(predictor):
                cv = StratifiedKFold(n_splits=cv)
            else:
                cv = KFold(n_splits=cv)

    is_multichannel = utils.is_multichannel(predictor)
    if is_multichannel:
        live_Xs = [X for X in Xs if X is not None]
        splits = list(cv.split(live_Xs[0], y, groups))
    else:
        splits = list(cv.split(Xs, y, groups))

    args_list = [(predictor, Xs, y, train_indices, test_indices,
                  predict_method, transform_method, score_method, fit_params)
                 for train_indices, test_indices in splits]

    n_jobs = len(args_list)
    n_processes = 1 if n_processes is None else n_processes
    if isinstance(n_processes, int) and n_jobs < n_processes:
        n_processes = n_jobs

    if n_processes == 'max' or n_processes > 1:
        try:
            shared_mem_objects = [Xs, y, fit_params]
            job_results = parallel.starmap_jobs(
                _fit_predict_split,
                args_list,
                n_cpus=n_processes,
                shared_mem_objects=shared_mem_objects)
        except Exception as e:
            print(
                'parallel processing request failed with message {}'.format(e))
            print('defaulting to single processor')
            n_processes = 1
    if n_processes == 1:
        # print('running a single process with {} jobs'.format(len(args_list)))
        job_results = [_fit_predict_split(*args) for args in args_list]

    split_predictions, split_indices = zip(*job_results)

    # reorganize so splits are in lists
    results = {
        k: {
            'y_pred': [sp[k]['y_pred'] for sp in split_predictions],
            'method': split_predictions[0][k]['method']
        }
        for k in split_predictions[0]
    }

    # decode classes where necessary
    if is_classifier:
        for predict_method in results:
            if results[predict_method]['method'] == 'predict':
                results[predict_method]['y_pred'] = [
                    classes_[p] for p in results[predict_method]['y_pred']
                ]

    if combine_splits:
        # concatenate the split predictions and restore original sample order
        sample_indices = np.concatenate(split_indices)
        original_order = np.argsort(sample_indices)
        for predict_method in results:
            y_concat = np.concatenate(results[predict_method]['y_pred'])
            results[predict_method]['y_pred'] = y_concat[original_order]
    else:
        results['indices'] = split_indices

    return results
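
When combine_splits=False, the per-split predictions can be stitched back into original sample order the same way the combine_splits branch does it: concatenate the splits and invert the concatenated test indices. A short stand-alone sketch with made-up values shaped like the per-split output:

import numpy as np

# per-split out-of-fold predictions and the test indices they belong to
y_pred_splits = [np.array([10, 12]), np.array([11, 13])]
split_indices = [np.array([0, 2]), np.array([1, 3])]

y_concat = np.concatenate(y_pred_splits)        # predictions in split order
sample_indices = np.concatenate(split_indices)  # matching sample indices

# prediction i belongs to sample sample_indices[i]; argsort restores the
# original sample order
y_pred = y_concat[np.argsort(sample_indices)]
print(y_pred)  # [10 11 12 13]
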