Example #1
    def train_model(self, X_train, y_train):
        '''
        Train a SMOTE + model pipeline for each algorithm in self.tested_algorithms
        and report its cross-validated ROC AUC.

        params: X_train pd.DataFrame with train data
                y_train pd.Series with train labels

        return: dict mapping algorithm name to its trained model
        '''

        for alg in self.tested_algorithms.keys():
            print('Training model', alg)
            test = self.tested_algorithms[alg]
            print(test)
            steps = [('over', SMOTE()), ('model', test)]
            pipeline = Pipeline(steps=steps)
            pipeline.fit(X_train, y_train)
            print('Cross val score using RepeatedStratifiedKFold')
            cv = RepeatedStratifiedKFold(n_splits=10,
                                         n_repeats=5,
                                         random_state=42)
            scores = cross_val_score(pipeline,
                                     X_train,
                                     y_train,
                                     scoring='roc_auc',
                                     cv=cv,
                                     n_jobs=-1)
            print(np.mean(scores))
            if self.models is None:
                self.models = {alg: test}
            else:
                self.models.update({alg: test})
        return self.models
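A minimal usage sketch for the method above, assuming it is attached to a trainer class that exposes a tested_algorithms mapping (name -> unfitted estimator) and a models attribute initialised to None; the class name and wiring below are hypothetical and not taken from the original project.

# Hypothetical wiring for train_model; only tested_algorithms and models are
# assumed from the method body above.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


class ExperimentTrainer:
    def __init__(self, tested_algorithms):
        self.tested_algorithms = tested_algorithms  # name -> unfitted estimator
        self.models = None                          # populated by train_model

    # train_model(self, X_train, y_train) as defined above would live here


X_train = pd.DataFrame(np.random.rand(200, 4), columns=list('abcd'))
y_train = pd.Series(np.random.randint(0, 2, size=200))

trainer = ExperimentTrainer({
    'logreg': LogisticRegression(max_iter=1000),
    'rf': RandomForestClassifier(n_estimators=50),
})
# trainer.train_model(X_train, y_train)  # -> {'logreg': ..., 'rf': ...}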
Example #2
    def test_evaluate_pipeline(self):
        X, y = make_classification(
            n_samples=100, n_features=5, n_informative=2, n_redundant=2
        )
        X_train, _, y_train, _ = train_test_split(
            X, y, test_size=cfg.TEST_SIZE, random_state=cfg.RANDOM_STATE
        )
        dummy_pipeline = Pipeline(
            [("dummy_classifier", DummyClassifier(strategy="constant", constant=0))]
        )
        dummy_pipeline.fit(X_train, y_train)

        with tempfile.TemporaryDirectory() as destination:
            threshold = destination + "/DUMMY_threshold.json"
            save_pipeline(
                pipeline=dummy_pipeline,
                model="DUMMY",
                optimal_threshold=0,
                destination=destination,
            )
            evaluate_pipeline(
                X=X,
                y=y,
                pipeline=dummy_pipeline,
                threshold=threshold,
                prefix="DUMMY",
                destination=destination,
            )
            files = glob.glob(destination + "/*")
            self.assertTrue(any([".png" in file for file in files]))
            self.assertTrue(any([".json" in file for file in files]))
            self.assertTrue(any([".csv" in file for file in files]))
Example #3
def trainPipeLine(databaseName, samplerName, scalerName, featureSelectorName,
                  modelName, expectedVariance):
    dataSet = getAllRecordsFromDatabase(databaseName)
    availableSamplers = getRandomSamplers()
    availableScalers = getScalers()
    availableFeatureSelectors = getFeatureSelectors()
    availableModels = getModels()
    features = dataSet[:, 1:-1]
    binaries = dataSet[:, -1:]
    binaries = binaries.astype(int)
    sampler = availableSamplers.get(samplerName)
    sampledFeatures, sampledLabels = sampler.fit_resample(features, binaries)
    scaler = availableScalers.get(scalerName)
    featureSelector = availableFeatureSelectors.get(featureSelectorName)
    model = availableModels.get(modelName)
    pipeline = Pipeline([('scaler', scaler),
                         ('featureSelector', featureSelector), ('m', model)])
    trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(
        sampledFeatures, sampledLabels, test_size=0.2, random_state=0)
    pipeline.fit(trainFeatures, trainLabels)
    pickledPipelineName = ("Pipeline_" + databaseName + "_" + samplerName + "_" +
                           scalerName + "_" + featureSelectorName + "_" + modelName)
    storePickledPipeline(pipeline, pickledPipelineName)
    qualifyPipeline(pipeline, pickledPipelineName, testFeatures, testLabels,
                    databaseName, samplerName, scalerName, featureSelectorName,
                    modelName)
Example #4
def test_predict_with_predict_params():
    # tests that Pipeline passes predict_params to the final estimator
    # when predict is invoked
    pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())])
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)
    assert pipe.named_steps['clf'].got_attribute
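The lightweight fixtures referenced in this and several later tests (Transf/TransfT, Mult, FitParamT, DummyEstimatorParams) are defined in the surrounding test modules; below is a rough sketch of minimal stand-ins, modelled on scikit-learn's pipeline test fixtures and not guaranteed to match the originals exactly.

# Approximate stand-ins for the test fixtures used above; the real definitions
# live in the test modules and may differ in detail.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class Transf(TransformerMixin, BaseEstimator):
    # No-op transformer that records two optional constructor params.
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X

class Mult(BaseEstimator):
    # Scales its input; its score is the plain sum of the input values.
    def __init__(self, mult=1):
        self.mult = mult
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return self.mult * np.asarray(X)
    def predict(self, X):
        return (self.mult * np.asarray(X)).sum(axis=1)
    def score(self, X, y=None):
        return np.sum(X)  # -> 3 for X = [[1, 2]]

class FitParamT(BaseEstimator):
    # Classifier that only succeeds when told to via a fit parameter.
    def fit(self, X, y=None, should_succeed=False):
        self.successful = should_succeed
        return self
    def predict(self, X):
        return self.successful
    def score(self, X, y=None, sample_weight=None):
        if sample_weight is not None:
            X = np.asarray(X) * sample_weight
        return np.sum(X)  # 3 without weights, 8 with sample_weight=[2, 3]

class DummyEstimatorParams(BaseEstimator):
    # Final estimator that records extra keyword arguments passed to predict.
    def fit(self, X, y=None):
        return self
    def predict(self, X, got_attribute=False):
        self.got_attribute = got_attribute
        return self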
Example #5
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples, ))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example #6
def Predict(data, mode):
    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    train_query = list(
        train.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    train_title = list(
        train.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    test_query = list(
        test.apply(lambda x: '%s' % x['query_preprocessed'], axis=1))
    test_title = list(
        test.apply(lambda x: '%s' % x['product_title_preprocessed'], axis=1))

    stop_words = text.ENGLISH_STOP_WORDS.union(['http','www','img','border','color','style','padding','table','font', \
                                                'thi','inch','ha','width','height','0','1','2','3','4','5','6','7','8','9'])
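    # NOTE: the custom stop-word list built above is immediately overwritten by the
    # NLTK-based list on the next line.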
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(min_df=7,  max_features=None, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', \
                               ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words=stop_words)

    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    if mode == 'eda':
        svd = TruncatedSVD(n_components=200)
        scl = StandardScaler(with_mean=False)
        svm = SVC(C=10,
                  gamma="auto",
                  kernel="rbf",
                  class_weight=None,
                  probability=True)
        clf = Pipeline([('FeatureUnion', FeatureUnion( [('svd', svd), ('sim', sim)] )),\
                            ('scl', scl),\
                            ('svm', svm)])
    elif mode == 'sampling':
        svd = TruncatedSVD(n_components=200)
        scl = StandardScaler(with_mean=False)
        svm = SVC(C=10,
                  gamma="auto",
                  kernel="rbf",
                  class_weight=None,
                  probability=True)
        sampling = SVMSMOTE(svm_estimator=svm, k_neighbors=4)
        clf = Pipeline([('FeatureUnion', FeatureUnion( [('svd', svd), ('sim', sim)] )),\
                                  ('scl', scl),\
                                  ('sampling', sampling),\
                                  ('svm', svm)])

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)

    return submission, submission_probas
Example #7
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([("mock", transf1)])
    assert pipeline.named_steps["mock"] is transf1

    # Directly setting attr
    pipeline.steps = [("mock2", transf2)]
    assert "mock" not in pipeline.named_steps
    assert pipeline.named_steps["mock2"] is transf2
    assert [("mock2", transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[("mock", transf1)])
    assert [("mock", transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [("mock", transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[("junk", ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
Example #8
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([('mock', transf1)])
    assert pipeline.named_steps['mock'] is transf1

    # Directly setting attr
    pipeline.steps = [('mock2', transf2)]
    assert 'mock' not in pipeline.named_steps
    assert pipeline.named_steps['mock2'] is transf2
    assert [('mock2', transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[('mock', transf1)])
    assert [('mock', transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [('mock', transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[('junk', ())])
    with raises(TypeError):
        pipeline.fit([[1]], [1])
    with raises(TypeError):
        pipeline.fit_transform([[1]], [1])
Example #9
def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)
    assert_array_almost_equal(y_trans, y_trans2)
    assert_array_almost_equal(y_trans, y_trans3)

    pca = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus)])

    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(y_trans, y_trans2)
Example #10
def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(n_classes=2,
                               class_sep=2,
                               weights=[0.1, 0.9],
                               n_informative=3,
                               n_redundant=1,
                               flip_y=0,
                               n_features=20,
                               n_clusters_per_class=1,
                               n_samples=5000,
                               random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)
    assert_array_almost_equal(y_trans, y_trans2)
    assert_array_almost_equal(y_trans, y_trans3)

    pca = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus)])

    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(y_trans, y_trans2)
Example #11
def test_pipeline_score_samples_pca_lof():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.3, 0.7],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=500,
        random_state=0,
    )
    # Test that the score_samples method is implemented on a pipeline.
    # Test that the score_samples method on pipeline yields same results as
    # applying transform and score_samples steps separately.
    rus = RandomUnderSampler(random_state=42)
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([("rus", rus), ("pca", pca), ("lof", lof)])
    pipe.fit(X, y)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0], )
    # Check the values
    X_res, _ = rus.fit_resample(X, y)
    lof.fit(pca.fit_transform(X_res))
    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
Example #12
def test_pipeline_methods_rus_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    # Test with PCA + SVC
    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([("rus", rus), ("pca", pca), ("svc", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #13
def test_predict_with_predict_params():
    # tests that Pipeline passes predict_params to the final estimator
    # when predict is invoked
    pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())])
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)
    assert pipe.named_steps["clf"].got_attribute
Example #14
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example #15
def find_expert(tag):
    """
    输出话题标签[TAG]下模型预测的最有可能是潜在专家的20名用户
    """
    fold = StratifiedKFold(n_splits=4)
    params = best_solution(tag)
    data, target, ratio = load_data(tag)
    fold.random_state = int(params['seed'])
    samp = ADASYN(n_neighbors=2,
                  sampling_strategy=float(params['sampling_strategy']) * ratio,
                  random_state=int(params['seed']))
    clf = XGBClassifier(n_estimators=int(params['n_estimators']),
                        gamma=float(params['gamma']),
                        eta=float(params['eta']),
                        reg_lambda=int(params['reg_lambda']),
                        verbosity=0,
                        n_jobs=-1,
                        random_state=int(params['seed']))
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])
    experts = pd.DataFrame(columns=['id', 'probability'])
    for _, (train, test) in tqdm(enumerate(fold.split(data, target)), total=4):
        pipeline.fit(data.iloc[train], target.iloc[train])
        pred_proba = pd.Series(pipeline.predict_proba(data.iloc[test])[:, 1],
                               index=target.iloc[test].index,
                               name='probability')
        experts = experts.append(pred_proba.to_frame().reset_index())
    experts = experts.sort_values(by=['probability'],
                                  ascending=False).iloc[:20]
    experts['probability'] = experts['probability'].astype(float).map(
        "{:.1%}".format)
    print(experts.to_string(index=False))
Example #16
    def illigal_genralization_checking(self, X_test, y_test):

        X = self.df[self.features]
        X_test = X_test[self.features]
        Y = self.df[self.target]
        pipe = Pipeline(
            steps=[('classifier',
                    XGBClassifier(
                        n_estimators=1000, scale_pos_weight=3, reg_alpha=1))])
        y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))
        scores = cross_val_score(pipe,
                                 X,
                                 Y,
                                 scoring='precision',
                                 cv=StratifiedKFold(5))
        print(self.features)
        print("cross vl scores")
        print(sum(scores) / 5)
        pipe.fit(X, Y.values)
        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
Example #17
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver="randomized", whiten=True)
    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples, )

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
Example #18
def main():
    """ Trains a logistic regression, an attempt to be 'production' grade
    """

    logger = logging.getLogger(__name__)
    logger.info(f'Reading data')
    processed_df = pd.read_csv('../../data/processed/processed.csv')

    X = processed_df.drop('Class', axis=1).values
    y = processed_df['Class'].values

    accuracy_lst = []
    precision_lst = []
    recall_lst = []
    f1_lst = []
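    # NOTE: baseline_classifiers, LogisticRegression_rndm_params, best_model_file_name
    # and mean are assumed to be defined at module level (imports or a config section
    # not shown here).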

    rand_log_reg = RandomizedSearchCV(
        baseline_classifiers['LogisticRegression'],
        LogisticRegression_rndm_params,
        n_iter=4)

    skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

    logger.info(f'Constructing model pipeline and cross validating')
    idx = 1
    for train, test in skf.split(X=X, y=y):
        logger.info(f'Run {idx}')
        model = Pipeline([('sampling', SMOTE(sampling_strategy='minority')),
                          ('classification', rand_log_reg)])

        model.fit(X[train], y[train])
        best_estimators = rand_log_reg.best_estimator_
        prediction = best_estimators.predict(X[test])

        accuracy_lst.append(model.score(X[test], y[test]))
        precision_lst.append(precision_score(y[test], prediction))
        recall_lst.append(recall_score(y[test], prediction))
        f1_lst.append(f1_score(y[test], prediction))
        idx += 1

    metrics = f'''
    Accuracy: {mean(accuracy_lst)} \n
    Precision: {mean(precision_lst)} \n
    Recall: {mean(recall_lst)} \n
    F1: {mean(f1_lst)}
          '''

    print(metrics)

    f = open(f'../../models/metrics.txt', 'w')
    f.write(metrics)
    f.close()

    joblib.dump(rand_log_reg,
                f'../../models/{best_model_file_name}',
                compress=9)
    logger.info(f'Serialised model as {best_model_file_name}')

    return rand_log_reg
Example #19
def test_row_selector_pipeline_integration():
    """Test the integration of row selector
    and pipelines."""
    pipeline = Pipeline([('selector',
                          RowSelector(sampling_strategy=0.8,
                                      selection_strategy=0)),
                         ('lr', LinearRegression())])
    pipeline.fit(X, y)
Example #20
def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(X, sample_weight=np.array([2, 3]))
Example #21
def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(X, sample_weight=np.array([2, 3]))
Example #22
def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
Example #23
def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X, y=None)
    assert_equal(pipe.score(X), 3)
    assert_equal(pipe.score(X, y=None), 3)
    assert_equal(pipe.score(X, y=None, sample_weight=None), 3)
    assert_equal(pipe.score(X, sample_weight=np.array([2, 3])), 8)
Example #24
def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
Example #25
def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((("transf", Transf()), ("clf", FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)
    pipe.set_params(transf="passthrough")
    pipe.fit(X, y=None)
    pipe.score(X)
Example #26
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert_true(pipe.predict(None))
    # and transformer params should not be changed
    assert_true(pipe.named_steps['transf'].a is None)
    assert_true(pipe.named_steps['transf'].b is None)
Example #27
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert_true(pipe.predict(None))
    # and transformer params should not be changed
    assert_true(pipe.named_steps['transf'].a is None)
    assert_true(pipe.named_steps['transf'].b is None)
Example #28
class ImblearnRecalibrator(BaseEstimator, ClassifierMixin):
    """
    imblearnのリサンプリングの偏りを再較正するやつ
    再較正のコードを毎回書きたくない. scikit-learnの設計思想に則りオブジェクト指向プログラミングをしよう
    estimator, resampler, サンプリング割合を指定したら後は fit & predict/predict_proba するだけ
    * 注意: 不均衡データに対するリサンプリングは分類性能を目的としているので判別性能等に効果があるかは知らない
    
    :param estimatror: scikit-learn API 準拠の estimator オブジェクト
    :param resampler: imblearn で使われる各種 resampler オブジェクト
    :param post_minor_rate: リサンプリング後の**全件に対する少数例の割合**を指定. default is None. alpha とどちらか片方を使う.
    :param alpha: **リサンプリング前に対する**事後の少数例の割合**を指定. default is 'auto'. post_minor_rate とどちらか片方を使う.
    """
    def __init__(self,
                 estimator,
                 resampler,
                 alpha='auto',
                 post_minor_rate=None):
        resampler = clone(resampler)
        if post_minor_rate is None and alpha is None:
            warnings.warn(
                'Neither `post_minor_rate` nor `alpha` was specified; the resampling strategy set on the `resampler` object is used instead.'
            )
        elif post_minor_rate and alpha:
            warnings.warn(
                'Both `post_minor_rate` and `alpha` were specified; the former is applied.'
            )
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif post_minor_rate:
            self.post_minor_rate = post_minor_rate
            self.resampling_strategy = 'posterior_rate'
        elif alpha:
            self.alpha = alpha
            self.resampling_strategy = 'alpha'
            resampler.set_params(sampling_strategy=alpha)
        else:
            raise ValueError('initialization error')
        self.estimator_ = Pipeline([('resampler', resampler),
                                    ('estimator', clone(estimator))])

    def fit(self, X, y):
        if self.resampling_strategy == 'posterior_rate':
            alpha = get_oversampling_rate(self.post_minor_rate)
            self.alpha = alpha
            self.estimator_['resampler'].set_params(sampling_strategy=alpha)
        self.estimator_.fit(X, y)
        self.minor_rate_ = np.min([y.mean(), 1 - y.mean()])
        return self

    def predict(self, X):
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        return calibrate_imbalanceness(self.estimator_.predict_proba(X),
                                       pos_rate=get_oversampling_power(
                                           self.alpha, self.minor_rate_))
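A minimal usage sketch for the recalibrator above, assuming ImblearnRecalibrator and the helper functions it calls internally (get_oversampling_rate, calibrate_imbalanceness, get_oversampling_power) are importable from the same module; the data here is synthetic.

# Sketch only: the import path for ImblearnRecalibrator is hypothetical.
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)

recal = ImblearnRecalibrator(
    estimator=LogisticRegression(max_iter=1000),
    resampler=RandomOverSampler(random_state=0),
    alpha=0.5,  # forwarded to the resampler as sampling_strategy
)
recal.fit(X, y)
proba = recal.predict_proba(X)  # probabilities recalibrated for the resampling bias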
Example #29
def test_pipeline_correctly_adjusts_steps(passthrough):
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)
    pipeline = Pipeline([('m2', mult2), ('bad', passthrough), ('m3', mult3),
                         ('m5', mult5)])
    pipeline.fit(X, y)
    expected_names = ['m2', 'bad', 'm3', 'm5']
    actual_names = [name for name, _ in pipeline.steps]
    assert expected_names == actual_names
Example #30
def test_pipeline_correctly_adjusts_steps(passthrough):
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)
    pipeline = Pipeline([("m2", mult2), ("bad", passthrough), ("m3", mult3),
                         ("m5", mult5)])
    pipeline.fit(X, y)
    expected_names = ["m2", "bad", "m3", "m5"]
    actual_names = [name for name, _ in pipeline.steps]
    assert expected_names == actual_names
Example #31
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps["transf"].a is None
    assert pipe.named_steps["transf"].b is None
    # invalid parameters should raise an error message
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
Example #32
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps['transf'].a is None
    assert pipe.named_steps['transf'].b is None
    # invalid parameters should raise an error message
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.fit(None, None, clf__bad=True)
Example #33
def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3
    assert_raise_message(
        TypeError,
        "score() got an unexpected keyword argument 'sample_weight'",
        pipe.score,
        X,
        sample_weight=np.array([2, 3]))
Example #34
def test_pipeline_sample_transform():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pca = PCA()
    pca2 = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus), ('pca2', pca2)])

    pipeline.fit(X, y).transform(X)
Example #35
def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
                           memory=memory)
    error_regex = ("'memory' should either be a string or a joblib.Memory"
                   " instance, got 'memory=1' instead.")
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(X, y)
Example #36
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example #37
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #38
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #39
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression(solver="lbfgs", multi_class="auto")
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([("anova", filter1), ("logistic", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #40
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #41
def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline([("transf", DummyTransf()),
                            ("svc", SVC(gamma="scale"))],
                           memory=memory)
    error_regex = "string or have the same interface as"
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(X, y)
Example #42
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(gamma="scale", probability=True, random_state=0)
    pca = PCA(svd_solver="full", n_components="mle", whiten=True)
    pipe = Pipeline([("pca", pca), ("svc", clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #43
def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline(
        [('transf', DummyTransf()), ('svc', SVC(gamma='scale'))],
        memory=memory)
    error_regex = ("string or have the same interface as")
    with raises(ValueError, match=error_regex):
        cached_pipe.fit(X, y)
Example #44
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(gamma='scale', probability=True, random_state=0)
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #45
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #46
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)
    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #47
def test_pipeline_methods_rus_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA()
    rus = RandomUnderSampler(random_state=0)
    pipe = Pipeline([('rus', rus), ('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #48
    def illigal_genralization_checking(self, X_test, y_test):

        X = self.df[self.features]
        X_test = X_test[self.features]
        Y = self.df[self.target]
        pipe = Pipeline(steps=[('classifier', XGBClassifier(n_estimators=1000, scale_pos_weight=3, reg_alpha=1))])
        y_test = y_test["intrusion_cutoff"].apply(lambda x: int(x))
        scores = cross_val_score(pipe, X, Y, scoring='precision', cv=StratifiedKFold(5))
        print(self.features)
        print("cross vl scores")
        print(sum(scores)/5)
        pipe.fit(X, Y.values)
        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
Example #49
    def three_models_combined(self, intrusion_features, avoidance_features, hypertension_features):

        self.df = self.df[~self.df['intrusion_cutoff'].isna()]
        self.df = self.df[~self.df['avoidance_cutoff'].isna()]
        self.df = self.df[~self.df['hypertention_cutoff'].isna()]
        print("self.df.shape", self.df.shape)
        X = self.df
        Y = self.df[self.target]  # strict
        all_Y = [self.target, "intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"]


        X_train, X_test, y_train, y_test = train_test_split(X, self.df[all_Y], test_size=0.25, random_state = 8526566, stratify=Y)

        # intrusion
        X_intrusion = X_train[intrusion_features].values
        y_intrusion = y_train["intrusion_cutoff"].apply(lambda x: int(x))
        pipe_intrusion = Pipeline(steps=[
            ('rfe', BorderlineSMOTE()),
            ('classifier', XGBClassifier(n_estimators=100, reg_alpha=1))])
        scores = cross_val_score(pipe_intrusion, X_intrusion, y_intrusion, scoring='precision', cv=StratifiedKFold(5))
        print(f"intrusion {sum(scores)/5}")
        pipe_intrusion.fit(X_intrusion, y_intrusion)

        # avoidance
        X_avoidance = X_train[avoidance_features].values
        y_avoidance = y_train["avoidance_cutoff"].apply(lambda x: int(x))
        pipe_avoidance = Pipeline(steps=[
            ('classifier', XGBClassifier(n_estimators=100, scale_pos_weight=3, reg_alpha=1))])
        scores = cross_val_score(pipe_avoidance, X_avoidance, y_avoidance, scoring='precision', cv=StratifiedKFold(5))
        print(f"avoidance {sum(scores)/5}")
        pipe_avoidance.fit(X_avoidance, y_avoidance)


        # hypertension
        X_hypertension = X_train[hypertension_features].values
        y_hypertention = y_train["hypertention_cutoff"].apply(lambda x: int(x))
        pipe_hypertension = Pipeline(steps=[
            ('classifier', BalancedBaggingClassifier(n_estimators=100))])
        scores = cross_val_score(pipe_hypertension, X_hypertension, y_hypertention, scoring='precision', cv=StratifiedKFold(5))
        print(f"hypertension {sum(scores)/5}")
        pipe_hypertension.fit(X_hypertension, y_hypertention)

        ## combine three classifiers
        X_test_hypertension = X_test[hypertension_features].values
        X_test_avoidance = X_test[avoidance_features].values
        X_test_intrusion = X_test[intrusion_features].values

        y_pred_hypertension = pipe_hypertension.predict(X_test_hypertension)
        y_pred_avoidance = pipe_avoidance.predict(X_test_avoidance)
        y_pred_intrusion = pipe_intrusion.predict(X_test_intrusion)
        y_pred = (y_pred_hypertension * y_pred_avoidance * y_pred_intrusion)

        y_target = y_test["PCL_Strict3"].apply(lambda x: int(x))

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("test scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
Example #50
def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test transform and fit_transform:
    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_trans2, y_trans2 = pipeline.fit_sample(X, y)
    X_trans3, y_trans3 = rus.fit_sample(X, y)
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(X_trans, X_trans3, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans3, rtol=R_TOL)

    pca = PCA()
    pipeline = Pipeline([('pca', PCA()),
                         ('rus', rus)])

    X_trans, y_trans = pipeline.fit(X, y).sample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_sample(X_pca, y)
    # Zero out values that are numerically close to zero; PCA can leave tiny
    # floating-point noise there.
    X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0
    X_trans2[np.bitwise_and(X_trans2 < R_TOL, X_trans2 > -R_TOL)] = 0
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
Example #51
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    iris = load_iris()
    X = iris.data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
Example #52
class TargetEnsembler(object):

    def __init__(self, features):
        self.features = features

    def fit(self, X_train, y_train):
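        # NOTE: intrusion, avoidance, hypertension, depression, only_avoidance,
        # PCL_Strict3, regression_cutoff_33, regression_cutoff_50 and tred_cutoff
        # are assumed to be module-level boolean flags selecting which sub-models
        # to train; they are not defined in this class.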

        # intrusion
        if intrusion:
            X_intrusion = FeatureEngineering(X_train[self.features], "intrusion_cutoff").engineer_features().values
            y_intrusion = X_train["intrusion_cutoff"].apply(lambda x: int(x))

            self.pipe_intrusion = Pipeline(steps=[
                ('feature_selection', SelectFpr(alpha=0.05)),
                ('sampling', BorderlineSMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(n_estimators=300, max_depth=5))])

            scores = cross_val_score(self.pipe_intrusion, X_intrusion, y_intrusion, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"intrusion {sum(scores)/5}")
            self.pipe_intrusion.fit(X_intrusion, y_intrusion)

        # avoidance
        if avoidance:
            X_avoidance = FeatureEngineering(X_train[self.features], "avoidance_cutoff").engineer_features().values
            y_avoidance = X_train["avoidance_cutoff"].apply(lambda x: int(x))

            self.pipe_avoidance = Pipeline(steps=[
                ('feature_selection',  RFE(estimator=XGBClassifier(scale_pos_weight=5.88, n_estimators=100),
                                           n_features_to_select=20)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=300, max_depth=10))])

            scores = cross_val_score(self.pipe_avoidance, X_avoidance, y_avoidance, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"avoidance {sum(scores)/5}")
            self.pipe_avoidance.fit(X_avoidance, y_avoidance)

        # hypertension
        if hypertension:
            X_hypertension = FeatureEngineering(X_train[self.features], "hypertention_cutoff").engineer_features().values
            y_hypertention = X_train["hypertention_cutoff"].apply(lambda x: int(x))

            self.pipe_hypertension = Pipeline(steps=[
                ('feature_selection',  RFE(estimator=XGBClassifier(n_estimators=100, scale_pos_weight=3.51),
                                           n_features_to_select=20)),
                ( 'sampling', SMOTE(k_neighbors=10)),
                ('classifier', BalancedRandomForestClassifier(n_estimators=100))])

            scores = cross_val_score(self.pipe_hypertension, X_hypertension, y_hypertention, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"hypertension {sum(scores)/5}")
            self.pipe_hypertension.fit(X_hypertension, y_hypertention)

        # depression
        if depression:
            X_depression = FeatureEngineering(X_train[self.features], "depression_cutoff").engineer_features().values
            y_depression = X_train["depression_cutoff"].apply(lambda x: int(x))

            self.pipe_depression = Pipeline(steps=[
                ('feature_selection', SelectFdr(alpha=0.1)),
                ('sampling', SMOTE(k_neighbors=5)),
                ('classifier', RandomForestClassifier(n_estimators=100))])

            scores = cross_val_score(self.pipe_depression, X_depression, y_depression, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"depression {sum(scores)/5}")
            self.pipe_depression.fit(X_depression, y_depression)

        # only_avoidance
        if only_avoidance:
            X_only_avoidance = FeatureEngineering(X_train[self.features], "only_avoidance_cutoff").engineer_features().values
            y_only_avoidance = X_train["only_avoidance_cutoff"].apply(lambda x: int(x))

            self.pipe_only_avoidance = Pipeline(steps=[
                ('feature_selection', RFE(XGBClassifier(n_estimators=100,max_depth=3), n_features_to_select=10)),
                ('classifier', BalancedRandomForestClassifier( n_estimators=500, max_depth=10))])

            scores = cross_val_score(self.pipe_only_avoidance, X_only_avoidance,
                                     y_only_avoidance, scoring='f1', cv=StratifiedKFold(5))
            print(f"only_avoidance {sum(scores)/5}")
            self.pipe_only_avoidance.fit(X_only_avoidance, y_only_avoidance)

        # pcl_strict3
        if PCL_Strict3:
            X_PCL_Strict3 = FeatureEngineering(X_train[self.features], "PCL_Strict3").engineer_features().values
            y_PCL_Strict3 = y_train["PCL_Strict3"].apply(lambda x: int(x))

            self.pipe_PCL_Strict3 = Pipeline(steps=[
                ('feature_selection', SelectKBest(k=20)),
                ('sampling', SMOTE(k_neighbors=5)),
                ('classifier', XGBClassifier(max_depth=3, n_estimators=100))])

            scores = cross_val_score(self.pipe_PCL_Strict3, X_PCL_Strict3,
                                     y_PCL_Strict3, scoring='f1', cv=StratifiedKFold(5))
            print(f"PCL_Strict3 {sum(scores)/5}")
            self.pipe_PCL_Strict3.fit(X_PCL_Strict3, y_PCL_Strict3)


        # cutoff_33
        if regression_cutoff_33:
            X_regression_cutoff_33 = FeatureEngineering(X_train[self.features],
                                                        "regression_cutoff_33").engineer_features().values
            y_regression_cutoff_33 = X_train["regression_cutoff_33"].apply(lambda x: int(x))

            self.pipe_regression_cutoff_33 = Pipeline(steps=[
                ('feature_selection', SelectFpr(alpha=0.033)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', RandomForestClassifier(n_estimators=100, max_depth=5))])

            scores = cross_val_score(self.pipe_regression_cutoff_33, X_regression_cutoff_33,
                                     y_regression_cutoff_33, scoring='f1', cv=StratifiedKFold(5))
            print(f"regression_cutoff_33 {sum(scores)/5}")
            self.pipe_regression_cutoff_33.fit(X_regression_cutoff_33, y_regression_cutoff_33)

        # cutoff 50
        if regression_cutoff_50:
            X_regression_cutoff_50 = FeatureEngineering(X_train[self.features], "regression_cutoff_50").engineer_features().values
            y_regression_cutoff_50 = X_train["regression_cutoff_50"].apply(lambda x: int(x))

            self.pipe_regression_cutoff_50 = Pipeline(steps=[
                ('feature_selection', SelectKBest(k=10)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(max_depth=2, n_estimators=100))])

            scores = cross_val_score(self.pipe_regression_cutoff_50, X_regression_cutoff_50,
                                     y_regression_cutoff_50, scoring='f1', cv=StratifiedKFold(5))
            print(f"regression_cutoff_50 {sum(scores)/5}")
            self.pipe_regression_cutoff_50.fit(X_regression_cutoff_50, y_regression_cutoff_50)

        # tred_cutoff
        if tred_cutoff:
            X_tred_cutoff = FeatureEngineering(X_train[self.features], "tred_cutoff").engineer_features().values
            y_tred_cutoff = X_train["tred_cutoff"].apply(lambda x: int(x))

            self.pipe_tred_cutoff = Pipeline(steps=[
                ('feature_selection', SelectKBest(k=20)),
                ('sampling', SMOTE(k_neighbors=10)),
                ('classifier', XGBClassifier(n_estimators=100, max_depth=2))])

            scores = cross_val_score(self.pipe_tred_cutoff, X_tred_cutoff, y_tred_cutoff, scoring='f1',
                                     cv=StratifiedKFold(5))
            print(f"tred_cutoff {sum(scores)/5}")
            self.pipe_tred_cutoff.fit(X_tred_cutoff, y_tred_cutoff)

        # target
        if intrusion:
            y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        else:
            y_pred_intrusion = 1

        if avoidance:
            y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        else: y_pred_avoidance = 1

        if hypertension:
            y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        else: y_pred_hypertension = 1

        if depression:
            y_pred_depression = self.pipe_depression.predict(X_depression)
        else: y_pred_depression = 1

        if only_avoidance:
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance)
        else: y_pred_only_avoidance = 1

        if PCL_Strict3:
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3)
        else: y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33)
        else: y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50)
        else: y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff)
        else: y_pred_tred_cutoff = 1


        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

    def predict(self, X_test):


        if intrusion:
            X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features],
                                                         "intrusion_cutoff").engineer_features().values
            y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff)
        else: y_pred_intrusion = 1

        if avoidance:
            X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                         "avoidance_cutoff").engineer_features().values
            y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff)
        else: y_pred_avoidance = 1

        if hypertension:
            X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features],
                                                            "hypertention_cutoff").engineer_features().values
            y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff)
        else: y_pred_hypertension = 1

        if depression:
            X_test_depression_cutoff = FeatureEngineering(X_test[self.features],
                                                          "depression_cutoff").engineer_features().values
            y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff)
        else: y_pred_depression = 1

        if only_avoidance:
            X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                              "only_avoidance_cutoff").engineer_features().values

            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff)
        else:
            y_pred_only_avoidance = 1

        if PCL_Strict3:
            X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_33").engineer_features().values
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_50").engineer_features().values
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

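        # Same AND-combination as in fit(): disabled targets default to 1 and
        # therefore do not constrain the final prediction.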
        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)

        return y_pred
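
# Minimal usage sketch for the ensembler class above (illustrative only: the
# class name `SubtargetEnsembler`, `FEATURES`, and the train/test DataFrames
# are assumptions, not part of the original snippet):
#
#     model = SubtargetEnsembler(features=FEATURES)
#     model.fit(X_train, y_train)      # y_train: binary target labels
#     y_pred = model.predict(X_test)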
Example #53
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    cachedir = mkdtemp()
    try:
        memory = Memory(location=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example #54
    def fit(self, X_train, y_train):

        predictions_list = []

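        # Fit one pipeline per auxiliary target; the per-target predictions are
        # collected in predictions_list and later stacked as meta-features for
        # the combined model.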
        for target in self.targets_list:
            if self.use_feature_engineering:
                X = FeatureEngineering(X_train[self.features], target).engineer_features().values
            else:
                X = X_train[self.features].values

            if target == "PCL_Strict3":
                y = y_train[target].apply(lambda x: int(x))
            else:
                y = X_train[target].apply(lambda x: int(x))

            pipeline = pipeline_per_target[target]
            scores = cross_val_score(pipeline, X, y, scoring='f1', cv=StratifiedKFold(5))
            print(f"{target} - {sum(scores)/len(scores)}")

            if self.train_on_partial_prediction:
                combined_y = pd.DataFrame(y, columns=[target])
                if target != "PCL_Strict3":
                    combined_y["PCL_Strict3"] = y_train["PCL_Strict3"].apply(lambda x: int(x))

                _X_train, _X_test, _y_train, _y_test = \
                    train_test_split(X, combined_y, test_size=0.25)
                self.trained_pipelines[target] = pipeline.fit(_X_train, _y_train[target])
                y_pred = self.trained_pipelines[target].predict(_X_test)
                predictions_list.append(y_pred)
                print("test f1", target, f1_score(_y_test[target], y_pred))
                self.trained_pipelines[target] = pipeline.fit(X, y)
                y = _y_test["PCL_Strict3"]
            else:
                self.trained_pipelines[target] = pipeline.fit(X, y)
                predictions_list.append(self.trained_pipelines[target].predict(X))
                y = y_train["PCL_Strict3"]

            if self.check_on_test_set:
                if target == "PCL_Strict3":
                    y_test = self.y_test[target].apply(lambda x: int(x))
                else:
                    y_test = self.X_test[target].apply(lambda x: int(x))
                if self.use_feature_engineering:
                    X_test = FeatureEngineering(self.X_test[self.features], target).engineer_features().values
                else:
                    X_test = self.X_test[self.features].values

                model = self.trained_pipelines[target]
                y_pred = model.predict(X_test)
                s_f = f1_score(y_test, y_pred)
                s_p = precision_score(y_test, y_pred)
                s_r = recall_score(y_test, y_pred)
                print(f"test f1 {target}", s_f)
                print(f"test recall {target}", s_r)
                print(f"test precision {target}", s_p)

        #pipe = Pipeline(steps=[
        #    ('scaling', StandardScaler()),
        #    ('sampling', SMOTE()),
        #    ('classifier', LogisticRegression(penalty='l1'))])
        #c = ((len(y) - sum(y)) / sum(y))

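        # Meta-model: unless the AND-combiner is used, stack the per-target
        # predictions and fit an XGBoost classifier (with RFE feature selection)
        # on top of them; c is a fixed positive-class weight.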
        if not self.use_and_func:
            c = 10
            pipe = Pipeline(steps=[('feature_selection',
                                    RFE(XGBClassifier(n_estimators=10, scale_pos_weight=c))),
                                   ('clf', XGBClassifier(scale_pos_weight=c))])
            # Stack the per-target predictions column-wise: shape (n_samples, n_targets)
            X = np.column_stack(predictions_list)
            self.combined_model = pipe.fit(X, y)
Example #55
class TargetEnsembler(object):

    def __init__(self, features):
        self.features = features

    def fit(self, X_train, y_train):

        # Intended flow (commented-out sketch):
        #
        # create the list of targets
        # self.pipelines_list = []
        # self.preds = []
        # for target in targets:
        #     X = feature engineering for target
        #     y = df[target]
        #     cv_scores(X, y, pipeline_per_target[target])
        #     model = pipeline_per_target[target].fit(X, y)
        #     self.pipelines_list.append(model)
        #     self.preds.append(model.predict(X))
        #
        # y = df[label]
        # combined_model = LogisticRegression().fit(preds, y)
        # print results...
        #
        # def predict(X):
        #     ...
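        # NOTE: the per-target pipelines (self.pipe_intrusion, self.pipe_avoidance, ...)
        # and the engineered matrices (X_intrusion, X_avoidance, ...) referenced below
        # are assumed to have been built and fitted before this point; they are not
        # created in this snippet.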
        if intrusion:
            y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        else:
            y_pred_intrusion = 1

        if avoidance:
            y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        else:
            y_pred_avoidance = 1

        if hypertension:
            y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        else:
            y_pred_hypertension = 1

        if depression:
            y_pred_depression = self.pipe_depression.predict(X_depression)
        else:
            y_pred_depression = 1

        if only_avoidance:
            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_only_avoidance)
        else:
            y_pred_only_avoidance = 1

        if PCL_Strict3:
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")

        # Combined (stacking) stage: append the sub-model predictions to X_train
        # as extra features and train a meta-classifier on top of them.
        y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
        y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
        y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
        y_pred_regression = self.pipe_regression.predict(X_regression)

        X_train["y_pred_hypertension"] = y_pred_hypertension
        X_train["y_pred_avoidance"] = y_pred_avoidance
        X_train["y_pred_intrusion"] = y_pred_intrusion
        X_train["y_pred_regression"] = y_pred_regression
        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]

        X_combined = X_train[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values
        y_combined = y_train
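        # Meta-classifier: a single DecisionTreeClassifier trained on four selected
        # input columns plus the four sub-model predictions; its precision is
        # estimated with 5-fold stratified CV before the final fit.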
        self.pipe_combined = Pipeline(steps=[
            ('classifier', DecisionTreeClassifier())])
        scores = cross_val_score(self.pipe_combined, X_combined, y_combined, scoring='precision', cv=StratifiedKFold(5))
        print(f"hypertension {sum(scores)/5}")
        self.pipe_combined.fit(X_combined, y_combined)

    def predict(self, X_test):

        if intrusion:
            X_test_intrusion_cutoff = FeatureEngineering(X_test[self.features],
                                                         "intrusion_cutoff").engineer_features().values
            y_pred_intrusion = self.pipe_intrusion.predict(X_test_intrusion_cutoff)
        else:
            y_pred_intrusion = 1

        if avoidance:
            X_test_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                         "avoidance_cutoff").engineer_features().values
            y_pred_avoidance = self.pipe_avoidance.predict(X_test_avoidance_cutoff)
        else:
            y_pred_avoidance = 1

        if hypertension:
            X_test_hypertention_cutoff = FeatureEngineering(X_test[self.features],
                                                            "hypertention_cutoff").engineer_features().values
            y_pred_hypertension = self.pipe_hypertension.predict(X_test_hypertention_cutoff)
        else:
            y_pred_hypertension = 1

        if depression:
            X_test_depression_cutoff = FeatureEngineering(X_test[self.features],
                                                          "depression_cutoff").engineer_features().values
            y_pred_depression = self.pipe_depression.predict(X_test_depression_cutoff)
        else:
            y_pred_depression = 1

        if only_avoidance:
            X_test_only_avoidance_cutoff = FeatureEngineering(X_test[self.features],
                                                              "only_avoidance_cutoff").engineer_features().values

            y_pred_only_avoidance = self.pipe_only_avoidance.predict(X_test_only_avoidance_cutoff)
        else:
            y_pred_only_avoidance = 1

        if PCL_Strict3:
            X_test_PCL_Strict3 = FeatureEngineering(X_test[self.features], "PCL_Strict3").engineer_features().values
            y_pred_PCL_Strict3 = self.pipe_PCL_Strict3.predict(X_test_PCL_Strict3)
        else:
            y_pred_PCL_Strict3 = 1

        if regression_cutoff_33:
            X_test_regression_cutoff_33 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_33").engineer_features().values
            y_pred_regression_cutoff_33 = self.pipe_regression_cutoff_33.predict(X_test_regression_cutoff_33)
        else:
            y_pred_regression_cutoff_33 = 1

        if regression_cutoff_50:
            X_test_regression_cutoff_50 = FeatureEngineering(X_test[self.features],
                                                             "regression_cutoff_50").engineer_features().values
            y_pred_regression_cutoff_50 = self.pipe_regression_cutoff_50.predict(X_test_regression_cutoff_50)
        else:
            y_pred_regression_cutoff_50 = 1

        if tred_cutoff:
            X_test_tred_cutoff = FeatureEngineering(X_test[self.features], "tred_cutoff").engineer_features().values
            y_pred_tred_cutoff = self.pipe_tred_cutoff.predict(X_test_tred_cutoff)
        else:
            y_pred_tred_cutoff = 1

        y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion & y_pred_depression &
                  y_pred_only_avoidance & y_pred_PCL_Strict3 & y_pred_regression_cutoff_33 &
                  y_pred_regression_cutoff_50 & y_pred_tred_cutoff)

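        # NOTE: the AND-combined y_pred above is overwritten by the meta-classifier
        # below. As written, X_test is also assumed to already contain the y_pred_*
        # columns (fit() adds them to X_train); otherwise the column selection below
        # would raise a KeyError.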
        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]

        X_combined = X_test[['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds].values

        y_pred = self.pipe_combined.predict(X_combined)
        return y_pred
Example #56
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)
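    # Mult (defined elsewhere in the test module) is assumed to be a toy
    # estimator that multiplies its input by `mult` and supports fit/transform,
    # predict and inverse_transform, as exercised by the assertions below.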

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    expected_params = {'steps': pipeline.steps,
                       'm2': mult2,
                       'm3': None,
                       'last': mult5,
                       'memory': None,
                       'm2__mult': 2,
                       'last__mult': 5}
    assert pipeline.get_params(deep=True) == expected_params

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ['predict_proba', 'predict_log_proba',
                     'decision_function', 'transform', 'score']
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    pipeline.fit(X, y)
    pipeline.transform(X)
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    with raises(AttributeError, match="has no attribute 'predict'"):
        getattr(pipeline, 'predict')

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))