예제 #1
0
def test_pipeline_methods_preprocessing_svm():
    """Check the output shapes of (preprocessing + SVC) pipelines on iris.

    The same SVC is chained first after a StandardScaler and then after a
    two-component PCA. For each pipeline the shapes returned by the
    prediction methods are compared with the number of samples and classes.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')
    preprocessors = [StandardScaler(), PCA(n_components=2)]

    for preprocessing in preprocessors:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # Hard predictions: one value per sample.
        assert_equal(pipe.predict(X).shape, (n_samples,))

        # Per-class outputs: one row per sample, one column per class.
        per_class_shape = (n_samples, n_classes)
        for method in ('predict_proba', 'predict_log_proba',
                       'decision_function'):
            assert_equal(getattr(pipe, method)(X).shape, per_class_shape)

        # Scoring only has to run without raising.
        pipe.score(X, y)
예제 #2
0
def test_pipeline_sample_weight_unsupported():
    """Check that ``sample_weight=None`` is not passed on by ``Pipeline.score``.

    Scoring a (Transf + Mult) pipeline must give 3 both without weights and
    with ``sample_weight=None``; passing a real weight array is expected to
    raise a TypeError mentioning an unexpected keyword argument.
    """
    X = np.array([[1, 2]])
    steps = [('transf', Transf()), ('clf', Mult())]
    pipe = Pipeline(steps)
    pipe.fit(X, y=None)

    # Omitting the weights and passing None explicitly must agree.
    for score_kwargs in ({}, {'sample_weight': None}):
        assert pipe.score(X, **score_kwargs) == 3

    weights = np.array([2, 3])
    with raises(TypeError, match="unexpected keyword argument"):
        pipe.score(X, sample_weight=weights)
예제 #3
0
def test_pipeline_sample_weight_supported():
    """Check that ``Pipeline.score`` passes ``sample_weight`` to the last step.

    A (Transf + FitParamT) pipeline must score 3 for every call that carries
    no effective weights and 8 once an actual weight array is supplied.
    """
    X = np.array([[1, 2]])
    steps = [('transf', Transf()), ('clf', FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit(X, y=None)

    # Every spelling of "no weights" must produce the same score.
    unweighted_calls = (
        {},
        {'y': None},
        {'y': None, 'sample_weight': None},
    )
    for score_kwargs in unweighted_calls:
        assert pipe.score(X, **score_kwargs) == 3

    weights = np.array([2, 3])
    assert pipe.score(X, sample_weight=weights) == 8
예제 #4
0
def test_pipeline_methods_pca_svm():
    """Smoke-test the prediction and scoring methods of a PCA + SVC pipeline.

    The pipeline is fitted on iris and every method is called once; the test
    passes as long as none of the calls raises.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    steps = [('pca', PCA()), ('svc', SVC(probability=True, random_state=0))]
    pipe = Pipeline(steps)
    pipe.fit(X, y)

    for method in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method)(X)
    pipe.score(X, y)
예제 #5
0
def test_pipeline_methods_anova():
    """Smoke-test a SelectKBest (ANOVA F-test) + LogisticRegression pipeline.

    The pipeline keeps the two best features of iris, is fitted once, and
    each prediction/scoring method is then called to check that it runs.
    """
    dataset = load_iris()
    features = dataset.data
    labels = dataset.target

    selector = SelectKBest(f_classif, k=2)
    logistic = LogisticRegression()
    pipe = Pipeline([('anova', selector), ('logistic', logistic)])

    pipe.fit(features, labels)
    for method in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method)(features)
    pipe.score(features, labels)
def test_pipeline_methods_pca_svm():
    """Smoke-test a whitened, MLE-sized PCA followed by an SVC on iris.

    Every prediction method and ``score`` is called once after fitting; the
    test only requires that none of them raises.
    """
    iris = load_iris()
    features, labels = iris.data, iris.target

    reducer = PCA(svd_solver='full', n_components='mle', whiten=True)
    classifier = SVC(gamma='scale', probability=True, random_state=0)
    pipe = Pipeline([('pca', reducer), ('svc', classifier)])

    pipe.fit(features, labels)
    for method in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method)(features)
    pipe.score(features, labels)
예제 #7
0
def test_pipeline_methods_anova_rus():
    """Smoke-test RandomUnderSampler + SelectKBest + LogisticRegression.

    A two-class dataset generated with class weights 0.1 / 0.9 is pushed
    through the pipeline; each prediction/scoring method only has to run.
    """
    dataset_params = dict(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                          n_informative=3, n_redundant=1, flip_y=0,
                          n_features=20, n_clusters_per_class=1,
                          n_samples=5000, random_state=0)
    X, y = make_classification(**dataset_params)

    pipe = Pipeline([
        ('rus', RandomUnderSampler(random_state=0)),
        ('anova', SelectKBest(f_classif, k=2)),
        ('logistic', LogisticRegression()),
    ])
    pipe.fit(X, y)

    for method in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method)(X)
    pipe.score(X, y)
예제 #8
0
def test_pipeline_methods_rus_pca_svm():
    """Smoke-test a RandomUnderSampler + PCA + SVC pipeline.

    The sampler is the first step here. The dataset is a generated two-class
    problem with class weights 0.1 / 0.9; every method only has to run.
    """
    dataset_params = dict(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                          n_informative=3, n_redundant=1, flip_y=0,
                          n_features=20, n_clusters_per_class=1,
                          n_samples=5000, random_state=0)
    X, y = make_classification(**dataset_params)

    pipe = Pipeline([
        ('rus', RandomUnderSampler(random_state=0)),
        ('pca', PCA()),
        ('svc', SVC(probability=True, random_state=0)),
    ])
    pipe.fit(X, y)

    for method in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method)(X)
    pipe.score(X, y)
예제 #9
0
def test_pipeline_memory_transformer():
    """Check that a Pipeline given ``memory`` reuses its cached transformer.

    A cached and an uncached (DummyTransf + SVC) pipeline are fitted on iris
    and must give identical predictions, scores and ``means_``. The
    ``timestamp_`` of the cached transformer, recorded after the first fit,
    must be unchanged after a second fit and after fitting a second pipeline
    that uses a different step name but the same ``Memory``.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target

    def check_same_results(reference, candidate, candidate_step):
        # Compare all prediction outputs, the score and the transformer's
        # ``means_`` of ``candidate`` against the uncached ``reference``.
        assert_array_equal(reference.predict(X), candidate.predict(X))
        assert_array_equal(reference.predict_proba(X),
                           candidate.predict_proba(X))
        assert_array_equal(reference.predict_log_proba(X),
                           candidate.predict_log_proba(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    cachedir = mkdtemp()
    try:
        # The cache directory is passed positionally: joblib deprecated the
        # ``cachedir`` keyword in favour of ``location`` and later removed
        # it, while the first positional argument works with both spellings.
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        check_same_results(pipe, cached_pipe, 'transf')
        # The instance handed to the cached pipeline must not carry fitted
        # state itself.
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting a second time
        cached_pipe.fit(X, y)
        check_same_results(pipe, cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # Create a new pipeline with new estimators and check that even
        # changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        check_same_results(pipe, cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        # Always remove the temporary cache directory.
        shutil.rmtree(cachedir)
예제 #10
0
def test_pipeline_methods_pca_rus_svm():
    """Smoke-test a PCA + RandomUnderSampler + SVC pipeline.

    The sampler sits between the PCA and the classifier here. The dataset is
    a generated two-class problem with class weights 0.1 / 0.9; each
    prediction/scoring method only has to run without raising.
    """
    dataset_params = dict(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                          n_informative=3, n_redundant=1, flip_y=0,
                          n_features=20, n_clusters_per_class=1,
                          n_samples=5000, random_state=0)
    X, y = make_classification(**dataset_params)

    pipe = Pipeline([
        ('pca', PCA()),
        ('rus', RandomUnderSampler(random_state=0)),
        ('svc', SVC(probability=True, random_state=0)),
    ])
    pipe.fit(X, y)

    for method in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method)(X)
    pipe.score(X, y)
예제 #11
0
def test_pipeline_methods_anova_rus():
    """Smoke-test RandomUnderSampler + SelectKBest + lbfgs LogisticRegression.

    The data is a generated two-class problem with class weights 0.1 / 0.9;
    after fitting, each prediction/scoring method is called once and only
    has to complete without raising.
    """
    dataset_params = {
        "n_classes": 2,
        "class_sep": 2,
        "weights": [0.1, 0.9],
        "n_informative": 3,
        "n_redundant": 1,
        "flip_y": 0,
        "n_features": 20,
        "n_clusters_per_class": 1,
        "n_samples": 5000,
        "random_state": 0,
    }
    X, y = make_classification(**dataset_params)

    steps = [
        ("rus", RandomUnderSampler(random_state=0)),
        ("anova", SelectKBest(f_classif, k=2)),
        ("logistic", LogisticRegression(solver="lbfgs")),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)

    for method in ("predict", "predict_proba", "predict_log_proba"):
        getattr(pipe, method)(X)
    pipe.score(X, y)
    tree.DecisionTreeClassifier(),
    RandomForestClassifier(max_depth=2, random_state=42),
    BaggingClassifier(base_estimator=SVC(), n_estimators=20, random_state=42),
    AdaBoostClassifier(n_estimators=200),
    GradientBoostingClassifier(random_state=42),
    xgb.XGBClassifier(random_state=42,learning_rate=0.01)
    ]

# Wrap every candidate classifier in a pipeline that standardises the inputs
# first, fit it on the training split and print its score on the test split.
for classifier in classifiers:
    pipe = Pipeline(steps=[
        ('scale', StandardScaler()),
        ('classifier', classifier),
    ])
    # Flatten the target frame to the 1-D array expected by ``fit``.
    pipe.fit(X_train, y_train.values.ravel())
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))
    
# -

# Examining the output above, Logistic Regression, Gradient Boosted Trees and the AdaBoost Classifier seem to be our best bets for a good classification algorithm, as Gaussian and LDA/QDA do not work well due to the implicit correlation present in our dummy variables that cannot be easily removed.

# # Examining the Classifiers in Depth

# ## Definitions of some metrics
# * Accuracy: Accuracy in this case refers to the proportion of cases that your algorithm has predicted correctly. We don't actively use this statistic in this run because it can be meaningless for imbalanced datasets: the fraction of correct observations doesn't take into account the skewed nature of the data. For those cases, Python does have an adjusted accuracy metric that accounts for the imbalanced nature of the data. However, in this case, as I believe the cost of misdiagnosis is more severe for the company, we focus on recall as the key scoring metric.
# * Confusion Matrix: A table that lists the number of true positives, false positives, true negatives and false negatives predicted by the classification algorithm.
# * Precision: The fraction of the observations predicted as positive that are actually positive.
# * Recall: The fraction of all actual positive samples that your classifier predicted correctly.
# * F_1 Score: Harmonic mean of the above two measures (precision and recall).
# * ROC: Receiver Operating Characteristic, essentially a measure of the performance of your classifier that compares your true positive rate to your false positive rate over a series of thresholds.
예제 #13
0
def test_pipeline_memory_sampler():
    """Check that a Pipeline given ``memory`` reuses its cached sampler.

    A cached and an uncached (DummySampler + SVC) pipeline are fitted on a
    generated two-class dataset and must give identical predictions, scores
    and ``means_``. The ``timestamp_`` of the cached sampler, recorded after
    the first fit, must be unchanged after a second fit and after fitting a
    second pipeline that uses a different step name but the same ``Memory``.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    def check_same_results(reference, candidate, candidate_step):
        # Compare all prediction outputs, the score and the sampler's
        # ``means_`` of ``candidate`` against the uncached ``reference``.
        assert_array_equal(reference.predict(X), candidate.predict(X))
        assert_array_equal(reference.predict_proba(X),
                           candidate.predict_proba(X))
        assert_array_equal(reference.predict_log_proba(X),
                           candidate.predict_log_proba(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    cachedir = mkdtemp()
    try:
        # The cache directory is passed positionally: joblib deprecated the
        # ``cachedir`` keyword in favour of ``location`` and later removed
        # it, while the first positional argument works with both spellings.
        memory = Memory(cachedir, verbose=10)
        # Test with Sampler + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the sampler at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the sampler in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        check_same_results(pipe, cached_pipe, 'transf')
        # The instance handed to the cached pipeline must not carry fitted
        # state itself.
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting a second time
        cached_pipe.fit(X, y)
        check_same_results(pipe, cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # Create a new pipeline with new estimators and check that even
        # changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        check_same_results(pipe, cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        # Always remove the temporary cache directory.
        shutil.rmtree(cachedir)
예제 #14
0
def _coefficient_names(preprocessor, n_coefficients):
    """Return one label per model coefficient, never a misaligned one.

    The names are taken from the fitted preprocessor when it can report its
    output columns; if it cannot, or if the count does not match the number
    of coefficients, generic positional labels are returned instead.
    """
    names = []
    get_names = getattr(preprocessor, 'get_feature_names_out', None)
    if get_names is not None:
        try:
            names = list(get_names())
        except (AttributeError, ValueError):
            # Some transformer inside cannot report its output names.
            names = []
    if len(names) != n_coefficients:
        names = ['feature_%d' % i for i in range(n_coefficients)]
    return names


def logistic_numeric(df):
    """Fit a SMOTE + logistic regression pipeline and report its results.

    The frame is cleaned, split into train/test folds, and a pipeline of
    (polynomial + scaled numeric features, one-hot categorical features,
    SMOTE, LogisticRegression) is fitted. Train/test scores, the confusion
    matrix and the classification report are printed and appended to
    ``report.txt``; the sorted coefficients are written to
    ``feature_importance.txt`` and shown as a horizontal bar plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``bsa_dummy``, the four columns that
        are dropped below, and every column named in the feature lists.
        The caller's frame is not modified.

    Returns
    -------
    None
    """
    # In target column (bsa_dummy), 0 stands for bsa obtained and 1 stands for bsa not obtained

    # Remove extra columns (on a copy, so the caller's frame stays intact
    # and the function can be called again with the same frame)
    df = df.drop(
        columns=['language', 'motivation', 'program', 'studentnr_crypt'])

    # Forward-fill missing values
    df = df.ffill()

    # Select categorical features
    categorical_features = [
        'cohort', 'field', 'prior_educ', 'previously_enrolled',
        'multiple_requests', 'gender', 'interest', 'ase', 'reenrolled', 'year'
    ]

    # Select numeric features
    numeric_features = [
        'age', 'HSGPA', 'WC', 'WPS', 'Sixltr', 'Dic', 'funct', 'pronoun',
        'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'verb',
        'auxverb', 'past', 'present', 'future', 'adverb', 'preps', 'conj',
        'negate', 'quant', 'number', 'swear', 'social', 'family', 'friend',
        'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
        'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'inhib',
        'incl', 'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
        'health', 'sexual', 'ingest', 'relativ', 'motion', 'space', 'time',
        'work', 'achieve', 'leisure', 'home', 'money', 'relig', 'death',
        'assent', 'nonfl', 'filler', 'pronadv', 'shehethey', 'AllPunc',
        'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash',
        'Quote', 'Apostro', 'Parenth', 'OtherP', 'count_punct',
        'count_stopwords', 'nr_token', 'nr_adj', 'nr_noun', 'nr_verb',
        'nr_number', 'topic1', 'topic2', 'topic3', 'topic4', 'topic5',
        'topic6', 'topic7', 'topic8', 'topic9', 'topic10', 'topic11',
        'topic12', 'topic13', 'topic14', 'topic15'
    ]

    # Change object (string) type of features to float
    change_type = [
        'WPS', 'Sixltr', 'Dic', 'funct', 'pronoun', 'ppron', 'i', 'we', 'you',
        'shehe', 'they', 'ipron', 'article', 'verb', 'auxverb', 'past',
        'present', 'future', 'adverb', 'preps', 'conj', 'negate', 'quant',
        'number', 'swear', 'social', 'family', 'friend', 'humans', 'affect',
        'posemo', 'negemo', 'anx', 'anger', 'sad', 'cogmech', 'insight',
        'cause', 'discrep', 'tentat', 'certain', 'inhib', 'incl', 'excl',
        'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual',
        'ingest', 'relativ', 'motion', 'space', 'time', 'work', 'achieve',
        'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl',
        'filler', 'pronadv', 'shehethey', 'AllPunc', 'Period', 'Comma',
        'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro',
        'Parenth', 'OtherP'
    ]
    # Decimal commas are turned into decimal points before the cast
    df[change_type] = df[change_type].apply(lambda x: x.str.replace(',', '.'))
    df[change_type] = df[change_type].astype(float).fillna(0.0)

    # Scaling features
    # Apply polynomial features and then standard scaling to numerical features
    numeric_transformer = Pipeline(steps=[('poly', PolynomialFeatures(
        degree=2)), ('scaler', StandardScaler())])

    # Apply one hot-encoding for categorical columns
    categorical_transformer = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine both numerical and categorical column
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features
                       ), ('cat', categorical_transformer,
                           categorical_features)])

    # Define the SMOTE and Logistic Regression algorithms
    smt = SMOTE(random_state=42)
    lor = LogisticRegression(solver='sag', C=50)

    # Chain all the steps using the Pipeline module
    clf = Pipeline([('preprocessor', preprocessor), ('smt', smt),
                    ('lor', lor)])

    # Split the data into train and test folds and fit the train set using chained pipeline
    y = df['bsa_dummy']
    X = df.drop('bsa_dummy', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)
    clf.fit(X_train, y_train)

    # Evaluate once and reuse the results for both the console and the file
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    clf_predicted = clf.predict(X_test)
    confusion = confusion_matrix(y_test, clf_predicted)
    report = classification_report(y_test, clf_predicted,
                                   target_names=['0', '1'])

    print('train score: ', train_score)
    print('test score: ', test_score)
    print(confusion)
    print(report)

    # Append the same information to the report file in a single pass
    report_path = '../../results/output/classification_reports/logistic regression/report.txt'
    with open(report_path, 'a+') as f:
        print('train score: ', train_score, file=f)
        print('\n', file=f)
        print('test score: ', test_score, file=f)
        print('\n', confusion, file=f)
        print('\n', report, file=f)

    # Extract feature importance. The model is fitted on the preprocessor's
    # output (polynomial terms and one-hot columns), so the labels must come
    # from the fitted preprocessor: zipping the coefficients with the raw
    # input column names would mislabel them and silently drop the rest.
    coefficients = clf.named_steps['lor'].coef_[0, :]
    feature_names = _coefficient_names(clf.named_steps['preprocessor'],
                                       len(coefficients))

    # Sort ascending by coefficient; mergesort keeps ties in original order
    feature_importance = pd.Series(
        coefficients, index=feature_names).sort_values(kind='mergesort')
    with open(
            '../../results/output/classification_reports/logistic regression/feature_importance.txt',
            'w') as f:
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            print(feature_importance, file=f)

    # Plot feature importance
    # NOTE(review): there is now one bar per expanded feature, which can be
    # a very large number with degree-2 polynomial terms - consider plotting
    # only the largest coefficients.
    feature_importance.plot.barh(figsize=(15, 25))
    plt.show()
예제 #15
0
def test_pipeline_memory_transformer():
    """Check that a Pipeline given ``memory`` reuses its cached transformer.

    A cached and an uncached (DummyTransf + SVC) pipeline are fitted on iris
    and must give identical predictions, scores and ``means_``. The
    ``timestamp_`` of the cached transformer, recorded after the first fit,
    must be unchanged after a second fit and after fitting a second pipeline
    that uses a different step name but the same ``Memory``.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    def assert_same_outputs(expected, actual, actual_step):
        # Every prediction method, the score and the transformer's means_
        # must match between the uncached and the cached pipeline.
        for method in ("predict", "predict_proba", "predict_log_proba"):
            assert_array_equal(getattr(expected, method)(X),
                               getattr(actual, method)(X))
        assert_array_equal(expected.score(X, y), actual.score(X, y))
        assert_array_equal(expected.named_steps["transf"].means_,
                           actual.named_steps[actual_step].means_)

    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Transformer + SVC, once without and once with caching
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # The first fit memoizes the transformer
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        assert_same_outputs(pipe, cached_pipe, "transf")
        # The instance handed to the cached pipeline must not be fitted
        assert not hasattr(transf, "means_")

        # A second fit must read the cache: same results, same time stamp
        cached_pipe.fit(X, y)
        assert_same_outputs(pipe, cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # New estimators under a different step name must still hit the cache
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        assert_same_outputs(pipe, cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
예제 #16
0
# Fit, time and evaluate the kNN pipeline, storing the results under 'knn'.

pipe = Pipeline(steps=[('selector', VarianceThreshold(threshold=0)),
                ('scaler', StandardScaler()), ('sampler', RandomOverSampler()),
                ('kNN', KNeighborsClassifier(n_jobs=-1, n_neighbors=22))])
# Hyper-parameters hard-coded in the pipeline above (presumably the best
# parameters reported by an earlier search - confirm):
# {'kNN__n_neighbors': 22, 'selector__threshold': 0}

# perf_counter is used for the durations because it is monotonic; wall-clock
# time can jump if the system clock is adjusted during the measurement.
start_time = time.perf_counter()
pipe.fit(x_train,y_train)
fit_times['knn'] = time.perf_counter()-start_time

start_time = time.perf_counter()
new_predictions['knn'] = pipe.predict(x_test)
pred_times['knn'] = time.perf_counter()-start_time

new_accuracies['knn'] = pipe.score(x_test,y_test)

print("\nf1-micro average of Knn Classifier:",f1_score(y_test,new_predictions['knn'], average='micro'))
print("\nf1-macro average of Knn Classifier:",f1_score(y_test,new_predictions['knn'], average='macro'))
print()
print("\nConfusion Matrix of Knn:\n",confusion_matrix(y_test,new_predictions['knn']))
print("\nClassification Report of Knn:\n",classification_report(y_test,new_predictions['knn']))
print("\nFit time:\n",fit_times['knn'])
print("\nPrediction time:\n",pred_times['knn'],"\n")

"""### Optimize GNB

Για τον Gaussian Naive Bayes δεν έχουμε υπερπαραμέτρους να βελτιστοποιήσουμε, παρόλα αυτά θα χρησιμοποιήσουμε cross validation για να βρούμε τις κατάλληλες παραμετρους του που τον βελτιστοποιούν.
"""

# First, we will try all 4 preprocessing steps
예제 #17
0
def test_pipeline_memory_sampler():
    """Check that a Pipeline given ``memory`` reuses its cached sampler.

    A cached and an uncached (DummySampler + SVC) pipeline are fitted on a
    generated two-class dataset and must give identical predictions, scores
    and ``means_``. The ``timestamp_`` of the cached sampler, recorded after
    the first fit, must be unchanged after a second fit and after fitting a
    second pipeline that uses a different step name but the same ``Memory``.
    """
    dataset_params = {
        "n_classes": 2,
        "class_sep": 2,
        "weights": [0.1, 0.9],
        "n_informative": 3,
        "n_redundant": 1,
        "flip_y": 0,
        "n_features": 20,
        "n_clusters_per_class": 1,
        "n_samples": 5000,
        "random_state": 0,
    }
    X, y = make_classification(**dataset_params)

    def assert_same_outputs(expected, actual, actual_step):
        # Every prediction method, the score and the sampler's means_ must
        # match between the uncached and the cached pipeline.
        for method in ("predict", "predict_proba", "predict_log_proba"):
            assert_array_equal(getattr(expected, method)(X),
                               getattr(actual, method)(X))
        assert_array_equal(expected.score(X, y), actual.score(X, y))
        assert_array_equal(expected.named_steps["transf"].means_,
                           actual.named_steps[actual_step].means_)

    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Sampler + SVC, once without and once with caching
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # The first fit memoizes the sampler
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Time stamp of the sampler in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        assert_same_outputs(pipe, cached_pipe, "transf")
        # The instance handed to the cached pipeline must not be fitted
        assert not hasattr(transf, "means_")

        # A second fit must read the cache: same results, same time stamp
        cached_pipe.fit(X, y)
        assert_same_outputs(pipe, cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # New estimators under a different step name must still hit the cache
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        assert_same_outputs(pipe, cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
# Notebook-style evaluation: the bare expressions below compute the metrics
# but only display a value when run interactively.
clf.best_score_  # best score from grid search
# Predict once and reuse the result for every metric.
model_pred = model.predict(X_test)
ap_score(y_test, model_pred)
f1_score(y_test, model_pred)
matthews_corrcoef(y_test, model_pred)
# NOTE(review): accuracy is computed from ``clf`` while every other metric
# uses ``model`` - confirm this is intended.
accuracy_score(y_test, clf.predict(X_test))
confusion_matrix(y_test, model_pred)
recall_score(y_test, model_pred)

# plot
# ``grid_scores_`` was removed from scikit-learn; ``cv_results_`` holds the
# mean validation score of every parameter combination in the same order.
# The old attribute is kept as a fallback for installations that lack it.
if hasattr(clf, 'cv_results_'):
    scores = clf.cv_results_['mean_test_score']
else:
    scores = [x[1] for x in clf.grid_scores_]
# Assumes the grid iterates C in the outer loop and gamma in the inner one,
# giving one row of scores per value of C - TODO confirm against the grid.
scores = np.array(scores).reshape(len(Cs), len(Gammas))

for ind, i in enumerate(Cs):
    plt.plot(Gammas, scores[ind], label='C: ' + str(i))
plt.legend()
plt.xlabel('Gamma')
plt.ylabel('Average precision')
plt.show()

# code for tree, comment out if SVM is used
model.fit(X_train, y_train)
cv = ms.StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
ms.cross_val_score(model, X_train, y_train, cv=cv)
model.score(X_test, y_test)
# Predict once with the refitted model and reuse the result.
model_pred = model.predict(X_test)
ap_score(y_test, model_pred)
f1_score(y_test, model_pred)
matthews_corrcoef(y_test, model_pred)
# NOTE(review): this still scores ``clf`` although the section is meant for
# the tree ``model`` - likely a copy-paste leftover; confirm.
accuracy_score(y_test, clf.predict(X_test))
confusion_matrix(y_test, model_pred)
recall_score(y_test, model_pred)