def test_pipeline_methods_preprocessing_svm():
    """Check output shapes of the pipeline methods (preprocessing + svm).

    A two-step pipeline is fitted on iris twice, once with a
    ``StandardScaler`` and once with a 2-component ``PCA`` as the
    preprocessing step, followed by an ``SVC`` built with
    ``probability=True``. The shape returned by each prediction method is
    checked against the number of samples and classes.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # Check shapes of the various prediction functions. Plain asserts
        # are used instead of the nose-style ``assert_equal`` helper.
        predict = pipe.predict(X)
        assert predict.shape == (n_samples,)

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        # Only checks that scoring runs; the value itself is not asserted.
        pipe.score(X, y)
def test_pipeline_sample_weight_unsupported():
    """``sample_weight`` must not be forwarded to ``score`` when it is None.

    The final step is ``Mult``; passing a real weight array is expected to
    raise ``TypeError`` mentioning an unexpected keyword argument.
    """
    data = np.array([[1, 2]])
    steps = [('transf', Transf()), ('clf', Mult())]
    pipeline = Pipeline(steps)
    pipeline.fit(data, y=None)

    # Scoring without a weight, or with an explicit None, gives the same value.
    unweighted = pipeline.score(data)
    assert unweighted == 3
    explicit_none = pipeline.score(data, sample_weight=None)
    assert explicit_none == 3

    weights = np.array([2, 3])
    with raises(TypeError, match="unexpected keyword argument"):
        pipeline.score(data, sample_weight=weights)
def test_pipeline_sample_weight_supported():
    """``Pipeline.score`` should pass ``sample_weight`` on to the final step."""
    data = np.array([[1, 2]])
    pipeline = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipeline.fit(data, y=None)

    # Each of these calls carries no effective weight and must score 3.
    for extra in ({}, {'y': None}, {'y': None, 'sample_weight': None}):
        assert pipeline.score(data, **extra) == 3

    # With an actual weight array the score changes to 8.
    weights = np.array([2, 3])
    assert pipeline.score(data, sample_weight=weights) == 8
def test_pipeline_methods_pca_svm():
    """Smoke-test the pipeline methods on iris with PCA followed by SVC."""
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    # PCA + SVC (probability estimates enabled so predict_proba is available)
    svc = SVC(probability=True, random_state=0)
    reducer = PCA()
    pipeline = Pipeline([('pca', reducer), ('svc', svc)])
    pipeline.fit(features, labels)

    # Only checks that each method runs; return values are not inspected.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipeline, method_name)(features)
    pipeline.score(features, labels)
def test_pipeline_methods_anova():
    """Smoke-test the pipeline methods with ANOVA selection + logistic model."""
    iris = load_iris()
    samples, targets = iris.data, iris.target

    # Anova (keep the 2 best features) + LogisticRegression
    classifier = LogisticRegression()
    selector = SelectKBest(f_classif, k=2)
    steps = [('anova', selector), ('logistic', classifier)]
    pipeline = Pipeline(steps)

    pipeline.fit(samples, targets)
    pipeline.predict(samples)
    pipeline.predict_proba(samples)
    pipeline.predict_log_proba(samples)
    pipeline.score(samples, targets)
def test_pipeline_methods_pca_svm():
    """Smoke-test the pipeline methods on iris with whitened PCA + SVC."""
    dataset = load_iris()
    features = dataset.data
    labels = dataset.target

    # PCA (full SVD, 'mle' component selection, whitening) + SVC
    svc_params = {'gamma': 'scale', 'probability': True, 'random_state': 0}
    pca_params = {'svd_solver': 'full', 'n_components': 'mle', 'whiten': True}
    svc = SVC(**svc_params)
    reducer = PCA(**pca_params)

    pipeline = Pipeline([('pca', reducer), ('svc', svc)])
    pipeline.fit(features, labels)

    # Each call only has to succeed; results are discarded.
    pipeline.predict(features)
    pipeline.predict_proba(features)
    pipeline.predict_log_proba(features)
    pipeline.score(features, labels)
def test_pipeline_methods_anova_rus():
    """Smoke-test under-sampling + ANOVA + logistic regression in a pipeline."""
    dataset_params = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    X, y = make_classification(**dataset_params)

    # RandomUnderSampling + Anova + LogisticRegression
    logistic = LogisticRegression()
    sampler = RandomUnderSampler(random_state=0)
    selector = SelectKBest(f_classif, k=2)
    pipeline = Pipeline([('rus', sampler),
                         ('anova', selector),
                         ('logistic', logistic)])

    pipeline.fit(X, y)
    # Only checks that each method runs on the full data set.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipeline, method_name)(X)
    pipeline.score(X, y)
def test_pipeline_methods_rus_pca_svm():
    """Smoke-test a pipeline that under-samples, then applies PCA, then SVC."""
    features, labels = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    # Sampler first, then PCA, then a probability-enabled SVC
    svc = SVC(probability=True, random_state=0)
    reducer = PCA()
    sampler = RandomUnderSampler(random_state=0)
    steps = [('rus', sampler), ('pca', reducer), ('svc', svc)]
    pipeline = Pipeline(steps)

    pipeline.fit(features, labels)
    pipeline.predict(features)
    pipeline.predict_proba(features)
    pipeline.predict_log_proba(features)
    pipeline.score(features, labels)
def test_pipeline_memory_transformer():
    """Check that a pipeline built with ``memory`` reuses its fitted transformer.

    An uncached reference pipeline and a cached pipeline are fitted on iris
    and must give identical predictions, scores and ``means_``. The cached
    pipeline is then refitted, and a second cached pipeline with a different
    step name is fitted; in both cases the transformer's ``timestamp_`` must
    equal the one recorded after the first fit, which the test treats as
    proof that the cache was read instead of refitting.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        # Pass the cache directory positionally: joblib deprecated the
        # ``cachedir`` keyword in favour of ``location`` and later removed
        # it, whereas the first positional argument works with both.
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        def assert_same_as_uncached(other, transf_name):
            # ``other`` must agree with the uncached reference ``pipe``.
            for method in ('predict', 'predict_proba', 'predict_log_proba'):
                assert_array_equal(getattr(pipe, method)(X),
                                   getattr(other, method)(X))
            assert_array_equal(pipe.score(X, y), other.score(X, y))
            assert_array_equal(pipe.named_steps['transf'].means_,
                               other.named_steps[transf_name].means_)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_same_as_uncached(cached_pipe, 'transf')
        # The instance handed to the cached pipeline must stay unfitted.
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting a second time
        cached_pipe.fit(X, y)
        assert_same_as_uncached(cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # Create a new pipeline with fresh estimators and check that even
        # changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)
        assert_same_as_uncached(cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        # Always remove the temporary cache directory.
        shutil.rmtree(cachedir)
def test_pipeline_methods_pca_rus_svm():
    """Smoke-test a pipeline that applies PCA, then under-samples, then SVC."""
    generator_kwargs = {
        'n_classes': 2,
        'class_sep': 2,
        'weights': [0.1, 0.9],
        'n_informative': 3,
        'n_redundant': 1,
        'flip_y': 0,
        'n_features': 20,
        'n_clusters_per_class': 1,
        'n_samples': 5000,
        'random_state': 0,
    }
    data, target = make_classification(**generator_kwargs)

    # PCA first, sampler second, probability-enabled SVC last
    svc = SVC(probability=True, random_state=0)
    reducer = PCA()
    sampler = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('pca', reducer), ('rus', sampler), ('svc', svc)])

    pipeline.fit(data, target)
    # Results are discarded; the calls only need to succeed.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipeline, method_name)(data)
    pipeline.score(data, target)
def test_pipeline_methods_anova_rus():
    """Smoke-test under-sampling + ANOVA + lbfgs logistic regression."""
    generator_kwargs = {
        "n_classes": 2,
        "class_sep": 2,
        "weights": [0.1, 0.9],
        "n_informative": 3,
        "n_redundant": 1,
        "flip_y": 0,
        "n_features": 20,
        "n_clusters_per_class": 1,
        "n_samples": 5000,
        "random_state": 0,
    }
    features, labels = make_classification(**generator_kwargs)

    # RandomUnderSampling + Anova + LogisticRegression
    logistic = LogisticRegression(solver="lbfgs")
    sampler = RandomUnderSampler(random_state=0)
    selector = SelectKBest(f_classif, k=2)
    steps = [("rus", sampler), ("anova", selector), ("logistic", logistic)]
    pipeline = Pipeline(steps)

    pipeline.fit(features, labels)
    pipeline.predict(features)
    pipeline.predict_proba(features)
    pipeline.predict_log_proba(features)
    pipeline.score(features, labels)
tree.DecisionTreeClassifier(), RandomForestClassifier(max_depth=2, random_state=42), BaggingClassifier(base_estimator=SVC(), n_estimators=20, random_state=42), AdaBoostClassifier(n_estimators=200), GradientBoostingClassifier(random_state=42), xgb.XGBClassifier(random_state=42,learning_rate=0.01) ] # For each classifier in classifiers, scale the inputs and then apply the algorithm. # Then, fit the algorithm and print its score. for classifier in classifiers: pipe = Pipeline(steps=[('scale',StandardScaler()), ('classifier', classifier)]) pipe.fit(X_train, y_train.values.ravel()) print(classifier) print("model score: %.3f" % pipe.score(X_test, y_test)) # - # Examining the output above, Logistic Regression, Gradient Boosted Trees and the AdaBoost Classifier seem to be our best bets for a good classification algorithm, as Gaussian and LDA/QDA do not work well due to the implicit correlation present in our dummy variables that cannot be easily removed. # # Examining the Classifiers in Depth # ## Definitions of some metrics # * Accuracy: Accuracy in this case refers to the proportion of cases that your algorithm has predicted correctly. We don't actively use this statistic in this run because it can be meaningless for imbalanced datasets: the fraction of correct observations doesn't take into account the skewed nature of the data. For those cases, Python does have an adjusted accuracy metric that accounts for the imbalanced nature of the data. However, as I believe the cost of misdiagnosis is more severe for the company, we focus on recall as the key scoring metric. # * Confusion Matrix: A matrix which outlines in a table the number of true positives, false positives, true negatives and false negatives predicted by the classification algorithm. # * Precision: The fraction of observations predicted as positive that are actually positive, i.e. true positives / (true positives + false positives). # * Recall: The fraction of all actual positive samples that your classifier predicted correctly, i.e. true positives / (true positives + false negatives). 
# * F_1 Score: Harmonic mean of precision and recall # * ROC: Receiver Operating Characteristic, essentially a measure of the performance of your classifier that compares your true positive rate to your false positive rate over a series of thresholds.
def test_pipeline_memory_sampler():
    """Check that a pipeline built with ``memory`` reuses its fitted sampler.

    An uncached reference pipeline and a cached pipeline, each made of a
    ``DummySampler`` and an SVC, are fitted on a generated two-class data set
    and must give identical predictions, scores and ``means_``. The cached
    pipeline is then refitted, and a second cached pipeline with a different
    step name is fitted; in both cases the sampler's ``timestamp_`` must
    equal the one recorded after the first fit, which the test treats as
    proof that the cache was read instead of refitting.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    cachedir = mkdtemp()
    try:
        # Pass the cache directory positionally: joblib deprecated the
        # ``cachedir`` keyword in favour of ``location`` and later removed
        # it, whereas the first positional argument works with both.
        memory = Memory(cachedir, verbose=10)
        # Test with Sampler + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        def assert_same_as_uncached(other, transf_name):
            # ``other`` must agree with the uncached reference ``pipe``.
            for method in ('predict', 'predict_proba', 'predict_log_proba'):
                assert_array_equal(getattr(pipe, method)(X),
                                   getattr(other, method)(X))
            assert_array_equal(pipe.score(X, y), other.score(X, y))
            assert_array_equal(pipe.named_steps['transf'].means_,
                               other.named_steps[transf_name].means_)

        # Memoize the sampler at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the sampler in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_same_as_uncached(cached_pipe, 'transf')
        # The instance handed to the cached pipeline must stay unfitted.
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting a second time
        cached_pipe.fit(X, y)
        assert_same_as_uncached(cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # Create a new pipeline with fresh estimators and check that even
        # changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)
        assert_same_as_uncached(cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        # Always remove the temporary cache directory.
        shutil.rmtree(cachedir)
def logistic_numeric(df):
    """Fit and report a SMOTE + logistic-regression model for ``bsa_dummy``.

    In the target column (``bsa_dummy``), 0 stands for bsa obtained and 1
    stands for bsa not obtained.

    Steps: drop unused columns, forward-fill missing values, convert the
    comma-decimal string columns to float, preprocess (degree-2 polynomial
    features + standard scaling for numeric columns, one-hot encoding for
    categorical columns), oversample with SMOTE, fit a logistic regression
    on a train split, then print and append the train/test scores, confusion
    matrix and classification report to ``report.txt``, write the sorted
    coefficients to ``feature_importance.txt`` and show them as a bar plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the feature columns listed below, the columns
        to discard and the ``bsa_dummy`` target. The frame passed in is not
        modified.

    Returns
    -------
    None
    """
    report_path = (
        '../../results/output/classification_reports/logistic regression/report.txt'
    )
    importance_path = (
        '../../results/output/classification_reports/logistic regression/feature_importance.txt'
    )

    # Remove extra columns. ``drop`` returns a new frame, so the caller's
    # DataFrame is left intact (``del df[...]`` used to mutate it).
    df = df.drop(columns=['language', 'motivation', 'program',
                          'studentnr_crypt'])
    # ``ffill`` replaces the deprecated ``fillna(method='ffill')`` spelling.
    df = df.ffill()

    # Select categorical features
    categorical_features = [
        'cohort', 'field', 'prior_educ', 'previously_enrolled',
        'multiple_requests', 'gender', 'interest', 'ase', 'reenrolled', 'year'
    ]

    # Select numeric features
    numeric_features = [
        'age', 'HSGPA', 'WC', 'WPS', 'Sixltr', 'Dic', 'funct', 'pronoun',
        'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'verb',
        'auxverb', 'past', 'present', 'future', 'adverb', 'preps', 'conj',
        'negate', 'quant', 'number', 'swear', 'social', 'family', 'friend',
        'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
        'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'inhib',
        'incl', 'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
        'health', 'sexual', 'ingest', 'relativ', 'motion', 'space', 'time',
        'work', 'achieve', 'leisure', 'home', 'money', 'relig', 'death',
        'assent', 'nonfl', 'filler', 'pronadv', 'shehethey', 'AllPunc',
        'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash',
        'Quote', 'Apostro', 'Parenth', 'OtherP', 'count_punct',
        'count_stopwords', 'nr_token', 'nr_adj', 'nr_noun', 'nr_verb',
        'nr_number', 'topic1', 'topic2', 'topic3', 'topic4', 'topic5',
        'topic6', 'topic7', 'topic8', 'topic9', 'topic10', 'topic11',
        'topic12', 'topic13', 'topic14', 'topic15'
    ]

    # Change object (string) type of features to float
    change_type = [
        'WPS', 'Sixltr', 'Dic', 'funct', 'pronoun', 'ppron', 'i', 'we', 'you',
        'shehe', 'they', 'ipron', 'article', 'verb', 'auxverb', 'past',
        'present', 'future', 'adverb', 'preps', 'conj', 'negate', 'quant',
        'number', 'swear', 'social', 'family', 'friend', 'humans', 'affect',
        'posemo', 'negemo', 'anx', 'anger', 'sad', 'cogmech', 'insight',
        'cause', 'discrep', 'tentat', 'certain', 'inhib', 'incl', 'excl',
        'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual',
        'ingest', 'relativ', 'motion', 'space', 'time', 'work', 'achieve',
        'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl',
        'filler', 'pronadv', 'shehethey', 'AllPunc', 'Period', 'Comma',
        'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro',
        'Parenth', 'OtherP'
    ]
    # Decimal commas become decimal points before the float conversion.
    df[change_type] = df[change_type].apply(
        lambda x: x.str.replace(',', '.'))
    df[change_type] = df[change_type].astype(float).fillna(0.0)

    # Scaling features
    # Apply polynomial features and standard scaler to numerical features
    numeric_transformer = Pipeline(steps=[
        ('poly', PolynomialFeatures(degree=2)),
        ('scaler', StandardScaler()),
    ])

    # Apply one hot-encoding for categorical columns
    categorical_transformer = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine both numerical and categorical columns
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

    # Define the SMOTE and Logistic Regression algorithms
    smt = SMOTE(random_state=42)
    lor = LogisticRegression(solver='sag', C=50)

    # Chain all the steps using the Pipeline module
    clf = Pipeline([('preprocessor', preprocessor), ('smt', smt),
                    ('lor', lor)])

    # Split the data into train and test folds and fit the train set
    y = df['bsa_dummy']
    X = df.drop('bsa_dummy', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=50)
    clf.fit(X_train, y_train)

    # Evaluate once and reuse the results for both stdout and the report.
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    clf_predicted = clf.predict(X_test)
    confusion = confusion_matrix(y_test, clf_predicted)
    report = classification_report(y_test, clf_predicted,
                                   target_names=['0', '1'])

    print('train score: ', train_score)
    print('test score: ', test_score)
    print(confusion)
    print(report)

    # Append to the report in a single open; the text written matches what
    # the four separate appends used to produce.
    with open(report_path, 'a+') as f:
        print('train score: ', train_score, file=f)
        print('\n', file=f)
        print('test score: ', test_score, file=f)
        print('\n', confusion, file=f)
        print('\n', report, file=f)

    # Extract feature importance. The classifier is fitted on the
    # preprocessor's output (polynomial terms and one-hot columns), not on
    # the raw columns, so the coefficient labels must come from the fitted
    # preprocessor. Zipping against the raw column names mislabelled the
    # coefficients and silently dropped every one past the raw column count.
    coefficients = clf.named_steps['lor'].coef_[0, :]
    try:
        feature_names = list(
            clf.named_steps['preprocessor'].get_feature_names_out())
    except AttributeError:
        # scikit-learn without ``get_feature_names_out``: use positional
        # labels rather than pairing coefficients with wrong names.
        feature_names = [
            'feature_{}'.format(i) for i in range(len(coefficients))
        ]

    # Sort coefficients in ascending order (stable sort keeps tie order)
    feature_importance = pd.Series(
        coefficients, index=feature_names).sort_values(kind='mergesort')

    with open(importance_path, 'w') as f:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print(feature_importance, file=f)

    # Plot feature importance
    # NOTE(review): every expanded feature is plotted, so the figure may be
    # crowded; consider limiting it to the largest coefficients.
    feature_importance.plot.barh(figsize=(15, 25))
    plt.show()
def test_pipeline_memory_transformer():
    """Verify transformer caching in a pipeline constructed with ``memory``.

    The cached pipeline must match an uncached reference after the first
    fit, after a second fit, and when a second cached pipeline with a
    renamed step is fitted; in the last two cases the transformer's
    ``timestamp_`` has to equal the one recorded after the first fit.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        def check_against_reference(candidate, step_name):
            # Predictions, score and fitted means must match the uncached pipe.
            assert_array_equal(pipe.predict(X), candidate.predict(X))
            assert_array_equal(pipe.predict_proba(X), candidate.predict_proba(X))
            assert_array_equal(
                pipe.predict_log_proba(X), candidate.predict_log_proba(X)
            )
            assert_array_equal(pipe.score(X, y), candidate.score(X, y))
            assert_array_equal(
                pipe.named_steps["transf"].means_,
                candidate.named_steps[step_name].means_,
            )

        # First fit memoizes the transformer
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Remember the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        check_against_reference(cached_pipe, "transf")
        assert not hasattr(transf, "means_")

        # Second fit should read from the cache
        cached_pipe.fit(X, y)
        check_against_reference(cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # A new pipeline with fresh estimators and a renamed step must
        # still hit the cache
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)
        check_against_reference(cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
# Fit the final kNN pipeline: drop constant features, standardise,
# oversample, then classify with k-nearest neighbours.
# Parameters recorded alongside this pipeline (presumably the result of an
# earlier search; confirm): {'kNN__n_neighbors': 22, 'selector__threshold': 0}
# This used to be a bare dict expression, which had no effect when run.
pipe = Pipeline(steps=[('selector', VarianceThreshold(threshold=0)),
                       ('scaler', StandardScaler()),
                       ('sampler', RandomOverSampler()),
                       ('kNN', KNeighborsClassifier(n_jobs=-1, n_neighbors=22))])

# perf_counter is monotonic and high-resolution, so the measured intervals
# are not affected by system clock adjustments as time.time() can be.
start_time = time.perf_counter()
pipe.fit(x_train, y_train)
fit_times['knn'] = time.perf_counter() - start_time

start_time = time.perf_counter()
new_predictions['knn'] = pipe.predict(x_test)
pred_times['knn'] = time.perf_counter() - start_time

new_accuracies['knn'] = pipe.score(x_test, y_test)

print("\nf1-micro average of Knn Classifier:", f1_score(y_test, new_predictions['knn'], average='micro'))
print("\nf1-macro average of Knn Classifier:", f1_score(y_test, new_predictions['knn'], average='macro'))
print()
print("\nConfusion Matrix of Knn:\n", confusion_matrix(y_test, new_predictions['knn']))
print("\nClassification Report of Knn:\n", classification_report(y_test, new_predictions['knn']))
print("\nFit time:\n", fit_times['knn'])
print("\nPrediction time:\n", pred_times['knn'], "\n")

# The string below is an exported notebook markdown cell, kept verbatim.
# English: "Optimize GNB. For Gaussian Naive Bayes we have no
# hyperparameters to optimize; nevertheless we will use cross validation to
# find the appropriate parameters that optimize it."
"""### Optimize GNB Για τον Gaussian Naive Bayes δεν έχουμε υπερπαραμέτρους να βελτιστοποιήσουμε, παρόλα αυτά θα χρησιμοποιήσουμε cross validation για να βρούμε τις κατάλληλες παραμετρους του που τον βελτιστοποιούν. """

# First we will try it with all 4 preprocessing steps
def test_pipeline_memory_sampler():
    """Verify sampler caching in a pipeline constructed with ``memory``.

    The cached pipeline must match an uncached reference after the first
    fit, after a second fit, and when a second cached pipeline with a
    renamed step is fitted; in the last two cases the sampler's
    ``timestamp_`` has to equal the one recorded after the first fit.
    """
    dataset_kwargs = {
        "n_classes": 2,
        "class_sep": 2,
        "weights": [0.1, 0.9],
        "n_informative": 3,
        "n_redundant": 1,
        "flip_y": 0,
        "n_features": 20,
        "n_clusters_per_class": 1,
        "n_samples": 5000,
        "random_state": 0,
    }
    X, y = make_classification(**dataset_kwargs)
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Sampler + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        def check_against_reference(candidate, step_name):
            # Predictions, score and fitted means must match the uncached pipe.
            assert_array_equal(pipe.predict(X), candidate.predict(X))
            assert_array_equal(pipe.predict_proba(X), candidate.predict_proba(X))
            assert_array_equal(
                pipe.predict_log_proba(X), candidate.predict_log_proba(X)
            )
            assert_array_equal(pipe.score(X, y), candidate.score(X, y))
            assert_array_equal(
                pipe.named_steps["transf"].means_,
                candidate.named_steps[step_name].means_,
            )

        # First fit memoizes the sampler
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Remember the time stamp of the sampler in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        check_against_reference(cached_pipe, "transf")
        assert not hasattr(transf, "means_")

        # Second fit should read from the cache
        cached_pipe.fit(X, y)
        check_against_reference(cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # A new pipeline with fresh estimators and a renamed step must
        # still hit the cache
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)
        check_against_reference(cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
clf.best_score_ #print best score from grid search ap_score(y_test, model.predict(X_test)) f1_score(y_test, model.predict(X_test)) matthews_corrcoef(y_test, model.predict(X_test)) accuracy_score(y_test, clf.predict(X_test)) confusion_matrix(y_test, model.predict(X_test)) recall_score(y_test, model.predict(X_test)) #plot scores = [x[1] for x in clf.grid_scores_] scores = np.array(scores).reshape(len(Cs), len(Gammas)) for ind, i in enumerate(Cs): plt.plot(Gammas, scores[ind], label='C: ' + str(i)) plt.legend() plt.xlabel('Gamma') plt.ylabel('Average precision') plt.show() #code for tree, comment out if SVM is used model.fit(X_train, y_train) cv = ms.StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) ms.cross_val_score(model, X_train, y_train, cv=cv) model.score(X_test, y_test) ap_score(y_test, model.predict(X_test)) f1_score(y_test, model.predict(X_test)) matthews_corrcoef(y_test, model.predict(X_test)) accuracy_score(y_test, clf.predict(X_test)) confusion_matrix(y_test, model.predict(X_test)) recall_score(y_test, model.predict(X_test))