Example #1
def try_linear_svc():
    """
    SVC with an RBF kernel (despite the function name, LinearSVC is not used here)
    """

    print("SVC with RBF kernel")
    from sklearn.svm import SVC

    clf_lr = SVC(kernel='rbf', C=1000)
    clf_lr.fit(features_train,labels_train)
    pred = clf_lr.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)

    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()
    dict_results = { "classifier": "linear svc, rbf", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_lr
Example #2
def try_ada_boost_decision_tree():
    """ 
    AdaBoost applied to Decision Tree
    """

    print("AdaBoost to Decision Tree")
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.model_selection import GridSearchCV

    param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
                  "base_estimator__splitter" :   ["best", "random"],
                  "n_estimators": [10, 30]
                 }

    DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = None)

    ABC = AdaBoostClassifier(base_estimator = DTC)

    grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc')

    grid_search_ABC.fit(features_train,labels_train)

    pred = grid_search_ABC.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print("DecisionTree after applying AdaBoost and GridSearchCV:")
    print("accuracy AdaBoost: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "AdaBoost decision tree", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, grid_search_ABC
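### Usage sketch (an assumption, not part of the original script): once the function
### above has run, the fitted GridSearchCV object exposes the winning parameter
### combination and its cross-validated score via best_params_ and best_score_.
results_abc, grid_abc = try_ada_boost_decision_tree()
print("best AdaBoost params: ", grid_abc.best_params_)
print("best CV score (roc_auc): ", grid_abc.best_score_)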
Example #3
def try_k_neighbors():
    """
    K Nearest Neighbors classifier:
    """

    print "KNeighborsClassifier"
    from sklearn.neighbors import NearestNeighbors
    from sklearn.neighbors import KNeighborsClassifier

    pipe = Pipeline([
        ('classify', KNeighborsClassifier())
    ])

    clf_knn = KNeighborsClassifier(n_neighbors=5)
    params_knn = {}

    KNC = KNeighborsClassifier(n_neighbors=2)
    ABC = AdaBoostClassifier(base_estimator = KNC)

    KNC.fit(features_train,labels_train)
    pred = KNC.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    #f1 = f1_score(labels_test, pred)

    print("KNeighborsClassifier:")
    print("accuracy KNC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "K Nearest Neighbors", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, KNC
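### Sketch (assumption, not in the original): n_neighbors could be tuned with
### GridSearchCV instead of being fixed at 2; the parameter values are illustrative.
def try_k_neighbors_gridsearch():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import GridSearchCV

    param_grid = {'n_neighbors': [2, 3, 5, 7], 'weights': ['uniform', 'distance']}
    grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, scoring='f1')
    grid_knn.fit(features_train, labels_train)
    # best_params_ holds the winning settings; best_estimator_ is refit on all training data
    return grid_knn.best_params_, grid_knn.best_estimator_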
Example #4
def try_classifier_GaussianNB():
    """ 
    GaussanNB
    """
    
    print "Classifier:  GaussianNB:"
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    clf_GuassianNB = GaussianNB()
    clf_GuassianNB.fit(features_train,labels_train)

    pred = clf_GuassianNB.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)

    print("GaussianNB accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()
    dict_results = { "classifier": "GaussianNB", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_GuassianNB
Example #5
def try_logistic_regression_pipeline():
    """
    Logistic Regression with pipeline:
    """

    print("Logistic Regression with pipeline and PCA:")
    from sklearn import linear_model, decomposition, datasets
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    logistic = linear_model.LogisticRegression()
    pca = decomposition.PCA()
    pipe_lr = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    pipe_lr.fit(features_train,labels_train)
    pred_lr = pipe_lr.predict(features_test)

    accuracy = accuracy_score(labels_test, pred_lr)
    precision = precision_score(labels_test,pred_lr)
    recall = recall_score(labels_test,pred_lr)
    f1 = f1_score(labels_test,pred_lr)

    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "Logistic regression, pca and pipeline", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, pipe_lr
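### Sketch (assumption, not in the original): GridSearchCV is imported above but never
### used; the same PCA + LogisticRegression pipeline could be tuned like this.
### The parameter values are illustrative only.
def try_logistic_regression_pipeline_tuned():
    from sklearn import linear_model, decomposition
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    pipe = Pipeline(steps=[('pca', decomposition.PCA()),
                           ('logistic', linear_model.LogisticRegression())])
    param_grid = {'pca__n_components': [5, 10, 15],
                  'logistic__C': [0.1, 1.0, 10.0]}
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring='f1')
    grid.fit(features_train, labels_train)
    return grid.best_params_, grid.best_estimator_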
Example #6
def try_k_neighbors_pipeline():
    """
    K Nearest Neighbors with pipeline:
    """

    print "KNeighborsClassifier with Pipeline"
    from sklearn.neighbors import NearestNeighbors
    from sklearn.neighbors import KNeighborsClassifier

    KNC2 = KNeighborsClassifier()
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('knn', KNC2)])
    pipe.fit(features_train, labels_train)
    pred = pipe.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("KNeighborsClassifier - with Pipe:")
    print("accuracy KNC2: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "K nearest neighbors, pipeline", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, pipe
Example #7
def try_classifier_Decision_Tree_Pipeline():
    """ 
    Decision Tree Classifier, optimized with Pipeline
    """
    print "Decision Tree classifier with pipeline:" 
 
    scaler = preprocessing.MinMaxScaler()
    skb = SelectKBest(k = 15)
    dt3 = tree.DecisionTreeClassifier(criterion='entropy',splitter='best')
    clf_DT3 =  Pipeline(steps=[('scaling',scaler),("SKB", skb), ("DecisionTree", dt3)])

    clf_DT3.fit(features_train,labels_train)
    pred = clf_DT3.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)

    print("accuracy:",accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()
    print_separator_line()
    dict_results = { "classifier": "Decision Tree, pipeline", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_DT3
Example #8
def try_classifier_Decision_Tree2():
    """ 
    Decision Tree Classifier
    """

    print "DecisionTree with criterion = entropy:" 
    from sklearn import tree
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    scaler = preprocessing.MinMaxScaler()
    skb = SelectKBest(k = 'all')

    ### Use entropy as criterion
    dt = tree.DecisionTreeClassifier(criterion='entropy',splitter='best')
    clf_DT2 =  Pipeline(steps=[('scaling',scaler),("SKB", skb), ("DecisionTree", dt)])
    clf_DT2.fit(features_train,labels_train)
    pred = clf_DT2.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)

    print("DT-entropy with pipeline accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()
    dict_results = { "classifier": "Decision Tree, entropy", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_DT2
Example #9
def try_classifier_GaussianNB_pipeline():
    """
    GaussianNB improved with Pipeline
    """

    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    print "GaussianNB with pipeline:"
    gnb = GaussianNB()
    skb = SelectKBest(k = 'all')
    pipe_GuassianNB = Pipeline(steps=[('scaling',scaler),("SKB", skb), ("NaiveBayes", gnb)])
    pipe_GuassianNB.fit(features_train,labels_train)

    pred = pipe_GuassianNB.predict(features_test)
    accuracy = pipe_GuassianNB.score(features_test,labels_test)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)

    print("GaussianNB with Pipeline, accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()
    dict_results = { "classifier": "GaussianNB with pipeline", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, pipe_GuassianNB
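### Usage sketch (assumption, not in the original): the SelectKBest step inside the
### fitted pipeline keeps per-feature scores, which can be inspected after training.
results_gnb, gnb_pipe = try_classifier_GaussianNB_pipeline()
print("SelectKBest feature scores: ", gnb_pipe.named_steps["SKB"].scores_)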
Example #10
def try_random_forest():
    """ 
    Random Forest classifier
    """

    print("Random Forest")
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier()
    rf.fit(features_train, labels_train)
    score_train = rf.score(features_train,labels_train)
    score_test = rf.score(features_test,labels_test)
    print("score train: ", score_train)
    print("score test: ", score_test)

    clf_rf = RandomForestClassifier(n_estimators=30, criterion='entropy', max_depth=None, max_features=10)
    clf_rf.fit(features_train,labels_train)
    pred = clf_rf.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)

    print("accuracy RandomForest: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()
    dict_results = { "classifier": "Random Forest", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_rf
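### Usage sketch (assumption, not in the original): a fitted RandomForestClassifier
### exposes per-feature importances, which can guide feature selection.
results_rf, rf_fitted = try_random_forest()
print("random forest feature importances: ", rf_fitted.feature_importances_)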
Example #11
def try_linear_svc_gridsearchcv():
    """
    Linear SVC model optimized with Pipeline and GridSearchCV
    """

    scaler = preprocessing.MinMaxScaler()
    skb = SelectKBest(k = 'all')

    pipe = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', LinearSVC())
    ])

    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]
    reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
    grid_linearSVC = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)

    grid_linearSVC.fit(features_train,labels_train)
    pred = grid_linearSVC.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print "SVM after applying PCA and GridSearchCV:"
    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "linear SVC with GridSearchCV", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, grid_linearSVC
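### Usage sketch (assumption; requires the sklearn.model_selection version of
### GridSearchCV, which exposes cv_results_): the mean test score of every parameter
### combination can be compared, which is what the reducer_labels above suggest.
results_lsvc, grid_lsvc = try_linear_svc_gridsearchcv()
print("best parameters: ", grid_lsvc.best_params_)
print("mean CV scores per setting: ", grid_lsvc.cv_results_['mean_test_score'])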
Example #12
def try_logistic_regression_tuned():
    """
    Logistic Regression tuned
    """

    scaler = preprocessing.MinMaxScaler()
    ### Note: 1**19 evaluates to 1, so the regularization parameter C is effectively 1.
    clf_winner = Pipeline(steps=[("scaler", scaler),
                          ("skb", SelectKBest(k='all')),
                          ("clf_winner", LogisticRegression(tol=0.1, C = 1**19, class_weight='balanced'))])

    clf_winner.fit(features_train,labels_train)
    pred = clf_winner.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print("accuracy LogisticRegression with PCA 2: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    #return clf_winner
    print_separator_line()
    dict_results = { "classifier": "Logistic regression tuned", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_winner
Example #13
def try_svm_classifier():
    """ 
    SVM classifier
    """

    print("svm SVC classifier:")

    clf_svm = svm.SVC(C=10., cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False)
    clf_svm.fit(features_train,labels_train)
    pred = clf_svm.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "svm.SVC", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_svm
Example #14
def try_svc_tuned():
    """ 
    Apply the tuned parameters generated by GridSearchCV
    """

    from sklearn.model_selection import GridSearchCV
    parameters = {'kernel':['rbf'], 'C':[1, 1000]}
    svc = SVC()
    clf_svm3 = GridSearchCV(svc, parameters)
    clf_svm3.fit(features_train,labels_train)
    clf_svm3 = clf_svm3.best_estimator_

    pred = clf_svm3.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "svc tuned", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_svm3
Example #15
def try_logistic_regression():
    """
    Logistic Regression classifier:
    """

    print("Logistic Regression")
    from sklearn import linear_model, decomposition, datasets
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    logistic = linear_model.LogisticRegression()
    logistic.fit(features_train,labels_train)

    pred = logistic.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print("accuracy LogisticRegression with PCA: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "Logistic regression", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, logistic
Example #16
def try_svc_pipeline_gridsearchcv():
    """
    SVC with pipeline and GridSearchCV
    generate the best parameters
    """

    print "results pipeline svm:"    
    pca = decomposition.PCA()
    svm = SVC()
    pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])
    n_components = [10, 14, 18]
    params_grid = {
        'svm__C': [1, 10, 100, 1000],
        'svm__kernel': ['linear', 'rbf'],
        'svm__gamma': [0.001, 0.0001],
        'pca__n_components': n_components,
    }
    estimator = GridSearchCV(pipe, params_grid)
    estimator.fit(features_train,labels_train)

    print(estimator.best_params_, estimator.best_score_)
    params = estimator.best_params_

    clf_svm2 = SVC(C=100, kernel='rbf', decision_function_shape='ovr', degree=3, gamma='auto', coef0=0.0,max_iter=-1, probability=False, random_state=None, shrinking=True)
    clf_svm2.fit(features_train,labels_train)
    pred = clf_svm2.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)

    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()
    dict_results = { "classifier": "svc with pipeline", "accuracy": accuracy, "precision": precision, "recall": recall }
    return dict_results, clf_svm2
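### Alternative sketch (assumption, not in the original): instead of re-typing the
### winning parameters into a fresh SVC by hand, the estimator refit by GridSearchCV
### can be reused directly via best_estimator_.
def try_svc_pipeline_gridsearchcv_refit():
    from sklearn.svm import SVC
    from sklearn import decomposition
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    pipe = Pipeline(steps=[('pca', decomposition.PCA()), ('svm', SVC())])
    params_grid = {
        'svm__C': [1, 10, 100, 1000],
        'svm__kernel': ['linear', 'rbf'],
        'svm__gamma': [0.001, 0.0001],
        'pca__n_components': [10, 14, 18],
    }
    estimator = GridSearchCV(pipe, params_grid)
    estimator.fit(features_train, labels_train)
    # best_estimator_ is the whole pipeline, refit on the full training data
    # with the best parameter combination found by the search.
    return estimator.best_params_, estimator.best_estimator_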
Example #17
from helpers_enron import make_csv
make_csv(data_dict)


### Task: Remove outliers
data_dict.pop('TOTAL',0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK',0)
### This record has NaN in every field
data_dict.pop('LOCKHART EUGENE E',0)


### Task: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

print_separator_line()

### Combine poi, financial and email features
features_list = poi + features_financial + features_email

### Get the best features
features_list = set_kbest_features_list(data_dict, features_list)
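### Illustrative sketch only (an assumption; the real set_kbest_features_list lives in
### the project's helper module and may work differently): one way such a helper could
### rank features is with SelectKBest, keeping "poi" as the first entry. featureFormat
### and targetFeatureSplit are assumed to come from the project's feature_format tools.
def set_kbest_features_list_sketch(data_dict, features_list, k=10):
    from sklearn.feature_selection import SelectKBest, f_classif
    from feature_format import featureFormat, targetFeatureSplit  # assumed helpers

    data = featureFormat(data_dict, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    selector = SelectKBest(f_classif, k=k).fit(features, labels)
    # keep "poi" first, then the k features SelectKBest scored highest
    kept = [f for f, keep in zip(features_list[1:], selector.get_support()) if keep]
    return ['poi'] + kept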

### csv file is written in order to see results as spreadsheet
output_file = "test_results_original.csv"




### ***  UNCOMMENT THESE LINES TO RERUN THE CLASSIFIERS WITH ADDED FEATURES ***
### Additional features: log of financial fields and POI email ratio