Exemplo n.º 1
0
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Fit and score ``clf`` over repeated stratified splits and print metrics.

    The dataset is converted with ``feature_format`` and separated into labels
    and features with ``target_feature_split``. For every train/test index pair
    yielded by ``StratifiedShuffleSplit`` the classifier is refit and its
    predictions are tallied into one confusion matrix accumulated across all
    splits. Accuracy, precision, recall, F1 and F2 are then derived from those
    totals and printed with ``PERF_FORMAT_STRING`` / ``RESULTS_FORMAT_STRING``.

    Args:
        clf: Estimator exposing ``fit(features, labels)`` and
            ``predict(features)``.
        dataset: Raw dataset passed straight to ``feature_format``.
        feature_list: Feature names passed straight to ``feature_format``.
        folds: Second positional argument given to ``StratifiedShuffleSplit``.

    Returns:
        None. All results are written to standard output.

    Notes:
        * ``StratifiedShuffleSplit`` is called with ``(labels, folds)`` and then
          iterated directly -- NOTE(review): this looks like the legacy
          ``sklearn.cross_validation`` signature; confirm against the import.
        * Only ``ZeroDivisionError`` is handled when deriving the metrics; any
          other failure (for example a malformed format string) now propagates
          instead of being reported as a divide by zero.
        * Every ``print`` uses the single-argument parenthesised form, which
          produces the same output as a Python 2 statement and as a Python 3
          function call.
    """
    data = feature_format(dataset, feature_list, sort_keys=True)
    labels, features = target_feature_split(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        # Fit the classifier on the training split, then score the test split.
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Abandons the rest of this split only; the outer loop over
                # splits carries on with the next one.
                break

    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Reached when a denominator is zero, e.g. no positive predictions.
        print("Got a divide by zero when trying out: {}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy,
                                        precision,
                                        recall,
                                        f1,
                                        f2,
                                        display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives))
        print("")
Exemplo n.º 2
0
    def perform_k_fold_and_grid_search(data):
        """Grid-search an SVM on PCA components from a stratified 3-fold split.

        Steps:
          1. Separate ``data`` into labels and features with
             ``target_feature_split``.
          2. Build a ``StratifiedKFold(n_splits=3)`` partition and keep the
             train/test indices of the LAST fold only (earlier folds are
             generated but not evaluated).
          3. Fit a 2-component PCA on the training features and project both
             the training and test features.
          4. Run ``GridSearchCV`` over an ``SVC`` on the projected training
             data, then print the best parameters and the accuracy on the
             projected test data.

        The projections are what the search is fitted and scored on. The PCA
        output was previously computed and then discarded, with the raw
        features fed to the classifier instead.

        Args:
            data: Formatted dataset accepted by ``target_feature_split``.

        Returns:
            None. Results are printed.

        NOTE(review): the signature has no ``self``; presumably this is
        declared as a static method in the enclosing class -- confirm.
        """
        from sklearn.svm import SVC
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import StratifiedKFold
        from sklearn.decomposition import PCA
        from sklearn.model_selection import GridSearchCV

        # Split feature and target data.
        labels_data, features_data = target_feature_split(data)

        # Split features and labels into train and test. Only the final fold
        # produced by the splitter is used for the evaluation below.
        skf = StratifiedKFold(n_splits=3)
        train_index, test_index = list(skf.split(features_data,
                                                 labels_data))[-1]
        features_train = [features_data[index] for index in train_index]
        labels_train = [labels_data[index] for index in train_index]
        features_test = [features_data[index] for index in test_index]
        labels_test = [labels_data[index] for index in test_index]

        # Perform principal components analysis; the projection is learned on
        # the training split only and then applied to both splits.
        pca = PCA(n_components=2)
        pca.fit(features_train)
        pca_train = pca.transform(features_train)
        pca_test = pca.transform(features_test)

        # Dictionary of candidate params for the SVM.
        parameters = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 10, 1000],
            'gamma': [10, 1000]
        }
        _svm_ = SVC()
        # Grid search will find the best params.
        svm_classifier = GridSearchCV(_svm_, parameters)

        # Principal components are used in place of the raw features.
        svm_classifier.fit(pca_train, labels_train)
        print("best params:", svm_classifier.best_params_)
        labels_prediction = svm_classifier.predict(pca_test)
        print("accuracy score: ",
              accuracy_score(labels_test, labels_prediction) * 100, "%")
Exemplo n.º 3
0
def __main__():
    """Run each ``MachineLearningAlgorithms`` demo once, in a fixed order.

    The module-level ``dictionary`` and ``features_list`` are formatted and
    split 70/30 (``test_size=0.3``, ``random_state=42``). The resulting splits
    feed the SVM, PCA and k-means demos; the unsplit formatted data feeds the
    k-fold / grid-search demo. Linear regression, feature rescaling and text
    classification are invoked without any of that data.
    """
    import numpy as np

    formatted = feature_format(dictionary,
                               features_list,
                               remove_any_zeroes=True)
    target, features = target_feature_split(formatted)
    partition = train_test_split(features,
                                 target,
                                 test_size=0.3,
                                 random_state=42)
    x_train, x_test, y_train, y_test = partition

    algorithms = MachineLearningAlgorithms()
    algorithms.perform_linear_regression()

    # SVM on the raw features.
    algorithms.classify_svm(x_train, y_train, x_test, y_test)

    # SVM on the principal components of the same split.
    components = algorithms.principal_component_analysis(x_train, x_test)
    pca_train_, pca_test_ = components
    algorithms.classify_svm(pca_train_, y_train, pca_test_, y_test)

    algorithms.kmeans_cluster(x_train)

    # k-fold train/test splitting works from the unsplit formatted data.
    algorithms.perform_k_fold_and_grid_search(formatted)

    # Feature scaling on a small fixed sample.
    sample = np.array([50.0, 99.0, 22.3, 88.0])
    rescaled = algorithms.feature_rescale(sample)
    print("rescaled: {}".format(rescaled))

    algorithms.text_classification()
# Load the pickled dataset. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open until garbage collection.
with open("../final_project/final_project_dataset_modified.pkl",
          "rb") as dataset_file:
    dictionary = pickle.load(dataset_file)

# List the features you want to look at -- the first item in the list is the
# "target" feature.
features_list = [
    "bonus",  # target
    "long_term_incentive"  # feature -- use salary, long_term_incentive and other features to compare score
]
# Author's observation: long-term incentives relate to bonus better than
# salaries do. This was found by comparing r-squared scores when each feature
# is used as the input with bonus as the target.

data = feature_format(dictionary, features_list, remove_any_zeroes=True)
target, features = target_feature_split(data)
# Half of the samples are held out for testing; the seed keeps the split
# reproducible.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

# We are trying to predict bonus from the single input feature named in
# features_list (currently long_term_incentive; swap in salary or any other
# feature to compare).
# feature --> input
# target --> bonus --> output

reg = LinearRegression()
reg.fit(feature_train, target_train)
target_prediction = reg.predict(feature_test)
# Fitted line parameters, kept under module-level names for later use.
intercept_prediction = reg.intercept_
slope_prediction = reg.coef_
Exemplo n.º 5
0
# Feature names; each can be any key in the person-level dictionary
# (salary, director_fees, etc.).
feature_1 = "salary"
feature_2 = "exercised_stock_options"
# Defined but not included in features_list below.
feature_3 = "total_payments"
# Name of the target key; note that `poi` is rebound further down to the
# target values returned by target_feature_split.
poi = "poi"

# Author's note: add the 3rd feature to features_list and compare the results.
# After adding the 3rd feature, a total of 4 data points exchanged their
# positions in the plot.
features_list = [poi, feature_1, feature_2]
# Convert the dictionary into a list whose entries hold
# poi (target), feature_1, feature_2.
data = feature_format(data_dict, features_list)
# Separate poi (target) from the remaining feature columns.
poi, finance_features = target_feature_split(data)
# Author's notes on feature scaling (after scaling, values lie in 0...1):
# run k-means with and without scaling and compare the results. Some of the
# data points end up in a different cluster after re-scaling. In this case
# scaling may not be needed, but when salary and from_messages are used as
# features, scaling is critical.

# Scaling step, currently disabled:
# finance_features = MinMaxScaler().fit_transform(finance_features)

# Accumulators, empty at this point; presumably filled by the loop that
# follows this section.
exercised_stock_options_values = []
salary_values = []
# change to f1, f2, f3 --> for 3 features
for f1, f2 in finance_features:
    if f1 != 0:
Exemplo n.º 6
0
#!/usr/bin/python
"""
    Starter code for the evaluation mini-project.
    Start by copying your trained/tested POI identifier from
    that which you built in the validation mini-project.

    This is the second step toward building your POI identifier!

    Start by loading/formatting the data...
"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import feature_format, target_feature_split

# Load the pickled project dataset; the context manager closes the file once
# unpickling is done.
with open("../final_project/final_project_dataset.pkl", "rb") as f:
    data_dict = pickle.load(f)

# add more features to features_list!
# The first entry ("poi") comes first so that target_feature_split below can
# separate it out as the labels -- presumably by position; verify in
# ../tools/feature_format.py.
features_list = ["poi", "salary"]

# sort_keys is given a path to a pickled key list rather than a boolean.
# NOTE(review): presumably feature_format loads that file to fix the row
# order -- confirm against feature_format's implementation.
data = feature_format(data_dict,
                      features_list,
                      sort_keys='../tools/python2_lesson13_keys.pkl')
labels, features = target_feature_split(data)

# your code goes here