Exemplo n.º 1
0
def main():
    clf, dataset, feature_list = tester.load_classifier_and_data()
    data = tester.featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = tester.targetFeatureSplit(data)
    pipeline = Pipeline([('kbest', SelectKBest()),
                         ('ada', AdaBoostClassifier())])
    #grid_search = GridSearchCV(pipeline,{'kbest__k':range(1,20),'ada__n_estimators':[25,50,75,100],'ada__learning_rate':[ x / 10.0 for x in range(2,20,2)]},
    #  cv=StratifiedShuffleSplit(labels,50),scoring=scorer,n_jobs=4)
    grid_search = GridSearchCV(pipeline, {
        'kbest__k': range(10, 21),
        'ada__n_estimators': [75],
        'ada__learning_rate': [0.5],
        'ada__random_state': [13]
    },
                               cv=StratifiedShuffleSplit(labels,
                                                         1000,
                                                         test_size=0.1,
                                                         random_state=42),
                               scoring=scorer,
                               n_jobs=4)
    grid_search.fit(features, labels)
    print "Best Parameters: ", grid_search.best_params_, " score: ", grid_search.best_score_
    #print "All Scores: ",grid_search.grid_scores_
    best_features = SelectKBest(k=grid_search.best_params_['kbest__k'])
    best_features.fit(features, labels)
    best_feature_labels = []
    for include, feature in zip(best_features.get_support(), feature_list[1:]):
        if include:
            best_feature_labels.append(feature)
    print "Best Features: ", best_feature_labels
def main():
    """Development entry point: run the email word feature computation."""
    # Imported inside the function, so tester is only loaded when main() runs.
    from tester import load_classifier_and_data

    # Only the dataset is used below (it tells which persons are present, so
    # that others may be skipped); the stored classifier and feature list are
    # unpacked and ignored.
    _clf, dataset, _feature_list = load_classifier_and_data()

    # Feed the dataset to the email feature generator.
    word_features = EmailWordFeatures()
    word_features.extend(dataset)
Exemplo n.º 3
0
def setup_and_test(my_dataset, features_list, classifier):
    """Dump a classifier with its data, load it back, and run the tester.

    Args:
        my_dataset: dataset passed to ``dump_classifier_and_data``.
        features_list: feature list passed to ``dump_classifier_and_data``.
        classifier: classifier passed to ``dump_classifier_and_data``.

    Returns:
        None.
    """
    # Dump first, so the test below runs on what was actually stored.
    dump_classifier_and_data(classifier, my_dataset, features_list)

    # Load the three stored objects back and hand them to the testing script.
    stored_clf, stored_dataset, stored_features = load_classifier_and_data()
    test_classifier(stored_clf, stored_dataset, stored_features)
def main():
    clf, dataset, feature_list = tester.load_classifier_and_data()
    data = tester.featureFormat(dataset,feature_list, sort_keys = True)
    labels, features = tester.targetFeatureSplit(data)
    pipeline = Pipeline([('kbest', SelectKBest()),('ada',AdaBoostClassifier())])
    #grid_search = GridSearchCV(pipeline,{'kbest__k':range(1,20),'ada__n_estimators':[25,50,75,100],'ada__learning_rate':[ x / 10.0 for x in range(2,20,2)]},
    #  cv=StratifiedShuffleSplit(labels,50),scoring=scorer,n_jobs=4)
    grid_search = GridSearchCV(pipeline,{'kbest__k':range(10,21),'ada__n_estimators':[75],'ada__learning_rate':[0.5],'ada__random_state':[13]},
      cv=StratifiedShuffleSplit(labels,1000,test_size=0.1,random_state=42),scoring=scorer,n_jobs=4)
    grid_search.fit(features,labels)
    print "Best Parameters: ",grid_search.best_params_," score: ",grid_search.best_score_
    #print "All Scores: ",grid_search.grid_scores_
    best_features = SelectKBest(k=grid_search.best_params_['kbest__k'])
    best_features.fit(features, labels)
    best_feature_labels = []
    for include,feature in zip(best_features.get_support(),feature_list[1:]):
        if include:
            best_feature_labels.append(feature)
    print "Best Features: ",best_feature_labels
Exemplo n.º 5
0
# print(new_features_list)

# Parameter grid handed to GridSearchCV below: every combination of the
# listed criterion, min_samples_split and max_features values is tried.
parameters1 = {
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 3, 4, 5, 6, 7],
    "max_features": ["auto", "log2", "sqrt", None]
}
# Search over DT_classifier (defined outside this excerpt) on the training
# split, then replace the search object with its best estimator.
classifier = GridSearchCV(DT_classifier, parameters1)
classifier.fit(features_train, labels_train)
classifier = classifier.best_estimator_
# Disabled hold-out evaluation, kept for reference.
# NOTE(review): scikit-learn metric functions take (y_true, y_pred); the
# precision/recall calls below pass (pred, labels_test), which would swap the
# two metrics if re-enabled -- confirm before using.
# pred = classifier.predict(features_test)
# accuracy = accuracy_score(pred, labels_test)
# precision = precision_score(pred, labels_test)
# cm = confusion_matrix(labels_test, pred)
# recall = recall_score(pred, labels_test)
# print(accuracy)
# print(precision)
# print(recall)
# print(cm)
# Evaluate the tuned classifier with the project's test_classifier helper.
test_classifier(classifier, my_dataset, features_list)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(classifier, my_dataset, features_list)

# Load the dumped objects back and print them as a sanity check.
clf, dataset, features = load_classifier_and_data()
print(clf)
print(dataset)
print(features)
Exemplo n.º 6
0
# The optimal model selected by GridSearchCV:
clf = gs.best_estimator_

# Access the feature importances

# create a new list that contains the features selected by SelectKBest
# in the optimal model selected by GridSearchCV
# get_support(indices=True) gives positions among the feature columns; the +1
# presumably skips a label entry at features_list[0] -- TODO confirm.
features_selected=[features_list[i+1] for i in clf.named_steps['kbest'].get_support(indices=True)]

# The step in the pipeline for the Decision Tree Classifier is called 'DTC'
# that step contains the feature importances
importances = clf.named_steps['DTC'].feature_importances_

import numpy as np
# argsort is ascending; reversing it orders positions from most to least
# important.
indices = np.argsort(importances)[::-1]

# Use features_selected, the features selected by SelectKBest, and not features_list
print 'Feature Ranking: '
# Walk the ranking: rank i+1 is the feature at position indices[i].
for i in range(len(features_selected)):
    print "feature no. {}: {} ({})".format(i+1,features_selected[indices[i]],importances[indices[i]])

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

# Load the dumped objects back and run the tester on the reloaded copies.
clf, dataset, feature_list = load_classifier_and_data()
test_classifier(clf, dataset, feature_list)
Exemplo n.º 7
0
def use_test_classifier(clf, my_dataset, features_list):
    """Run the tester on a classifier after a dump/load round trip.

    Args:
        clf: classifier passed to ``dump_classifier_and_data``.
        my_dataset: dataset dumped together with the classifier.
        features_list: feature list dumped together with the classifier.

    Returns:
        None.
    """
    # Dump, then load back, so the tester functions evaluate the stored
    # objects rather than the in-memory ones.
    dump_classifier_and_data(clf, my_dataset, features_list)
    loaded_clf, loaded_dataset, loaded_features = load_classifier_and_data()
    test_classifier(loaded_clf, loaded_dataset, loaded_features)
Exemplo n.º 8
0
# In[120]:


# NOTE(review): sklearn.cross_validation is the legacy module path; newer
# scikit-learn releases provide train_test_split in sklearn.model_selection --
# confirm which version this notebook targets.
from sklearn.cross_validation import train_test_split
# Build label/feature arrays for the features_list01 feature set.
data01 = featureFormat(my_dataset, features_list01, sort_keys = True)
labels01, features01 = targetFeatureSplit(data01)
# Hold out 30% of the samples for testing; fixed seed for a repeatable split.
features_train01, features_test01, labels_train01, labels_test01 =     train_test_split(features01, labels01, test_size=0.3, random_state=42)
    


# In[121]:


# Dump clf01 with the dataset and feature list, then load them back; the
# loaded objects are discarded because the call's result is not bound.
dump_classifier_and_data(clf01, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
# NOTE(review): runs main() when executed as a script; which main() is meant
# is not clear from this cell -- confirm.
if __name__ == '__main__':
    main()


# In[122]:


# Same steps as for clf01, now with clf02 (the feature list is still
# features_list01); the loaded objects are again discarded.
dump_classifier_and_data(clf02, my_dataset, features_list01)
load_classifier_and_data()
print features_list01
# NOTE(review): runs main() when executed as a script; which main() is meant
# is not clear from this cell -- confirm.
if __name__ == '__main__':
    main()

Exemplo n.º 9
0
# Fit the grid search (gs is built outside this excerpt) and keep its best
# estimator.
gs.fit(features, labels)
clf = gs.best_estimator_

# Print the selected features and pvalues
# t0 is set outside this excerpt; presumably just before the search -- confirm.
print "Processing time:", round(time()-t0, 3), "s"
# Boolean mask (as a plain list) of the features kept by the 'SKB' step.
k_best_support = clf.named_steps['SKB'].get_support(False).tolist()
# compress() keeps only the entries whose mask value is true, pairing each
# kept feature name with its p-value from the same step.
df_selected_features1 = pd.DataFrame(
    {'Feature': list(compress(features_no_poi, k_best_support)),
     'p value': list(compress(clf.named_steps['SKB'].pvalues_,k_best_support))
    })
pprint(df_selected_features1)
# Bare print statement: emits an empty line.
print

# Test the results
dump_classifier_and_data(clf, my_dataset, features_list)
clf, dataset, feature_list = load_classifier_and_data()
test_classifier(clf, dataset, feature_list)

#####################################################################
# Implement Modeling Pipeline with "Select K Best" and "DecisionTree"
####################################################################
print "\n******************\n Select K Best + DecisionTree Pipeline\n"
# Restart the timer and rebuild labels/features for this pipeline.
t0 = time()
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# Candidate values of k for the 'SKB' step: 2 through 7.
k_range = range(2,8)
# NOTE(review): the dict literal below is not closed within this excerpt; the
# remaining entries and the closing brace are not visible here.
params = {'SKB__k' : k_range,
          "dt__min_samples_leaf": [2, 4, 6],
          "dt__min_samples_split": [8, 10, 12],
          "dt__min_weight_fraction_leaf": [0, 0.1],
Exemplo n.º 10
0
    # NOTE(review): this is the body of a loop whose header is outside this
    # excerpt; `i` and `algorithm` are presumably its loop variables -- confirm.
    feature_no = []

    # Keep the first optimum_feature_no[i] + 1 entries of features_list
    # (presumably the label followed by that many features -- confirm).
    features_sublist = features_list[:optimum_feature_no[i]+1]

    #Define classifiers
    # NOTE(review): there is no fallback branch; an algorithm name not listed
    # here leaves clf as whatever it was bound to before (or unbound).
    if algorithm == "Gaussian":
        clf = GaussianNB()
    elif algorithm == "RandomForest":
        clf = RandomForestClassifier()
    elif algorithm == "AdaBoost":
        clf = AdaBoostClassifier()
    elif algorithm == "KNeighbor":
        clf = KNeighborsClassifier()
        
    # Dump and load back, so the tester evaluates the stored objects.
    dump_classifier_and_data(clf, data_dict, features_sublist)
    clf_test, dataset_test, feature_list_test = load_classifier_and_data()
        
    # test_classifier is called with a fourth positional argument False and is
    # expected to return four values here; the flag's meaning is not visible
    # in this excerpt -- confirm against the tester module.
    accuracy, precision, recall, f1 = test_classifier(clf_test, dataset_test,
                                                      feature_list_test, False)
    # Record this algorithm's metrics in the per-algorithm result lists.
    accuracy_optimum_values.append(accuracy)
    precision_optimum_values.append(precision)
    recall_optimum_values.append(recall)
    f1_optimum_values.append(f1)

# Summary table: one row per algorithm with its feature count and F1 score.
# Header row; the runs of spaces pad the column titles.
print "ALGORITHM", " "*(15-len("ALGORITHM")), "# OF FEATURES", " "* \
    (18-len("# OF FEATURES")), "F1 SCORE"
print "-------------------------------------------------"
for i, algorithm in enumerate(algorithms):
    # Note: rebinds the module-level name `features` to a feature count.
    features = optimum_feature_no[i]
    # Pad after the algorithm name, then print the F1 rounded to 4 decimals.
    print algorithm, " "*(20-len(algorithm)), features, " "*13, \
        round(f1_optimum_values[i],4)