def GaussianNB(feature_list, dataset):
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    test_classifier(clf, dataset, feature_list)
    # score = clf.
    return clf
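# Every snippet in this collection relies on test_classifier from the course's
# tester.py. As a point of reference, here is a minimal sketch of the evaluation
# it is assumed to perform (stratified shuffle split cross-validation with
# aggregated precision/recall); the real tester.py also reports accuracy, F1, F2
# and the raw confusion counts, and featureFormat/targetFeatureSplit are assumed
# to come from the course's feature_format helper module:
def sketch_test_classifier(clf, dataset, feature_list, folds=1000):
    from sklearn.cross_validation import StratifiedShuffleSplit
    from feature_format import featureFormat, targetFeatureSplit
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_pos = false_pos = false_neg = 0
    for train_idx, test_idx in cv:
        X_train = [features[i] for i in train_idx]
        y_train = [labels[i] for i in train_idx]
        X_test = [features[i] for i in test_idx]
        y_test = [labels[i] for i in test_idx]
        clf.fit(X_train, y_train)
        for pred, truth in zip(clf.predict(X_test), y_test):
            if pred == 1 and truth == 1:
                true_pos += 1
            elif pred == 1 and truth == 0:
                false_pos += 1
            elif pred == 0 and truth == 1:
                false_neg += 1
    precision = 1.0 * true_pos / max(true_pos + false_pos, 1)
    recall = 1.0 * true_pos / max(true_pos + false_neg, 1)
    print clf
    print "Precision: %0.5f  Recall: %0.5f" % (precision, recall)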
def RandomForest(feature_list, dataset):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier()
    test_classifier(clf, dataset, feature_list)
    imp = clf.feature_importances_
    print_importance(feature_list, imp)
    return clf
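# print_importance is not defined in this snippet; a hypothetical helper with the
# assumed behaviour (pair each feature name with its importance and print them in
# descending order) might look like this:
def print_importance(feature_list, importances):
    # feature_list[0] is 'poi' (the label), so importances align with feature_list[1:]
    pairs = sorted(zip(feature_list[1:], importances), key=lambda t: t[1], reverse=True)
    for name, imp in pairs:
        print "%-35s %.4f" % (name, imp)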
def tune_classifier(classifier, clf_params, max_features):
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    features_list = get_feature_list()

    ### Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = get_data()

    ### Extract features and labels from dataset for local testing
    features_list = features_list[0:max_features + 1]
    data, labels, features = get_features_and_labels(my_dataset, features_list)

    ### Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    from sklearn.cross_validation import train_test_split
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # Testing
    clf = GridSearchCV(classifier, param_grid=clf_params, scoring=make_scorer(f1_score))
    clf.fit(features_train, labels_train)
    clf_final = clf.best_estimator_
    print "The best estimator = ", clf_final
    test_classifier(clf_final, my_dataset, features_list, 1000)
def decisionTree(feature_list, dataset):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    test_classifier(clf, dataset, feature_list)
    print clf.feature_importances_
    return clf
def iterPipe(num1, num2):
    for i in range(num1, num2 + 1):
        # estimators = [('scaling', StandardScaler()), ('reduce_dim', PCA()),
        #               ('dtc', DTC(min_samples_split=i*2))]
        # estimators = [('reduce_dim', PCA(n_components=2)), ('dtc', DTC(min_samples_split=i))]
        # clfIter = Pipeline(estimators)
        # clfIter.set_params(reduce_dim__n_components=3)
        clfIter = DTC(min_samples_split=i)
        test_classifier(clfIter, my_dataset, features_list)
def KNN(feature_list, dataset):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    knn = KNeighborsClassifier()
    # feature scale
    estimators = [('scale', StandardScaler()), ('knn', knn)]
    clf = Pipeline(estimators)
    # use the function's own arguments rather than module-level globals
    test_classifier(clf, dataset, feature_list)
    return clf
def setup_and_test(my_dataset, features_list, classifier):
    # Dump classifier and features list, so we can test them
    dump_classifier_and_data(classifier, my_dataset, features_list)
    # load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    # Run testing script
    test_classifier(clf, dataset, feature_list)
    return
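# A hypothetical call of setup_and_test; the feature names below are illustrative
# placeholders, not the author's chosen list:
# my_dataset = pickle.load(open("final_project_dataset.pkl", "r"))
# setup_and_test(my_dataset, ['poi', 'salary', 'bonus'], GaussianNB())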
def tuneDT(feature_list, dataset):
    from sklearn.grid_search import GridSearchCV
    from sklearn import tree
    tree_clf = tree.DecisionTreeClassifier()
    parameters = {'criterion': ('gini', 'entropy'),
                  'splitter': ('best', 'random')}
    clf = GridSearchCV(tree_clf, parameters, scoring='recall')
    # use the function's own arguments rather than module-level globals
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
def detect_poi():
    ### Load the dictionary containing the dataset
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))

    ### Task 1: Remove outliers
    data_dict.pop('TOTAL', 0)

    ### Task 2: Select what features
    ### 'stk_pay_ratio', 'to_poi_ratio', 'from_poi_ratio', 'bonus_salary_ratio'
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    my_dataset = data_dict
    stk_pay_ratio(my_dataset)
    from_poi_ratio(my_dataset)
    to_poi_ratio(my_dataset)
    bonus_salary_ratio(my_dataset)

    ### Task 3: Feature Selection
    ### Generate a set of 15 feature lists from these 4 features
    ### This way, all possible combinations of these features are tested
    all_features_list = fList_set()

    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation in tester.py
    metrics = []
    clf = GaussianNB()
    ### ptest uses stratified shuffle split cross validation and calculates the precision
    ### Find the precision for every list
    for i in range(0, 15):
        metrics.append(ptest(clf, my_dataset, all_features_list[i]))

    ### Go for the feature list that produces the best precision.
    ### For this dataset only, it is harder to get a high precision.
    best = np.array(metrics).argmax()

    ### Run test_classifier to print evaluation metrics to console
    test_classifier(clf, my_dataset, all_features_list[best])

    ### Now use the same feature list to run the decision tree classifier
    features_list = all_features_list[best]

    ### Task 4: Try a variety of classifiers
    samples_split_values = [2, 4]
    samples_leaf_values = [1, 2]
    for split in samples_split_values:
        for leaf in samples_leaf_values:
            clf = tree.DecisionTreeClassifier(min_samples_split=split,
                                              min_samples_leaf=leaf)
            test_classifier(clf, my_dataset, features_list)
            print_feature_importances(features_list, clf)

    ### Choose best classifier and feature set
    clf = GaussianNB()

    ### Dump classifier, dataset, and features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
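# ptest is referenced above but not shown; a sketch of what it is assumed to do,
# based on the comment in detect_poi (stratified shuffle split cross-validation
# returning only the average precision):
def ptest(clf, dataset, feature_list, folds=100):
    import numpy as np
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.metrics import precision_score
    from feature_format import featureFormat, targetFeatureSplit
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features = np.array(features)
    labels = np.array(labels)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    precisions = []
    for train_idx, test_idx in cv:
        clf.fit(features[train_idx], labels[train_idx])
        pred = clf.predict(features[test_idx])
        precisions.append(precision_score(labels[test_idx], pred))
    return np.mean(precisions)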
def tuneKmeans(feature_list, dataset):
    from sklearn.cluster import KMeans
    from sklearn.grid_search import GridSearchCV
    km_clf = KMeans(n_clusters=2, tol=0.001)
    parameters = {'n_clusters': (2, 10)}
    clf = GridSearchCV(km_clf, parameters, scoring='recall')
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
    return clf.best_estimator_
def explore_scores():
    for n in features:
        for c in n_neighbor:
            for d in weights:
                for e in algorithm:
                    for f in leaf_size:
                        for g in p:
                            for h in metric:
                                feature = features_select(n)
                                pipeline = Pipeline([
                                    ('normalization', scaler),
                                    ('classifier', KNeighborsClassifier(
                                        n_neighbors=c, weights=d, algorithm=e,
                                        leaf_size=f, p=g, metric=h))])
                                test_classifier(pipeline, enron_data, feature)
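# The nested loops above enumerate the full Cartesian product of KNN settings by
# hand; an equivalent, more compact sketch using sklearn's ParameterGrid (same
# search, different bookkeeping -- not the original author's code):
from sklearn.grid_search import ParameterGrid

def explore_scores_grid():
    param_grid = {'n_neighbors': n_neighbor, 'weights': weights,
                  'algorithm': algorithm, 'leaf_size': leaf_size,
                  'p': p, 'metric': metric}
    for n in features:
        feature = features_select(n)
        for params in ParameterGrid(param_grid):
            pipeline = Pipeline([('normalization', scaler),
                                 ('classifier', KNeighborsClassifier(**params))])
            test_classifier(pipeline, enron_data, feature)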
def tuneKNN(feature_list, dataset):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    knn = KNeighborsClassifier()
    # feature scale
    estimators = [('scale', StandardScaler()), ('knn', knn)]
    pipeline = Pipeline(estimators)
    parameters = {'knn__n_neighbors': [1, 8],
                  'knn__algorithm': ('ball_tree', 'kd_tree', 'brute', 'auto')}
    clf = GridSearchCV(pipeline, parameters, scoring='recall')
    # use the function's own arguments rather than module-level globals
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
def getRF():
    print "==============="
    print "RandomForests"
    print "==============="
    for score in scores:
        print score
        print
        # parameters = {'n_estimators': range(10, 150, 10), 'criterion': ['gini', 'entropy'],
        #               'min_samples_split': range(2, 8, 2)}
        parameters = {'rf__n_estimators': range(10, 150, 10),
                      'rf__criterion': ['gini', 'entropy'],
                      'rf__min_samples_split': range(2, 8, 2),
                      'selector__k': range(3, 22, 1)}
        gs = grid_search.GridSearchCV(rf_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getAda():
    print "==============="
    print "AdaBoost"
    print "==============="
    for score in scores:
        print score
        print
        # parameters = {'n_estimators': range(50, 100, 1),
        #               'learning_rate': [x * 0.01 for x in range(100, 160, 1)]}
        parameters = {'ada__n_estimators': range(1, 100, 20),
                      'ada__learning_rate': [x * 0.01 for x in range(100, 160, 10)],
                      'selector__k': range(3, 22, 1)}
        gs = grid_search.GridSearchCV(ada_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getKNN():
    print "==============="
    print "KNeighborsClassifier"
    print "==============="
    for score in scores:
        print score
        print
        # parameters = {'n_neighbors': range(2, 10, 2), 'weights': ['distance', 'uniform'],
        #               'metric': ['minkowski', 'euclidean']}
        parameters = {'knn__n_neighbors': range(2, 10, 2),
                      'knn__weights': ['distance', 'uniform'],
                      'knn__metric': ['minkowski', 'euclidean'],
                      'selector__k': range(3, 20, 1)}
        gs = grid_search.GridSearchCV(knn_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getSVC():
    print "==============="
    print "SVC"
    print "==============="
    for score in scores:
        print score
        print
        parameters = {'sv__C': [0.01, 0.1, 1, 500, 1000, 5000, 10000, 50000, 100000],
                      'sv__kernel': ['linear'],
                      'selector__k': range(3, 22, 1)}
        # 'sv__gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1, 10, 100, 500, 1000],
        gs = grid_search.GridSearchCV(sv_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getNB():
    print "==============="
    print "GaussianNB"
    print "==============="
    for score in scores:
        print score
        print
        parameters = {'selector__k': range(3, 22, 1)}
        gs = grid_search.GridSearchCV(nb_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def train_test():
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42)
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(features_train, labels_train)
    print test_classifier(clf, my_dataset, features_list)

    ### Print feature importance in order
    features_imp = {}
    for i in xrange(len(features_list) - 1):
        features_imp[features_list[1 + i]] = clf.feature_importances_[i]
    pprint(sorted(features_imp.items(), key=operator.itemgetter(1), reverse=True))
def main():
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))
    my_dataset = data_dict
    my_dataset = AddFeatures(my_dataset)

    # Exclude using discretion.
    Exc1 = ["email_address"]
    # Replaced by creating better versions of the features
    Exc2 = ["to_messages", "from_messages",
            "from_this_person_to_poi", "from_poi_to_this_person"]
    # Exclude because highly correlated with stronger features
    Exc3 = ["deferral_payments", "expenses", "deferred_income",
            "restricted_stock_deferred", "director_fees", "long_term_incentive",
            "bonus", "total_payments", "salary", "total_stock_value",
            "restricted_stock", "exercised_stock_options", "other"]
    exclude = Exc1 + Exc2 + Exc3

    # QueryDataSet(my_dataset)
    # ShowCorrel(my_dataset)

    features_list = next(my_dataset.itervalues()).keys()
    for i in exclude:
        features_list.remove(i)
    # move "poi" to the front, as required by featureFormat/test_classifier
    features_list.insert(0, features_list.pop(features_list.index("poi")))

    data = featureFormat(my_dataset, features_list, sort_keys=True)

    ### Extract features and labels from dataset for local testing
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=42, stratify=labels)

    # NOTE: one of the tuning helpers below must be uncommented so that clf is
    # defined before the dump/test calls.
    # clf = TuneSVM(features, labels, features_list)
    # clf = TuneKNN(features, labels, features_list)
    # clf = NoTuneDT(features, labels, features_list)
    # clf = TuneDT(features, labels, features_list)

    # "poi" is already at the front of features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
    test_classifier(clf, my_dataset, features_list)
def train_and_predict(first, second):
    # trains the model and returns the value of the desired evaluation metric
    features_list = ["poi", first, second]
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    from sklearn.naive_bayes import GaussianNB
    from sklearn import tree

    if dt:
        clf = tree.DecisionTreeClassifier()
    else:
        clf = GaussianNB()

    # Assumes a locally modified test_classifier that can return F1 or precision
    # instead of only printing them.
    if f1:
        return test_classifier(clf, my_dataset, features_list, return_F1=True)
    else:
        return test_classifier(clf, my_dataset, features_list, return_precision=True)
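# A hypothetical driver for train_and_predict that scores every pair of candidate
# features and keeps the best pair; the candidate names are examples drawn from
# elsewhere in this collection, not the author's exact search space:
from itertools import combinations

def best_feature_pair(candidates):
    scored = [(train_and_predict(a, b), (a, b)) for a, b in combinations(candidates, 2)]
    return max(scored)

# best_feature_pair(['salary', 'bonus', 'exercised_stock_options', 'expenses'])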
def get_top_features_all_data(X_df, y_df, grid_searcher, top_N=9):
    '''Give an estimate of the model produced by grid_search using features
    selected with ExtraTreesClassifier on the entire dataset before searching
    for a model.

    In general, this may produce overly optimistic results since there is
    leakage from the test dataset when selecting features using the entire
    dataset. This is to show that this can improve cross-validated internal
    testing over choosing kbest within each cross-validation fold, but is
    still overly optimistic if the model were to be used on completely new data.

    Args:
        X_df: Pandas dataframe of features used to predict.
        y_df: Pandas dataframe of labels being predicted.
        grid_searcher: GridSearchCV object being searched over for optimal
            tuning parameters.
        top_N: Top N features to retain based on feature importances obtained
            from the ExtraTreesClassifier estimator used in the
            top_N_features() function.

    Returns:
        A list of the top N features that were selected to be fed into the
        GridSearchCV object.

    Prints:
        Test results from the 1000 cross-validation splits testing in tester.py
    '''
    top_N_features = top_importances(X_df, y_df, top_N=top_N)
    top_N_names = list(top_N_features.index)
    X_df = X_df[top_N_names]
    features_list = ['poi'] + list(top_N_names)
    grid_searcher.fit(X_df, y_df)
    clf = grid_searcher.best_estimator_
    my_dataset = combine_to_dict(features_df=X_df, labels_df=y_df)
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    test_classifier(clf, my_dataset, features_list)
    return top_N_features
def find_best_features(feature_names, features, labels, classifier_fun,
                       search_grid, normalize_data=False):
    results = []
    processed_features = np.array(features)
    processed_labels = labels
    if normalize_data:
        scaler = StandardScaler()
        processed_features = scaler.fit_transform(processed_features, processed_labels)

    feature_selector = SelectKBest(k="all")
    feature_selector.fit(processed_features, processed_labels)
    ranked_features = sorted(zip(feature_names, feature_selector.scores_),
                             key=lambda t: t[1], reverse=True)
    ranked_feature_names = [t[0] for t in ranked_features]
    logging.info("Scored features:\n%s", pprint.pformat(ranked_features))
    logging.info("Ranked feature names: %s", ranked_feature_names)

    for k in range(1, len(feature_names) + 1):
        logging.info("Selecting %s best feature(s)", k)
        selected_feature_names = ranked_feature_names[:k]
        logging.info("Selected features: %s", selected_feature_names)
        feature_indices = [feature_names.index(f) for f in selected_feature_names]
        feature_subset = processed_features[:, feature_indices]

        clf = classifier_fun(random_state=98123)
        logging.info("Tuning classifier parameters.")
        clf_tune = grid_search.GridSearchCV(
            clf, search_grid, n_jobs=-1,
            cv=StratifiedShuffleSplit(labels, n_iter=1000, random_state=42),
            scoring="f1")
        clf_tune.fit(feature_subset, processed_labels)
        logging.info("Scores:\n%s", pprint.pformat(clf_tune.grid_scores_))
        logging.info("Best parameters: %s with score %s",
                     clf_tune.best_params_, clf_tune.best_score_)

        clf = classifier_fun(random_state=1987341, **clf_tune.best_params_)
        logging.info("Testing classifier.")
        precision, recall, f1 = test_classifier(clf, feature_subset, processed_labels)
        results.append((k, precision, recall, f1))

    logging.info("Best features:\n%s", pprint.pformat(results))
def analyze_feats(each_feature_set, my_dataset,
                  scoresheet_highest_accuracy, scoresheet_highest_precision):
    data = featureFormat(my_dataset, each_feature_set, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = (
        train_test_split(features, labels, test_size=0.5, random_state=42))

    ################## For each feature set, tune the SVC parameters and
    # return the best SVC estimator
    # tuned_parameters = [{'kernel': ['rbf'], 'C': [1, 3, 10, 100, 1000],
    #                      'degree': [1, 2, 3]}]
    # score = 'precision'
    # clf = GridSearchCV(SVM, tuned_parameters)
    # clf.fit(features_train, labels_train)
    # SVM = clf.best_estimator_
    # print SVM

    # For each feature set, tune the decision tree parameters and return the
    # best estimator
    DT_tuned_parameters = [{'min_samples_split': [30, 40, 50]}]
    # score = 'precision'
    dt_clf = GridSearchCV(tree.DecisionTreeClassifier(), DT_tuned_parameters)
    dt_clf.fit(features_train, labels_train)
    DT = dt_clf.best_estimator_
    print DT
    classifier_type = [DT]
    # continue

    # run each type of classifier and return results
    try:
        total_results = []
        for index, each_clf in enumerate(classifier_type):
            results = test_classifier(each_clf, my_dataset, each_feature_set)
            print each_feature_set, results
            total_results.append(results)

        # for a given feature set, find the classifier with highest
        # precision/accuracy and store it in a list
        for index, num in enumerate(total_results):
            if num[1] == max([accuracy[1] for accuracy in total_results]):
                # print "Highest accuracy: \t", num[0], num[1]
                scoresheet_highest_accuracy.append(
                    [each_feature_set, total_results[index]])
            if num[1] == max([precision[1] for precision in total_results]):
                # print "Highest precision: \t", num[0], num[1]
                scoresheet_highest_precision.append(
                    [each_feature_set, total_results[index]])
    except:
        pass
def try_all_k_best(max=13):
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.25, random_state=42)

    for k in range(1, max + 1):
        pipe = Pipeline([('impute', Imputer(strategy='median')),
                         ('select', SelectKBest(k=k)),
                         ('classify', LogisticRegressionCV())])
        pipe.fit(features_train, labels_train)
        total_predictions, accuracy, precision, recall, f1, f2 = \
            test_classifier(pipe, my_dataset, features_list, folds=1000)
        acc.append(accuracy)
        prec.append(precision)
        reca.append(recall)
def one_feature_predict(features_list, my_dataset):
    all = []
    for i in features_list:
        if i != 'poi':
            l = []
            l.append('poi')
            l.append(i)
            all.append(l)
    # print all

    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf = pd.DataFrame(columns=mycolumns)
    for item in all:
        data = featureFormat(my_dataset, item, sort_keys=True)
        labels, features = targetFeatureSplit(data)
        clf = tree.DecisionTreeClassifier(min_samples_split=4)
        clf.fit(features, labels)
        resultdf.loc[len(resultdf)] = (test_classifier(clf, my_dataset, item))
    return resultdf
def analyze_feats(each_feature_set, my_dataset, classifier_type,
                  scoresheet_highest_accuracy, scoresheet_highest_precision,
                  scoresheet_highest_recall):
    # run each type of classifier and return results
    try:
        total_results = []
        for index, each_clf in enumerate(classifier_type):
            results, feature_importances = test_classifier(
                each_clf, my_dataset, each_feature_set)
            if len(feature_importances) > 0:
                print "####CLF NAME", each_clf
                print "#####Length of feature_importances", len(feature_importances)
                importances = zip(np.mean(feature_importances, axis=0),
                                  each_feature_set[1:])
                importances = sorted(importances, key=lambda i: i[0], reverse=True)
                print "#####Length of importances", len(importances)
                print importances
            print each_feature_set, results
            total_results.append(results)
        print "total_results", total_results

        # for a given feature set, find the classifier with highest
        # precision/accuracy and store it in a list
        for index, num in enumerate(total_results):
            if num[1] == max([accuracy[1] for accuracy in total_results]):
                print "Highest accuracy: \t", num[0], num[1]
                scoresheet_highest_accuracy.append(
                    [each_feature_set, total_results[index]])
            if num[2] == max([precision[2] for precision in total_results]):
                print "Highest precision: \t", num[0], num[2]
                scoresheet_highest_precision.append(
                    [each_feature_set, total_results[index]])
            if num[3] == max([recall[3] for recall in total_results]):
                print "Highest recall: \t", num[0], num[3]
                scoresheet_highest_recall.append(
                    [each_feature_set, total_results[index]])
    except:
        pass
    return
def tuneNB():
    for i in range(1, 20):
        acc = []
        prec = []
        reca = []
        testing_features_list = [u'poi']
        for feature in features_list_score_order:
            testing_features_list.append(feature)
            pipe = Pipeline([('impute', Imputer(strategy='median')),
                             ('classify', GaussianNB(priors=[(i / 2.) * .1,
                                                             (1 - (i / 2.) * .1)]))])
            total_predictions, accuracy, precision, recall, f1, f2 = \
                test_classifier(pipe, my_dataset, testing_features_list, folds=200)
            acc.append(accuracy)
            prec.append(precision)
            reca.append(recall)
        acc_all.append(acc)
        prec_all.append(prec)
        reca_all.append(reca)
        results_dict['prec' + str(i)] = prec
        results_dict['reca' + str(i)] = reca
        results_dict['acc' + str(i)] = acc
    'min_samples_leaf': [x for x in range(1, 10, 2)],
    'max_depth': [None, 1, 2, 4, 8, 12, 18],
    'max_features': ['log2', 'sqrt']
}

rfc = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rfc, param_list, cv=5, verbose=3, n_jobs=-1)
clf_ = clf.fit(features, labels)

print clf.best_score_
print clf.best_estimator_
print "Training Set Score:", clf.score(X_train, y_train)
print "Validation Set Score:", clf.score(X_val, y_val)

## Cross Validation of Model
test_classifier(clf.best_estimator_, features, labels, folds=100)

###########################################################################
## Make predictions
test_features = scl.fit_transform(
    df_test.drop(['Survived', 'PassengerId'], axis=1).values)
clf.best_estimator_.fit(features, labels)
predictions = clf.best_estimator_.predict(test_features)
output_df = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': pd.Series(predictions)
})
output_df = output_df.astype('Int64')
    'total_stock_value', 'prop_to_poi', 'prop_from_poi'
]  # You will need to use more features

from tester import test_classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=42)
clf2 = GaussianNB()
clf3 = RandomForestClassifier(random_state=42)
clf4 = SGDClassifier(random_state=42)

test_classifier(clf, fp, features_list, folds=1000)
test_classifier(clf2, fp, features_list, folds=1000)
test_classifier(clf3, fp, features_list, folds=1000)
test_classifier(clf4, fp, features_list, folds=1000)

### Result from initial classifier using all features
### DecisionTreeClassifier   Accuracy: 0.80840  Precision: 0.26327  Recall: 0.24300  F1: 0.25273  F2: 0.24680  # BEST: higher overall precision, recall & F1
### GaussianNB               Accuracy: 0.83920  Precision: 0.32890  Recall: 0.19800  F1: 0.24719  F2: 0.21512
### RandomForestClassifier   Accuracy: 0.86073  Precision: 0.42811  Recall: 0.13250  F1: 0.20237  F2: 0.15373
### SGDClassifier            Accuracy: 0.52980  Precision: 0.10665  Recall: 0.34250  F1: 0.16265  F2: 0.23747

# Using Decision Tree Classifier to find attributes of importance
fp = pd.DataFrame(fp)
fp = fp.transpose()
X = fp.drop(['poi'], axis=1)
# dt = DecisionTreeClassifier()
t0 = time()
grid_obj = GridSearchCV(dt, parameters, scoring='f1', cv=sss)
print "======== Decision Tree (Optimized) ========"
print("DecisionTree tuning: %r" % round(time() - t0, 3))

# TODO: Fit the grid search object to the training data and find the optimal parameters
t0 = time()
grid_obj = grid_obj.fit(features, labels)
print("DecisionTree fitting: %r" % round(time() - t0, 3))

# Get the estimator
dt = grid_obj.best_estimator_

## Print the parameters
print dt.get_params(), '\n'

print 'Result of feature_list without the newly created feature:'
test_classifier(dt, my_dataset, features_list_without_create_feature, folds=100)
print 'Result of feature_list with the newly created feature:'
test_classifier(dt, my_dataset, features_list, folds=100)

clf = dt

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
    clf = clf.fit(feature_train, target_train)
    accuracy_grid = clf.score(feature_train, target_train)
    print "Best estimator found by grid search:"
    print clf.best_estimator_
    return clf.best_estimator_, accuracy_grid

###############################################################################
enron_method_svm = SVR(kernel='rbf')
param_grid_svm = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
[clf_grid, acc5] = val_grid(enron_method_svm, features_train, labels_train, param_grid_svm)
print acc5

[accuracy, precision, recall, f1, f2] = test_classifier(
    clf_svm, pd.DataFrame(data_dict), features_list, folds=1000)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf_reg, my_dataset, features_list)
# parameters = {
#     'anova__k': (2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21),
#     'anova__k': [2,4,6,8,10,12,14,16,18,20],
#     'pca__whiten': [True, False],
#     'pca__n_components': [6,8,10,11],                 # For use with PCA
#     'clf__min_samples_split': [2,10,20,30,40,50],     # for use with DecisionTree
#     'clf__criterion': ['gini','entropy'],             # for use with DecisionTree
#     'clf__n_estimators': [50,100,200],                # For use with Adaboost
#     'clf__C': [1, 10, 100, 1e3, 5e3, 1e4, 5e4, 1e5],  # for use with SVM
#     'clf__gamma': [0.0001, 0.0005, 0.001, 0.005,      # for use with SVM
#                    0.01, 0.1],
#     'clf__kernel': ['linear','rbf','poly']            # for use with SVM
# }

## Create Cross Validation object for use in GridSearchCV
# cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)

## Apply GridSearchCV to the dataset
# clf = GridSearchCV(clf, parameters, scoring = 'f1', cv=cv)
# clf.fit(features, labels)

## Set the best performing combination of parameters as the new classifier
# clf = clf.best_estimator_

## Use included tester function to assess performance using cross validation
test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
## The DT algorithm results will be displayed while running tester.py
# dt_pred = dt_clf.predict(features_test)
# print "DT best accuracy:", accuracy_score(dt_pred, labels_test)
# print "DT Precision:", precision_score(labels_test, dt_pred)
# print "DT Recall:", recall_score(labels_test, dt_pred)

###########################################################################################
## Run tester.py
selected_features = [
    'poi', 'total_payments', 'total_stock_value', 'salary', 'bonus',
    'fraction_from_poi', 'fraction_to_poi'
]
dump_classifier_and_data(dt_clf_best, enron_less_outliers, selected_features)
test_classifier(dt_clf_best, enron_less_outliers, selected_features)

###########################################################################################
# ## Decision Tree Using SelectK in GridSearchCV
#
# print "Check performance of Decision Tree Using SelectK in GridSearchCV"
#
# data = featureFormat(enron_less_outliers, features_all, sort_keys=True)
# labels, features = targetFeatureSplit(data)
#
# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
#     features, labels, test_size=0.2, random_state=42)
# dt = tree.DecisionTreeClassifier(random_state=42)
#
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
print "Tuning parameters of classifiers"
print

# Naive Bayes classifier
print "Performing Grid Search of Naive Bayes classification"
param_dict_NB = {'feature_selection__k': range(5, len(features_list))}
gs = grid_search(steps_NB, param_dict_NB, features, labels)
gs_clf_NB = gs.best_estimator_
print '\n Score Metrics Naive Bayes Classifier'
test_classifier(gs_clf_NB, data_dict, features_list, folds=1000)
print

# The rest of the parameter tuning is commented out because of execution time;
# NB turned out to be the classifier with the better scores.
'''
# Decision Tree classifier
print "Performing Grid Search of Decision Tree classification"
param_dict_DT = {'feature_selection__k': range(5, len(features_list)),
                 'Decission_Tree__criterion': ['gini', 'entropy'],
                 'Decission_Tree__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
gs = grid_search(steps_DT, param_dict_DT, features, labels)
gs_clf_DT = gs.best_estimator_
print '\n Score Metrics Decision Tree Classifier'
test_classifier(gs_clf_DT, data_dict, features_list, folds=100)
                              false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf

## running through different fold inputs of k-fold cross-validation
# folds = [2, 3, 5, 10]
# for each in folds:
#     test_classifier_kfold(clf, my_dataset, features_list, each)

## test the algorithm multiple times and obtain the accuracy, precision, and recall averages
tot_accuracy = 0
tot_precision = 0
tot_recall = 0
i = 0
while i < 10:
    accuracy, precision, recall = test_classifier(clf, my_dataset, features_list)
    tot_accuracy += accuracy
    tot_precision += precision
    tot_recall += recall
    i += 1
print tot_accuracy / float(10), tot_precision / float(10), tot_recall / float(10)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
cv = StratifiedShuffleSplit(labels, folds, random_state=42)
for train_idx, test_idx in cv:
    features_train = []
    features_test = []
    labels_train = []
    labels_test = []
    for ii in train_idx:
        features_train.append(features[ii])
        labels_train.append(labels[ii])
    for jj in test_idx:
        features_test.append(features[jj])
        labels_test.append(labels[jj])

## Initial algorithms scores
clf_AB = AdaBoostClassifier()
tester.test_classifier(clf_AB, data_dict, features_list)

clf_RBF = SVC(kernel='rbf', max_iter=1000)
tester.test_classifier(clf_RBF, data_dict, features_list)

clf_RF = RandomForestClassifier()
tester.test_classifier(clf_RF, data_dict, features_list)

clf_SVC = SVC(kernel='linear', max_iter=1000)
tester.test_classifier(clf_SVC, data_dict, features_list)

clf_NB = GaussianNB()
tester.test_classifier(clf_NB, data_dict, features_list)

clf_KNN = KNeighborsClassifier()
tester.test_classifier(clf_KNN, data_dict, features_list)

# ### Task 5: Tune your classifier to achieve better than .3 precision and recall
# ### using our testing script. Check the tester.py script in the final project
# ### folder for details on the evaluation method, especially the test_classifier
# ### function. Because of the small size of the dataset, the script uses
tree_features = {}
# show which features correspond to average importances
for idx, elem in enumerate(analyzed_features_list[1:]):
    tree_features[elem] = get[idx]
print tree_features

fin_feat_tree = [
    'poi', '%frompoi', 'shared_receipt_with_poi',
    'exercised_stock_options', 'expenses'
]

# Gaussian NB comparison
clf = GaussianNB()
test_classifier(clf, my_dataset, fin_feat_kbest, folds=1000)
clf = GaussianNB()
test_classifier(clf, my_dataset, fin_feat_tree, folds=1000)

# preparing ground for DT classifier
data = featureFormat(my_dataset, fin_feat_kbest, sort_keys=True)
data2 = featureFormat(my_dataset, fin_feat_tree, sort_keys=True)
labels, features = targetFeatureSplit(data)
labels2, features2 = targetFeatureSplit(data2)

tuned_parameters_tree = [{
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4]
}]
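# A sketch of how the grid above would presumably be fed to GridSearchCV, using the
# same stratified shuffle split pattern seen elsewhere in this collection; the cv
# object, scoring choice, and fold count are assumptions, not the author's settings:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

cv_sss = StratifiedShuffleSplit(labels, 100, random_state=42)
gs_tree = GridSearchCV(DecisionTreeClassifier(random_state=42),
                       tuned_parameters_tree, scoring='f1', cv=cv_sss)
gs_tree.fit(features, labels)
test_classifier(gs_tree.best_estimator_, my_dataset, fin_feat_kbest, folds=1000)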
# grid_search.fit(features, labels)
# pprint.pprint(grid_search.grid_scores_)

# use K-best to rank the best features
k_best = SelectKBest()
k_best.fit(features, labels)
results_list = zip(k_best.get_support(), features_list[1:], k_best.scores_)
results_list = sorted(results_list, key=lambda x: x[2], reverse=True)
# print the scores for each feature
pprint.pprint(results_list)

# use feature_importances_ from a decision tree classifier to rank the best features
from tester import test_classifier, dump_classifier_and_data
from sklearn import tree
clf_test = tree.DecisionTreeClassifier()
test_classifier(clf_test, my_dataset, features_list)
importance = clf_test.feature_importances_
for i in range(len(importance)):
    print features_list[i + 1] + ": " + str(importance[i])

# Using the ranking from K-best and merging it with the more important features
# obtained from the decision tree classifier, I decided to select the best of both sets:
features_list = [
    'poi', 'exercised_stock_options', 'total_stock_value', 'salary',
    'fraction_to_poi', 'restricted_stock', 'shared_receipt_with_poi'
]

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
labels, features = targetFeatureSplit(data)

# In[7]:

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
clf = GaussianNB()
test_classifier(clf, my_dataset, features_list)

clf1 = tree.DecisionTreeClassifier()
test_classifier(clf1, my_dataset, features_list)

clf2 = AdaBoostClassifier()
test_classifier(clf2, my_dataset, features_list)

clf3 = KNeighborsClassifier(n_neighbors=4)
test_classifier(clf3, my_dataset, features_list)

# In[8]:

from sklearn.neighbors.nearest_centroid import NearestCentroid
clf4 = NearestCentroid()
test_classifier(clf4, my_dataset, features_list)
]))
feature_list_default.insert(
    0, feature_list_default.pop(feature_list_default.index("poi")))

# test different classifiers with the default feature set
# (for more details, please see the notebook)
for clf in [
        GaussianNB(),
        KMeans(),
        LogisticRegression(class_weight="balanced"),
        SVC(class_weight="balanced"),
        ADA(),
        DT(class_weight="balanced"),
        RF(class_weight="balanced")
]:
    tester.test_classifier(clf, final_dataset, feature_list_default)

### 4.1: Evaluate the impact of feature engineering on classification performance
### (for more details, please refer to the notebook). Test only with SVC (the best classifier).
print "BASELINE PERFORMANCE - default feature set and SVC with linear kernel"
print "--------------------------------------------------------------------"
clf = SVC(kernel="linear", class_weight="balanced")
tester.test_classifier(clf, final_dataset, feature_list_default)

print "EXTENDED FEATURE SET 1 PERFORMANCE - default feature set + TF-IDF features and SVC with linear kernel"
print "--------------------------------------------------------------------"
selected_feature_list = feature_list_default + [
    'word_feature_2', 'word_feature_3'
]
clf = SVC(kernel="linear", class_weight="balanced")
tester.test_classifier(clf, final_dataset, selected_feature_list)
parameters_NB = dict(SelectKBest__k=range(1, 10))
pipeline = sklearn.pipeline.Pipeline(steps_NB)
grid = GridSearchCV(pipeline, param_grid=parameters_NB, cv=cv, scoring='f1')
grid.fit(features_train, labels_train)
predict = grid.predict(features_test)
report = classification_report(labels_test, predict)
best_params = grid.best_params_
# print report
print "PARAMETERS USED:"
print best_params
print grid.best_score_

clf_GNB = grid.best_estimator_
print "TUNED CLASSIFICATION REPORT:"
test_classifier(clf_GNB, my_dataset, RF_features_list, folds=1000)

# overwrite features_list
features_list = RF_features_list

# tuned Random Forest **without** SKB
steps_RF = [
    ('minmax', mms),
    # ('SelectKBest', skb),
    ('random_forest', clf_RF)
]
parameters_RF = dict(
    # SelectKBest__k=[6],
    random_forest__criterion=['gini'],
    random_forest__n_estimators=[9],
    random_forest__min_samples_split=[2],
# Removed because it did not perform well
# ab_pca = {"PCA__n_components": range(4, 7), "PCA__whiten": [True, False]}
# ab_k.update(ab_pca)
ab_k.update(ab_params)
enron.get_best_parameters_reports(pipe_ab, ab_k, features, labels)


if __name__ == '__main__':
    '''
    GAUSSIAN NAIVE BAYES
    '''
    # Set up the classifier to use
    clf = GaussianNB()
    # Evaluate with cross-validation via tester.test_classifier
    print "Gaussian Naive Bayes : \n", tester.test_classifier(
        clf, my_dataset, best_features_list)
    """
    Gaussian Naive Bayes :
    GaussianNB(priors=None)
    Accuracy: 0.84380  Precision: 0.40058  Recall: 0.34550  F1: 0.37101  F2: 0.35527
    Total predictions: 15000  True positives: 691  False positives: 1034
    False negatives: 1309  True negatives: 11966
    None
    """

    '''
    LOGISTIC REGRESSION
    '''
    # Use the tuning helper to get the best parameters for each algorithm
    # tune_logistic_regression()
    """
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

# GaussianNB
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
test_classifier(clf, my_dataset, features_list, folds=1000)

# Decision tree
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_leaf=1)
test_classifier(clf, my_dataset, features_list, folds=1000)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
test_classifier(clf, my_dataset, features_list, folds=1000)

# k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=4)
test_classifier(clf, my_dataset, features_list)
for i, clf in enumerate(classifiers):
    print 'Step 1: ', i, names[i]
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print "Precision: ", precision_score(labels_test, pred)
    print "Recall: ", recall_score(labels_test, pred)
    print "F1: ", f1_score(labels_test, pred)
    print 'done...'

for clf in classifiers:
    test_classifier(clf, my_dataset, features_list)
    print '-------------------------------------------------------------------------------'

#%% Step 2:
classifier_opt = []
parameters = [
    dict(),
    dict(n_neighbors=range(1, 20, 1), weights=['uniform', 'distance']),
    dict(criterion=['gini', 'entropy'],
         min_samples_split=range(10, 30, 1),
         min_samples_leaf=range(1, 11, 1)),
    dict(criterion=['gini', 'entropy'],
         n_estimators=[5, 8, 10, 12, 25],
precision_knn = []
recall_knn = []

# Apply SelectKBest to each classifier for k=1 to k=19
for i in range(1, 20):
    k = SelectKBest(f_classif, k=i)
    features_new = k.fit_transform(features, labels)
    selected_features_index = k.get_support()
    selected_features_list = features_list[selected_features_index]
    selected_features_list = np.insert(selected_features_list, 0, 'poi')
    print "===================="
    print "Selected features: ", selected_features_list

    # Decision Tree
    clf = tree.DecisionTreeClassifier()
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds=1000)
    precision_tree.append(pre)
    recall_tree.append(rec)

    # Naive Bayes
    clf = naive_bayes.GaussianNB()
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds=1000)
    precision_nb.append(pre)
    recall_nb.append(rec)

    # K Nearest Neighbors
    clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds=1000)
    precision_knn.append(pre)
    recall_knn.append(rec)
# Out of the three, SVC seems to be the most accurate.

# In[89]:

from sklearn.tree import DecisionTreeClassifier
from tester import test_classifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit

test_classifier(DecisionTreeClassifier(random_state=1),
                enron_data, features_final, folds=100)

tree = DecisionTreeClassifier()
parameters = {'tree__criterion': ('gini', 'entropy'),
              'tree__splitter': ('best', 'random'),
              'tree__min_samples_split': [2, 10, 20],
              'tree__max_depth': [10, 15, 20, 25, 30],
              'tree__max_leaf_nodes': [5, 10, 30]}

# use scaling in GridSearchCV
Min_Max_scaler = preprocessing.MinMaxScaler()
# features = Min_Max_scaler.fit_transform(features)
pipeline = Pipeline(steps=[('scaler', Min_Max_scaler),
                           ('pca', PCA(n_components=2)),
                           ('tree', tree)])
cv = StratifiedShuffleSplit(target, 100, random_state=42)
               ('NB', nb_Clf), ('SVM', svm_Clf)]
results = []
names = []
scoring = 'accuracy'

df1 = df
f_list = df.columns
f_list = f_list[1:len(f_list) - 1]
f_list = map(str, f_list)
f_features = ['poi']
for fn in f_list:
    f_features.append(fn)
print(f_features)

test_classifier(l_Clf, my_dataset, f_features, folds=45)
test_classifier(lda_Clf, my_dataset, f_features, folds=45)
test_classifier(knn_Clf, my_dataset, f_features, folds=45)
test_classifier(rf_Clf, my_dataset, f_features, folds=45)
print("\n\n")

df['salary_bonus_ratio'] = df.salary.div(df.bonus)
df.loc[~np.isfinite(df['salary_bonus_ratio']), 'salary_bonus_ratio'] = 0
df['salary_expense_ratio'] = df.salary.div(df.expenses)
df.loc[~np.isfinite(df['salary_expense_ratio']), 'salary_expense_ratio'] = 0
features_list.append('salary_bonus_ratio')
features_list.append('salary_expense_ratio')
df = df[features_list]
df = df.apply(np.sqrt, axis=1)
sd = StandardScaler()
fsl = FeatureSel(k_best=5, pca_comp=5)
# clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC(C=0.000001))])
clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC())])
gscv = GridSearchCV(clf,
                    {"lvc__C": np.logspace(-6, -1, 5),
                     "fsl__k_best": [1, 5, 10],
                     "fsl__pca_comp": [0, 5, 10]},
                    scoring="recall", verbose=0)
gscv.fit(np.array(features), np.array(labels))

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(gscv.best_estimator_, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(gscv.best_estimator_, my_dataset, features_list)
sss = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.3, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=sss)

### Tried different parameters for StratifiedShuffleSplit and GridSearchCV
# sss = StratifiedShuffleSplit(n_iter=20, test_size=0.5, random_state=5)
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=sss, verbose=10, scoring='f1')
# grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=sss, error_score=0, scoring='f1')
# print "Grid Search: ", grid_search
# print(grid_search.best_estimator_.steps)
# print "\n", "Best parameters are: ", grid_search.best_params_, "\n"

grid_search.fit(features, labels)
clf = grid_search.best_estimator_

### Use tester.py to test the best model found
from tester import test_classifier
# Use test_classifier to evaluate the model selected by GridSearchCV
print "\n", "Tester Classification report - StratifiedShuffleSplit:"
test_classifier(clf, data_dict, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
# print features_list
dump_classifier_and_data(clf, my_dataset, features_list)
print("Training time : {}".format(end_fitting - start_fitting)) start_predicting = time() svc_pred = svc_grid.predict(features_test) end_predicting = time() print("Predicting time : {}".format(end_predicting - start_predicting)) svc_accuracy = accuracy_score(svc_pred, labels_test) print('SVC accuracy score : {}'.format(svc_accuracy)) print "f1 score :", f1_score(svc_pred, labels_test) print "precision score :", precision_score(svc_pred, labels_test) print "recall score :", recall_score(svc_pred, labels_test) svc_best_estimator = svc_grid.best_estimator_ print(svc_best_estimator) test_classifier(nb_grid.best_estimator_, my_dataset, features_list) #Checking the affect of new feature on the final classifier test_features_list = [ 'poi', 'total_stock_value', 'exercised_stock_options', 'bonus', 'deferred_income', 'long_term_incentive', 'restricted_stock', 'salary', 'total_payments', 'other', 'shared_receipt_with_poi', 'fraction_from_this_person_to_poi' ] print "\n=================Effect of new feature on final classifier=================" test_classifier(nb_grid.best_estimator_, my_dataset, test_features_list) ###Task 6: Dump your classifier, dataset, and features_list so anyone can
}
dtc_clf = sklearn.tree.DecisionTreeClassifier()
dtcclf = grid_search.GridSearchCV(dtc_clf, parameters, scoring=scoring, cv=cv)
dtcclf.fit(features, labels)
print 'best estimator:', dtcclf.best_estimator_
print 'best score:', dtcclf.best_score_
print 'Processing time:', round(time() - t0, 3), 's'

# Classifier validation
## DecisionTreeClassifier Validation No. 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
test_classifier(dtc_best_clf, enron_data, eng_feature_list)
print 'Processing time:', round(time() - t0, 3), 's'

## DecisionTreeClassifier Validation No. 2 (Randomized, partitioned trials, n=1,000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
evaluate.evaluate_clf(dtc_best_clf, features, labels, num_iters=1000, test_size=0.3)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print 'Processing time:', round(time() - t0, 3), 's'
# Importing the decision tree classifier from the scikit-learn package.
from sklearn import tree

# Creating a classifier with the optimized parameters.
clf_ada = tree.DecisionTreeClassifier(splitter='best',
                                      criterion='gini',
                                      class_weight='balanced',
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      max_depth=5,
                                      max_leaf_nodes=4)

# Testing the classifier.
tester.test_classifier(clf=clf_ada, dataset=my_dataset, feature_list=features_list)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Please, find more information in the Jupyter Notebook.
# Example starting point. Try investigating other evaluation techniques!

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
# from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

features_train_zero, features_test_zero, labels_train_zero, labels_test_zero = train_test_split(
    features, labels, test_size=0.3, random_state=42)

clf = GaussianNB()
clf.fit(features_train_zero, labels_train_zero)
print test_classifier(clf, my_dataset, features_list, folds=1000)

# #### Task 5: Tune your classifier to achieve better than .3 precision and recall
# #### using our testing script. Check the tester.py script in the final project
# #### folder for details on the evaluation method, especially the test_classifier
# #### function. Because of the small size of the dataset, the script uses
# #### stratified shuffle split cross validation. For more info:
# #### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
#
# ## Example starting point. Try investigating other evaluation techniques!
# from sklearn.cross_validation import train_test_split
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42)
#
# #### Task 6: Dump your classifier, dataset, and features_list so anyone can
                                                     random_state=42,
                                                     stratify=labels)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# clf = DecisionTreeClassifier(min_samples_split=100)
# clf = SVC(C=10.0, gamma=0.001)
# clf = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=100), algorithm="SAMME")
clf = KNeighborsClassifier(n_neighbors=5, weights="distance", algorithm="auto")

test_classifier(clf, data_dict, selected_features_list)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# # tuned_parameters = {"criterion": ("gini", "entropy"), "max_depth": (None, 1, 2, 5, 7, 10), "min_samples_split": (10, 100, 250)}
# # tuned_parameters = {"C": (10.0, 100.0, 1000.0), "gamma": (1e-3, 1e-4)}
# # tuned_parameters = {"n_estimators": (50, 100, 150, 200), "learning_rate": (1.0, 1.5, 2.0), "algorithm": ("SAMME", "SAMME.R")}
# tuned_parameters = {"n_neighbors": (1, 5, 10, 15), "weights": ("uniform", "distance")}
#
# gs = GridSearchCV(clf, tuned_parameters, cv=10)
# gs.fit(X_train, y_train)
clf_NB = GaussianNB()
parm = {}
clf_NB = Pipeline([('scaler', scaler), ('gnb', clf_NB)])
gs = GridSearchCV(clf_NB, parm)
gs.fit(features_train, labels_train)
clf_NB = gs.best_estimator_
print "\nGaussianNB score:\n", clf_NB.score(features_train, labels_train)
print "GaussianNB score time:", round(time() - t1, 3), "s"

## Test Point
print "\nGaussianNB:\n", test_classifier(clf_NB, my_dataset, features_list)

## 2. Decision Tree Classifier
t2 = time()
parms = {'criterion': ['gini', 'entropy'],
         'min_samples_split': [2, 5, 10, 20],
         'max_depth': [None, 2, 5, 10],
         'splitter': ['random', 'best'],
         'max_leaf_nodes': [None, 5, 10, 20]}
clf_DT = tree.DecisionTreeClassifier()
gs = GridSearchCV(clf_DT, parms)
parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10],
              'min_samples_split': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'criterion': ('gini', 'entropy')}
dt_clf = DecisionTreeClassifier(random_state=42)
cv = cross_validation.StratifiedShuffleSplit(labels, n_iter=10)
clf = GridSearchCV(dt_clf, parameters, cv=cv, scoring='f1')
clf.fit(features, labels)
predictor = clf.predict(features_test)
dt_best_estimator = clf.best_estimator_
precision = precision_score(labels_test, predictor)
recall = recall_score(labels_test, predictor)
f1 = f1_score(labels_test, predictor)  # renamed to avoid shadowing sklearn's f1_score
print "Best score: %f" % clf.best_score_
print dt_best_estimator
print "processing time:", round(time() - t0, 3), "s"

# Classifier validation
## DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
test_classifier(dt_best_estimator, my_dataset, my_features_list)
print 'Processing time:', round(time() - t0, 3), 's'

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(dt_best_estimator, my_dataset, my_features_list)
### Create StratifiedKFold
skf = StratifiedKFold(labels_train, random_state=42)

### DecisionTreeClassifier
algo_name = "DecisionTreeClassifier"
print algo_name
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([('scaler', MinMaxScaler()),
                     ('kbest', SelectKBest(f_classif)),
                     ('dtc', DecisionTreeClassifier(random_state=42))])
grid_search = GridSearchCV(pipeline,
                           {'kbest__k': range(1, 16),
                            'dtc__min_samples_split': [1, 2, 3],
                            'dtc__max_depth': [None, 10, 5]},
                           scoring='f1', cv=skf)
grid_search.fit(features_train, labels_train)
clf = grid_search.best_estimator_
perf_dict[algo_name] = test_classifier(clf, my_dataset, features_list)
parm_dict[algo_name] = grid_search.best_params_

### Print SelectKBest scores, note these are the same for all classifiers
kbest_scores = clf.named_steps['kbest'].scores_
feature_scores = {}
for i in xrange(1, len(features_list)):
    feature_scores[features_list[i]] = kbest_scores[i - 1]
feature_scores = sorted(feature_scores.items(), key=operator.itemgetter(1), reverse=True)
i = 1
for f in feature_scores:
    print "|{}|{}|{}|".format(i, f[0], f[1])
    i += 1
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

### Extract features and labels from my_dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
folds = 1000
cv = StratifiedShuffleSplit(labels, folds, random_state=42)

## Setting up 3 classifiers with feature scaling
## Gaussian NB
print "Gaussian NB classifier output:"
NB_clf = Pipeline(steps=[('scaling', preprocessing.MinMaxScaler()),
                         ('classifier', GaussianNB())])
t0 = time()
tester.test_classifier(NB_clf, my_dataset, features_list)
print "Gaussian NB run time:", round(time() - t0, 3), "s"

## KMeans
print "KMeans classifier output:"
KM_clf = Pipeline(steps=[('scaling', preprocessing.MinMaxScaler()),
                         ('classifier', KMeans(n_clusters=2))])
t0 = time()
tester.test_classifier(KM_clf, my_dataset, features_list)
print "KMeans run time:", round(time() - t0, 3), "s"

## Decision tree
print "Decision Tree classifier output:"
DT_clf = Pipeline(steps=[(
#### keep the engineered features added to data_dict
my_dataset = data_dict

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

##### Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight='auto', random_state=42)

from time import time
from tester import test_classifier
t0 = time()
test_classifier(rf, data_dict, features_list, folds=100)
print("Random forest fitting time: %rs" % round(time() - t0, 3))

###### AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(random_state=42)
t0 = time()
test_classifier(ab, data_dict, features_list, folds=100)
print("AdaBoost fitting time: %rs" % round(time() - t0, 3))

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
        'metric': ['manhattan', 'minkowski', 'euclidean'],
        'weights': ['distance', 'uniform']
    },
    cv=cv,
    scoring='f1')
knn.fit(features, labels)
print 'K Nearest Neighbors best estimator: ', knn.best_estimator_
print 'K Nearest Neighbors best parameters: ', knn.best_params_
print 'K Nearest Neighbors best score: ', knn.best_score_
# tester.test_classifier(knn.best_estimator_, my_dataset, best_features)

# Pipeline
print "Pipelining..."
pipeline = Pipeline([('normalization', scaler),
                     ('classifier', knn.best_estimator_)])
tester.test_classifier(pipeline, my_dataset, best_features)

# Tune K Means
kmeans = GridSearchCV(
    KMeans(),
    param_grid={
        'n_clusters': [2],
        'tol': [0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01],
        'max_iter': [300, 200, 400, 500, 600, 700],
        'init': ['k-means++', 'random'],
        'copy_x': [True, False]
    },
    cv=cv,
    scoring='f1')
kmeans.fit(features, labels)
print 'K Means best estimator: ', kmeans.best_estimator_