def select_features(feature_list, my_dataset, k):
    '''
    Score features with SelectKBest across 1000 StratifiedShuffleSplit
    training sets and print the average score and p-value of each feature.

    feature_list = list of strings representing feature names; the first
                   entry is the label (e.g. 'poi')
    my_dataset   = dataset containing all features and labels
    k            = number of desired features
    '''
    import operator
    import numpy as np
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_selection import SelectKBest
    from feature_format import featureFormat, targetFeatureSplit

    # Create feature and label arrays from the dataset
    data = featureFormat(my_dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # Create sss with 1000 splits
    sss = StratifiedShuffleSplit(n_splits=1000, random_state=42)
    feature_scores = {}

    # Create 1000 different sets of training and testing samples
    for train, test in sss.split(features, labels):
        features_train = [features[i] for i in train]
        labels_train = [labels[i] for i in train]

        # Fit SelectKBest on each set of training data
        selector = SelectKBest(k=k)
        selector.fit(features_train, labels_train)

        # Get the name, score, and p-value of each selected feature;
        # indices are offset by 1 because feature_list[0] is the label
        feature_indices = selector.get_support(indices=True)
        sel_features = [(feature_list[i + 1],
                         selector.scores_[i],
                         selector.pvalues_[i])
                        for i in feature_indices]

        # Gather the score and p-value of each feature from each split
        for feat, score, pval in sel_features:
            if feat not in feature_scores:
                feature_scores[feat] = {"scores": [], "pvalues": []}
            feature_scores[feat]['scores'].append(score)
            feature_scores[feat]['pvalues'].append(pval)

    # Average the score and p-value of each feature across all splits
    feature_scores_l = []
    for feat in feature_scores:
        feature_scores_l.append((feat,
                                 np.mean(feature_scores[feat]['scores']),
                                 np.mean(feature_scores[feat]['pvalues'])))

    # Sort features by average score, highest first
    sorted_feature_scores = sorted(feature_scores_l,
                                   key=operator.itemgetter(1), reverse=True)

    print "feature: score, p-value"
    for feat, score, pval in sorted_feature_scores:
        print "{}: {} {}".format(feat, score, pval)

    # Return the averaged, sorted scores so the caller can inspect them
    return sorted_feature_scores
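As a quick usage sketch, a call might look like the following; the feature names here are hypothetical placeholders, with the label ('poi') in the first position as the function expects:

# Hypothetical call -- these feature names are placeholders, not the
# final list; 'poi' must come first because it is the label.
features_list = ['poi', 'salary', 'bonus', 'total_stock_value']
select_features(features_list, my_dataset, k=3)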
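The evaluation loop below references several names defined earlier in the script: clf_list, cv, the single train/test split, the score lists, and the tester.py helpers. A minimal sketch of that setup follows; the specific classifiers, split parameters, and n_splits value are illustrative assumptions, not necessarily the ones actually used, and features/labels are assumed to come from featureFormat and targetFeatureSplit as above.

# Assumed setup for the evaluation loop -- the classifiers and split
# parameters are illustrative stand-ins.
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (StratifiedShuffleSplit,
                                     cross_val_score, train_test_split)
from sklearn.metrics import precision_score, recall_score
from tester import dump_classifier_and_data, main

clf_list = [GaussianNB(), DecisionTreeClassifier(), RandomForestClassifier()]
cv = StratifiedShuffleSplit(n_splits=100, random_state=42)
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
recall_list = []
mean_recall_list = []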
# loop through each classifier and capture evaluation metrics
for c in clf_list:
    clf = c
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)

    # Scores from a single train/test split
    print "Accuracy is ", clf.score(features_test, labels_test)
    print "precision = ", precision_score(labels_test, pred)
    print "recall = ", recall_score(labels_test, pred)
    recall_list.append(recall_score(labels_test, pred))

    # Mean recall from Stratified Shuffle Split cross validation
    print "\nRunning Stratified Shuffle Split cross validation to compare recall\n"
    mean_recall = cross_val_score(clf, features, labels,
                                  cv=cv.split(features, labels),
                                  scoring='recall').mean()
    print "printing mean of Stratified Shuffle Split"
    print "mean = ", mean_recall
    mean_recall_list.append(mean_recall)
    print "\n"

    # Dump the classifier, dataset, and feature list so tester.py can score it
    dump_classifier_and_data(clf, my_dataset, features_list)
    main()

print "printing summary of recall scores from classifiers, \n", recall_list
print "printing summary of mean recall scores from Stratified Shuffle Split CV, \n", mean_recall_list

Since I didn't have a particular algorithm in mind, I iterated through several classifiers, with no parameter tuning, to get a baseline of how each one performs; this helps me choose which classifier to focus my tuning on. For each classifier I captured the accuracy, precision, and recall on a single train/test split, along with the mean recall from Stratified Shuffle Split cross-validation, so the two scores could be compared. Stratified Shuffle Split cross-validation is important here because of the imbalance between POIs and non-POIs, and because it lets every data point appear in both training and testing sets across the splits. Additionally, I dumped the classifier, dataset, and feature list in each iteration so I could call tester.py, and I recorded the scores the tester reports for each classifier. After iterating through each classifier, I observed the following metrics: