def precompute_recall_precision(features_list, sum = False):
    """Rank features with f_classif, then grid-search and test every method.

    Reads the module-level ``my_dataset``, ``methods`` and ``sk_fold``.
    Prints each feature with its f_classif score (best first), optionally
    plots feature correlations, and for every entry of ``methods`` runs a
    recall-scored ``GridSearchCV`` and hands the best estimator to
    ``my_test_classifier``.

    Args:
        features_list: feature names without the leading ``'poi'`` label.
        sum: when falsy, ``plot_feature_correlation`` is called as well.
            (The name shadows the builtin but is kept for existing callers.)
    """
    features_list_all = ['poi'] + features_list
    data = featureFormat(my_dataset, features_list_all, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Score the features using f_classif. Only scores_ is read afterwards,
    # so fitting is enough; the transformed matrix was never used.
    sel = SelectKBest(k='all', score_func=f_classif)
    sel.fit(features, labels)
    kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)]
    sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True)

    print("Feature Set( %s ) List and K-best scores:" % (len(kbest),))
    for name, score, index in sorted_kbest:
        print("%s \t%s %s" % (index, name, score))

    if not sum:
        plot_feature_correlation(features, len(kbest))

    # NOTE(review): the flattened source does not show whether this loop was
    # nested under ``if not sum``; it is kept at function level here.
    for i, method in enumerate(methods):
        pipe, params = method()
        grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall')
        grid_searcher.fit(features, labels)
        clf = grid_searcher.best_estimator_
        # my_test_classifier receives the dataset and feature names itself,
        # so features/labels do not need to be rebuilt on every iteration.
        my_test_classifier(clf, my_dataset, features_list_all, i)
def check_enron_outliers(data_dict):
    """Plot salary vs. exercised stock options before and after dropping "TOTAL".

    Prints the row index and value of the largest ``exercised_stock_options``
    entry, draws all rows in the left panel, removes the ``"TOTAL"`` key from
    ``data_dict`` (in place; no error if it is absent) and draws the
    remaining rows in the right panel. Rows with a truthy POI flag are red,
    the others grey.

    Args:
        data_dict: dataset dictionary handed to ``featureFormat``. It is
            mutated by the removal of ``"TOTAL"``.
    """
    features_list = ["poi", "salary", "exercised_stock_options"]
    x_name = features_list[1]
    y_name = features_list[2]

    def scatter_panel(position, points):
        # Column 0 is the POI flag: it picks the colour and is added to the
        # marker size.
        plt.subplot(1, 2, position)
        point_colors = ['red' if flag else 'grey' for flag in points[:, 0]]
        plt.scatter(points[:, 1], points[:, 2], s=40+points[:, 0],
                    c=point_colors, alpha=0.5, lw=0.)

    data = featureFormat(data_dict, features_list)
    y_values = data[:, 2]
    midx = y_values.argmax()
    feature_2_max = max(y_values)
    print("idx of max  %s  =  %s" % (y_name, midx))
    print("max  %s  =  %s ,  %s" % (y_name, feature_2_max, y_values[midx]))

    scatter_panel(1, data)
    plt.xlabel(x_name)
    plt.ylabel(y_name)

    # Now remove one outlier
    data_dict.pop("TOTAL", 0)
    data = featureFormat(data_dict, features_list)

    scatter_panel(2, data)
    plt.ticklabel_format(useOffset=True)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    figure_title = "{0} vs {1} Plots before and after Outlier Removal.".format(x_name, y_name)
    plt.title(figure_title, x=-0.1, y=1.05)
    plt.show()
def dataset_explore():
    """Plot salary against bonus before and after outlier cleaning.

    Loads ``../final_project/final_project_dataset.pkl``, plots the complete
    dataset, calls ``dataset_outlier_cleaner`` on it and plots again.
    The cleaner's return value is ignored, so it presumably edits the
    dictionary in place — verify against its definition.
    """
    # The pickle is a local project file; do not point this at untrusted data.
    # Mode "r" is kept as in the original; the handle is now closed reliably.
    with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
        enron_data = pickle.load(dataset_file)

    features = ["salary", "bonus"]

    # === Complete dataset ===
    data = featureFormat(enron_data, features)
    features_plot(data, 0, 1, 'salary', 'bonus', 'Complete dataset')

    # === Dataset without outliers ===
    dataset_outlier_cleaner(enron_data)
    data = featureFormat(enron_data, features)
    features_plot(data, 0, 1, 'salary', 'bonus', 'Dataset without outliers')
def make_feature_histograms(dataset, features_list):
    """Save one POI / non-POI histogram image per feature.

    For every feature after the label, the non-zero values are histogrammed
    separately for POIs (red) and non-POIs (blue) on shared bins. A text box
    with max, min, mean, median and counts is added, followed by up to five
    high and five low outliers as judged by ``is_outlier``. Each figure is
    written to ``hists/<feature>_histogram.png`` next to this script.

    Features without any non-zero value are skipped, because percentiles and
    min/max are undefined for them.

    Args:
        dataset: dataset dictionary handed to ``featureFormat``.
        features_list: feature names, label first.
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    plt.ioff()
    # Create and save into the same directory. The original created it beside
    # the script but saved relative to the current working directory.
    hist_dir = os.path.join(os.path.dirname(__file__), 'hists')
    if not os.path.exists(hist_dir):
        os.makedirs(hist_dir)

    n_columns = len(features[0]) if len(features) else 0
    for column, feature in zip(range(n_columns), features_list[1:]):
        feature_values_non_poi = [row[column] for row, label in zip(features, labels)
                                  if label == 0.0]
        feature_values_poi = [row[column] for row, label in zip(features, labels)
                              if label == 1.0]
        feature_values = feature_values_non_poi + feature_values_poi

        non_zero_values_non_poi = [x for x in feature_values_non_poi if x != 0.0]
        non_zero_values_poi = [x for x in feature_values_poi if x != 0.0]
        non_zero_values = non_zero_values_non_poi + non_zero_values_poi
        if not non_zero_values:
            # Nothing to plot; np.percentile / max / min would fail below.
            continue

        q1, q3 = np.percentile(non_zero_values, [25, 75])
        iqr = q3 - q1
        outliers_hi = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x > q3]
        outliers_lo = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x < q1]

        plt.figure()
        # get same binwidth for both POI and non-POI
        bins = np.histogram(non_zero_values, bins=50)[1]
        plt.hist(non_zero_values_poi, bins=bins, alpha=.5, lw=0, color='r', label='POIs')
        plt.hist(non_zero_values_non_poi, bins=bins, alpha=.5, lw=0, color='b',
                 label='Non-POIs')

        msg = ('Maximum %s: %d\n' % (feature, max(non_zero_values)) +
               'Minimum %s: %d\n' % (feature, min(non_zero_values)) +
               'Mean %s: %.5f\n' % (feature, np.mean(non_zero_values)) +
               'Median %s: %d\n' % (feature, np.median(non_zero_values)) +
               '\nTotal Number of Values: %d\n' % len(feature_values) +
               'Total Number of Non-Zero Values: %d\n' % len(non_zero_values))

        # print out some outlier values if they exist
        for outliers, which_ols in zip([outliers_hi, outliers_lo], ['Top', 'Bottom']):
            if not outliers:
                continue
            top_n = min(len(outliers), 5)
            if which_ols == 'Top':
                ranked = sorted(outliers, reverse=True)
                ol_line = q3 + 1.5*iqr
            else:
                ranked = sorted(outliers)
                ol_line = q1 - 1.5*iqr
            msg += '\n%s %d Outliers: ' % (which_ols, top_n)
            msg += ', '.join('%d' % value for value in ranked[:top_n])
            # Dashed line marks the 1.5 * IQR fence on that side.
            plt.axvline(ol_line, lw=.5, ls='--', c='r')

        plt.figtext(.3, .4, msg)
        plt.title("%s histogram (non-zero values)" % feature)
        plt.legend()
        plt.savefig(os.path.join(hist_dir, '%s_histogram.png' % feature))
        plt.close()
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Run ``fit_and_test_classifier`` on test and validation halves of each split.

    Each stratified 80/20 shuffle split has its held-out indices divided by
    parity: even indices form the validation set, odd indices the test set.
    The classifier is fitted and tested once per held-out set, and the sizes
    of all three sets are printed.

    Args:
        clf: classifier passed through to ``fit_and_test_classifier``.
        dataset: dataset dictionary handed to ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle splits.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # configure split of test_size and train_size
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42,
                                test_size = .2, train_size = .8)

    # NOTE(review): the flattened source does not show whether the fit calls
    # and prints sat inside this loop; they are kept inside it here.
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]

        validation_idx = [jj for jj in test_idx if jj % 2 == 0]
        holdout_idx = [jj for jj in test_idx if jj % 2 != 0]
        features_validation = [features[jj] for jj in validation_idx]
        labels_validation = [labels[jj] for jj in validation_idx]
        features_test = [features[jj] for jj in holdout_idx]
        labels_test = [labels[jj] for jj in holdout_idx]

        fit_and_test_classifier(clf, features_train, labels_train,
                                features_test, labels_test)
        fit_and_test_classifier(clf, features_train, labels_train,
                                features_validation, labels_validation)

        # Determine size of training & test sets
        print("features_train: %s labels_train: %s"
              % (len(features_train), len(labels_train)))
        print("features_test: %s labels_test: %s"
              % (len(features_test), len(labels_test)))
        print("features_validation: %s labels_validation: %s"
              % (len(features_validation), len(labels_validation)))
def cluster2Features():
    """Cluster salary / exercised_stock_options with 2-means and draw the result.

    Reads the module-level ``data_dict``, scatter-plots the two financial
    features, fits ``KMeans(n_clusters=2)`` on those two features and passes
    the predicted cluster labels to ``Draw``.

    The clustering input is ``finance_features`` only. The original fitted on
    the full ``data`` array, whose first column is the POI label, so the
    label leaked into the clusters and did not match what ``Draw`` plots.
    """
    from sklearn.cluster import KMeans

    ### the input features we want to use
    ### can be any key in the person-level dictionary (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    poi = "poi"
    features_list = [poi, feature_1, feature_2]
    data = featureFormat(data_dict, features_list )
    # From here on ``poi`` holds the label values rather than the feature name.
    poi, finance_features = targetFeatureSplit( data )

    ### in the "clustering with 3 features" part of the mini-project,
    ### you'll want to change this line to
    ### for f1, f2, _ in finance_features:
    ### (as it's currently written, the line below assumes 2 features)
    for f1, f2 in finance_features:
        plt.scatter( f1, f2)
    plt.show()

    ### cluster here; create predictions of the cluster labels
    ### for the data and store them to a list called pred
    kmeans = KMeans(n_clusters=2)
    kmeans.fit(finance_features)
    pred = kmeans.predict(finance_features)

    ### rename the "name" parameter when you change the number of features
    ### so that the figure gets saved to a different file
    try:
        Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf",
             f1_name=feature_1, f2_name=feature_2)
    except NameError:
        # Kept from the course template; ``pred`` is always bound here, so in
        # practice this only fires if ``Draw`` (or a name it uses) is missing.
        print("no predictions object named pred found, no clusters to plot")
def makeData(dataset, feature_list, folds = 1000):
    """Build pooled train/test lists from stratified shuffle splits.

    The train and test samples of every split are appended to the same four
    lists, so each list holds the concatenation over all ``folds`` splits.

    NOTE(review): because the pools span all splits, the same sample can
    appear in both the train and the test pool — confirm this is intended.

    Keyword arguments:
    dataset --- dict of dict
    feature_list --- list of strings, label first
    folds --- int, number of shuffle splits

    Returns (features_train, features_test, labels_train, labels_test).
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    features_train, features_test = [], []
    labels_train, labels_test = [], []
    for train_idx, test_idx in cv:
        features_train.extend(features[ii] for ii in train_idx)
        labels_train.extend(labels[ii] for ii in train_idx)
        features_test.extend(features[jj] for jj in test_idx)
        labels_test.extend(labels[jj] for jj in test_idx)

    return features_train, features_test, labels_train, labels_test
def test_training_stratified_split(dataset, features_list, testsize=0.2):
    """
    Split the dataset into a training and a test set with one stratified
    shuffle split (random_state=0).
    Input:
    dataset: data in dictionary format
    features_list: feature names, label first
    testsize: the proportion of the dataset to include in the test split
    Return:
    labels_train, labels_test, features_train, features_test (plain lists;
    labels are converted to int)
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    raw_labels, raw_features = targetFeatureSplit(data)

    # Arrays allow selecting rows with the index arrays the splitter yields.
    label_array = np.array([int(value) for value in raw_labels])
    feature_array = np.array(raw_features)

    ### Split data into test set and training set
    splitter = StratifiedShuffleSplit(label_array, 1, test_size=testsize, random_state=0)
    for train_index, test_index in splitter:
        labels_train = label_array[train_index].tolist()
        labels_test = label_array[test_index].tolist()
        features_train = feature_array[train_index].tolist()
        features_test = feature_array[test_index].tolist()

    return labels_train, labels_test, features_train, features_test
def main():
    """Print feature importances averaged over 1000 stratified shuffle splits.

    Loads the classifier, dataset and feature list via
    ``load_classifier_and_data``, refits the classifier on the training part
    of every split and averages ``clf.feature_importances_``. The classifier
    must therefore expose that attribute after fitting.

    The totals hold one slot per feature, excluding the leading label, so the
    two printed lists line up. The original sized them with the label
    included, which left a stray trailing 0.
    """
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    n_splits = 1000
    cv = StratifiedShuffleSplit(labels, n_splits, random_state=42)

    # One running total per feature; feature_list[0] is the label.
    totals = [0.0] * (len(feature_list) - 1)

    for train_idx, test_idx in cv:
        # Only the training part is needed; importances come from the fit.
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]

        clf = clf.fit(features_train, labels_train)
        for i, importance in enumerate(clf.feature_importances_):
            totals[i] += importance

    # Divide by the same constant that sized the splitter.
    totals = [total / n_splits for total in totals]

    # Display results
    print("Feature list:  %s" % (feature_list[1:],))
    print("Importances:  %s" % (totals,))
def univariateFeatureSelection(f_list, my_dataset):
    """Score each feature on its own with a Gaussian Naive Bayes classifier.

    For every feature, falsy and ``'NaN'`` entries in ``my_dataset`` are
    overwritten with 0 (the dataset is modified in place). The absolute
    feature values are then pooled over 1000 stratified shuffle splits, one
    GaussianNB is fitted on the pooled training samples, and the pooled test
    predictions are scored with the module-level ``score_func``.

    Args:
        f_list: feature names to evaluate one at a time.
        my_dataset: dataset dictionary; mutated as described above.

    Returns:
        List of ``(feature, score[0], score[1], score[2])`` tuples sorted by
        ``score[2]``, highest first.
    """
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.naive_bayes import GaussianNB

    result = []
    for feature in f_list:
        # Replace 'NaN' with 0
        for name in my_dataset:
            data_point = my_dataset[name]
            if not data_point[feature] or data_point[feature] == 'NaN':
                data_point[feature] = 0

        data = featureFormat(my_dataset, ['poi', feature], sort_keys = True,
                             remove_all_zeroes = False)
        labels, features = targetFeatureSplit(data)
        features = [abs(x) for x in features]

        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train, features_test = [], []
        labels_train, labels_test = [], []
        for train_idx, test_idx in cv:
            features_train.extend(features[ii] for ii in train_idx)
            labels_train.extend(labels[ii] for ii in train_idx)
            features_test.extend(features[jj] for jj in test_idx)
            labels_test.extend(labels[jj] for jj in test_idx)

        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        result.append((feature, score[0], score[1], score[2]))

    result.sort(key=lambda entry: entry[3], reverse=True)
    return result
def get_most_important_features(dataset, features_list):
    """Calculate feature importances from an unconstrained decision tree.

    Splits the formatted data 70/30 (random_state=42), fits a default
    ``DecisionTreeClassifier`` on the training part and returns its
    ``feature_importances_``.

    The tree itself is created without a ``random_state``, so results may
    vary slightly between runs.

    Args:
        dataset: dataset dictionary handed to ``featureFormat``.
        features_list: feature names, label first.

    Returns:
        The ``feature_importances_`` attribute of the fitted tree, one value
        per feature in ``features_list[1:]``.
    """
    from sklearn.tree import DecisionTreeClassifier

    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # The held-out part is not used. The split is kept so the tree is fitted
    # on the same 70% of samples as before.
    features_train, _, labels_train, _ = train_test_split(
        features, labels, test_size=0.3, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)

    # calculating feature importances
    return clf.feature_importances_
def prep_features(df, features_list, feature_scaled):
    """
    Build features and labels both as DataFrames and as featureFormat output.

    Arguments:
    df: pandas DataFrame containing at least the columns in features_list
    features_list: column names to keep; must include 'poi'
    feature_scaled: when equal to True, scale_features is applied to both
        feature representations

    Returns (features, labels, features_df_scaled, labels_df): features and
    labels come from targetFeatureSplit, the last two are the pandas
    features (scaled if requested) and the 'poi' column.
    """
    from feature_format import featureFormat, targetFeatureSplit

    scaling_requested = feature_scaled == True

    # pandas representation
    selected = df[features_list]
    features_df = selected.drop('poi', axis=1)
    labels_df = selected['poi']
    if scaling_requested:
        features_df_scaled = scale_features(features_df)
    else:
        features_df_scaled = features_df

    # dictionary representation, with 'poi' moved to the front of the list
    data_dict_new = df[features_list].T.to_dict()
    features_list_new = ['poi'] + list(features_df.columns)
    data = featureFormat(data_dict_new, features_list_new, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    if scaling_requested:
        features = scale_features(features)

    return features, labels, features_df_scaled, labels_df
def selectKBest(previous_result, data):
    """Score GaussianNB on the top-1 to top-10 features of a previous ranking.

    Two entries are first removed from ``previous_result`` (index 4, twice).
    For k = 1..10, the first k feature names of the remaining ranking are
    used to build a dataset from the module-level ``my_dataset``. Absolute
    feature values are pooled over 1000 stratified shuffle splits, one
    GaussianNB is fitted on the pooled training samples and the pooled test
    predictions are scored with the module-level ``score_func``.

    NOTE(review): ``previous_result`` is modified in place, so calling this
    twice with the same list removes four entries. The ``data`` argument is
    never read; the global ``my_dataset`` is used instead. Confirm both are
    intended.

    Args:
        previous_result: ranked sequence whose items have the feature name
            at index 0; mutated.
        data: unused.

    Returns:
        List of ``(k, score[0], score[1], score[2])`` tuples for k = 1..10.
    """
    # remove 'restricted_stock_deferred' and 'director_fees'
    for _ in range(2):
        previous_result.pop(4)

    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.naive_bayes import GaussianNB

    result = []
    for n_features in range(1, 11):
        feature_list = ['poi'] + [previous_result[n][0] for n in range(n_features)]

        formatted = featureFormat(my_dataset, feature_list, sort_keys = True,
                                  remove_all_zeroes = False)
        labels, features = targetFeatureSplit(formatted)
        features = [abs(x) for x in features]

        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train, features_test = [], []
        labels_train, labels_test = [], []
        for train_idx, test_idx in cv:
            features_train.extend(features[ii] for ii in train_idx)
            labels_train.extend(labels[ii] for ii in train_idx)
            features_test.extend(features[jj] for jj in test_idx)
            labels_test.extend(labels[jj] for jj in test_idx)

        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        result.append((n_features, score[0], score[1], score[2]))

    return result
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print accuracy, precision, recall, F1 and F2.

    The classifier is refitted on every stratified shuffle split and the
    confusion-matrix counts are accumulated over all held-out predictions.
    Results are printed with the module-level ``PERF_FORMAT_STRING`` and
    ``RESULTS_FORMAT_STRING``.

    Only an actual ``ZeroDivisionError`` in the metric arithmetic produces
    the "divide by zero" message; other errors now propagate. Predictions
    other than 0 or 1 are reported instead of being counted as true
    positives.

    Args:
        clf: estimator with ``fit`` and ``predict``.
        dataset: dataset dictionary handed to ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle splits.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                # Same handling as the scaled tester variant: stop scoring
                # this split rather than miscount the value.
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break

    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:  " + str(clf))
        return

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                    display_precision=5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                       false_positives, false_negatives,
                                       true_negatives))
    print("")
def algorithm(data_dict, features_list):
    """Create an AdaBoost classifier and dump it with the dataset and features.

    The classifier is written unfitted: this function never calls ``fit``.
    Three pickle files are produced in the working directory:
    ``my_classifier.pkl``, ``my_dataset.pkl`` and ``my_feature_list.pkl``.

    Args:
        data_dict: dataset dictionary; dumped unchanged.
        features_list: feature names, ``"poi"`` first; dumped unchanged.
    """
    from feature_format import featureFormat
    from feature_format import targetFeatureSplit
    from sklearn.ensemble import AdaBoostClassifier

    ### store to my_dataset for easy export below
    my_dataset = data_dict

    # The formatted data is not used further here. The calls are retained so
    # that whatever checks featureFormat performs still run before any file
    # is written.
    data = featureFormat(my_dataset, features_list)

    ### split into labels and features (this line assumes that the first
    ### feature in the array is the label, which is why "poi" must always
    ### be first in features_list
    labels, features = targetFeatureSplit(data)

    clf = AdaBoostClassifier(n_estimators = 1000, random_state = 202,
                             learning_rate = 1.0, algorithm = "SAMME.R")

    ### dump your classifier, dataset and features_list so
    ### anyone can run/check your results
    # ``with`` closes (and flushes) each file even if dumping fails; the
    # original left all three handles open.
    with open("my_classifier.pkl", "w") as classifier_file:
        pickle.dump(clf, classifier_file)
    with open("my_dataset.pkl", "w") as dataset_file:
        pickle.dump(data_dict, dataset_file)
    with open("my_feature_list.pkl", "w") as feature_list_file:
        pickle.dump(features_list, feature_list_file)
def __saveSelectedDataToCsv(self,features_list):
    """Write the selected features to ``selecteddata.csv`` and print a summary.

    Formats ``self.data_dict`` with ``featureFormat``, wraps the result in a
    DataFrame whose columns are ``features_list``, saves it as CSV in the
    working directory and prints ``describe()`` of the frame.
    """
    print("Save selected data to csv")
    selected = pd.DataFrame(
        featureFormat(self.data_dict, features_list, sort_keys = True),
        columns=features_list)
    selected.to_csv('selecteddata.csv')
    print(selected.describe())
    return
def test_classifier(clf, dataset, feature_list, scaling = False, folds = 1000):
    """Print mean score, recall and precision over stratified shuffle splits.

    The classifier is refitted on every split; ``clf.score``,
    ``precision_score`` and ``recall_score`` are computed on the held-out
    part and averaged over all splits.

    Each printed label now shows its own metric; the original printed the
    precision under "Recall" and the recall under "Precision".

    Args:
        clf: estimator with ``fit``, ``predict`` and ``score``.
        dataset: dataset dictionary handed to ``featureFormat``.
        feature_list: feature names, label first.
        scaling: when truthy, features are min-max scaled before splitting.
        folds: number of shuffle splits.
    """
    score_all = []
    precision_all = []
    recall_all = []

    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    if scaling:
        min_max_scaler = preprocessing.MinMaxScaler()
        features = min_max_scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    for train_indices, test_indices in cv:
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]

        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        score_all.append(clf.score(features_test, labels_test))
        precision_all.append(precision_score(labels_test, pred))
        recall_all.append(recall_score(labels_test, pred))

    precision = numpy.average(precision_all)
    recall = numpy.average(recall_all)
    score = numpy.average(score_all)

    print("Score: " + str(score))
    print("Recall: " + str(recall))
    print("Precision: " + str(precision))
def validation(clf, dataset, feature_list, test_size=0.2, n_iter=1000):
    '''
    Validate the given classifier with stratified shuffle split
    cross-validation.

    The classifier is refitted on every split and scored on the held-out
    part with precision_score and recall_score.

    Returns (mean precision, mean recall) over all n_iter splits.
    '''
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)

    splitter = StratifiedShuffleSplit(labels, n_iter, test_size=test_size,
                                      random_state = 42)
    precision_per_split = []
    recall_per_split = []
    for train_idx, test_idx in splitter:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        precision_per_split.append(precision_score(labels_test, predictions))
        recall_per_split.append(recall_score(labels_test, predictions))

    return np.mean(precision_per_split), np.mean(recall_per_split)
def find_best_parameters(pipeline, parameters, score_func, dataset, feature_list, test_size=0.2, n_iter=10):
    """
    Find the best parameters with GridSearchCV and the given scoring function.

    One stratified shuffle split first sets aside ``test_size`` of the data;
    the grid search then runs on the remaining training part only, using an
    inner stratified shuffle split with ``n_iter`` iterations.

    Returns the fitted GridSearchCV object.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)

    # Outer split: a single iteration. Only its training indices feed the
    # grid search; the held-out indices are not used in this function.
    outer_split = StratifiedShuffleSplit(labels, 1, test_size=test_size, random_state = 42)
    for train_idx, test_idx in outer_split:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]

    inner_split = StratifiedShuffleSplit(labels_train, n_iter=n_iter,
                                         test_size=test_size, random_state=42)
    searcher = GridSearchCV(pipeline, parameters, scoring=score_func,
                            cv=inner_split, n_jobs=-1)
    searcher.fit(features_train, labels_train)
    return searcher
def get_k_best_features(data_dict, features_list, k):
    """ Run scikit-learn's SelectKBest to get the k best features.

    Args:
        data_dict: data dictionary for enron
        features_list: a list of features with first feature as target label
        k: number of best features to select, or ``'all'``

    Returns:
        A pair ``(names, pairs)``. ``names`` lists the k best feature names,
        best first. ``pairs`` is a list of ``[feature, score]`` lists for
        all features, highest score first.

    The names keep their ranking order. The original passed them through a
    ``dict``, which gave an arbitrary order on Python 2.
    """
    data = featureFormat(data_dict, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)

    unsorted_pairs = list(zip(features_list[1:], k_best.scores_))
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda pair: pair[1])))

    # SelectKBest accepts k='all'; a slice does not.
    top_pairs = sorted_pairs if k == 'all' else sorted_pairs[:k]
    k_best_names = [name for name, _ in top_pairs]
    return k_best_names, [list(pair) for pair in sorted_pairs]
def get_k_best(df, features_list, k):
    """ Run scikit-learn's SelectKBest on min-max scaled features.

    Args:
        df: pandas DataFrame containing the columns in features_list
        features_list: feature names, target label first
        k: number of best features to keep, or ``'all'``

    Returns:
        dict mapping the k best feature names to their scores.

    The function-level ``from poi_dataprocess import *`` was removed: every
    name used here is imported explicitly below, and a star import inside a
    function is rejected by Python 3.
    NOTE(review): if importing ``poi_dataprocess`` was relied on for side
    effects, import it at module level instead.
    """
    from feature_format import featureFormat, targetFeatureSplit
    from sklearn import preprocessing
    from sklearn.feature_selection import SelectKBest

    data_dict_new = df[features_list].T.to_dict()
    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)

    unsorted_pairs = list(zip(features_list[1:], k_best.scores_))
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda pair: pair[1])))

    # k='all' is valid for SelectKBest but cannot be used as a slice bound.
    top_pairs = sorted_pairs if k == 'all' else sorted_pairs[:k]
    return dict(top_pairs)
def select_k_best(data_dict, features_list, k):
    """Return the k best features and their SelectKBest scores as a dict.

    Features are min-max scaled before scoring. With ``k == "all"`` every
    feature is returned; otherwise only the k highest-scoring ones.

    Args:
        data_dict: dataset dictionary handed to ``featureFormat``.
        features_list: feature names, label first.
        k: number of features to keep, or ``"all"``.
    """
    labels, features = targetFeatureSplit(featureFormat(data_dict, features_list))

    # Scale every feature before scoring.
    features = preprocessing.MinMaxScaler().fit_transform(features)

    selector = SelectKBest(k=k)
    selector.fit(features, labels)

    # Pair names with scores; ascending sort then reversal puts the highest
    # score first.
    ranked = sorted(zip(features_list[1:], selector.scores_),
                    key=lambda pair: pair[1])[::-1]

    chosen = ranked if k == "all" else ranked[:k]
    return dict(chosen)
def main():
    """Load the pickled classifier and data, then run ``test_classifier``.

    NOTE(review): ``test_classifier`` is called with
    ``(clf, features, labels)``. Confirm that this matches the signature of
    the ``test_classifier`` in scope for this script.
    """
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    formatted = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(formatted)

    ### Run testing script
    test_classifier(clf, features, labels)
def regressionBonusAndLongTermInc():
    """Regress bonus on long_term_incentive and print the test-set score.

    Reads the module-level ``dictionary``, drops rows via
    ``remove_any_zeroes=True``, splits the data 50/50 (random_state=42),
    fits a ``LinearRegression`` on the training half and prints
    ``reg.score`` for the test half.
    """
    from sklearn.cross_validation import train_test_split
    from sklearn import linear_model

    ### first item in the list is the "target" feature
    features_list = ["bonus", "long_term_incentive"]
    data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
    target, features = targetFeatureSplit( data )

    ### training-testing split needed in regression, just like classification
    split = train_test_split(features, target, test_size=0.5, random_state=42)
    feature_train, feature_test, target_train, target_test = split

    reg = linear_model.LinearRegression()
    reg.fit(feature_train, target_train)

    # find the score on the test data
    print(reg.score(feature_test, target_test))
def ptest(clf, dataset, feature_list, folds = 1000):
    """Return the precision of ``clf`` pooled over stratified shuffle splits.

    The classifier is refitted on every split. True and false positives are
    accumulated over all held-out predictions, and the result is
    ``TP / (TP + FP)``.

    Each split is predicted once; the original called ``clf.predict`` twice
    and threw the first result away.

    Args:
        clf: estimator with ``fit`` and ``predict``.
        dataset: dataset dictionary handed to ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle splits.

    Raises:
        ZeroDivisionError: if no positive prediction was counted.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth in (0, 1):
                # True or false negative: not part of precision.
                continue
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                # As in the original, everything else counts as a true positive.
                true_positives += 1

    precision = 1.0*true_positives/(true_positives+false_positives)
    return precision
def get_k_best(dictionary, features_list, k):
    """ Run scikit-learn's SelectKBest and report the k best features.

    Scores are combined with the per-feature counts from ``get_nan_counts``
    (merged on the ``feature`` column), rows with infinite scores are
    dropped, and the top k rows are printed and returned.

    Args:
        dictionary: dataset dictionary handed to ``featureFormat``.
        features_list: feature names, label first.
        k: number of features to keep.

    Returns:
        pandas DataFrame holding the k best rows, highest score first.
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    pairs = list(zip(features_list[1:], k_best.scores_))

    # combine scores and features into a pandas dataframe, then sort
    k_best_features = pd.DataFrame(pairs, columns = ['feature', 'score'])
    # DataFrame.sort was removed from pandas; prefer sort_values and fall
    # back to the old method only on installations that lack it.
    if hasattr(k_best_features, 'sort_values'):
        k_best_features = k_best_features.sort_values('score', ascending = False)
    else:
        k_best_features = k_best_features.sort('score', ascending = False)

    # merge with null counts
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features, df_nan_counts, on = 'feature')

    # eliminate infinite values
    k_best_features = k_best_features[~np.isinf(k_best_features.score)]

    print('Feature Selection by k_best_features\n')
    print("{0} best features in descending order: {1}\n".format(
        k, k_best_features.feature.values[:k]))
    print('{0}\n'.format(k_best_features[:k]))

    return k_best_features[:k]
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` on min-max scaled data and print the metrics.

    All columns of the formatted data (label included) are min-max scaled.
    The classifier is then refitted on every stratified shuffle split and
    the confusion-matrix counts are accumulated over the held-out
    predictions. Results are printed with the module-level
    ``PERF_FORMAT_STRING`` and ``RESULTS_FORMAT_STRING``.

    Constant columns are scaled to 0 instead of NaN (0/0). Only a real
    ``ZeroDivisionError`` in the metric arithmetic triggers the "divide by
    zero" message; other errors propagate.

    Args:
        clf: estimator with ``fit`` and ``predict``.
        dataset: dataset dictionary handed to ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle splits.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)

    ## Tester lacks feature scaling, lets put it here:
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    spans = maxs - mins
    # A constant column has span 0; divide by 1 so it becomes 0, not NaN.
    spans = np.where(spans == 0, 1, spans)
    data = (data - mins) / spans

    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops scoring the current split only.
                break

    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: " + str(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                    display_precision=5))
    print(RESULTS_FORMAT_STRING.format(
        total_predictions, true_positives, false_positives,
        false_negatives, true_negatives
    ))
    print("")
def tune_classifier(clf_name, clf, dataset, features_list, scores, folds = 1000):
    """Grid-search hyper-parameters for a named classifier.

    A parameter grid exists for ``'kNN'`` and ``'Decision Tree'`` only.
    Features are min-max scaled for the names 'kNN', 'SVM' and
    'kNN (hand-tuned)'. One ``GridSearchCV`` is run per entry of ``scores``
    with scoring ``"<score>_weighted"``, and the best parameters of each
    search are printed.

    Args:
        clf_name: name selecting the parameter grid and the scaling rule.
        clf: estimator to tune.
        dataset: dataset dictionary handed to ``featureFormat``.
        features_list: feature names, label first.
        scores: iterable of metric names, e.g. ``['precision', 'recall']``.
        folds: number of stratified shuffle splits used as CV.

    Returns:
        Best parameters of the search for the last entry of ``scores``
        (``{}`` if ``scores`` is empty).

    Raises:
        ValueError: if no parameter grid is defined for ``clf_name``. The
            original failed with an unbound-variable error in that case.
    """
    parameter_grids = {
        'kNN': [{'p': [1, 2, 3],
                 'n_neighbors': [1, 5, 7, 10, 15],
                 'leaf_size': [30, 50, 70, 100]}],
        'Decision Tree': [{'min_samples_split': [2, 3, 4, 5],
                           'min_samples_leaf': [2, 3, 4, 5],
                           'splitter': ['random', 'best']}],
    }
    if clf_name not in parameter_grids:
        raise ValueError(
            "No parameter grid defined for classifier %r" % (clf_name,))
    parameter_grid = parameter_grids[clf_name]

    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    if clf_name in {'kNN', 'SVM', 'kNN (hand-tuned)'}:
        # Perform feature scaling
        from sklearn.preprocessing import MinMaxScaler
        features = MinMaxScaler().fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    best_params = {}
    for score in scores:
        grid_clf = GridSearchCV(clf, parameter_grid, cv=cv,
                                scoring="{0}_weighted".format(score))
        grid_clf.fit(features, labels)
        best_params = grid_clf.best_params_
        # NOTE(review): the flattened source does not show whether this
        # print sat inside the loop; it is kept per score here.
        print("Classifier {0} has tuned parameters {1}".format(clf_name, best_params))

    return best_params
def select_k_best_features(data, feature_list, k):
    """Pick the k highest-scoring features with sklearn's ``SelectKBest``.

    Args:
        data: Data in dictionary format, as accepted by ``featureFormat``.
        feature_list: The full list of features to select from. Its first
            entry is skipped when pairing names with scores, i.e. it is
            treated as the label column.
        k: The number of features to keep.

    Returns:
        A list of length k+1: 'poi' followed by the k best feature names,
        ordered from highest to lowest score.
    """
    # Use the dictionary that was passed in rather than a module-level
    # global, so the function works on whatever dataset the caller supplies.
    formatted = featureFormat(data, feature_list)
    labels, features = targetFeatureSplit(formatted)

    selector = SelectKBest(k=k)
    selector.fit(features, labels)

    # Scores are per feature column, so they line up with feature_list[1:].
    ranked = sorted(zip(feature_list[1:], selector.scores_),
                    key=lambda pair: pair[1], reverse=True)
    k_best_features = [name for name, _ in ranked][:k]

    print("{0} best features:".format(k))
    print(k_best_features)
    return ['poi'] + k_best_features
# Supplied in the zip file from Udacity was a list of Persons of Interests, containing 35 individuals, sourced from # USA Today article (http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm). # We identify a discrepency between the what was provided in the source file and the final_project_dataset.pkl file. # Is the dataset a better indicator of POI? with open("poi_names.txt") as f: poi_list_usat = len(f.readlines()[2:]) print 'Number of POIs from USA Today:', (poi_list_usat) ### Task 2: Remove outliers # We can visualize some of the features we think may be indicators of fraud to get a # good idea of what the data looks like, potentially identifying some outliers. # Using the enron61702insiderpay.pdf, we see high dollar values for feature "bonus" and # "total_stock_value". features_outlier_viz = ['bonus', 'total_stock_value'] features = featureFormat(data_dict, features_outlier_viz, sort_keys=True) for i in features: bonus = i[0] total_stock_value = i[1] plt.scatter(bonus, total_stock_value) plt.xlabel("Bonus") plt.ylabel("Total Stock Value") plt.show() # In the Outlier Mini-Project we identified "TOTAL" as an important outlier for removal. # We will include the removal of this "individual" as well as "The Travel Agency in the # Park" because they are not really individuals working at Enron. We will also remove # individuals with no data (NaN) for all features, which seemed out of place. These # outliers are fairly easy to identify and remove.
'from_this_person_to_poi') new_feature_2_inputs_add('total_poi_emails', 'to_and_from_poi_emails', 'shared_receipt_with_poi') new_feature_2_inputs_divide('percent_of_poi_to_emails', 'from_this_person_to_poi', 'to_messages') new_feature_2_inputs_divide('percent_of_poi_from_emails', 'from_poi_to_this_person', 'from_messages') new_feature_4_inputs_divide('percent_poi_emails', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages', 'from_messages') ### Store to my_dataset for easy export below. my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) #Draw a plot comparing two features: f1_name and f2_name, along with their prediction line: pred. def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"): #plot each cluster with a different color--add more colors for #drawing more than five clusters colors = ["b", "c", "k", "m", "g"]
] # You will need to use more features ### Load the dictionary containing the dataset with open("final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) ### Task 2: Remove outliers data_dict.pop('TOTAL', 0) data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0) ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html # Provided to give you a starting point. Try a variety of classifiers. from sklearn.naive_bayes import GaussianNB clf = GaussianNB() ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and report metrics plus mean feature importances.

    The classifier is refit on every stratified shuffle split; predictions on
    each test split are tallied into one overall confusion matrix, and
    ``clf.feature_importances_`` is collected after every fit.

    Args:
        clf: Classifier exposing ``fit``, ``predict`` and
            ``feature_importances_``.
        dataset: Data dictionary passed to ``featureFormat``.
        feature_list: Feature names passed to ``featureFormat``.
        folds: Number of shuffle-split iterations.

    Returns:
        ``[[true_negatives, false_positives], [false_negatives,
        true_positives]]`` on success, or ``None`` when a metric is
        undefined because its denominator is zero.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    all_importance = []  # feature importances from each fold

    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        all_importance.append(clf.feature_importances_)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Abandon the rest of this fold only; later folds still run.
                break

    # Only the divisions are guarded, so unrelated errors are no longer
    # misreported as a divide by zero.
    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: {0}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
        return None

    avg_importance = np.mean(all_importance, axis=0)
    cm = [[true_negatives, false_positives], [false_negatives, true_positives]]
    print(clf)
    print("Feature importances {0}".format(avg_importance))
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives,
                                       false_negatives, true_negatives))
    print("")
    return cm
""" import sys from sklearn import datasets, linear_model import pickle sys.path.append("tools/") from feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load(open("tools/final_project_dataset_modified.pkl", "r")) ### list the features you want to look at--first item in the ### list will be the "target" feature #features_list = ["bonus", "long_term_incentive"] features_list = ["bonus", "salary"] data = featureFormat(dictionary, features_list, remove_any_zeroes=True, sort_keys="tools/python2_lesson06_keys.pkl") target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" reg = linear_model.LinearRegression() # Train the model using the training sets reg.fit(feature_train, target_train) prediction = reg.predict(feature_train)
print '' for user in data_dict: data_dict[user]['ratio_to_poi'] = str( float(data_dict[user]['from_this_person_to_poi']) / float(data_dict[user]['from_messages'])) ### Store to my_dataset for easy export below. my_dataset = data_dict print 'Extract features and labels from dataset' print '' ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True, remove_NaN=False) labels, features = targetFeatureSplit(data) #%% ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier ### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info:
#============================================================================== # # ### Task3 : Plotting of features #============================================================================== #============================================================================== from pandas.plotting import scatter_matrix from sklearn.model_selection import cross_val_score, ShuffleSplit from sklearn.ensemble import RandomForestRegressor from sklearn import preprocessing ##Scaling the features scaler = preprocessing.MinMaxScaler() data1,data2,list_base,list_base_2,df,df_2,FinalDf,FinalDf_2,df2_mean,X,Y,Y_dict,Y_list = {},{},{},{},{},{},{},{},{},{},{},{},{} for i in Retailers: data1[i] = featureFormat(my_dataset[i], valid_[i]) data2[i] = featureFormat_nan(my_dataset[i], valid_[i]) data1[i] = scaler.fit_transform(data1[i]) #list_base = {} #for i in Retailers: list_ = [] for k in range(len(valid_[i])): j = [] for point in data1[i]: j.append(point[k]) list_.append(j) list_base.update({i: list_}) df[i] = pd.DataFrame(list_base[i], index=valid_[i]) FinalDf[i] = df[i].transpose()
Draws a little scatterplot of the training/testing data You fill in the regression code where indicated: """ import sys import pickle sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r")) ### list the features you want to look at--first item in the ### list will be the "target" feature features_list = ["bonus", "salary"] data = featureFormat(dictionary, features_list, remove_any_zeroes=True) target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. from sklearn import linear_model
#!/usr/bin/python


"""
    Starter code for the validation mini-project.
    The first step toward building your POI identifier!

    Start by loading/formatting the data

    After that, it's not our code anymore--it's yours!
"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Load through a context manager so the file handle is closed as soon as the
# dictionary has been read.
# NOTE(review): pickle.load executes whatever the file encodes; only use it on
# trusted dataset files.
with open("../final_project/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list, sort_keys='../tools/python2_lesson13_keys.pkl')
labels, features = targetFeatureSplit(data)


### it's all yours from here forward!
word_features = vectorizer.get_feature_names() sys.stdout.write("Done\n") sys.stdout.write("Preprocessing data... ") sys.stdout.flush() # Here I'm not adding all the word features into data_dict or my_dataset, # that will waste too much time and they'll be extemely large. Instead, # I preprocess the data_dict with featureFormat, then concatenate with # the matrix generated by TfidfVectorizer. # After that, do feature scaling and selection, then transform the # final numpy array into original dict format data = featureFormat(data_dict, all_features, remove_all_zeroes=False, sort_keys=True) # Concatenate two arrays vertically with np.hstack data = np.hstack((data, tf.toarray())) labels, features = targetFeatureSplit(data) # Feature scaling # Note that this applies to the final dataset used by tester.py, # see L110~L115 and L178~L185 features = MinMaxScaler().fit_transform(features) # Add an underscore before every word feature name to avoid ambiguity with # original features for feature in word_features:
# else: # to avoid this person from being removed from the master list # my_dataset[name][fname] = 0.0 #features_list.append(fname) ### add a feature for Total_stock/total_payments to see who had most to gain from the stock fname = 'total_stock_to_payments' for name in keys: if (my_dataset[name]['total_stock_value'] != 'NaN' and my_dataset[name]['total_payments'] != 'NaN'): my_dataset[name][fname] = float(my_dataset[name]['total_stock_value'])/my_dataset[name]['total_payments'] else: # to avoid this person from being removed from the master list my_dataset[name][fname] = 0.0 features_list.append(fname) nfeat = len(features_list) print "number of features:", nfeat ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, remove_all_zeroes=False, sort_keys = True) labels, features = targetFeatureSplit(data) # scale features to normalize them #features = preprocessing.scale(np.array(features)) features = np.array(features) labels = np.array(labels) nsample = len(labels) print "number of keys:", len(keys) print " number of samples:", nsample print "emp name ", for feature in features_list: print '{:>10}'.format(feature), print '' i = 0 select_feature = 1
print ('new score = {0}'.format(reg2.score(ages_test, net_worths_test))) # ## Enron Outliers # In[20]: from feature_format import featureFormat, targetFeatureSplit ### read in data dictionary, convert to numpy array data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "rb") ) features = ["salary", "bonus"] data = featureFormat(data_dict, features) plt.scatter(data[:,0], data[:,1]) plt.xlabel("salary") plt.ylabel("bonus") import pandas as pd df = pd.DataFrame(data_dict) df.loc['salary',:] = pd.to_numeric(df.loc['salary',:], errors='coerce') df.loc['bonus',:] = pd.to_numeric(df.loc['bonus',:], errors='coerce') x = df.loc['salary',:].astype('float64') print(x.idxmax(axis=1)) # ## Any More Outliers?
'shared_receipt_with_poi' ] enron_pd = pd.DataFrame.from_dict(data_dict, orient='index') enron_pd[all_features] = enron_pd[all_features].astype(float) print enron_pd.describe() #how many POIs poi_count = 0 for p in range(len(enron_pd)): if enron_pd['poi'][p] == True: poi_count += 1 print "There are", poi_count, "POI (persons of interest) and", 146 - poi_count, "non-POI" # REMOVE "TOTAL", "THE TRAVEL AGENCY IN THE PARK" rows # code from Lesson: Enron Outliers to plot data_out = featureFormat(data_dict, features_list) for point in data_out: salary = point[1] bonus = point[2] #matplotlib.pyplot.scatter( salary, bonus ) plt.scatter(salary, bonus) plt.xlabel("salary") plt.ylabel("bonus") plt.show() # 2.2 Function to remove outliers def remove_outlier(dict_object, keys): ### removes list of outliers keys from dict object for key in keys:
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` with stratified shuffle splits and report metrics.

    The classifier is refit on every split and the predictions on each test
    split are tallied into one overall confusion count. This variant returns
    the headline metrics in addition to printing the report.

    Args:
        clf: Classifier exposing ``fit`` and ``predict``.
        dataset: Data dictionary passed to ``featureFormat``.
        feature_list: Feature names passed to ``featureFormat``.
        folds: Number of shuffle-split iterations.

    Returns:
        ``(accuracy, precision, recall)`` on success, or ``None`` when a
        metric is undefined because its denominator is zero.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Abandon the rest of this fold only; later folds still run.
                break

    # Only the divisions are guarded, so unrelated errors are no longer
    # misreported as a divide by zero.
    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return None

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives,
                                       false_negatives, true_negatives))
    print("")
    return accuracy, precision, recall
] # In[183]: from sklearn.preprocessing import scale from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier from sklearn.cluster import KMeans from sklearn.ensemble import AdaBoostClassifier import tester # In[184]: from sklearn import preprocessing data = featureFormat(my_dataset, feature_list, sort_keys=True) labels, features = targetFeatureSplit(data) scaler = preprocessing.MinMaxScaler() features = scaler.fit_transform(features) # In[185]: # dt_clf number of features n_features = np.arange(1, len(feature_list)) dt_pipe = Pipeline([('select_features', SelectKBest()), ('classify', DecisionTreeClassifier())]) param_grid = [{'select_features__k': n_features}] dt_clf = GridSearchCV(dt_pipe, param_grid=param_grid, scoring='f1', cv=10)
from time import time import pickle import sys sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r")) ### first element is our labels, any added elements are predictor ### features. Keep this the same for the mini-project, but you'll ### have a different feature list when you do the final project. features_list = ["poi", "salary"] data = featureFormat(data_dict, features_list) #original #data = featureFormat(data_dict, features_list, sort_keys = '../tools/python2_lesson13_keys.pkl') # labels, features = targetFeatureSplit(data) print "type(labels)=", type(labels), "len(labels)=", len(labels) print "type(features)=", type(features), "len(features)=", len(features) ### it's all yours from here forward! from sklearn import tree start_time = time() clf = tree.DecisionTreeClassifier() print("--- time to initialise tree.DecisionTreeClassifier %s seconds ---" % (time() - start_time))
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` with stratified shuffle splits and print metrics.

    The classifier is refit on every split and the predictions on each test
    split are tallied into one overall confusion count, from which accuracy,
    precision, recall, F1 and F2 are printed.

    Args:
        clf: Classifier exposing ``fit`` and ``predict``.
        dataset: Data dictionary passed to ``featureFormat``.
        feature_list: Feature names passed to ``featureFormat``.
        folds: Number of shuffle-split iterations.

    Returns:
        None. Results are only printed.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # The splitter is built without the labels and is not itself iterable;
    # the train/test indices come from ``cv.split(features, labels)`` below.
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Abandon the rest of this fold only; later folds still run.
                break

    # Only the divisions are guarded, so unrelated errors are no longer
    # misreported as a divide by zero.
    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: {0}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives,
                                       false_negatives, true_negatives))
    print("")
sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit from tester import dump_classifier_and_data import matplotlib.pyplot ### Task 1: Select what features you'll use. ### features_list is a list of strings, each of which is a feature name. ### The first feature must be "poi". outlier_check = ['salary', 'bonus'] ### Load the dictionary containing the dataset with open("final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) my_data = data_dict data = featureFormat(my_data, outlier_check, sort_keys=True) ### Task 2: Remove outliers for point in data: salary = point[0] bonus = point[1] matplotlib.pyplot.scatter(salary, bonus) matplotlib.pyplot.xlabel("salary") matplotlib.pyplot.ylabel("bonus") matplotlib.pyplot.show() my_data.pop('TOTAL', 0) my_data.pop('THE TRAVEL AGENCY IN THE PARK', 0)
# In[10]: #For convinience of data cleaning (Removing NaN) using pandas (Ref: Data visulization) import pandas as pd import numpy as np import matplotlib.pyplot as plt # In[11]: unwanted_features = ["poi", "email_address"] features_temp = [ele for ele in features_temp if ele not in unwanted_features] # to make the first element in the list "poi" features_temp = ["poi"] + features_temp feature_data = featureFormat(data_dict, features_temp, remove_NaN=False) # In[12]: temp_df = pd.DataFrame(data=feature_data, columns=features_temp, index=name_data_point) print "With NaN" print temp_df.info() # In[13]: poi = temp_df['poi'] == 1 temp_df[poi].count() # In[14]:
def main(): ### Task 1: Select what features you'll use. ### features_list is a list of strings, each of which is a feature name. ### The first feature must be "poi". financial_features = ['salary', 'deferral_payments', 'total_payments', \ 'loan_advances', 'bonus', 'restricted_stock_deferred',\ 'deferred_income', 'total_stock_value', 'expenses', \ 'exercised_stock_options', 'other', 'long_term_incentive', \ 'restricted_stock', 'director_fees'] #(all units are in US dollars) email_features = ['to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi'] #(units are generally number of emails messages; notable exception is ‘email_address’, # which is a text string) #email_address feature was removed from list poi_label = ['poi'] ###(boolean, represented as integer) features_list = poi_label + email_features + financial_features ### Load the dictionary containing the dataset with open("final_project_dataset_unix.pkl", "rb") as data_file: data_dict = pickle.load(data_file) #convert to a pandas dataframe for exploratory analysis df = pd.DataFrame.from_dict(data_dict, orient='index') #iterate df and convert string 'NaN' to actual np.nan for label, content in df.items(): if label == 'email_address': for i in content: if i == 'NaN': df[label][i] = np.nan else: df[label] = pd.to_numeric(df[label], errors='coerce') ### Investigate contents of dataset: # Total Number of data points total_people = df.shape[0] print('The total number of data points (people) in our data set is {}.\n'\ .format(total_people)) # Total Number of Features Used all_features = df.shape[1] print('There are {} features for each person in our dataset.\n'\ .format(all_features)) # Total Number of Persons Of Interest (POIs) poi_count = df['poi'][(df['poi'] == True)].count() print('Our dataset has {} persons of interest.\n'.format(poi_count)) # Total Number of Non-POIs non_poi_count = total_people - poi_count print('Our dataset has {} Non persons of 
interest.\n'.format(non_poi_count)) # Features with missing values? print('The following categories have missing values (NaN values)\n') print (df.isna().sum()) ### Task 2: Remove outliers #visualize_features('salary', 'bonus', data_dict) #visualize_features('from_poi_to_this_person', 'from_this_person_to_poi', data_dict) #visualize_features('loan_advances', 'total_stock_value', data_dict) print() print('Searching for Outliers...') find_outlier('salary', df) print () find_outlier('bonus', df) print() find_outlier('from_poi_to_this_person', df) print () find_outlier('from_this_person_to_poi', df) print () find_outlier('loan_advances', df) print () find_outlier('total_stock_value', df) #get a count of number of NaN columns for each person nan_count = df.isna().sum(axis=1) print('\nThe top 5 people by number of NaN columns are:\n') print (nan_count.sort_values(ascending=False).head(5)) print('\nLooking closer at Eugene Lockhart...\n') print( df.loc['LOCKHART EUGENE E']) print ('\nLooking closer at THE TRAVEL AGENCY IN THE PARK...\n') print (df.loc['THE TRAVEL AGENCY IN THE PARK']) ### Remove outliers df = df.drop(['TOTAL'], axis=0) df = df.drop(["LOCKHART EUGENE E"], axis=0) df = df.drop(["THE TRAVEL AGENCY IN THE PARK"], axis=0) #replace NaN with 0 df = df.fillna(0) ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. 
my_dataset = df.to_dict('index')

# Engineer two per-person ratios: the share of the mail a person sent that
# went to POIs, and the share of the mail they received that came from POIs.
for person in my_dataset:
    to_poi_count = my_dataset[person]['from_this_person_to_poi']
    from_poi_count = my_dataset[person]['from_poi_to_this_person']
    # 'from_messages' counts mail sent *from* this person and 'to_messages'
    # counts mail sent *to* them (same naming scheme as the two POI counters
    # above), so each ratio is normalised by the matching total.
    total_sent_emails = my_dataset[person]['from_messages']
    total_received_emails = my_dataset[person]['to_messages']

    # Someone with no mail in that direction (or a non-numeric count) gets a
    # ratio of 0 rather than aborting the whole run.
    try:
        my_dataset[person]['to_poi_ratio'] = (
            float(to_poi_count) / float(total_sent_emails))
    except (ZeroDivisionError, TypeError, ValueError):
        my_dataset[person]['to_poi_ratio'] = 0

    try:
        my_dataset[person]['from_poi_ratio'] = (
            float(from_poi_count) / float(total_received_emails))
    except (ZeroDivisionError, TypeError, ValueError):
        my_dataset[person]['from_poi_ratio'] = 0

features_list = features_list + ['to_poi_ratio', 'from_poi_ratio']

### Preprocessing
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Scaling features (normalizing all features)
min_max_scaler = MinMaxScaler()
features = min_max_scaler.fit_transform(features)

### Select the best features:
# Removes all but the k highest scoring features
n = 6  # adjust for optimization
skb = SelectKBest(f_classif, k=n)
# Only the fitted scores are needed here, not the reduced feature matrix.
skb.fit(features, labels)

# Skip the poi label and pair every remaining feature name with its score.
scores = zip(features_list[1:], skb.scores_)
# Highest scoring feature first.
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# The poi label followed by the n highest scoring feature names.
new_features_list = poi_label + [name for name, _ in sorted_scores[:n]]

### Extract features and labels from dataset using optimized features_list
data = featureFormat(my_dataset, new_features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
for classifier_name, classifier in (
        ('GaussianNB', GaussianNB()),
        ('SVM', SVC()),
        ('AdaBoost', AdaBoostClassifier()),
        ('DecisionTree', DecisionTreeClassifier())):
    print('\nRunning {} classifier...'.format(classifier_name))
    run_classifier(classifier, features, labels)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

### Re-Extract features and labels from dataset for local testing
# The full feature list is used again here: each pipeline below performs its
# own scaling and SelectKBest step, with k chosen by the parameter search.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Candidate values of k shared by every SelectKBest step below.
k_candidates = [2, 3, 4, 5, 6, 7, 8, 9, 10]

# Adjust SVM parameters to refine accuracy
# variables will be passed to fine_tune_algorithm to use in a Pipeline
print('\nThe best fit SVM has the following scores:\n')
svm_steps = [('scaler', MinMaxScaler()),
             ('SKB', SelectKBest()),
             ('SVM', SVC())]
svm_parameters = {'SVM__kernel': ('linear', 'rbf'),
                  'SVM__C': [0.001, 0.01, .1, 1, 10, 100, 1000],
                  'SVM__gamma': [0.01, .1, 1, 10, 100, 1000],
                  'SKB__k': k_candidates}
svm_clf = fine_tune_algorithm(svm_steps, svm_parameters, features, labels)

# Adjust DecisionTreeClassifier parameters to refine accuracy
print('\nThe best fit DecisionTreeClassifer has the following scores:\n')
dt_steps = [('scaler', MinMaxScaler()),
            ('SKB', SelectKBest()),
            ('DT', DecisionTreeClassifier())]
dt_parameters = {'DT__criterion': ('gini', 'entropy'),
                 'DT__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'DT__random_state': [13],
                 'SKB__k': k_candidates}
dt_clf = fine_tune_algorithm(dt_steps, dt_parameters, features, labels)

# Adjust AdaBoostClassifier parameters to refine accuracy
# variables will be passed to fine_tune_algorithm to use in a Pipeline
print('\nThe best fit AdaBoostClassifier has the following scores:\n')
ab_steps = [('scaler', MinMaxScaler()),
            ('SKB', SelectKBest()),
            ('AB', AdaBoostClassifier())]
ab_parameters = {'AB__algorithm': ('SAMME', 'SAMME.R'),
                 'AB__learning_rate': [.5, .6, .7, .8, .9, 1],
                 'SKB__k': k_candidates}
ada_clf = fine_tune_algorithm(ab_steps, ab_parameters, features, labels)

# Adjust GaussianNB parameters to refine accuracy
print('\nThe best fit GaussianNB Classifier has the following scores:\n')
nb_steps = [('scaler', MinMaxScaler()),
            ('SKB', SelectKBest()),
            ('NB', GaussianNB())]
nb_parameters = {'SKB__k': k_candidates}
nb_clf = fine_tune_algorithm(nb_steps, nb_parameters, features, labels)

# final best fitting classifier
clf = nb_clf

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
### "NaN" values for this feature, then this feature will not be considered.
### So email_address, loan advances, deferral_payments, director_fees
features_list = [
    'poi',
    'to_messages',
    'expenses',
    'deferred_income',
    'long_term_incentive',
    'fraction_from_poi',
    'shared_receipt_with_poi',
    'from_messages',
    'bonus',
    'total_stock_value',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'restricted_stock',
    'salary',
    'total_payments',
    'fraction_to_poi',
    'exercised_stock_options',
]

###############################################################################
### Task 2: Remove outliers

# Scatter salary against bonus, one point per row, and track the largest
# value seen on each axis so extreme points can be identified.
features = ["salary", "bonus", "poi"]
data = featureFormat(data_dict, features)

max_salary = 0
max_bonus = 0
for point in data:
    salary, bonus, poi = point[0], point[1], point[2]

    # Rows flagged as poi are drawn in red; the rest use the default colour.
    if not poi:
        plt.scatter(salary, bonus)
    else:
        plt.scatter(salary, bonus, c="r")

    # max() keeps the current value unless the new one is strictly larger.
    max_salary = max(max_salary, salary)
    max_bonus = max(max_bonus, bonus)

plt.xlabel("salary")
plt.show()

### load in the dict of dicts containing all the data on each person in the dataset
# Use a context manager so the file handle is closed as soon as the pickle
# has been read (the bare open() call left it to the garbage collector).
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

### there's an outlier--remove it!
# The default argument makes this a no-op when "TOTAL" is already absent.
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
# NOTE: `poi` is rebound here from the feature name to the first column
# returned by targetFeatureSplit.
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans

data2 = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data2)
""" import sys import pickle sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r")) ### list the features you want to look at--first item in the ### list will be the "target" feature features_list = ["bonus", "salary"] # salary data = featureFormat(dictionary, features_list, remove_any_zeroes=True ) #, "long_term_incentive"], remove_any_zeroes=True ) target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### your regression goes here! ### please name it reg, so that the plotting code below picks it up and ### plots it correctly ##
#==============================================================================
#==============================================================================
#
# ### Task3 : Plotting of features
#==============================================================================
#==============================================================================
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

## Scaling the features
scaler = preprocessing.MinMaxScaler()

data1 = featureFormat(my_dataset, valid_)
data2 = featureFormat_nan(my_dataset, valid_)
# Only data1 is rescaled; data2 is left as returned by featureFormat_nan.
data1 = scaler.fit_transform(data1)

# Transpose the scaled rows: list_base[k] holds the k-th column of data1,
# i.e. every row's value for the feature named valid_[k].
list_base = [[point[k] for point in data1] for k in range(len(valid_))]

# One DataFrame row per feature, labelled with the feature names.
df = pd.DataFrame(list_base, index=valid_)
# Build a DataFrame with one row per person, restricted to features_list.
# The dict views are materialised with list() so the construction does not
# depend on pandas accepting dict_values / dict_keys objects directly.
new_data = pd.DataFrame(list(my_dataset.values()))[features_list]
new_data.index = list(my_dataset.keys())

# Engineered feature: exercised stock options plus restricted stock.
new_data['new_total_stock'] = (
    new_data['exercised_stock_options'] + new_data['restricted_stock'])

# Convert the frame back into the {person: {feature: value}} layout that
# featureFormat consumes. Each row is looked up once and zipped with the
# column names, instead of re-reading the row for every column.
key = list(new_data.index)
key_v = list(new_data.columns.values)
new_dataset = {}
for person in key:
    value_v = list(new_data.loc[person])
    new_dataset[person] = dict(zip(key_v, value_v))

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

features_list_new = [
    'poi',
    'bonus',
    'exercised_stock_options',
    'expenses',
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'other',
    'restricted_stock',
    'salary',
    'shared_receipt_with_poi',
    'to_messages',
    'new_total_stock',
]
data_new = featureFormat(new_dataset, features_list_new, sort_keys=True)
labels_new, features_new = targetFeatureSplit(data_new)

from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
# fit_transform refits the same scaler, so afterwards `scaler` holds the
# ranges of features_new, not those of features.
features_new = scaler.fit_transform(features_new)

from numpy import mean
from sklearn import cross_validation
# Report the dataset size, count the POIs, zero out 'NaN' placeholders, then
# plot salary against bonus for non-POIs and drop the "TOTAL" entry.
print "The Number of Users: ", len( data_dict.keys()) # There are 146 users in dataset.

# The number of poi
count_poi = 0
for POIs in data_dict:
    if data_dict[POIs]['poi'] == True:
        count_poi += 1
    # Replace 'NaN' values to 0s
    # NOTE(review): nesting reconstructed from whitespace-collapsed text; this
    # loop is assumed to run for every person, not only for POIs -- confirm.
    for NA_keys in data_dict[POIs]:
        if data_dict[POIs][NA_keys] == 'NaN':
            data_dict[POIs][NA_keys] = 0
print "The Number of POIs: ", count_poi # There are 18 POI in dataset.

### Task 2: Remove outliers
outlier_tester = ["salary", "bonus", "poi"]
outlier_data = featureFormat(data_dict, outlier_tester)
from operator import itemgetter

# Only rows whose third column (poi) compares equal to False are plotted.
for point in outlier_data:
    if point[2] == False:
        salary = point[0]
        bonus = point[1]
        matplotlib.pyplot.scatter(salary, bonus)
matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

# Remove the outlier(s)
# No default is given to pop(), so this raises KeyError if "TOTAL" is absent.
data_dict.pop("TOTAL")
#print len(data_dict.keys())
### load in the dict of dicts containing all the data on each person in the dataset data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r")) ### there's an outlier--remove it! data_dict.pop("TOTAL", 0) ### the input features we want to use ### can be any key in the person-level dictionary (salary, director_fees, etc.) feature_1 = "salary" feature_2 = "exercised_stock_options" #feature_3 = "total_payments" #poi = "poi" features_list = [feature_1, feature_2] data = featureFormat(data_dict, features_list) poi, finance_features = targetFeatureSplit(data) filterdata = {k: v for k, v in data_dict.iteritems() if v['salary'] != 'NaN'} from operator import attrgetter min_num = min(filterdata.values(), key=lambda x: x['salary']) max_num = max(filterdata.values(), key=lambda x: x['salary']) print "min_value =", min_num['salary'] print "max_num =", max_num['salary'] from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() scaler.fit(data) scaler.transform(data)
from sklearn.metrics import classification_report from sklearn.model_selection import train_test_split import warnings warnings.filterwarnings("ignore") sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit with open("../submission/my_dataset.pkl", "r") as data_file: my_dataset = pickle.load(data_file) with open("../submission/my_feature_list.pkl", "r") as data_file: features_list = pickle.load(data_file) data = featureFormat(my_dataset, features_list) labels, features = targetFeatureSplit(data) features_train, features_test, labels_train, labels_test = train_test_split( features, labels, test_size=0.3, random_state=42) lass_clf = Lasso(alpha=1, tol=1) lass_clf.fit(features_train, labels_train) feature_weights = {} feature_weight_normalizer = 0 for i in range(len(features_list[1:])): feature_weights.update({features_list[i + 1]: lass_clf.coef_[i]}) feature_weight_normalizer += lass_clf.coef_[i] for feat in feature_weights: feature_weights.update(
### Now add these above features + some more additional features to the feature_list features1 = features_list + [ 'fraction_from_poi', 'fraction_to_poi', 'shared_receipt_with_poi', 'expenses', 'loan_advances', 'long_term_incentive', 'restricted_stock', 'salary', 'total_stock_value', 'exercised_stock_options', 'total_payments', 'bonus', 'wealth' ] print "" print "Two new features succesfully added to the feature list - 'fraction_from_poi', 'fraction_to_poi' and 'wealth'" print "" print "Selected Feature list - before Feature_Selection", features1 ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features1, sort_keys=True) labels, features = targetFeatureSplit(data) ### We do not know yet if feature scaling and feature filering using kbest will benefit our model yet. ### But lets try it anyway # Scale features scaler = MinMaxScaler() features = scaler.fit_transform(features) # K-best features - choosing 6 features for a trial k_best = SelectKBest(k=6) k_best.fit(features, labels) result_list = zip(k_best.get_support(), features1[1:], k_best.scores_) result_list = sorted(result_list, key=lambda x: x[2], reverse=True)
for name in tfidf_dict: if tfidf_dict[name]['poi'] == True and tfidf_dict[name][feature] != 0.0: poi_count += 1 if tfidf_dict[name][ 'poi'] == False and tfidf_dict[name][feature] != 0.0: npoi_count += 1 if poi_count == 12 and npoi_count < 50: poi_relevant_features.append(feature) list_of_features = (list(set(list_of_features) - set(poi_relevant_features))) list_of_features.insert(0, 'poi') for name in tfidf_dict: if tfidf_dict[name]['poi'] != 0.0 and tfidf_dict[name]['poi'] != 1: tfidf_dict[name]['poi'] = 0 data = featureFormat(tfidf_dict, list_of_features) labels, features = targetFeatureSplit(data) selector = SelectKBest(k=50) # When selector was select k best - k = 50: Accuracy = 0.9082, Precision = 0.83676, Recall = 0.387 # When selector was select percentile - percentile = 10: Acc =0.83733 , Pre = 0.24419 , Rec = 0.105 # This is a significant decrease # When Selector was select precentile - percentile = 5: Acc =0.83773 , Pre = 0.21144 , Rec = 0.07950 # This was another significant decrease selector.fit(features, labels) selected = selector.get_support() list_of_features.pop(0) list_of_features = np.array(list_of_features) selected_features = list_of_features[selected] for feature in selected_features:
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    """Evaluate *clf* with stratified shuffle-split cross-validation.

    The classifier is refitted on every split; the predictions of all splits
    are pooled into one confusion matrix from which accuracy, precision,
    recall, F1 and F2 are computed and printed. Nothing is returned.

    Args:
        clf: Estimator exposing ``fit(features, labels)`` and
            ``predict(features)``.
        dataset: Data passed to ``featureFormat`` together with
            ``feature_list``.
        feature_list: Feature names; ``targetFeatureSplit`` splits the first
            one off as the label.
        folds: Number of shuffle-split iterations.
        scale_features: When true, min-max scale the features before
            splitting.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # NOTE: the scaler is fitted on the complete feature matrix, i.e. before
    # the train/test splits below are drawn.
    if scale_features:
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Only the rest of this split is skipped; the remaining
                # splits are still evaluated.
                break

    # Only a zero denominator is an expected failure here; any other error
    # is a real bug and must propagate instead of being reported as a
    # divide-by-zero.
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: " + str(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print('Total predictions: '+str(total_predictions))
        print('Accuracy: '+str(accuracy))
        print('Precision: '+str(precision))
        print('Recall: '+str(recall))
        print('F1: '+str(f1))
        print('F2: '+str(f2))
        print("")