# remove outliers outlier_keys = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E'] enron_functions.remove_outliers(data_dict, outlier_keys) print len(data_dict) # instantiate copies of dataset and features for grading purposes my_dataset = copy(data_dict) my_feature_list = copy(features_list) # get K-best features num_features = 4 #2 for KN best_features = enron_functions.get_k_best(my_dataset, my_feature_list, num_features) my_feature_list = [target_label] + best_features.keys() # add two new features enron_functions.add_financial_sum(my_dataset, my_feature_list) enron_functions.add_poi_interaction_fraction(my_dataset, my_feature_list) #enron_functions.visualize(data_dict, 'total_stock_value', 'poi_interaction') # print features print "{0} selected features: {1}\n".format(len(my_feature_list) - 1, my_feature_list[1:]) # extract the features specified in features_list data = featureFormat(my_dataset, my_feature_list) # split into labels and features (this line assumes that the first # feature in the array is the label, which is why "poi" must always # be first in the features list labels, features = targetFeatureSplit(data) # scale features via min-max from sklearn import preprocessing scaler = preprocessing.MinMaxScaler()
# --- Pipeline chunk 2: variant of the section above ---
# NOTE(review): re-runs K-best selection and rebuilds my_feature_list, then
# adds only the POI-interaction feature (the financial-sum call is commented
# out). Looks like an alternative revision of chunk 1 — confirm intent.
best_features = enron_functions.get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + best_features.keys()

# Dead code kept as a bare string statement (no runtime effect): a loop that
# would zero out 'NaN' entries in the selected features.
'''# Convert NAN to 0 in selected features
for name, item in my_dataset.items():
    for key in my_feature_list:
        if item[key] == 'NaN':
            my_dataset[name][key] = 0
            #print my_dataset[name]
'''

# add two new features
#enron_functions.add_financial_sum(my_dataset, my_feature_list)
enron_functions.add_poi_interaction_fraction(my_dataset, my_feature_list)  # Adding only this feature
#enron_functions.visualize(data_dict, 'total_stock_value', 'poi_interaction')

# print features (skip index 0, which is the target label)
print "{0} selected features: {1}\n".format(len(my_feature_list) - 1, my_feature_list[1:])

# extract the features specified in features_list
data = featureFormat(my_dataset, my_feature_list)

# split into labels and features (this line assumes that the first
# feature in the array is the label, which is why "poi" must always
# be first in the features list
labels, features = targetFeatureSplit(data)

# scale features via min-max
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()