# --- Imports -----------------------------------------------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# Kept because later (unseen) code may reference it; migrate to
# sklearn.model_selection when the rest of the file is updated.
from sklearn import cross_validation
# Fix: pickle and enron_tools are used below but were never imported.
import pickle
import enron_tools
import enron_evaluate

# Visual separators used when printing evaluation output.
sep = '##############################################################################################'
sep2 = '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'

### Load the dictionary containing the dataset
# Fix: pickle payloads are binary — open in "rb" (text mode "r" breaks on
# Python 3) and use a context manager so the handle is always closed.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### create list of functions for use as argument to add_features function
add_feature_function_list = [
    enron_tools.add_poi_to_ratio,
    enron_tools.add_poi_from_ratio,
    enron_tools.add_poi_interaction_ratio,
]

## add features to data_dict
enron_tools.add_features(add_feature_function_list, data_dict)

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
data_label = 'poi'
features_list = enron_tools.get_features(data_dict)

## email address does not help with prediction and causes an exception, remove
features_list.remove('email_address')
## other is not a well defined feature, remove
#features_list.remove('other')
# (remnant of the outlier-exploration loop that precedes this chunk)
#             outliers_dict[count] += [employee_name]
#         else:
#             outliers_dict[count] = [employee_name]
# print outliers_dict

### Task 2 (cont.): Remove outliers
# 'TOTAL' is a spreadsheet aggregation row, 'THE TRAVEL AGENCY IN THE PARK'
# is not a person, and 'LOCKHART EUGENE E' carries no usable values.
outliers = ['TOTAL', "LOCKHART EUGENE E", 'THE TRAVEL AGENCY IN THE PARK']
for employee_name in outliers:
    # default of 0 makes the pop a no-op if the key is already absent
    data_dict.pop(employee_name, 0)

### Task 3: Create new feature(s)
# Calculate fractions of each person's email traffic exchanged with POIs.
fraction_from_poi_email = enron_tools.calculate_fraction(
    data_dict, "from_poi_to_this_person", "to_messages")
fraction_to_poi_email = enron_tools.calculate_fraction(
    data_dict, "from_this_person_to_poi", "from_messages")

# Add new feature values to data_dict
# NOTE(review): this 3-argument add_features(dict, name, values) call does not
# match the earlier add_features(function_list, dict) usage in this file —
# one of the two call sites (or the helper itself) is stale; confirm against
# the enron_tools implementation.
data_dict = enron_tools.add_features(data_dict, "fraction_from_poi_email", fraction_from_poi_email)
data_dict = enron_tools.add_features(data_dict, "fraction_to_poi_email", fraction_to_poi_email)

# Add new features to feature list
# features_list += ["fraction_from_poi_email", "fraction_to_poi_email"]

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
# featureFormat / targetFeatureSplit come from the course's feature_format
# module — presumably imported earlier in the file; verify.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,