def ensemble(data_set, features_list):
    """Create an unfitted bagging classifier and pickle it with its inputs.

    The number of estimators is taken from ``sys.argv[1]``.  Three pickle
    files are written to the current working directory:
    ``classifier_ada.pkl`` (the classifier), ``features_list.pkl`` and
    ``dataset.pkl``.

    Args:
        data_set: Dataset handed to ``featuresProcess`` and pickled as-is.
        features_list: Feature names handed to ``featuresProcess`` and
            pickled as-is.

    Raises:
        IndexError: If ``sys.argv[1]`` is missing.
        ValueError: If ``sys.argv[1]`` cannot be parsed as an integer.
    """
    import pickle
    import sys

    from data_sorter import featuresProcess
    from data_sorter import split
    from sklearn.ensemble import BaggingClassifier

    # NOTE(review): labels/features are extracted but never used -- the
    # classifier is pickled without being fitted.  The calls are kept in case
    # the helpers validate their input; confirm whether they can be dropped.
    data = featuresProcess(data_set, features_list)
    labels, features = split(data)

    clf = BaggingClassifier(
        n_estimators=int(sys.argv[1]),
        random_state=202,
        bootstrap=True,
    )

    # NOTE(review): the classifier file name says "ada" although this is a
    # bagging classifier; the name is kept because whatever reads the file
    # is outside this view.
    artifacts = (
        ("classifier_ada.pkl", clf),
        ("features_list.pkl", features_list),
        ("dataset.pkl", data_set),
    )
    for path, obj in artifacts:
        # Binary mode is what pickle expects, and the context manager makes
        # sure each handle is flushed and closed.
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)
def ensemble(data_set, features_list):
    """Create an unfitted AdaBoost classifier and pickle it with its inputs.

    The number of estimators is taken from ``sys.argv[1]`` and the learning
    rate from ``sys.argv[2]``.  Three pickle files are written to the current
    working directory: ``classifier_ada.pkl`` (the classifier),
    ``features_list.pkl`` and ``dataset.pkl``.

    Args:
        data_set: Dataset printed, handed to ``featuresProcess`` and pickled
            as-is.
        features_list: Feature names handed to ``featuresProcess`` and
            pickled as-is.

    Raises:
        IndexError: If ``sys.argv[1]`` or ``sys.argv[2]`` is missing.
        ValueError: If either argument cannot be parsed as a number.
    """
    import pickle
    import sys

    from data_sorter import featuresProcess
    from data_sorter import split
    from sklearn.ensemble import AdaBoostClassifier

    # Single-argument call form prints the same text on Python 2 and 3.
    print(data_set)

    # NOTE(review): labels/features are extracted but never used -- the
    # classifier is pickled without being fitted.  The calls are kept in case
    # the helpers validate their input; confirm whether they can be dropped.
    data = featuresProcess(data_set, features_list)
    labels, features = split(data)

    clf = AdaBoostClassifier(
        n_estimators=int(sys.argv[1]),
        random_state=202,
        learning_rate=float(sys.argv[2]),
        algorithm="SAMME.R",
    )

    artifacts = (
        ("classifier_ada.pkl", clf),
        ("features_list.pkl", features_list),
        ("dataset.pkl", data_set),
    )
    for path, obj in artifacts:
        # Binary mode is what pickle expects, and the context manager makes
        # sure each handle is flushed and closed.
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)
email_from_poi = get_information("from_poi_to_this_person", "to_messages", data) email_to_poi = get_information("from_this_person_to_poi", "from_messages", data) count = 0 for ii in data: data[ii]["email_from_poi"] = email_from_poi[count] data[ii]["email_to_poi"] = email_to_poi[count] count += 1 features_list = ["poi", "salary", "bonus", "email_from_poi", "email_to_poi",'deferral_payments', 'total_payments'] dataset = featuresProcess(data, features_list) labels, features = split(dataset) features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42) time_start = time.time() dtree = DecisionTreeClassifier() dtree.fit(features_train, labels_train) score = dtree.score(features_test, labels_test) print "Accuracy ", score print "Decesion tree took time : ", time.time() - time_start feat_ranks = dtree.feature_importances_ indices = np.argsort(feat_ranks)[::-1]
# Draws a 3-D scatter plot of salary, bonus and total stock value from the
# pickled Enron dataset, leaving out the four highest-salary rows, and saves
# the figure to disk.
import matplotlib.pyplot as plt
# Kept for its import side effect: on older matplotlib releases this import
# is what registers the "3d" projection used below.
from mpl_toolkits.mplot3d import Axes3D
import sys
import pickle
import numpy as np
import pylab
from data_sorter import split
from data_sorter import featuresProcess

# Binary mode is what pickle expects; the context manager closes the handle.
with open("enron.pkl", "rb") as dataset_file:
    datadict = pickle.load(dataset_file)

features = ["salary", "bonus", "total_stock_value"]
data = featuresProcess(datadict, features)

# Assumes each row of `data` follows the order of `features` (TODO confirm
# against featuresProcess).  The third coordinate previously repeated the
# bonus column (index 1) even though the z axis is labelled 'Stocks'.
points = sorted(
    ((row[0], row[1], row[2]) for row in data),
    key=lambda point: point[0],
    reverse=True,
)

# Discard the four largest salaries as outliers.
points = points[4:]

fig = pylab.figure()
# Axes3D(fig) no longer attaches itself to the figure on current matplotlib
# releases, so the 3-D axes are requested through the figure instead.
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    [point[0] for point in points],
    [point[1] for point in points],
    [point[2] for point in points],
)
ax.set_xlabel('Salaries')
ax.set_ylabel('Bonus')
ax.set_zlabel('Stocks')
plt.savefig("Enron Salalries No Outliers 3d")
elif data[ii][key] >= 0: retval.append(float(data[ii][key]) / float(data[ii][total])) return retval email_from_poi = get_information("from_poi_to_this_person", "to_messages", data) email_to_poi = get_information("from_this_person_to_poi", "from_messages", data) count = 0 for ii in data: data[ii]["email_from_poi"] = email_from_poi[count] data[ii]["email_to_poi"] = email_to_poi[count] count += 1 features = ["poi", "email_from_poi", "email_to_poi"] dataset = featuresProcess(data, features) for ii in dataset: from_poi = ii[1] to_poi = ii[2] if ii[0] == 0: plt.scatter(from_poi, to_poi, color="g") if ii[0] == 1: plt.scatter(from_poi, to_poi, color="r", marker="*") plt.xlabel("Emails from POIs to Individual") plt.ylabel("Emails from Individual to POIs") plt.savefig("Email % from an Individual to POIs and Non POIs")