def ensemble(data_set, features_list):
    """Build an unfitted bagging classifier and pickle it with its inputs.

    The number of estimators is taken from the first command-line argument
    (``sys.argv[1]``).  The classifier is written out exactly as constructed
    (it is not fitted here), alongside ``features_list`` and ``data_set``.

    Args:
        data_set: Data set handed to ``featuresProcess``; also pickled
            unchanged to ``dataset.pkl``.
        features_list: Feature names handed to ``featuresProcess``; also
            pickled unchanged to ``features_list.pkl``.

    Raises:
        IndexError: If no command-line argument was supplied.
        ValueError: If ``sys.argv[1]`` is not an integer literal.
    """
    # Imported locally, like the other dependencies of this function, so the
    # block does not rely on module-level imports being present.
    import pickle
    import sys

    from data_sorter import featuresProcess
    from data_sorter import split
    from sklearn.ensemble import BaggingClassifier

    # The split result is not used below; the calls are kept so that any
    # checks performed by the helpers still run on the inputs.
    data = featuresProcess(data_set, features_list)
    labels, features = split(data)

    clf = BaggingClassifier(n_estimators=int(sys.argv[1]),
                            random_state=202,
                            bootstrap=True)

    # NOTE(review): the classifier file name says "ada" although this is a
    # bagging classifier; the name is kept because whatever reads the file
    # lives outside this block.
    outputs = (
        ("classifier_ada.pkl", clf),
        ("features_list.pkl", features_list),
        ("dataset.pkl", data_set),
    )
    for path, obj in outputs:
        # Binary mode keeps the pickle stream intact on every platform, and
        # the context manager closes the handle even if dumping fails.
        with open(path, "wb") as outfile:
            pickle.dump(obj, outfile)
def ensemble(data_set, features_list):
    """Build an unfitted AdaBoost classifier and pickle it with its inputs.

    The number of estimators and the learning rate are taken from the first
    and second command-line arguments (``sys.argv[1]`` and ``sys.argv[2]``).
    The classifier is written out exactly as constructed (it is not fitted
    here), alongside ``features_list`` and ``data_set``.

    Args:
        data_set: Data set handed to ``featuresProcess``; it is printed and
            also pickled unchanged to ``dataset.pkl``.
        features_list: Feature names handed to ``featuresProcess``; also
            pickled unchanged to ``features_list.pkl``.

    Raises:
        IndexError: If fewer than two command-line arguments were supplied.
        ValueError: If the arguments cannot be parsed as ``int`` / ``float``.
    """
    # Imported locally, like the other dependencies of this function, so the
    # block does not rely on module-level imports being present.
    import pickle
    import sys

    from data_sorter import featuresProcess
    from data_sorter import split
    from sklearn.ensemble import AdaBoostClassifier

    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(data_set)

    # The split result is not used below; the calls are kept so that any
    # checks performed by the helpers still run on the inputs.
    data = featuresProcess(data_set, features_list)
    labels, features = split(data)

    clf = AdaBoostClassifier(n_estimators=int(sys.argv[1]),
                             random_state=202,
                             learning_rate=float(sys.argv[2]),
                             algorithm="SAMME.R")

    outputs = (
        ("classifier_ada.pkl", clf),
        ("features_list.pkl", features_list),
        ("dataset.pkl", data_set),
    )
    for path, obj in outputs:
        # Binary mode keeps the pickle stream intact on every platform, and
        # the context manager closes the handle even if dumping fails.
        with open(path, "wb") as outfile:
            pickle.dump(obj, outfile)
email_from_poi = get_information("from_poi_to_this_person", "to_messages", data)

email_to_poi = get_information("from_this_person_to_poi", "from_messages", data)

count = 0
for ii in data:
  data[ii]["email_from_poi"] = email_from_poi[count]
  data[ii]["email_to_poi"] = email_to_poi[count]
  count += 1




features_list = ["poi", "salary", "bonus", "email_from_poi", "email_to_poi",'deferral_payments', 'total_payments']

dataset = featuresProcess(data, features_list)

labels, features = split(dataset)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42)

time_start = time.time()

dtree = DecisionTreeClassifier()
dtree.fit(features_train, labels_train)
score = dtree.score(features_test, labels_test)
print "Accuracy ", score

print "Decesion tree took time : ", time.time() - time_start

feat_ranks = dtree.feature_importances_
indices = np.argsort(feat_ranks)[::-1]
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import sys
import pickle
import numpy as np
import pylab
from data_sorter import split
from data_sorter import featuresProcess

# Load the pickled data set; the context manager closes the file handle
# once loading is done.
# NOTE(review): mode "r" is kept because how enron.pkl was written is not
# visible here; switch to "rb" if the file is a binary-mode pickle.
with open("enron.pkl", "r") as infile:
    datadict = pickle.load(infile)

features = ["salary", "bonus", "total_stock_value"]
data = featuresProcess(datadict, features)

# One (salary, bonus, total_stock_value) point per row.  The z value comes
# from column 2 so that the 'Stocks' axis does not repeat the bonus column.
# Assumes featuresProcess returns columns in the order of `features` -- the
# axis labels below rely on the same assumption.
temp_list = [(ii[0], ii[1], ii[2]) for ii in data]

# Sort by salary, highest first, and drop the four largest as outliers.
temp_list = sorted(temp_list, key=lambda x: x[0], reverse=True)
temp_list = temp_list[4:]

fig = pylab.figure()
ax = Axes3D(fig)
ax.scatter([ii[0] for ii in temp_list],
           [ii[1] for ii in temp_list],
           [ii[2] for ii in temp_list])
plt.xlabel('Salaries')
plt.ylabel('Bonus')
ax.set_zlabel('Stocks')
plt.savefig("Enron Salalries No Outliers 3d")
        elif data[ii][key] >= 0:
            retval.append(float(data[ii][key]) / float(data[ii][total]))
    return retval


# Derived e-mail features: one value per entry of `data`, as returned by
# get_information for the given (count key, total key) pair.
email_from_poi = get_information(
    "from_poi_to_this_person", "to_messages", data)
email_to_poi = get_information(
    "from_this_person_to_poi", "from_messages", data)

# Attach the derived values to each record by position; this relies on
# get_information producing its results in the iteration order of `data`.
for position, person in enumerate(data):
    record = data[person]
    record["email_from_poi"] = email_from_poi[position]
    record["email_to_poi"] = email_to_poi[position]

features = ["poi", "email_from_poi", "email_to_poi"]
dataset = featuresProcess(data, features)

# One scatter point per row: green for rows whose first column is 0, red
# stars for rows whose first column is 1.
# Assumes the first column is the "poi" flag because "poi" is the first
# requested feature -- TODO confirm against featuresProcess.
for row in dataset:
    poi_flag = row[0]
    from_poi = row[1]
    to_poi = row[2]
    if poi_flag == 0:
        plt.scatter(from_poi, to_poi, color="g")
    elif poi_flag == 1:
        plt.scatter(from_poi, to_poi, color="r", marker="*")

plt.xlabel("Emails from POIs to Individual")
plt.ylabel("Emails from Individual to POIs")
plt.savefig("Email % from an Individual to POIs and Non POIs")