Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("../tools/") from email_preprocess import preprocess from sklearn.metrics import accuracy_score from sklearn.tree import DecisionTreeClassifier ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess(percentile=1) clf = DecisionTreeClassifier(min_samples_split=40) clf.fit(features_train, labels_train) pred = clf.predict(features_test) acc = accuracy_score(pred, labels_test) print "Decision Tree accuracy: %r" % acc """ You found in the SVM mini-project that the parameter tune can significantly speed up the training time of a machine learning algorithm. A general rule is that the parameters can tune the complexity of the algorithm, with more complex algorithms generally running more slowly. Another way to control the complexity of an algorithm is via the number of features that you use in training/testing. The more features the algorithm
Use a Decision Tree to identify emails from the Enron corpus by author: Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("../tools/") from email_preprocess import preprocess ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess() ######################################################### ### your code goes here ### from sklearn import tree clf = tree.DecisionTreeClassifier(min_samples_split=40) clf = clf.fit(features_train, labels_train) pred = clf.predict(features_test) ######################################################### from sklearn.metrics import accuracy_score
Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("C:/WorkSpace/MachineLearning/MachineLearning/tools") from email_preprocess import preprocess from sklearn.svm import SVC from sklearn.metrics import accuracy_score import numpy as np ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess(words_file = "C:/WorkSpace/MachineLearning/MachineLearning/tools/word_data.pkl", authors_file="C:/WorkSpace/MachineLearning/MachineLearning/tools/email_authors.pkl") ######################################################### ### your code goes here ### def classify(features_train, labels_train,features_test,labels_test): # just training 1% of the full training set #features_train = features_train[:len(features_train)/100] #labels_train = labels_train[:len(labels_train)/100] #clf = SVC(kernel="linear") clf = SVC(kernel="rbf",C=10000.0) t0 = time() clf.fit(features_train,labels_train) print "training time:", round(time()-t0, 3), "s"
authors and labels: Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("../tools/") from email_preprocess import preprocess from sklearn.naive_bayes import GaussianNB ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess() ######################################################### ### your code goes here ### print type(preprocess()) print type(features_train) print len(features_train) clf = GaussianNB() # counting time for fitting model t0 = time() clf.fit(features_train, labels_train) print "training time:", round(time()-t0, 3), "s"
This is the code to accompany the Lesson 2 (SVM) mini-project. Use a SVM to identify emails from the Enron corpus by their authors: Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("../tools/") from email_preprocess import preprocess # features_train and features_test are the features for the training # and testing datasets, respectively # labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess() ######################################################### ### your code goes here ### from sklearn.svm import SVC # features_train = features_train[:len(features_train) / 100] # labels_train = labels_train[:len(labels_train) / 100] # Optimize C Parameter """ for i in range(1, 5): c = 10**i print "C=" + str(c) clf = SVC(kernel="rbf", C=c)
""" This is the code to accompany the Lesson 3 (decision tree) mini-project. Use a Decision Tree to identify emails from the Enron corpus by author: Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("../tools/") import email_preprocess ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = email_preprocess.preprocess() from sklearn import tree clf = tree.DecisionTreeClassifier(min_samples_split=40) t0 = time() clf.fit(features_train, labels_train) print "training time:", round(time()-t0, 3), "s" t0 = time() acc = clf.score(features_test, labels_test) print "predict time:", round(time()-t0, 3), "s" print acc
def __init__(self, words_file, authors_file):
    """Load the train/test split from the given data files.

    Args:
        words_file: path passed to preprocess() as ``words_file``. A falsy
            value (None or "") selects "../tools/word_data.pkl".
        authors_file: path passed to preprocess() as ``authors_file``. A
            falsy value selects "../tools/email_authors.pkl".
    """
    # Both arguments used to be overwritten unconditionally, so callers could
    # never point the loader at different files. Honour them now and keep the
    # old hard-coded paths as the fallback.
    self.words_file = words_file or "../tools/word_data.pkl"
    self.authors_file = authors_file or "../tools/email_authors.pkl"

    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    (self.features_train, self.features_test,
     self.labels_train, self.labels_test) = preprocess(
        words_file=self.words_file, authors_file=self.authors_file)
Use a Decision Tree to identify emails from the Enron corpus by author: Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("./tools/") from email_preprocess import preprocess from sklearn.tree import DecisionTreeClassifier # features_train and features_test are the features for the training # and testing datasets, respectively # labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess( words_file="./tools/word_data.pkl", authors_file="./tools/email_authors.pkl") # Number of features: 3785 print "Number of features: ", len(features_train[0]) # clf 1 # feature : percentile=10 # accuracy : 0.978 # time : 62.838 # clf = DecisionTreeClassifier(min_samples_split=40) # t0 = time() # clf.fit(features_train, labels_train) # print("training time:", round(time() - t0, 3), "s") # print("accuracy:", clf.score(features_test, labels_test))
import sys from time import time sys.path.append("../tools/") from email_preprocess import preprocess from data_formating import format_mails #Formatting mails (coverting into pickle files) from the csv database file format_mails() #features_train is a numpy array contains emails for training #features_test is a numpy array containing emails for testing #labels_train is a numpy array containing training labels(spams/ham) #mail_detect is a numpy array containing processed email to be checked as spam features_train, features_test, labels_train, labels_test, final_transformed = preprocess( ) #Random Forest algorithm to train classifier from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=75, criterion='entropy', min_samples_split=3) t = time() clf.fit(features_train, labels_train) print "Training Time:", round(time() - t, 3), "s" t = time() pred = clf.predict(features_test) print "Prediction Time:", round(time() - t, 3), "s" #code to check accuracy from sklearn.metrics import accuracy_score acc = accuracy_score(pred, labels_test)
import sys
# from time import time
import time
# sys.path.append("../tools/")
# from email_preprocess import preprocess
import email_preprocess
import sklearn.svm


### Four-way split returned by the course helper:
###   features_train / features_test -- features for training and testing
###   labels_train / labels_test     -- the item labels that go with them
features_train, features_test, labels_train, labels_test = email_preprocess.preprocess()

# only use 1%, 0.01 of features_train - performance increases, time decreases, accuracy decreases
# len(features_train) / 100

# Report how many test rows came back from the loader.
test_size_report = "len(features_test) - {}".format(len(features_test))
print(test_size_report)
# print("len(features_train) - {}".format(len(features_train)))
# print("len(features_train) / 100 - {}".format(len(features_train) / 100))
# print("round(len(features_train) / 100) - {}".format(round(len(features_train) / 100)))

# features_train = features_train[:round(len(features_train)/100)]
# labels_train = labels_train[:round(len(labels_train)/100)]


#########################################################
### your code goes here ###
#########################################################
authors and labels: Sara has label 0 Chris has label 1 """ import sys from time import time sys.path.append("../tools/") from email_preprocess import preprocess from utils import execute ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = execute(lambda : preprocess(), "Process data") ######################################################### ### your code goes here ### # Imports from sklearn.naive_bayes import GaussianNB from sklearn import metrics # Visualize data # Create classifier clf = GaussianNB()