Python preprocess示例，email_preprocess.preprocess Python示例

示例#1

0

显示文件

文件： dt_author_id.py 项目： ChangJungWu/machine-learning

    Sara has label 0
    Chris has label 1

"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(percentile=1)
clf = DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(pred, labels_test)
print "Decision Tree accuracy: %r" % acc

"""
	
	You found in the SVM mini-project that parameter tuning can significantly 
	speed up the training time of a machine learning algorithm. A general rule is 
	that the parameters can tune the complexity of the algorithm, with more 
	complex algorithms generally running more slowly.

	Another way to control the complexity of an algorithm is via the number of 
	features that you use in training/testing. The more features the algorithm

示例#2

0

显示文件

文件： dt_author_id.py 项目： pierri/ud120-projects

    Use a Decision Tree to identify emails from the Enron corpus by author:    
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()



#########################################################
### your code goes here ###

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf = clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

#########################################################

from sklearn.metrics import accuracy_score

示例#3

0

显示文件

文件： svm_author_id.py 项目： chungvodim/MachineLearning

    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("C:/WorkSpace/MachineLearning/MachineLearning/tools")
from email_preprocess import preprocess
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(words_file = "C:/WorkSpace/MachineLearning/MachineLearning/tools/word_data.pkl", authors_file="C:/WorkSpace/MachineLearning/MachineLearning/tools/email_authors.pkl")




#########################################################
### your code goes here ###
def classify(features_train, labels_train,features_test,labels_test):
    # just training 1% of the full training set
    #features_train = features_train[:len(features_train)/100] 
    #labels_train = labels_train[:len(labels_train)/100] 
    #clf = SVC(kernel="linear")
    clf = SVC(kernel="rbf",C=10000.0)
    t0 = time()
    clf.fit(features_train,labels_train)
    print "training time:", round(time()-t0, 3), "s"

示例#4

0

显示文件

文件： nb_author_id.py 项目： kshannon/Udacity_Data_Analyst

    authors and labels:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.naive_bayes import GaussianNB


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


#########################################################
### your code goes here ###
print type(preprocess())
print type(features_train)
print len(features_train)


clf = GaussianNB()

# counting time for fitting model 
t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"

示例#5

0

显示文件

文件： svm_author_id.py 项目： sagar-cenation/Intro-To-Machine-Learning

    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

# features_train and features_test are the features for the training
# and testing datasets, respectively
# labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn.svm import SVC

# features_train = features_train[:len(features_train) / 100]
# labels_train = labels_train[:len(labels_train) / 100]

# Optimize C Parameter
"""
for i in range(1, 5):
    c = 10**i
    print "C=" + str(c)
    clf = SVC(kernel="rbf", C=c)

示例#6

0

显示文件

文件： dt_author_id.py 项目： camroberts/udacity

""" 
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:    
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
import email_preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = email_preprocess.preprocess()

from sklearn import tree
# Decision tree classifier; min_samples_split=40 sets the minimum number of
# samples a node must hold before it may be split.
clf = tree.DecisionTreeClassifier(min_samples_split=40)

# Time the fit on the training split and report wall-clock seconds (3 d.p.).
t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"

# Time the evaluation on the test split. score() both predicts and compares
# against labels_test, so the reported "predict time" includes the scoring step.
t0 = time()
acc = clf.score(features_test, labels_test)
print "predict time:", round(time()-t0, 3), "s"

# Accuracy on the test split as returned by clf.score().
print acc

示例#7

0

显示文件

文件： nb_author_id.py 项目： franzaltea/ud120-projects

 def __init__(self, words_file, authors_file):
     """Record the data file paths and load the train/test split.

     Args:
         words_file: path to the word data file handed to ``preprocess``.
             A falsy value falls back to "../tools/word_data.pkl".
         authors_file: path to the authors file handed to ``preprocess``.
             A falsy value falls back to "../tools/email_authors.pkl".

     The four values returned by ``preprocess`` are stored on the instance
     as ``features_train``, ``features_test``, ``labels_train`` and
     ``labels_test``.
     """
     # Previously both arguments were ignored and overwritten with the
     # hard-coded paths; honor the caller's values and keep the old paths
     # only as a fallback.
     self.words_file = words_file or "../tools/word_data.pkl"
     self.authors_file = authors_file or "../tools/email_authors.pkl"
     ### features_train and features_test are the features for the training
     ### and testing datasets, respectively
     ### labels_train and labels_test are the corresponding item labels
     self.features_train, self.features_test, self.labels_train, self.labels_test = preprocess(words_file=self.words_file, authors_file=self.authors_file)

示例#8

0

显示文件

文件： dt_author_id.py 项目： magizbox/machine_learning_build

    Use a Decision Tree to identify emails from the Enron corpus by author:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("./tools/")
from email_preprocess import preprocess
from sklearn.tree import DecisionTreeClassifier

# features_train and features_test are the features for the training
# and testing datasets, respectively
# labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(
    words_file="./tools/word_data.pkl",
    authors_file="./tools/email_authors.pkl")

# Number of features: 3785
print "Number of features: ", len(features_train[0])

# clf 1
# feature  : percentile=10
# accuracy : 0.978
# time     : 62.838
# clf = DecisionTreeClassifier(min_samples_split=40)
# t0 = time()
# clf.fit(features_train, labels_train)
# print("training time:", round(time() - t0, 3), "s")
# print("accuracy:", clf.score(features_test, labels_test))

示例#9

0

显示文件

文件： detect.py 项目： vidhanjhawar/Spam_Detector

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from data_formating import format_mails

#Formatting mails (converting into pickle files) from the csv database file
format_mails()

#features_train is a numpy array containing emails for training
#features_test is a numpy array containing emails for testing
#labels_train is a numpy array containing training labels (spam/ham)
#final_transformed (referred to as mail_detect in the original note) is a numpy array containing the processed email to be checked as spam
features_train, features_test, labels_train, labels_test, final_transformed = preprocess(
)

#Random Forest algorithm to train classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=75,
                             criterion='entropy',
                             min_samples_split=3)
t = time()
clf.fit(features_train, labels_train)
print "Training Time:", round(time() - t, 3), "s"
t = time()
pred = clf.predict(features_test)
print "Prediction Time:", round(time() - t, 3), "s"

#code to check accuracy
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, labels_test)

示例#10

0

显示文件

    
import sys
# from time import time
import time

# sys.path.append("../tools/")

# from email_preprocess import preprocess
import email_preprocess

import sklearn.svm 

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = email_preprocess.preprocess()

# only use 1%, 0.01 of features_train - performance increases, time decreases, accuracy decreases
# len(features_train) / 100 
print("len(features_test) - {}".format(len(features_test)))
# print("len(features_train) - {}".format(len(features_train)))
# print("len(features_train) / 100 - {}".format(len(features_train) / 100))
# print("round(len(features_train) / 100) - {}".format(round(len(features_train) / 100)))
# features_train = features_train[:round(len(features_train)/100)] 
# labels_train = labels_train[:round(len(labels_train)/100)]
 
#########################################################
### your code goes here ###

#########################################################

示例#11

0

显示文件

    authors and labels:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from utils import execute


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = execute(lambda : preprocess(), "Process data")




#########################################################
### your code goes here ###
# Imports
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Visualize data


# Create classifier
clf = GaussianNB()