def KNN_parameter(path):
    """Run a KFOLD parameter sweep for K-Nearest-Neighbors on TFIDF features.

    Preprocesses the dataset at *path*, builds a TFIDF matrix, runs
    KFOLD_KNN_parameter_test for uniform vs. distance-weighted voting,
    and plots the two result curves.
    """
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TFIDF
    # BUG FIX: use_idf was False, which yields plain TF despite the TFIDF
    # name; the main_test variants correctly use use_idf=True.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def main_test(path=None):
    """Load and refine the dataset, build TFIDF features, and evaluate
    Naive Bayes, Linear SVM, and KNN classifiers with a train-test split.

    path: dataset directory; defaults to 'dataset' when None.
    """
    # BUG FIX: fall back to a default directory like the other main_test
    # variants — previously a None path was passed straight to load_files.
    dir_path = path or 'dataset'
    remove_incompatible_files(dir_path)
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)
    print('\n\n')

    # test sizes to sweep; 0.2 means 80% training data and 20% test data
    test_size = [0.2]

    # TFIDF with Naive Bayes
    print(colored('TFIDF with Naive Bayes', 'red', attrs=['bold']))
    clf = sklearn.naive_bayes.MultinomialNB()
    for test in test_size:
        test_classifier(X, files.target, clf, test,
                        y_names=files.target_names, confusion=False)
    print('\n\n')

    # TFIDF with a linear SVM
    print(colored('TFIDF with Support Vector Machine', 'red', attrs=['bold']))
    clf = sklearn.svm.LinearSVC()
    for test in test_size:
        test_classifier(X, files.target, clf, test,
                        y_names=files.target_names, confusion=False)
    print('\n\n')

    # TFIDF with distance-weighted KNN (the dead 'uniform' assignment that
    # was immediately overwritten has been removed)
    print(colored('TFIDF with K-Nearest Neighbours', 'red', attrs=['bold']))
    n_neighbors = 11
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    for test in test_size:
        test_classifier(X, files.target, clf, test,
                        y_names=files.target_names, confusion=False)
def NB(path):
    """Evaluate Multinomial Naive Bayes on BOW, TF, and TFIDF features
    with a train-test split, and plot the three result curves.
    """
    print("Classifier: Naive Bayes")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF (term frequency only — no inverse-document-frequency weighting)
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF
    # BUG FIX: use_idf was False, which made TFIDF identical to TF and
    # broke the three-way comparison.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    clf = sklearn.naive_bayes.MultinomialNB()

    # calculate results for each feature representation
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def KNN_parameter(path):
    """Run a KFOLD parameter sweep for K-Nearest-Neighbors on TFIDF features.

    Preprocesses the dataset at *path*, builds a TFIDF matrix, runs
    KFOLD_KNN_parameter_test for uniform vs. distance-weighted voting,
    and plots the two result curves.
    """
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TFIDF
    # BUG FIX: use_idf was False, which yields plain TF despite the TFIDF
    # name; the main_test variants correctly use use_idf=True.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def main_test(path=None):
    """Prepare the dataset, compute TFIDF features, and evaluate a
    distance-weighted KNN classifier with an 80/20 train-test split.

    path: dataset directory; defaults to 'dataset' when None.
    """
    dir_path = path or 'dataset'
    remove_incompatible_files(dir_path)
    print('\n\n')

    # read the raw documents into memory
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # strip unwanted text from every email
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # bag-of-words term counts
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF weighting on top of the raw counts
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)
    print('\n\n')

    # create classifier (alternatives kept for quick experimentation)
    # clf = sklearn.naive_bayes.MultinomialNB()
    # clf = sklearn.svm.LinearSVC()
    n_neighbors = 11
    weights = 'uniform'
    weights = 'distance'  # deliberately overrides the 'uniform' setting above
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # evaluate with a single train-test split
    print('\n\n')
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.2,
                    y_names=files.target_names, confusion=False)
def KNN(path):
    """Evaluate distance-weighted KNN on BOW, TF, and TFIDF features
    with a train-test split, and plot the three result curves.
    """
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF (term frequency only — no inverse-document-frequency weighting)
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF
    # BUG FIX: use_idf was False, which made TFIDF identical to TF and
    # broke the three-way comparison.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier (distance-weighted voting)
    n_neighbors = 5
    # weights = 'uniform'
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results for each feature representation
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def main_test(path=None):
    """Prepare the dataset, compute TFIDF features, and evaluate a
    distance-weighted KNN classifier with an 80/20 train-test split.

    path: dataset directory; defaults to 'dataset' when None.
    """
    dir_path = path or 'dataset'
    remove_incompatible_files(dir_path)
    # CONSISTENCY FIX: converted Python-2 print statements to the
    # function-call form used by the sibling main_test variant (valid in
    # both Python 2 and 3).
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)
    print('\n\n')

    # create classifier (alternatives kept for quick experimentation)
    # clf = sklearn.naive_bayes.MultinomialNB()
    # clf = sklearn.svm.LinearSVC()
    n_neighbors = 11
    weights = 'uniform'
    weights = 'distance'  # deliberately overrides the 'uniform' setting above
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # test the classifier
    print('\n\n')
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.2,
                    y_names=files.target_names, confusion=False)
def SVM(path):
    """Evaluate a linear SVM on BOW, TF, and TFIDF features with a
    train-test split, and plot the three result curves.
    """
    print("Classifier: Support Vector Machine")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF (term frequency only — no inverse-document-frequency weighting)
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF
    # BUG FIX: use_idf was False, which made TFIDF identical to TF and
    # broke the three-way comparison.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    clf = sklearn.svm.LinearSVC()

    # calculate results for each feature representation
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def KNN(path):
    """Evaluate distance-weighted KNN on BOW, TF, and TFIDF features
    with a train-test split, and plot the three result curves.
    """
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF (term frequency only — no inverse-document-frequency weighting)
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF
    # BUG FIX: use_idf was False, which made TFIDF identical to TF and
    # broke the three-way comparison.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier (distance-weighted voting)
    n_neighbors = 5
    # weights = 'uniform'
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results for each feature representation
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])