def NB(path):
    """Compare BOW, TF and TF-IDF features with a Multinomial Naive Bayes classifier.

    The dataset directory is reorganized and cleaned through the ``main``
    helpers, loaded with ``sklearn.datasets.load_files``, refined through
    ``util.refine_all_emails`` and turned into three feature representations.
    Each representation is evaluated with ``split_test_classifier`` and the
    three result series are handed to ``plot_results``.

    Args:
        path: Dataset directory passed to the preprocessing helpers and to
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: Naive Bayes")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    # NOTE(review): the return value is ignored, so this presumably edits
    # files.data in place -- confirm against util.
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: term frequencies only, no inverse-document-frequency weighting
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF: IDF weighting must be enabled here, otherwise this series is
    # identical to TF and the comparison plot is meaningless.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    clf = sklearn.naive_bayes.MultinomialNB()

    # calculate results
    # NOTE(review): the first value returned by split_test_classifier is
    # forwarded to plot_results unchanged (presumably the x-axis) -- confirm.
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def main_test(path=None):
    """Evaluate Naive Bayes, linear SVM and k-NN on TF-IDF features.

    The dataset is cleaned with ``remove_incompatible_files``, loaded with
    ``sklearn.datasets.load_files``, refined, converted to bag-of-words counts
    by ``util.bagOfWords`` and re-weighted with TF-IDF. Each classifier is then
    passed to ``test_classifier`` once per configured test-set fraction.

    Args:
        path: Dataset directory. Falls back to ``'dataset'`` when omitted, in
            line with the other ``main_test`` variants in this file.
    """
    dir_path = path or 'dataset'
    remove_incompatible_files(dir_path)
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)
    print('\n\n')

    # defining test_size
    test_size = [0.2]  # 0.2 means 80% training data and 20% test data

    # One (banner, classifier) pair per experiment, in reporting order.
    experiments = [
        ('TFIDF with Naive Bayes', sklearn.naive_bayes.MultinomialNB()),
        ('TFIDF with Support Vector Machine', sklearn.svm.LinearSVC()),
        ('TFIDF with K-Nearest Neighbours',
         sklearn.neighbors.KNeighborsClassifier(11, weights='distance')),
    ]

    for position, (title, clf) in enumerate(experiments):
        if position:
            # Blank separator between consecutive experiments only.
            print('\n\n')
        print(colored(title, 'red', attrs=['bold']))
        for test in test_size:
            test_classifier(X, files.target, clf, test,
                            y_names=files.target_names, confusion=False)
def KNN_parameter(path):
    """Compare uniform and distance-weighted k-NN through a k-fold parameter test.

    The dataset is preprocessed through the ``main`` helpers, loaded with
    ``sklearn.datasets.load_files``, refined and converted to TF-IDF features.
    ``KFOLD_KNN_parameter_test`` produces the two result series, which are
    handed to ``plot_results``.

    Args:
        path: Dataset directory passed to the preprocessing helpers and to
            ``sklearn.datasets.load_files``.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TFIDF: IDF weighting has to be switched on, otherwise the features are
    # plain normalized term frequencies despite the name.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def KNN_parameter(path):
    """Run the k-fold k-NN parameter test on TF-IDF features and plot it.

    Steps: preprocess the dataset directory with the ``main`` helpers, load it
    with ``sklearn.datasets.load_files``, refine the raw texts, build
    bag-of-words counts, re-weight them with TF-IDF, call
    ``KFOLD_KNN_parameter_test`` and plot the uniform versus weighted results.

    Args:
        path: Dataset directory passed to the preprocessing helpers and to
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TFIDF: use_idf must be True; with use_idf=False the transformer only
    # normalizes term frequencies and no IDF weighting is applied.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def main_test(path=None, feature_rep='tfidf', classification_algorithm='knn'):
    """Train and evaluate one classifier on one feature representation.

    The dataset is loaded with ``sklearn.datasets.load_files``, converted to
    bag-of-words counts by ``util.bagOfWords`` (which here also returns the
    vocabulary), optionally re-weighted with TF-IDF and handed to
    ``test_classifier`` with a 10% test split.

    Args:
        path: Dataset directory; defaults to ``'dataset'`` when falsy.
        feature_rep: ``'bow'`` selects the raw counts; any other value selects
            the TF-IDF matrix.
        classification_algorithm: ``'nb'``, ``'svm'`` or ``'knn'``.

    Raises:
        ValueError: If ``classification_algorithm`` is not a supported name.
    """
    # Fail fast: previously an unknown name left ``clf`` unbound and only
    # surfaced as a NameError after the whole dataset had been processed.
    if classification_algorithm not in ('nb', 'svm', 'knn'):
        raise ValueError(
            "Unknown classification_algorithm %r; expected 'nb', 'svm' or 'knn'"
            % (classification_algorithm,))

    dir_path = path or 'dataset'

    stop_ori = '的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自'
    # Only byte strings (the Python 2 literal type) need decoding; on Python 3
    # the literal is already text and decoding it would raise.
    frwords = stop_ori.decode('utf-8') if isinstance(stop_ori, bytes) else stop_ori
    # NOTE(review): the stop list is built but not consumed anywhere in this
    # function -- confirm whether it should be passed to util.bagOfWords.
    stoplist = set(frwords.split())

    # remove_incompatible_files(dir_path)
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts, vocab_list = util.bagOfWords(files.data)

    # Report how many vocabulary entries have length one.
    count = sum(1 for word in vocab_list if len(word) == 1)
    print("len ==1 words: %s" % (count,))

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    if feature_rep == 'bow':
        X = word_counts

    print(X.shape)
    print('\n\n')

    # create classifier
    if classification_algorithm == 'nb':
        clf = sklearn.naive_bayes.MultinomialNB()
    elif classification_algorithm == 'svm':
        clf = sklearn.svm.LinearSVC()
    else:  # 'knn' (validated above)
        n_neighbors = 11
        weights = 'distance'
        clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # test the classifier
    print('\n\n')
    print("%s %s" % (feature_rep, classification_algorithm))
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.1,
                    y_names=files.target_names, confusion=False)
def main_test(path=None):
    """Run the TF-IDF + k-nearest-neighbours pipeline with a train-test split.

    The dataset directory (``'dataset'`` when ``path`` is falsy) is cleaned
    with ``remove_incompatible_files``, loaded with
    ``sklearn.datasets.load_files``, refined, converted to bag-of-words counts,
    re-weighted with TF-IDF and finally evaluated through ``test_classifier``
    using a distance-weighted 11-neighbour classifier and a 20% test split.

    Args:
        path: Optional dataset directory.
    """
    def announce(message, colour):
        # Every progress banner is printed in bold in the given colour.
        print(colored(message, colour, attrs=['bold']))

    blank_gap = '\n\n'
    dataset_dir = path if path else 'dataset'

    remove_incompatible_files(dataset_dir)
    print(blank_gap)

    # Load the raw documents.
    announce('Loading files into memory', 'green')
    corpus = sklearn.datasets.load_files(dataset_dir)

    # Clean the raw documents.
    announce('Refining all files', 'green')
    util.refine_all_emails(corpus.data)

    # Bag-of-words counts.
    announce('Calculating BOW', 'green')
    counts = util.bagOfWords(corpus.data)

    # TF-IDF weighting fitted on, and applied to, the same counts.
    announce('Calculating TFIDF', 'green')
    tfidf = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True)
    tfidf.fit(counts)
    features = tfidf.transform(counts)
    print(blank_gap)

    # Classifier under test. Alternatives that can be swapped in here:
    # sklearn.naive_bayes.MultinomialNB() or sklearn.svm.LinearSVC().
    knn = sklearn.neighbors.KNeighborsClassifier(11, weights='distance')

    # Evaluate on a held-out split.
    print(blank_gap)
    announce('Testing classifier with train-test split', 'magenta')
    test_classifier(
        features,
        corpus.target,
        knn,
        test_size=0.2,
        y_names=corpus.target_names,
        confusion=False,
    )
def KNN(path):
    """Compare BOW, TF and TF-IDF features with a k-nearest-neighbours classifier.

    The dataset directory is preprocessed through the ``main`` helpers, loaded
    with ``sklearn.datasets.load_files``, refined and converted into three
    feature representations. A distance-weighted 5-neighbour classifier is
    evaluated on each with ``split_test_classifier`` and the results plotted.

    Args:
        path: Dataset directory passed to the preprocessing helpers and to
            ``sklearn.datasets.load_files``.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: term frequencies only
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF: IDF weighting enabled so this series actually differs from TF
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    n_neighbors = 5
    weights = 'distance'  # alternative: 'uniform'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def main_test(path=None):
    """Evaluate a distance-weighted k-NN classifier on TF-IDF features.

    The dataset directory is cleaned with ``remove_incompatible_files``, loaded
    with ``sklearn.datasets.load_files``, refined, converted to bag-of-words
    counts, re-weighted with TF-IDF and evaluated through ``test_classifier``
    with a 20% test split.

    Args:
        path: Dataset directory; defaults to ``'dataset'`` when falsy.
    """
    dir_path = path or 'dataset'
    remove_incompatible_files(dir_path)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)
    print('\n\n')

    # create classifier
    # Alternatives: sklearn.naive_bayes.MultinomialNB(), sklearn.svm.LinearSVC()
    n_neighbors = 11
    weights = 'distance'  # alternative: 'uniform'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # test the classifier
    print('\n\n')
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.2,
                    y_names=files.target_names, confusion=False)
def SVM(path):
    """Compare BOW, TF and TF-IDF features with a linear Support Vector Machine.

    The dataset directory is preprocessed through the ``main`` helpers, loaded
    with ``sklearn.datasets.load_files``, refined and converted into three
    feature representations. ``sklearn.svm.LinearSVC`` is evaluated on each
    with ``split_test_classifier`` and the results are plotted together.

    Args:
        path: Dataset directory passed to the preprocessing helpers and to
            ``sklearn.datasets.load_files``.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Classifier: Support Vector Machine")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: term frequencies only
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF: IDF weighting enabled so this series actually differs from TF
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    clf = sklearn.svm.LinearSVC()

    # calculate results
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def KNN(path):
    """Evaluate k-nearest-neighbours on BOW, TF and TF-IDF features and plot it.

    Preprocesses the dataset directory with the ``main`` helpers, loads it with
    ``sklearn.datasets.load_files``, refines the texts, derives the three
    feature matrices, scores a distance-weighted 5-neighbour classifier on each
    through ``split_test_classifier`` and passes the results to
    ``plot_results``.

    Args:
        path: Dataset directory passed to the preprocessing helpers and to
            ``sklearn.datasets.load_files``.
    """
    # Parenthesized single-argument prints run on both Python 2 and Python 3.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: normalized term frequencies, no IDF weighting
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)

    # TFIDF: with use_idf=False this matrix would be a copy of TF, so the
    # IDF weighting is switched on here.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    n_neighbors = 5
    weights = 'distance'  # alternative: 'uniform'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def _read_csv_warn_on_bad_lines(csv_path):
    """Read a comma-separated file, warning about and skipping malformed rows."""
    try:
        # Current spelling; 'warn' matches what error_bad_lines=False used to do.
        return pd.read_csv(csv_path, sep=",", on_bad_lines="warn")
    except TypeError:
        # Older pandas releases reject the unknown on_bad_lines keyword and
        # only understand the legacy option.
        return pd.read_csv(csv_path, sep=",", error_bad_lines=False)


def main():
    """Train a text classifier on purchase-card descriptions and evaluate it.

    Two CSV files are read from ``./assets``. A 5% bootstrap sample of the
    first file plus 80% of the second file forms the training pool; the
    remaining 20% of the second file is the hold-out set. Titles are
    normalized, converted to bag-of-words counts by ``util.bagOfWords``,
    re-weighted with TF-IDF and used to run ``validation_classifier`` and
    ``test_classifier`` with the classifier selected by the module-level
    ``METHOD`` value (``"MNB"``, ``"SVM"``, anything else selects k-NN).
    """
    origin_data = _read_csv_warn_on_bad_lines(
        './assets/res_purchase_card_cleaned.csv')
    predict_data = _read_csv_warn_on_bad_lines(
        './assets/out_wei_labelled_full.csv')

    # print("Number of Columns:\n", origin_data.shape[1], "\n\n")
    # print("List of Columns:\n", ", ".join(origin_data.columns), "\n\n")
    # print("Data:\n", origin_data.head(), "\n\n")
    # print("Size of train data(m):\n", origin_data.shape[0])

    origin_items = pd.DataFrame(
        origin_data, columns=['Description', 'Vendor', 'category_draft_1'])
    origin_items = origin_items.sample(frac=0.05, replace=True,
                                       random_state=4252)
    origin_text = origin_items["Description"] + " " + origin_items["Vendor"]
    print("Size of train data(m):\n", origin_text.shape[0])

    test_items = pd.DataFrame(predict_data,
                              columns=['description', 'category'])
    # NOTE(review): no random_state is given, so this split differs per run.
    a_test_items, b_test_items = train_test_split(test_items, test_size=0.2)
    a_test_text = a_test_items["description"]
    b_test_text = b_test_items["description"]

    # Rows [0, n_train) are used for fitting; the rest is the hold-out set.
    n_train = len(origin_text) + len(a_test_text)

    all_title = (origin_text.tolist() + a_test_text.tolist()
                 + b_test_text.tolist())
    all_title = [re.sub(r'([^a-zA-Z0-9])+', ' ', s) for s in all_title]
    all_title = [re.sub(r'(\s)+', ' ', s) for s in all_title]
    word_counts = util.bagOfWords(all_title)

    # pd.concat replaces the chained Series.append calls (removed in
    # pandas 2.0); the original row order and index are kept.
    all_label = pd.concat([
        origin_items["category_draft_1"],
        a_test_items["category"],
        b_test_items["category"],
    ])
    # Convert once so codes and category names come from the same encoding.
    all_label_cat = all_label.astype('category')
    all_label_code = all_label_cat.cat.codes.tolist()
    origin_label = all_label_code[:n_train]
    test_label = all_label_code[n_train:]
    label_names = list(all_label_cat.cat.categories)

    # TFIDF
    # NOTE(review): the IDF weights are fitted on all rows, including the
    # hold-out rows -- confirm this is intended.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts[:n_train])
    X_predict = tf_transformer.transform(word_counts[n_train:])

    # create classifier
    if METHOD == "MNB":
        clf = sklearn.naive_bayes.MultinomialNB()
    elif METHOD == "SVM":
        clf = svm.LinearSVC()
    else:
        n_neighbors = 11
        weights = 'uniform'  # alternative: 'distance'
        clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors,
                                                     weights=weights)

    # test the classifier
    print(
        colored('Testing classifier with train-test split',
                'magenta',
                attrs=['bold']))
    validation_classifier(X,
                          origin_label,
                          clf,
                          test_size=0.1,
                          y_names=label_names,
                          confusion=False)
    test_classifier(X,
                    origin_label,
                    X_predict,
                    test_label,
                    clf,
                    y_names=label_names,
                    confusion=False)