Пример #1
0
def NB(path):
    """Compare BOW, TF and TF-IDF features with Multinomial Naive Bayes.

    The dataset under ``path`` is reorganized and filtered on disk, loaded
    with ``sklearn.datasets.load_files`` and converted into three feature
    matrices. Each matrix is evaluated with ``split_test_classifier`` and the
    three result series are drawn on a single plot.

    Args:
        path: Root directory of the dataset, in the layout expected by
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: Naive Bayes")
    print("Train-Test Split")

    # preprocess the dataset on disk
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # The return value is ignored, so this presumably strips unwanted text
    # from the e-mails in place -- confirm against util.refine_all_emails.
    util.refine_all_emails(files.data)

    # feature extraction
    BOW = util.bagOfWords(files.data)
    transformer = sklearn.feature_extraction.text.TfidfTransformer
    # TF: term frequencies only (no inverse-document-frequency weighting).
    TF = transformer(use_idf=False).fit_transform(BOW)
    # TF-IDF: IDF weighting must be enabled, otherwise this matrix is just a
    # second copy of TF and the comparison below is meaningless.
    TFIDF = transformer(use_idf=True).fit_transform(BOW)

    # build classifier
    clf = sklearn.naive_bayes.MultinomialNB()

    # calculate results; the first element of the last returned pair is what
    # gets forwarded to plot_results, exactly as before.
    results = []
    for features in (BOW, TF, TFIDF):
        i, scores = split_test_classifier(clf, features, files.target)
        results.append(scores)

    # plot
    plot_results(i, results, ['BOW', 'TF', 'TFIDF'])
Пример #2
0
def main_test(path=None):
    """Evaluate Naive Bayes, linear SVM and KNN on TF-IDF features.

    Loads the dataset, cleans the e-mails, builds a TF-IDF matrix and runs
    ``test_classifier`` once per classifier and per configured test size.

    Args:
        path: Dataset root directory. Falls back to ``'dataset'`` when it is
            ``None`` or empty, so the default call does not hand ``None`` to
            the file-system helpers.
    """
    dir_path = path or 'dataset'

    def announce(text, color):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(colored(text, color, attrs=['bold']))

    remove_incompatible_files(dir_path)

    print('\n\n')

    # load data
    announce('Loading files into memory', 'green')
    files = sklearn.datasets.load_files(dir_path)

    # The return value is ignored, so the e-mails are presumably refined in
    # place -- confirm against util.refine_all_emails.
    announce('Refining all files', 'green')
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    announce('Calculating BOW', 'green')
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    announce('Calculating TFIDF', 'green')
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True)
    X = tf_transformer.fit(word_counts).transform(word_counts)

    # Values passed as the `test` argument of test_classifier; 0.2 means 80%
    # training data and 20% test data.
    test_size = [0.2]

    # Classifiers are evaluated in this order, each under its own heading.
    experiments = [
        ('TFIDF with Naive Bayes',
         sklearn.naive_bayes.MultinomialNB()),
        ('TFIDF with Support Vector Machine',
         sklearn.svm.LinearSVC()),
        ('TFIDF with K-Nearest Neighbours',
         sklearn.neighbors.KNeighborsClassifier(11, weights='distance')),
    ]

    for title, clf in experiments:
        print('\n\n')
        announce(title, 'red')
        for test in test_size:
            test_classifier(X, files.target, clf, test,
                            y_names=files.target_names, confusion=False)
Пример #3
0
def KNN_parameter(path):
    """Compare KNN weighting schemes on TF-IDF features via k-fold testing.

    The dataset under ``path`` is preprocessed on disk, loaded, converted to
    a TF-IDF matrix and handed to ``KFOLD_KNN_parameter_test``; the two result
    series it returns are plotted as 'uniform' and 'weighted'.

    Args:
        path: Root directory of the dataset, in the layout expected by
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess the dataset on disk
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Return value unused: presumably cleans the e-mails in place (confirm
    # against util.refine_all_emails).
    util.refine_all_emails(files.data)

    # feature extraction: bag of words, then TF-IDF weighting.
    BOW = util.bagOfWords(files.data)
    # use_idf must be True here; with use_idf=False the matrix would only hold
    # normalized term frequencies despite being labelled TF-IDF.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True)
    TFIDF = tfidf_transformer.fit_transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
Пример #4
0
def KNN_parameter(path):
    """Run the k-fold KNN parameter test on TF-IDF features and plot it.

    Steps: preprocess the dataset on disk, load it, refine the e-mails,
    build bag-of-words counts, weight them with TF-IDF, call
    ``KFOLD_KNN_parameter_test`` and plot its 'uniform' and 'weighted'
    result series.

    Args:
        path: Root directory of the dataset, in the layout expected by
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails; the result is not captured, so the list is presumably
    # modified in place (verify in util.refine_all_emails).
    util.refine_all_emails(files.data)

    # feature extraction
    BOW = util.bagOfWords(files.data)
    # IDF weighting enabled: the features are named and plotted as TF-IDF, so
    # plain term frequencies (use_idf=False) would be mislabelled.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
Пример #5
0
def main_test(path=None, feature_rep='tfidf', classification_algorithm='knn'):
    """Train and test one classifier on one feature representation.

    Args:
        path: Dataset root directory; ``'dataset'`` is used when it is
            ``None`` or empty.
        feature_rep: ``'bow'`` to use raw word counts; any other value
            selects TF-IDF weighted counts.
        classification_algorithm: ``'nb'`` (Multinomial Naive Bayes),
            ``'svm'`` (LinearSVC) or ``'knn'`` (11 neighbours, distance
            weighted).

    Raises:
        ValueError: If ``classification_algorithm`` is not one of the
            supported names.
    """
    dir_path = path or 'dataset'

    # Build the classifier first so that an unsupported name fails
    # immediately instead of surfacing as an unbound `clf` after the whole
    # dataset has been loaded and vectorized.
    if classification_algorithm == 'nb':
        clf = sklearn.naive_bayes.MultinomialNB()
    elif classification_algorithm == 'svm':
        clf = sklearn.svm.LinearSVC()
    elif classification_algorithm == 'knn':
        clf = sklearn.neighbors.KNeighborsClassifier(11, weights='distance')
    else:
        raise ValueError(
            "classification_algorithm must be 'nb', 'svm' or 'knn', got %r"
            % (classification_algorithm,))

    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # calculate the BOW representation; this variant of util.bagOfWords is
    # unpacked into a count matrix and a vocabulary sequence.
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts, vocab_list = util.bagOfWords(files.data)

    # Diagnostic: number of vocabulary entries of length one.
    count = sum(1 for word in vocab_list if len(word) == 1)
    print("len ==1 words: %d" % count)

    if feature_rep == 'bow':
        X = word_counts
    else:
        # TF-IDF is only computed when it is actually the requested input.
        print(colored('Calculating TFIDF', 'green', attrs=['bold']))
        tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
            use_idf=True)
        X = tf_transformer.fit_transform(word_counts)
    print(X.shape)

    print('\n\n')

    # test the classifier
    print('\n\n')
    print("%s %s" % (feature_rep, classification_algorithm))
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.1,
                    y_names=files.target_names, confusion=False)
Пример #6
0
def main_test(path=None):
    """Evaluate a distance-weighted 11-NN classifier on TF-IDF features.

    Args:
        path: Dataset root directory; ``'dataset'`` is used when it is
            ``None`` or empty.
    """
    def banner(message, color='green'):
        # Bold, coloured progress line.
        print(colored(message, color, attrs=['bold']))

    blank = '\n\n'
    dataset_root = 'dataset' if not path else path

    remove_incompatible_files(dataset_root)
    print(blank)

    banner('Loading files into memory')
    corpus = sklearn.datasets.load_files(dataset_root)

    # Result is not captured, so the documents are presumably refined in
    # place -- confirm against util.refine_all_emails.
    banner('Refining all files')
    util.refine_all_emails(corpus.data)

    banner('Calculating BOW')
    counts = util.bagOfWords(corpus.data)

    banner('Calculating TFIDF')
    weighting = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True)
    weighting = weighting.fit(counts)
    features = weighting.transform(counts)
    print(blank)

    # Alternatives that were tried here: MultinomialNB, LinearSVC.
    knn = sklearn.neighbors.KNeighborsClassifier(11, weights='distance')

    print(blank)
    banner('Testing classifier with train-test split', 'magenta')
    test_classifier(
        features,
        corpus.target,
        knn,
        test_size=0.2,
        y_names=corpus.target_names,
        confusion=False,
    )
Пример #7
0
def KNN(path):
    """Compare BOW, TF and TF-IDF features with a K-Nearest-Neighbours model.

    The dataset under ``path`` is preprocessed on disk, loaded and converted
    into three feature matrices. Each matrix is evaluated with
    ``split_test_classifier`` using a 5-neighbour, distance-weighted
    classifier, and the three result series are plotted together.

    Args:
        path: Root directory of the dataset, in the layout expected by
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess the dataset on disk
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Return value unused: presumably strips unwanted text in place (confirm
    # against util.refine_all_emails).
    util.refine_all_emails(files.data)

    # feature extraction
    BOW = util.bagOfWords(files.data)
    transformer = sklearn.feature_extraction.text.TfidfTransformer
    # TF: normalized term frequencies, no IDF weighting.
    TF = transformer(use_idf=False).fit_transform(BOW)
    # TF-IDF: IDF weighting switched on; with use_idf=False this would be an
    # exact duplicate of TF and the plot would compare TF with itself.
    TFIDF = transformer(use_idf=True).fit_transform(BOW)

    # build classifier ('uniform' is the other weighting option)
    n_neighbors = 5
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results; the value forwarded to plot_results is the first
    # element returned by the last evaluation, as before.
    results = []
    for features in (BOW, TF, TFIDF):
        i, scores = split_test_classifier(clf, features, files.target)
        results.append(scores)

    # plot
    plot_results(i, results, ['BOW', 'TF', 'TFIDF'])
def main_test(path=None):
    """Test a distance-weighted 11-NN classifier on TF-IDF features.

    The dataset is filtered on disk, loaded, refined, vectorized into
    bag-of-words counts, TF-IDF weighted and finally handed to
    ``test_classifier`` with a test size of 0.2.

    Args:
        path: Dataset root directory; ``'dataset'`` is used when it is
            ``None`` or empty.
    """
    dir_path = path or 'dataset'

    remove_incompatible_files(dir_path)

    # Every print below takes exactly one argument, so the parenthesized form
    # produces the same output on Python 2 and Python 3.
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails (result not captured; presumably done in place --
    # confirm against util.refine_all_emails)
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    print('\n\n')

    # create classifier (MultinomialNB and LinearSVC were the alternatives
    # tried here; 'uniform' is the other KNN weighting option)
    n_neighbors = 11
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # test the classifier
    print('\n\n')
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.2,
                    y_names=files.target_names, confusion=False)
Пример #9
0
def SVM(path):
    """Compare BOW, TF and TF-IDF features with a linear SVM.

    The dataset under ``path`` is preprocessed on disk, loaded and converted
    into three feature matrices. Each matrix is evaluated with
    ``split_test_classifier`` using ``LinearSVC`` and the three result series
    are plotted together.

    Args:
        path: Root directory of the dataset, in the layout expected by
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: Support Vector Machine")
    print("Train-Test Split")

    # preprocess the dataset on disk
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Return value unused: presumably strips unwanted text in place (confirm
    # against util.refine_all_emails).
    util.refine_all_emails(files.data)

    # feature extraction
    BOW = util.bagOfWords(files.data)
    # TF: normalized term frequencies, no IDF weighting.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False)
    TF = tf_transformer.fit_transform(BOW)
    # TF-IDF: IDF weighting enabled. The previous use_idf=False made this
    # matrix identical to TF, so the 'TFIDF' curve carried no information.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True)
    TFIDF = tfidf_transformer.fit_transform(BOW)

    # build classifier
    clf = sklearn.svm.LinearSVC()

    # calculate results
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
Пример #10
0
def KNN(path):
    """Evaluate K-Nearest-Neighbours on BOW, TF and TF-IDF features.

    Preprocesses and loads the dataset at ``path``, derives the three feature
    matrices, runs ``split_test_classifier`` on each with a 5-neighbour,
    distance-weighted classifier and plots the three result series.

    Args:
        path: Root directory of the dataset, in the layout expected by
            ``sklearn.datasets.load_files``.
    """
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails; the result is not captured, so the list is presumably
    # modified in place (verify in util.refine_all_emails).
    util.refine_all_emails(files.data)

    # feature extraction
    BOW = util.bagOfWords(files.data)
    # TF: term frequencies without IDF weighting.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)
    # TF-IDF: needs use_idf=True; otherwise it is a byte-for-byte repeat of
    # the TF transformer above.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier ('uniform' is the other weighting option)
    clf = sklearn.neighbors.KNeighborsClassifier(5, weights='distance')

    # calculate results
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
Пример #11
0
def _read_csv_warn_bad_lines(csv_path):
    """Read a comma-separated file, skipping malformed rows with a warning.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of ``on_bad_lines``. The current keyword is tried first and the
    legacy one is used as a fallback so the script runs on both.
    """
    try:
        return pd.read_csv(csv_path, sep=",", on_bad_lines="warn")
    except TypeError:
        # Older pandas rejects the unknown `on_bad_lines` keyword.
        return pd.read_csv(csv_path, sep=",", error_bad_lines=False)


def main():
    """Train on purchase-card descriptions and test on labelled data.

    A 5% sample (with replacement) of the purchase-card file is combined
    with the first part of a split of the labelled file to form the
    fitting/validation set; the second part of that split is held out. Texts
    are normalized, vectorized with bag-of-words and TF-IDF, and the
    classifier chosen by the module-level ``METHOD`` is passed to
    ``validation_classifier`` and ``test_classifier``.
    """
    origin_data = _read_csv_warn_bad_lines(
        './assets/res_purchase_card_cleaned.csv')
    predict_data = _read_csv_warn_bad_lines(
        './assets/out_wei_labelled_full.csv')

    origin_items = pd.DataFrame(
        origin_data, columns=['Description', 'Vendor', 'category_draft_1'])
    origin_items = origin_items.sample(frac=0.05,
                                       replace=True,
                                       random_state=4252)
    origin_text = origin_items["Description"] + " " + origin_items["Vendor"]
    print("Size of train data(m):\n", origin_text.shape[0])

    test_items = pd.DataFrame(predict_data,
                              columns=['description', 'category'])

    a_test_items, b_test_items = train_test_split(test_items, test_size=0.2)
    a_test_text = a_test_items["description"]
    b_test_text = b_test_items["description"]

    # Rows [0, n_fit) are used for fitting/validation, rows [n_fit, end) are
    # the held-out part; texts and labels below are concatenated in the same
    # order so this single offset splits both.
    n_fit = len(origin_text) + len(a_test_text)

    all_title = (origin_text.tolist() + a_test_text.tolist() +
                 b_test_text.tolist())
    # Replace every run of non-alphanumeric characters (whitespace included)
    # by one space; no run of spaces can remain, so a separate
    # whitespace-collapsing pass is unnecessary.
    non_alnum_run = re.compile(r'[^a-zA-Z0-9]+')
    all_title = [non_alnum_run.sub(' ', s) for s in all_title]
    word_counts = util.bagOfWords(all_title)

    # pd.concat replaces the chained Series.append calls (removed in
    # pandas 2.0) and keeps the same order.
    all_label = pd.concat([
        origin_items["category_draft_1"],
        a_test_items["category"],
        b_test_items["category"],
    ])
    label_categories = all_label.astype('category')
    all_label_code = label_categories.cat.codes.tolist()
    origin_label = all_label_code[:n_fit]
    test_label = all_label_code[n_fit:]
    # Category names in code order (code k maps to label_names[k]).
    label_names = list(label_categories.cat.categories)

    # TFIDF: IDF statistics are fitted on all rows, then each part is
    # transformed separately.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts[:n_fit])
    X_predict = tf_transformer.transform(word_counts[n_fit:])

    # create classifier
    if METHOD == "MNB":
        clf = sklearn.naive_bayes.MultinomialNB()
    elif METHOD == "SVM":
        clf = svm.LinearSVC()
    else:
        # Any other METHOD value selects KNN ('distance' is the other
        # weighting option).
        clf = sklearn.neighbors.KNeighborsClassifier(11, weights='uniform')

    # test the classifier
    print(
        colored('Testing classifier with train-test split',
                'magenta',
                attrs=['bold']))
    validation_classifier(X,
                          origin_label,
                          clf,
                          test_size=0.1,
                          y_names=label_names,
                          confusion=False)
    test_classifier(X,
                    origin_label,
                    X_predict,
                    test_label,
                    clf,
                    y_names=label_names,
                    confusion=False)