Пример #1
0
def main(dataset_name, testset_name, new_emails=False):
    '''Run the knn classifier for a training set and a test set.

    Both sets are looked up as directories located next to this script.
    A "results" directory is created inside the test set directory and
    handed to ``knn.classify``. Unless ``new_emails`` is set, the accuracy
    for every k in the k-list is printed and plotted.

    Args:
        dataset_name: name of the training set directory next to this script.
        testset_name: name of the test set directory next to this script.
        new_emails: when true, the test set directory itself is used as the
            only folder (folder name "") and no accuracy is reported.
    '''
    # Build the paths with os.path.join so the separator matches the running
    # platform instead of a hard-coded backslash. The trailing "" keeps the
    # trailing separator on every path, as the helpers received it before.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    trainingset_path = os.path.join(base_dir, dataset_name, "")
    testset_path = os.path.join(base_dir, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")

    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        folder_names = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        folder_names = [
            name for name in next(os.walk(testset_path))[1]
            if name != 'results'
        ]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # klist = [1, 3, 7, 15, 24, 33, 42, 50]
    klist = [1, 3]
    acc = []
    ks = []
    # Passed empty to the loader and read back below, so the loader is
    # expected to fill it in place.
    trainingSet = []

    label = 'New' if new_emails else 'Test'

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + label + ' Emails...')
    testSet, _all_files = ex.loadTestset(testset_path, folder_names, wordsd,
                                         subd, digramsd, trigramsd)
    print(label + ' Emails Collected.')

    assert len(trainingSet[0]) == len(testSet[0]), \
        'training and test rows must have the same length'

    list_of_predictions = knn.classify(klist, trainingSet, testSet,
                                       results_path)

    if not new_emails:
        # list_of_predictions is indexed [test item][position of k in klist];
        # collect one column per k and score it against the test set.
        for i, k in enumerate(klist):
            predictions = [
                list_of_predictions[x][i] for x in range(len(testSet))
            ]
            accuracy = knn.getAccuracy(testSet, predictions)
            acc.append(accuracy)
            ks.append(k)
            print('K: ' + repr(k))
            print('Accuracy: ' + repr(accuracy) + '%')

        print('Overall Accuracy: ' + str(sum(acc) / len(acc)) + "%")
        plt.plot(ks, acc)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)
def main(dataset_name, testset_name, new_emails=False):
    '''Run the mnb classifier and copy each test email to its predicted folder.

    Both sets are looked up as directories located next to this script.
    A "results" directory is created inside the test set directory, with one
    sub-directory per training set folder; every test file is copied into the
    sub-directory selected by its prediction.

    Args:
        dataset_name: name of the training set directory next to this script.
        testset_name: name of the test set directory next to this script.
        new_emails: when true, the test set directory itself is used as the
            only folder (folder name "") and no accuracy is printed.

    Returns:
        The predicted folder name for every prediction, in prediction order.
    '''
    # os.path.join picks the separator of the running platform, so this also
    # works outside Windows/Linux (the previous platform switch left the paths
    # undefined there). The trailing "" keeps the trailing separator.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    trainingset_path = os.path.join(base_dir, dataset_name, "")
    testset_path = os.path.join(base_dir, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")

    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        folder_names = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        folder_names = [
            name for name in next(os.walk(testset_path))[1]
            if name != 'results'
        ]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # Passed empty to the loader and read back below, so the loader is
    # expected to fill it in place.
    trainingSet = []
    predicted_folders = []

    label = 'New' if new_emails else 'Test'

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + label + ' Emails...')
    testSet, all_files = ex.loadTestset(
        testset_path, folder_names, wordsd, subd, digramsd, trigramsd)
    print(label + ' Emails Collected.')

    assert len(trainingSet[0]) == len(testSet[0]), \
        'training and test rows must have the same length'

    # prepare model
    summaries, class_prior_probabilities = mnb.summarizeByClass(trainingSet)

    # test model
    predictions = mnb.getPredictions(
        summaries, class_prior_probabilities, testSet, results_path)

    # Each prediction is used as an index into the training set's folder list.
    # NOTE(review): os.walk does not promise a sorted order; this assumes the
    # same order the class indices were derived from - confirm.
    class_folders = [
        name for name in next(os.walk(trainingset_path))[1]
        if name != 'results'
    ]
    for fname in class_folders:
        target_dir = os.path.join(results_path, fname)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

    for i, predicted in enumerate(predictions):
        folder = class_folders[predicted]
        shutil.copy2(all_files[i], os.path.join(results_path, folder))
        predicted_folders.append(folder)

    if not new_emails:
        accuracy = mnb.getAccuracy(testSet, predictions)
        print('Accuracy: {0}%'.format(accuracy))

    print('Find the results at: ' + results_path)
    return predicted_folders
def main(dataset_name, testset_name, new_emails=False):
    '''Run the mnb classifier for a training set and a test set.

    Both sets are looked up as directories located next to this script.
    A "results" directory is created inside the test set directory, with one
    sub-directory per training set folder; every test file is copied into the
    sub-directory selected by its prediction.

    Args:
        dataset_name: name of the training set directory next to this script.
        testset_name: name of the test set directory next to this script.
        new_emails: when true, the test set directory itself is used as the
            only folder (folder name "") and no accuracy is printed.

    Returns:
        The predicted folder name for every prediction, in prediction order.
    '''
    # Build the paths with os.path.join so the separator matches the running
    # platform instead of a hard-coded backslash. The trailing "" keeps the
    # trailing separator on every path, as the helpers received it before.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    trainingset_path = os.path.join(base_dir, dataset_name, "")
    testset_path = os.path.join(base_dir, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")

    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        folder_names = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        folder_names = [
            name for name in next(os.walk(testset_path))[1]
            if name != 'results'
        ]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # Passed empty to the loader and read back below, so the loader is
    # expected to fill it in place.
    trainingSet = []
    predicted_folders = []

    label = 'New' if new_emails else 'Test'

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + label + ' Emails...')
    testSet, all_files = ex.loadTestset(
        testset_path, folder_names, wordsd, subd, digramsd, trigramsd)
    print(label + ' Emails Collected.')

    assert len(trainingSet[0]) == len(testSet[0]), \
        'training and test rows must have the same length'

    # prepare model
    summaries, classpriorprobabilities = mnb.summarizeByClass(trainingSet)

    # test model
    predictions = mnb.getPredictions(
        summaries, classpriorprobabilities, testSet, results_path)

    # Each prediction is used as an index into the training set's folder list.
    # NOTE(review): os.walk does not promise a sorted order; this assumes the
    # same order the class indices were derived from - confirm.
    class_folders = [
        name for name in next(os.walk(trainingset_path))[1]
        if name != 'results'
    ]
    for fname in class_folders:
        target_dir = os.path.join(results_path, fname)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

    for i, predicted in enumerate(predictions):
        folder = class_folders[predicted]
        shutil.copy2(all_files[i], os.path.join(results_path, folder))
        predicted_folders.append(folder)

    if not new_emails:
        # Accuracy is only reported for test mails, not for new mails.
        accuracy = mnb.getAccuracy(testSet, predictions)
        print('Accuracy: {0}%'.format(accuracy))

    print('Find the results at: ' + results_path)
    return predicted_folders
Пример #4
0
def main(dataset_name, testset_name, new_emails=False):
    '''Run the knn classifier for a training set and a test set.

    Both sets are looked up as directories located next to this script.
    A "results" directory is created inside the test set directory and
    handed to ``knn.classify``. Unless ``new_emails`` is set, the accuracy
    for every k in the k-list is printed and plotted.

    Args:
        dataset_name: name of the training set directory next to this script.
        testset_name: name of the test set directory next to this script.
        new_emails: when true, the test set directory itself is used as the
            only folder (folder name "") and no accuracy is reported.
    '''
    # Build the paths with os.path.join so the separator matches the running
    # platform instead of a hard-coded backslash. The trailing "" keeps the
    # trailing separator on every path, as the helpers received it before.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    trainingset_path = os.path.join(base_dir, dataset_name, "")
    testset_path = os.path.join(base_dir, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")

    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        folder_names = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        folder_names = [
            name for name in next(os.walk(testset_path))[1]
            if name != 'results'
        ]

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    # klist = [1, 3, 7, 15, 24, 33, 42, 50]
    klist = [1, 3]
    acc = []
    ks = []
    # Passed empty to the loader and read back below, so the loader is
    # expected to fill it in place.
    trainingSet = []

    label = 'New' if new_emails else 'Test'

    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + label + ' Emails...')
    testSet, _all_files = ex.loadTestset(
        testset_path, folder_names, wordsd, subd, digramsd, trigramsd)
    print(label + ' Emails Collected.')

    assert len(trainingSet[0]) == len(testSet[0]), \
        'training and test rows must have the same length'

    list_of_predictions = knn.classify(
        klist, trainingSet, testSet, results_path)

    if not new_emails:
        # list_of_predictions is indexed [test item][position of k in klist];
        # collect one column per k and score it against the test set.
        for i, k in enumerate(klist):
            predictions = [
                list_of_predictions[x][i] for x in range(len(testSet))
            ]
            accuracy = knn.getAccuracy(testSet, predictions)
            acc.append(accuracy)
            ks.append(k)
            print('K: ' + repr(k))
            print('Accuracy: ' + repr(accuracy) + '%')

        print('Overall Accuracy: ' + str(sum(acc) / len(acc)) + "%")
        plt.plot(ks, acc)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)