Exemplo n.º 1
0
def spam_rank(word):
	"""Return the spam ranking recorded for ``word``.

	The word is lower-cased before it is looked up in the dictionary
	returned by ``training.get_rank_dict()``. Words that are not in that
	dictionary get the default ranking 0.4.
	"""
	key = word.lower()
	rankings = training.get_rank_dict()

	# Unknown word: fall back to the default ranking.
	if key not in rankings:
		return 0.4
	return rankings[key]
Exemplo n.º 2
0
def spam_detect_email_file(filename, method, threshold, verbose):
	"""Classify the email stored in ``filename`` as spam (1) or not spam (0).

	Every distinct word returned by ``file_handler.parseEmail`` is ranked
	with ``spam_rank``. Up to 15 words are then picked, one at a time, by
	``find_most_interesting_word`` and combined according to ``method``:

	* ``"mean"``: spam when the average ranking is >= ``threshold``.
	* ``"majority"``: spam when more than half of the picked words have a
	  ranking >= ``threshold``.
	* ``"combined"``: spam when ``P / (P + Q)`` is >= ``threshold``, where
	  ``P`` is the product of the rankings and ``Q`` is the product of
	  their complements ``(1 - ranking)``.

	Args:
		filename: Path of the email file. It is read as UTF-8 text and
			undecodable bytes are replaced.
		method: One of ``"mean"``, ``"majority"`` or ``"combined"``.
		threshold: Cut-off compared against rankings / probabilities.
		verbose: When truthy, print each picked word with its ranking
			and, for ``"combined"``, the final verdict.

	Returns:
		1 if the email is classified as spam, otherwise 0. With
		``"mean"`` or ``"majority"`` an email that yields no words is
		reported as not spam.

	Raises:
		SystemExit: Through ``exit()``, after printing a message, when
			``method`` is not one of the supported names.
	"""
	num_of_words_to_grab = 15

	# Map every distinct word of the email to its spam ranking.
	mail_dict = {}
	# Plain text mode already translates newlines; the legacy 'U' flag is
	# rejected by Python 3.11+. The with-block guarantees the file is
	# closed, and the words are consumed inside it in case parseEmail
	# reads the file lazily.
	with open(filename, 'r', encoding='utf-8', errors='replace') as email:
		for word in file_handler.parseEmail(email):
			if word not in mail_dict:
				rank = spam_rank(word)
				mail_dict[word] = rank if rank is not None else 0.4

	# Move the most interesting words, one by one, into interesting_dict.
	# Stop at exactly num_of_words_to_grab words, or earlier when the
	# email has no more words to draw from.
	interesting_dict = {}
	while mail_dict and len(interesting_dict) < num_of_words_to_grab:
		interesting_word, interesting_value = find_most_interesting_word(mail_dict)
		del mail_dict[interesting_word]
		interesting_dict[interesting_word] = interesting_value

		if verbose:
			print(interesting_word, ": ", interesting_value)

	rankings = list(interesting_dict.values())

	if method == "mean":
		# No words means no evidence of spam (and avoids dividing by zero).
		if not rankings:
			return 0
		# Average of the most interesting word rankings.
		prob = sum(rankings) / len(rankings)
		return 1 if prob >= threshold else 0

	elif method == "majority":
		if not rankings:
			return 0
		# Count the picked words whose ranking reaches the threshold.
		count_words_above_threshold = sum(1 for value in rankings if value >= threshold)
		# Spam only when those words form a strict majority.
		return 1 if count_words_above_threshold / len(rankings) > 0.5 else 0

	elif method == "combined":
		prob_product = 1
		one_minus_prob_product = 1

		for value in rankings:
			prob_product *= value
			one_minus_prob_product *= (1 - value)

		denominator = prob_product + one_minus_prob_product
		# Both products are zero when a 0.0 and a 1.0 ranking occur
		# together; treat such contradictory evidence as neutral instead
		# of dividing by zero.
		prob = prob_product / denominator if denominator else 0.5
		is_spam = prob >= threshold

		if verbose:
			print()
			if is_spam:
				print("This message is spam with probability", prob)
			else:
				print("This message is not spam with probability", 1-prob)

		# The verdict is returned whether or not verbose output is on.
		return 1 if is_spam else 0

	else:
		print("Invalid method", method, "used for test_accuracy")
		exit()
Exemplo n.º 3
0
def spam_detect_email(email, method, threshold):
	"""Classify ``email`` as spam (1) or not spam (0).

	Every distinct word returned by ``file_handler.parseEmail(email)`` is
	ranked with ``spam_rank``. Up to 15 words are then picked, one at a
	time, by ``find_most_interesting_word`` and combined according to
	``method``:

	* ``"mean"``: spam when the average ranking is >= ``threshold``.
	* ``"majority"``: spam when more than half of the picked words have a
	  ranking >= ``threshold``.
	* ``"combined"``: spam when ``P / (P + Q)`` is >= ``threshold``, where
	  ``P`` is the product of the rankings and ``Q`` is the product of
	  their complements ``(1 - ranking)``.

	Args:
		email: The email, in whatever form ``file_handler.parseEmail``
			accepts.
		method: One of ``"mean"``, ``"majority"`` or ``"combined"``.
		threshold: Cut-off compared against rankings / probabilities.

	Returns:
		1 if the email is classified as spam, otherwise 0. With
		``"mean"`` or ``"majority"`` an email that yields no words is
		reported as not spam.

	Raises:
		ValueError: If ``method`` is not one of the supported names.
	"""
	num_of_words_to_grab = 15

	# Map every distinct word of the email to its spam ranking.
	mail_dict = {}
	for word in file_handler.parseEmail(email):
		if word not in mail_dict:
			rank = spam_rank(word)
			mail_dict[word] = rank if rank is not None else 0.4

	# Move the most interesting words, one by one, into interesting_dict.
	# Stop at exactly num_of_words_to_grab words, or earlier when the
	# email has no more words to draw from.
	interesting_dict = {}
	while mail_dict and len(interesting_dict) < num_of_words_to_grab:
		interesting_word, interesting_value = find_most_interesting_word(mail_dict)
		del mail_dict[interesting_word]
		interesting_dict[interesting_word] = interesting_value

	rankings = list(interesting_dict.values())

	if method == "mean":
		# No words means no evidence of spam (and avoids dividing by zero).
		if not rankings:
			return 0
		# Average of the most interesting word rankings.
		prob = sum(rankings) / len(rankings)
		return 1 if prob >= threshold else 0

	elif method == "majority":
		if not rankings:
			return 0
		# Count the picked words whose ranking reaches the threshold.
		count_words_above_threshold = sum(1 for value in rankings if value >= threshold)
		# Spam only when those words form a strict majority.
		return 1 if count_words_above_threshold / len(rankings) > 0.5 else 0

	elif method == "combined":
		prob_product = 1
		one_minus_prob_product = 1

		for value in rankings:
			prob_product *= value
			one_minus_prob_product *= (1 - value)

		denominator = prob_product + one_minus_prob_product
		# Both products are zero when a 0.0 and a 1.0 ranking occur
		# together; treat such contradictory evidence as neutral instead
		# of dividing by zero.
		prob = prob_product / denominator if denominator else 0.5
		return 1 if prob >= threshold else 0

	else:
		# Fail loudly instead of silently returning None for a bad method.
		raise ValueError("Invalid method %r; expected 'mean', 'majority' or 'combined'" % (method,))