def _combine_word_probs(probs):
    """Fuse per-word spam probabilities into a single spam probability.

    Mathematically this is ``prod(p) / (prod(p) + prod(1 - p))``.  The
    products are evaluated as a sum of logarithms because multiplying a few
    hundred probabilities directly underflows both products to ``0.0`` and
    the division then raises ``ZeroDivisionError``.

    Args:
        probs: iterable of per-word spam probabilities.

    Returns:
        The combined probability as a float in ``[0, 1]``.  An empty
        iterable yields ``0.5``.  If the evidence is contradictory (at least
        one word at ``<= 0`` and another at ``>= 1``) the result is ``0.5``
        instead of a division-by-zero crash.
    """
    import math

    saw_certain_ham = False
    saw_certain_spam = False
    # Accumulates log(prod(1 - p) / prod(p)), i.e. the log-odds of "ham".
    log_odds_ham = 0.0
    for p in probs:
        if p <= 0.0:
            saw_certain_ham = True
        elif p >= 1.0:
            saw_certain_spam = True
        else:
            log_odds_ham += math.log(1.0 - p) - math.log(p)

    if saw_certain_ham and saw_certain_spam:
        return 0.5
    if saw_certain_ham:
        return 0.0
    if saw_certain_spam:
        return 1.0

    # Numerically stable logistic: only ever exponentiate a non-positive
    # number, so math.exp can underflow to 0.0 but never overflow.
    if log_odds_ham >= 0.0:
        z = math.exp(-log_odds_ham)
        return z / (1.0 + z)
    return 1.0 / (1.0 + math.exp(log_odds_ham))


def filter(ham_word_pro, spam_word_pro, test_file):
    """Classify each e-mail found under *test_file* and print the verdict.

    For every path returned by ``fun(test_file)`` the e-mail is tokenised
    with ``email_parser``, a spam probability is computed for each distinct
    word, the per-word values are combined, and three lines are printed:
    ``<file name> <'spam'|'ham'> <probability>``, the per-word probability
    dict, and a separator line.

    Args:
        ham_word_pro: mapping word -> P(word | ham).
        spam_word_pro: mapping word -> P(word | spam).
        test_file: location handed to ``fun`` to enumerate the test e-mails.

    Raises:
        KeyError: if a word is present in *spam_word_pro* but missing from
            *ham_word_pro* (both tables are indexed with the same word).

    NOTE(review): this function shadows the builtin ``filter``; the name is
    kept because callers depend on it.
    """
    # Class priors and the probability assigned to words with no evidence.
    spam_prob = 0.5
    ham_prob = 0.5
    unseen_word_prob = 0.4

    for test_path in fun(test_file):
        # Paths are split on a backslash, i.e. Windows-style separators.
        file_name = test_path.split('\\')[-1]
        prob_dict = {}
        for word in set(email_parser(test_path)):
            if word not in spam_word_pro:
                prob_dict[word] = unseen_word_prob
                continue
            Pws = spam_word_pro[word]
            Pwh = ham_word_pro[word]
            evidence = Pwh * ham_prob + Pws * spam_prob
            if evidence == 0:
                # Zero likelihood under both classes carries no information;
                # treat it like an unseen word instead of dividing by zero.
                prob_dict[word] = unseen_word_prob
            else:
                prob_dict[word] = spam_prob * (Pws / evidence)

        email_spam_prob = round(_combine_word_probs(prob_dict.values()), 4)
        label = 'spam' if email_spam_prob > 0.5 else 'ham'
        print(file_name, label, email_spam_prob)
        print(prob_dict)
        print('******************************************************')
예제 #2
0
def main():
    """Split the movie data set and write the resulting groups to disk.

    Lists the files under the movie data directory with ``fun``, partitions
    them with ``split_ten`` and hands the partitions plus the output
    directory to ``group_data``.  All three helpers are defined elsewhere.
    """
    source_dir = r'..\data\data_of_movie'
    target_dir = r'..\data\tenTimesTraining'

    movie_files = fun(source_dir)
    partitions = split_ten(movie_files)
    group_data(partitions, target_dir)
def get_word(email_file):
    """Parse every e-mail under *email_file* and collect its words.

    Args:
        email_file: location handed to ``fun`` to enumerate e-mail paths.

    Returns:
        A ``(word_list, vocabulary)`` tuple: ``word_list`` holds one
        ``email_parser`` result per e-mail, in the order ``fun`` produced
        the paths; ``vocabulary`` is the set of all words across e-mails.
    """
    per_email_words = [email_parser(path) for path in fun(email_file)]

    vocabulary = set()
    for words in per_email_words:
        vocabulary.update(words)

    return per_email_words, vocabulary
예제 #4
0
def get_data(data_path):
    """Load every sample file under *data_path* as a ``[label, vector]`` pair.

    The label is the integer value of the first character of the file name
    (the last backslash-separated component of the path).  The vector is the
    file content read with ``np.loadtxt`` and flattened with ``ravel``.

    Args:
        data_path: location handed to ``fwalker.fun`` to enumerate files.

    Returns:
        A list of two-element lists ``[label, vector]``, one per file.
    """
    samples = []
    for sample_path in fwalker.fun(data_path):
        base_name = sample_path.split('\\')[-1]
        label = int(base_name[0])
        vector = np.loadtxt(sample_path).ravel()
        samples.append([label, vector])
    return samples
예제 #5
0
def main():
    """Move ten randomly chosen e-mails into the test directory.

    The files listed by ``fun`` are shuffled in place and the first ten are
    moved into the test directory.  Each moved file is then renamed to
    ``<parent folder>_<file name>`` (the last two backslash-separated path
    components joined by an underscore) and the new path is printed.
    """
    filepath = r'..\email'
    testpath = r'..\test'

    candidates = fun(filepath)
    random.shuffle(candidates)

    for source in candidates[:10]:
        components = source.split('\\')
        final_path = testpath + '\\' + '_'.join(components[-2:])
        # Two steps on purpose: move into the directory under the original
        # name first, then rename to the prefixed name.
        shutil.move(source, testpath)
        os.rename(testpath + '\\' + components[-1], final_path)
        print('%s moved' % final_path)
예제 #6
0
def get_word(email_file):
    """Collect the parsed words of every e-mail under *email_file*.

    Args:
        email_file: location handed to ``fun`` to enumerate e-mail paths.

    Returns:
        A ``(word_list, vocabulary)`` tuple.  ``word_list`` contains the
        ``email_parser`` result for each e-mail, in enumeration order;
        ``vocabulary`` is the set of every word seen in any e-mail.
    """
    parsed_emails = []
    for path in fun(email_file):
        parsed_emails.append(email_parser(path))

    # Union of all per-e-mail word collections; empty input gives set().
    all_words = set().union(*parsed_emails)
    return parsed_emails, all_words
예제 #7
0
def main():
    """Run ``change_data`` over the digit files of the KNN lab.

    The output location is obtained from ``bfile.buildfile`` before the
    input files are listed with ``fwalker.fun``; both results are then
    passed to ``change_data``.
    NOTE(review): both paths are absolute and machine-specific.
    """
    digits_dir = r'D:\DevelopmentLanguage\Python\MachineLearning\KNN\lab3_0930\digits'
    # presumably buildfile prepares/returns the output directory -- confirm
    converted_dir = bfile.buildfile(
        r'D:\DevelopmentLanguage\Python\MachineLearning\KNN\lab3_0930\input_digits')
    digit_files = fwalker.fun(digits_dir)
    change_data(digit_files, converted_dir)