Пример #1
0
def testModelBySame(blockwordsList, blockfromList):
    """Classify pending mails with allow/block lists, then the saved model.

    Mails whose ``type`` is 2 are treated as pending. Each one is first
    checked against the sender whitelist, the sender blocklist and the
    blocked-word list; whatever is still undecided afterwards is handed to
    the persisted transformer -> SVD -> classifier pipeline.

    Decision rules, in order:
      1. any sender address in the whitelist      -> blocked = 0, type = 0
      2. otherwise any address in the blocklist   -> blocked = 1, type = 1
      3. otherwise                                -> blocked = 2 (undecided)
      4. a blocked word in the title or content   -> blocked = 1, type = 1
         (this overrides the result of rules 1-3)

    Args:
        blockwordsList: Not used; the word blocklist is fetched through
            ``getBlockedWords()``. Kept for caller compatibility.
        blockfromList: Not used; the sender blocklist is fetched through
            ``getBlockedWords()``. Kept for caller compatibility.

    Returns:
        A tuple ``(dfWhitelist, dfSafe, dfLeft, dfBlocked)`` of DataFrames:
        whitelisted pending mails, mails that already had ``type == 0``,
        model-classified mails (``type`` holds the prediction) and blocked
        pending mails.
    """
    words_blocklist, from_blocklist = getBlockedWords()
    white_list = getWhitelist()
    df = getMails()

    # Work on a copy: the rows are modified below and must not write through
    # to (or raise chained-assignment warnings about) the frame from getMails().
    dfForEvaluate = df[df['type'] == 2].copy()
    # Original author's note: mails with attachments are treated as normal
    # mail by default (type 0).
    dfSafe = df[df['type'] == 0]

    senders = dfForEvaluate["from"].astype("str")
    titles = dfForEvaluate["title"].astype("str")
    contents = dfForEvaluate["content"].astype("str")

    # Decisions are collected positionally and written back as a whole column.
    # The filtered frame keeps the index labels of ``df``, so a 0-based counter
    # must not be used as a row label.
    decisions = []
    for sender, title, content in zip(senders, titles, contents):
        addresses = sender.strip().split(",")
        if any(address in white_list for address in addresses):
            decision = 0
        elif any(address in from_blocklist for address in addresses):
            decision = 1
        else:
            decision = 2
        # A blocked word in the title or the content always marks the mail
        # as spam, regardless of the sender check above.
        if any(word in title or word in content for word in words_blocklist):
            decision = 1
        decisions.append(decision)

    dfForEvaluate['blocked'] = decisions
    # Every pending row started with type 2, so the decided type is identical
    # to the decision code (0 normal, 1 spam, 2 still pending).
    dfForEvaluate['type'] = decisions

    dfBlocked = dfForEvaluate[dfForEvaluate['blocked'] == 1]  # blocked mails
    dfWhitelist = dfForEvaluate[dfForEvaluate['blocked'] == 0]  # whitelisted mails
    # Copy because the predicted type is assigned to this frame below.
    dfLeft = dfForEvaluate[dfForEvaluate['blocked'] == 2].copy()

    # Nothing left for the model: skip the pipeline, which cannot process an
    # empty batch.
    if dfLeft.empty:
        return dfWhitelist, dfSafe, dfLeft, dfBlocked

    # Load the persisted pipeline stages used for prediction.
    transformer_model = joblib.load("../data/result_save_TFM_try")
    svd_model = joblib.load("../data/result_save_SVDM_try")
    model = joblib.load("../data/result_save_AdaBoost_try")

    jieba_cut_content = [
        jiebaclearText(line) for line in dfLeft["content"].astype("str")
    ]
    data_test = pd.DataFrame(
        svd_model.transform(transformer_model.transform(jieba_cut_content)))
    # Store the predictions as plain Python ints.
    dfLeft['type'] = [int(label) for label in model.predict(data_test)]
    return dfWhitelist, dfSafe, dfLeft, dfBlocked
Пример #2
0
def testModelBySame(testFilePath):
    """Label every mail in a CSV file with the saved classification pipeline.

    Args:
        testFilePath: Path of a UTF-8, comma-separated file. Column names are
            supplied here (frome, to, title, content, classes), so the first
            line of the file is read as data.

    Returns:
        The loaded DataFrame with its ``classes`` column replaced by the
        model's predictions converted to ``int``.
    """
    # Persisted pipeline stages, loaded in the order they are applied.
    transformer = joblib.load("../data/result_save_TFM_try")
    svd = joblib.load("../data/result_save_SVDM_try")
    classifier = joblib.load("../data/result_save_AdaBoost_try")

    column_names = ['frome', 'to', 'title', 'content', 'classes']
    df = pd.read_csv(
        testFilePath,
        names=column_names,
        encoding="utf-8",
        sep=",",
    )

    # Clean each mail body with the project's text-cleaning helper.
    cleaned_bodies = []
    for body in df["content"].astype("str"):
        cleaned_bodies.append(jiebaclearText(body))

    features = transformer.transform(cleaned_bodies)
    data_test = pd.DataFrame(svd.transform(features))

    predicted = classifier.predict(data_test)
    df['classes'] = [int(label) for label in predicted]
    return df
Пример #3
0
def process_file(file_path):
    """Flatten one parsed mail file into a single comma-separated string.

    The result has the form ``<from>,<to>,<data>,<cleaned content>``.

    Args:
        file_path: Path handed to ``read_file``, whose result is used as a
            mapping of mail fields.

    Returns:
        The joined string. A field missing from the mapping is replaced by
        the placeholder "unkown".
    """
    fields = read_file(file_path)

    # The output uses "," as its separator, so commas are removed from each
    # field value first.
    # NOTE(review): the third key is spelled "data"; confirm against read_file
    # whether "date" was intended.
    header_parts = [
        fields.get(key, "unkown").replace(",", "").strip()
        for key in ("from", "to", "data")
    ]

    # Remove ",", "+" and "_" from the body, trimming after every removal.
    body = fields.get("content", "unkown")
    for unwanted in (",", "+", "_"):
        body = body.replace(unwanted, "").strip()

    return ",".join(header_parts) + "," + jiebaclearText(body)
Пример #4
0
def testModelBySame(testFilePath):
    """Predict a class for each mail stored in a CSV file.

    Args:
        testFilePath: Location of a UTF-8, comma-separated file. The columns
            are named here as frome, to, title, content and classes, so the
            file's first line is treated as a data row.

    Returns:
        The DataFrame read from the file, with ``classes`` overwritten by the
        integer predictions of the saved model.
    """
    # Restore the persisted pipeline stages.
    stage_paths = (
        "../data/result_save_TFM_try",
        "../data/result_save_SVDM_try",
        "../data/result_save_AdaBoost_try",
    )
    transformer_model, svd_model, model = (
        joblib.load(path) for path in stage_paths
    )

    # Read the mails and clean their content column.
    df = pd.read_csv(
        testFilePath,
        names=['frome', 'to', 'title', 'content', 'classes'],
        encoding="utf-8",
        sep=",",
    )
    raw_contents = df["content"].astype("str")
    cleaned_contents = [jiebaclearText(text) for text in raw_contents]

    # Transform the cleaned text and run the classifier.
    reduced = svd_model.transform(transformer_model.transform(cleaned_contents))
    y_predict = model.predict(pd.DataFrame(reduced))

    # Store the predictions as plain ints.
    df['classes'] = list(map(int, y_predict))
    return df
Пример #5
0
def testModelBySame(blockwordsList, blockfromList):
    """Decide for every pending mail whether it is spam.

    Pending mails are the rows of ``getMails()`` whose ``type`` is 2. They
    are screened with the sender whitelist, the sender blocklist and the
    blocked-word list; the mails that remain undecided are classified by the
    persisted transformer -> SVD -> classifier pipeline. The list of model
    predictions is printed before returning.

    Screening rules, in order:
      1. any sender address in the whitelist      -> blocked = 0, type = 0
      2. otherwise any address in the blocklist   -> blocked = 1, type = 1
      3. otherwise                                -> blocked = 2 (undecided)
      4. a blocked word in the title or content   -> blocked = 1, type = 1
         (applied last, so it overrides rules 1-3)

    Args:
        blockwordsList: Not used; the blocked words come from
            ``getBlockedWords()``. Kept so existing callers keep working.
        blockfromList: Not used; the blocked senders come from
            ``getBlockedWords()``. Kept so existing callers keep working.

    Returns:
        A tuple ``(dfWhitelist, dfSafe, dfLeft, dfBlocked)`` of DataFrames:
        whitelisted pending mails, mails that already had ``type == 0``,
        model-classified mails (``type`` holds the prediction) and blocked
        pending mails.
    """
    words_blocklist, from_blocklist = getBlockedWords()
    white_list = getWhitelist()
    df = getMails()

    # Copy the slice: it is modified below, and those writes must neither leak
    # into ``df`` nor trigger chained-assignment warnings.
    dfForEvaluate = df[df['type'] == 2].copy()
    dfSafe = df[df['type'] == 0]

    # One status per pending row, gathered by position. The slice still carries
    # the index labels of ``df``, so a running counter is not a valid row label.
    statuses = []
    rows = zip(
        dfForEvaluate["from"].astype("str"),
        dfForEvaluate["title"].astype("str"),
        dfForEvaluate["content"].astype("str"),
    )
    for sender, title, content in rows:
        addresses = sender.strip().split(",")
        # A whitelist hit on any address wins over a blocklist hit; a later
        # unknown address must not undo an earlier match.
        if any(address in white_list for address in addresses):
            status = 0
        elif any(address in from_blocklist for address in addresses):
            status = 1
        else:
            status = 2
        # Blocked words in the title or content force the spam verdict.
        if any(word in title or word in content for word in words_blocklist):
            status = 1
        statuses.append(status)

    dfForEvaluate['blocked'] = statuses
    # All pending rows had type 2, so the resulting type equals the status
    # (0 normal, 1 spam, 2 still pending).
    dfForEvaluate['type'] = statuses

    dfBlocked = dfForEvaluate[dfForEvaluate['blocked'] == 1]
    dfWhitelist = dfForEvaluate[dfForEvaluate['blocked'] == 0]
    # Copied because the predictions are written into it below.
    dfLeft = dfForEvaluate[dfForEvaluate['blocked'] == 2].copy()

    resultList = []
    # Only run the model when something is left; the pipeline cannot process
    # an empty batch.
    if not dfLeft.empty:
        transformer_model = joblib.load("../data/result_save_TFM_try")
        svd_model = joblib.load("../data/result_save_SVDM_try")
        model = joblib.load("../data/result_save_AdaBoost_try")

        jieba_cut_content = [
            jiebaclearText(line) for line in dfLeft["content"].astype("str")
        ]
        data_test = pd.DataFrame(
            svd_model.transform(transformer_model.transform(jieba_cut_content)))

        # Store the predictions as plain Python ints.
        resultList = [int(label) for label in model.predict(data_test)]
        dfLeft['type'] = resultList

    print(resultList)
    return dfWhitelist, dfSafe, dfLeft, dfBlocked