def return_essay_object(file_name):
    """
    Read every essay in the training set, wrap each one in an Essay object,
    and stream the objects to a pickle file.
    :param file_name: path to the source training set
    :return: None; the objects are written to ./rdata/object_essay_wanzheng10.pkl
    """
    lemmatizer = WordNetLemmatizer()  # lemmatizer (word-form restoration)
    with open(file_name, 'r', errors='ignore') as f:
        line = f.readline()
        i = 0
        with open(r'./rdata/object_essay_wanzheng10.pkl',
                  'wb') as f_pkl_object:
            while line:
                paragraphs = []
                list_essay_info = line.split('\t')
                essay_extracted = (json.loads(json.loads(list_essay_info[0]))
                                   )  # essay text
                essay_extracted = cleanText(essay_extracted)  # clean the text
                for paragraph in essay_extracted.split('\n'):  # collect paragraphs
                    paragraphed = paragraph.strip()
                    # TODO: blank paragraphs are not filtered out yet, so empty
                    # strings may still be appended here.
                    paragraphs.append(paragraphed)
                essay_object = essay.Essay()
                essay_object.essay_str = essay_extracted  # cleaned essay text
                essay_object.paragraphs = paragraphs  # list of paragraphs
                essay_object.sentences = nltk.sent_tokenize(
                    essay_extracted)  # list of sentences
                essay_object.tokens = nltk.word_tokenize(
                    essay_extracted)  # list of tokens
                essay_object.tokens_pos_lemma = parse_token_pos_lemma(
                    essay_object, lemmatizer
                )  # per token: [0, 1, 2] --> [token, pos, lemma]
                essay_object.paragraphs_num = len(paragraphs)  # paragraph count
                essay_object.essay_words_num = len(
                    essay_object.tokens)  # essay length in tokens

                pickle.dump(essay_object, f_pkl_object)
                i += 1
                print("essay object --{}-- written".format(i))
                line = f.readline()
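

# Usage sketch (added for illustration, not part of the original source):
# return_essay_object writes one pickle.dump per essay into a single file, so
# reading the objects back requires repeated pickle.load calls until EOFError.
# The helper name below is hypothetical.
def load_essay_objects(pkl_path=r'./rdata/object_essay_wanzheng10.pkl'):
    essay_objects = []
    with open(pkl_path, 'rb') as f_pkl:
        while True:
            try:
                essay_objects.append(pickle.load(f_pkl))  # one Essay per dump
            except EOFError:  # end of the pickle stream
                break
    return essay_objects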
def parsing_web_data(essay_text):
    essay_text_cleaned = cleanText(essay_text)   # clean the raw text
    essay_object = essay.Essay()  # create the Essay object
    f = Features()  # create the feature extractor
    lemmatizer = WordNetLemmatizer()  # lemmatizer (word-form restoration)

    paragraphs = []
    for paragraph in essay_text_cleaned.split('\n'):  # collect paragraphs
        paragraphed = paragraph.strip()
        paragraphs.append(paragraphed)

    essay_object.essay_str = essay_text_cleaned  # cleaned essay text
    essay_object.paragraphs = paragraphs  # list of paragraphs
    essay_object.sentences = nltk.sent_tokenize(essay_text_cleaned)  # list of sentences
    essay_object.tokens = nltk.word_tokenize(essay_text_cleaned)  # list of tokens
    essay_object.tokens_pos_lemma = parse_token_pos_lemma(essay_object, lemmatizer)  # per token: [0, 1, 2] --> [token, pos, lemma]
    essay_object.paragraphs_num = len(paragraphs)  # paragraph count
    essay_object.essay_words_num = len(essay_object.tokens)  # essay length in tokens

    return f.returnFeatures(essay_object)
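

# Usage sketch (assumption, not part of the original source): parsing_web_data
# returns whatever Features.returnFeatures produces for a single essay, which
# can then be scored with the linear model loaded in the __main__ block below.
# Whether the result needs wrapping in a list / 2-D array depends on
# returnFeatures, so the lines below are illustrative only:
#
#     model = joblib.load(r'./model/basic_liner_model_allData2.pkl')
#     features = parsing_web_data(raw_essay_text)
#     predicted_score = model.predict([features])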


if __name__ == "__main__":
    # return_essay_object(r'./train_test_set/pigai.train')
    score_list = []
    model = joblib.load(r'./model/basic_liner_model_allData2.pkl')  # load the trained model
    essay_object = essay.Essay()  # create the Essay object
    fs = Features()  # create the feature extractor
    lemmatizer = WordNetLemmatizer()  # lemmatizer (word-form restoration)

    with open(r'./train_test_set/pigaiessay.test', 'r',
              errors='ignore') as f:  # test data to score
        with open(r'./rdata/test_result_2_all.txt', 'w',
                  errors='ignore') as ff:  # result file to write
            i = 1
            line = f.readline()
            while line:
                paragraphs = []
                essay_text = json.loads(line)
                essay_extracted = cleanText(essay_text)  # clean the text

                for paragraph in essay_extracted.split('\n'):