def return_essay_object(file_name):
    """
    Read every essay in the training set and store each one as an Essay object.
    :param file_name: path to the source training set
    :return: None; each Essay object is pickled to ./rdata/object_essay_wanzheng10.pkl
    """
    lemmatizer = WordNetLemmatizer()  # lemmatizer (word-form normalization)
    essays_object_list = []
    with open(file_name, 'r', errors='ignore') as f:
        line = f.readline()
        i = 0
        with open(r'./rdata/object_essay_wanzheng10.pkl', 'wb') as f_pkl_object:
            while line:
                paragraphs = []
                list_essay_info = line.split('\t')
                essay_extracted = json.loads(json.loads(list_essay_info[0]))  # raw essay text
                essay_extracted = cleanText(essay_extracted)  # clean the text
                for paragraph in essay_extracted.split('\n'):  # collect paragraphs
                    paragraphed = paragraph.strip()
                    # TODO: blank/whitespace-only paragraphs are still appended; the
                    # check below was never finished.
                    # if not paragraphed.isspace():
                    paragraphs.append(paragraphed)
                essay_object = essay.Essay()
                essay_object.essay_str = essay_extracted  # cleaned essay text
                essay_object.paragraphs = paragraphs  # list of paragraphs
                essay_object.sentences = nltk.sent_tokenize(essay_extracted)  # list of sentences
                essay_object.tokens = nltk.word_tokenize(essay_extracted)  # list of tokens
                essay_object.tokens_pos_lemma = parse_token_pos_lemma(
                    essay_object, lemmatizer)  # (token, pos, lemma) triples: [0, 1, 2] -> [token, pos, lemma]
                essay_object.paragraphs_num = len(paragraphs)  # number of paragraphs
                essay_object.essay_words_num = len(essay_object.tokens)  # essay length in tokens
                pickle.dump(essay_object, f_pkl_object)
                # essays_object_list.append(essay_object)
                i += 1
                print("Essay object {} written".format(i))
                # if i == 10000:
                #     break
                line = f.readline()
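# Hedged sketch (not part of the original pipeline): return_essay_object() dumps
# one Essay object per pickle.dump() call, so the file can be read back by calling
# pickle.load() repeatedly until EOFError. The helper name load_essay_objects and
# its default path (mirroring the one used above) are assumptions for illustration.
def load_essay_objects(pkl_path=r'./rdata/object_essay_wanzheng10.pkl'):
    essays = []
    with open(pkl_path, 'rb') as f_pkl:
        while True:
            try:
                essays.append(pickle.load(f_pkl))  # one object per dump() call
            except EOFError:  # end of the concatenated pickle stream
                break
    return essays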
def parsing_web_data(essay_text):
    essay_text_cleaned = cleanText(essay_text)  # clean the raw text
    essay_object = essay.Essay()  # create the Essay object
    f = Features()  # create the feature extractor
    lemmatizer = WordNetLemmatizer()  # lemmatizer (word-form normalization)
    paragraphs = []
    for paragraph in essay_text_cleaned.split('\n'):  # split into paragraphs
        paragraphed = paragraph.strip()
        paragraphs.append(paragraphed)
    essay_object.essay_str = essay_text_cleaned  # cleaned essay text
    essay_object.paragraphs = paragraphs  # list of paragraphs
    essay_object.sentences = nltk.sent_tokenize(essay_text_cleaned)  # list of sentences
    essay_object.tokens = nltk.word_tokenize(essay_text_cleaned)  # list of tokens
    essay_object.tokens_pos_lemma = parse_token_pos_lemma(
        essay_object, lemmatizer)  # (token, pos, lemma) triples: [0, 1, 2] -> [token, pos, lemma]
    essay_object.paragraphs_num = len(paragraphs)  # number of paragraphs
    essay_object.essay_words_num = len(essay_object.tokens)  # essay length in tokens
    return f.returnFeatures(essay_object)
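# Hedged usage sketch (assumption, not in the original module): a thin wrapper
# showing how parsing_web_data() would typically be paired with the pickled model
# loaded in __main__. The wrapper name score_web_essay is illustrative, the model
# path is the one used below, and the model is assumed to expose a scikit-learn
# style predict() taking a 2-D input.
def score_web_essay(essay_text, model_path=r'./model/basic_liner_model_allData2.pkl'):
    features = parsing_web_data(essay_text)  # feature representation for one essay
    model = joblib.load(model_path)          # trained scoring model
    return model.predict([features])         # assumes sklearn-style predict()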
if __name__ == "__main__":
    # return_essay_object(r'./train_test_set/pigai.train')
    score_list = []
    model = joblib.load(r'./model/basic_liner_model_allData2.pkl')  # load the trained scoring model
    essay_object = essay.Essay()  # create the Essay object
    fs = Features()  # create the feature extractor
    lemmatizer = WordNetLemmatizer()  # lemmatizer (word-form normalization)
    with open(r'./train_test_set/pigaiessay.test', 'r', errors='ignore') as f:  # load the test data
        with open(r'./rdata/test_result_2_all.txt', 'w', errors='ignore') as ff:  # write the results
            i = 1
            line = f.readline()
            while line:
                paragraphs = []
                essay_text = json.loads(line)
                essay_extracted = cleanText(essay_text)  # clean the text
                for paragraph in essay_extracted.split('\n'):