import numpy as np
from keras.models import load_model

from utils import file_pickle as fp

# Load the statute/case vectors saved by the feature-building script
# ('../data_feature/test.pkl'), plus the raw test keywords for readable
# progress output.
data = fp.get_pickle_data('../data_feature/test.pkl')
laws = data['law_vector']
cases = data['case_vector']
test_case = fp.get_pickle_data('../data_mid/keyword_tfidf_200.pkl')['test']
del data

model = load_model('../data_model/lstm_model_plus1.h5')

case_i = 0
result = []
for case in cases:
    # Pair the current case with every statute and score all pairs in one batch.
    case_input = [case] * len(laws)
    pred = model.predict({
        'case_input': np.array(case_input),
        'law_input': np.array(laws)
    })
    pred = pred[:, 0]

    # Keep the statutes scoring at least 0.85, at most the top 5 by score.
    # Statute ids are 1-based, hence the "+ 1".
    cur_index = np.where(pred >= 0.85)[0]
    cur_val = pred[cur_index]
    sort_val = sorted(
        [(cur_index[i] + 1, cur_val[i]) for i in range(len(cur_index))],
        key=lambda k: k[1],
        reverse=True)
    tmp_result = sort_val[:5]
    result.append(tmp_result)

    # Checkpoint the accumulated results every 20 cases.
    if case_i % 20 == 0:
        fp.save_pickle_data('../data_result/result1.h5', result)

    # Progress output.
    print(test_case[case_i])
    print(case_i, tmp_result)
    print('-----------------------------')
    case_i += 1
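# The architecture behind 'lstm_model_plus1.h5' is not shown in this section.
# Below is a minimal sketch of a two-input matching model consistent with the
# calls above (named inputs 'case_input'/'law_input', one sigmoid score per
# pair); the layer sizes and the shared-LSTM design are assumptions, not the
# project's actual model.
from keras.layers import Dense, Embedding, Input, LSTM, concatenate
from keras.models import Model


def build_match_model(vocab_size, embedding_weights, case_len, law_len):
    case_input = Input(shape=(case_len,), name='case_input')
    law_input = Input(shape=(law_len,), name='law_input')
    # Shared embedding initialized from the pretrained word2vec matrix.
    embed = Embedding(vocab_size, 100, weights=[embedding_weights])
    # One LSTM encoder shared by both text inputs.
    encoder = LSTM(128)
    case_vec = encoder(embed(case_input))
    law_vec = encoder(embed(law_input))
    merged = concatenate([case_vec, law_vec])
    score = Dense(1, activation='sigmoid')(merged)  # P(statute applies)
    return Model(inputs=[case_input, law_input], outputs=score)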
from gensim import corpora, models

from utils import file_pickle as fp

# `texts` holds the tokenized documents, train followed by test.
dictionary = corpora.Dictionary(texts)

# Build a bag-of-words vector for each document.
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the tf-idf model.
tfidf = models.TfidfModel(corpus)

# Compute the tf-idf weights of every document.
# text_tfidf format: [[(index1, tfidf1), (index2, tfidf2), (index3, tfidf3)], [...], ...]
# text_index format: {index1: word1, index2: word2, index3: word3, ...}
text_tfidf = tfidf[corpus]
text_index = dict((val, key) for key, val in dictionary.token2id.items())

save_middle = {
    'text_tfidf': text_tfidf,
    'text_index': text_index
}
# Persist the model outputs.
fp.save_pickle_data('../data_result/tf_idf_modle.pkl', save_middle)

tf_idf_middle = fp.get_pickle_data('../data_result/tf_idf_modle.pkl')
# text_tfidf = tf_idf_middle['text_tfidf']
# text_index = tf_idf_middle['text_index']
#
# # Pick out the 100 highest-tf-idf words in each document.
# len_train_word = 40000
# word_num = 100
# max_result = []
#
# for doc_tfidf in text_tfidf:
#     tmp_text_word = []
#     sorted_list = sorted(doc_tfidf, key=lambda d: d[1], reverse=True)
#     for key, value in sorted_list[0:word_num]:
#         tmp_text_word.append(text_index[key])
#     max_result.append(tmp_text_word)
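# A toy run showing the data shapes the pipeline above relies on (the corpus
# here is made up for illustration, not from the real data):
from gensim import corpora, models

docs = [['theft', 'theft', 'night'], ['fraud', 'contract'], ['theft', 'fraud']]
d = corpora.Dictionary(docs)
bows = [d.doc2bow(doc) for doc in docs]
print(bows[0])         # [(token id, count), ...], e.g. [(0, 1), (1, 2)]
tfidf = models.TfidfModel(bows)
print(tfidf[bows[0]])  # [(token id, weight), ...] -- higher weight = more distinctive
print(d.token2id)      # {'night': 0, 'theft': 1, ...} -- inverted into text_index above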
# This snippet continues a per-document loop: for each document i,
# tmp_word_set has already been seeded with the token ids of that document's
# pre-selected keywords. The loop opening is not shown; the header below is
# reconstructed for readability.
for i in range(len(text_tfidf)):
    # ... seeding of tmp_word_set elided ...
    tmp_word_set.add(key)

    # Sort the remaining words by tf-idf and top the set up to word_num ids.
    j = 0
    sorted_list = sorted(text_tfidf[i], key=lambda d: d[1], reverse=True)
    while len(tmp_word_set) < word_num and j < len(sorted_list):
        word = index_word[sorted_list[j][0]]
        # Skip single-character tokens and tokens matched by the digit regex
        # (contain_number is compiled earlier, not shown here).
        if len(word) > 1 and not contain_number.match(word):
            tmp_word_set.add(sorted_list[j][0])
        j += 1

    # Collect the kept words in their original in-document order.
    result = []
    for key, value in text_tfidf[i]:
        if key in tmp_word_set:
            result.append(index_word[key])
    keyword_tfidf_result.append(result)

    print(i, len(result), result)
    print('-------------------------------------------------------------------------------------------------------')

print(len(keyword_tfidf_result))
print(len(key_word))

# Split back into train and test partitions and persist.
keyword_tfidf_result = {
    'train': keyword_tfidf_result[:train_len],
    'test': keyword_tfidf_result[train_len:],
}
fp.save_pickle_data('../data_mid/keyword_tfidf_200_clean.pkl', keyword_tfidf_result)
# # fp.save_pickle_data('../data_feature/train_punish2.pkl', train)

import codecs
import re

import numpy as np

from utils import file_pickle as fp

# Build the test-set features.
test_texts = fp.get_pickle_data('../data_mid/keyword_tfidf_200_clean.pkl')
test_texts = test_texts['test']
word_index = fp.get_pickle_data('../data_result/word_embedding.pkl')
word_index = word_index['word_index']

# Zero-pad every document's word-index sequence to the longest document.
test = []
max_num = max([len(texts) for texts in test_texts])
print(max_num)
for i in range(len(test_texts)):
    text_feature = np.zeros(max_num)
    for j in range(len(test_texts[i])):
        text_feature[j] = word_index[test_texts[i][j]]
    test.append(text_feature)

# Read the test ids (first tab-separated field of each line).
file = codecs.open("../data_origin/test.txt", "rb", "utf-8")
test_id = []
for line in file:
    tmp = re.split('\t|\n', line)
    test_id.append(tmp[0])
file.close()

print(len(test_id), len(test))
print(test_id[0], test[0])

test = {'id': test_id, 'value': test}
fp.save_pickle_data('../data_feature/test_punish2.pkl', test)
# This snippet continues a loop over the tokenized statute texts; the loop
# opening, the initialization of law_vector and the earlier value of max_num
# are not shown. The header below (and the name law_texts) is reconstructed
# for readability.
law_vector = []
for law in law_texts:
    # Zero-pad each statute's word-index sequence to max_num.
    tmp_feature = [0] * max_num
    for i in range(len(law)):
        tmp_feature[i] = word_index[law[i]]
    law_vector.append(np.array(tmp_feature))

# Vectorize the case descriptions.
case_vector = []
multiple = 4   # defined here but unused in this snippet
law_num = 452  # number of candidate statutes; unused in this snippet
max_num = max([len(texts) for texts in test_texts])
print(max_num)
# len(train_texts)
for i in range(len(test_texts)):
    text_feature = np.zeros(max_num)
    for j in range(len(test_texts[i])):
        text_feature[j] = word_index[test_texts[i][j]]
    case_vector.append(text_feature)
print(case_vector[0:5])

result = {
    'law_vector': law_vector,
    'case_vector': case_vector
}
fp.save_pickle_data('../data_feature/test.pkl', result)
import codecs
import re

import jieba

from utils import file_pickle as fp

# (The opening of this script is not shown: it opens the training file as
# `file` and initializes word, panish_class and law as empty lists.)
stop_word_list = [
    ' ', '', '。', ',', '-', ':', '“', '”', '"', '‘', '’', '!', '(', ')',
    '~', '、', ',', '.', '(', ')', ';', ';', ':', '?', '?', '~', '《', '》',
    '<', '>', '──', '─', '…', '……', '[', ']', '【', '】', '~', ']', ']'
]
for line in file:
    tmp = re.split('\t|\n', line)
    # Tokenize the fact description with jieba and drop stop words.
    word.append([w for w in jieba.cut(tmp[1]) if w not in stop_word_list])
    panish_class.append(int(tmp[2]))
    law.append([int(num) for num in tmp[3].split(',')])
file.close()

# Spot-check one sample.
print(word[666])
print(panish_class[666])
print(law[666])

result = {'word': word, 'panish_class': panish_class, 'law': law}
fp.save_pickle_data('data_mid/train_data.pkl', result)

# ————————————————————————————————————————————————————————————————————————————

# Same tokenization for the test file, reusing stop_word_list from above
# (the test file has no class or statute columns).
file = codecs.open("data_origin/test.txt", "rb", "utf-8")
word = []
for line in file:
    tmp = re.split('\t|\n', line)
    word.append([w for w in jieba.cut(tmp[1]) if w not in stop_word_list])
file.close()

print(word[666])
fp.save_pickle_data('data_mid/test_data.pkl', word)
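# A quick look at what jieba.cut yields before stop-word filtering (the exact
# segmentation depends on jieba's dictionary, so this output is illustrative):
import jieba

tokens = list(jieba.cut('被告人王某于夜间盗窃财物。'))
print(tokens)  # e.g. ['被告人', '王某', '于', '夜间', '盗窃', '财物', '。']
# After filtering, '。' (a stop word) would be dropped.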
from utils import file_pickle as fp

train = fp.get_pickle_data('../data_mid/train_data.pkl')
texts = train['word']
law = train['law']
panish_class = train['panish_class']

# Keep only the documents with at least 100 tokens.
new_word = []
new_law = []
new_panish_class = []
for i in range(len(texts)):
    if len(texts[i]) >= 100:
        new_word.append(texts[i])
        new_law.append(law[i])
        new_panish_class.append(panish_class[i])

result = {'word': new_word, 'panish_class': new_panish_class, 'law': new_law}
fp.save_pickle_data('../data_mid/train_data.pkl', result)
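# The utils.file_pickle helper imported throughout this section is not shown.
# A minimal sketch consistent with how it is called (the function names come
# from the calls above; the bodies are assumptions):
import pickle


def save_pickle_data(path, data):
    # Serialize `data` to `path`.
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def get_pickle_data(path):
    # Load and return the object stored at `path`.
    with open(path, 'rb') as f:
        return pickle.load(f)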
import gensim
import numpy as np

from utils import file_pickle as fp

# `texts` was built earlier (not shown), presumably by concatenating
# train_word, test_word and law_word, which are no longer needed individually.
del train_word, test_word, law_word

# Assign an integer index to every word, starting at 1 (0 is kept for padding).
i = 1
word_index = {}
for text in texts:
    for word in text:
        if word not in word_index:
            word_index[word] = i
            i += 1

# Map each index to its pretrained word2vec vector.
model = gensim.models.Word2Vec.load('../data_result/word2vec_100.model')
index_vector = {}
for word in word_index.keys():
    index_vector[word_index[word]] = model[word]
print(len(index_vector))

# Fill the embedding matrix from index 1 on; row 0 stays all-zero (padding).
embedding_weights = np.zeros((len(index_vector) + 1, 100))
for index, w in index_vector.items():
    embedding_weights[index, :] = w
print(embedding_weights[0:3])

result = {
    'word_index': word_index,
    'index_vector': index_vector,
    'embedding_weights': embedding_weights
}
fp.save_pickle_data('../data_result/word_embedding.pkl', result)
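# How embedding_weights would typically be consumed downstream -- a minimal
# sketch of initializing a Keras Embedding layer from the saved matrix (the
# trainable=False choice is illustrative, not the project's setting):
from keras.layers import Embedding

from utils import file_pickle as fp

saved = fp.get_pickle_data('../data_result/word_embedding.pkl')
weights = saved['embedding_weights']
embedding_layer = Embedding(
    input_dim=weights.shape[0],   # vocabulary size + 1 (row 0 = padding)
    output_dim=weights.shape[1],  # 100-dimensional word2vec vectors
    weights=[weights],            # initialize from the pretrained matrix
    trainable=False)              # freeze, or set True to fine-tune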