def raw_data_to_model(file_path, tokenizer, word2id, tag2id, batch_size, contain_y=True):
    sample_list_, tag_list_ = construct_data(file_path)
    sample_list_, tag_list_ = sort_sequence(sample_list_, tag_list_)
    x, y, lengths = [], [], []
    for i in range(0, len(sample_list_), batch_size):
        # Convert each batch to ids, padding to that batch's max length.
        # (sort_sequence is assumed to sort samples by length in descending order,
        #  so the first sample in each batch is the longest.)
        # seq_len_ = max(map(lambda xx: len(xx), sample_list_[i:i+batch_size]))
        seq_len_ = len(sample_list_[i])
        x_, lengths_ = content_to_id(sample_list_[i:i + batch_size], line_sep=None, tokenizer=tokenizer,
                                     seq_len=seq_len_, vocab_dic=word2id, with_real_seq_len=True)
        if contain_y:
            y_ = content_to_id(tag_list_[i:i + batch_size], line_sep=None, tokenizer=tokenizer,
                               seq_len=seq_len_, vocab_dic=tag2id)
            y.extend(y_.tolist())
        x.extend(x_.tolist())
        lengths.extend(lengths_.tolist())
    if contain_y:
        return np.array(x), np.array(y), np.array(lengths)
    else:
        return np.array(x), np.array(lengths)

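# Illustrative, self-contained sketch (it does not reuse the repo's sort_sequence /
# content_to_id) of why raw_data_to_model sorts samples by length before batching:
# after sorting, each batch only needs to be padded to its own longest sample rather
# than to the global maximum, which produces far fewer PAD ids.
def _demo_length_sorted_batches(samples, batch_size=2, pad_id=0):
    # Sort descending by length, so batch[0] is the longest sample in its batch.
    samples = sorted(samples, key=len, reverse=True)
    batches = []
    for i in range(0, len(samples), batch_size):
        batch = samples[i:i + batch_size]
        seq_len = len(batch[0])  # per-batch max length
        batches.append([s + [pad_id] * (seq_len - len(s)) for s in batch])
    return batches

# Example: yields [[5, 6, 7], [1, 2, 0]] and [[3, 4]] instead of padding everything to length 3.
# print(_demo_length_sorted_batches([[1, 2], [3, 4], [5, 6, 7]]))
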
def construct_data(path, vocab_dic, tokenizer, seq_len, line_sep):
    # cls_index = vocab_dic.get("[CLS]")  # prepend [CLS] at the first position
    x, y, lengths = content_to_id(path, tokenizer=tokenizer, seq_len=seq_len, vocab_dic=vocab_dic,
                                  line_sep=line_sep, with_real_seq_len=True)
    # x = np.insert(x, 0, cls_index, axis=1)
    # mask = np.array([make_sequence_mask(real_len=i + 1, seq_len=seq_len + 1) for i in lengths])  # one extra position for cls_index
    mask = (x > 0).astype(int)
    print(f"x sample number is {len(x)}, label sample number is {len(y)}")
    return x, y, mask

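# Minimal sketch (hypothetical toy data, not the repo's content_to_id output) of the
# padding-mask convention used above: id 0 is assumed to appear only at PAD positions,
# so (x > 0) marks the real tokens.
import numpy as np

_toy_ids = np.array([[4, 9, 2, 0, 0],
                     [7, 3, 5, 6, 1]])
_toy_mask = (_toy_ids > 0).astype(int)
# _toy_mask == [[1, 1, 1, 0, 0],
#               [1, 1, 1, 1, 1]]
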
def main_entry(save_dir):
    # seed_everything(987, use_np=True, use_cpu=True, use_gpu=False)
    # [0]. Transform the raw data into "sentence + label" format
    train_pri_path = "./data/tnews/train.json"
    train_path = "./data/tnews/train_trans.txt"
    valid_pri_path = "./data/tnews/dev.json"
    valid_path = "./data/tnews/dev_trans.txt"
    test_pri_path = "./data/tnews/test.json"
    test_path = "./data/tnews/test_trans.txt"
    label_path = "./data/tnews/labels.json"
    label_dic = get_label_map(label_path)
    transform_data(train_pri_path, label_dic, train_path)
    transform_data(valid_pri_path, label_dic, valid_path)
    transform_data(test_pri_path, label_dic, test_path)

    # [1]. Build the vocabulary dictionary
    # [1.1]. No existing vocabulary: build it from the given file and save it
    vocab_file_path = train_path
    save_path = os.path.join(save_dir, "train_vocab.pkl")
    tokenizer = "char"
    line_sep = "\t"
    vocab_dic = build_vocab_by_raw_file(vocab_file_path, line_sep=line_sep, tokenizer=tokenizer,
                                        word_dic_save_path=save_path)
    # [1.2]. Existing vocabulary: build it from a given file
    # [1.3]. Existing vocabulary: load it manually from a pickle file
    # [1.4]. Existing vocabulary: update it with new data

    # [2]. Convert text to ids
    # train_path = "./data/THUCNews/train.txt"
    # valid_path = "./data/THUCNews/dev.txt"
    # test_path = "./data/THUCNews/test.txt"
    seq_len = 100
    train_x, train_y = content_to_id(train_path, tokenizer=tokenizer, seq_len=seq_len,
                                     vocab_dic=vocab_dic, line_sep=line_sep)
    print(f"train_x sample number is {len(train_x)}, label sample number is {len(train_y)}")
    valid_x, valid_y = content_to_id(valid_path, tokenizer=tokenizer, seq_len=seq_len,
                                     vocab_dic=vocab_dic, line_sep=line_sep)
    print(f"valid_x sample number is {len(valid_x)}, label sample number is {len(valid_y)}")
    # test_x, test_y = content_to_id(test_path, tokenizer=tokenizer, seq_len=seq_len,
    #                                vocab_dic=vocab_dic, line_sep=line_sep)
    # print(f"content sample number is {len(test_x)}, label sample number is {len(test_y)}")

    # [3]. Split the data into three parts (train, validation and test), randomly or stratified by label.
    #      If the data was already split in step [2], this part can be skipped.
    # train_ind, valid_ind, test_ind = split_data_with_index(indexes=len(content), split_ratios=(0.7, 0.1, 0.2))
    # train_x, train_y = np.array(content)[train_ind], np.array(label)[train_ind]
    # valid_x, valid_y = np.array(content)[valid_ind], np.array(label)[valid_ind]
    # test_x, test_y = content[test_ind], label[test_ind]
    # Steps [2] and [3] may also be swapped: read the data first (e.g. with pandas), preprocess it,
    # split it into two or three parts with some strategy, and only then run step [2];
    # both orderings deserve their own example.

    # [4]. Sampling strategies, e.g. per-class over-sampling or under-sampling;
    #      at this point the data is already in numpy format.
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")
    # sample_ind = sample_data_by_label(train_y, sampler={"1": 10, "2": 20})
    # train_x, train_y = train_x[sample_ind], train_y[sample_ind]
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")

    # [5]. Build the iterators
    # train_iter = self_iterator(batch_data=(train_x, train_y, ), batch_size=4, )
    # valid_iter = self_iterator(batch_data=(valid_x, valid_y, ), batch_size=4)
    # test_iter = self_iterator(batch_data=(test_x, test_y), batch_size=4)
    batch_size = 128
    small_sample_test = False
    small_sample_num = 10000
    if small_sample_test:
        train_x, train_y = train_x[:small_sample_num], train_y[:small_sample_num]
    train_iter = torch_iterator(batch_data=(train_x, train_y,), batch_size=batch_size)
    valid_iter = torch_iterator(batch_data=(valid_x, valid_y,), batch_size=batch_size)
    # test_iter = torch_iterator(batch_data=(test_x, test_y), batch_size=batch_size)

    # [6]. Initialize the model
    seed_everything(1024, use_np=True, use_cpu=True, use_gpu=True)
    # model = TextRNN(vocab_size=len(vocab_dic), embedding_dim=8, hidden_size=20,
    #                 num_layers=2, num_classes=10, dropout=0.5)
    model = TextCNN(num_filters=128, filter_sizes=(2, 3, 4), num_classes=len(label_dic),
                    vocab_size=len(vocab_dic), embedding_dim=300, dropout=0.5)
    init_network(model)
    print(model)

    # [7]. Train the model
    num_epochs = 6
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-3
    model_save_path = os.path.join(save_dir, "text_cnn_model.pt")  # "./data/THUCNews/text_cnn_model.pt"
    print("now the device is ", device)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    clf = SelfModel(model=model)
    t1 = datetime.now()
    clf.train(train_iter, num_epochs, loss=loss, optimizer=optimizer, valid_iter=valid_iter,
              early_stopping_batch=100, batch_check_frequency=2, print_every_batch=10,
              model_save_path=model_save_path, device=device)
    t2 = datetime.now()
    print(f"train cost {(t2-t1).seconds} seconds")
    # Epoch Num [6/6], Batch num [395/417]: train loss is 0.5875816802612598 valid loss is 1.1788143689119364
    # Epoch Num [6/6], Batch num [415/417]: train loss is 0.5919032108297737 valid loss is 1.1893426436412184
    # train cost 2202 seconds

    # [8]. Model prediction
    # pred = clf.predict(data=train_iter, do_func=lambda x: x[0])

    # [9]. Check performance
    def get_max_prob_index(pred):
        return torch.max(pred, 1)[1]
        # pred = torch.nn.functional.softmax(pred, dim=1).cpu().numpy()

    y_score, y_true = evaluate(clf.model, train_iter, y_score_processor=get_max_prob_index)
    train_acc = accuracy_score(y_true, y_score)
    y_score, y_true = evaluate(clf.model, valid_iter, y_score_processor=get_max_prob_index)
    valid_acc = accuracy_score(y_true, y_score)
    # y_score, y_true = evaluate(clf.model, test_iter, y_score_processor=get_max_prob_index)
    # test_acc = accuracy_score(y_true, y_score)
    print(f"train accuracy is {train_acc}, valid accuracy is {valid_acc}.")
    # train accuracy is 0.8219827586206897, valid accuracy is 0.6129.

    # [10]. Predict on the test set, build the online cluemark submission format,
    #       and submit it to check the online score
    inverse_label_dic = {}
    for key, val in label_dic.items():
        inverse_label_dic[val["label_index"]] = {"label": key, "label_desc": val["label_desc"]}
    f_out = open("./data/tnews/tnews_predict.json", "w", encoding="utf-8")
    with open(test_path, "r") as f:
        line_num = 0
        for line in f:
            line_json = {"id": line_num}
            line = line.strip("\n")
            line_ids = content_to_id([line], tokenizer=tokenizer, seq_len=seq_len, vocab_dic=vocab_dic)
            line_pred = clf.model(torch.LongTensor(line_ids).to(device))  # scores for every class
            line_pred_ind = torch.max(line_pred, 1)[1].item()  # index of the max probability
            line_json.update(inverse_label_dic[line_pred_ind])  # build the online submission format
            f_out.write(f"{json.dumps(line_json, ensure_ascii=False)}\n")  # write to the output file
            line_num += 1
    f_out.close()

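# Minimal TextCNN sketch to illustrate what num_filters=128 / filter_sizes=(2, 3, 4) mean:
# one Conv1d per filter size over the embedded sequence, global max-pooling, and a linear
# classifier over the concatenated features. This is an assumption about the architecture,
# not necessarily the repo's TextCNN implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyTextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList(
            [nn.Conv1d(embedding_dim, num_filters, k) for k in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):                        # x: (batch, seq_len) of token ids
        emb = self.embedding(x).transpose(1, 2)  # (batch, embedding_dim, seq_len)
        pooled = [F.relu(conv(emb)).max(dim=2).values for conv in self.convs]
        return self.fc(self.dropout(torch.cat(pooled, dim=1)))  # (batch, num_classes)

# e.g. TinyTextCNN(vocab_size=5000, embedding_dim=300, num_filters=128,
#                  filter_sizes=(2, 3, 4), num_classes=10)(torch.zeros(2, 100, dtype=torch.long))
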
def main_entry():
    save_dir = "./data/cluener"
    vocab_file_path = "./data/cluener/train.json"
    tokenizer = lambda x: x  # the input is already a list, i.e. already tokenized

    # 1. Build the vocabularies
    sample_list, tag_list = construct_data(vocab_file_path)  # 1 bad line: an entity nested inside another entity
    ## 1.1 Build the word2id vocabulary
    # word_save_path = os.path.join(save_dir, "train_word_vocab.pkl")
    word2id = build_vocab_by_raw_file(sample_list, line_sep=None, tokenizer=tokenizer)
    ## 1.2 Build the tag2id vocabulary
    # tag_save_path = os.path.join(save_dir, "train_tag_crf_vocab.pkl")
    tag2id = build_vocab_by_raw_file(tag_list, line_sep=None, tokenizer=tokenizer)
    tag2id[START_TAG] = len(tag2id)
    tag2id[END_TAG] = len(tag2id)

    # 2. Build the train, validation and test data and convert them to ids
    train_path = "./data/cluener/train.json"
    valid_path = "./data/cluener/dev.json"
    test_path = "./data/cluener/test.json"
    batch_size = 128
    train_x, train_y, train_lengths = raw_data_to_model(train_path, tokenizer, word2id, tag2id, batch_size)
    print(f"train_x sample number is {len(train_x)}, label sample number is {len(train_y)}")
    valid_x, valid_y, valid_lengths = raw_data_to_model(valid_path, tokenizer, word2id, tag2id, batch_size)
    print(f"valid_x sample number is {len(valid_x)}, label sample number is {len(valid_y)}")
    # test_x, test_y, test_lengths = raw_data_to_model(test_path, tokenizer, word2id, tag2id, batch_size)
    # print(f"test_x sample number is {len(test_x)}, label sample number is {len(test_y)}")

    # 3. Wrap the data in iterators
    # batch_size = 128
    # small_sample_test = False
    # small_sample_num = 10000
    # if small_sample_test:
    #     train_x, train_lengths, train_y = train_x[:small_sample_num], train_lengths[:small_sample_num], train_y[:small_sample_num]
    train_iter = torch_iterator(batch_data=(train_x, train_lengths, train_y,), batch_size=batch_size)
    valid_iter = torch_iterator(batch_data=(valid_x, valid_lengths, valid_y,), batch_size=batch_size)
    # test_iter = torch_iterator(batch_data=(test_x, test_lengths, test_y), batch_size=batch_size)

    # 4. Initialize the model
    seed_everything(1024, use_np=True, use_cpu=True, use_gpu=True)
    model = BiLSTM_CRF(vocab_size=len(word2id), emb_size=50, hidden_size=32, num_tags=len(tag2id),
                       start_idx=tag2id[START_TAG], stop_idx=tag2id[END_TAG])
    init_network(model)
    print(model)

    # 5. Train the model
    num_epochs = 15
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-3
    model_save_path = os.path.join(save_dir, "bilstm_crf_model.pt")
    print("now the device is ", device)
    loss = model.crf.loss
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    clf = SelfModel(model=model)
    t1 = datetime.now()
    clf.train(train_iter, num_epochs, loss=loss, optimizer=optimizer, valid_iter=valid_iter,
              early_stopping_batch=30, batch_check_frequency=2, print_every_batch=3,
              model_save_path=model_save_path, device=device)
    t2 = datetime.now()
    print(f"train cost {(t2-t1).seconds} seconds")
    # Epoch Num [15/15], Batch num [84/84]: train loss is 0.24247267132713682 valid loss is 13.60905595259233
    # train cost 1064 seconds

    # 6. Model evaluation
    ## 6.1 Decoding: CRF decoding is involved, so instead of the built-in evaluate we
    ##     run the model ourselves and decode the predictions
    decode = model.crf.viterbi_decode
    id2tag_dic = {id_: tag for tag, id_ in tag2id.items()}
    # y_score, y_true = evaluate(clf.model, train_iter, y_score_processor=get_max_prob_index)
    y_score, y_true = [], []
    for sent, leng, y_true_ in valid_iter:
        y_true_ = y_true_.cpu()
        crf_score = clf.model(sent.to(device), leng.to(device))
        y_score_tag = decode(crf_score.cpu(), sent.gt(0).cpu())[1]
        lengs = leng.cpu().numpy()
        for i in range(len(lengs)):  # iterate over the samples in the batch
            y_score.append(id2tag(y_score_tag[i][:lengs[i]], id2tag_dic))
            y_true.append(id2tag(y_true_[i][:lengs[i]].numpy(), id2tag_dic))
    ## 6.2 Evaluation metrics
    metrics = evaluate_all_sentence(y_true, y_score)
    print(metrics)
    # 3072 2909 1944  -> 3072 gold entities, 2909 predicted entities, of which 1944 are correct
    # (0.6328125, 0.6682708834651083, 0.6500585186423675)  -> recall, precision and f1

    # 7. Prediction
    # Predict on the test set, format the results as a cluemark submission,
    # and submit online to check test-set performance.
    with open(test_path, "r") as f:
        y_score = []
        for line in f:
            line = line.strip("\n")
            line_text = json.loads(line)["text"]
            sent, leng = content_to_id([list(line_text)], tokenizer=tokenizer, line_sep=None,
                                       seq_len=len(list(line_text)), vocab_dic=word2id,
                                       with_real_seq_len=True)
            crf_score = clf.model(torch.LongTensor(sent).to(device), torch.LongTensor(leng).to(device))
            y_score_tag = decode(crf_score.cpu(), torch.LongTensor(sent).gt(0).cpu())[1]
            y_score.append(id2tag(y_score_tag[0][:leng[0]], id2tag_dic))

    def __submit_format(indexs, sent):
        ret = {}
        for start_idx, end_idx in indexs:
            ner_name = sent[start_idx: end_idx + 1]
            if ner_name in ret:
                ret[ner_name].append([start_idx, end_idx])
            else:
                ret[ner_name] = [[start_idx, end_idx]]
        return ret

    def submit(write_path, test_path):
        with open(test_path, "r", encoding='utf-8') as f:
            test_sample = f.readlines()
        with open(write_path, "w", encoding="utf-8") as f:
            line_num = 0
            for i in range(len(y_score)):
                label = {}
                write_line = {"id": line_num}
                tag_entity = parse_entity_from_sequence(y_score[i])
                line_text = json.loads(test_sample[i])["text"]
                for tag in tag_entity:
                    label[tag] = __submit_format(tag_entity[tag], line_text)
                write_line["label"] = label
                f.write(json.dumps(write_line, ensure_ascii=False) + "\n")
                line_num += 1

    submit("./data/cluener/cluener_predict.json", test_path)

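# Hypothetical sketch of the kind of structure the submission step needs from a BIO-style
# tag sequence: contiguous (start, end) spans grouped by entity type. It assumes plain
# "B-xxx" / "I-xxx" / "O" tags and is only an illustration, not the repo's
# parse_entity_from_sequence.
def _demo_parse_bio(tags):
    spans, start, ent_type = {}, None, None
    for i, tag in enumerate(tags + ["O"]):  # sentinel "O" closes a trailing entity
        if tag.startswith("B-") or tag == "O":
            if ent_type is not None:
                spans.setdefault(ent_type, []).append((start, i - 1))
                ent_type = None
            if tag.startswith("B-"):
                start, ent_type = i, tag[2:]
    return spans

# _demo_parse_bio(["B-name", "I-name", "O", "B-company"]) -> {"name": [(0, 1)], "company": [(3, 3)]}
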
def main_entry(save_dir):
    # seed_everything(987, use_np=True, use_cpu=True, use_gpu=False)
    # [1]. Build the vocabulary dictionary
    # [1.1]. No existing vocabulary: build it from the given file and save it
    vocab_file_path = "./data/THUCNews/train.txt"
    save_path = os.path.join(save_dir, "train_vocab.pkl")
    tokenizer = "char"
    line_sep = "\t"
    vocab_dic = build_vocab_by_raw_file(vocab_file_path, line_sep=line_sep, tokenizer=tokenizer,
                                        word_dic_save_path=save_path)
    # [1.2]. Existing vocabulary: build it from a given file
    # [1.3]. Existing vocabulary: load it manually from a pickle file
    # [1.4]. Existing vocabulary: update it with new data

    # [2]. Convert text to ids
    train_path = "./data/THUCNews/train.txt"
    valid_path = "./data/THUCNews/dev.txt"
    test_path = "./data/THUCNews/test.txt"
    seq_len = 32
    train_x, train_y = content_to_id(train_path, tokenizer=tokenizer, seq_len=seq_len,
                                     vocab_dic=vocab_dic, line_sep=line_sep)
    print(f"train_x sample number is {len(train_x)}, label sample number is {len(train_y)}")
    valid_x, valid_y = content_to_id(valid_path, tokenizer=tokenizer, seq_len=seq_len,
                                     vocab_dic=vocab_dic, line_sep=line_sep)
    print(f"valid_x sample number is {len(valid_x)}, label sample number is {len(valid_y)}")
    test_x, test_y = content_to_id(test_path, tokenizer=tokenizer, seq_len=seq_len,
                                   vocab_dic=vocab_dic, line_sep=line_sep)
    print(f"content sample number is {len(test_x)}, label sample number is {len(test_y)}")

    # [3]. Split the data into three parts (train, validation and test), randomly or stratified by label.
    #      If the data was already split in step [2], this part can be skipped.
    # train_ind, valid_ind, test_ind = split_data_with_index(indexes=len(content), split_ratios=(0.7, 0.1, 0.2))
    # train_x, train_y = np.array(content)[train_ind], np.array(label)[train_ind]
    # valid_x, valid_y = np.array(content)[valid_ind], np.array(label)[valid_ind]
    # test_x, test_y = content[test_ind], label[test_ind]
    # Steps [2] and [3] may also be swapped: read the data first (e.g. with pandas), preprocess it,
    # split it into two or three parts with some strategy, and only then run step [2];
    # both orderings deserve their own example.

    # [4]. Sampling strategies, e.g. per-class over-sampling or under-sampling;
    #      at this point the data is already in numpy format.
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")
    # sample_ind = sample_data_by_label(train_y, sampler={"1": 10, "2": 20})
    # train_x, train_y = train_x[sample_ind], train_y[sample_ind]
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")

    # [5]. Build the iterators
    # train_iter = self_iterator(batch_data=(train_x, train_y, ), batch_size=4, )
    # valid_iter = self_iterator(batch_data=(valid_x, valid_y, ), batch_size=4)
    # test_iter = self_iterator(batch_data=(test_x, test_y), batch_size=4)
    batch_size = 128
    small_sample_test = True
    small_sample_num = 1000
    if small_sample_test:
        train_x, train_y = train_x[:small_sample_num], train_y[:small_sample_num]
    train_iter = torch_iterator(batch_data=(train_x, train_y,), batch_size=batch_size)
    valid_iter = torch_iterator(batch_data=(valid_x, valid_y,), batch_size=batch_size)
    test_iter = torch_iterator(batch_data=(test_x, test_y), batch_size=batch_size)

    # [6]. Initialize the model
    seed_everything(1024, use_np=True, use_cpu=True, use_gpu=True)
    # model = TextRNN(vocab_size=len(vocab_dic), embedding_dim=8, hidden_size=20,
    #                 num_layers=2, num_classes=10, dropout=0.5)
    model = TextCNN(num_filters=128, filter_sizes=(2, 3, 4), num_classes=10,
                    vocab_size=len(vocab_dic), embedding_dim=300, dropout=0.5)
    init_network(model)
    print(model)

    # [7]. Train the model
    num_epochs = 20
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-3
    model_save_path = os.path.join(save_dir, "text_cnn_model.pt")  # "./data/THUCNews/text_cnn_model.pt"
    print("now the device is ", device)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    clf = SelfModel(model=model)
    t1 = datetime.now()
    clf.train(train_iter, num_epochs, loss=loss, optimizer=optimizer, valid_iter=valid_iter,
              early_stopping_batch=100, batch_check_frequency=2, print_every_batch=10,
              model_save_path=model_save_path, device=device)
    t2 = datetime.now()
    print(f"train cost {(t2-t1).seconds} seconds")

    # [8]. Model prediction
    # pred = clf.predict(data=train_iter, do_func=lambda x: x[0])

    # [9]. Check performance
    def get_max_prob_index(pred):
        return torch.max(pred, 1)[1]
        # pred = torch.nn.functional.softmax(pred, dim=1).cpu().numpy()

    y_score, y_true = evaluate(clf.model, train_iter, y_score_processor=get_max_prob_index)
    train_acc = accuracy_score(y_true, y_score)
    y_score, y_true = evaluate(clf.model, valid_iter, y_score_processor=get_max_prob_index)
    valid_acc = accuracy_score(y_true, y_score)
    y_score, y_true = evaluate(clf.model, test_iter, y_score_processor=get_max_prob_index)
    test_acc = accuracy_score(y_true, y_score)
    print(f"train accuracy is {train_acc}, valid accuracy is {valid_acc}, test accuracy is {test_acc}.")

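# The exact semantics of SelfModel.train's early_stopping_batch / batch_check_frequency
# arguments are an assumption; the sketch below only illustrates the usual pattern those
# names suggest: evaluate the validation loss every `check_frequency` batches and stop once
# it has not improved for `patience_batches` batches.
def _demo_early_stopping(valid_losses, patience_batches=100, check_frequency=2):
    best, best_batch = float("inf"), 0
    for batch_idx, loss in enumerate(valid_losses):
        if batch_idx % check_frequency != 0:
            continue                  # only check every `check_frequency` batches
        if loss < best:
            best, best_batch = loss, batch_idx
        elif batch_idx - best_batch >= patience_batches:
            return batch_idx          # stop: no improvement for `patience_batches` batches
    return None                       # ran to the end without triggering early stopping

# e.g. _demo_early_stopping([1.0, 0.9, 0.8] + [0.85] * 200) -> 102
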