def __init__(self):
    ## Trainer setup for the couplet (duilian) seq2seq task: loads the corpus,
    ## the vocab, the pretrained RoBERTa weights, the optimizer and the data loader.
    # Load the training data.
    data_dir = "./corpus/对联"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_dir)
    self.model_name = "roberta"  # which model architecture to use
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # location of the roberta weights
    self.recent_model_path = ""  # used to resume training from an already-trained model
    self.model_save_path = "./bert_duilian_model.bin"
    self.batch_size = 16
    self.lr = 1e-5
    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # Pick the GPU when one is available.
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name)
    ## Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def ner_print(model, test_data, vocab_path, device="cpu"):
    """Run CRF-based NER inference on raw texts and print the extracted entities.

    For each text in ``test_data`` the model's emission scores are decoded with
    Viterbi, label ids are mapped to tag names through the module-level ``target``
    list (defined elsewhere in the file), and consecutive same-tag characters are
    merged into entity spans. Prints a dict of {tag: [entity strings]} per text.

    Fixes over the original: removed the unused local ``starting`` and the
    misspelled local ``tokenier``.
    """
    model.eval()
    word2idx = load_chinese_base_vocab(vocab_path)
    tokenizer = Tokenizer(word2idx)
    # CRF transition matrix, needed for Viterbi decoding.
    trans = model.state_dict()["crf_layer.trans"]
    for text in test_data:
        decode = []
        text_encode, text_ids = tokenizer.encode(text)
        text_tensor = torch.tensor(text_encode, device=device).view(1, -1)
        out = model(text_tensor).squeeze(0)  # emission scores ("nodes")
        labels = viterbi_decode(out, trans)
        for l in labels:
            if l > 0:
                decode.append(target[l.item()])
            else:
                decode.append("other")
        # Merge per-character tags into entity spans.  ``flag`` holds the tag of
        # the entity currently being extended (0 = not inside an entity).
        flag = 0
        res = {}
        for index, each_entity in enumerate(decode):
            if each_entity != "other":
                if flag != each_entity:
                    # New entity starts here.  index - 1 offsets the leading
                    # [CLS] token so token positions map back to text chars.
                    cur_text = text[index - 1]
                    if each_entity in res:
                        res[each_entity].append(cur_text)
                    else:
                        res[each_entity] = [cur_text]
                    flag = each_entity
                elif flag == each_entity:
                    # Same tag continues: extend the current entity span.
                    res[each_entity][-1] += text[index - 1]
            else:
                flag = 0
        print(res)
def __init__(self):
    ## Trainer setup for the fine-grained NER task with a CRF head.
    # Load the training data.
    data_path = "./corpus/细粒度NER/train.json"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_path)
    self.model_name = "roberta"  # which model architecture to use
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # location of the roberta weights
    self.recent_model_path = ""  # used to resume training from an already-trained model
    self.model_save_path = "./细粒度_bert_ner_model_crf.bin"
    self.batch_size = 8
    self.lr = 1e-5
    self.crf_lr = 1e-2  ## learning rate for the CRF layer is 0.01
    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    self.tokenier = Tokenizer(self.word2idx)  # NOTE(review): attribute name is a typo for "tokenizer"
    # Pick the GPU when one is available.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model: sequence labeling with a CRF head, one output per NER tag.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="sequence_labeling_crf", target_size=len(target))
    ## Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Give the CRF layer its own (larger) learning rate.
    crf_params = list(map(id, self.bert_model.crf_layer.parameters()))  ## ids of the CRF-layer params, pulled out separately
    base_params = filter(lambda p: id(p) not in crf_params, self.bert_model.parameters())
    self.optimizer = torch.optim.Adam([
        {"params": base_params},
        {"params": self.bert_model.crf_layer.parameters(), "lr": self.crf_lr}], lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = NERDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self, model_save_path="model/", data_path="corpus/", batch_size=64, lr=1e-5, model_name="roberta", device='cpu'):
    ## Trainer setup for the relation-extraction task.
    # Load the training data; data_path is a directory, the train file lives inside it.
    data_path = data_path + "train_data.json"
    self.vocab_path = "./state_dict/vocab.txt"  # location of the roberta vocab file
    self.data = load_data(data_path)
    self.model_name = model_name  # which model architecture to use
    self.model_path = "./state_dict/pytorch_model.bin"  # location of the roberta weights
    self.recent_model_path = ""  # used to resume training from an already-trained model
    self.model_save_path = model_save_path + "bert_model_relation_extrac.bin"
    self.batch_size = batch_size
    self.lr = lr
    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # Pick the compute device: an explicit 'cpu' request is honoured; any other
    # value auto-detects CUDA and falls back to cpu when no GPU is available.
    if device == 'cpu':
        self.device = device
    else:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model: relation extraction with one output slot per predicate.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="relation_extrac", target_size=len(predicate2id))
    ## Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = ExtractDataset(self.data, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def read_corpus(dir_path, vocab_path):
    """Read the raw poem CSV data from ``dir_path``.

    Each CSV row is expected to carry the title in column 0 and the poem body in
    column 3 (assumed schema — confirm against the corpus files).  Poems are
    classified by length/punctuation pattern into four classical forms, and the
    form name is appended to the title as the source text.

    Returns (sents_src, sents_tgt): title + "##" + form, and the poem body.

    Fixes over the original: ``os.path.isdir`` was called on the bare filename
    (relative to the cwd, so it almost never matched); it now tests the joined
    path.  The pandas row is no longer mutated in place.
    """
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # all entries in the corpus folder
    for file1 in files:
        file_path = os.path.join(dir_path, file1)
        if os.path.isdir(file_path):  # only open actual files, skip sub-directories
            continue
        print(file_path)
        if not file_path.endswith("csv"):
            continue
        df = pd.read_csv(file_path)
        # Determine the poem's form first, then decide whether to keep it.
        for index, row in df.iterrows():
            if not isinstance(row[0], str) or not isinstance(row[3], str):
                continue
            title = row[0]
            if len(title.split(" ")) > 1:
                # The title contains a space; keep only the part before it.
                title = title.split(" ")[0]
            if len(title) > 10 or len(title) < 1:
                # Filter out titles that are too long or too short.
                continue
            poem = row[3]
            encode_text = tokenizer.encode(poem)[0]
            if word2idx["[UNK]"] in encode_text:
                # Skip poems containing out-of-vocab characters.
                continue
            if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):
                # Five-character quatrain (wujue).
                sents_src.append(title + "##" + "五言绝句")
                sents_tgt.append(poem)
            elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):
                # Seven-character quatrain (qijue).
                sents_src.append(title + "##" + "七言绝句")
                sents_tgt.append(poem)
            elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):
                # Five-character regulated verse (wulv).
                sents_src.append(title + "##" + "五言律诗")
                sents_tgt.append(poem)
            elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):
                # Seven-character regulated verse (qilv).
                sents_src.append(title + "##" + "七言律诗")
                sents_tgt.append(poem)
    print("第一个诗句数据集共: " + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
def __init__(self, data, vocab_path):
    """Dataset for relation extraction: stores the raw samples and tokenizer state."""
    super(ExtractDataset, self).__init__()
    # Samples are loaded by the caller and handed in already parsed.
    self.data = data
    # Vocabulary plus its inverse mapping, and the tokenizer built on it.
    self.word2idx = load_chinese_base_vocab(vocab_path)
    self.idx2word = {idx: word for word, idx in self.word2idx.items()}
    self.tokenizer = Tokenizer(self.word2idx)
def __init__(self, sents_src, sents_tgt, vocab_path):
    """Seq2seq dataset: stores source/target sentence pairs and tokenizer state."""
    super(BertDataset, self).__init__()
    # Sentence pairs are read by the caller and handed in already parsed.
    self.sents_src = sents_src
    self.sents_tgt = sents_tgt
    # Simplified vocabulary plus its inverse mapping, and the tokenizer built on it.
    self.word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    self.idx2word = {idx: word for word, idx in self.word2idx.items()}
    self.tokenizer = Tokenizer(self.word2idx)
def read_corpus_2(dir_path, vocab_path):
    """Read the second dataset (Tang and Song poems) from ``dir_path``.

    Each file holds a Python/JSON-style list of poem dicts with "paragraphs" and
    "title" keys.  Traditional characters are converted to simplified, poems are
    classified by length/punctuation into four classical forms, and the form
    name is appended to the title as the source text.

    Returns (sents_src, sents_tgt): title + "##" + form, and the poem body.

    Fixes over the original: the cwd-relative ``os.path.isdir(file1)`` test now
    uses the joined path, and the OpenCC converter is built once instead of
    once per poem.
    """
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    # Traditional -> simplified converter; hoisted out of the per-poem loop.
    cc = opencc.OpenCC('t2s')
    files = os.listdir(dir_path)  # all entries in the corpus folder
    for file1 in files:
        file_path = os.path.join(dir_path, file1)
        if os.path.isdir(file_path):  # only open actual files, skip sub-directories
            continue
        print(file_path)
        with open(file_path) as f:
            # SECURITY: eval() executes arbitrary code from the file.  The data
            # looks like JSON (see the commented json.load in the original) —
            # prefer json.load(f) once the file format is confirmed.
            poem_list = eval(f.read())
        for each_poem in poem_list:
            poem = "".join(each_poem["paragraphs"])
            poem = cc.convert(poem)
            encode_text = tokenizer.encode(poem)[0]
            if word2idx["[UNK]"] in encode_text:
                # Skip poems containing out-of-vocab characters.
                continue
            title = cc.convert(each_poem["title"])
            if len(title) > 10 or len(title) < 1:
                # Filter out titles that are too long or too short.
                continue
            if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):
                # Five-character quatrain (wujue).
                sents_src.append(title + "##" + "五言绝句")
                sents_tgt.append(poem)
            elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):
                # Seven-character quatrain (qijue).
                sents_src.append(title + "##" + "七言绝句")
                sents_tgt.append(poem)
            elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):
                # Five-character regulated verse (wulv).
                sents_src.append(title + "##" + "五言律诗")
                sents_tgt.append(poem)
            elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):
                # Seven-character regulated verse (qilv).
                sents_src.append(title + "##" + "七言律诗")
                sents_tgt.append(poem)
    print("第二个诗句数据集共:" + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
def read_corpus_ci(dir_path, vocab_path):
    """Read the Song ci dataset from the sqlite database in ``dir_path``.

    Concatenates the paragraphs of every ci (skipping the " >> " / "词牌介绍"
    filler lines), filters out texts with out-of-vocab characters, and pairs
    each text with its tune name ("rhythmic") plus the "##词" marker.

    Returns (sents_src, sents_tgt).

    Fixes over the original: removed the dead ciauthor query and the dict
    literals it immediately overwrote, removed the Python-2
    ``reload``/``setdefaultencoding`` hack (dead on Python 3) along with the
    then-unused ``json``/``OrderedDict`` imports, and the sqlite connection is
    now closed when reading finishes.
    """
    import sqlite3

    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    sents_src = []
    sents_tgt = []
    conn = sqlite3.connect(dir_path + '/ci.db')
    try:
        cursor = conn.execute("SELECT rhythmic, author, content from ci;")
        for row in cursor:
            # row: (rhythmic/tune name, author, content with newline-separated paragraphs)
            string = ""
            for s in row[2].split('\n'):
                if s == " >> " or s == "词牌介绍":
                    continue  # skip navigation/filler lines from the source site
                string += s
            encode_text = tokenizer.encode(string)[0]
            if word2idx["[UNK]"] in encode_text:
                # Skip texts containing out-of-vocab characters.
                continue
            sents_src.append(row[0] + "##词")
            sents_tgt.append(string)
    finally:
        conn.close()
    print("词共: " + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
def __init__(self, vocab_path, size="base"):
    """T5 wrapper: selects the config for the requested size and wires up the
    vocab, tokenizer and special-token ids."""
    super().__init__()
    # Dispatch the size keyword to its config class.
    config_by_size = {"base": T5Config, "small": T5SmallConfig}
    if size not in config_by_size:
        raise Exception("not support this model type")
    self.model = T5ForConditionalGeneration(config_by_size[size]())
    self.word2idx = load_chinese_base_vocab(vocab_path)
    self.tokenizer = T5PegasusTokenizer(self.word2idx)
    # Special-token ids used during generation.
    self.bos_id = self.word2idx["[CLS]"]
    self.eos_id = self.word2idx["[SEP]"]
    self.unk_id = self.word2idx["[UNK]"]
def __init__(self, vocab_path, target_size, model_name="roberta"):
    """Sentence classifier: BERT/RoBERTa encoder plus a linear head of
    ``target_size`` outputs."""
    super(BertClsClassifier, self).__init__()
    self.word2ix = load_chinese_base_vocab(vocab_path)
    self.tokenizer = Tokenizer(self.word2ix)
    self.target_size = target_size
    # Pull in the encoder implementation matching the requested backbone.
    if model_name == "roberta":
        from bert_seq2seq.model.roberta_model import BertModel, BertConfig
    elif model_name == "bert":
        from bert_seq2seq.model.bert_model import BertConfig, BertModel
    else:
        raise Exception("model_name_err")
    config = BertConfig(len(self.word2ix))
    self.bert = BertModel(config)
    self.final_dense = nn.Linear(config.hidden_size, self.target_size)
def __init__(self, vocab_path, target_size, model_name="roberta"):
    """Sequence labeler: BERT/RoBERTa encoder, prediction-head transform,
    linear tag projection and a CRF layer on top."""
    super(BertSeqLabelingCRF, self).__init__()
    self.word2ix = load_chinese_base_vocab(vocab_path)
    self.target_size = target_size
    # Pull in the encoder implementation matching the requested backbone.
    if model_name == "roberta":
        from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertPredictionHeadTransform
    elif model_name == "bert":
        from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertPredictionHeadTransform
    else:
        raise Exception("model_name_err")
    config = BertConfig(len(self.word2ix))
    self.bert = BertModel(config)
    self.transform = BertPredictionHeadTransform(config)
    self.final_dense = nn.Linear(config.hidden_size, self.target_size)
    self.crf_layer = CRFLayer(self.target_size)
def __init__(self, vocab_path, model_name="roberta"):
    """Unified seq2seq model: BERT/RoBERTa encoder with an LM head whose
    weights are tied to the input word embeddings."""
    super(Seq2SeqModel, self).__init__()
    self.word2ix = load_chinese_base_vocab(vocab_path)
    self.tokenizer = Tokenizer(self.word2ix)
    # Pull in the encoder implementation matching the requested backbone.
    if model_name == "roberta":
        from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
    elif model_name == "bert":
        from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
    else:
        raise Exception("model_name_err")
    config = BertConfig(len(self.word2ix))
    self.bert = BertModel(config)
    # The LM head shares its weight matrix with the word embeddings.
    self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
    self.hidden_dim = config.hidden_size
    self.vocab_size = config.vocab_size
def __init__(self, vocab_path, predicate_num, model_name="roberta"):
    """Relation-extraction model: BERT/RoBERTa encoder with subject span
    prediction and per-predicate object span prediction heads."""
    super(BertRelationExtrac, self).__init__()
    self.word2ix = load_chinese_base_vocab(vocab_path)
    self.predicate_num = predicate_num
    # Pull in the encoder implementation matching the requested backbone.
    if model_name == "roberta":
        from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertPredictionHeadTransform, BertLayerNorm
    elif model_name == "bert":
        from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertPredictionHeadTransform, BertLayerNorm
    else:
        raise Exception("model_name_err")
    config = BertConfig(len(self.word2ix))
    self.bert = BertModel(config)
    self.layer_norm = BertLayerNorm(config.hidden_size)
    # Conditional variant of layer norm (conditioning input supplied in forward).
    self.layer_norm_cond = BertLayerNorm(config.hidden_size, conditional=True)
    # Two logits per position for the subject head.
    self.subject_pred = nn.Linear(config.hidden_size, 2)
    self.activation = nn.Sigmoid()
    # Two logits per position for each predicate type for the object head.
    self.object_pred = nn.Linear(config.hidden_size, 2 * self.predicate_num)
from tqdm import tqdm import torch.nn as nn from torch.optim import Adam import numpy as np import os import json import time import glob import bert_seq2seq from torch.utils.data import Dataset, DataLoader from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab from bert_seq2seq.utils import load_bert import re vocab_path = "./state_dict/roberta_wwm_vocab.txt" # roberta模型字典的位置 word2idx = load_chinese_base_vocab(vocab_path) model_name = "roberta" # 选择模型名字 model_path = "./state_dict/roberta_wwm_pytorch_model.bin" # 模型位置 recent_model_path = "./state_dict/bert_math_ques_model.bin" # 用于把已经训练好的模型继续训练 model_save_path = "./state_dict/bert_math_ques_model.bin" batch_size = 16 lr = 1e-5 maxlen = 256 train_data_path = "./state_dict/train.ape.json" val_data_path = "./state_dict/test.ape.json" def remove_bucket(equation): """去掉冗余的括号 """ l_buckets, buckets = [], [] for i, c in enumerate(equation):
import torch
import numpy as np
import os
import json
import time
import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

# Path of the fine-tuned auto-title checkpoint.
auto_title_model = "./state_dict/bert_auto_title_model.bin"

if __name__ == "__main__":
    # Inference script for the auto-title (headline generation) model.
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # which model architecture to use
    # Load the vocabulary.  simplfied=True also returns the kept token ids —
    # presumably a reduced vocab; confirm against load_chinese_base_vocab.
    word2idx, keep_tokens = load_chinese_base_vocab(vocab_path, simplfied=True)
    # Build the model and switch to inference mode.
    bert_model = load_bert(word2idx, model_name=model_name)
    bert_model.eval()
    # Load the fine-tuned weights on CPU.
    # Bug fix: torch.load was called without importing torch at the top of the file.
    bert_model.load_state_dict(torch.load(auto_title_model, map_location="cpu"), strict=False)
    test_data = ["针对央视3·15晚会曝光的电信行业乱象工信部在公告中表示将严查央视3·15晚会曝光通信违规违法行为工信部称已约谈三大运营商有关负责人并连夜责成三大运营商和所在省通信管理局进行调查依法依规严肃处理"]
    for text in test_data:
        print(bert_model.generate(text, beam_size=3))
from re import I
import torch
import sys
# Hard-coded local path so the in-repo package wins over any installed copy.
sys.path.append("/Users/xingzhaohu/Downloads/code/python/ml/ml_code/bert/bert_seq2seq")
from bert_seq2seq.utils import load_bert
import torch.nn.functional as F
from bert_seq2seq.bert_cls_classifier_sigmoid import BertClsClassifier
from bert_seq2seq.model.bert_model import BertConfig
from bert_seq2seq.tokenizer import load_chinese_base_vocab

if __name__ == '__main__':
    # Smoke test: construct a single-output sigmoid classifier from the vocab.
    # config = BertConfig(vocab_size=1000)
    word2ix = load_chinese_base_vocab("./state_dict/roberta_wwm_vocab.txt")
    # NOTE(review): the non-sigmoid BertClsClassifier raises "model_name_err"
    # for an empty model_name and takes a vocab *path*, not the word2ix dict —
    # confirm the sigmoid variant's constructor really accepts these arguments.
    model = BertClsClassifier(word2ix, target_size=1, model_name="")
num_samples=1) if eos_id == next_token.item(): break # pass output_ids.append(next_token.item()) # token_ids = torch.cat((token_ids, next_token.long().unsqueeze(0)), dim=1) input_decoder_ids = torch.cat( (input_decoder_ids, next_token.long().unsqueeze(0)), dim=1) return tokenizer.decode(output_ids) if __name__ == '__main__': config = T5Config() model = T5ForConditionalGeneration(config) checpoints = torch.load("./state_dict/t5-chinese/pytorch_model.bin") model.load_state_dict(checpoints) word2ix = load_chinese_base_vocab("./state_dict/t5-chinese/vocab.txt") tokenizer = T5PegasusTokenizer(word2ix) text = '从那之后,一发不可收拾。此后的近百年间,一共有十七位新娘在与君山一带失踪。有时十几年相安无事,有时短短一个月内失踪两名。一个恐怖传说迅速传开:与君山里住着一位鬼新郎,若是他看中了一位女子,便会在她出嫁的路上将她掳走,再把送亲的队伍吃掉。' out = sample_generate_encoder_decoder(model, text, tokenizer, word2ix["[SEP]"], word2ix["[CLS]"], device="cpu") print(out)
import pandas as pd import numpy as np import os import json import time import bert_seq2seq from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab from bert_seq2seq.utils import load_bert relation_extrac_model = "./state_dict/bert_model_relation_extrac.bin" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") vocab_path = "./state_dict/roberta_wwm_vocab.txt" # roberta模型字典的位置 model_name = "roberta" # 选择模型名字 # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin" # roberta模型位 # 加载字典 word2idx = load_chinese_base_vocab(vocab_path, simplfied=False) tokenizer = Tokenizer(word2idx) idx2word = {v: k for k, v in word2idx.items()} predicate2id, id2predicate = {}, {} with open('./corpus/三元组抽取/all_50_schemas') as f: for l in f: l = json.loads(l) if l['predicate'] not in predicate2id: id2predicate[len(predicate2id)] = l['predicate'] predicate2id[l['predicate']] = len(predicate2id) def search(pattern, sequence): """从sequence中寻找子串pattern 如果找到,返回第一个下标;否则返回-1。