def random_wrong(text):
    tokenizer = Tokenizer(VocabPath)
    length = len(text)
    position = random.randint(0, length - 1)
    number = random.randint(672, 7992)
    text = list(text)
    text[position] = tokenizer.id_to_token(number)
    text = ''.join(text)
    return text
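# Hypothetical usage sketch for random_wrong (not part of the repository), assuming
# VocabPath points at a valid vocab file and ids 672-7992 are all in-vocabulary;
# the function replaces one randomly chosen character with a random vocabulary token.
if __name__ == '__main__':
    clean = '今天天气很好'
    noisy = random_wrong(clean)
    # At most one position differs (the random token can coincide with the original).
    diff = [i for i, (a, b) in enumerate(zip(clean, noisy)) if a != b]
    print(noisy, diff)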
def __init__(self,
             number_of_categories,
             vocab_size=VocabSize,
             hidden=HiddenSize,
             max_len=MedicineLength,
             num_hidden_layers=HiddenLayerNum,
             attention_heads=AttentionHeadNum,
             dropout_prob=DropOut,
             intermediate_size=IntermediateSize):
    super(RobertaNer, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden
    self.max_len = max_len
    self.num_hidden_layers = num_hidden_layers
    self.attention_head_num = attention_heads
    self.dropout_prob = dropout_prob
    self.attention_head_size = hidden // attention_heads
    self.tokenizer = Tokenizer(VocabPath)
    self.intermediate_size = intermediate_size
    self.number_of_categories = number_of_categories
    # Declare the network layers
    self.roberta_emd = RobertaEmbeddings(vocab_size=self.vocab_size,
                                         max_len=self.max_len,
                                         hidden_size=self.hidden_size)
    self.transformer_blocks = nn.ModuleList(
        Transformer(hidden_size=self.hidden_size,
                    attention_head_num=self.attention_head_num,
                    attention_head_size=self.attention_head_size,
                    intermediate_size=self.intermediate_size).to(device)
        for _ in range(self.num_hidden_layers))
    self.mlm = Mlm(self.hidden_size, self.number_of_categories)
def __init__(self,
             number_of_categories,
             vocab_size=VocabSize,
             hidden=HiddenSize,
             max_len=SentenceLength,
             num_hidden_layers=HiddenLayerNum,
             attention_heads=AttentionHeadNum,
             dropout_prob=DropOut,
             intermediate_size=IntermediateSize):
    super(RobertaNer, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden
    self.max_len = max_len
    self.num_hidden_layers = num_hidden_layers
    self.attention_head_num = attention_heads
    self.dropout_prob = dropout_prob
    self.attention_head_size = hidden // attention_heads
    self.tokenizer = Tokenizer(VocabPath)
    self.intermediate_size = intermediate_size
    self.number_of_categories = number_of_categories
    # Declare the network layers
    self.roberta_emb = TokenEmbedding()
    self.position_emb = PositionEmbedding()
    self.bi_gru = BiGRU(self.number_of_categories, self.number_of_categories)
    self.transformer_blocks = nn.ModuleList(
        Transformer(hidden_size=self.hidden_size,
                    attention_head_num=self.attention_head_num,
                    attention_head_size=self.attention_head_size,
                    intermediate_size=self.intermediate_size).to(device)
        for _ in range(self.num_hidden_layers))
    self.mlm = Mlm(self.hidden_size, self.number_of_categories)
    self.crf = CRF(self.number_of_categories, batch_first=True)
class NerInference(object):
    def __init__(self):
        self.tokenizer = Tokenizer(VocabPath)
        with open(Class2NumFile, 'rb') as f:
            self.class_to_num = pickle.load(f)
        self.num_to_class = {}
        for k, v in self.class_to_num.items():
            self.num_to_class[v] = k
        try:
            self.model = torch.load(NerFinetunePath).to(device).eval()
        except:
            self.model = torch.load(NerFinetunePath, map_location='cpu').eval()
        print('加载模型完成!')

    def parse_inference_text(self, ori_line):
        ori_line = ori_line.strip().replace(' ', '')
        if len(list(ori_line)) > SentenceLength:
            print('文本过长!')
            return None, None
        input_tokens_id = []
        segment_ids = []
        for token in list(ori_line):
            id = self.tokenizer.token_to_id(token)
            input_tokens_id.append(id)
        for i in range(SentenceLength - len(input_tokens_id)):
            input_tokens_id.append(0)
        for x in input_tokens_id:
            if x:
                segment_ids.append(1)
            else:
                segment_ids.append(0)
        return input_tokens_id, segment_ids

    def inference_single(self, text):
        input_tokens_id, segment_ids = self.parse_inference_text(text)
        input_token = torch.tensor(input_tokens_id).unsqueeze(0).to(device)
        segment_ids = torch.tensor(segment_ids).unsqueeze(0).to(device)
        input_token_list = input_token.tolist()
        input_len = len([x for x in input_token_list[0] if x])
        mlm_output = self.model(input_token, segment_ids)[:, :input_len, :]
        output_tensor = torch.nn.Softmax(dim=-1)(mlm_output)
        output_topk = torch.topk(output_tensor, 1).indices.squeeze(0).tolist()
        output2class = []
        for i, output in enumerate(output_topk):
            output = output[0]
            # output2class.append((text[i], self.num_to_class[output]))
            output2class.append(self.num_to_class[output])
        return output2class
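# Hypothetical usage sketch (not part of the repository), assuming the finetuned
# checkpoint at NerFinetunePath and the pickled Class2NumFile exist;
# inference_single returns one class name per input character.
if __name__ == '__main__':
    ner = NerInference()
    text = '患者三天前出现咳嗽发热'
    labels = ner.inference_single(text)
    for ch, label in zip(text, labels):
        print(ch, label)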
class RobertaTestSet(Dataset):
    def __init__(self, test_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.test_path = test_path
        self.test_lines = []
        self.label_lines = []
        # Read the data: each line is "label_text-***-test_text"
        with open(self.test_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line_list = line.split('-***-')
                    self.test_lines.append(line_list[1])
                    self.label_lines.append(line_list[0])

    def __len__(self):
        return len(self.label_lines)

    def __getitem__(self, item):
        output = {}
        test_text = self.test_lines[item]
        label_text = self.label_lines[item]
        test_token = self.__gen_token(test_text)
        label_token = self.__gen_token(label_text)
        segment_ids = [1 if x else 0 for x in label_token]
        output['input_token_ids'] = test_token
        output['token_ids_labels'] = label_token
        output['segment_ids'] = segment_ids
        instance = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in output.items()
        }
        return instance

    def __gen_token(self, tokens):
        # [CLS] (101) + token ids + [SEP] (102), padded with 0 up to SentenceLength
        tar_token_ids = [101]
        tokens = list(tokens)
        tokens = tokens[:(SentenceLength - 2)]
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            tar_token_ids.append(token_id)
        tar_token_ids.append(102)
        if len(tar_token_ids) < SentenceLength:
            for i in range(SentenceLength - len(tar_token_ids)):
                tar_token_ids.append(0)
        return tar_token_ids
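# Hypothetical usage sketch (not part of the repository): RobertaTestSet is a
# standard torch Dataset, so it can be wrapped in a DataLoader directly.
# TestCorpusPath is a placeholder name; the real path comes from the project config.
from torch.utils.data import DataLoader

if __name__ == '__main__':
    test_set = RobertaTestSet(TestCorpusPath)
    test_loader = DataLoader(test_set, batch_size=8, shuffle=False)
    for batch in test_loader:
        print(batch['input_token_ids'].shape)  # (batch_size, SentenceLength)
        break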
def __init__(self):
    self.tokenizer = Tokenizer(VocabPath)
    self.src_lines = []
    self.tar_lines = []
    # Load the mapping from category name to id
    with open(Class2NumFile, 'rb') as f:
        self.class_to_num = pickle.load(f)
    # Read the training data
    with open(NerCorpusPath, 'r', encoding='utf-8') as f:
        for line in f:
            if line:
                line = line.strip()
                self.src_lines.append(line)
    for line in self.src_lines:
        items = line.split(',')
        input_tokens, input_tokens_id, input_tokens_class, input_tokens_class_id = items
        if not input_tokens:
            continue
        input_tokens_id = [int(x) for x in input_tokens_id.split(' ')]
        input_tokens_class_id = [int(x) for x in input_tokens_class_id.split(' ')]
        segment_ids = []
        for x in input_tokens_class_id:
            if x:
                segment_ids.append(1)
            else:
                segment_ids.append(0)
        tmp = {
            'input_tokens_id': input_tokens_id,
            'input_tokens_class_id': input_tokens_class_id,
            'segment_ids': segment_ids
        }
        tmp = {k: torch.tensor(v, dtype=torch.long) for k, v in tmp.items()}
        self.tar_lines.append(tmp)
def parse_source_data():
    """
    :return:
    [123, 233, 334, 221, 299, ..., ...]
    [ptzf, b-ypcf, i-ypcf, i-ypcf, e-ypcf, e-yplb, ..., pytzf, ...]
    """
    MaxLen = 0
    class2num = {'pad': 0, 'ptzf': 1}
    total_data = {}
    tokenizer = Tokenizer(VocabPath)
    input_path = os.path.join(NerSourcePath, 'data')
    label_path = os.path.join(NerSourcePath, 'label')
    f_train = open(NerCorpusPath, 'w', encoding='utf-8')
    # f_eval = open(NerEvalPath, 'w', encoding='utf-8')
    category_list = []
    relabel_list = []
    for data_file in os.listdir(input_path):
        label_word_pool = {}
        if '.txt' not in data_file:
            continue
        file_num = data_file.split('.')[0]
        f1 = open(os.path.join(input_path, data_file), 'r', encoding='utf-8')
        f2 = open(os.path.join(label_path, file_num + '.csv'), 'r', encoding='utf-8')
        # Normalize ASCII commas, since ',' is used as the field separator in the output
        sentence = f1.read().strip().replace(',', ',')
        # Initialize the per-file data structure
        total_data[int(file_num)] = {}
        total_data[int(file_num)]['sentence'] = sentence
        total_data[int(file_num)]['tokens_id'] = [0] * len(sentence)
        total_data[int(file_num)]['tokens_class'] = ['ptzf'] * len(sentence)
        total_data[int(file_num)]['tokens_class_num'] = [1] * len(sentence)
        # Store the token ids of the original sentence (101 is [CLS])
        for i, token in enumerate(sentence):
            id = tokenizer.token_to_id(token)
            if not id:
                print('警告!本地vocab缺少以下字符:%s!' % token)
                print(sentence)
                # 100 is [UNK]
                total_data[int(file_num)]['tokens_id'][i] = 100
            else:
                total_data[int(file_num)]['tokens_id'][i] = id
        label_lines = f2.readlines()[1:]
        for label_line in label_lines:
            label_line = label_line.split(',', 4)
            assert len(label_line) == 5
            category = label_line[1]
            begin = int(label_line[2])
            end = int(label_line[3])
            label_words = label_line[4].strip()
            category_list.append(category)
            # Verify that the labelled span matches the original text
            ori_words = sentence[begin:end + 1]
            if ori_words != label_words:
                print('标记位置错误:%s,%s!' % (file_num, label_words))
            # Check for overlapping annotations
            for j in range(begin, end + 1):
                if j in label_word_pool:
                    relabel_list.append(file_num)
                else:
                    label_word_pool[j] = 'ok'
            if category in ['QQ', 'vx', 'mobile', 'email']:
                continue
            if begin == end:
                if 'b' + category not in class2num:
                    class2num['b' + category] = len(class2num)
                total_data[int(file_num)]['tokens_class'][end] = 'b' + category
                total_data[int(file_num)]['tokens_class_num'][end] = class2num['b' + category]
            if end - begin > 0:
                if 'b' + category not in class2num:
                    class2num['b' + category] = len(class2num)
                if 'i' + category not in class2num:
                    class2num['i' + category] = len(class2num)
                total_data[int(file_num)]['tokens_class'][begin] = 'b' + category
                total_data[int(file_num)]['tokens_class'][begin + 1:end] = ['i' + category] * (end - begin)
                total_data[int(file_num)]['tokens_class_num'][begin] = class2num['b' + category]
                total_data[int(file_num)]['tokens_class_num'][begin + 1:end] = [class2num['i' + category]] * (end - begin)
    # Split long sentences on punctuation
    new_total_data = {}
    tmp_docker = ['', [], [], []]
    for num in total_data:
        if len(total_data[num]['sentence']) <= SentenceLength:
            tl = len(new_total_data)
            new_total_data[tl] = {}
            new_total_data[tl]['sentence'] = total_data[num]['sentence']
            new_total_data[tl]['tokens_id'] = total_data[num]['tokens_id']
            new_total_data[tl]['tokens_class'] = total_data[num]['tokens_class']
            new_total_data[tl]['tokens_class_num'] = total_data[num]['tokens_class_num']
            tmp_docker = ['', [], [], []]
        else:
            ts = list(total_data[num]['sentence'])
            ti = total_data[num]['tokens_id']
            tc = total_data[num]['tokens_class']
            tn = total_data[num]['tokens_class_num']
            for i, word in enumerate(ts):
                if word in [',', ',', '。', '?', '?', '!', '!', '~', ':', ':']:
                    if len(tmp_docker[0]) > MaxLen:
                        MaxLen = len(tmp_docker[0])
                    # Do not cut inside an entity, and do not cut very short fragments
                    if tc[i][0] == 'i' or 0 < len(tmp_docker[0]) < 10:
                        tmp_docker[0] += word
                        tmp_docker[1].append(ti[i])
                        tmp_docker[2].append(tc[i])
                        tmp_docker[3].append(tn[i])
                    else:
                        tl = len(new_total_data)
                        new_total_data[tl] = {}
                        new_total_data[tl]['sentence'] = tmp_docker[0]
                        new_total_data[tl]['tokens_id'] = tmp_docker[1]
                        new_total_data[tl]['tokens_class'] = tmp_docker[2]
                        new_total_data[tl]['tokens_class_num'] = tmp_docker[3]
                        tmp_docker = ['', [], [], []]
                        continue
                else:
                    tmp_docker[0] += word
                    tmp_docker[1].append(ti[i])
                    tmp_docker[2].append(tc[i])
                    tmp_docker[3].append(tn[i])
    # print(list(set(relabel_list)))
    print('最长句子为:', MaxLen)
    print(set(category_list))
    # Pad every sentence to SentenceLength
    total_data = new_total_data
    for num in total_data:
        difference = SentenceLength - len(total_data[num]['sentence'])
        total_data[num]['tokens_id'].extend([0] * difference)
        total_data[num]['tokens_class'].extend(['pad'] * difference)
        total_data[num]['tokens_class_num'].extend([class2num['pad']] * difference)
        total_data[num]['tokens_id'] = [str(x) for x in total_data[num]['tokens_id']]
        total_data[num]['tokens_class_num'] = [str(x) for x in total_data[num]['tokens_class_num']]
    # Persist the category-to-id mapping
    with open(Class2NumFile, 'wb') as f:
        pickle.dump(class2num, f)
    for num in total_data:
        # An optional random train/eval split was disabled here; everything goes to the training file
        if total_data[num]['sentence']:
            f_train.write(total_data[num]['sentence'] + ',' +
                          ' '.join(total_data[num]['tokens_id']) + ',' +
                          ' '.join(total_data[num]['tokens_class']) + ',' +
                          ' '.join(total_data[num]['tokens_class_num']) + '\n')
class NerInference(object):
    def __init__(self):
        self.NerClassDict = NerClassDict
        self.tokenizer = Tokenizer(VocabPath)
        with open(Class2NumFile, 'rb') as f:
            self.class_to_num = pickle.load(f)
        self.num_to_class = {}
        for k, v in self.class_to_num.items():
            self.num_to_class[v] = k
        self.model = torch.load(NerFinetunePath).to(device).eval()
        print('加载模型完成!')

    def parse_inference_text(self, ori_line):
        ori_line = ori_line.strip().replace(' ', '')
        if len(list(ori_line)) > MedicineLength - 2:
            print('文本过长!')
            return None, None
        input_tokens_id = [101]
        segment_ids = []
        for token in list(ori_line):
            id = self.tokenizer.token_to_id(token)
            input_tokens_id.append(id)
        input_tokens_id.append(102)
        for i in range(MedicineLength - len(input_tokens_id)):
            input_tokens_id.append(0)
        for x in input_tokens_id:
            if x:
                segment_ids.append(1)
            else:
                segment_ids.append(0)
        return input_tokens_id, segment_ids

    def inference_single(self, text):
        input_tokens_id, segment_ids = self.parse_inference_text(text)
        input_token = torch.tensor(input_tokens_id).unsqueeze(0).to(device)
        segment_ids = torch.tensor(segment_ids).unsqueeze(0).to(device)
        input_token_list = input_token.tolist()
        input_len = len([x for x in input_token_list[0] if x]) - 2
        # Drop the [CLS]/[SEP] positions before decoding the per-character classes
        mlm_output = self.model(input_token, segment_ids)[:, 1:input_len + 1, :]
        output_tensor = torch.nn.Softmax(dim=-1)(mlm_output)
        output_topk = torch.topk(output_tensor, 1).indices.squeeze(0).tolist()
        output2class = []
        result = []
        for i, output in enumerate(output_topk):
            output = output[0]
            output2class.append(self.num_to_class[output])
        # Group consecutive tags into entities and map them to readable class names
        entities = extract_output_entities(output2class)
        for key, val in entities.items():
            entity_len = len(val)
            current_text = ''
            current_entity = self.NerClassDict[val[0][1:]]
            for i in range(entity_len):
                current_text += text[key + i]
            result.append((current_text, current_entity))
        print('输入数据为:', text)
        print('实体识别结果为:', result)
        return result
def parse_ori_line(ori_line, class_to_num):
    """
    :param ori_line: 六味地黄{3,ypcf}丸{1,yplb}
    :return:
    [101, 123, 233, 334, 221, 299, ..., 102, ...]
    [ptzf, b-ypcf, i-ypcf, i-ypcf, e-ypcf, e-yplb, ..., pytzf, ...]
    """
    ori_line = ori_line.strip().replace(' ', '')
    input_tokens = ''
    input_tokens_id = []
    input_tokens_class = []
    input_tokens_class_id = []
    tokenizer = Tokenizer(VocabPath)
    i = 0
    l = 0
    ori_line_list = list(ori_line)
    while i < len(ori_line_list):
        if ori_line_list[i] != '{' and ori_line_list[i] != '}':
            input_tokens += ori_line_list[i]
            input_tokens_class.append(NormalChar)
            i += 1
            l += 1
        if i < len(ori_line_list) and ori_line_list[i] == '{':
            # A "{length,type}" annotation follows the entity it describes
            current_type = ''
            current_len = ''
            j = i
            while True:
                j += 1
                if ori_line_list[j].isdigit():
                    current_len += ori_line_list[j]
                if ori_line_list[j] == ',':
                    break
            while True:
                j += 1
                if ori_line_list[j] == '}':
                    break
                current_type += ori_line_list[j]
            current_len = int(current_len)
            # Re-tag the preceding current_len characters with b/i/e tags
            if current_len == 1:
                input_tokens_class[l - 1] = 'e' + current_type
            elif current_len == 2:
                input_tokens_class[l - 2] = 'b' + current_type
                input_tokens_class[l - 1] = 'e' + current_type
            else:
                input_tokens_class[l - current_len] = 'b' + current_type
                input_tokens_class[l - 1] = 'e' + current_type
                for k in range(current_len - 2):
                    input_tokens_class[l - 2 - k] = 'i' + current_type
            i = j + 1
    for token in input_tokens:
        id = tokenizer.token_to_id(token)
        if not id:
            print('警告!本地vocab缺少以下字符:%s!' % token)
            continue
        input_tokens_id.append(id)
    # Pad the class sequence
    if len(input_tokens_id) > MedicineLength - 2:
        return None, None, None, None, class_to_num
    else:
        input_tokens_id.append(102)
        input_tokens_class.append(NormalChar)
        for i in range(MedicineLength - len(input_tokens_id) - 1):
            input_tokens_id.append(0)
            input_tokens_class.append('pad')
        # Map the textual classes to numeric ids
        input_tokens_id = [101] + input_tokens_id
        input_tokens_class = [NormalChar] + input_tokens_class
        for token_class in input_tokens_class:
            if token_class in class_to_num:
                input_tokens_class_id.append(class_to_num[token_class])
            else:
                class_to_num[token_class] = len(class_to_num)
                input_tokens_class_id.append(class_to_num[token_class])
        return input_tokens, input_tokens_id, input_tokens_class, input_tokens_class_id, class_to_num
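# Small self-contained sketch (not part of the repository) that isolates the tag
# expansion rule applied by parse_ori_line when it reads a "{length,type}" annotation:
# the preceding `length` characters are tagged b<type> ... i<type> ... e<type>,
# and a single character receives only e<type>.
def expand_tags(length, category):
    if length == 1:
        return ['e' + category]
    return ['b' + category] + ['i' + category] * (length - 2) + ['e' + category]

print(expand_tags(3, 'ypcf'))   # ['bypcf', 'iypcf', 'eypcf']
print(expand_tags(1, 'yplb'))   # ['eyplb']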
class DataFactory(object):
    def __init__(self):
        self.tokenizer = Tokenizer(VocabPath)
        self.seg = pkuseg.pkuseg()
        self.vocab_size = self.tokenizer._vocab_size
        self.token_pad_id = self.tokenizer._token_pad_id
        self.token_cls_id = self.tokenizer._token_start_id
        self.token_sep_id = self.tokenizer._token_end_id
        self.token_mask_id = self.tokenizer._token_mask_id

    def __token_process(self, token_id):
        """
        With probability 0.8 replace the token with [MASK],
        with probability 0.1 keep it unchanged,
        and with probability 0.1 replace it with a random token.
        """
        rand = np.random.random()
        if rand <= 0.8:
            return self.token_mask_id
        elif rand <= 0.9:
            return token_id
        else:
            return np.random.randint(0, self.vocab_size)

    def texts_to_ids(self, texts):
        texts_ids = []
        for text in texts:
            # Process each sentence
            if ModelClass == 'RobertaMlm':
                # Note: RoBERTa does not mask character by character only;
                # masking is applied to characters or whole words
                words = self.seg.cut(text)
                for word in words:
                    # tokenize() adds [CLS]/[SEP] at both ends; strip them here
                    word_tokes = self.tokenizer.tokenize(text=word)[1:-1]
                    words_ids = self.tokenizer.tokens_to_ids(word_tokes)
                    texts_ids.append(words_ids)
            else:
                for word in text:
                    # tokenize() adds [CLS]/[SEP] at both ends; strip them here
                    word_tokes = self.tokenizer.tokenize(text=word)[1:-1]
                    words_ids = self.tokenizer.tokens_to_ids(word_tokes)
                    texts_ids.append(words_ids)
        return texts_ids

    def ids_to_mask(self, texts_ids):
        instances = []
        total_ids = []
        total_masks = []
        # Draw one probability per character or word to decide whether it is masked
        mask_rates = np.random.random(len(texts_ids))
        for i, word_id in enumerate(texts_ids):
            total_ids.extend(word_id)
            if mask_rates[i] < MaskRate:
                # word_id may hold a single character or a whole word
                for sub_id in word_id:
                    total_masks.append(self.__token_process(sub_id))
            else:
                total_masks.extend([0] * len(word_id))
        # Each instance is at most SentenceLength (512) tokens, so a paragraph is
        # cut into chunks; 510 = 512 - 2 leaves room for [CLS] and [SEP]
        for i in range(math.ceil(len(total_ids) / (SentenceLength - 2))):
            tmp_ids = [self.token_cls_id]
            tmp_masks = [self.token_pad_id]
            tmp_ids.extend(total_ids[i * (SentenceLength - 2):min((i + 1) * (SentenceLength - 2), len(total_ids))])
            tmp_masks.extend(total_masks[i * (SentenceLength - 2):min((i + 1) * (SentenceLength - 2), len(total_masks))])
            # Pad instances shorter than SentenceLength
            diff = SentenceLength - len(tmp_ids)
            if diff == 1:
                tmp_ids.append(self.token_sep_id)
                tmp_masks.append(self.token_pad_id)
            else:
                # Append the end token
                tmp_ids.append(self.token_sep_id)
                tmp_masks.append(self.token_pad_id)
                # Pad the remaining positions
                tmp_ids.extend([self.token_pad_id] * (diff - 1))
                tmp_masks.extend([self.token_pad_id] * (diff - 1))
            instances.append([tmp_ids, tmp_masks])
        return instances

    def ids_all_mask(self, texts_ids, tokenid2count):
        instances = []
        tmp_ids = [101]
        # Flatten the ids into a single [CLS] ... [SEP] sequence
        for token_ids in texts_ids:
            if isinstance(token_ids, list):
                for token_id in token_ids:
                    tmp_ids.append(token_id)
                    if len(tmp_ids) == SentenceLength - 1:
                        break
            else:
                tmp_ids.append(token_ids)
                if len(tmp_ids) == SentenceLength - 1:
                    break
            if len(tmp_ids) == SentenceLength - 1:
                break
        tmp_ids.append(102)
        input_length = len(tmp_ids) - 2
        if len(tmp_ids) < SentenceLength:
            for i in range(SentenceLength - len(tmp_ids)):
                tmp_ids.append(0)
        for i in range(1, input_length + 1):
            # If a character occurs rarely, force extra training instances for it
            if tokenid2count[tmp_ids[i]] < WordGenTimes:
                for j in range(WordGenTimes - tokenid2count[tmp_ids[i]]):
                    tmp_masks = [0] * SentenceLength
                    rand_num = np.random.randint(672, 7992)
                    tmp_masks[i] = rand_num
                    instances.append([tmp_ids, tmp_masks])
            tmp_masks = [0] * SentenceLength
            if random.random() < RanWrongDivisor:
                rand_num = np.random.randint(672, 7992)
                tmp_masks[i] = rand_num
            else:
                tmp_masks[i] = tmp_ids[i]
            instances.append([tmp_ids, tmp_masks])
        return instances
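# Self-contained sketch (illustrative only, with made-up ids mask_id=103 and
# vocab_size=8000) that isolates the 80%/10%/10% masking rule implemented by
# DataFactory.__token_process, plus a rough empirical check of the proportions.
import numpy as np

def mask_token(token_id, mask_id=103, vocab_size=8000):
    rand = np.random.random()
    if rand <= 0.8:          # 80%: replace with [MASK]
        return mask_id
    elif rand <= 0.9:        # 10%: keep the original token
        return token_id
    else:                    # 10%: replace with a random token
        return np.random.randint(0, vocab_size)

if __name__ == '__main__':
    outs = [mask_token(42) for _ in range(10000)]
    print(sum(o == 103 for o in outs) / len(outs))   # close to 0.8
    print(sum(o == 42 for o in outs) / len(outs))    # a bit above 0.1 (random draws can also hit 42)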
class EvalDataGenerator(object):
    def __init__(self, corpus_path, c2n_pickle_path):
        self.data_tuple = []
        self.corpus_path = corpus_path
        if self.corpus_path == OceEvalPath:
            self.type_id = 0
        if self.corpus_path == OcnEvalPath:
            self.type_id = 1
        if self.corpus_path == TnewsEvalPath:
            self.type_id = 2
        self.tokenizer = Tokenizer(CharsVocabPath)
        with open(c2n_pickle_path, 'rb') as f:
            self.classes2num = pickle.load(f)
        with open(self.corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line = line.split('\t')
                    if line[0] and line[1]:
                        self.data_tuple.append([self.classes2num[line[0]], line[1]])
        self.source_eval_data = self.data_tuple
        random.shuffle(self.data_tuple)

    def reset_batch(self):
        self.data_tuple = self.source_eval_data
        random.shuffle(self.data_tuple)

    def gen_next_batch(self, batch_size):
        output = {}
        batch_max_len = 0
        if len(self.data_tuple) >= batch_size:
            current_tuple = self.data_tuple[:batch_size]
            self.data_tuple = self.data_tuple[batch_size:]
        else:
            return None
        label_list = []
        tokens_list = []
        segments_list = []
        for x in current_tuple:
            # Offset the label so each task's classes start from zero
            if self.type_id == 0:
                label_list.append(x[0])
            if self.type_id == 1:
                label_list.append(x[0] - 7)
            if self.type_id == 2:
                label_list.append(x[0] - 10)
            token_ids = self.tokenizer.tokens_to_ids(['[CLS]'] + x[1].split(' '))
            if len(token_ids) > batch_max_len:
                batch_max_len = len(token_ids)
            tokens_list.append(token_ids)
            segments_list.append([1] * len(token_ids))
        batch_max_len = min(batch_max_len, SentenceLength)
        for i, tokens in enumerate(tokens_list):
            if len(tokens) < batch_max_len:
                tokens_list[i] = tokens_list[i] + [0] * (batch_max_len - len(tokens))
                segments_list[i] = segments_list[i] + [0] * (batch_max_len - len(tokens))
            else:
                tokens_list[i] = tokens_list[i][:batch_max_len]
                segments_list[i] = segments_list[i][:batch_max_len]
        output['type_id'] = [self.type_id]
        output['input_token_ids'] = tokens_list
        output['position_ids'] = [[x for x in range(batch_max_len)]]
        output['segment_ids'] = segments_list
        output['token_ids_labels'] = label_list
        instance = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in output.items()
        }
        return instance
class TrainDataGenerator(object):
    def __init__(self, oce_corpus_path, ocn_corpus_path, tnews_corpus_path, c2n_pickle_path):
        self.oce_data_tuple = []
        self.ocn_data_tuple = []
        self.tnews_data_tuple = []
        self.tokenizer = Tokenizer(CharsVocabPath)
        with open(c2n_pickle_path, 'rb') as f:
            self.classes2num = pickle.load(f)
        with open(oce_corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line = line.split('\t')
                    if line[0] and line[1]:
                        self.oce_data_tuple.append([self.classes2num[line[0]], line[1]])
        with open(ocn_corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line = line.split('\t')
                    if line[0] and line[1]:
                        self.ocn_data_tuple.append([self.classes2num[line[0]] - 7, line[1]])
        with open(tnews_corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line = line.split('\t')
                    if line[0] and line[1]:
                        self.tnews_data_tuple.append([self.classes2num[line[0]] - 10, line[1]])
        self.source_oce_data = self.oce_data_tuple
        self.source_ocn_data = self.ocn_data_tuple
        self.source_tnews_data = self.tnews_data_tuple
        random.shuffle(self.oce_data_tuple)
        random.shuffle(self.ocn_data_tuple)
        random.shuffle(self.tnews_data_tuple)

    def get_length(self):
        return len(self.oce_data_tuple), len(self.ocn_data_tuple), len(self.tnews_data_tuple)

    def ret_batch(self):
        self.oce_data_tuple = self.source_oce_data
        self.ocn_data_tuple = self.source_ocn_data
        self.tnews_data_tuple = self.source_tnews_data
        random.shuffle(self.oce_data_tuple)
        random.shuffle(self.ocn_data_tuple)
        random.shuffle(self.tnews_data_tuple)

    def gen_next_batch(self, oce_batch_size, ocn_batch_size, tnews_batch_size):
        output = {}
        batch_max_len = 0
        if len(self.oce_data_tuple) >= oce_batch_size and \
                len(self.ocn_data_tuple) >= ocn_batch_size and \
                len(self.tnews_data_tuple) >= tnews_batch_size:
            oce_current_tuple = self.oce_data_tuple[:oce_batch_size]
            ocn_current_tuple = self.ocn_data_tuple[:ocn_batch_size]
            tnews_current_tuple = self.tnews_data_tuple[:tnews_batch_size]
            self.oce_data_tuple = self.oce_data_tuple[oce_batch_size:]
            self.ocn_data_tuple = self.ocn_data_tuple[ocn_batch_size:]
            self.tnews_data_tuple = self.tnews_data_tuple[tnews_batch_size:]
        else:
            return None
        type_list = []
        label_list = []
        tokens_list = []
        segments_list = []
        for x in oce_current_tuple:
            type_list.append([0])
            label_list.append(x[0])
            token_ids = self.tokenizer.tokens_to_ids(['[CLS]'] + x[1].split(' '))
            if len(token_ids) > batch_max_len:
                batch_max_len = len(token_ids)
            tokens_list.append(token_ids)
            segments_list.append([1] * len(token_ids))
        for x in ocn_current_tuple:
            type_list.append([1])
            label_list.append(x[0])
            token_ids = self.tokenizer.tokens_to_ids(['[CLS]'] + x[1].split(' '))
            if len(token_ids) > batch_max_len:
                batch_max_len = len(token_ids)
            tokens_list.append(token_ids)
            segments_list.append([1] * len(token_ids))
        for x in tnews_current_tuple:
            type_list.append([2])
            label_list.append(x[0])
            token_ids = self.tokenizer.tokens_to_ids(['[CLS]'] + x[1].split(' '))
            if len(token_ids) > batch_max_len:
                batch_max_len = len(token_ids)
            tokens_list.append(token_ids)
            segments_list.append([1] * len(token_ids))
        batch_max_len = min(batch_max_len, SentenceLength)
        for i, tokens in enumerate(tokens_list):
            if len(tokens) < batch_max_len:
                tokens_list[i] = tokens_list[i] + [0] * (batch_max_len - len(tokens))
                segments_list[i] = segments_list[i] + [0] * (batch_max_len - len(tokens))
            else:
                tokens_list[i] = tokens_list[i][:batch_max_len]
                segments_list[i] = segments_list[i][:batch_max_len]
        output['type_id'] = type_list
        output['input_token_ids'] = tokens_list
        output['position_ids'] = [[x for x in range(batch_max_len)]
                                  for i in range(oce_batch_size + ocn_batch_size + tnews_batch_size)]
        output['segment_ids'] = segments_list
        output['token_ids_labels'] = label_list
        instance = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in output.items()
        }
        return instance
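# Hypothetical multi-task training-loop sketch (not part of the repository), assuming
# the three corpus files and the class-to-id pickle exist; OceTrainPath, OcnTrainPath,
# TnewsTrainPath and C2NPicklePath are placeholder names and the batch sizes are
# illustrative. gen_next_batch returns None once any of the three task queues runs out.
if __name__ == '__main__':
    generator = TrainDataGenerator(OceTrainPath, OcnTrainPath, TnewsTrainPath, C2NPicklePath)
    while True:
        batch = generator.gen_next_batch(oce_batch_size=16, ocn_batch_size=8, tnews_batch_size=8)
        if batch is None:
            generator.ret_batch()  # reshuffle all three corpora for the next epoch
            break
        print(batch['input_token_ids'].shape, batch['token_ids_labels'].shape)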
def parse_new_data():
    """
    :return:
    [123, 233, 334, 221, 299, ..., ...]
    [ptzf, b-ypcf, i-ypcf, i-ypcf, e-ypcf, e-yplb, ..., pytzf, ...]
    """
    with open(Class2NumFile, 'rb') as f:
        class2num = pickle.load(f)
    # class2num = {'pad': 0, 'ptzf': 1}
    new_train_data = {}
    new_eval_data = {}
    tokenizer = Tokenizer(VocabPath)
    input_path = 'data/train_new'
    eval_path = 'data/eval_new'
    f_train = open(NerCorpusPath, 'a+', encoding='utf-8')
    f_eval = open(NerEvalPath, 'w', encoding='utf-8')
    category_list = []
    for data_file in os.listdir(input_path):
        if '.txt' not in data_file:
            continue
        file_num = data_file.split('.')[0]
        f1 = open(os.path.join(input_path, data_file), 'r', encoding='utf-8')
        lines = f1.readlines()
        # Normalize ASCII commas, since ',' is used as the field separator in the output
        lines = [x.strip().replace(',', ',') for x in lines if x][:-1]
        new_train_data[file_num] = {}
        new_train_data[file_num]['sentence'] = ''
        new_train_data[file_num]['tokens_id'] = []
        new_train_data[file_num]['tokens_class'] = []
        new_train_data[file_num]['tokens_class_num'] = []
        for i, line in enumerate(lines):
            try:
                ch, label = tuple(line.lower().split(' '))
            except:
                print(file_num)
                print(i)
                print(line)
                print('\n')
                ch = ','
                label = 'o'
            new_train_data[file_num]['sentence'] += ch
            new_train_data[file_num]['tokens_id'].append(tokenizer.token_to_id(ch))
            if label == 'o' or label == '0':
                token_class = 'ptzf'
                token_class_num = 1
            else:
                token_class = label.lower().replace('-', '')
                if token_class[1:] in ['qq', 'vx', 'mobile', 'email']:
                    token_class = 'ptzf'
                if token_class != 'ptzf':
                    category_list.append(token_class[1:])
                if token_class in class2num:
                    token_class_num = class2num[token_class]
                else:
                    token_class_num = len(class2num)
                    class2num[token_class] = token_class_num
            new_train_data[file_num]['tokens_class'].append(token_class)
            new_train_data[file_num]['tokens_class_num'].append(token_class_num)
    for data_file in os.listdir(eval_path):
        if '.txt' not in data_file:
            continue
        file_num = data_file.split('.')[0]
        f1 = open(os.path.join(eval_path, data_file), 'r', encoding='utf-8')
        lines = f1.readlines()
        lines = [x.strip().replace(',', ',') for x in lines if x][:-1]
        new_eval_data[file_num] = {}
        new_eval_data[file_num]['sentence'] = ''
        new_eval_data[file_num]['tokens_id'] = []
        new_eval_data[file_num]['tokens_class'] = []
        new_eval_data[file_num]['tokens_class_num'] = []
        for i, line in enumerate(lines):
            try:
                ch, label = tuple(line.lower().split(' '))
            except:
                print(file_num)
                print(i)
                print(line)
                print('\n')
                ch = ','
                label = 'o'
            new_eval_data[file_num]['sentence'] += ch
            new_eval_data[file_num]['tokens_id'].append(tokenizer.token_to_id(ch))
            if label == 'o':
                token_class = 'ptzf'
                token_class_num = 1
            else:
                token_class = label.lower().replace('-', '')
                if token_class[1:] in ['qq', 'vx', 'mobile', 'email']:
                    token_class = 'ptzf'
                if token_class != 'ptzf':
                    category_list.append(token_class[1:])
                token_class_num = class2num[token_class]
            new_eval_data[file_num]['tokens_class'].append(token_class)
            new_eval_data[file_num]['tokens_class_num'].append(token_class_num)
    print(set(category_list))
    # Pad every sentence to SentenceLength
    for num in new_train_data:
        difference = SentenceLength - len(new_train_data[num]['sentence'])
        new_train_data[num]['tokens_id'].extend([0] * difference)
        new_train_data[num]['tokens_class'].extend(['pad'] * difference)
        new_train_data[num]['tokens_class_num'].extend([class2num['pad']] * difference)
        new_train_data[num]['tokens_id'] = [str(x) for x in new_train_data[num]['tokens_id']]
        new_train_data[num]['tokens_class_num'] = [str(x) for x in new_train_data[num]['tokens_class_num']]
    for num in new_eval_data:
        difference = SentenceLength - len(new_eval_data[num]['sentence'])
        new_eval_data[num]['tokens_id'].extend([0] * difference)
        new_eval_data[num]['tokens_class'].extend(['pad'] * difference)
        new_eval_data[num]['tokens_class_num'].extend([class2num['pad']] * difference)
        new_eval_data[num]['tokens_id'] = [str(x) for x in new_eval_data[num]['tokens_id']]
        new_eval_data[num]['tokens_class_num'] = [str(x) for x in new_eval_data[num]['tokens_class_num']]
    # Persist the category-to-id mapping
    # with open(Class2NumFile, 'wb') as f:
    #     pickle.dump(class2num, f)
    for num in new_train_data:
        if new_train_data[num]['sentence']:
            f_train.write(new_train_data[num]['sentence'] + ',' +
                          ' '.join(new_train_data[num]['tokens_id']) + ',' +
                          ' '.join(new_train_data[num]['tokens_class']) + ',' +
                          ' '.join(new_train_data[num]['tokens_class_num']) + '\n')
    for num in new_eval_data:
        if new_eval_data[num]['sentence']:
            f_eval.write(new_eval_data[num]['sentence'] + ',' +
                         ' '.join(new_eval_data[num]['tokens_id']) + ',' +
                         ' '.join(new_eval_data[num]['tokens_class']) + ',' +
                         ' '.join(new_eval_data[num]['tokens_class_num']) + '\n')