def compute_train_pre(self, label_paths, pred_paths):
    """Compute entity-level precision for each batch during training."""
    found = []
    right = []
    for label_path, pre_path in zip(label_paths, pred_paths):
        label_entities = get_entities(label_path, self.id2label)
        pre_entities = get_entities(pre_path, self.id2label)
        found.extend(pre_entities)
        right.extend([pre_entity for pre_entity in pre_entities
                      if pre_entity in label_entities])
    return 0 if len(found) == 0 else len(right) / len(found)
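# A worked example of the precision computed above, assuming get_entities
# returns (type, start, end) tuples (an assumption inferred from how the
# tuples are compared here):
#
#   label_path = ['B-PER', 'I-PER', 'O', 'B-LOC']
#   pred_path  = ['B-PER', 'I-PER', 'O', 'O']
#   gold  entities -> [('PER', 0, 1), ('LOC', 3, 3)]
#   found entities -> [('PER', 0, 1)]
#   right entities -> [('PER', 0, 1)]     # predicted entities also in gold
#   precision = len(right) / len(found) = 1 / 1 = 1.0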
def update(self, label_paths, pred_paths):
    '''Accumulate gold and predicted entities for metric computation.

    :param label_paths: list of gold tag sequences, e.g. [[], [], ...]
    :param pred_paths: list of predicted tag sequences, e.g. [[], [], ...]

    Example:
        >>> label_paths = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> pred_paths = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    '''
    for label_path, pre_path in zip(label_paths, pred_paths):
        label_entities = get_entities(label_path, self.id2label, self.markup)
        pre_entities = get_entities(pre_path, self.id2label, self.markup)
        self.origins.extend(label_entities)
        self.founds.extend(pre_entities)
        self.rights.extend([pre_entity for pre_entity in pre_entities
                            if pre_entity in label_entities])
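# update() only accumulates entity tuples; turning them into scores needs a
# companion method. A minimal sketch, assuming the standard micro-averaged
# entity-level definitions (the method name and return format are
# illustrative, not taken from the original class):
def result(self):
    """Compute entity-level precision/recall/F1 from the accumulated lists."""
    origin, found, right = len(self.origins), len(self.founds), len(self.rights)
    precision = 0.0 if found == 0 else right / found
    recall = 0.0 if origin == 0 else right / origin
    f1 = 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)
    return {'precision': precision, 'recall': recall, 'f1': f1}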
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = line['words']
        labels = line['labels']
        subject = get_entities(labels, id2label=None, markup='bios')
        examples.append(InputExample(guid=guid, text_a=text_a, subject=subject))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:  # skip the first line
            continue
        guid = "%s-%s" % (set_type, i)
        text_a = line['words']
        # Map BMES-style tags to BIO: M-X and E-X both become I-X.
        labels = []
        for x in line['labels']:
            if 'M-' in x:
                labels.append(x.replace('M-', 'I-'))
            elif 'E-' in x:
                labels.append(x.replace('E-', 'I-'))
            else:
                labels.append(x)
        if i < 2:  # debug: peek at the first converted example
            print(text_a)
            print(labels)
        subject = get_entities(labels, id2label=None, markup='bios')
        examples.append(InputExample(guid=guid, text_a=text_a, subject=subject))
    return examples
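# The per-character replacement above implements a BMES -> BIO mapping
# (M-X and E-X both collapse to I-X). A self-contained sketch of the same
# idea as a reusable helper; the function name is hypothetical:
def bmes_to_bio(tags):
    """Map BMES-style tags to BIO by rewriting M-/E- prefixes to I-."""
    return ['I-' + t[2:] if t.startswith(('M-', 'E-')) else t for t in tags]

# Example: ['B-LOC', 'M-LOC', 'E-LOC', 'O'] -> ['B-LOC', 'I-LOC', 'I-LOC', 'O']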
def predict(args, model, processor):
    model_path = args.output_dir / 'best-model.bin'
    model = load_model(model, model_path=str(model_path))
    test_data = []
    with open(str(args.data_dir / "test.json"), 'r') as f:
        idx = 0
        for line in f:
            json_d = {}
            line = json.loads(line.strip())
            text = line['text']
            words = list(text)
            labels = ['O'] * len(words)
            json_d['id'] = idx
            json_d['context'] = " ".join(words)
            json_d['tag'] = " ".join(labels)
            json_d['raw_context'] = "".join(words)
            idx += 1
            test_data.append(json_d)
    pbar = ProgressBar(n_total=len(test_data))
    results = []
    for step, line in enumerate(test_data):
        token_a = line['context'].split(" ")
        input_ids = [processor.vocab.to_index(w) for w in token_a]
        input_mask = [1] * len(token_a)
        input_lens = [len(token_a)]
        model.eval()
        with torch.no_grad():
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            input_lens = torch.tensor([input_lens], dtype=torch.long)
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            features = model.forward_loss(input_ids, input_mask, input_lens, input_tags=None)
            tags, _ = model.crf._obtain_labels(features, args.id2label, input_lens)
        label_entities = get_entities(tags[0], args.id2label)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(tags[0])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step=step)
    print(" ")
    output_predic_file = str(args.output_dir / "test_prediction.json")
    output_submit_file = str(args.output_dir / "test_submit.json")
    with open(output_predic_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    test_text = []
    with open(str(args.data_dir / 'test.json'), 'r') as fr:
        for line in fr:
            test_text.append(json.loads(line))
    test_submit = []
    for x, y in zip(test_text, results):
        json_d = {}
        json_d['id'] = x['id']
        json_d['label'] = {}
        entities = y['entities']
        words = list(x['text'])
        if len(entities) != 0:
            for subject in entities:
                tag = subject[0]
                start = subject[1]
                end = subject[2]
                word = "".join(words[start:end + 1])
                if tag in json_d['label']:
                    if word in json_d['label'][tag]:
                        json_d['label'][tag][word].append([start, end])
                    else:
                        json_d['label'][tag][word] = [[start, end]]
                else:
                    json_d['label'][tag] = {}
                    json_d['label'][tag][word] = [[start, end]]
        test_submit.append(json_d)
    json_to_text(output_submit_file, test_submit)
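# predict() relies on a json_to_text helper that is not shown here. Judging
# from the call site (path first, list of dicts second), it likely writes one
# JSON object per line; a minimal sketch under that assumption:
import json

def json_to_text(file_path, data):
    """Write each record in `data` as one JSON line in `file_path`."""
    with open(file_path, 'w', encoding='utf-8') as fw:
        for record in data:
            fw.write(json.dumps(record, ensure_ascii=False) + '\n')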
def predict(args, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model(args)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                tokens = []
                json_d = {}
                line = json.loads(line.strip())
                textlist = list(line['text'])
                for i, word in enumerate(textlist):
                    token = tokenizer.tokenize(word)
                    assert len(token) == 1
                    tokens.extend(token)
                assert len(tokens) < args.max_seq_len
                ntokens = []
                segment_ids = []
                label_ids = []
                ntokens.append("[CLS]")  # mark the start of the sentence with [CLS]
                segment_ids.append(0)
                for i, token in enumerate(tokens):
                    ntokens.append(token)
                    segment_ids.append(0)
                ntokens.append("[SEP]")
                segment_ids.append(0)  # append("O") or append("[SEP]") not sure!
                input_ids = tokenizer.convert_tokens_to_ids(ntokens)
                input_len = len(input_ids)
                input_mask = [1] * len(input_ids)
                while len(input_ids) < args.max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                raw_text = []
                raw_text.append('[CLS]')
                raw_text.extend(textlist)
                raw_text.append('[SEP]')
                assert len(raw_text) == len(ntokens)
                assert len(input_ids) == args.max_seq_len
                assert len(input_mask) == args.max_seq_len
                assert len(segment_ids) == args.max_seq_len
                json_d['id'] = idx
                json_d['input_ids'] = input_ids
                json_d['input_mask'] = input_mask
                json_d['segment_ids'] = segment_ids
                json_d['input_len'] = input_len
                json_d['text'] = raw_text
                idx += 1
                test_data.append(json_d)
        results = []
        train_data = processor.get_train_examples()
        test_train = load_pickle(args.data_dir / 'train_test.bin')
        for step, line in enumerate(test_data):
            a_input_ids = []
            a_input_mask = []
            a_label_ids = []
            a_input_lens = []
            a_segment_ids = []
            aux_sentence = [train_data[i] for i in test_train[step][:args.aug_num]]
            for s in aux_sentence:
                a_input_ids.append(s['input_ids'])
                # a_label_ids.append(s['label_ids'])
                # address-information augmentation: replace every label with the
                # address tag, i.e. all ones (the mask doubles as the label sequence)
                a_label_ids.append(s['input_mask'])
                a_input_mask.append(s['input_mask'])
                a_input_lens.append(s['input_len'])
                a_segment_ids.append(s['segment_ids'])
            input_ids = line['input_ids']
            input_mask = line['input_mask']
            input_lens = line['input_len']
            segment_ids = line['segment_ids']
            batch = {
                'ori': ([input_ids], [input_mask], [[]], [input_lens], [segment_ids]),
                'aug': ([a_input_ids], [a_input_mask], [a_label_ids], [a_input_lens], [a_segment_ids])
            }
            tags = model.evaluate_line(sess, batch)
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []
        test_submit = []
        for x, y in zip(test_data, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            # the text includes the added [CLS]/[SEP] markers
            words = x['text']
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
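# The nested submission dict ({tag: {word: [[start, end], ...]}}) is built
# with the same if/else ladder in every predict() variant above. A hedged
# refactoring sketch that produces the identical structure (the helper name
# is illustrative):
def add_entity(label_dict, tag, word, start, end):
    """Append a [start, end] span under label_dict[tag][word]."""
    label_dict.setdefault(tag, {}).setdefault(word, []).append([start, end])

# Usage inside the loop: add_entity(json_d['label'], tag, word, start, end)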
def predict(args, model, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model(args)
    config['vocab_size'] = len(processor.vocab)
    config['keep_prob'] = 1.0
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                words = list(text)
                labels = ['O'] * len(words)
                json_d['id'] = idx
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                test_data.append(json_d)
        results = []
        for step, line in enumerate(test_data):
            token_a = line['context'].split(" ")
            input_ids = [processor.vocab.to_index(w) for w in token_a]
            input_mask = [1] * len(token_a)
            input_lens = [len(token_a)]
            tags = model.evaluate_line(sess, ([input_ids], [input_mask], [[]], input_lens))
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []
        with open(str(args.data_dir / 'test.json'), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
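# evaluate_line() appears to expect a single-example batch shaped like the
# training feed: (input_ids, input_mask, label_ids, input_lens), each wrapped
# in a batch dimension of 1, with the empty [[]] slot standing in for the
# unused labels at inference time. A worked example (token ids illustrative):
#
#   token_a    = ['今', '天', '好']
#   input_ids  = [processor.vocab.to_index(w) for w in token_a]  # e.g. [12, 34, 56]
#   input_mask = [1, 1, 1]
#   input_lens = [3]
#   batch      = ([input_ids], [input_mask], [[]], input_lens)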
def predict(args, model, processor):
    model_path = args.output_dir / 'best-model.bin'
    model = load_model(model, model_path=str(model_path))
    # metric = SeqEntityScore(args.id2label, markup=args.markup)
    # load the data: test_data = [{'id': , 'context': , 'tag': , 'raw_context': }, {}, ...]
    start_time = time.time()
    test_data = load_and_cache_examples(args, processor, data_type='test')
    origins = []
    founds = []
    rights = []
    results = []
    for step, line in enumerate(test_data):
        token_a = line['context'].split(" ")
        tag_a = line['tag'].split(" ")
        input_ids = [processor.vocab.to_index(w) for w in token_a]
        input_mask = [1] * len(token_a)
        input_lens = [len(token_a)]
        model.eval()
        with torch.no_grad():
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            input_lens = torch.tensor([input_lens], dtype=torch.long)
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            features = model.forward_loss(input_ids, input_mask, input_lens, input_tags=None)
            tags, _ = model.crf._obtain_labels(features, args.id2label, input_lens)
        label_entities = get_entities(tags[0], args.id2label)
        gold_entities = get_entities(tag_a, args.id2label)
        # record gold, predicted, and correctly predicted entities
        origins.extend(gold_entities)
        founds.extend(label_entities)
        rights.extend([pre_entity for pre_entity in label_entities
                       if pre_entity in gold_entities])
        json_d = {}
        # json_d['tag_seq'] = " ".join(tags[0])
        json_d['pre'] = label_entities
        json_d['gold'] = gold_entities
        results.append(json_d)
    # results = [{'pre': , 'gold': }, {}, ...]
    test_submit = []
    for x, y in zip(test_data, results):
        json_d = {}
        context = list(x['context'])
        json_d['context'] = ''.join(context)
        json_d['label'] = y['pre']
        json_d['gold'] = y['gold']
        test_submit.append(json_d)
    output_submit_file = str(args.output_dir / "test_submit.json")
    with open(output_submit_file, 'w') as writer:
        for x in test_submit:
            writer.write(json.dumps(x, ensure_ascii=False) + '\n')
    # guard against empty counts to avoid division by zero
    precision = 0.0 if len(founds) == 0 else len(rights) / len(founds)
    recall = 0.0 if len(origins) == 0 else len(rights) / len(origins)
    test_f1 = 0.0 if precision + recall == 0 else (2 * precision * recall) / (precision + recall)
    logger.info(f'test_time: {time.time() - start_time:.1f} test_f1: {test_f1}')