def convert_examples_to_features(examples, config):
    features = []
    for index, example in enumerate(tqdm(examples, desc='Converting Examples')):
        src_seq, tgt_seq = [], []
        for word in example.src.split():
            src_seq.append(word)
        if example.tgt:
            for word in example.tgt.split():
                tgt_seq.append(word)
        # Truncate both sequences and wrap them with start/end markers.
        src_seq = [config.sos] + src_seq[:config.max_seq_length] + [config.eos]
        tgt_seq = [config.sos] + tgt_seq[:config.max_seq_length] + [config.eos]
        if config.to_lower:
            src_seq = list(map(str.lower, src_seq))
            tgt_seq = list(map(str.lower, tgt_seq))
        src_ids = convert_list(src_seq, config.src_2_id, config.pad_id, config.unk_id)
        tgt_ids = convert_list(tgt_seq, config.tgt_2_id, config.pad_id, config.unk_id)
        features.append(InputFeatures(example.guid, src_ids, tgt_ids))
        # Log the first few converted examples for manual inspection.
        if index < 5:
            logger.info(log_title('Examples'))
            logger.info('guid: {}'.format(example.guid))
            logger.info('source input: {}'.format(src_seq))
            logger.info('source ids: {}'.format(src_ids))
            logger.info('target input: {}'.format(tgt_seq))
            logger.info('target ids: {}'.format(tgt_ids))
    return features
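# NOTE: `convert_list` is called throughout these snippets but never defined here.
# The sketch below is an assumption based on the call sites (a sequence, a lookup
# dict, a pad value, and an unk fallback); the real helper may treat `pad` differently.
def convert_list(items, mapping, pad, unk):
    """Look up each item in `mapping` (assumed to be a dict), falling back to `unk` when missing."""
    # Assumption: missing (None) entries become padding; unknown entries become `unk`.
    return [pad if item is None else mapping.get(item, unk) for item in items]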
def _read_data(self, data_file):
    src_seq = []
    tgt_seq = []
    counter = 0
    for line in read_json_lines(data_file):
        src = line.get('src', [])
        tgt = line.get('tgt', [])
        if self.config.to_lower:
            src = list(map(str.lower, src))
            tgt = list(map(str.lower, tgt))
        src = src[:self.config.sequence_len]
        tgt = [self.config.sos] + tgt[:self.config.sequence_len - 2] + [self.config.eos]
        src_seq.append(convert_list(src, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
        tgt_seq.append(convert_list(tgt, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
        counter += 1
        if counter % 10000 == 0:
            print('\rprocessing file {}: {:>6d}'.format(data_file, counter), end='')
    print()
    return src_seq, tgt_seq
def _read_data(self, data_file):
    input_ids = []
    input_mask = []
    segment_ids = []
    input_length = []
    pos_ids = []
    tag_ids = []
    counter = 0
    with open(data_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = json.loads(line)
            context = line['context']
            pos_seq = line['pos_seq']
            tag_seq = line['tag_seq']
            ids, mask, segments, length, pos, tags = _convert_single_example(
                context, pos_seq, tag_seq, self.config.sequence_len, self.tokenizer)
            pos = convert_list(pos, self.pos_2_id, self.config.pad_id, self.config.unk_id)
            tags = convert_list(tags, self.tag_2_id, 0, 0)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(segments)
            input_length.append(length)
            pos_ids.append(pos)
            tag_ids.append(tags)
            counter += 1
            print('\rprocessing: {}'.format(counter), end='')
    print()
    return input_ids, input_mask, segment_ids, input_length, pos_ids, tag_ids
def _read_data(self, data_file):
    topic = []
    triple = []
    src = []
    tgt = []
    data_iter = tqdm(list(read_json_lines(data_file)))
    for index, line in enumerate(data_iter):
        # Join multi-part fields with the separator token.
        topic_seq = ' {} '.format(self.config.sep).join(line['topic'])
        triple_seq = ' {} '.format(self.config.sep).join([' '.join(v) for v in line['triples']])
        src_seq = ' {} '.format(self.config.sep).join(line['src'])
        tgt_seq = line['tgt']
        if self.config.to_lower:
            topic_seq = topic_seq.lower()
            triple_seq = triple_seq.lower()
            src_seq = src_seq.lower()
            tgt_seq = tgt_seq.lower()
        topic_tokens = [self.config.sos] + topic_seq.split() + [self.config.eos]
        triple_tokens = [self.config.sos] + triple_seq.split()[:self.config.max_triple_length] + [self.config.eos]
        # Keep the last max_seq_length source tokens, i.e. the most recent context.
        src_tokens = [self.config.sos] + src_seq.split()[-self.config.max_seq_length:] + [self.config.eos]
        tgt_tokens = [self.config.sos] + tgt_seq.split()[:self.config.max_seq_length] + [self.config.eos]
        topic_ids = convert_list(topic_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        triple_ids = convert_list(triple_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        src_ids = convert_list(src_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        tgt_ids = convert_list(tgt_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        topic.append(topic_ids)
        triple.append(triple_ids)
        src.append(src_ids)
        tgt.append(tgt_ids)
        # Log the first few examples for manual inspection.
        if index < 5:
            logger.info(log_title('Examples'))
            logger.info('topic tokens: {}'.format(topic_tokens))
            logger.info('topic ids: {}'.format(topic_ids))
            logger.info('triple tokens: {}'.format(triple_tokens))
            logger.info('triple ids: {}'.format(triple_ids))
            logger.info('source tokens: {}'.format(src_tokens))
            logger.info('source ids: {}'.format(src_ids))
            logger.info('target tokens: {}'.format(tgt_tokens))
            logger.info('target ids: {}'.format(tgt_ids))
    return topic, triple, src, tgt
def save_result(predicted_ids, alignment_history, id_2_label, input_file, output_file):
    src_inputs = []
    with open(input_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = WikiEntity(line)
            box = line.get_box()
            if len(box) == 0:
                continue
            src = []
            for a in box.keys():
                src += box[a].split()
            src_inputs.append(src)
    tgt_outputs = []
    for tgt in predicted_ids:
        # Force an eos at the end so tgt.index(config.eos_id) always succeeds.
        tgt[-1] = config.eos_id
        tgt_outputs.append(convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk))
    assert len(src_inputs) == len(tgt_outputs)
    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt, alignment in zip(src_inputs, tgt_outputs, alignment_history):
            # Replace unk tokens with the aligned source word (copy via attention alignment).
            for i, (word, index) in enumerate(zip(tgt, alignment)):
                if word == config.unk:
                    tgt[i] = src[index]
            print(json.dumps({'description': ' '.join(tgt)}, ensure_ascii=False), file=fout)
def save_result_v2(predicted_ids, id_2_label, output_file):
    with open(output_file, 'w', encoding='utf-8') as fout:
        for tgt in predicted_ids:
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk)
            print(json.dumps({'tgt': tgt}, ensure_ascii=False), file=fout)
def save_result(outputs, result_file, tokenizer, id_2_tag):
    print('write file: {}'.format(result_file))
    with open(result_file, 'w', encoding='utf-8') as fout:
        for context, tags in outputs:
            context = tokenizer.convert_ids_to_tokens(context)
            tags = convert_list(tags, id_2_tag, 'O', 'O')
            result = parse_output(tags, context)
            print(json.dumps(result, ensure_ascii=False), file=fout)
def check_data(data, tokenizer, id_2_pos, id_2_tag):
    input_ids, input_mask, segment_ids, input_length, pos_ids, tag_ids = data
    # Print five random examples to sanity-check the converted data.
    for _ in range(5):
        print('=' * 20)
        index = np.random.randint(0, len(input_ids))
        print('id: {}'.format(index))
        length = input_length[index]
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids[index])
        print('input tokens: {}'.format(input_tokens[:length]))
        pos_tokens = convert_list(pos_ids[index], id_2_pos, '<pad>', '<unk>')
        print('pos tokens: {}'.format(pos_tokens[:length]))
        tag_tokens = convert_list(tag_ids[index], id_2_tag, 'O', 'O')
        print('tag tokens: {}'.format(tag_tokens[:length]))
        result = refine_output(input_ids[index], tag_ids[index], length, tokenizer, id_2_tag)
        print(result)
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(' {} '.format(config.sep).join(line['src']))
    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt in zip(src_inputs, predicted_ids):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'src': src}, ensure_ascii=False), file=fout)
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    golden_outputs = []
    for line in read_json_lines(input_file):
        golden_outputs.append(line['tgt'])
    with open(output_file, 'w', encoding='utf-8') as fout:
        for tgt, golden in zip(predicted_ids, golden_outputs):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'golden': golden}, ensure_ascii=False), file=fout)
def convert_data(self, context):
    context_seq = []
    pos_seq = []
    for word, pos in pos_text(cut_text(context)):
        context_seq.append(word)
        pos_seq.append(pos)
    input_tokens = []
    pos_tokens = []
    temp = [self.tokenizer.tokenize(word) for word in context_seq]
    for i, pos in enumerate(pos_seq):
        input_tokens += temp[i]
        pos_tokens += [pos] * len(temp[i])
    # Account for [CLS] and [SEP] with "- 2"
    input_tokens = ['[CLS]'] + input_tokens[:self.config.sequence_len - 2] + ['[SEP]']
    pos_tokens = ['<pad>'] + pos_tokens[:self.config.sequence_len - 2] + ['<pad>']
    input_length = len(input_tokens)
    assert len(pos_tokens) == input_length
    input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)
    segment_ids = [0] * input_length
    input_mask = [1] * input_length
    pos_ids = convert_list(pos_tokens, self.pos_2_id, self.config.pad_id, self.config.unk_id)
    # Zero-pad up to the sequence length.
    while len(input_ids) < self.config.sequence_len:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        pos_ids.append(self.config.pad_id)
    assert len(input_ids) == self.config.sequence_len
    assert len(input_mask) == self.config.sequence_len
    assert len(segment_ids) == self.config.sequence_len
    assert len(pos_ids) == self.config.sequence_len
    return input_ids, input_mask, segment_ids, input_length, pos_ids
def save_result_v1(predicted_ids, alignment_history, id_2_label, input_file, output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(line['src'])
    tgt_outputs = []
    for tgt in predicted_ids:
        # Force an eos at the end so tgt.index(config.eos_id) always succeeds.
        tgt[-1] = config.eos_id
        tgt_outputs.append(convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk))
    assert len(src_inputs) == len(tgt_outputs)
    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt, alignment in zip(src_inputs, tgt_outputs, alignment_history):
            # Replace unk tokens with the aligned source word (copy via attention alignment).
            for i, (word, index) in enumerate(zip(tgt, alignment)):
                if word == config.unk:
                    tgt[i] = src[index]
            print(json.dumps({'tgt': tgt}, ensure_ascii=False), file=fout)
def _read_data(self, data_file, max_data_size=None):
    value_seq = []
    attr_seq = []
    pos_fw_seq = []
    pos_bw_seq = []
    desc_seq = []
    counter = 0
    with open(data_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            we = WikiEntity(line)
            value = []
            attr = []
            pos_fw = []
            pos_bw = []
            box = we.get_box()
            # Flatten the infobox: each field-value token gets its attribute name,
            # its position within the field, and its position counted from the field's end.
            for a in box.keys():
                v = box[a].split()
                a = [a] * len(v)
                p = list(range(len(v)))
                value += v
                attr += a
                pos_fw += p
                pos_bw += reversed(p)
            desc = we.get_desc().split()
            # check length and limit the maximum length of input
            assert len(value) == len(attr)
            assert len(value) == len(pos_fw)
            assert len(value) == len(pos_bw)
            if len(value) == 0:
                continue
            value = value[:self.config.sequence_len]
            attr = attr[:self.config.sequence_len]
            pos_fw = pos_fw[:self.config.sequence_len]
            pos_fw = np.minimum(pos_fw, self.config.pos_size - 1).tolist()  # clip to pos_size - 1 (0-based)
            pos_bw = pos_bw[:self.config.sequence_len]
            pos_bw = np.minimum(pos_bw, self.config.pos_size - 1).tolist()  # clip to pos_size - 1 (0-based)
            desc = desc[:self.config.sequence_len - 2]  # 2 for sos and eos
            if self.config.to_lower:
                value = list(map(str.lower, value))
                attr = list(map(str.lower, attr))
                desc = list(map(str.lower, desc))
            desc = [self.config.sos] + desc + [self.config.eos]
            value_seq.append(convert_list(value, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
            attr_seq.append(convert_list(attr, self.config.attr_2_id, self.config.pad_id, self.config.unk_id))
            pos_fw_seq.append(pos_fw)
            pos_bw_seq.append(pos_bw)
            desc_seq.append(convert_list(desc, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
            counter += 1
            if counter % 10000 == 0:
                print('\rprocessing file {}: {:>6d}'.format(data_file, counter), end='')
            if max_data_size and counter >= max_data_size:
                break
    print()
    return value_seq, attr_seq, pos_fw_seq, pos_bw_seq, desc_seq
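# Illustration (hypothetical data, not from the source) of the field-position
# features built above, for a box {'name': 'john smith', 'job': 'engineer'}:
#   value  = ['john', 'smith', 'engineer']
#   attr   = ['name', 'name', 'job']
#   pos_fw = [0, 1, 0]   # token position within its field
#   pos_bw = [1, 0, 0]   # the same positions counted from the field's end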
def refine_output(input_ids, pred_ids, input_length, tokenizer, id_2_tag):
    context = tokenizer.convert_ids_to_tokens(input_ids)
    pred_tags = convert_list(pred_ids, id_2_tag, 'O', 'O')
    return parse_output(pred_tags[:input_length], context[:input_length])
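# NOTE: `parse_output` is only called, never defined, in these snippets. Below is a
# minimal sketch of one plausible implementation, assuming BIO-style tags ('B-X',
# 'I-X', 'O') aligned one-to-one with the token list; the real parser may group or
# format spans differently.
def parse_output(tags, tokens):
    """Collect (label, text) spans from BIO tags over aligned tokens."""
    spans = []
    current_label, current_tokens = None, []
    for token, tag in zip(tokens, tags):
        if tag.startswith('B-'):
            if current_tokens:
                spans.append({'label': current_label, 'text': ' '.join(current_tokens)})
            current_label, current_tokens = tag[2:], [token]
        elif tag.startswith('I-') and current_label == tag[2:]:
            current_tokens.append(token)
        else:
            if current_tokens:
                spans.append({'label': current_label, 'text': ' '.join(current_tokens)})
            current_label, current_tokens = None, []
    if current_tokens:
        spans.append({'label': current_label, 'text': ' '.join(current_tokens)})
    return spans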