def evaluate(self, ref_file, hyp_file, to_lower):
    references = []
    for line in read_json_lines(ref_file):
        ref = line.get(self.key, '').strip().split()  # ref is a list of tokens
        if to_lower:
            ref = list(map(str.lower, ref))
        references.append(ref)

    hypotheses = []
    for line in read_json_lines(hyp_file):
        hyp = line.get(self.key, '').strip().split()  # hyp is a list of tokens
        if to_lower:
            hyp = list(map(str.lower, hyp))
        hypotheses.append(hyp)

    assert len(references) == len(hypotheses)

    results = {}
    results.update(calc_bleu(references, hypotheses))
    results.update(calc_f1(references, hypotheses))
    results.update(calc_distinct_ngram(hypotheses, max_ngram=2))
    for key, value in results.items():
        logger.info('{}: {:>.4f}'.format(key, value))
    return results
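# calc_bleu, calc_f1 and calc_distinct_ngram are project helpers defined elsewhere.
# As an illustration only, a minimal sketch of calc_distinct_ngram that is consistent
# with the call above (hypotheses as lists of tokens, max_ngram=2); the result keys and
# exact normalization are assumptions, not the project's actual implementation.
# `collections` is assumed to be imported at module level (build_dict below uses it).
def calc_distinct_ngram(hypotheses, max_ngram=2):
    results = {}
    for n in range(1, max_ngram + 1):
        ngram_counter = collections.Counter()
        for hyp in hypotheses:
            for i in range(len(hyp) - n + 1):
                ngram_counter[tuple(hyp[i:i + n])] += 1
        total = sum(ngram_counter.values())
        # distinct-n: ratio of unique n-grams to all n-grams over the hypotheses
        results['Distinct_{}'.format(n)] = len(ngram_counter) / total if total > 0 else 0.0
    return results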
def build_dict(config):
    counter = collections.Counter()
    for line in read_json_lines(config.train_data):
        src_seq = line.get('src', [])
        if config.to_lower:
            src_seq = list(map(str.lower, src_seq))
        for word in src_seq:
            counter[word] += 1

        tgt_seq = line.get('tgt', [])
        if config.to_lower:
            tgt_seq = list(map(str.lower, tgt_seq))
        for word in tgt_seq:
            counter[word] += 1

    # give the special tokens huge counts (offset by their ids) so that most_common()
    # places them at the top of the vocabulary in the expected id order
    counter[config.pad] = 1e9 - config.pad_id
    counter[config.unk] = 1e9 - config.unk_id
    counter[config.sos] = 1e9 - config.sos_id
    counter[config.eos] = 1e9 - config.eos_id
    counter[config.sep] = 1e9 - config.sep_id
    counter[config.num] = 1e9 - config.num_id
    counter[config.time] = 1e9 - config.time_id
    print('number of words: {}'.format(len(counter)))

    word_dict = {}
    for word, _ in counter.most_common(config.vocab_size + config.oov_vocab_size):
        word_dict[word] = len(word_dict)
    save_json_dict(word_dict, config.vocab_dict)
def build_dict(config):
    src_counter = collections.Counter()
    tgt_counter = collections.Counter()
    for line in tqdm(list(read_json_lines(config.train_data)), desc='Building dict'):
        src = line['src']
        tgt = line['tgt']
        if config.to_lower:
            src = src.lower()
            tgt = tgt.lower()
        for word in src.split():
            src_counter[word] += 1
        for word in tgt.split():
            tgt_counter[word] += 1

    src_counter[config.pad] = tgt_counter[config.pad] = 1e9 - config.pad_id
    src_counter[config.unk] = tgt_counter[config.unk] = 1e9 - config.unk_id
    src_counter[config.sos] = tgt_counter[config.sos] = 1e9 - config.sos_id
    src_counter[config.eos] = tgt_counter[config.eos] = 1e9 - config.eos_id
    src_counter[config.sep] = tgt_counter[config.sep] = 1e9 - config.sep_id
    src_counter[config.num] = tgt_counter[config.num] = 1e9 - config.num_id
    src_counter[config.time] = tgt_counter[config.time] = 1e9 - config.time_id
    logger.info('number of source words: {}'.format(len(src_counter)))
    logger.info('number of target words: {}'.format(len(tgt_counter)))

    word_dict = {}
    for word, _ in src_counter.most_common(config.src_vocab_size):
        word_dict[word] = len(word_dict)
    save_json_dict(word_dict, config.src_vocab_dict)

    word_dict = {}
    for word, _ in tgt_counter.most_common(config.tgt_vocab_size):
        word_dict[word] = len(word_dict)
    save_json_dict(word_dict, config.tgt_vocab_dict)
def generate_data(input_file, output_file, is_test=False):
    data = []
    for line in tqdm(list(read_json_lines(input_file))):
        goal = line['goal']
        knowledge = line['knowledge']
        topic = goal[0][1:]
        triples = knowledge + [v for v in goal[1:] if v not in knowledge]
        if not is_test:
            conversation = line['conversation']
            for i in range(len(conversation)):
                src = conversation[:i]
                tgt = conversation[i]
                data.append({'src': src, 'tgt': tgt, 'topic': topic, 'triples': triples})
        else:
            src = line['history']
            tgt = line['response']
            data.append({'src': src, 'tgt': tgt, 'topic': topic, 'triples': triples})
    save_json_lines(data, output_file)
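# read_json_lines and save_json_lines are assumed io helpers for json-lines files and
# are not shown in this module. Minimal sketches consistent with how they are called
# here (assumptions, not the project's actual code); the module-level `import json`
# used by save_outputs below is assumed:
def read_json_lines(filename):
    # yield one parsed json object per non-empty line
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            if line:
                yield json.loads(line)


def save_json_lines(data, filename):
    # write one json object per line
    with open(filename, 'w', encoding='utf-8') as fout:
        for obj in data:
            print(json.dumps(obj, ensure_ascii=False), file=fout)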
def _read_data(self, data_file):
    src_seq = []
    tgt_seq = []
    counter = 0
    for line in read_json_lines(data_file):
        src = line.get('src', [])
        tgt = line.get('tgt', [])
        if self.config.to_lower:
            src = list(map(str.lower, src))
            tgt = list(map(str.lower, tgt))
        src = src[:self.config.sequence_len]
        tgt = [self.config.sos] + tgt[:self.config.sequence_len - 2] + [self.config.eos]
        src_seq.append(convert_list(src, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
        tgt_seq.append(convert_list(tgt, self.config.word_2_id, self.config.pad_id, self.config.unk_id))
        counter += 1
        if counter % 10000 == 0:
            print('\rprocessing file {}: {:>6d}'.format(data_file, counter), end='')
    print()
    return src_seq, tgt_seq
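# convert_list is an assumed project helper. Judging by its call sites (tokens -> ids
# here, ids -> tokens in save_outputs below) it looks up each item in a mapping and
# falls back to a default for unknown items; the role of the third argument (pad /
# pad_id) is not visible from these call sites and is left unused in this sketch.
def convert_list(items, mapping, pad, unk):
    # map every item through `mapping` (assumed to be a dict), falling back to `unk`
    return [mapping.get(item, unk) for item in items]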
def _read_data(self, data_file):
    topic = []
    triple = []
    src = []
    tgt = []
    data_iter = tqdm(list(read_json_lines(data_file)))
    for index, line in enumerate(data_iter):
        topic_seq = ' {} '.format(self.config.sep).join(line['topic'])
        triple_seq = ' {} '.format(self.config.sep).join([' '.join(v) for v in line['triples']])
        src_seq = ' {} '.format(self.config.sep).join(line['src'])
        tgt_seq = line['tgt']
        if self.config.to_lower:
            topic_seq = topic_seq.lower()
            triple_seq = triple_seq.lower()
            src_seq = src_seq.lower()
            tgt_seq = tgt_seq.lower()

        topic_tokens = [self.config.sos] + topic_seq.split() + [self.config.eos]
        triple_tokens = [self.config.sos] + triple_seq.split()[:self.config.max_triple_length] + [self.config.eos]
        src_tokens = [self.config.sos] + src_seq.split()[-self.config.max_seq_length:] + [self.config.eos]
        tgt_tokens = [self.config.sos] + tgt_seq.split()[:self.config.max_seq_length] + [self.config.eos]

        topic_ids = convert_list(topic_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        triple_ids = convert_list(triple_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        src_ids = convert_list(src_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)
        tgt_ids = convert_list(tgt_tokens, self.config.word_2_id, self.config.pad_id, self.config.unk_id)

        topic.append(topic_ids)
        triple.append(triple_ids)
        src.append(src_ids)
        tgt.append(tgt_ids)

        if index < 5:
            logger.info(log_title('Examples'))
            logger.info('topic tokens: {}'.format(topic_tokens))
            logger.info('topic ids: {}'.format(topic_ids))
            logger.info('triple tokens: {}'.format(triple_tokens))
            logger.info('triple ids: {}'.format(triple_ids))
            logger.info('source tokens: {}'.format(src_tokens))
            logger.info('source ids: {}'.format(src_ids))
            logger.info('target tokens: {}'.format(tgt_tokens))
            logger.info('target ids: {}'.format(tgt_ids))
    return topic, triple, src, tgt
def _load_and_cache_data(self, data_file, cache_file=None):
    examples = []
    for index, line in enumerate(tqdm(list(read_json_lines(data_file)),
                                      desc='Loading file: {}'.format(data_file))):
        src = line['src']
        tgt = line.get('tgt')  # tgt may be missing at inference time
        examples.append(InputExample(index, src, tgt))
    features = convert_examples_to_features(examples, self.config)
    if cache_file:
        # use a context manager so the cache file handle is properly closed
        with open(cache_file, 'wb') as fout:
            pickle.dump({'examples': examples, 'features': features}, fout)
    return examples, features
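# InputExample and convert_examples_to_features are assumed project classes/helpers that
# are not shown here. From the constructor call above, InputExample is presumably just a
# small container; the attribute names below are assumptions, not the actual definition:
class InputExample:
    def __init__(self, guid, src, tgt=None):
        self.guid = guid  # running index of the example
        self.src = src    # source side (e.g. dialogue history)
        self.tgt = tgt    # target side, possibly None at inference time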
def evaluate(self, ref_file, hyp_file, to_lower):
    list_of_references = []
    for line in read_json_lines(ref_file):
        ref = line[self.key]  # ref is a list of words
        if to_lower:
            ref = list(map(str.lower, ref))
        list_of_references.append([ref])

    hypotheses = []
    for line in read_json_lines(hyp_file):
        hyp = line[self.key]  # hyp is a list of words
        if to_lower:
            hyp = list(map(str.lower, hyp))
        hypotheses.append(hyp)

    assert len(list_of_references) == len(hypotheses)

    # corpus_bleu and SmoothingFunction are expected to come from nltk.translate.bleu_score
    bleu1 = 100 * corpus_bleu(list_of_references, hypotheses, (1., 0., 0., 0.), SmoothingFunction().method4)
    bleu2 = 100 * corpus_bleu(list_of_references, hypotheses, (0.5, 0.5, 0., 0.), SmoothingFunction().method4)
    bleu3 = 100 * corpus_bleu(list_of_references, hypotheses, (0.33, 0.33, 0.33, 0.), SmoothingFunction().method4)
    bleu4 = 100 * corpus_bleu(list_of_references, hypotheses, (0.25, 0.25, 0.25, 0.25), SmoothingFunction().method4)
    print('{:>.4f}, {:>.4f}, {:>.4f}, {:>.4f}'.format(bleu1, bleu2, bleu3, bleu4))

    res = {
        'Bleu_1': bleu1,
        'Bleu_2': bleu2,
        'Bleu_3': bleu3,
        'Bleu_4': bleu4,
    }
    return res
def build_word_dict(config, min_freq=5):
    cnt = 0
    word_cnt = collections.Counter()
    attr_cnt = collections.Counter()
    for line in read_json_lines(config.train_data):
        we = WikiEntity(line)

        box = we.get_box()
        for a in box.keys():
            for w in box[a].split():
                if config.to_lower:
                    w = w.lower()
                word_cnt[w] += 1
            if config.to_lower:
                a = a.lower()
            attr_cnt[a] += 1

        desc = we.get_desc()
        for w in desc.split():
            if config.to_lower:
                w = w.lower()
            word_cnt[w] += 1

        cnt += 1
        if cnt % 10000 == 0:
            print('\rprocessing: {}'.format(cnt), end='')
    print()

    word_cnt[config.pad] = attr_cnt[config.pad] = 1e9 - config.pad_id
    word_cnt[config.unk] = attr_cnt[config.unk] = 1e9 - config.unk_id
    word_cnt[config.sos] = attr_cnt[config.sos] = 1e9 - config.sos_id
    word_cnt[config.eos] = attr_cnt[config.eos] = 1e9 - config.eos_id
    word_cnt[config.num] = attr_cnt[config.num] = 1e9 - config.num_id
    word_cnt[config.time] = attr_cnt[config.time] = 1e9 - config.time_id
    print('number of words in word counter: {}'.format(len(word_cnt)))
    print('number of words in attribute counter: {}'.format(len(attr_cnt)))

    word_dict = {}
    for word, freq in word_cnt.most_common():
        if freq < min_freq:
            break
        word_dict[word] = len(word_dict)
    save_json(word_dict, config.word_dict)

    attr_dict = {}
    for attr, _ in attr_cnt.most_common():
        attr_dict[attr] = len(attr_dict)
    save_json(attr_dict, config.attr_dict)
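# WikiEntity is an assumed wrapper around one json line of the table-to-text data; from
# its use above it only needs to expose the infobox (attribute -> value string) and the
# description text. A minimal sketch under that assumption; the 'box' and 'desc' keys
# are hypothetical and not taken from the actual data format:
class WikiEntity:
    def __init__(self, data):
        self._data = data

    def get_box(self):
        # infobox as a dict mapping attribute name to its value string
        return self._data.get('box', {})

    def get_desc(self):
        # plain-text description of the entity
        return self._data.get('desc', '')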
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(' {} '.format(config.sep).join(line['src']))

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt in zip(src_inputs, predicted_ids):
            tgt[-1] = config.eos_id  # ensure an eos is present so the truncation below always succeeds
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'src': src}, ensure_ascii=False), file=fout)
def save_outputs(predicted_ids, id_2_label, input_file, output_file):
    golden_outputs = []
    for line in read_json_lines(input_file):
        golden_outputs.append(line['tgt'])

    with open(output_file, 'w', encoding='utf-8') as fout:
        for tgt, golden in zip(predicted_ids, golden_outputs):
            tgt[-1] = config.eos_id
            tgt = convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk)
            print(json.dumps({'tgt': ' '.join(tgt), 'golden': golden}, ensure_ascii=False), file=fout)
def build_dict(filename, config):
    counter = collections.Counter()
    for line in tqdm(list(read_json_lines(filename))):
        goal = line['goal']
        knowledge = line['knowledge']
        conversation = line['conversation']

        topic = goal[0][1:]
        for entity in topic:
            if config.to_lower:
                entity = entity.lower()
            for token in entity.strip().split():
                counter[token] += 1

        triples = knowledge + [v for v in goal[1:] if v not in knowledge]
        for triple in triples:
            for node in triple:
                if config.to_lower:
                    node = node.lower()
                for token in node.strip().split():
                    counter[token] += 1

        for sequence in conversation:
            if config.to_lower:
                sequence = sequence.lower()
            for token in sequence.strip().split():
                counter[token] += 1

    counter[config.pad] = 1e9 - config.pad_id
    counter[config.unk] = 1e9 - config.unk_id
    counter[config.sos] = 1e9 - config.sos_id
    counter[config.eos] = 1e9 - config.eos_id
    counter[config.sep] = 1e9 - config.sep_id
    counter[config.num] = 1e9 - config.num_id
    counter[config.time] = 1e9 - config.time_id
    logger.info('number of words: {}'.format(len(counter)))

    word_dict = {}
    for word, _ in counter.most_common(config.vocab_size + config.oov_vocab_size):
        word_dict[word] = len(word_dict)
    save_json_dict(word_dict, config.vocab_dict)
def save_result_v1(predicted_ids, alignment_history, id_2_label, input_file, output_file):
    src_inputs = []
    for line in read_json_lines(input_file):
        src_inputs.append(line['src'])

    tgt_outputs = []
    for tgt in predicted_ids:
        tgt[-1] = config.eos_id
        tgt_outputs.append(convert_list(tgt[:tgt.index(config.eos_id)], id_2_label, config.pad, config.unk))

    assert len(src_inputs) == len(tgt_outputs)

    with open(output_file, 'w', encoding='utf-8') as fout:
        for src, tgt, alignment in zip(src_inputs, tgt_outputs, alignment_history):
            # replace each generated unk with the aligned source token
            for i, (word, index) in enumerate(zip(tgt, alignment)):
                if word == config.unk:
                    tgt[i] = src[index]
            print(json.dumps({'tgt': tgt}, ensure_ascii=False), file=fout)