def eval(self, train_step):
    """Run one full pass over the eval dataset and return corpus BLEU (0-100).

    Restores the current checkpoint into the dedicated eval session, decodes
    every batch, collects (references, hypotheses) pairs, and occasionally
    prints a sample translation for inspection.

    Args:
        train_step: current training step; unused here but kept so the
            caller's interface is unchanged.

    Returns:
        float: corpus-level BLEU * 100 from ``bleu.compute_bleu``.
    """
    with self.eval_graph.as_default():
        self.eval_saver.restore(self.eval_session, self.model_file)
        target_results = []
        output_results = []
        # Print ~10 random sample translations per evaluation pass.
        # Hoisted out of the loop (it is loop-invariant) and clamped to >= 1
        # so random.randint never gets an empty range on tiny eval sets.
        sample_prob = max(1, int(self.eval_reader.data_size * self.batch_size / 10))
        for _ in range(self.eval_reader.data_size):
            data = next(self.eval_data)
            in_seq = data['in_seq']
            in_seq_len = data['in_seq_len']
            target_seq = data['target_seq']
            outputs = self.eval_session.run(
                self.eval_output,
                feed_dict={
                    self.eval_in_seq: in_seq,
                    self.eval_in_seq_len: in_seq_len})
            for i, output in enumerate(outputs):
                target = target_seq[i]
                output_text = reader.decode_text(
                    output, self.eval_reader.vocabs).split(' ')
                # target[0] is presumably a start-of-sequence token — it is
                # skipped before decoding; confirm against the data reader.
                target_text = reader.decode_text(
                    target[1:], self.eval_reader.vocabs).split(' ')
                # compute_bleu expects a list of reference sentences per
                # hypothesis, hence the extra list around target_text.
                target_results.append([target_text])
                output_results.append(output_text)
                if random.randint(1, sample_prob) == 1:
                    print('====================')
                    input_text = reader.decode_text(
                        in_seq[i], self.eval_reader.vocabs)
                    print('src:' + input_text)
                    print('output: ' + ' '.join(output_text))
                    print('target: ' + ' '.join(target_text))
        return bleu.compute_bleu(target_results, output_results)[0] * 100
def _bleu(ref_file, trans_file, subword_option=None):
    """Compute corpus BLEU (scaled to 0-100) between two text files.

    References are cleaned with ``subword_option``; translations are always
    cleaned with ``subword_option=None``.
    NOTE(review): that asymmetry matches the upstream NMT evaluation script,
    which assumes the decoder already wrote de-subworded hypotheses — confirm
    this holds for the pipeline that produces ``trans_file``.

    Args:
        ref_file: path to the reference file, one sentence per line.
        trans_file: path to the translation file, one sentence per line.
        subword_option: subword scheme passed to ``_clean`` for references
            (e.g. BPE/SPM); None disables subword handling.

    Returns:
        float: corpus BLEU score in the 0-100 range.
    """
    max_order = 4
    smooth = False

    def _read_lines(filename):
        # UTF-8 text reader over a (possibly non-local) GFile handle.
        with codecs.getreader("utf-8")(tf.gfile.GFile(filename, "rb")) as fh:
            return fh.readlines()

    # Structured as a list of reference files to stay multi-reference ready,
    # even though only a single reference file is used today.
    reference_text = [_read_lines(f) for f in [ref_file]]

    per_segment_references = []
    for references in zip(*reference_text):
        cleaned = [_clean(r, subword_option).split(" ") for r in references]
        per_segment_references.append(cleaned)

    translations = [
        _clean(line, subword_option=None).split(" ")
        for line in _read_lines(trans_file)
    ]

    # compute_bleu -> (bleu, precisions, bp, ratio, trans_len, ref_len);
    # only the headline score is needed.
    bleu_score = bleu.compute_bleu(per_segment_references, translations,
                                   max_order, smooth)[0]
    return 100 * bleu_score
def _evaluate(eval_fn, input_fn, decode_fn, path, config):
    """Decode the eval dataset with a restored checkpoint and score it with BLEU.

    Args:
        eval_fn: builds the prediction op from the feed placeholders.
        input_fn: builds the eval input pipeline; must produce "source",
            "source_length" and "references" features.
        decode_fn: maps batches of token-id sequences back to token/text form.
        path: checkpoint directory restored via ChiefSessionCreator.
        config: session configuration (tf.ConfigProto).

    Returns:
        The result tuple of ``bleu.compute_bleu`` over all decoded outputs.
    """
    graph = tf.Graph()
    with graph.as_default():
        features = input_fn()
        refs = features["references"]
        placeholders = {
            "source": tf.placeholder(tf.int32, [None, None], "source"),
            "source_length": tf.placeholder(tf.int32, [None], "source_length")
        }
        predictions = eval_fn(placeholders)
        # Keep only the top-ranked hypothesis for each batch element.
        predictions = predictions[0][:, 0, :]
        # One accumulator list per reference set, plus one for hypotheses.
        all_refs = [[] for _ in range(len(refs))]
        all_outputs = []
        sess_creator = tf.train.ChiefSessionCreator(checkpoint_dir=path,
                                                    config=config)
        with tf.train.MonitoredSession(session_creator=sess_creator) as sess:
            while not sess.should_stop():
                # NOTE(review): features and predictions are fetched in two
                # separate run() calls; this assumes feeding the fetched batch
                # back through the placeholders keeps inputs and references in
                # lockstep — confirm the input pipeline tolerates the split.
                feats = sess.run(features)
                outputs = sess.run(predictions, feed_dict={
                    placeholders["source"]: feats["source"],
                    placeholders["source_length"]: feats["source_length"]
                })
                # shape: [batch, len]
                outputs = outputs.tolist()
                # shape: ([batch, len], ..., [batch, len])
                references = [item.tolist() for item in feats["references"]]
                all_outputs.extend(outputs)
                for i in range(len(refs)):
                    all_refs[i].extend(references[i])
        decoded_symbols = decode_fn(all_outputs)
        decoded_refs = [decode_fn(refs) for refs in all_refs]
        # Transpose: per-reference-set lists -> per-segment reference lists.
        decoded_refs = [list(x) for x in zip(*decoded_refs)]
        # NOTE(review): arguments here are (hypotheses, references); the
        # tensor2tensor-style compute_bleu takes references first — verify
        # this project's bleu.compute_bleu signature before trusting scores.
        return bleu.compute_bleu(decoded_symbols, decoded_refs)
def main(args):
    """Evaluate generated text: classifier accuracy plus corpus BLEU vs. source."""
    data_pth = "data/%s" % args.data_name
    train_pth = os.path.join(data_pth, "train_data.txt")
    # Build the vocabulary from the training split so eval shares token ids.
    train_data = MonoTextData(train_pth, True, vocab=100000)
    vocab = train_data.vocab
    source_pth = os.path.join(data_pth, "test_data.txt")
    target_pth = args.target_path
    eval_data = MonoTextData(target_pth, True, vocab=vocab)
    # Tab-separated, headerless files: column 0 = label, column 1 = text.
    source = pd.read_csv(source_pth, names=['label', 'content'], sep='\t')
    target = pd.read_csv(target_pth, names=['label', 'content'], sep='\t')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Classification Accuracy
    model = CNNClassifier(len(vocab), 300, [1, 2, 3, 4, 5], 500, 0.5).to(device)
    model.load_state_dict(
        torch.load("checkpoint/%s-classifier.pt" % args.data_name))
    model.eval()
    eval_data, eval_label = eval_data.create_data_batch_labels(
        64, device, batch_first=True)
    acc = 100 * evaluate(model, eval_data, eval_label)
    print("Acc: %.2f" % acc)
    # BLEU Score
    total_bleu = 0.0
    sources = []
    targets = []
    # compute_bleu expects a list of references per segment, hence [s].
    for i in range(source.shape[0]):
        s = source.content[i].split()
        t = target.content[i].split()
        sources.append([s])
        targets.append(t)
    # Corpus-level BLEU accumulated once over all segments (a single +=,
    # not per-sentence). NOTE(review): placement reconstructed from
    # collapsed source — confirm this line sits outside the loop upstream.
    total_bleu += compute_bleu(sources, targets)[0]
    total_bleu *= 100
    print("Bleu: %.2f" % total_bleu)
def compute_bleu_score(references, translations, max_order=4, smooth=False):
    """Return corpus BLEU scaled to the 0-100 range.

    Args:
        references: per-segment lists of reference token sequences
            (one list of references per translation).
        translations: list of translated token sequences.
        max_order: maximum n-gram order used by BLEU.
        smooth: whether to smooth the n-gram precisions.

    Returns:
        float: BLEU * 100.
    """
    # compute_bleu -> (bleu, precisions, bp, ratio, trans_len, ref_len);
    # only the headline score matters here. The stray debug print of the
    # raw score has been removed.
    bleu_score, _, _, _, _, _ = bleu.compute_bleu(references, translations,
                                                  max_order, smooth)
    return bleu_score * 100
def evaluate_bleu(refs, preds, bleu_n=4):
    """Evaluate BLEU between reference and predicted token sequences.

    Thin wrapper over ``compute_bleu`` that returns only the headline score
    (the first element of the result tuple).
    """
    score, *_ = compute_bleu(refs, preds, max_order=bleu_n)
    return score