def test_single_metric(self):
    """A scorer obtained from ScorerProvider must score identically to a
    directly constructed SentenceBleuScorer with the same parameters."""
    config_string = "SENTENCEBLEU n=4"
    segment = self.tokenize(
        "Consistency is the last refuge of the unimaginative")
    direct_scorer = SentenceBleuScorer('n=4')
    provider_scorer = ScorerProvider().get(config_string)
    # Both scorers evaluate the segment against itself as reference.
    for scorer in (direct_scorer, provider_scorer):
        scorer.set_reference(segment)
    self.assertEqual(direct_scorer.score(segment),
                     provider_scorer.score(segment))
def cal_metrics_score(samples, config, num_to_target, refs, index):
    """Evaluate candidate sentences against references with an evaluation metric.

    Args:
        samples: candidate sentences in list (with padding)
                 (maxlen, batch_size*sampleN)
        config: configuration object; reads samplesN, sample_way and mrt_loss
        num_to_target: dictionary to map number to word
        refs: ground truth translations in list (batch_size, len), uneven
        index: starting point of each source sentence
               (only used for the random-sampling strategy)

    Return:
        numpy float32 array containing scores (1 - metric) of candidates
    """
    samplesN = config.samplesN
    batch_size = len(refs)
    # convert from time domain to batch domain
    samples = list(map(list, zip(*samples)))

    def _negative_metric_scores(ref_seq, candidate_seqs):
        # Convert one reference and its candidate group to token lists
        # (seq2words strips padding), then score with the configured metric
        # and return 1 - metric (metric assumed in [0, 1]) as the MRT loss.
        ref = util.seq2words(ref_seq, num_to_target).split(" ")
        ss = [util.seq2words(c, num_to_target).split(" ")
              for c in candidate_seqs]
        scorer = ScorerProvider().get(config.mrt_loss)
        scorer.set_reference(ref)
        return 1.0 - np.array(scorer.score_matrix(ss))

    if config.sample_way == 'beam_search':
        # Beam search yields exactly samplesN candidates per source sentence,
        # so candidate groups are fixed-size contiguous slices.
        scores = np.zeros(batch_size * samplesN, dtype='float32')
        for i in range(batch_size):
            lo, hi = i * samplesN, (i + 1) * samplesN
            scores[lo:hi] = _negative_metric_scores(refs[i], samples[lo:hi])
    else:
        # Random sampling produces a variable number of candidates per
        # sentence; index[0] holds the starting offset of each group.
        scores = np.zeros(len(samples), dtype='float32')
        for i in range(batch_size):
            lo, hi = index[0][i], index[0][i + 1]
            scores[lo:hi] = _negative_metric_scores(refs[i], samples[lo:hi])
    return scores
def test_interpolated_metrics(self):
    """Interpolating BLEU with BLEU should obviously result in the same
    score as just using a single BLEU scorer."""
    config_string = "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4"
    segment = self.tokenize(
        "Consistency is the last refuge of the unimaginative")
    direct_scorer = SentenceBleuScorer('n=4')
    interpolated_scorer = ScorerProvider().get(config_string)
    # Both scorers evaluate the segment against itself as reference.
    for scorer in (direct_scorer, interpolated_scorer):
        scorer.set_reference(segment)
    self.assertEqual(direct_scorer.score(segment),
                     interpolated_scorer.score(segment))
def main(input_file, output_file, translation_settings, references=None):
    """
    Translates a source language file (or STDIN) into a target language
    file (or STDOUT).
    """
    # A sentence-level BLEU scorer is only needed when per-sentence
    # scores are written to the JSON log.
    if translation_settings.json_log:
        scorer = ScorerProvider().get('SENTENCEBLEU n=4')
    else:
        scorer = None
    translator = Translator(translation_settings)
    translations = translator.translate_file(input_file,
                                             translation_settings,
                                             reference_object=references,
                                             scorer=scorer)
    translator.write_translations(output_file,
                                  translations,
                                  translation_settings)
    logging.info('Done')
    translator.shutdown()