Example #1
 def test_case_where_n_is_bigger_than_hypothesis_length(self):
     # Test BLEU to nth order of n-grams, where n > len(hypothesis).
     references = ['John loves Mary ?'.split()]
     hypothesis = 'John loves Mary'.split()
     n = len(hypothesis) + 1 # 
     weights = [1.0/n] * n # Uniform weights.
     self.assertAlmostEqual(sentence_bleu(references, hypothesis, weights), 0.7165, places=4)
     
     # Test case where n > len(hypothesis) but so is n > len(reference), and
     # it's a special case where reference == hypothesis.
     references = ['John loves Mary'.split()]
     hypothesis = 'John loves Mary'.split()
     assert(sentence_bleu(references, hypothesis, weights) == 1.0)
Example #2
File: bleu.py Project: RileyShe/DeepPavlov
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple=(1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate BLEU score

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: whether to apply the brevity penalty

    Return:
        BLEU score
    """

    bleu_measure = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)

    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)

    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)

    if penalty is True or bpenalty == 0:
        return bleu_measure

    return bleu_measure/bpenalty
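A minimal usage sketch for bleu_advanced with hypothetical inputs; it assumes the module-level imports of DeepPavlov's bleu.py (sentence_bleu, closest_ref_length, brevity_penalty and a SmoothingFunction instance named SMOOTH from nltk.translate.bleu_score):

y_true = 'the cat sat on the mat'.split()
y_predicted = 'the cat sat on mat'.split()
# Bigram BLEU with and without the brevity penalty applied
print(bleu_advanced(y_true, y_predicted, weights=(0.5, 0.5), penalty=True))
print(bleu_advanced(y_true, y_predicted, weights=(0.5, 0.5), penalty=False))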
Example #3
def calc_test_bleu_and_loss(sess, epoch):
    test_feed_generator = get_batch(test_vect_eng_sentences, test_decoder_input_data, test_decoder_target_data,
                                    batch_size)

    number_of_batches_in_test = int(len(test_vect_eng_sentences) / batch_size)

    # Calculate the BLEU score of the translations of the test data
    bleu_scores = []
    average_loss = 0
    for i in tqdm(range(number_of_batches_in_test), desc="test metrics"):
        fd = next(test_feed_generator)
        predict_, loss_ = sess.run([decoder_prediction, loss], fd)
        for i, (inp, pred, exp) in enumerate(zip(fd[encoder_inputs].T, predict_.T, fd[decoder_targets].T)):
            input_sentence = decode_sequence(inp[::-1], rev_eng_vocab)
            output_sentence = decode_sequence(pred, rev_heb_vocab)
            expected_sentence = decode_sequence(exp, rev_heb_vocab)
            score = sentence_bleu([decode_sequence(pred, rev_heb_vocab, False)],
                                  decode_sequence(exp, rev_heb_vocab, False),
                                  smoothing_function=chencherry.method1)
            bleu_scores.append(score)
        average_loss += (loss_ / number_of_batches_in_test)
    train_writer.add_summary(
        tf.Summary(value=[tf.Summary.Value(tag="test_loss", simple_value=average_loss), ]), epoch)
    train_writer.add_summary(
        tf.Summary(value=[tf.Summary.Value(tag="test_bleu", simple_value=np.mean(bleu_scores)), ]), epoch)
Example #4
def main():
    """ 
    bleu function parameters:
        bleu(candidate, references, weights)  
        :param candidate: a candidate sentence
        :type candidate: list(str)
        :param references: reference sentences
        :type references: list(list(str))
        :param weights: weights for unigrams, bigrams, trigrams and so on
        :type weights: list(float) 
    """
    
    # Command line argument checking
    if len(sys.argv) != 3:
        sys.exit("ERROR: Invalid number of arguments, expecting 2")

    # Import the files, first the candidate into cFile and the reference to rFile
    cFile = open(sys.argv[1])
    rFile = open(sys.argv[2])
    
    cRaw = cFile.read()
    rRaw = rFile.read()

    # Then tokenize them both
    cToken = word_tokenize(cRaw)
    rToken = word_tokenize(rRaw)

    # Finally compute the BLEU score
    
    bleuSc = bleu_score.sentence_bleu([rToken], cToken)
    print(bleuSc)
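For reference, a self-contained sketch of the same sentence_bleu call with explicit n-gram weights, using hypothetical token lists rather than the files read above:

from nltk.translate import bleu_score

rToken = 'the quick brown fox jumps over the lazy dog'.split()
cToken = 'the fast brown fox jumps over the lazy dog'.split()
print(bleu_score.sentence_bleu([rToken], cToken))                      # default 4-gram weights (0.25, 0.25, 0.25, 0.25)
print(bleu_score.sentence_bleu([rToken], cToken, weights=(0.5, 0.5)))  # bigram-only BLEU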
Example #5
def main():
    """ 
        bleu function parameters:
        bleu(candidate, references, weights)  
    :param candidate: a candidate sentence
    :type candidate: list(str)
    :param references: reference sentences
    :type references: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float) 
    """

    # First define some test strings to work with
    refTextRaw = "This is the story of a man who fell from the fiftieth story of a building. While he fell, he reassured himself by repeating, 'So far, so good. So far, so good. So far, so good'. But, the important thing is not the fall - only the landing."
    candidateTextRaw = "This is the story of a man who fell from the 50th floor of a block. To reassure himself while he fell, he repeated, 'So far, so good. So far, so good. So far, so good'. However, the important thing is not the fall. Only the landing."
    refTextTokens = word_tokenize(refTextRaw)
    candidateTextTokens = word_tokenize(candidateTextRaw)

    candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']

    candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct']

    reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']

    reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party']

    reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party']

    # Work out the BLEU score
    bleuSc = bleu_score.sentence_bleu([refTextTokens], candidateTextTokens)
    print(bleuSc)
Example #6
def computeSimple(sentence1, sentence2):
    features = [0] * 8  # eight slots: indices 0-7 are assigned below
    tokenizer = RegexpTokenizer(r'\w+')
    words1 = tokenizer.tokenize(sentence1)
    words2 = tokenizer.tokenize(sentence2)
    n = len(words1)
    m = len(words2)

    # word overlap features
    count = 0 # num of same words in sentence
    for word1 in words1:
        for word2 in words2:
            if word1 == word2:
                count += 1

    features[0] = count / n # "precision"
    features[1] = count / m # "recall"

    features[2] = sentence_bleu([words1], words2)  # BLEU expects token lists, not raw strings
    features[3] = sentence_bleu([words2], words1)

    # Obtain pairs of adjacent words (as lists, so they can be iterated over repeatedly)
    skipgrams1 = list(skipgrams(words1, 2, 0))
    skipgrams2 = list(skipgrams(words2, 2, 0))

    count = 0
    for gram1 in skipgrams1:
        for gram2 in skipgrams2:
            if gram1 == gram2:
                count += 1

    features[4] = count / combinations(n, count)
    features[5] = count / combinations(m, count)


    """if (n > m):
        features[6] = m / n
    else:
        features[6] = n / m"""

    if len(sentence1) > len(sentence2):
        features[7] = len(sentence2) / len(sentence1)
    else:
        features[7] = len(sentence1) / len(sentence2)

    return features
Example #7
 def test_case_where_n_is_bigger_than_hypothesis_length(self):
     # Test BLEU to nth order of n-grams, where n > len(hypothesis).
     # TODO: Currently this test breaks the BLEU implementation (13.03.2016)
     references = ['John loves Mary'.split()]
     hypothesis = 'John loves Mary'.split()
     n = len(hypothesis) + 1 # 
     weights = [1.0/n] * n # Uniform weights.
     assert(sentence_bleu(references, hypothesis, weights) == 1.0)
Example #8
File: pro.py Project: altcrank/NLP2
def evaluate_score(translation, score, smoothing_func):
    if score == 'BLEU':
        translation_split = translation.translation
        reference_split = translation.reference
        try:
            return bleu.sentence_bleu([reference_split], translation_split, smoothing_function=smoothing_func)
        except:
            word_count = min(len(reference_split), len(translation_split))
            weights = []
            weight = 0.25
            if word_count < 4:
                weight = 1 / float(word_count)
            for i in range(min(4, word_count)):
                weights.append(weight)
            return bleu.sentence_bleu([reference_split], translation_split, weights=weights, smoothing_function=smoothing_func)
    else:
        print 'evaluate_score: unrecognized score \'{0}\''.format(score)
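A standalone Python 3 sketch of the fallback in the except branch above, with hypothetical two-token sentences; it assumes bleu refers to nltk.translate.bleu_score and shows how the weights collapse to uniform 1/word_count weights when a sentence is shorter than four tokens:

from nltk.translate import bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction

reference_split = ['hello', 'world']
translation_split = ['hello', 'there']
word_count = min(len(reference_split), len(translation_split))   # 2
weight = 1 / float(word_count) if word_count < 4 else 0.25       # 0.5
weights = [weight] * min(4, word_count)                          # [0.5, 0.5]
print(bleu.sentence_bleu([reference_split], translation_split, weights=weights,
                         smoothing_function=SmoothingFunction().method1))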
Example #9
 def test_partial_matches_hypothesis_longer_than_reference(self):
     references = ['John loves Mary'.split()]
     hypothesis = 'John loves Mary who loves Mike'.split()
     self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.4729, places=4)
     # Checks that the warning has been raised because len(reference) < 4.
     try:
         self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
     except AttributeError:
         pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
Example #10
    def test_case_where_n_is_bigger_than_hypothesis_length(self):
        # Test BLEU to nth order of n-grams, where n > len(hypothesis).
        references = ['John loves Mary ?'.split()]
        hypothesis = 'John loves Mary'.split()
        n = len(hypothesis) + 1 #
        weights = [1.0/n] * n # Uniform weights.
        self.assertAlmostEqual(sentence_bleu(references, hypothesis, weights), 0.7165, places=4)
        # Checks that the warning has been raised because len(hypothesis) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

        # Test case where n > len(hypothesis) but so is n > len(reference), and
        # it's a special case where reference == hypothesis.
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()
        assert(sentence_bleu(references, hypothesis, weights) == 1.0)
Example #11
def test():
  """Test the translation model."""
  nltk.download('punkt')
  with tf.Session() as sess:
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    src_lang_vocab_path = PATH_TO_DATA_FILES + FLAGS.src_lang + "_mapping%d.txt" % FLAGS.src_lang_vocab_size
    dst_lang_vocab_path = PATH_TO_DATA_FILES + FLAGS.dst_lang + "_mapping%d.txt" % FLAGS.dst_lang_vocab_size
    src_lang_vocab, _ = data_utils.initialize_vocabulary(src_lang_vocab_path)
    _, rev_dst_lang_vocab = data_utils.initialize_vocabulary(dst_lang_vocab_path)

    weights = [0.25, 0.25, 0.25, 0.25]

    first_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.src_lang))
    second_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.dst_lang))
		
    total_bleu_value = 0.0
    computing_bleu_iterations = 0

    for first_lang_raw in first_lang_file:
      second_lang_gold_raw = second_lang_file.readline()
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(first_lang_raw), src_lang_vocab)
      # Which bucket does it belong to?
      try:
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
      except ValueError:
        continue
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
	  {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out sentence corresponding to outputs.
      model_tran_res = " ".join([tf.compat.as_str(rev_dst_lang_vocab[output]) for output in outputs])
      second_lang_gold_tokens = word_tokenize(second_lang_gold_raw)
      model_tran_res_tokens = word_tokenize(model_tran_res)
      try:
        current_bleu_value = sentence_bleu([model_tran_res_tokens], second_lang_gold_tokens, weights)
        total_bleu_value += current_bleu_value
        computing_bleu_iterations += 1
      except ZeroDivisionError:
        pass
      if computing_bleu_iterations % 10 == 0:
        print("BLEU value after %d iterations: %.2f"
              % (computing_bleu_iterations, total_bleu_value / computing_bleu_iterations))
    final_bleu_value = total_bleu_value / computing_bleu_iterations
    print("Final BLEU value after %d iterations: %.2f" % (computing_bleu_iterations, final_bleu_value))
    return
Example #12
 def test_zero_matches(self):
     # Test case where there are 0 matches
     references = ['The candidate has no alignment to any of the references'.split()]
     hypothesis = 'John loves Mary'.split()
     
     # Test BLEU to nth order of n-grams, where n is len(hypothesis). 
     for n in range(1,len(hypothesis)):
         weights = [1.0/n] * n # Uniform weights.
         assert(sentence_bleu(references, hypothesis, weights) == 0)
Example #13
 def test_full_matches(self):    
     # Test case where there are 100% matches
     references = ['John loves Mary'.split()]
     hypothesis = 'John loves Mary'.split()
 
     # Test BLEU to nth order of n-grams, where n is len(hypothesis). 
     for n in range(1,len(hypothesis)):
         weights = [1.0/n] * n # Uniform weights.
         assert(sentence_bleu(references, hypothesis, weights) == 1.0)
Example #14
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    f = codecs.open('report-%s.csv'% args.model, 'w')
    csv_f = csv.writer(f, delimiter=',', encoding='utf-8')

    src_lines = codecs.open(args.src, 'r', 'utf-8').readlines()
    src_lines_nounk = codecs.open(args.src + '.nounk', 'r', 'utf-8').readlines()

    target_lines = codecs.open(args.target, 'r', 'utf-8').readlines()
    target_lines_nounk = codecs.open(args.target + '.nounk', 'r', 'utf-8').readlines()

    gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines()
    gold_lines_nounk = codecs.open(args.gold + '.nounk', 'r', 'utf-8').readlines()

    data = ['Src', 'Src_UNK', 'Target_UNK', 'Target', 'Gold_UNK', 'Gold', 'BLEU1']
    csv_f.writerow(data)

    num_lines = len(gold_lines)
    logging.info('Num Lines: %d'% num_lines)


    references = []
    hypotheses = []
    for index in range(num_lines):
        data = []
        data.append(src_lines_nounk[index].strip())
        data.append(src_lines[index].strip())

        data.append(target_lines[index].strip())
        data.append(target_lines_nounk[index].strip())

        data.append(gold_lines[index].strip())
        data.append(gold_lines_nounk[index].strip())

        gold = gold_lines[index].strip().split()
        output = target_lines[index].strip().split()
        default = 'UNK UNK UNK UNK'.split()

        if len(output) < 4:
            bleu_score = 0.0
            hypotheses.append(default)
        else:
            bleu_score = sentence_bleu([gold], output, weights=(1.0,))
            hypotheses.append(output)

        references.append([gold])
        logging.info('sentence:%d bleu:%f'%(index, bleu_score))
        data.append(str(bleu_score))
        csv_f.writerow(data)

    final_bleu = corpus_bleu(references, hypotheses)
    unigram_bleu = corpus_bleu(references, hypotheses, weights=(1.0,))
    logging.info('Final BLEU: %f Unigram_BLEU: %f '% (final_bleu, unigram_bleu))
Example #15
 def test_reference_or_hypothesis_shorter_than_fourgrams(self):
     # Test case where the length of the reference or hypothesis
     # is shorter than 4.
     references = ['let it go'.split()]
     hypothesis = 'let go it'.split()
     # Checks that the score returned for this hypothesis and reference is 1.0
     assert(sentence_bleu(references, hypothesis) == 1.0)
     # Checks that the warning has been raised.
     try:
         self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
     except AttributeError:
         pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
Example #16
File: model.py Project: sra4077/RLSeq2Seq
  def reward_function(self, reference, summary, measure='rouge_l/f_score'):
    """Calculate the reward between the reference and summary.

    Args:
      reference: A list of ids representing the ground-truth data
      summary: A list of ids representing the model generated data

    Returns:
      A single value representing the evaluation value for reference and summary
    """
    if 'rouge' in measure:
      return rouge([summary],[reference])[measure]
    else:
      return sentence_bleu([reference.split()],summary.split(),weights=(0.25,0.25,0.25,0.25))
Example #17
def get_validation_bleu_scores(sess, generator, iterations):
    """
    calculate the mean bleu score for specific number of minibatch iterations of a generator
    """
    scores = []
    average_loss = 0
    for i in tqdm(range(iterations), desc="validation metrics"):
        feed = next(generator)
        validation_predict_, loss_ = sess.run([decoder_prediction, loss], feed)
        scores += [
            sentence_bleu([decode_sequence(pred, rev_heb_vocab, False)], decode_sequence(exp, rev_heb_vocab, False),
                          smoothing_function=chencherry.method1) for pred, exp in
            zip(validation_predict_.T, feed[decoder_targets].T)]
        average_loss += (loss_ / iterations)
    return np.mean(scores), average_loss
Example #18
def bleu(reference, candidate):
    """
    Compute the BLEU score for a given candidate sentence, with respect to a
    given reference sentence.

    reference: the reference translation
    candidate: the candidate translation
    """
    chen_cherry = SmoothingFunction()
    try:
        return sentence_bleu([reference], candidate, smoothing_function=chen_cherry.method7)
    except ZeroDivisionError as error:
        return 0
    except AttributeError as error:
        return 0
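A quick check of the bleu helper above, using hypothetical token lists:

reference = 'the cat is on the mat'.split()
candidate = 'there is a cat on the mat'.split()
print(bleu(reference, candidate))  # smoothed score in (0, 1]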
Example #19
def bleu_ngram_score(arg):
  if arg == 1:
    sampled_poem = poem1_display.get('1.0', 'end')
  else:
    sampled_poem = poem2_display.get('1.0', 'end')
  sampled_poem = sampled_poem.rstrip()
  sampled_poem = sampled_poem.replace(',', '')
  sampled_poem = sampled_poem.split('\n')
  sampled_poem = [x.lower() for x in sampled_poem]
  sampled_token = [nltk.word_tokenize(x) for x in sampled_poem]
  # Smoothing function
  sf = SmoothingFunction().method4
  score = []
  for x in sampled_token:
    score.append(sentence_bleu(reference, x, weights = (0.25, 0.25, 0.25, 0.25), smoothing_function = sf))
  average_score = sum(score) / len(score)
  write_to_log("BLEU (cumulative 4-gram) score of Sonnet %d: %.2f" %(arg, average_score), logger_index)
Example #20
def calculate_bleu(reference, candidate):
    reference_list = tokenize_sentence(reference)
    candidate_list = tokenize_sentence(candidate)
    return sentence_bleu([reference_list], candidate_list, weights=(1, 0, 0, 0))
Example #21
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = ''
__author__ = 'zhangjingjun'
__mtime__ = '2018/9/5'
# ----------Dragon be here!----------
              ┏━┓      ┏━┓
            ┏━┛ ┻━━━━━━┛ ┻━━┓
            ┃       ━       ┃
            ┃  ━┳━┛   ┗━┳━  ┃
            ┃       ┻       ┃
            ┗━━━┓      ┏━━━━┛
                ┃      ┃ Divine beast, bless us:
                ┃      ┃ no bugs, ever!
                ┃      ┗━━━━━━━━━┓
                ┃                ┣━┓
                ┃                ┏━┛
                ┗━━┓ ┓ ┏━━━┳━┓ ┏━┛
                   ┃ ┫ ┫   ┃ ┫ ┫
                   ┗━┻━┛   ┗━┻━┛
"""
from nltk.translate.bleu_score import sentence_bleu
reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)
Example #22
def bleuScore(s1, s2):
    return bleu_score.sentence_bleu(s1, s2)
Example #23
from nltk.translate.bleu_score import sentence_bleu

reference = list('她的故事在法国遥远的西部山上')
hypothesis = list('她的故事在法国的遥远山')
score = sentence_bleu([reference], hypothesis)
print(score)
Example #24
        if val is None:
            continue
        elif val is True:
            constrained = constrained.replace(slot, slot[7:])
            attn += ' {}'.format(slot[7:])
            unconstrained += ' {}'.format(slot[7:])
        else:
            constrained = constrained.replace(slot, str(val))
            attn += ' {}'.format(str(val))
            unconstrained += ' {}'.format(str(val))
    '''
    if count % 1000 == 0:
        print(count)
    count += 1

    bleu_score[0] += sentence_bleu([target], attn)
    bleu_score[1] += sentence_bleu([target], unconstrained)
    bleu_score[2] += sentence_bleu([target], constrained)

    r_score = RGE.get_scores(attn, target)
    rouge_1[0] += r_score[0]['rouge-1']['f']
    rouge_2[0] += r_score[0]['rouge-2']['f']
    rouge_l[0] += r_score[0]['rouge-l']['f']
    r_score = RGE.get_scores(unconstrained, target)
    rouge_1[1] += r_score[0]['rouge-1']['f']
    rouge_2[1] += r_score[0]['rouge-2']['f']
    rouge_l[1] += r_score[0]['rouge-l']['f']
    r_score = RGE.get_scores(constrained, target)
    rouge_1[2] += r_score[0]['rouge-1']['f']
    rouge_2[2] += r_score[0]['rouge-2']['f']
    rouge_l[2] += r_score[0]['rouge-l']['f']
Example #25
with open(GENERATED_FILE, 'r') as input_file:
    lines = input_file.readlines()
    for i in range(0, len(lines), 2):
        source = lines[i].strip()
        generated = lines[i + 1].strip()

        source_clean = re.sub(r'<UNK> ', "", source)
        key = ''
        for target_str in source_target_dict.keys():
            word_count = 0
            for word1 in source_clean.split():
                if word1 in target_str.split():
                    word_count += 1
            if word_count == len(source_clean.split()):
                key = target_str
                break

        target = source_target_dict[key]
        semples.append((source, target, generated))

total_bleu = 0
smoothing_foonction = SmoothingFunction()
for _, t, g in semples:
    target_words = t.strip().split()
    generated_words = g.strip().split()
    score = sentence_bleu([target_words],
                          generated_words,
                          smoothing_function=smoothing_foonction.method4)
    total_bleu += score

print('BLEU: {:.4}'.format(total_bleu / len(semples)))
Example #26
def main(unused_argv):
    if len(unused_argv
           ) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(
        tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting running in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception(
                "Logdir %s doesn't exist. Run in train mode to create it." %
                (FLAGS.log_root))

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = [
        'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
        'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'max_dec_steps', 'max_enc_steps'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_generator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    hparam_list = [
        'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std',
        'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    tf.set_random_seed(111)  # a seed value for randomness

    if hps_generator.mode == 'train':

        print("Start pre-training......")
        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(
            model_class)
        print("Start pre-training classification......")
        #run_pre_train_classification(model_class, cla_batcher, 10, sess_cls, saver_cls, train_dir_cls)
        #generated = Generate_training_sample(model_class, vocab, cla_batcher, sess_cls)

        #print("Generating training examples......")
        #generated.generate_training_example("train")
        #generated.generator_validation_example("valid")

        model_sentiment = Sentimentor(hps_generator, vocab)
        sentiment_batcher = SenBatcher(hps_generator, vocab)
        sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor(
            model_sentiment)
        #run_pre_train_sentimentor(model_sentiment,sentiment_batcher,1,sess_sen,saver_sen,train_dir_sen)
        sentiment_generated = Generate_non_sentiment_weight(
            model_sentiment, vocab, sentiment_batcher, sess_sen)
        #sentiment_generated.generate_training_example("train_sentiment")
        #sentiment_generated.generator_validation_example("valid_sentiment")

        model = Generator(hps_generator, vocab)
        # Create a batcher object that will create minibatches of data
        batcher = GenBatcher(vocab, hps_generator)

        sess_ge, saver_ge, train_dir_ge = setup_training_generator(model)

        util.load_ckpt(saver_sen, sess_sen, ckpt_dir="train-sentimentor")

        util.load_ckpt(saver_cls, sess_cls, ckpt_dir="train-classification")

        generated = Generated_sample(model, vocab, batcher, sess_ge)
        #print("Start pre-training generator......")
        run_pre_train_generator(
            model, batcher, 4, sess_ge, saver_ge, train_dir_ge, generated,
            model_class, sess_cls,
            cla_batcher)  # this is an infinite loop until interrupted

        #generated.generator_validation_negetive_example("temp_negetive", batcher, model_class,sess_cls,cla_batcher) # batcher, model_class, sess_cls, cla_batcher
        #generated.generator_validation_positive_example(
        #    "temp_positive", batcher, model_class,sess_cls,cla_batcher)

        loss_window = 0
        t0 = time.time()
        print("begin dual learning:")
        for epoch in range(30):
            batches = batcher.get_batches(mode='train')
            for i in range(len(batches)):
                current_batch = copy.deepcopy(batches[i])
                sentiment_batch = batch_sentiment_batch(
                    current_batch, sentiment_batcher)
                result = model_sentiment.max_generator(sess_sen,
                                                       sentiment_batch)
                weight = result['generated']
                current_batch.weight = weight
                sentiment_batch.weight = weight

                cla_batch = batch_classification_batch(current_batch, batcher,
                                                       cla_batcher)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)

                cc = SmoothingFunction()

                reward_sentiment = 1 - np.abs(0.5 - result['y_pred_auc'])
                reward_BLEU = []
                for k in range(FLAGS.batch_size):
                    reward_BLEU.append(
                        sentence_bleu(
                            [current_batch.original_reviews[k].split()],
                            cla_batch.original_reviews[k].split(),
                            smoothing_function=cc.method1))

                reward_BLEU = np.array(reward_BLEU)

                reward_de = (2 / (1.0 / (1e-6 + reward_sentiment) + 1.0 /
                                  (1e-6 + reward_BLEU)))

                result = model.run_train_step(sess_ge, current_batch)
                train_step = result[
                    'global_step']  # we need this to update our running average loss
                loss = result['loss']
                loss_window += loss
                if train_step % 100 == 0:
                    t1 = time.time()
                    tf.logging.info(
                        'seconds for %d training generator step: %.3f ',
                        train_step, (t1 - t0) / 100)
                    t0 = time.time()
                    tf.logging.info('loss: %f', loss_window /
                                    100)  # print the loss to screen
                    loss_window = 0.0
                if train_step % 10000 == 0:
                    #bleu_score = generatored.compute_BLEU(str(train_step))
                    #tf.logging.info('bleu: %f', bleu_score)  # print the loss to screen
                    generated.generator_validation_negetive_example(
                        "valid-generated-transfer/" + str(epoch) +
                        "epoch_step" + str(train_step) + "_temp_positive",
                        batcher, model_class, sess_cls, cla_batcher)
                    generated.generator_validation_positive_example(
                        "valid-generated/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive", batcher,
                        model_class, sess_cls, cla_batcher)
                    #saver_ge.save(sess, train_dir + "/model", global_step=train_step)

                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher,
                    cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_sentiment = result['y_pred_auc']
                reward_result_bleu = np.array(bleu)

                reward_result = (2 / (1.0 /
                                      (1e-6 + reward_result_sentiment) + 1.0 /
                                      (1e-6 + reward_result_bleu)))

                current_batch.score = 1 - current_batch.score

                result = model.max_generator(sess_ge, current_batch)

                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher,
                    cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_transfer_sentiment = result['y_pred_auc']
                reward_result_transfer_bleu = np.array(bleu)

                reward_result_transfer = (
                    2 / (1.0 /
                         (1e-6 + reward_result_transfer_sentiment) + 1.0 /
                         (1e-6 + reward_result_transfer_bleu)))

                #tf.logging.info("reward_nonsentiment: "+str(reward_sentiment) +" output_original_sentiment: "+str(reward_result_sentiment)+" output_original_bleu: "+str(reward_result_bleu))

                reward = reward_result_transfer  #reward_de + reward_result_sentiment +
                #tf.logging.info("reward_de: "+str(reward_de))

                model_sentiment.run_train_step(sess_sen, sentiment_batch,
                                               reward)

    elif hps_generator.mode == 'decode':
        decode_model_hps = hps_generator  # This will be the hyperparameters for the decoder model
        #model = Generator(decode_model_hps, vocab)
        #generated = Generated_sample(model, vocab, batcher)
        #bleu_score = generated.compute_BLEU()
        #tf.logging.info('bleu: %f', bleu_score)  # print the loss to screen

    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")
Example #27
	hypothesis_list.extend(fake_data_text)
	reference.extend(real_data_text)

nll_gen_error = np.array(nll_gen_error)
nll_gen_error_mean = nll_gen_error.mean()
print(nll_gen_error_mean)

random.shuffle(hypothesis_list)
random.shuffle(reference)
reference = reference[:5000]
n_gram_bleu_scores = {"{}-gram".format(gram):0 for gram in range(2,6)}
for ngram in range(2,6):
	weight = tuple((1./ngram for _ in range(ngram)))
	bleu_score = []
	for h in hypothesis_list[:2000]:
		BLEUscore = sentence_bleu(reference,h,weight)
		bleu_score.append(BLEUscore)
	current_bleu = 1.0*sum(bleu_score)/len(bleu_score)
	n_gram_bleu_scores["{}-gram".format(len(weight))] = current_bleu
	if current_bleu < 1e-2:
		break

text_log.write("\n\nGot nll_gen mean: {}".format(nll_gen_error_mean))
for gram,score in n_gram_bleu_scores.items():
	text_log.write("\nGot {} score: {}".format(gram,score))

text_log.close()

save_model(generator,summary_path)
save_model(discriminator,summary_path)
Example #28
def train(model, train_loader, criterion, optimizer, lr_scheduler, dataloader):
    model.train()
    device = next(model.parameters()).device.index
    losses = []
    total_iter = len(train_loader)

    for i, batch in enumerate(train_loader):
        srcs, trgs = batch.src.cuda(device), batch.trg.cuda(device)

        # Empty gradients
        optimizer.zero_grad()

        # Predict targets (Forward propagation)
        preds, _, _, _ = model(srcs, trgs)

        # Unroll the preds and trgs
        preds_unroll = preds[1:].view(-1, preds.shape[-1])
        trgs_unroll = trgs[1:].view(-1)

        # Calculate loss
        loss = criterion(preds_unroll, trgs_unroll)
        losses.append(loss.item())

        # Calculate gradients (Backpropagation)
        loss.backward()

        # Cliping the parameters
        nn.utils.clip_grad_norm_(model.parameters(), 1)

        # Update learning rate
        lr_scheduler.step()

        # Update parameters
        optimizer.step()

        sys.stdout.write(
            "[{:5d}/{:5d}] lrate: {:f} total steps: {:d}\r".format(
                i + 1, total_iter, optimizer.param_groups[0]['lr'],
                lr_scheduler.current_step))
        if lr_scheduler.current_step >= TRAINING_STEPS:
            break
    # Calculate average loss
    avg_loss = sum(losses) / len(losses)

    #===========================================================================
    # Check train metric
    model.eval()
    sum_bleu = 0.0
    num_sentence = 0.0
    sos_idx = dataloader.sos_idx

    with torch.no_grad():
        for i, batch in enumerate(train_loader):
            srcs, trgs = batch.src.cuda(device), batch.trg.cuda(device)

            # Predict targets (Forward propagation)
            preds, _, _, _ = model(srcs, trgs)

            # Unroll the preds and trgs
            preds_unroll = preds[1:].view(-1, preds.shape[-1])
            trgs_unroll = trgs[1:].view(-1)

            #===================================================================
            # For BLEU score
            # Target Decoding
            trans_preds = preds
            trans_preds = trans_preds.argmax(dim=2)

            # Greedy Decoding
            # trans_preds = model.translate_forward(srcs, sos_idx, trgs.size(1))
            trans_preds = trans_preds.cpu().detach().numpy()
            trgs = trgs.cpu().detach().numpy()

            for trans_pred, trg in zip(trans_preds, trgs):
                # Translate each sentence
                pred_sentence = dataloader.translate_sentence(trans_pred)
                trg_sentence = dataloader.translate_sentence(trg)
                # Calculate each sentence bleu score
                if len(pred_sentence) > 1:
                    sum_bleu += sentence_bleu(
                        [trg_sentence],
                        pred_sentence,
                        smoothing_function=smoothing_func) * 100
                    num_sentence += 1
            #===================================================================
            sys.stdout.write("[{:5d}/{:5d}]\r".format(i + 1, total_iter))

    # Calculate the metrics
    # Perplexity
    ppl = np.exp(avg_loss)
    # Bilingual Evaluation Understudy Score
    bleu = sum_bleu / num_sentence

    return avg_loss, (ppl, bleu)
Example #29
from nltk.translate.bleu_score import sentence_bleu
ref_google = "the sleeve of the shirt tore"
ref_bing = "the shirt sleeve tore"  # translated with the Systran Translate site
ref_yandex = "shirt sleeve ripped"
candidate = ""
candidates = [
    "the tore sleeve", "the collar has tear", "the short sleeve rent",
    "this shirt is torn", "shirt sleeve the rent", "the shirt sleeve tore"
]
reference = [ref_google.split(' '), ref_bing.split(' '), ref_yandex.split(' ')]
ref_text = [
    "the shirt", "the shirt is tore", "the shirt is black", "a shirt can tear",
    "shirt sleeve"
]
i = 0
reference = []
while i < len(ref_text):
    reference.append(ref_text[i].split(' '))
    i += 1

candidate = candidate.split(' ')
print("Referencia: {}".format(reference))
print("Original: " + ' '.join(candidate))
i = 0
while i < len(candidates):
    candidate = candidates[i].split(' ')
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    print("Candidato {}({}): {}".format(i, score, candidate))
    i += 1
Example #30
def test(model, test_loader, criterion, dataloader):
    model.eval()
    device = next(model.parameters()).device.index
    losses = []
    total_iter = len(test_loader)
    sum_bleu = 0.0
    num_sentence = 0.0
    sos_idx = dataloader.sos_idx

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            srcs, trgs = batch.src.cuda(device), batch.trg.cuda(device)
            # Predict targets (Forward propagation)
            preds, enc_self, dec_self, dec_enc = model(srcs, trgs)

            # Unroll the preds and trgs
            preds_unroll = preds[1:].view(-1, preds.shape[-1])
            trgs_unroll = trgs[1:].view(-1)

            # Calculate loss
            loss = criterion(preds_unroll, trgs_unroll)
            losses.append(loss.item())

            #===================================================================
            # For BLEU score
            # Target Decoding
            trans_preds = preds
            trans_preds = trans_preds.argmax(dim=2)

            # Greedy Decoding
            # trans_preds = model.translate_forward(srcs, sos_idx, trgs.size(1))
            trans_preds = trans_preds.cpu().detach().numpy()
            trgs = trgs.cpu().detach().numpy()

            for idx, (trans_pred, trg,
                      src) in enumerate(zip(trans_preds, trgs, srcs)):
                # Translate each sentence
                pred_sentence = dataloader.translate_sentence(trans_pred)
                trg_sentence = dataloader.translate_sentence(trg)
                src_sentence = dataloader.translate_sentence(src, type='src')
                # Calculate each sentence bleu score
                if len(pred_sentence) > 1:
                    each_belu = sentence_bleu(
                        [trg_sentence],
                        pred_sentence,
                        smoothing_function=smoothing_func) * 100
                    sum_bleu += each_belu
                    num_sentence += 1
                    #===========================================================
                    # Monitoring the results
                    # print('SRC :', src_sentence)
                    # print('TRG :', trg_sentence)
                    # print('PRED:', pred_sentence)
                    # print(each_belu)
                    # input()
                    #===========================================================
                    # Visualize the attentions
                    # visualize_attention(enc_self, idx, src_sentence, src_sentence, 'enc', i)
                    # visualize_attention(dec_self, idx, trg_sentence, trg_sentence, 'dec', i)
                    # visualize_attention(dec_enc, idx, src_sentence, trg_sentence, 'edc', i)

            #===================================================================
            sys.stdout.write("[{:5d}/{:5d}]\r".format(i + 1, total_iter))

    # Calculate average loss
    avg_loss = sum(losses) / len(losses)

    # Calculate the metrics
    # Perplexity
    ppl = np.exp(avg_loss)
    # Bilingual Evaluation Understudy Score
    bleu = sum_bleu / num_sentence

    return avg_loss, (ppl, bleu)
Example #31
def bleu_score_char(dialogs, sample_amount):
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.translate.bleu_score import SmoothingFunction
    smoothie = SmoothingFunction().method4

    if (dialogs == 'cornell'):
        dataset_folder_name = 'cornell-dialogs'
    else:
        dataset_folder_name = dialogs

    data = get_outputs_and_references(dataset_folder_name, sample_amount)
    # print('DATAAA')
    # print(data)

    sum_bleu_score_1 = 0
    sum_bleu_score_2 = 0
    sum_bleu_score_3 = 0
    sum_bleu_score_4 = 0

    i = 0

    for datum in data:
        reference = []
        reference_str = datum['reference'].lower()
        output_str = datum['output']

        reference_str = reference_str.replace(".", "")
        reference_str = reference_str.replace(",", "")
        output_str = output_str.replace(".", "")
        output_str = output_str.replace(",", "")

        reference.append(nltk.tokenize.word_tokenize(reference_str))
        candidate = nltk.tokenize.word_tokenize(output_str)
        print(reference)
        print(candidate)

        try:
            bleu_1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
        except:
            bleu_1 = 0.5

        try:
            bleu_2 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.5, 0.5, 0, 0))
        except:
            bleu_2 = 0.5

        try:
            bleu_3 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.33, 0.33, 0.33, 0))
        except:
            bleu_3 = 0.5

        try:
            bleu_4 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.25, 0.25, 0.25, 0.25))
        except:
            bleu_4 = 0.5

        print(bleu_1)
        print(bleu_2)
        print(bleu_3)
        print(bleu_4)

        sum_bleu_score_1 = sum_bleu_score_1 + bleu_1
        sum_bleu_score_2 = sum_bleu_score_2 + bleu_2
        sum_bleu_score_3 = sum_bleu_score_3 + bleu_3
        sum_bleu_score_4 = sum_bleu_score_4 + bleu_4

        i = i + 1
        print(i)

    print('BLEU-1 : ' + str(round(sum_bleu_score_1 / len(data), 4)))
    print('BLEU-2 : ' + str(round(sum_bleu_score_2 / len(data), 4)))
    print('BLEU-3 : ' + str(round(sum_bleu_score_3 / len(data), 4)))
    print('BLEU-4 : ' + str(round(sum_bleu_score_4 / len(data), 4)))
Example #32
def bleu_score(dialogs, sample_amount):
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.translate.bleu_score import SmoothingFunction
    smoothie = SmoothingFunction().method4

    if (dialogs == 'cornell'):
        dataset_folder_name = 'cornell-dialogs'
    else:
        dataset_folder_name = dialogs

    data = get_outputs_and_references(dataset_folder_name, sample_amount)
    # print('DATAAA')
    # print(data)

    sum_bleu_score_1 = 0
    sum_bleu_score_2 = 0
    sum_bleu_score_3 = 0
    sum_bleu_score_4 = 0

    i = 0

    for datum in data:
        reference = []
        reference_str = datum['reference'].lower()
        output_str = datum['output']

        reference_str = reference_str.replace(".", "")
        reference_str = reference_str.replace(",", "")
        output_str = output_str.replace(".", "")
        output_str = output_str.replace(",", "")

        reference.append(nltk.tokenize.word_tokenize(reference_str))
        candidate = nltk.tokenize.word_tokenize(output_str)
        print(reference)
        print(candidate)

        try:
            bleu_1 = sentence_bleu(reference,
                                   candidate,
                                   weights=(1, 0, 0, 0),
                                   smoothing_function=smoothie)
        except:
            bleu_1 = 0.5

        try:
            bleu_2 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.5, 0.5, 0, 0),
                                   smoothing_function=smoothie)
        except:
            bleu_2 = 0.5

        try:
            bleu_3 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.33, 0.33, 0.33, 0),
                                   smoothing_function=smoothie)
        except:
            bleu_3 = 0.5

        try:
            bleu_4 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.25, 0.25, 0.25, 0.25),
                                   smoothing_function=smoothie)
        except:
            bleu_4 = 0.5

        print(bleu_1)
        print(bleu_2)
        print(bleu_3)
        print(bleu_4)

        sum_bleu_score_1 = sum_bleu_score_1 + bleu_1
        sum_bleu_score_2 = sum_bleu_score_2 + bleu_2
        sum_bleu_score_3 = sum_bleu_score_3 + bleu_3
        sum_bleu_score_4 = sum_bleu_score_4 + bleu_4

        i = i + 1
        print(i)

    print('BLEU-1 : ' + str(round(sum_bleu_score_1 / len(data), 4)))
    print('BLEU-2 : ' + str(round(sum_bleu_score_2 / len(data), 4)))
    print('BLEU-3 : ' + str(round(sum_bleu_score_3 / len(data), 4)))
    print('BLEU-4 : ' + str(round(sum_bleu_score_4 / len(data), 4)))

    with open(os.environ['CURRENT_CORNELL_MODEL'] + '-' + str(sample_amount) +
              '.txt',
              'w',
              encoding="latin1") as out_file:
        out_file.write('BLEU-1 : ' +
                       str(round(sum_bleu_score_1 / len(data), 4)))
        out_file.write('\n')
        out_file.write('BLEU-2 : ' +
                       str(round(sum_bleu_score_2 / len(data), 4)))
        out_file.write('\n')
        out_file.write('BLEU-3 : ' +
                       str(round(sum_bleu_score_3 / len(data), 4)))
        out_file.write('\n')
        out_file.write('BLEU-4 : ' +
                       str(round(sum_bleu_score_4 / len(data), 4)))
        out_file.write('\n')
        out_file.write('Sample amount : ' + str(sample_amount))
Example #33
 def test_empty_references_and_hypothesis(self):
     # Test case where both the references and the hypothesis are empty.
     references = [[]]
     hypothesis = []
     assert sentence_bleu(references, hypothesis) == 0
Example #34
 def test_empty_references(self):
     # Test case where the reference is empty.
     references = [[]]
     hypothesis = 'John loves Mary'.split()
     assert sentence_bleu(references, hypothesis) == 0
Example #35
def bleu_single(hypothesis: str, reference: str) -> float:
    return sentence_bleu([tokenizer.tokenize(reference)],
                         tokenizer.tokenize(hypothesis))
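Usage sketch for bleu_single; the excerpt does not show the tokenizer binding, so here it is assumed to be an NLTK word tokenizer:

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

print(bleu_single("the cat sat on the mat", "the cat is sitting on the mat"))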
Example #36
def baseline_bleu(df):
    smoothie = SmoothingFunction().method1
    df['bleu'] = df.apply(lambda x: sentence_bleu(x['reference_token'], x['translation_token'], weights=(1, 0, 0, 0),
                                                  smoothing_function=smoothie), axis=1)
    return df
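A toy illustration of baseline_bleu on a two-row DataFrame with hypothetical data; it assumes the pandas import and that each reference_token cell already holds a list of reference token lists:

import pandas as pd

df = pd.DataFrame({
    'reference_token':   [[['the', 'cat', 'sat']], [['hello', 'world']]],
    'translation_token': [['the', 'cat', 'sat'],   ['hello', 'there']],
})
print(baseline_bleu(df)['bleu'])  # 1.0 for the exact match, 0.5 for the partial one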
Example #37
def evaluate_summ_qa(model, dataset, mode, batch_size=64):
    assert mode in ('summ', 'qa'), 'Invalid mode!'

    is_training = model.training  # remember the current mode so it can be restored after evaluation
    model.eval()

    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              collate_fn=lambda x: x)

    rouge1_f_sum = rouge2_f_sum = rougeL_f_sum = bleu_sum = 0
    examples_rouge = examples_bleu = 0

    rouge = Rouge()
    count = 0
    if mode == 'summ':
        for mini_batch in tqdm(data_loader):
            count += 1
            refs = [' '.join(data['question']) for data in mini_batch]
            x = [data['description'] for data in mini_batch]
            hyps_raw = beam_search('summ', model, x)
            hyps = [' '.join(list(sent)) for sent in hyps_raw]
            try:
                rouge_score = rouge.get_scores(hyps,
                                               refs,
                                               avg=True,
                                               ignore_empty=True)
                rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch)
                rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch)
                rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch)
                examples_rouge += len(mini_batch)
            except ValueError as e:
                print(str(e) + ' | continuing...')
                continue

    elif mode == 'qa':
        for mini_batch in tqdm(data_loader):
            count += 1
            refs = [' '.join(data['answer']) for data in mini_batch]
            x = [data['question'] for data in mini_batch]
            hyps_raw = beam_search('qa', model, x)
            hyps = [' '.join(list(sent)) for sent in hyps_raw]
            try:
                rouge_score = rouge.get_scores(hyps,
                                               refs,
                                               avg=True,
                                               ignore_empty=True)
                rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch)
                rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch)
                rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch)
                examples_rouge += len(mini_batch)
            except ValueError as e:
                print(str(e) + ' | continuing...')
                continue

            # calculate BLEU score
            refs = [data['answer'] for data in mini_batch]
            hyps = [list(sent) for sent in hyps_raw]
            smoothie = SmoothingFunction().method4
            for i in range(len(hyps)):
                try:
                    bleu = sentence_bleu([refs[i]],
                                         hyps[i],
                                         smoothing_function=smoothie)
                    bleu_sum += bleu
                    examples_bleu += 1
                except ZeroDivisionError as e:
                    print(str(e) + ' | continuing...')
                    continue

    rouge_1_f = rouge1_f_sum / examples_rouge
    rouge_2_f = rouge2_f_sum / examples_rouge
    rouge_L_f = rougeL_f_sum / examples_rouge
    if mode == 'qa':
        bleu_score = bleu_sum / examples_bleu

    # with open('output/test_{}.txt'.format(mode), 'w', encoding='utf-8') as f:
    #     f.write('rouge-1 f: ' + str(rouge_1_f) + '\n')
    #     f.write('rouge-2 f: ' + str(rouge_2_f) + '\n')
    #     f.write('rouge-L f: ' + str(rouge_L_f) + '\n')
    #     f.write('\n')
    #
    #     for i in range((len(candidates)):
    #         f.write('input: ' + inputs[i] + '\n')
    #         f.write('hyp: ' + ''.join(candidates[i]) + '\n')
    #         f.write('ref: ' + targets[i] + '\n\n')

    if is_training:
        model.train()
    print('rouge-1 f: ' + str(rouge_1_f))
    print('rouge-2 f: ' + str(rouge_2_f))
    print('rouge-L f: ' + str(rouge_L_f))
    if mode == 'qa':
        print('bleu: ', bleu_score)
Example #38
def sample_results(preds,
                   ind2word,
                   word2ind,
                   converted_summaries,
                   converted_texts,
                   use_bleu=False):
    """Plots the actual text and summary and the corresponding created summary.
    takes care of whether beam search or greedy decoder was used.
    """
    beam = False

    if len(np.array(preds).shape) == 4:
        beam = True
    '''Bleu score is not used correctly here, but serves as reference.
    '''
    if use_bleu:
        bleu_scores = []

    for pred, summary, text, seq_length in zip(
            preds[0], converted_summaries, converted_texts,
        [len(inds) for inds in converted_summaries]):
        print('\n\n\n', 100 * '-')
        if beam:
            actual_text = [
                ind2word[word] for word in text
                if word != word2ind["<SOS>"] and word != word2ind["<EOS>"]
            ]
            actual_summary = [
                ind2word[word] for word in summary
                if word != word2ind['<EOS>'] and word != word2ind['<SOS>']
            ]

            created_summary = []
            for word in pred:
                if word[0] != word2ind['<SOS>'] and word[0] != word2ind[
                        '<EOS>']:
                    created_summary.append(ind2word[word[0]])
                    continue
                else:
                    continue

            print('Actual Text:\n{}\n'.format(' '.join(actual_text)))
            print('Actual Summary:\n{}\n'.format(' '.join(actual_summary)))
            print('Created Summary:\n{}\n'.format(' '.join(created_summary)))
            if use_bleu:
                bleu_score = sentence_bleu([actual_summary], created_summary)
                bleu_scores.append(bleu_score)
                print('Bleu-score:', bleu_score)

            print()

        else:
            actual_text = [
                ind2word[word] for word in text
                if word != word2ind["<SOS>"] and word != word2ind["<EOS>"]
            ]
            actual_summary = [
                ind2word[word] for word in summary
                if word != word2ind['<EOS>'] and word != word2ind['<SOS>']
            ]
            created_summary = [
                ind2word[word] for word in pred
                if word != word2ind['<EOS>'] and word != word2ind['<SOS>']
            ]

            print('Actual Text:\n{}\n'.format(' '.join(actual_text)))
            print('Actual Summary:\n{}\n'.format(' '.join(actual_summary)))
            print('Created Summary:\n{}\n'.format(' '.join(created_summary)))
            if use_bleu:
                bleu_score = sentence_bleu([actual_summary], created_summary)
                bleu_scores.append(bleu_score)
                print('Bleu-score:', bleu_score)

    if use_bleu:
        bleu_score = np.mean(bleu_scores)
        print('\n\n\nTotal Bleu Score:', bleu_score)
Example #39
#!/usr/bin/python3
# coding: utf-8
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

##################################################################
## 1: sentence_bleu: Calculate BLEU score (Bilingual Evaluation Understudy)
# sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=None, auto_reweigh=False, emulate_multibleu=False)
# The references must be given as a list of sentences, where each sentence is a list of tokens; the candidate is given as a single list of tokens
reference = [['this', 'is', 'a', 'test'], ['this', 'is', 'test']]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)  # prints a perfect score, because the candidate matches one of the references exactly

reference = [['the', 'cat', "is", "sitting", "on", "the", "mat"]]
test = ["on", 'the', "mat", "is", "a", "cat"]  # The hypothesis contains 0 counts of 4-gram overlaps.
print(sentence_bleu(reference, test))  # 5.5546715329196825e-78
test = ['the', 'cat', 'is', 'sitting', 'on', 'mat']
print(sentence_bleu(reference, test))  # 0.6731821382417487

##################################################################
## 2: corpus_bleu: compute the BLEU score for multiple sentences, such as a paragraph or a document
# The references must be given as a list of documents, where each document is a list of reference sentences, and each alternative reference is itself a list of tokens (a list of lists of lists of tokens)
# The candidates must be given as a list in which each document is a single list of tokens (a list of lists of tokens)

references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]  # two references for one document
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)  # 1.0; as before, this example prints a perfect score
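# A further sketch: corpus_bleu over two documents, each with its own reference list
# (hypothetical sentences, illustrating the nesting described above)
references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],          # references for document 1
              [['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]]    # references for document 2
candidates = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
              ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]
print(corpus_bleu(references, candidates))  # 1.0, since every candidate matches its reference exactly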

##################################################################
Example #40
def validate(val_loader, encoder, decoder, criterion, tok_en, tok_zh):
    '''
    Performs one epoch's validation.
    '''
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    references_en = list(
    )  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses_en = list()  # hypotheses (predictions)

    references_zh = list(
    )  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses_zh = list()  # hypotheses (predictions)

    avg_loss = 0

    with torch.no_grad():
        # Batches
        for cnt, (encap, zhcap, video, caplen_en, caplen_zh, enrefs,
                  zhrefs) in enumerate(val_loader, 1):
            encap, zhcap, video, caplen_en, caplen_zh = encap.cuda(
            ), zhcap.cuda(), video.cuda(), caplen_en.cuda(), caplen_zh.cuda()

            # Forward prop.
            init_hidden, vid_out = encoder(
                video
            )  # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            scores_en, pred_lengths_en, scores_zh, pred_lengths_zh = decoder.inference(
                encap, zhcap, init_hidden, vid_out, args.MAX_INPUT_LENGTH)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets_en = encap[:, 1:]
            scores_copy_en = scores_en.clone()
            targets_zh = zhcap[:, 1:]
            scores_copy_zh = scores_zh.clone()

            # Calculate loss
            loss_en = criterion(
                scores_en[:, 1:].contiguous().view(-1, decoder.vocab_size_en),
                targets_en.contiguous().view(-1))
            loss_zh = criterion(
                scores_zh[:, 1:].contiguous().view(-1, decoder.vocab_size_zh),
                targets_zh.contiguous().view(-1))

            # Hypotheses
            _, preds_en = torch.max(scores_copy_en, dim=2)
            preds_en = preds_en.tolist()
            temp_preds_en = list()
            for j, p in enumerate(preds_en):
                temp_preds_en.append(
                    preds_en[j][1:pred_lengths_en[j]])  # remove pads and idx-0

            preds_en = temp_preds_en
            hypotheses_en.extend(preds_en)  # preds= [1,2,3]

            enrefs = [list(map(int, i.split())) for i in enrefs
                      ]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]]

            for r in enrefs:
                references_en.append([r])

            assert len(references_en) == len(hypotheses_en)

            _, preds_zh = torch.max(scores_copy_zh, dim=2)
            preds_zh = preds_zh.tolist()
            temp_preds_zh = list()
            for j, p in enumerate(preds_zh):
                temp_preds_zh.append(
                    preds_zh[j][1:pred_lengths_zh[j]])  # remove pads and idx-0

            preds_zh = temp_preds_zh
            hypotheses_zh.extend(preds_zh)  # preds= [1,2,3]

            zhrefs = [list(map(int, i.split())) for i in zhrefs
                      ]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]]

            for r in zhrefs:
                references_zh.append([r])

            assert len(references_zh) == len(hypotheses_zh)

            avg_loss += loss_en.item() + loss_zh.item()

        # Calculate metrics
        avg_loss = avg_loss / cnt

        scorers = {
            "Bleu": Bleu(4),
            "Meteor": Meteor(),
            "Rouge": Rouge(),
            "Cider": Cider(),
            "Spice": Spice()
        }

        gts_en = {}
        res_en = {}
        for i in range(len(references_en)):
            gts_en[i] = [tok_en.decode_sentence(references_en[i][0])]
            res_en[i] = [tok_en.decode_sentence(hypotheses_en[i])]
        scores = {}
        for name, scorer in scorers.items():
            score, all_scores = scorer.compute_score(gts_en, res_en)
            if isinstance(score, list):
                for i, sc in enumerate(score, 1):
                    scores[name + str(i)] = sc
            else:
                scores[name] = score
        print("Score of EN:")
        print(scores)
        """
        gts_zh = {}
        res_zh = {}
        for i in range(len(references_zh)):
            gts_zh[i] = [tok_zh.decode_sentence(references_zh[i][0])]
            res_zh[i] = [tok_zh.decode_sentence(hypotheses_zh[i])]
        scores = {}
        for name, scorer in scorers.items():
            score, all_scores = scorer.compute_score(gts_zh, res_zh)
            if isinstance(score, list):
                for i, sc in enumerate(score, 1):
                    scores[name + str(i)] = sc
            else:
                scores[name] = score
        print("Score of ZH:")
        print(scores)
        """
        corpbleu_en = corpus_bleu(references_en, hypotheses_en)
        sentbleu_en = 0
        for i, (r, h) in enumerate(zip(references_en, hypotheses_en), 1):
            sentbleu_en += sentence_bleu(r, h, smoothing_function=cc.method7)
        sentbleu_en /= i

    return avg_loss, sentbleu_en, corpbleu_en
예제 #41
0
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    bleu_score = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        reference_words = []
        candidate_words = []
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                reference_words.append('</s>')
                break
            else:
                reference_words.append(indo_vocab.id2token[topi.item()])
            for index in target_tensor[di].data.topk(1):
                candidate_words.append(indo_vocab.id2token[index.item()]) 
            loss += criterion(decoder_output, target_tensor[di])
            #bleu_score += sentence_bleu([reference_words], candidate_words, weights=(1, 0, 0, 0))
            print("reference words", reference_words)
            print("candidate words", candidate_words)
            print("bleu score:", sentence_bleu([reference_words], candidate_words, weights=(1, 0, 0, 0)))
            decoder_input = target_tensor[di]  # Teacher forcing
        # print("Decoded words",decoded_words)
        # print("candidate words",candidate_words)    

    else:
        # Without teacher forcing: use its own predictions as the next input
        reference_words = []
        candidate_words = []
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if topi.item() == EOS_token:
                reference_words.append('</s>')
                break
            else:
                reference_words.append(indo_vocab.id2token[topi.item()])

            for index in target_tensor[di].data.topk(1):
                candidate_words.append(indo_vocab.id2token[index.item()])
            #print("Decoded words",decoded_words)
            print("reference words",reference_words)
            print("candidate words",candidate_words)
            print("blue score:",sentence_bleu(list(reference_words),candidate_words,weights=(1, 0, 0, 0)))
            loss += criterion(decoder_output, target_tensor[di])
            #blue_score += sentence_bleu(list(reference_words),candidate_words,weights=(1, 0, 0, 0))
            if decoder_input.item() == EOS_token:
                break
        # print("Decoded words",decoded_words)
        # print("candidate words",candidate_words)                      

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return (loss.item() / target_length), (bleu_score)
예제 #42
0
 def test_empty_references(self):
     # Test case where the reference is empty.
     references = [[]]
     hypothesis = 'John loves Mary'.split()
     assert(sentence_bleu(references, hypothesis) == 0)
예제 #43
0
def crossUnigramsRatio(s1, s2):
    nPairs = min(len(s1),len(s2))
    l2 = [w2.pos_ for w2 in s2]
    cnt = 0.
    for w in s1:
        if w.pos_ in l2:
            cnt += 1.
            idx = l2.index(w.pos_)
            l2.pop(idx)
    cuRatio = cnt / nPairs
    return cuRatio

with open('myOutput_'+fname+'.csv','w') as outFile:
    outFile.write('label,bleu,similarity,wmd,crossUnigrams\n')
    for snt in reader:
        if snt['gold_label'] != '-':
            s1 = nlp(snt['sentence1'])
            s2 = nlp(snt['sentence2'])

            a = bleu_score.sentence_bleu([[w.text for w in s1]], [w.text for w in s2])  # references: list of token lists; hypothesis: token list
            b = s1.similarity(s2)
            c = wmd(s1.text, s2.text)
            d = crossUnigramsRatio(s1, s2)

            outFile.write('%s,%f,%f,%f,%f\n'%(snt['gold_label'],a,b,c,d))

        else:
            noLabel += 1.

print('Done calculating values')
예제 #44
0
# cumulative BLEU scores
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
print('Cumulative 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))
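
# Companion sketch, reusing the reference and candidate above: individual n-gram precisions
# can be inspected by putting all of the weight on a single order. Orders with no matches
# print a warning and a (near-)zero score unless a smoothing function is supplied.
print('Individual 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 1, 0, 0)))
print('Individual 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 1, 0)))
print('Individual 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 0, 1)))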
예제 #45
0
def scoring(reference, candidate):
    score = sentence_bleu(reference, candidate)
    return score
예제 #46
0
파일: evaluation.py 프로젝트: oplatek/e2end
def get_bleus(referencess, wordss):
    '''Return BLEU using nltk, and 0.0 for empty decoded sequences'''
    return [sentence_bleu([r], s, smoothing_function=bleu_smoothing) if s else 0.0 for r, s in zip(referencess, wordss)]
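
# Usage sketch with hypothetical token lists. `bleu_smoothing` is assumed to be defined
# elsewhere in evaluation.py as an nltk SmoothingFunction method, so a stand-in is set here.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
bleu_smoothing = SmoothingFunction().method1
refs = [['the', 'cat', 'sat', 'on', 'the', 'mat'], ['dogs', 'bark']]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat'], []]
print(get_bleus(refs, hyps))  # the perfect match scores 1.0; the empty decoding scores 0.0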
예제 #47
0
plot_attention("california is never pleasant during winter , and it is sometimes wonderful in december .")

from nltk.translate.bleu_score import sentence_bleu

# Store the model's translation of each sentence
fr_preds = []

# Translate each English sentence in the sample
for sentence in tqdm.tqdm(source_text.split("\n")):
    fr_pred = make_prediction(sentence)
    # 存储翻译结果
    fr_preds.append(fr_pred)

# Use the French translations from the sample as references
references = target_text.split("\n")

# Store the BLEU score of each sentence
bleu_score = []

for i in tqdm.tqdm(range(len(fr_preds))):
    # Remove special tokens
    pred = fr_preds[i].replace("<EOS>", "").replace("<PAD>", "").rstrip()
    reference = references[i].lower()
    # Compute the BLEU score
    score = sentence_bleu([reference.split()], pred.split())

    bleu_score.append(score)

print("The BLEU score on our corpus is about {}".format(sum(bleu_score) / len(bleu_score)))
def bleu_score(dialogs, sample_amount):
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.translate.bleu_score import SmoothingFunction
    smoothie = SmoothingFunction().method4

    if (dialogs == 'cornell'):
        dataset_folder_name = 'cornell-dialogs'
    else:
        dataset_folder_name = dialogs

    data = get_outputs_and_references(dataset_folder_name, sample_amount)
    # print('DATAAA')
    # print(data)

    sum_bleu_score_1 = 0
    sum_bleu_score_2 = 0
    sum_bleu_score_3 = 0
    sum_bleu_score_4 = 0

    i = 0

    for datum in data:
        reference = []
        reference_str = datum['reference'].lower()
        output_str = datum['output']

        reference_str = reference_str.replace(".", "")
        reference_str = reference_str.replace(",", "")
        output_str = output_str.replace(".", "")
        output_str = output_str.replace(",", "")

        reference.append(nltk.tokenize.word_tokenize(reference_str))
        candidate = nltk.tokenize.word_tokenize(output_str)
        print(reference)
        print(candidate)

        try:
            bleu_1 = sentence_bleu(reference,
                                   candidate,
                                   weights=(1, 0, 0, 0),
                                   smoothing_function=smoothie)
        except:
            print('BLEU Error')
            bleu_1 = float(os.getenv('defbleu1')) / float(100)

        try:
            bleu_2 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.5, 0.5, 0, 0),
                                   smoothing_function=smoothie)
        except:
            print('BLEU Error')
            bleu_2 = float(os.getenv('defbleu2')) / float(100)

        try:
            bleu_3 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.33, 0.33, 0.33, 0),
                                   smoothing_function=smoothie)
        except:
            print('BLEU Error')
            bleu_3 = float(os.getenv('defbleu3')) / float(100)

        try:
            bleu_4 = sentence_bleu(reference,
                                   candidate,
                                   weights=(0.25, 0.25, 0.25, 0.25),
                                   smoothing_function=smoothie)
        except:
            print('BLEU Error')
            bleu_4 = float(os.getenv('defbleu4')) / float(100)

        print(bleu_1)
        print(bleu_2)
        print(bleu_3)
        print(bleu_4)

        sum_bleu_score_1 = sum_bleu_score_1 + bleu_1
        sum_bleu_score_2 = sum_bleu_score_2 + bleu_2
        sum_bleu_score_3 = sum_bleu_score_3 + bleu_3
        sum_bleu_score_4 = sum_bleu_score_4 + bleu_4

        i = i + 1
        print(i)

    # print('BLEU-1 : ' + str(round(sum_bleu_score_1/len(data), 4)))
    # print('BLEU-2 : ' + str(round(sum_bleu_score_2/len(data), 4)))
    # print('BLEU-3 : ' + str(round(sum_bleu_score_3/len(data), 4)))
    # print('BLEU-4 : ' + str(round(sum_bleu_score_4/len(data), 4)))

    # print('Cumulative BLEU-1 : ' + str(round(sum_bleu_score_1/len(data), 4)))
    # print('Cumulative BLEU-2 : ' + str(round(sum_bleu_score_2/len(data), 4)))
    # print('Cumulative BLEU-3 : ' + str(round(sum_bleu_score_3/len(data), 4)))
    num = os.getenv('bleu' + sys.argv[1]).split('/')
    print('Cumulative BLEU : ' + str(round(float(num[0]) / float(num[1]), 4)))
예제 #49
0
def similarity(attrs_text1, attrs_text2):  # compute similarity via BLEU
    attrs1 = []
    attrs1.append(attrs_text1.split(' '))
    attrs2 = attrs_text2.split(' ')
    score = sentence_bleu(attrs1, attrs2)
    return score
예제 #50
0
pred_good, pred_bad, bleus = [], [], []
count = 0
for jpgfnm, image_feature, tokenized_text in zip(fnm_test, di_test, dt_test):
    count += 1
    if count % 200 == 0:
        print("  {:4.2f}% is done..".format(100 * count /
                                            float(len(fnm_test))))

    desc_true = [index_word[i] for i in tokenized_text]
    desc_true = desc_true[1:-1]  # remove startseq and endseq

    desc = predict_desc(image_feature.reshape(1, len(image_feature)))
    desc = desc.split()
    desc = desc[1:-1]  # remove startseq and endseq

    bleu = sentence_bleu([desc_true], desc)
    bleus.append(bleu)
    if bleu > 0.7 and len(pred_good) < nkeep:
        pred_good.append((bleu, jpgfnm, desc_true, desc))
    elif bleu < 0.3 and len(pred_bad) < nkeep:
        pred_bad.append((bleu, jpgfnm, desc_true, desc))

print('The average accuracy based on BLEU is:', np.mean(bleus))


# demo: show the 'good' and 'bad' results
def plot_images(pred):
    def create_str(desc_true):
        line = ""
        for s in desc_true:
            line += " " + s
예제 #51
0
 def test_partial_matches_hypothesis_longer_than_reference(self):
     references = ['John loves Mary'.split()]
     hypothesis = 'John loves Mary who loves Mike'.split()
     self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.4729, places=4)
def main():
    reports = {}

    with open(config.cleaned_reports) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            else:
                uid, problems, findings, impression = row[1:]
                reports[str(uid)] = (parse_list(problems), findings,
                                     impression)

    train_reports, valid_reports, _ = create_report_splits(reports)

    train_dataset = data.XRayDataset(reports=train_reports,
                                     transform=transforms.Compose([
                                         transforms.Resize(299),
                                         transforms.RandomCrop((299, 299)),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         transforms.Normalize(
                                             mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225]),
                                     ]))
    train_dataloader = torch.utils.data.dataloader.DataLoader(
        train_dataset,
        collate_fn=data.collate_fn,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
        batch_size=config.batch_size,
        num_workers=config.batch_size)
    valid_dataset = data.XRayDataset(reports=valid_reports,
                                     transform=transforms.Compose([
                                         transforms.Resize(299),
                                         transforms.CenterCrop((299, 299)),
                                         transforms.ToTensor(),
                                         transforms.Normalize(
                                             mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225]),
                                     ]))
    valid_dataset.tokenizer = train_dataset.tokenizer
    valid_dataloader = torch.utils.data.dataloader.DataLoader(
        valid_dataset,
        collate_fn=data.collate_fn,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
        batch_size=config.batch_size,
        num_workers=config.batch_size)

    num_classes = len(train_dataset.classes)

    encoder = models.EncoderCNN(config.emb_dim,
                                num_classes).to(config.device,
                                                memory_format=memory_format)
    decoder = models.DecoderRNN_Word(
        config.emb_dim, config.hidden_dim, train_dataset.tokenizer,
        config.num_layers).to(config.device, memory_format=memory_format)

    classes_loss = torch.nn.BCEWithLogitsLoss()
    outputs_loss = torch.nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = apex.optimizers.FusedAdam(params, lr=config.learning_rate)

    [encoder, decoder], optimizer = apex.amp.initialize([encoder, decoder],
                                                        optimizer,
                                                        opt_level="O1")

    def train_one_epoch(dataloader,
                        batch_size,
                        encoder,
                        decoder,
                        classes_loss,
                        outputs_loss,
                        optimizer,
                        train=True):
        total_step = len(dataloader.dataset) // batch_size
        if train:
            encoder.train()
            decoder.train()
        else:
            encoder.eval()
            decoder.eval()
        running_c_loss = torch.Tensor([0.0])
        running_o_loss = torch.Tensor([0.0])
        state_h, state_c = decoder.zero_state(batch_size)
        with torch.set_grad_enabled(train):
            for i, (images, class_labels, captions,
                    lengths) in enumerate(progress_bar(dataloader)):
                images = images.to(
                    config.device,
                    non_blocking=True).contiguous(memory_format=memory_format)
                captions = captions.to(config.device, non_blocking=True)
                class_labels = class_labels.to(config.device,
                                               non_blocking=True)
                lengths = [o - 1 for o in lengths]
                targets = torch.nn.utils.rnn.pack_padded_sequence(
                    captions[:, 1:],
                    lengths,
                    batch_first=True,
                    enforce_sorted=False)[0]
                encoder.zero_grad()
                decoder.zero_grad()
                logits, features = encoder(images)
                c_loss = classes_loss(logits, class_labels)
                outputs, (state_h,
                          state_c) = decoder(features, captions[:, :-1],
                                             lengths, (state_h, state_c))
                o_loss = outputs_loss(outputs, targets)
                if train:
                    with apex.amp.scale_loss(c_loss, optimizer) as scaled_loss:
                        scaled_loss.backward(retain_graph=True)
                    with apex.amp.scale_loss(o_loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    state_h = state_h.detach()
                    state_c = state_c.detach()
                    optimizer.step()
                running_c_loss += c_loss
                running_o_loss += o_loss
        c_loss = float(running_c_loss.item() / total_step)
        o_loss = float(running_o_loss.item() / total_step)
        return c_loss, o_loss

    batch_size = config.batch_size

    if not args.test:
        print("Start training")

        history = {
            "train_c_loss": [],
            "train_o_loss": [],
            "valid_c_loss": [],
            "valid_o_loss": []
        }

        best_loss = 100

        for epoch in range(num_epochs):
            print("\nEpoch", epoch + 1, "/", num_epochs, ":\n")

            train_c_loss, train_o_loss = train_one_epoch(train_dataloader,
                                                         batch_size,
                                                         encoder,
                                                         decoder,
                                                         classes_loss,
                                                         outputs_loss,
                                                         optimizer,
                                                         train=True)
            print("* train_loss - ", round(train_c_loss, 3),
                  round(train_o_loss, 3), "- perplexity -",
                  round(np.exp(train_o_loss), 3))
            history["train_c_loss"].append(train_c_loss)
            history["train_o_loss"].append(train_o_loss)

            valid_c_loss, valid_o_loss = train_one_epoch(valid_dataloader,
                                                         batch_size,
                                                         encoder,
                                                         decoder,
                                                         classes_loss,
                                                         outputs_loss,
                                                         optimizer,
                                                         train=False)
            print("* valid_loss - ", round(valid_c_loss, 3),
                  round(valid_o_loss, 3), "- perplexity -",
                  round(np.exp(valid_o_loss), 3))
            history["valid_c_loss"].append(valid_c_loss)
            history["valid_o_loss"].append(valid_o_loss)

            current_valid_loss = valid_o_loss
            if current_valid_loss < best_loss:
                print("* best loss, saving weights")
                best_loss = current_valid_loss
                torch.save(encoder.state_dict(), outdir + "encoder_word.pt")
                torch.save(decoder.state_dict(), outdir + "decoder_word.pt")

        print("Save history to CSV file")
        df = pd.DataFrame(list(
            zip(history["train_c_loss"], history["train_o_loss"],
                history["valid_c_loss"], history["valid_o_loss"])),
                          columns=[
                              "train_c_loss", "train_o_loss", "valid_c_loss",
                              "valid_o_loss"
                          ])
        df.to_csv(outdir + "history.csv")

    print("Load weights and run mAP and BLEU eval")

    encoder.load_state_dict(torch.load(outdir + "encoder_word.pt"))
    decoder.load_state_dict(torch.load(outdir + "decoder_word.pt"))
    y_true, y_pred = get_class_predictions(encoder, train_dataset)
    recall, precision, AP, train_mAP = evaluate_encoder_predictions(
        y_true, y_pred)

    y_true, y_pred = get_class_predictions(encoder, valid_dataset)
    recall, precision, AP, valid_mAP = evaluate_encoder_predictions(
        y_true, y_pred)

    print("* train mAP -", round(train_mAP, 3), "- valid mAP -",
          round(valid_mAP, 3))

    bleu_scores = []

    for name, dataloader in zip(["train", "valid"],
                                [train_dataloader, valid_dataloader]):
        encoder.eval()
        decoder.eval()
        running_bleu = 0.0
        dataset_len = len(dataloader.dataset)
        with torch.set_grad_enabled(False):
            for index in trange(0, dataset_len):
                image, problems, impression = dataloader.dataset.__getitem__(
                    index)
                image_tensor = image.unsqueeze(0).to(device)
                logits, features = encoder(image_tensor)
                #                 seed = []
                #                 seed = torch.from_numpy(train_dataset.tokenizer.encode(seed)).unsqueeze(0).cuda()
                #                 predictions, seed, decode_lengths, alphas = decoder.sample(features, seed, [32, ])
                #                 sampled_ids = list(predictions[0].cpu().numpy())
                # sampled_ids = decoder.beam_decode(features)
                sampled_ids = decoder.greedy_decode(features)
                sampled_ids = [i for i in sampled_ids]
                original = train_dataset.tokenizer.decode(impression[1:-1])
                generated = train_dataset.tokenizer.decode(sampled_ids[:-1])
                reference = [nltk.word_tokenize(original)]
                candidate = nltk.word_tokenize(generated)
                bleu_score = sentence_bleu(reference,
                                           candidate,
                                           weights=(1, 0, 0, 0))
                running_bleu += bleu_score
            bleu_score = running_bleu / dataset_len
            bleu_scores.append(bleu_score)

    print("* train/valid BLEU-1 scores", bleu_scores)
예제 #53
0
 def test_empty_hypothesis(self):
     # Test case where the hypothesis is empty.
     references = ['The candidate has no alignment to any of the references'.split()]
     hypothesis = []
     assert(sentence_bleu(references, hypothesis) == 0)
def calc_bleu_many(cand_seq, ref_sequences):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(ref_sequences,
                                    cand_seq,
                                    smoothing_function=sf.method1,
                                    weights=(0.5, 0.5))
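
# Usage sketch with hypothetical token lists, assuming `from nltk.translate import bleu_score`
# at the top of the module: BLEU-2 of one candidate against two references.
cand = ['the', 'cat', 'sat', 'on', 'the', 'mat']
refs = [['the', 'cat', 'is', 'on', 'the', 'mat'],
        ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]
print(calc_bleu_many(cand, refs))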
예제 #55
0
 def test_empty_references_and_hypothesis(self):
     # Test case where both the references and the hypothesis are empty.
     references = [[]]
     hypothesis = []
     assert(sentence_bleu(references, hypothesis) == 0)
예제 #56
0
def translate_en_fr(src_sent):

    # read checkpoint path, number indicates the latest step
    CHECKPOINT_PATH = "INFO7374-12200"

    tf.reset_default_graph()

    # define the trained model
    with tf.variable_scope("nmt_model", reuse=None):
        model = NMTModel()

    # sentence for testing
    test_en_text = src_sent

    # file for vocab
    SRC_VOCAB = "vocab.en"
    TRG_VOCAB = "vocab.fr"

    # convert sentence to word_index according to vocab
    with codecs.open(SRC_VOCAB, "r", "utf-8") as f_vocab:
        src_vocab = [w.strip() for w in f_vocab.readlines()]
        src_id_dict = dict((src_vocab[x], x) for x in range(len(src_vocab)))
    test_en_ids = [
        (src_id_dict[token] if token in src_id_dict else src_id_dict['<unk>'])
        for token in test_en_text.split()
    ]

    # build inference based on saved model weights
    output_op = model.inference(test_en_ids)
    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)

    # read translation output
    output_ids = sess.run(output_op)

    # convert translation idx into word
    with codecs.open(TRG_VOCAB, "r", "utf-8") as f_vocab:
        trg_vocab = [w.strip() for w in f_vocab.readlines()]
    output_text = ' '.join([trg_vocab[x] for x in output_ids])

    # output translation
    final_output_text = output_text.encode('utf8').decode(
        sys.stdout.encoding).strip('<eos>')

    # load test_set - size: 100

    src_test = []
    with open('test.en', 'r', encoding='utf-8') as f:
        for line in f:
            src_test.append(line.strip())

    tgt_test = []
    with open('test.fr', 'r', encoding='utf-8') as f:
        for line in f:
            tgt_test.append(line.strip())

    if src_sent in src_test:
        idx = src_test.index(src_sent)
        trgt_sent = tgt_test[idx]
        bleu = sentence_bleu([trgt_sent.split()], final_output_text.split())
        lst = levenshtein(trgt_sent, final_output_text)
    else:
        trgt_sent = 'Not Available In App Test Set'
        bleu = 'NA'
        lst = 'NA'

    sess.close()
    return output_text[6:-7], trgt_sent, bleu, lst
예제 #57
0
 def bleuScore(self, s1, s2):
     return bleu_score.sentence_bleu(s1, s2)
예제 #58
0
def main(_):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    generator, rnnlm, style_discriminator, siamese_discriminator,semantic_discriminator, rollout, vocab, tsf_vocab_inv = \
               pretrain.create_model(sess, save_folder, FLAGS, embed_fn)
    saver = tf.train.Saver(tf.all_variables())

    MODEL = FLAGS.model_path
    try:
        saver.restore(sess, MODEL)
    except:
        print("Error: No model found in {}".format(MODEL))
        sys.exit(0)

    # load test data
    test_orig_sents, test_orig_words, test_orig_len = data_helpers.loadTestInputs(FLAGS.max_sent_len, save_folder)
    print("test size: {}".format(len(test_orig_sents)))

    dump_folder = "../dump/" + str(FLAGS.data_type) + "/"
    #output_path = dump_folder + FLAGS.output_path
    output_path = FLAGS.output_path +"outputs.txt"
    log_path = output_path + "logs.txt"
    f = open(output_path, "w")
    g = open(log_path, "w")
    ind = 0
    total_bleu = 0
    total_sem = 0
    total_loss = 0
    while (ind < len(test_orig_sents)):
        input_sents = test_orig_sents[ind:ind+FLAGS.batch_size]
        input_len = test_orig_len[ind:ind+FLAGS.batch_size]
        orig_words = test_orig_words[ind:ind+FLAGS.batch_size]
        # pad to batch size
        if (ind+FLAGS.batch_size > len(test_orig_sents)):
            input_sents = test_orig_sents[ind:] + [test_orig_sents[ind]] * (ind+FLAGS.batch_size-len(test_orig_sents))
            input_len = test_orig_len[ind:]+[test_orig_len[ind]]*(ind+FLAGS.batch_size-len(test_orig_sents))
            orig_words = test_orig_words[ind:] + [test_orig_words[ind]] * (ind+FLAGS.batch_size-len(test_orig_sents))
        # generator_outputs: [batch_size, time, beam_width]
        beam_generator_outputs = generator.generate(sess, input_sents, input_len)
        # generator_outputs: [batch_size, time]
        generator_outputs = np.array(beam_generator_outputs)[:,:,0]
        generator_outputs_raw,generator_outputs_len = data_helpers.cleanGeneratorOutputs(generator_outputs, FLAGS.max_sent_len)
        generator_outputs = data_helpers.cleanTexts(generator_outputs, FLAGS.max_sent_len)
        style_loss = np.sum(style_discriminator.getStyleReward(sess,generator_outputs_raw,generator_outputs_len), axis = 0)
        lm_loss = np.sum(rnnlm.getLMReward(sess,generator_outputs_raw), axis = 0)
        tsf_words = data_helpers.convertIdxToWords(generator_outputs, tsf_vocab_inv)
        tmp_ind = ind
        ind += FLAGS.batch_size
        batch_bleu = 0
        for (orig_word_seq, tsf_word_seq) in zip(orig_words, tsf_words):
            if (tmp_ind >= len(test_orig_sents)):
                break
            # output
            f.write(" ".join(tsf_word_seq)+"\n")
            # log
            g.write("orig:\t"+" ".join(orig_word_seq)+"\n")
            g.write("tsf:\t"+" ".join(tsf_word_seq)+"\n")
            score = sentence_bleu([orig_word_seq], tsf_word_seq)
            batch_bleu +=score
            g.write("\n")
            tmp_ind += 1
        print("bleu score is {}".format(batch_bleu/FLAGS.batch_size))
        total_bleu += batch_bleu
        total_loss += style_loss
        total_sem += lm_loss
        
    print("total bleu score is {}".format(total_bleu/len(test_orig_sents)))
    
    print("total style score is {}".format(total_loss/len(test_orig_sents)))
    print("total style score is {}".format(total_sem/len(test_orig_sents)))
    
    f.close()
    g.close()
    print("done saving tsf sents to", output_path)
    print("done saving both orig and tsf logs to", log_path)
예제 #59
0
파일: main.py 프로젝트: Ryureka/ai3
def bleu_compute(reference, candidate):
    score = sentence_bleu(reference, candidate)
    return score
예제 #60
0
def evaluate_autoencoder(whichdecoder, data_source, epoch):
    # Turn on evaluation mode which disables dropout.
    eos_id = corpus.dictionary.word2idx['<eos>']
    autoencoder.eval()
    ntokens = len(corpus.dictionary.word2idx)
    n_sents = 0.0
    total_loss = 0.0
    token_accuracies = 0.0
    all_source_sents = []
    all_transfer_sents = []

    pbar = tqdm(range(len(data_source)))
    for ii in pbar:
        batch = data_source[ii]

        source, target, lengths = batch
        source = to_gpu(use_cuda, Variable(source, requires_grad=False))
        target = to_gpu(use_cuda, Variable(target, requires_grad=False))
        n_sents += source.size()[0]

        mask = target.gt(0)
        masked_target = target.masked_select(mask)
        # examples x ntokens
        output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens)

        hidden = autoencoder(0, source, lengths, noise=False, encode_only=True)

        # output: batch x seq_len x ntokens
        if whichdecoder == 0:
            output = autoencoder(0, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals1, max_indices1 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices1.eq(masked_target).float()).item()

            max_values1, max_indices1 = torch.max(output, 2)
            max_indices2 = autoencoder.generate(1, hidden, maxlen=50)
        else:
            output = autoencoder(1, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals2, max_indices2 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices2.eq(masked_target).float()).item()

            max_values2, max_indices2 = torch.max(output, 2)
            max_indices1 = autoencoder.generate(0, hidden, maxlen=50)

        # forward
        total_loss += criterion_ce(masked_output / args.temp, masked_target).data

        # all_source_sents, all_transfer_sents
        max_indices1 = max_indices1.view(output.size(0), -1).data.cpu().numpy()
        max_indices2 = max_indices2.view(output.size(0), -1).data.cpu().numpy()
        target = target.view(output.size(0), -1).data.cpu().numpy()
        tran_indices = max_indices2 if whichdecoder == 0 else max_indices1
        for t, tran_idx in zip(target, tran_indices):
            # real sentence
            truncated_to_eos = t.tolist().index(eos_id) if eos_id in t.tolist() else len(t)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in t[:truncated_to_eos]])
            all_source_sents.append(chars)
            # transfer sentence
            truncated_to_eos = tran_idx.tolist().index(eos_id) if eos_id in tran_idx.tolist() else len(tran_idx)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in tran_idx[:truncated_to_eos]])
            all_transfer_sents.append(chars)

    # compare the original and transfer
    aeoutf_from = "{}/{}_output_decoder_{}_from.txt".format(args.outf, epoch, whichdecoder)
    aeoutf_tran = "{}/{}_output_decoder_{}_tran.txt".format(args.outf, epoch, whichdecoder)
    with open(aeoutf_from, 'w') as f_from, open(aeoutf_tran, 'w') as f_trans:
        # laplacian smoothing
        # for word in corpus.dictionary.word2idx.keys():
        #    f_from.write(word + "\n")
        #    f_trans.write(word + "\n")
        for i in range(len(all_source_sents)):
            # real sentence
            f_from.write(all_source_sents[i])
            # transfer sentence
            f_trans.write(all_transfer_sents[i])
            if i != len(all_source_sents) - 1:
                f_from.write("\n")
                f_trans.write("\n")

    # bleu
    all_bleu_scores = 0.0
    for i in range(len(all_source_sents)):
        sou = all_source_sents[i].split(' ')
        tran = all_transfer_sents[i].split(' ')
        all_bleu_scores += sentence_bleu([sou], tran,smoothing_function=SmoothingFunction().method7,weights=[1.0/3.0]*3)
    bleu = all_bleu_scores / n_sents * 100.0

    # forward and reverse
    loss = total_loss.item() / len(data_source)
    ppl = math.exp(loss)

    #print('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))
    #logging.info('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))

    # transfer
    labels = fasttext_classifier.predict(all_transfer_sents)
    truth = str(1 - whichdecoder)
    transfer = float(sum([l == truth for ll in labels for l in ll])) / n_sents * 100.0

    # load sentences to evaluate on
    arpa_path = '{}/{}_lm_{}.arpa'.format(args.outf, epoch, whichdecoder)
    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_from, arpa_path, args.N)
    forward = get_ppl(kenlm_model, all_transfer_sents)

    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_tran, arpa_path, args.N)
    reverse = get_ppl(kenlm_model, all_source_sents)

    #print('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))
    #logging.info('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))

    return bleu, ppl, transfer, forward, reverse