예제 #1
0
 def on_epoch_end(self, epoch, logs=None):
     """Write an HTML preview of predicted captions for one batch.

     One batch is drawn from ``self.generator``; the model predicts a token
     sequence per image, every image is saved as ``images/<i>.jpg`` and
     ``index.html`` is rewritten with the prediction, the expected captions
     and the value returned by ``bleu``.  Paths are built by plain string
     concatenation, so ``self.output_dir`` must end with a path separator.

     Args:
         epoch: Epoch index; reported as ``epoch + 1``.
         logs: Optional metrics mapping; its ``"loss"`` and ``"acc"``
             entries are printed (``None`` is shown when they are missing).
     """
     # ``None`` replaces the former ``{}`` default: a mutable default object
     # is shared by every call that omits the argument.
     if logs is None:
         logs = {}
     images, captions, X = next(self.generator)
     if self.YL is None:
         # All-ones second model input, built once and reused afterwards.
         self.YL = np.ones((len(images), self.max_lengths), dtype="int32")
     X = np.array(X, dtype="float32")
     Y_pred = self.model.predict([X, self.YL])
     # Keep the index with the highest score along the last axis.
     Y_pred = np.argmax(Y_pred, axis=-1)
     with open(self.output_dir + "index.html", "w") as f:
         f.write("<h1> Epoch " + str(epoch + 1) + "</h1>")
         f.write("<h3> Trained : " + str((epoch + 1) * 1000 * 32) +
                 " examples.</h3>")
         f.write("<h3> Loss : " + str(logs.get("loss")) + "</h3>")
         f.write("<h3> acc : " + str(logs.get("acc")) + "</h3><hr>")
         for i, image in enumerate(images):
             s = " ".join([self.i2w[int(k)] for k in Y_pred[i]])
             # Cut the sentence at the first " ." and re-append the period.
             s = s.split(" .")[0].strip() + " ."
             relative_path = "images/%i.jpg" % i
             imsave(self.output_dir + relative_path, image)
             f.write("""
             <div>
             <img src='%s' style="width:255px;height:255px;"><br><br>
             <b>Output :</b> %s<br><br>
             <b>Expected :</b>
             <ul style="margin-top : 0">""" % (relative_path, s))
             for cap in captions[i]:
                 f.write("""
                 <li>%s</li>""" % cap)
             # NOTE(review): captions[i] and s reach bleu as raw strings; if
             # bleu is NLTK's sentence_bleu this scores characters rather
             # than words -- confirm that is intended.
             f.write("""
             </ul>
             <b>BLEU score :</b> %.3f<br>
             <hr>
             </div>
             """ % bleu(captions[i], s))
예제 #2
0
 def eval(self, hypList, refList):
     """Average sentence-level BLEU of hypotheses against their references.

     ``refList`` holds the references of each hypothesis contiguously:
     hypothesis ``i`` is scored against
     ``refList[i * n_ref:(i + 1) * n_ref]`` with
     ``n_ref = len(refList) // len(hypList)``.  Sentences are split on
     whitespace and a trailing ``'.'`` token is dropped before scoring.

     Args:
         hypList: Hypothesis strings.
         refList: Reference strings, ``n_ref`` per hypothesis.

     Returns:
         dict with ``bleu_1`` .. ``bleu_4`` (all weight on that single
         n-gram order) and ``bleu`` (uniform weights over orders 1-4), each
         averaged over ``len(hypList)``.  Every value is 0.0 when
         ``hypList`` is empty.
     """
     weight_table = (
         ('bleu_1', [1]),
         ('bleu_2', [0, 1]),
         ('bleu_3', [0, 0, 1]),
         ('bleu_4', [0, 0, 0, 1]),
         ('bleu', [0.25, 0.25, 0.25, 0.25]),
     )
     result = {key: 0.0 for key, _ in weight_table}

     number = len(hypList)
     if number == 0:
         # Nothing to score; dividing by ``number`` below would fail.
         return result
     n_ref = len(refList) // number

     # Built once instead of once per hypothesis.
     smooth = SmoothingFunction()

     for index in range(number):
         hyp = hypList[index].split()
         if hyp and hyp[-1] == '.':
             hyp = hyp[:-1]
         if not hyp:
             # An empty hypothesis matches no n-gram: it adds 0 to every
             # sum but still counts in the average.
             continue

         refs = [refList[i].split()
                 for i in range(index * n_ref, (index + 1) * n_ref)]
         # ``r and`` guards against empty reference strings.
         refs = [r[:-1] if r and r[-1] == '.' else r for r in refs]

         for key, weights in weight_table:
             result[key] += bleu(refs, hyp, weights=weights,
                                 smoothing_function=smooth.method1)

     for key in result:
         result[key] /= number

     return result
예제 #3
0
 def evaluate_pair(self, predWords, targetWords):
     """Score one prediction against one reference with ``bleu``.

     Both word lists go through ``self._clear_special_tokens`` first; the
     cleaned target is used as the only reference.

     Args:
         predWords: predicted words (a list of strings).
         targetWords: reference words, same type as predWords.
     Returns:
         The value returned by ``bleu`` with ``SMOOTH.method3`` smoothing.
     """
     reference_set = [self._clear_special_tokens(targetWords)]
     hypothesis = self._clear_special_tokens(predWords)
     return bleu(reference_set, hypothesis,
                 smoothing_function=SMOOTH.method3)
예제 #4
0
 def update(self, question, response, answers, vectorizer):
     """Add the BLEU of one response to the running totals.

     The response is scored twice: against the answers whose flag is truthy
     and against all answers.  ``self.total_docs`` is incremented and the
     two scores are added to ``self.total_bleu`` / ``self.total_bleu_all``.

     Args:
         question: Not used by this method.
         response: Response text; split on whitespace.
         answers: Sequence of ``(text, flag)`` pairs; a truthy ``flag``
             marks the answer as correct.
         vectorizer: Not used by this method.
     """
     # ``str.split()`` replaces ``re.split("\s+", ...)``: the pattern was a
     # non-raw string with an invalid escape, and it produced empty-string
     # tokens for leading/trailing whitespace.
     correct_answers = [a[0].split() for a in answers if a[1]]
     all_answers = [a[0].split() for a in answers]
     response = response.split()

     def score_against(references):
         # Without any reference there is nothing to match; score it 0.
         if not references:
             return 0.0
         return bleu(references,
                     response,
                     smoothing_function=self.smoothing_function)

     try:
         bleu_score = score_against(correct_answers)
         bleu_score_all = score_against(all_answers)
     except ZeroDivisionError:
         bleu_score = 0.0
         bleu_score_all = 0.0
         print("Bleu score 0 for response %s" % str(response))
     self.total_docs += 1
     self.total_bleu += bleu_score
     self.total_bleu_all += bleu_score_all
예제 #5
0
def find_best_translation(input_line, results):
    """Pick the result whose text best matches part of ``input_line``.

    The reference is the third ``'END'``-separated field of ``input_line``.
    Each ``result[1]`` is scored against it with unigram-only ``bleu``;
    results whose text has no tokens are skipped.

    Args:
        input_line: String containing at least two ``'END'`` separators
            (only required when some result has tokens).
        results: Sequence of indexable items whose element ``1`` is the
            candidate text.

    Returns:
        ``(best_index, best_bleu_score)``; ``(0, 0.0)`` when no candidate
        scores above zero.
    """
    best_bleu_score = 0.0
    best_index = 0
    # Tokenised lazily so that, as before, ``input_line`` is only parsed
    # when there is at least one non-empty candidate.
    reference = None

    for index, result in enumerate(results):
        candidate = result[1].split()
        if not candidate:
            continue
        if reference is None:
            reference = [input_line.split('END')[2].split()]
        bleu_score = bleu(reference, candidate, weights=(1.0,))
        # Strict comparison keeps the earliest of equally scored results.
        if bleu_score > best_bleu_score:
            best_bleu_score = bleu_score
            best_index = index

    return best_index, best_bleu_score
예제 #6
0
def evaluate(mfccs, references, max_length_targ, encoder, decoder, targ_lang,
             device, beam_search=False, beam_width=3, alpha=0.3,
             nb_candidates=10):
    """Decode one input, print it next to its reference and score it.

    Args:
        mfccs: Model input passed through to the decoding routine.
        references: The single reference for this input; it is wrapped in a
            list and handed to ``bleu``.
            NOTE(review): presumably a list of tokens -- confirm at caller.
        max_length_targ, encoder, decoder, targ_lang, device: Passed
            through unchanged to the decoding routine.
        beam_search: When falsy ``greedy_decode`` is used, otherwise
            ``beam_search_decode``.
        beam_width, alpha, nb_candidates: Beam-search options; only used
            when ``beam_search`` is truthy.

    Returns:
        The BLEU score (weights ``(0.5, 0.5)``) that is also printed.
        Earlier versions returned ``None``.
    """
    if not beam_search:
        result = greedy_decode(mfccs, max_length_targ, encoder, decoder,
                               targ_lang, device)
    else:
        result = beam_search_decode(mfccs, max_length_targ, encoder, decoder,
                                    targ_lang, device=device,
                                    beam_width=beam_width,
                                    nb_candidates=nb_candidates, alpha=alpha)
    # The decoders return a string; bleu works on token lists.
    result = result.split()
    BLEUscore = bleu([references], result, weights=(0.5, 0.5))

    print("Input: {}".format(references))
    print("\n")
    print("Predicted translation: {}".format(result))
    print("\n")
    print("Bleu score: {}".format(BLEUscore))

    return BLEUscore
    
    
예제 #7
0
def get_bleu_score(candidate_text, full_text, N=3):
    """Sum the ``bleu`` scores of every N-token window of a candidate.

    Tokens in both inputs are ``word/pos-tag`` pairs; the part after the
    last ``'/'`` is dropped.  All words of ``full_text`` form the single
    reference against which each window is scored.

    Args:
        candidate_text: Whitespace-separated ``word/tag`` tokens.
        full_text: Iterable of lines in the same format.
        N: Window length; also sets the weight ``1.0 / N`` passed to
            ``bleu``.  The window used to be fixed at 3 regardless of
            ``N``; the default behaves exactly as before.

    Returns:
        The summed score; 0.0 when the candidate has fewer than ``N``
        tokens.
    """
    def strip_tag(token):
        return token.rsplit('/', 1)[0]

    all_words = [strip_tag(token)
                 for line in full_text
                 for token in line.split()]
    candidate_seq = [strip_tag(token) for token in candidate_text.split()]

    weight = 1.0 / N
    references = [all_words]
    bleu_score = 0.0

    for start in range(len(candidate_seq) - N + 1):
        bleu_score += bleu(references, candidate_seq[start:start + N],
                           [weight])

    return bleu_score
    def update(self, question, response, answers, vectorizer):
        """Score a response against every answer and update the totals.

        Each answer text is used as the single reference for one ``bleu``
        call; a ``ZeroDivisionError`` from ``bleu`` counts as 0.0.  The
        scores feed ``calculateMAP`` together with the answer flags, and
        their mean is added to ``self.total_average_bleu``.

        Args:
            question: Not used by this method.
            response: Response text; split on whitespace.
            answers: Sequence of ``(text, flag)`` pairs.
            vectorizer: Not used by this method.

        Returns:
            List with one BLEU score per answer, in input order.
        """
        # ``str.split()`` replaces ``re.split("\s+", ...)``: the pattern was
        # a non-raw string with an invalid escape, and it produced
        # empty-string tokens for leading/trailing whitespace.
        all_answers = [a[0].split() for a in answers]
        response = response.split()

        similarities = []
        for reference in all_answers:
            try:
                bleu_score = bleu([reference],
                                  response,
                                  smoothing_function=self.smoothing_function)
            except ZeroDivisionError:
                bleu_score = 0.0
            similarities.append(bleu_score)

        map_score_based_on_bleu = calculateMAP(similarities,
                                               [a[1] for a in answers])
        # np.mean([]) is NaN and would poison the running total for good.
        average_bleu = np.mean(similarities) if similarities else 0.0
        self.total_docs += 1
        self.mapBLEU += map_score_based_on_bleu
        self.total_average_bleu += average_bleu
        return similarities
예제 #9
0
    def eval(self, hypList, refList):
        """Average case-insensitive sentence-level BLEU of hypotheses.

        All strings are lower-cased first.  ``refList`` holds the
        references of each hypothesis contiguously: hypothesis ``i`` is
        scored against ``refList[i * n_ref:(i + 1) * n_ref]`` with
        ``n_ref = len(refList) // len(hypList)``.  Sentences are split on
        whitespace and a trailing ``'.'`` token is dropped before scoring.

        Args:
            hypList: Hypothesis strings.
            refList: Reference strings, ``n_ref`` per hypothesis.

        Returns:
            dict with ``bleu_1`` .. ``bleu_4`` (all weight on that single
            n-gram order) and ``bleu`` (uniform weights over orders 1-4),
            each averaged over ``len(hypList)``.  Every value is 0.0 when
            ``hypList`` is empty.
        """
        weight_table = (
            ('bleu_1', [1]),
            ('bleu_2', [0, 1]),
            ('bleu_3', [0, 0, 1]),
            ('bleu_4', [0, 0, 0, 1]),
            ('bleu', [0.25, 0.25, 0.25, 0.25]),
        )
        result = {key: 0.0 for key, _ in weight_table}

        hypList = [it.lower() for it in hypList]
        refList = [it.lower() for it in refList]
        number = len(hypList)
        if number == 0:
            # Nothing to score; dividing by ``number`` below would fail.
            return result
        n_ref = len(refList) // number

        # Built once instead of once per hypothesis.
        smooth = SmoothingFunction()

        for index in range(number):
            hyp = hypList[index].split()
            if hyp and hyp[-1] == '.':
                hyp = hyp[:-1]
            if not hyp:
                # An empty hypothesis matches no n-gram: it adds 0 to
                # every sum but still counts in the average.
                continue

            refs = [
                refList[i].split()
                for i in range(index * n_ref, (index + 1) * n_ref)
            ]
            # ``r and`` guards against empty reference strings.
            refs = [r[:-1] if r and r[-1] == '.' else r for r in refs]

            for key, weights in weight_table:
                result[key] += bleu(refs,
                                    hyp,
                                    weights=weights,
                                    smoothing_function=smooth.method1)

        for key in result:
            result[key] /= number

        return result
예제 #10
0
def _ngram_token_set(seq, with_full_string=False):
    '''Return the set of 1-grams and '_'-joined 2-grams of seq.

    With with_full_string the whole sequence joined by '_' is added too.
    '''
    grams = set(seq)
    grams.update(seq[i] + '_' + seq[i + 1] for i in range(len(seq) - 1))
    if with_full_string:
        grams.add('_'.join(seq))
    return grams


def _jaccard(left, right):
    '''Return |left & right| / |left | right|, or 0.0 if both are empty.'''
    union_size = len(left | right)
    if union_size == 0:
        return 0.0
    return len(left & right) / float(union_size)


def compute_match_scores(tgt_seqs,
                         pred_seqs,
                         do_lower=True,
                         do_stem=True,
                         type='exact'):
    '''
    Score every predicted sequence against the target sequences.

    If type='exact', returns a 1D array with 1 for each pred that equals
        some tgt word for word, else 0
    If type='ngram', returns a 2D matrix, each value v_ij is the jaccard
        similarity between the 1+2 gram sets of pred_i and tgt_j
    If type='mixed', same as 'ngram' but the '_'-joined full string is
        added to each set, serving as an exact+partial surrogate
    If type='bleu', returns a 2D matrix of bleu scores of pred_i (as the
        hypothesis) against tgt_j (as the only reference)
    Any other type returns the 2D matrix filled with zeros.
    :param tgt_seqs: list of target sequences (lists of words)
    :param pred_seqs: list of predicted sequences (lists of words)
    :param do_lower: lower-case every word before matching
    :param do_stem: run stem_word_list over every sequence before matching
    :param type: 'exact', 'ngram', 'mixed' or 'bleu'
    :return: float32 array of shape (len(pred_seqs),) for 'exact',
        otherwise (len(pred_seqs), len(tgt_seqs))
    '''
    if type == "exact":
        match_score = np.zeros(shape=(len(pred_seqs)), dtype='float32')
    else:
        match_score = np.zeros(shape=(len(pred_seqs), len(tgt_seqs)),
                               dtype='float32')

    if do_lower:
        tgt_seqs = [[w.lower() for w in seq] for seq in tgt_seqs]
        pred_seqs = [[w.lower() for w in seq] for seq in pred_seqs]
    if do_stem:
        tgt_seqs = [stem_word_list(seq) for seq in tgt_seqs]
        pred_seqs = [stem_word_list(seq) for seq in pred_seqs]

    if type == 'exact':
        for pred_id, pred_seq in enumerate(pred_seqs):
            # same length and same word at every position
            if any(list(pred_seq) == list(true_seq) for true_seq in tgt_seqs):
                match_score[pred_id] = 1
    elif type in ('ngram', 'mixed'):
        with_full_string = type == 'mixed'
        # target sets do not depend on the prediction, build them once
        tgt_sets = [
            _ngram_token_set(seq, with_full_string) for seq in tgt_seqs
        ]
        for pred_id, pred_seq in enumerate(pred_seqs):
            pred_set = _ngram_token_set(pred_seq, with_full_string)
            for true_id, true_set in enumerate(tgt_sets):
                match_score[pred_id, true_id] = _jaccard(true_set, pred_set)
    elif type == 'bleu':
        # account for the match of subsequences, like n-gram-based (BLEU) or LCS-based
        # n-grams precision doesn't work that well
        for pred_id, pred_seq in enumerate(pred_seqs):
            for true_id, true_seq in enumerate(tgt_seqs):
                # references come first, then the hypothesis; the two
                # arguments used to be passed the other way round
                match_score[pred_id, true_id] = bleu([true_seq], pred_seq,
                                                     [0.7, 0.3, 0.0])

    return match_score
예제 #11
0
    def evaluate(self, data_loader):
        """Greedy-decode every batch and accumulate loss and text metrics.

        For each batch the encoder is run step by step, the decoder is fed
        its own top-1 prediction at every step, and the NLL loss of each
        step is added up.  Decoded and reference sentences (token ids as
        strings, with ids 0/1/2 removed) are scored with BLEU-1, BLEU-4,
        ROUGE-L and METEOR.

        Args:
            data_loader: Iterable yielding ``(batch_x, batch_y)`` tensors;
                both are transposed here so that dimension 0 is time.

        Returns:
            ``(loss, score)`` where ``score`` maps ``'Bleu_1'``,
            ``'Bleu_4'``, ``'ROUGE_L'`` and ``'METEOR'`` to their sums,
            all divided by ``target_length * data_size`` (see note below).
        """
        loss = 0.0
        data_size = 0
        score = {'Bleu_1': 0, 'Bleu_4': 0, 'ROUGE_L': 0, 'METEOR': 0}
        r = Rouge()
        m = Meteor()
        criterion = nn.NLLLoss()
        ignore = (0, 1, 2)  # [SOS_token, EOS_token, PAD_token]
        for batch_x, batch_y in data_loader:
            batch_size = batch_x.size(0)
            encoder_hidden = self.encoder.initHidden(batch_size)

            # Switch to time-major layout: (seq_len, batch_size).
            batch_x = Variable(batch_x.transpose(0, 1))
            batch_y = Variable(batch_y.transpose(0, 1))

            input_length = batch_x.size(0)
            target_length = batch_y.size(0)

            data_size += batch_size

            output = torch.LongTensor(target_length, batch_size)

            # Filled below but not consumed by the decoder call.
            encoder_outputs = torch.zeros(self.max_length, batch_size,
                                          self.encoder.hidden_size)
            encoder_outputs = encoder_outputs.cuda(
            ) if use_cuda else encoder_outputs

            for ei in range(input_length):
                encoder_output, encoder_hidden = self.encoder(
                    batch_x[ei], batch_size, encoder_hidden)
                encoder_outputs[ei] = encoder_output[0]

            decoder_input = torch.LongTensor([SOS_token] * batch_size)
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
            decoder_hidden = encoder_hidden

            for di in range(target_length):
                decoder_output, decoder_hidden = self.decoder(
                    decoder_input, batch_size, decoder_hidden)
                topv, topi = decoder_output.data.topk(1)

                output[di] = topi.view(-1)
                # Greedy decoding: the prediction is the next input.
                decoder_input = topi.view(-1)
                loss += criterion(decoder_output, batch_y[di]).item()

            output = output.transpose(0, 1)  # (batch_size, target_len)
            # batch_y is time-major; flip it so that row ``di`` is the
            # reference of sample ``di``.  Indexing batch_y directly with a
            # batch index picked a time step instead of a sentence.
            targets = batch_y.transpose(0, 1)
            for di in range(output.size()[0]):
                sent = [
                    str(word.item()) for word in output[di]
                    if word.item() not in ignore
                ]
                y = [
                    str(word.item()) for word in targets[di]
                    if word.item() not in ignore
                ]
                score['ROUGE_L'] += r.calc_score([' '.join(sent)],
                                                 [' '.join(y)])
                score['Bleu_1'] += bleu([y], sent, weights=[1.0])
                score['Bleu_4'] += bleu([y],
                                        sent,
                                        weights=[0.25, 0.25, 0.25, 0.25])
                score['METEOR'] += m._score(" ".join(sent), [" ".join(y)])
        # Parenthesised so the line is valid on Python 3 as well as 2.
        print('data amount:%d' % data_size)
        # NOTE(review): the metrics are summed once per sentence, yet they
        # are divided by target_length * data_size (using the last batch's
        # target_length), the same denominator as the loss.  Left as is to
        # keep reported numbers comparable -- confirm the intended scale.
        score['Bleu_1'] = score['Bleu_1'] / (target_length * data_size)
        score['Bleu_4'] = score['Bleu_4'] / (target_length * data_size)
        score['ROUGE_L'] = score['ROUGE_L'] / (target_length * data_size)
        score['METEOR'] = score['METEOR'] / (target_length * data_size)
        return loss / (target_length * data_size), score
예제 #12
0
 def bleu_score(self, candidate, references):
     """Return ``bleu`` of a candidate after removing ``'<pad>'`` tokens.

     Padding is stripped from the candidate and from every reference; the
     score uses the weights ``[0.5, 0.5]``.
     """
     def without_padding(tokens):
         return [tok for tok in tokens if tok != '<pad>']

     hypothesis = without_padding(candidate)
     cleaned_refs = [without_padding(ref) for ref in references]
     return bleu(cleaned_refs, hypothesis, [0.5, 0.5])