Example #1
def train_seq2seq(input_filename, output_filename, model_dir):    
    def input_fn(input_filename, output_filename, batch_size, shuffle_buffer=1):
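        # Build three generator-backed datasets (encoder inputs, decoder targets, decoder inputs),
        # zip them together, shuffle, pad each batch to a common length and return the next batch
        # as a feature dict through a one-shot iterator.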

        encoder_input_data_gen = lambda: data_loader.data_generator_3(input_filename, is_encoder_input=True)
        decoder_output_data_gen = lambda: data_loader.data_generator_3(output_filename)
        decoder_input_data_gen = lambda: data_loader.data_generator_3(output_filename, is_decoder_input=True)

        encoder_input_data = tf.data.Dataset.from_generator(encoder_input_data_gen,
                                                            output_types=tf.int32,
                                                            output_shapes=(None,))
        decoder_output_data = tf.data.Dataset.from_generator(decoder_output_data_gen,
                                                             output_types=tf.int32,
                                                             output_shapes=(None,))
        decoder_input_data = tf.data.Dataset.from_generator(decoder_input_data_gen,
                                                            output_types=tf.int32,
                                                            output_shapes=(None,))

        dataset = tf.data.Dataset.zip((encoder_input_data, decoder_output_data, decoder_input_data)).shuffle(shuffle_buffer).repeat(1).padded_batch(batch_size,
                                                                                                                                                    padded_shapes=([None],[None],[None]))

        iterator = dataset.make_one_shot_iterator()

        encoder_inputs, decoder_outputs, decoder_inputs = iterator.get_next()

        return {'encoder_inputs': encoder_inputs, 'decoder_outputs': decoder_outputs, 'decoder_inputs': decoder_inputs}


    est = tf.estimator.Estimator(model_fn=seq2seq,
                                 model_dir=model_dir,
                                 params=params)

    train_input_func = lambda: input_fn(cfg.source_data_train, cfg.target_data_train, cfg.batch_size, shuffle_buffer=1000)
    eval_input_func = lambda: input_fn(cfg.source_data_dev, cfg.target_data_dev, cfg.batch_size)
    test_input_func = lambda: input_fn(cfg.source_data_test, cfg.target_data_test, cfg.batch_size)
    
    # first train for 20000 steps
    est.train(input_fn=train_input_func, steps=20000)
    for r in range(cfg.num_rounds):
        # training for num_steps steps
        print('\nRound', r + 1)
        est.train(input_fn=train_input_func, steps=cfg.num_steps)
        
        # evaluation
        print('\nEvaluation:')
        predictions = est.predict(input_fn=eval_input_func)

        # writing the predictions into a file
        print('\n\nWriting Predictions...')
        for i, pred in enumerate(predictions):
            with open('./predictions/' + str(i), 'w+') as pred_file:
                for keyph in np.array(pred).T:
                    pred_file.write(data_loader.index_to_sent(keyph).replace('<EOS>', '')
                                                                    .replace('<UNK>', '')
                                                                    .replace('<SOS>', '') + '\n')
                    
        # running the evaluation metrics, precision, recall, f1-score, and ROUGE
        precision_and_recall(r)
        rouge(5)
        rouge(10)
Example #2
def choosebest_ind(sentences):
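    # Score every sentence individually with ROUGE, then keep the best-scoring
    # sentences (highest first) until the MAX_WORDS budget is exceeded.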
    scores = []
    for i in range(0, len(sentences)):
        with open(source_sen_file, "w") as w:
            w.write(sentences[i])
        rouge(MAX_WORDS, config_file, rouge_out)
        score = parse_rouge(rouge_out, ver)
        scores.append(score)
    count = 0
    best = []
    for sen, scr in sorted(zip(sentences, scores), key=lambda pair: -pair[1]):
        best.append(sen)
        count += len(sen.split(" "))
        if count > MAX_WORDS: break
    return best
Example #3
def compute_rouge(sent_list, refs):
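    # Write the candidate sentences and each reference set to temporary files,
    # build a ROUGE config line from them, run ROUGE and return ROUGE-2 plus a
    # small ROUGE-1 component as a tie-breaker.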
    sentext = "\n".join(sent_list)
    code = abs(hash(sentext))
    name0 = "%s/mds/%d.txt" % (TMP, code)
    utils.write2file(sentext, name0)
    cfgline = name0
    for i, sens in enumerate(refs):
        name1 = "%s/mds/%d.txt%d" % (TMP, code, i)
        utils.write2file("\n".join(sens), name1)
        cfgline = cfgline + " " + name1
    cfgfile = "%s/mds/%d.cfg" % (TMP, code)
    utils.write2file(cfgline, cfgfile)
    rouge_out = "%s/mds/%d.rge" % (TMP, code)
    rge.rouge(1000, cfgfile, rouge_out)
    score = rge.parse_rouge(rouge_out,
                            2) + 0.0001 * rge.parse_rouge(rouge_out, 1)
    return score
Example #4
def moses_bl_rouge(p, l):
    bl = bleu.moses_multi_bleu(p, l)
    x = rouge.rouge(p, l)
    print(
        'Moses BLEU: %f\nROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\nROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\nROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
        %
        (bl, x['rouge_1/f_score'], x['rouge_1/p_score'], x['rouge_1/r_score'],
         x['rouge_2/f_score'], x['rouge_2/p_score'], x['rouge_2/r_score'],
         x['rouge_l/f_score'], x['rouge_l/p_score'], x['rouge_l/r_score']))
Example #5
def choosebest_seq(sentences):
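    # Greedily grow the summary: at every step add the remaining sentence that
    # maximizes the ROUGE score of the current selection, stopping once the
    # MAX_WORDS budget is exceeded or no sentences are left.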
    best = []
    count = 0
    while True:
        mscore = -1
        msen = None
        for sen in sentences:
            if sen in best: continue
            with open(source_sen_file, "w") as w:
                w.write("\n".join(best + [sen]))
            rouge(MAX_WORDS, config_file, rouge_out)
            score = parse_rouge(rouge_out, ver)
            if score > mscore:
                mscore = score
                msen = sen
        if msen is None: break
        best.append(msen)
        count += len(msen.split(" "))
        if count > MAX_WORDS: break
    return best
Example #6
  def reward_function(self, reference, summary, measure='rouge_l/f_score'):
    """Calculate the reward between the reference and summary.

    Args:
      reference: A list of ids representing the ground-truth data
      summary: A list of ids representing the model generated data

    Returns:
      A single value representing the evaluation value for reference and summary
    """
    if 'rouge' in measure:
      return rouge([summary],[reference])[measure]
    else:
      return sentence_bleu([reference.split()],summary.split(),weights=(0.25,0.25,0.25,0.25))
Example #8
def _rouge(ref_file, summarization_file, subword_option=None):
    """Compute ROUGE scores and handling BPE."""

    references = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh:
        for line in fh:
            references.append(_clean(line, subword_option))

    hypotheses = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file,
                                                  "rb")) as fh:
        for line in fh:
            hypotheses.append(_clean(line, subword_option=None))

    rouge_score_map = rouge.rouge(hypotheses, references)
    return 100 * rouge_score_map["rouge_l/f_score"]
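A minimal usage sketch for the helper above, assuming two plain-text files with one summary per line (the file names are placeholders):

rouge_l_f = _rouge("refs.txt", "hyps.txt")  # ROUGE-L F-score scaled to 0-100
print(rouge_l_f)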
Example #9
def rouge_score(references, generated):
    """both are a list of strings"""
    score = rouge(generated, references)
    rouge_s = {k: (v * 100) for (k, v) in score.items()}
    '''
    "rouge_1/f_score": rouge_1_f,
    "rouge_1/r_score": rouge_1_r,
    "rouge_1/p_score": rouge_1_p,
    "rouge_2/f_score": rouge_2_f,
    "rouge_2/r_score": rouge_2_r,
    "rouge_2/p_score": rouge_2_p,
    "rouge_l/f_score": rouge_l_f,
    "rouge_l/r_score": rouge_l_r,
    "rouge_l/p_score": rouge_l_p,
    '''
    return rouge_s
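A minimal usage sketch for the wrapper above, assuming the same dict-returning rouge implementation (the available keys are listed in the comment block); the strings are placeholders:

refs = ["the cat sat on the mat"]
gens = ["a cat sat on the mat"]
scores = rouge_score(refs, gens)
print(scores["rouge_1/f_score"], scores["rouge_2/f_score"], scores["rouge_l/f_score"])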
Example #10
def test_scratch(xtt,ytt,int_to_vocab,vocab_to_int,encoder_model,decoder_model,max_sl,max_rl):
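  # Decode every test sequence with the trained encoder/decoder models, collect the
  # original and predicted summaries, write review/original/predicted triples to a
  # text file and report BLEU and ROUGE scores.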
  st=time.time()
  predictions = []
  real_og=[]
  pred_op=[]
  c=0
  b=50
  for i in range(0,len(xtt)):
    #review
    review=seq_to_text(xtt[i],int_to_vocab)
    review=review.replace("<PAD>",'')
    #original summary   
    og_summary=seq_to_summary(ytt[i],vocab_to_int,int_to_vocab)
    og_summary=og_summary.replace("<PAD>",'')
    real_og.append(str(og_summary))
    #predicted summary   
    predict_summary=decode_sequence(xtt[i].reshape(1,max_rl),encoder_model,decoder_model,vocab_to_int,int_to_vocab,max_sl)
    predict_summary=predict_summary.replace("<PAD>",'')
    pred_op.append(str(predict_summary))
    #collect the review/original/predicted line to write to a text file later
    predictions.append("review:"+review+"\t"+"original:"+og_summary+"\t"+"predicted:"+predict_summary+"\n")
    #print only a limited amount of output (Colab truncates long outputs at about 5000 lines);
    #the complete output is written to the text file
    if c>b:
      print("Review: {}".format(review))
      print("Original Summary: {}".format(og_summary))
      print("Predicted Summary: {}".format(predict_summary))
      b+=b
    c+=1

  print("total time to complete {}".format(time.time()-st))
  file = open("/content/drive/MyDrive/LSTMscore.txt","w")
  file.writelines(predictions)
  file.close()

  bleau=compute_bleu(real_og,pred_op, max_order=4,smooth=False)
  bscore=nltk.translate.bleu_score.corpus_bleu(real_og,pred_op)
  rougen=rouge_n(pred_op, real_og, n=2)
  ro=rouge(pred_op, real_og)

  print("bleu, precisions, bp, ratio, translation_length, reference_length",bleau)
  print("bleau score",bscore)
  print("rouge2",rougen)
  print("rouge",ro)
Example #11
def testT5(model,tokenizer,test_loader):
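  # Generate summaries with the T5 model for every test batch, decode predictions and
  # references, write the pairs to a text file and report BLEU and ROUGE scores.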
  #initialize the empty lists
  predictions = []
  real_og=[]
  pred_op=[]
  c=0
  b=1000
  #for data in test loader
  for i, (input_ids, attention_mask, y) in enumerate(test_loader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    y = y.to(device)
    #generate summaries 
    #store real and predicted summary in a list and write in txt file
    summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask,max_length=10)
    pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    #print only a limited amount of output (Colab truncates long outputs at about 5000 lines);
    #the complete output is written to the text file
    for pred_sent, real_sent in zip(pred, real): 
      if c>b:
        print("Original: {}".format(real_sent))
        print("Predicted: {}".format(pred_sent))
        print("\n")
        b+=b
      real_og.append(real_sent)
      pred_op.append(pred_sent)
      predictions.append(str("pred sentence: " + pred_sent + "\t\t real sentence: " + real_sent+"\n"))
      c+=1
  file1 = open("/content/drive/MyDrive/TFIVE.txt","w")
  file1.writelines(predictions)
  file1.close()
  #calculate scores
  bleau=compute_bleu(real_og,pred_op, max_order=4,smooth=False)
  bscore=nltk.translate.bleu_score.corpus_bleu(real_og,pred_op)
  rougen=rouge_n(pred_op, real_og, n=2)
  ro=rouge(pred_op, real_og)

  print("bleu, precisions, bp, ratio, translation_length, reference_length",bleau)
  print("bleau score",bscore)
  print("rouge2",rougen)
  print("rouge",ro)
Example #12
def get_metrics(f1,f2):
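        # f1 is a glob pattern for the decoded/system output files and f2 for the
        # reference files; each file is flattened into a single string before scoring.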
        ref = []
        decoded = []
        count = 0
        for i, j in zip(sorted(glob.glob(f1)),sorted(glob.glob(f2))):
                ref_tex = ''
                dec_tex = ''
                for k in open(i).readlines():
                        dec_tex = dec_tex + k.strip()
                for l in open(j).readlines():
                        ref_tex = ref_tex + l.strip()
                ref.append(ref_tex)
                decoded.append(dec_tex)
                count = count + 1

        bl = bleu.moses_multi_bleu(decoded,ref)
        x = rouge.rouge(decoded,ref)
        s = "\t%.2f\t%.2f\t%.2f\t%.2f"%(bl,x['rouge_1/f_score']*100,x['rouge_2/f_score']*100,x['rouge_l/f_score']*100)
        print(count)
        return s
Example #13
def view_lstm():
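  # Re-read the LSTMscore.txt file written by test_scratch, split each line back into
  # review/original/predicted columns and recompute BLEU and ROUGE.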
  f = open("/content/drive/MyDrive/LSTMscore.txt", "r")
  text=f.readlines()
  text=pd.DataFrame(text,columns=["value"])
  text=text["value"].str.split("\t",expand=True)
  text.columns=["value","original","predicted"]
  text["original"]=text["original"].str.split(":").str[1]
  text["predicted"]=text["predicted"].str.split(":").str[1]
  text["predicted"]=text["predicted"].replace('\n','', regex=True)
  f.close()
  bleau=compute_bleu(text["original"],text["predicted"], max_order=4,smooth=False)
  bscore=nltk.translate.bleu_score.corpus_bleu(text["original"],text["predicted"])
  rougen=rouge_n(text["predicted"], text["original"], n=2)
  ro=rouge(text["predicted"],text["original"])

  print("bleu, precisions, bp, ratio, translation_length, reference_length",bleau)
  print("bleau score",bscore)
  print("rouge2",rougen)
  print("rouge",ro)
  return text
Example #14
def get_metrics(f1, f2):
    ref = []
    decoded = []
    count = 0
    print(f1)
    print(f2)
    for i, j in zip(sorted(glob.glob(f1)), sorted(glob.glob(f2))):
        ref_tex = ''
        dec_tex = ''
        for k in open(i).readlines():
            dec_tex = dec_tex + k.strip()
        for l in open(j).readlines():
            ref_tex = ref_tex + l.strip()
        ref.append(ref_tex)
        decoded.append(dec_tex)
        count = count + 1
    print(len(decoded))
    print(len(ref))
    x = rouge.rouge(decoded, ref)

    bl = bleu.moses_multi_bleu(decoded, ref)  #replace by pycoco bleu
    return 0, 0, 0, bl
Example #15
def view_t5_op():
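  # Reload the cleaned reviews, truncate them, then re-read the TFIVE.txt predictions file
  # written by testT5, split it into predicted/original columns and recompute BLEU and ROUGE.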
  #get the final cleaned data
  df=pd.read_csv('/content/drive/MyDrive/product_reviews.csv')[:147799]
  print("The length of dataset is ",len(df))
  
  #set the threshold 
  threshold = 20
  max_rl=80 #maximum review length
  max_sl=10 #maximum summary length
  
  #truncate each reviewText to the maximum review length
  df['reviewText']=df['reviewText'].str.slice(0,max_rl)
  
  #truncate each summary to the maximum summary length
  df['summary']=df['summary'].str.slice(0,max_sl)

  f = open("/content/drive/MyDrive/TFIVE.txt", "r")
  text=f.readlines()
  text=pd.DataFrame(text,columns=["value"])
  text=text["value"].str.split("\t",expand=True)
  text.columns=["predicted","value","original"]
  text.drop(columns=["value"],inplace=True)
  text["predicted"]=text["predicted"].str.split(":").str[1]
  text["original"]=text["original"].str.split(":").str[1]
  text["original"]=text["original"].replace('\n','', regex=True)
  f.close()

  bleau=compute_bleu(text["original"],text["predicted"], max_order=4,smooth=False)
  bscore=nltk.translate.bleu_score.corpus_bleu(text["original"],text["predicted"])
  rougen=rouge_n(text["predicted"], text["original"], n=2)
  ro=rouge(text["predicted"],text["original"])

  print("bleu, precisions, bp, ratio, translation_length, reference_length",bleau)
  print("bleau score",bscore)
  print("rouge2",rougen)
  print("rouge",ro)
  return df,text
Example #16
    short_summary['short'] = short_summary['short'].apply(
        lambda x: data_preprocessing(x))
    actual_news['long'] = actual_news['long'].apply(
        lambda x: data_preprocessing(x))

    human_summaries = short_summary['short'].astype(str).values.tolist()
    complete_news = actual_news['long'].astype(str).values.tolist()

    return human_summaries, complete_news


if __name__ == '__main__':

    # Read the excel file
    human_summaries, complete_news = read_excel()

    # This list will hold the generated summaries
    output_summaries = []

    for news in complete_news[:10]:
        text, summary = summarization(news, 'english', 1)
        output_summaries.append(summary[0].text)

    # Evaluate the first N summaries with ROUGE
    output_summaries = [text for text in output_summaries[:10]]
    human_summaries = [text for text in human_summaries[:10]]

    # Run ROUGE evaluation metric
    print(rouge(output_summaries, human_summaries))
Example #17
    target = []
    for batch_idx_list in test_batch_sampler:
        user_list, item_list, review_input_list, _, real_review_list = test_dataset.get_batch(
            batch_idx_list)
        sample_idx_list, _ = model._sample_text_by_top_one(
            user_list, item_list, review_input_list)
        #convertWord(sample_idx_list, word_dict)
        ref = tensorToScalar(real_review_list).tolist()
        try:
            bleu_score = compute_bleu([sample_idx_list], [ref])
            bleu_list_1.append(bleu_score[1])
            bleu_list_2.append(bleu_score[2])
            bleu_list_3.append(bleu_score[3])
            bleu_list_4.append(bleu_score[4])

            rouge_score = rouge([sample_idx_list], ref)
            rouge_1_list.append(rouge_score[0])
            rouge_2_list.append(rouge_score[1])
            rouge_L_list.append(rouge_score[2])
        except:
            pass
        count += 1
        if count % 50 == 0:
            import sys
            sys.exit(0)
            t1 = time()
            print('Generating %d lines, test samples cost:%.4fs' % (count,
                                                                    (t1 - t0)))

    print('bleu_1:%.4f' % np.mean(bleu_list_1))
    print('bleu_2:%.4f' % np.mean(bleu_list_2))
Example #18
content_file = "/tmp/rouge-model%s.txt" % d
config_file = "/tmp/rouge-config%s.txt" % d

if d:
    content = []
    count = 0
    with open(path.join(dataset, d, "content.txt.nrm")) as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            n = len(line.strip().split(" "))
            if n > MIN_WORDS:
                content.append(line)
                count += n
            if count > MAX_WORDS: break

    with open(content_file, "w") as w:
        w.write("\n".join(content))

    for s in listdir(path.join(dataset, d, "sources")):
        if s.endswith(".nrm"):
            with open(config_file, "w") as w:
                w.write("%s %s" %
                        (path.join(dataset, d, "sources", s), content_file))
            rouge(
                MAX_WORDS, config_file,
                path.join(dataset, d, "sources",
                          ".".join(s.split(".")[:-1]) + ".rge"))
    os.system("rm %s" % content_file)
    os.system("rm %s" % config_file)
Example #19
def valid_model(in_path, da_path, sum_path, sess):
    #return accuracy for dialogue act, rouge-1,2,3,L for summary
    #some useful items are also calculated
    #da_outputs, correct_das: predicted / ground truth of dialogue act

    rouge_1 = []
    rouge_2 = []
    rouge_3 = []
    rouge_L = []
    da_outputs = []
    correct_das = []

    data_processor_valid = DataProcessor(in_path, da_path, sum_path, in_vocab, da_vocab)
    while True:
        #get a batch of data
        in_data, da_data, da_weight, length, sums, sum_weight,sum_lengths, in_seq, da_seq, sum_seq = data_processor_valid.get_batch(batch_size)
        feed_dict = {input_data.name: in_data, sequence_length.name: length, sum_length.name: sum_lengths}
        if data_processor_valid.end != 1 or in_data:
            ret = sess.run(inference_outputs, feed_dict)

            #summary part
            pred_sums = []
            correct_sums = []
            for batch in ret[1]:
                tmp = []
                for time_i in batch:
                    tmp.append(np.argmax(time_i))
                pred_sums.append(tmp)
            for i in sums:
                correct_sums.append(i.tolist())
            for pred,corr in zip(pred_sums,correct_sums):
                rouge_score_map = rouge.rouge(pred,corr)
                rouge1 = 100*rouge_score_map['rouge_1/f_score']
                rouge2 = 100*rouge_score_map['rouge_2/f_score']
                rouge3 = 100*rouge_score_map['rouge_3/f_score']
                rougeL = 100*rouge_score_map['rouge_l/f_score']
                rouge_1.append(rouge1)
                rouge_2.append(rouge2)
                rouge_3.append(rouge3)
                rouge_L.append(rougeL)

            #dialogue act part
            pred_das = ret[0].reshape((da_data.shape[0], da_data.shape[1], -1))
            for p, t, i, l in zip(pred_das, da_data, in_data, length):
                p = np.argmax(p, 1)
                tmp_pred = []
                tmp_correct = []
                for j in range(l):
                    tmp_pred.append(da_vocab['rev'][p[j]])
                    tmp_correct.append(da_vocab['rev'][t[j]])
                da_outputs.append(tmp_pred)
                correct_das.append(tmp_correct)

        if data_processor_valid.end == 1:
            break

    precision = computeAccuracy(correct_das, da_outputs)
    logging.info('da precision: ' + str(precision))
    logging.info('sum rouge1: ' + str(np.mean(rouge_1)))
    logging.info('sum rouge2: ' + str(np.mean(rouge_2)))
    logging.info('sum rouge3: ' + str(np.mean(rouge_3)))
    logging.info('sum rougeL: ' + str(np.mean(rouge_L)))

    data_processor_valid.close()
    return np.mean(rouge_1),np.mean(rouge_2),np.mean(rouge_3),np.mean(rouge_L),precision
Example #20
def evaluate(test_dataset, test_batch_sampler, model, review_aspect_mask):
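    # Sample a review for every test batch, clip hypotheses and references at the padding
    # tokens, then report corpus-averaged BLEU-1..4 and ROUGE-1/2/L F-scores.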
    model.eval()
    
    bleu_score = []
    bleu_list_1, bleu_list_2, bleu_list_3, bleu_list_4 = [], [], [], []
    rouge_1_list, rouge_2_list, rouge_L_list = [], [], []

    hyp_ref_list = []

    for batch_idx_list in test_batch_sampler:
        
        #user, item, review_input, summary, real_review = \
        #    test_dataset.get_batch(batch_idx_list)
        #sample_idx_list = \
        #    model._sample_text_by_top_one(user, item, summary, review_input, \
        #    review_aspect, review_aspect_mask)
    
        user, item, review_input, summary, real_review = test_dataset.get_batch(batch_idx_list)
        sample_idx_list = model._sample_text_by_top_one(user, item, review_input, review_aspect_mask)

        #import pdb; pdb.set_trace()
        #for record_idx, hyp in enumerate(tensorToScalar(sample_idx_list)):
        for record_idx, hyp in enumerate(sample_idx_list):
            hyp = tensorToScalar(hyp).tolist()
            for clip_idx, word in enumerate(hyp):
                if word == PAD:
                    # if current word is the last word of hyp
                    if clip_idx + 1 == len(hyp):
                        clip_idx = clip_idx - 1
                        break
                    # if next word also the PAD
                    elif hyp[clip_idx + 1] == PAD:
                        clip_idx = clip_idx - 1
                        break
            hyp = hyp[:clip_idx+1]

            #import pdb; pdb.set_trace()
            ref = tensorToScalar(real_review[record_idx]).tolist()
            for clip_idx, word in enumerate(ref):
                if word == PAD:
                    # if current word is the last word of ref
                    if clip_idx + 1 == len(ref):
                        clip_idx = clip_idx - 1
                        break
                    # if next word also the PAD
                    elif ref[clip_idx + 1] == PAD:
                        clip_idx = clip_idx - 1
                        break
            ref = ref[:clip_idx+1]

            if len(ref) != 0:
                hyp_ref_list.append([hyp, [ref]])

    #import pdb; pdb.set_trace()
    for record_idx, [hyp, ref] in enumerate(hyp_ref_list):
        try:
            bleu_score = compute_bleu([hyp], [ref])
            bleu_list_1.append(bleu_score[1])
            bleu_list_2.append(bleu_score[2])
            bleu_list_3.append(bleu_score[3])
            bleu_list_4.append(bleu_score[4])

            rouge_score = rouge([hyp], ref)
            rouge_1_list.append(rouge_score[0])
            rouge_2_list.append(rouge_score[1])
            rouge_L_list.append(rouge_score[2])
        except:
            pass

    #import pdb; pdb.set_trace()
    print('bleu_1:%.4f' % np.mean(bleu_list_1))
    print('bleu_2:%.4f' % np.mean(bleu_list_2))
    print('bleu_3:%.4f' % np.mean(bleu_list_3))
    print('bleu_4:%.4f' % np.mean(bleu_list_4))
    print('rouge_1_f:%.4f' % np.mean(rouge_1_list))
    print('rouge_2_f:%.4f' % np.mean(rouge_2_list))
    print('rouge_L_f:%.4f' % np.mean(rouge_L_list))

    return np.mean(bleu_list_4), np.mean(rouge_L_list)   
Example #21
                    max_score = score
                    best = s
        if best:
            base = ".".join(best.split(".")[:-1])
            s = base + ".best" + ver
            text_path = os.path.join(sources, base + ".nrm")
            cont_path = os.path.join(dataset, dp, "content.txt.nrm")
            if os.path.exists(os.path.join(sources,
                                           s)) and os.path.exists(text_path):
                if rge.parse_rouge(os.path.join(sources, base + ".rge"),
                                   2) < min_rge:
                    continue
                can_text = candidate(text_path)
                ref_text = "\n".join([
                    x for x in utils.fileaslist(cont_path)
                    if len(x.split(" ")) > 3
                ][:FIRST_N_LINES])
                can_path = "/tmp/mds/%s.can.txt" % base
                ref_path = "/tmp/mds/%s.ref.txt" % base
                utils.write2file(can_text, can_path)
                utils.write2file(ref_text, ref_path)
                eval_writer.write("%s %s\n" % (can_path, ref_path))

    eval_writer.close()
    print "created the evaluation file, running rouge..."

    os.chdir(rouge_dir)
    rge.rouge(1000, eval_path, eval_out)

    print "done."
Example #22
import sys
import glob
import rouge
import bleu
import pandas as pd
f1 = sys.argv[1]  #decoded
f2 = sys.argv[2]  #reference
ref = []
decoded = []

for i, j in zip(sorted(glob.glob(f1 + '*.txt')),
                sorted(glob.glob(f2 + '*.txt'))):
    ref_tex = ''
    dec_tex = ''
    for k in open(i).readlines():
        dec_tex = dec_tex + k.strip()
    for l in open(j).readlines():
        ref_tex = ref_tex + l.strip()
    ref.append(ref_tex)
    decoded.append(dec_tex)
data = {'decoded': decoded, 'reference': ref}
df = pd.DataFrame(data)
df.to_csv('analysis.csv', index=False)
bl = bleu.moses_multi_bleu(decoded, ref)
x = rouge.rouge(decoded, ref)
print('%.2f\t%.2f\t%.2f\t%.2f' %
      (bl, x['rouge_1/f_score'] * 100, x['rouge_2/f_score'] * 100,
       x['rouge_l/f_score'] * 100))
Example #23
 def reward_function(self, reference, summary, measure='rouge_l/f_score'):
   if 'rouge' in measure:
     return rouge([summary],[reference])[measure]
   else:
     return sentence_bleu([reference.split()],summary.split(),weights=(0.25,0.25,0.25,0.25))
Example #24
        + 'Bleu_2: ' + str(bleu_2) + '\t' \
        + 'Bleu_3: ' + str(bleu_3) + '\t' \
        + 'Bleu_4: ' + str(bleu_4)

        print(log_str)

        if (args.save):
            #Save evaluation results in a log file
            with open(os.path.join(args.save_path, 'log.txt'), 'a') as f:
                f.write(log_str + '\n')

    if (args.rouge):

        reference_corpus = [
            " ".join(reference) for reference in reference_corpus
        ]
        translation_corpus = [
            " ".join(hypothesis) for hypothesis in translation_corpus
        ]

        score = rouge(translation_corpus, reference_corpus)
        print(score["rouge_l/f_score"])

        log_str = 'Rouge Evaluation: ' + '\t'
        print(log_str)

        if (args.save):
            #Save evaluation results in a log file
            with open(os.path.join(args.save_path, 'log.txt'), 'a') as f:
                f.write(log_str + '\n')
Example #25
def main(_):
  vocab = load_vocabulary(FLAGS.data_dir)
  if FLAGS.generating:
    data_reader = DataReader(FLAGS.data_dir, n_reviews=5, generating=True)
  else:
    data_reader = DataReader(FLAGS.data_dir)
  model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items,
                global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors,
                img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim,
                lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate)

  saver = tf.compat.v1.train.Saver(max_to_keep=10)

  log_file = open('log.txt', 'w')
  test_step = 0

  config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
  config.gpu_options.allow_growth = True

  with tf.Session(config=config) as sess:
      saver.restore(sess, FLAGS.ckpt_dir)
      print('Model successfully restored')
      # Testing
      review_gen_corpus = defaultdict(list)
      review_ref_corpus = defaultdict(list)

      photo_bleu_scores = defaultdict(list)
      photo_rouge_scores = defaultdict(list)

      review_bleu_scores = defaultdict(list)
      review_rouge_scores = defaultdict(list)

      sess.run(model.init_metrics)
      for users, items, ratings in data_reader.read_real_test_set(FLAGS.batch_size, rating_only=True):
        test_step += 1

        fd = model.feed_dict(users, items, ratings)
        sess.run(model.update_metrics, feed_dict=fd)

        review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                                         data_reader.real_test_review)
        img_idx = [data_reader.real_test_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.real_test_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images)
        _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

        gen_reviews = decode_reviews(_reviews, vocab)
        ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

        if FLAGS.generating:
          for gen, ref in zip(gen_reviews, ref_reviews):
            gen_str = "GENERATED:\n"+" ".join(gen)
            ref_str = "REFERENCE:\n"+" ".join([" ".join(sentence) for sentence in ref])+"\n"
            log_info(log_file,gen_str)
            log_info(log_file,ref_str)

        for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
          review_gen_corpus[(user, item)].append(gen)
          review_ref_corpus[(user, item)] += refs

          bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            photo_bleu_scores[order].append(score)

          rouge_scores = rouge([gen], refs)
          for metric, score in rouge_scores.items():
            photo_rouge_scores[metric].append(score)

      _mae, _rmse = sess.run([model.mae, model.rmse])
      log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

      log_info(log_file, '\nReview generation results:')
      log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100,
        np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100))

      for user_item, gen_reviews in review_gen_corpus.items():
        references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

        user_item_bleu_scores = defaultdict(list)
        for gen in gen_reviews:
          bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            user_item_bleu_scores[order].append(score)
        for order, scores in user_item_bleu_scores.items():
          review_bleu_scores[order].append(np.array(scores).mean())

        user_item_rouge_scores = defaultdict(list)
        for gen in gen_reviews:
          rouge_scores = rouge([gen], references)
          for metric, score in rouge_scores.items():
            user_item_rouge_scores[metric].append(score)
        for metric, scores in user_item_rouge_scores.items():
          review_rouge_scores[metric].append(np.array(scores).mean())

      log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100,
        np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100))

      for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
        log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
        log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
Example #26
def reward_function(reference, summary, measure='rouge_l/f_score'):

    if 'rouge' in measure:
        return rouge([summary], [reference])[measure]
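    # note: measures that do not contain 'rouge' fall through and implicitly return None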
Example #27
def main(_):
  vocab = load_vocabulary(FLAGS.data_dir)
  data_reader = DataReader(FLAGS.data_dir)

  model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items,
                global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors,
                img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim,
                lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate)

  update_rating, update_review, global_step = train_fn(model)

  saver = tf.compat.v1.train.Saver(max_to_keep=10)

  log_file = open('log.txt', 'w')
  test_step = 0

  config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
  config.gpu_options.allow_growth = True
  with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(1, FLAGS.num_epochs + 1):
      log_info(log_file, "\nEpoch: {}/{}".format(epoch, FLAGS.num_epochs))

      count = 0
      sum_rating_loss = 0
      sum_review_loss = 0

      # Training
      for users, items, ratings in data_reader.read_train_set(FLAGS.batch_size, rating_only=True):
        count += 1

        fd = model.feed_dict(users=users, items=items, ratings=ratings, is_training=True)
        _step, _, _rating_loss = sess.run([global_step, update_rating, model.rating_loss], feed_dict=fd)
        sum_rating_loss += _rating_loss

        review_users, review_items, _, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                            data_reader.train_review)
        img_idx = [data_reader.train_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.train_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images,
                             reviews=reviews, is_training=True)
        _, _review_loss = sess.run([update_review, model.review_loss], feed_dict=fd)
        sum_review_loss += _review_loss

        if _step % FLAGS.display_step == 0:
          data_reader.iter.set_postfix(rating_loss=(sum_rating_loss / count),
                                       review_loss=(sum_review_loss / count))

      # Testing
      review_gen_corpus = defaultdict(list)
      review_ref_corpus = defaultdict(list)

      photo_bleu_scores = defaultdict(list)
      photo_rouge_scores = defaultdict(list)

      review_bleu_scores = defaultdict(list)
      review_rouge_scores = defaultdict(list)

      sess.run(model.init_metrics)
      for users, items, ratings in data_reader.read_test_set(FLAGS.batch_size, rating_only=True):
        test_step += 1

        fd = model.feed_dict(users, items, ratings)
        sess.run(model.update_metrics, feed_dict=fd)

        review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                                         data_reader.test_review)
        img_idx = [data_reader.test_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.test_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images)
        _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

        gen_reviews = decode_reviews(_reviews, vocab)
        ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

        for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
          review_gen_corpus[(user, item)].append(gen)
          review_ref_corpus[(user, item)] += refs

          bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            photo_bleu_scores[order].append(score)

          rouge_scores = rouge([gen], refs)
          for metric, score in rouge_scores.items():
            photo_rouge_scores[metric].append(score)

      _mae, _rmse = sess.run([model.mae, model.rmse])
      log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

      log_info(log_file, '\nReview generation results:')
      log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100,
        np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100))

      for user_item, gen_reviews in review_gen_corpus.items():
        references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

        user_item_bleu_scores = defaultdict(list)
        for gen in gen_reviews:
          bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            user_item_bleu_scores[order].append(score)
        for order, scores in user_item_bleu_scores.items():
          review_bleu_scores[order].append(np.array(scores).mean())

        user_item_rouge_scores = defaultdict(list)
        for gen in gen_reviews:
          rouge_scores = rouge([gen], references)
          for metric, score in rouge_scores.items():
            user_item_rouge_scores[metric].append(score)
        for metric, scores in user_item_rouge_scores.items():
          review_rouge_scores[metric].append(np.array(scores).mean())

      log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100,
        np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100))

      for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
        log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
        log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))

      save_path = saver.save(sess, f"tmp/model{epoch}.ckpt")
      log_info(log_file, '')