Example #1
def evaluate_scores(predictor, random=False):
    timer = gezi.Timer('evaluate_scores')
    init()
    imgs, img_features = get_image_names_and_features()

    num_metric_eval_examples = min(FLAGS.num_metric_eval_examples, len(imgs))
    step = FLAGS.metric_eval_batch_size

    if random:
        index = np.random.choice(len(imgs),
                                 num_metric_eval_examples,
                                 replace=False)
        imgs = imgs[index]
        img_features = img_features[index]

    text_max_words = all_distinct_texts.shape[1]
    rank_metrics = gezi.rank_metrics.RecallMetrics()

    print('text_max_words:', text_max_words)
    start = 0
    while start < num_metric_eval_examples:
        end = start + step
        if end > num_metric_eval_examples:
            end = num_metric_eval_examples
        print('predicts start:', start, 'end:', end, file=sys.stderr)
        predicts(imgs[start:end], img_features[start:end], predictor,
                 rank_metrics)
        start = end

    melt.logging_results(
        rank_metrics.get_metrics(),
        rank_metrics.get_names(),
        tag='evaluate: epoch:{} step:{} train:{} eval:{}'.format(
            melt.epoch(), melt.step(), melt.train_loss(), melt.eval_loss()))

    timer.print()

    return rank_metrics.get_metrics(), rank_metrics.get_names()
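All four examples share the same fixed-size batching loop: advance start by FLAGS.metric_eval_batch_size and clamp end to the number of evaluation examples. Below is a minimal, self-contained sketch of that pattern, using plain NumPy and a hypothetical predict_batch callback in place of the project's predicts(...)/predictor objects:

import numpy as np

def iter_batches(num_examples, batch_size):
    # Yield (start, end) index pairs covering range(num_examples) in fixed-size
    # steps, clamping the final batch so end never exceeds num_examples.
    start = 0
    while start < num_examples:
        end = min(start + batch_size, num_examples)
        yield start, end
        start = end

def run_batched_eval(imgs, img_features, predict_batch, batch_size=100):
    # imgs and img_features are parallel arrays; predict_batch is any callable
    # taking (img_batch, feature_batch) -- a stand-in for predicts(...) above.
    for start, end in iter_batches(len(imgs), batch_size):
        predict_batch(imgs[start:end], img_features[start:end])

# Usage with dummy data:
# imgs = np.array(['img%d' % i for i in range(250)])
# feats = np.random.rand(250, 2048).astype(np.float32)
# run_batched_eval(imgs, feats, lambda a, b: None, batch_size=100)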
Example #2
def evaluate_translation(predictor, random=False, index=None):
  timer = gezi.Timer('evaluate_translation')

  refs = prepare_refs()

  imgs, img_features = get_image_names_and_features()
  num_metric_eval_examples = min(FLAGS.num_metric_eval_examples, len(imgs))
  if num_metric_eval_examples <= 0:
    num_metric_eval_examples = len(imgs)
  if num_metric_eval_examples == len(imgs):
    random = False

  step = FLAGS.metric_eval_batch_size

  if random:
    if index is None:
      index = np.random.choice(len(imgs), num_metric_eval_examples, replace=False)
    imgs = imgs[index]
    img_features = img_features[index]
  else:
    img_features = img_features[:num_metric_eval_examples]

  results = {}

  start = 0
  while start < num_metric_eval_examples:
    end = start + step
    if end > num_metric_eval_examples:
      end = num_metric_eval_examples
    print('predicts image start:', start, 'end:', end, file=sys.stderr, end='\r')
    translation_predicts(imgs[start: end], img_features[start: end], predictor, results)
    start = end
    
  scorers = [
      (Bleu(4), ["bleu_1", "bleu_2", "bleu_3", "bleu_4"]),
      (Meteor(), "meteor"),
      (Rouge(), "rouge_l"),
      (Cider(), "cider")
  ]

  score_list = []
  metric_list = []

  selected_refs = {}
  selected_results = {}
  # restrict refs to the predicted keys so both dicts share the same .keys()
  for key in results:
    selected_refs[key] = refs[key]
    selected_results[key] = results[key]
    assert len(selected_results[key]) == 1, selected_results[key]
  assert selected_results.keys() == selected_refs.keys(), '%d %d'%(len(selected_results.keys()), len(selected_refs.keys())) 

  if FLAGS.eval_translation_reseg:
    print('tokenization...', file=sys.stderr)
    global tokenizer
    if tokenizer is None:
      tokenizer = PTBTokenizer()
    selected_refs  = tokenizer.tokenize(selected_refs)
    selected_results = tokenizer.tokenize(selected_results)

  logging.info('predict&label:{}{}{}'.format('|'.join(selected_results.items()[0][1]), '---', '|'.join(selected_refs.items()[0][1])))

  for scorer, method in scorers:
    print('computing %s score...'%(scorer.method()), file=sys.stderr)
    score, scores = scorer.compute_score(selected_refs, selected_results)
    if type(method) == list:
      for sc, scs, m in zip(score, scores, method):
        score_list.append(sc)
        metric_list.append(m)
        if FLAGS.eval_result_dir:
          out = open(os.path.join(FLAGS.eval_result_dir, m+'.txt'), 'w')
          for i, sc in enumerate(scs):
            key = selected_results.keys()[i]
            result = selected_results[key]
            refs = '\x01'.join(selected_refs[key])
            print(key, result, refs, sc, sep='\t', file=out)
    else:
      score_list.append(score)
      metric_list.append(method)
      if FLAGS.eval_result_dir:
        out = open(os.path.join(FLAGS.eval_result_dir, method + '.txt'), 'w')
        for i, sc in enumerate(scores):
          key = selected_results.keys()[i]
          result = selected_results[key]
          refs = '\x01'.join(selected_refs[key])
          print(key, result, refs, sc, sep='\t', file=out)
  
  # exclude bleu_1, bleu_2 and bleu_3, keeping bleu_4, meteor, rouge_l and cider
  score_list, metric_list = score_list[3:], metric_list[3:]
  assert len(score_list) == 4

  avg_score = sum(score_list) / len(score_list)
  score_list.append(avg_score)
  metric_list.append('avg')
  metric_list = ['trans_' + x for x in metric_list]

  melt.logging_results(
    score_list,
    metric_list,
    tag='evaluate: epoch:{} step:{} train:{} eval:{}'.format(
      melt.epoch(), 
      melt.step(),
      melt.train_loss(),
      melt.eval_loss()))

  timer.print()

  return score_list, metric_list
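Example #2 relies on the COCO caption-evaluation scorers (Bleu, Meteor, Rouge, Cider). Assuming they come from the standard coco-caption / pycocoevalcap package, each scorer's compute_score takes two dicts keyed by the same ids -- references mapping each id to a list of ground-truth captions, and results mapping each id to a single-element list holding the candidate caption -- and returns an overall score plus per-example scores. A small sketch under that assumption (the import paths are not shown in the original module):

# Assumed import locations from the coco-caption (pycocoevalcap) package.
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider

refs = {
    'img1': ['a dog runs on the grass', 'a dog is running outside'],
    'img2': ['a man rides a bicycle', 'a person on a bike'],
}
results = {
    'img1': ['a dog running on grass'],   # exactly one candidate per id
    'img2': ['a man riding a bike'],
}

# Bleu(4) returns four corpus-level scores (bleu_1..bleu_4) plus per-example scores.
bleu_score, bleu_scores = Bleu(4).compute_score(refs, results)
cider_score, cider_scores = Cider().compute_score(refs, results)

print('bleu_1..bleu_4:', bleu_score)
print('cider:', cider_score)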
Example #3
def evaluate_scores(predictor, random=False):
    timer = gezi.Timer('evaluate_scores')
    init()
    if FLAGS.eval_img2text:
        imgs, img_features = get_image_names_and_features()
        num_metric_eval_examples = min(FLAGS.num_metric_eval_examples,
                                       len(imgs))
        step = FLAGS.metric_eval_batch_size

        if random:
            index = np.random.choice(len(imgs),
                                     num_metric_eval_examples,
                                     replace=False)
            imgs = imgs[index]
            img_features = img_features[index]

        rank_metrics = gezi.rank_metrics.RecallMetrics()

        start = 0
        while start < num_metric_eval_examples:
            end = start + step
            if end > num_metric_eval_examples:
                end = num_metric_eval_examples
            print('predicts image start:', start, 'end:', end, file=sys.stderr)
            predicts(imgs[start:end], img_features[start:end], predictor,
                     rank_metrics)
            start = end

        melt.logging_results(
            rank_metrics.get_metrics(),
            rank_metrics.get_names(),
            tag='evaluate: epoch:{} step:{} train:{} eval:{}'.format(
                melt.epoch(), melt.step(), melt.train_loss(),
                melt.eval_loss()))

    if FLAGS.eval_text2img:
        num_metric_eval_examples = min(FLAGS.num_metric_eval_examples,
                                       len(all_distinct_texts))
        step = FLAGS.metric_eval_batch_size
        if random:
            index = np.random.choice(len(all_distinct_texts),
                                     num_metric_eval_examples,
                                     replace=False)
            text_strs = all_distinct_text_strs[index]
            texts = all_distinct_texts[index]
        else:
            text_strs = all_distinct_text_strs
            texts = all_distinct_texts

        rank_metrics2 = gezi.rank_metrics.RecallMetrics()

        start = 0
        while start < num_metric_eval_examples:
            end = start + step
            if end > num_metric_eval_examples:
                end = num_metric_eval_examples
            print('predicts start:', start, 'end:', end, file=sys.stderr)
            predicts_txt2im(text_strs[start:end], texts[start:end], predictor,
                            rank_metrics2)
            start = end

        melt.logging_results(rank_metrics2.get_metrics(),
                             ['t2i' + x for x in rank_metrics2.get_names()],
                             tag='text2img')

    timer.print()

    if FLAGS.eval_img2text and FLAGS.eval_text2img:
        return rank_metrics.get_metrics() + rank_metrics2.get_metrics(
        ), rank_metrics.get_names() + [
            't2i' + x for x in rank_metrics2.get_names()
        ]
    elif FLAGS.eval_img2text:
        return rank_metrics.get_metrics(), rank_metrics.get_names()
    else:
        return rank_metrics2.get_metrics(), rank_metrics2.get_names()
Example #4
def evaluate_scores(predictor, random=False, index=None, exact_predictor=None, exact_ratio=1.):
  """
  actually this is rank metrics evaluation, by default recall@1,2,5,10,50
  """
  timer = gezi.Timer('evaluate_scores')
  init()
  if FLAGS.eval_img2text:
    imgs, img_features = get_image_names_and_features()
    num_metric_eval_examples = min(FLAGS.num_metric_eval_examples, len(imgs)) 
    if num_metric_eval_examples <= 0:
      num_metric_eval_examples = len(imgs)
    if num_metric_eval_examples == len(imgs):
      random = False

    step = FLAGS.metric_eval_batch_size

    if random:
      if index is None:
        index = np.random.choice(len(imgs), num_metric_eval_examples, replace=False)
      imgs = imgs[index]
      img_features = img_features[index]
    else:
      img_features = img_features[:num_metric_eval_examples]

    rank_metrics = gezi.rank_metrics.RecallMetrics()

    start = 0
    while start < num_metric_eval_examples:
      end = start + step
      if end > num_metric_eval_examples:
        end = num_metric_eval_examples
      print('predicts image start:', start, 'end:', end, file=sys.stderr, end='\r')
      predicts(imgs[start: end], img_features[start: end], predictor, rank_metrics, 
               exact_predictor=exact_predictor, exact_ratio=exact_ratio)
      start = end
      
    melt.logging_results(
      rank_metrics.get_metrics(), 
      rank_metrics.get_names(), 
      tag='evaluate: epoch:{} step:{} train:{} eval:{}'.format(
        melt.epoch(), 
        melt.step(),
        melt.train_loss(),
        melt.eval_loss()))

  if FLAGS.eval_text2img:
    num_metric_eval_examples = min(FLAGS.num_metric_eval_examples, len(all_distinct_texts))
    step = FLAGS.metric_eval_batch_size

    if random:
      index = np.random.choice(len(all_distinct_texts), num_metric_eval_examples, replace=False)
      text_strs = all_distinct_text_strs[index]
      texts = all_distinct_texts[index]
    else:
      text_strs = all_distinct_text_strs
      texts = all_distinct_texts

    rank_metrics2 = gezi.rank_metrics.RecallMetrics()

    start = 0
    while start < num_metric_eval_examples:
      end = start + step
      if end > num_metric_eval_examples:
        end = num_metric_eval_examples
      print('predicts start:', start, 'end:', end, file=sys.stderr, end='\r')
      predicts_txt2im(text_strs[start: end], texts[start: end], predictor, rank_metrics2, exact_predictor=exact_predictor)
      start = end
    
    melt.logging_results(
      rank_metrics2.get_metrics(), 
      ['t2i' + x for x in rank_metrics2.get_names()],
      tag='text2img')

  timer.print()

  if FLAGS.eval_img2text and FLAGS.eval_text2img:
    return rank_metrics.get_metrics() + rank_metrics2.get_metrics(), rank_metrics.get_names() + ['t2i' + x for x in rank_metrics2.get_names()]
  elif FLAGS.eval_img2text:
    return rank_metrics.get_metrics(), rank_metrics.get_names()
  else:
    return rank_metrics2.get_metrics(), rank_metrics2.get_names()
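gezi.rank_metrics.RecallMetrics accumulates rank metrics across batches and is read out via get_metrics()/get_names(); the rest of its API is not shown in these examples. As an illustration only (not gezi's implementation), recall@k for this kind of retrieval evaluation can be computed from a query-candidate score matrix like so:

import numpy as np

def recall_at_k(scores, ks=(1, 2, 5, 10, 50)):
    # scores: (num_queries, num_candidates) similarity matrix where, for
    # illustration, the correct candidate for query i is assumed to be column i.
    ranks = []
    for i, row in enumerate(scores):
        order = np.argsort(-row)                       # candidates, best first
        ranks.append(int(np.where(order == i)[0][0]))  # 0-based rank of the truth
    ranks = np.array(ranks)
    return {'recall@%d' % k: float(np.mean(ranks < k)) for k in ks}

# scores = np.random.rand(100, 100)
# print(recall_at_k(scores))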