Пример #1
0
def gen_train_graph(input_app, input_results, trainer):
    """Assemble the list of ops fetched on each training step.

    With FLAGS.num_gpus == 0 the loss comes from a single direct call to
    tower_loss on the supplied inputs. Otherwise the loss is obtained from
    melt.tower_losses(..., FLAGS.num_gpus), which yields a list of losses.

    Args:
        input_app: passed to tower_loss on the single-tower path only.
        input_results: passed to tower_loss on the single-tower path only.
        trainer: object handed to tower_loss; its ``is_training`` attribute
            is set to False after the loss has been created.

    Returns:
        A pair ``(ops, deal_debug_results)``. ``ops`` is a one-element list
        holding the loss. ``deal_debug_results`` is None unless
        ``FLAGS.debug == True``, in which case it is a callback that prints
        the fetched results followed by the shape of each one.
    """
    if FLAGS.num_gpus != 0:
        # NOTE(review): the tower closure does not forward input_app /
        # input_results; presumably tower_loss creates its own inputs for
        # each tower -- confirm against tower_loss.
        def _one_tower_loss():
            return tower_loss(trainer)

        # Here loss is a list of losses.
        loss = melt.tower_losses(_one_tower_loss, FLAGS.num_gpus)
        print('num tower losses:', len(loss))
    else:
        # Plain single-tower path, kept so behaviour matches the old
        # single-GPU code.
        loss = tower_loss(trainer, input_app, input_results)

    ops = [loss]

    # The train graph is finished at this point; anything built afterwards
    # is expected to share variables with it.
    trainer.is_training = False

    # Explicit equality test: only a flag value equal to True enables debug.
    if not (FLAGS.debug == True):
        return ops, None

    # No extra tensors are fetched in this variant; to inspect more, append
    # them to ops (e.g. entries from tf.get_collection) before returning.
    def _deal_debug_results(results):
        print(results)
        shapes = [item.shape for item in results]
        print(shapes)

    return ops, _deal_debug_results
Пример #2
0
def gen_train_graph(input_app, input_results, trainer):
    """Create the training fetch list and an optional debug callback.

    The loss is created either by one direct tower_loss call (when
    FLAGS.num_gpus == 0) or through melt.tower_losses, which returns a list
    of losses. When FLAGS.debug == True the last entry of the 'scores'
    collection is fetched as well.

    Args:
        input_app: used only on the single-tower path, as a tower_loss input.
        input_results: used only on the single-tower path, as a tower_loss
            input.
        trainer: object given to tower_loss; ``trainer.is_training`` is
            switched to False once the loss exists.

    Returns:
        ``(ops, deal_debug_results)``: ``ops`` is ``[loss]``, extended with
        the scores tensor in debug mode; ``deal_debug_results`` is a callback
        printing the fetched results in debug mode, otherwise None.
    """
    single_tower = FLAGS.num_gpus == 0
    if single_tower:
        # Same code path as the old single-GPU implementation.
        loss = tower_loss(trainer, input_app, input_results)
    else:
        # NOTE(review): input_app / input_results are not forwarded to the
        # towers; presumably tower_loss supplies its own inputs -- confirm.
        def _one_tower_loss():
            return tower_loss(trainer)

        # Here loss is a list of losses.
        loss = melt.tower_losses(_one_tower_loss, FLAGS.num_gpus)
        print('num tower losses:', len(loss))

    ops = [loss]

    # Train graph is complete; later graph pieces must share its variables.
    trainer.is_training = False

    debug_callback = None
    if FLAGS.debug == True:
        latest_scores = tf.get_collection('scores')[-1]
        ops.append(latest_scores)

        def _deal_debug_results(results):
            print(results)

        debug_callback = _deal_debug_results

    return ops, debug_callback
def gen_train_graph(input_app, input_results, trainer):
    """Build the ops run per training step, plus a debug hook for scores.

    The loss is produced by a direct tower_loss call when
    FLAGS.num_gpus == 0, and by melt.tower_losses (a list of losses)
    otherwise. In debug mode (FLAGS.debug == True) the last tensor in the
    'scores' collection is appended to the fetch list.

    Args:
        input_app: forwarded to tower_loss only when FLAGS.num_gpus == 0.
        input_results: forwarded to tower_loss only when
            FLAGS.num_gpus == 0.
        trainer: object passed to tower_loss; its ``is_training`` attribute
            is set to False after the loss is built.

    Returns:
        ``(ops, deal_debug_results)``. Outside debug mode ``ops`` is
        ``[loss]`` and the callback is None. In debug mode ``ops`` is
        ``[loss, scores]`` and the callback expects exactly two results,
        printing the second one labelled 'scores'.
    """
    if FLAGS.num_gpus != 0:
        # NOTE(review): the closure ignores input_app / input_results;
        # presumably each tower builds its own inputs -- confirm.
        def _one_tower_loss():
            return tower_loss(trainer)

        # Here loss is a list of losses.
        loss = melt.tower_losses(_one_tower_loss, FLAGS.num_gpus)
        print('num tower losses:', len(loss))
    else:
        # Single-tower path, identical to the old single-GPU code.
        loss = tower_loss(trainer, input_app, input_results)

    ops = [loss]

    # Train graph finished; graph built after this must share its variables.
    trainer.is_training = False

    if not (FLAGS.debug == True):
        return ops, None

    ops.append(tf.get_collection('scores')[-1])

    # To debug further tensors (inputs, embeddings, gradients, ...), append
    # them to ops here and unpack them in the callback in the same order.
    def _deal_debug_results(results):
        # results mirrors ops: the loss first, then the scores.
        _, scores = results
        print('scores', scores)

    return ops, _deal_debug_results
Пример #4
0
def gen_train_graph(input_app, input_results, trainer):
  """Assemble the training fetch list, optionally adding logits for debugging.

  When FLAGS.num_gpus == 0 the loss is built by calling tower_loss directly
  on the given inputs; otherwise melt.tower_losses returns a list of losses.
  With FLAGS.debug == True the last entry of the 'logits' collection is
  fetched too.

  Args:
    input_app: given to tower_loss on the single-tower path only.
    input_results: given to tower_loss on the single-tower path only.
    trainer: object passed to tower_loss; ``trainer.is_training`` is set to
      False after the loss has been built.

  Returns:
    ``(ops, deal_debug_results)``: ``ops`` starts with the loss and, in debug
    mode, also contains the logits tensor; ``deal_debug_results`` prints the
    fetched results in debug mode and is None otherwise.
  """
  use_single_tower = FLAGS.num_gpus == 0
  if use_single_tower:
    # Kept for safety: same code as the old single-GPU implementation.
    loss = tower_loss(trainer, input_app, input_results)
  else:
    # NOTE(review): input_app / input_results are not passed to the towers;
    # presumably tower_loss creates its own inputs -- confirm.
    def _one_tower_loss():
      return tower_loss(trainer)

    # Here loss is a list of losses.
    loss = melt.tower_losses(_one_tower_loss, FLAGS.num_gpus)
    print('num tower losses:', len(loss))

  ops = [loss]

  # Train graph finished; everything built later must share its variables.
  trainer.is_training = False

  debug_callback = None
  if FLAGS.debug == True:
    ops.append(tf.get_collection('logits')[-1])

    # Other collections can be fetched the same way; unpack them from
    # results in the order they were appended to ops.
    def _deal_debug_results(results):
      print(results)

    debug_callback = _deal_debug_results

  return ops, debug_callback
Пример #5
0
def gen_train_graph(input_app, input_results, trainer):
    """Build the training fetch list, using tower losses only when enabled.

    The multi-tower path (melt.tower_losses, which returns a list of losses)
    is taken only when FLAGS.num_gpus > 1 and FLAGS.use_tower_loss is truthy.
    In every other case tower_loss is called once on the supplied inputs.
    With FLAGS.debug == True the last entry of the 'logits' collection is
    fetched as well.

    Args:
        input_app: forwarded to tower_loss on the single-tower path only.
        input_results: forwarded to tower_loss on the single-tower path only.
        trainer: object passed to tower_loss.

    Returns:
        ``(ops, deal_debug_results)``: ``ops`` is ``[loss]`` plus the logits
        tensor in debug mode; ``deal_debug_results`` is a callback printing
        the fetched results in debug mode, otherwise None.
    """
    use_towers = FLAGS.num_gpus > 1 and FLAGS.use_tower_loss
    if not use_towers:
        loss = tower_loss(trainer, input_app, input_results)
    else:
        # NOTE(review): the closure does not forward input_app /
        # input_results; presumably each tower builds its own inputs --
        # confirm against tower_loss.
        def _one_tower_loss():
            return tower_loss(trainer)

        # Here loss is a list of losses.
        loss = melt.tower_losses(_one_tower_loss, FLAGS.num_gpus)

    ops = [loss]

    if not (FLAGS.debug == True):
        return ops, None

    ops.append(tf.get_collection('logits')[-1])

    # Further tensors can be appended to ops and unpacked from results in
    # the same order if more detail is needed.
    def _deal_debug_results(results):
        print(results)

    return ops, _deal_debug_results
Пример #6
0
def main(_):
    """Build the CIFAR-10 ResNet train/valid/test graphs and run the melt train flow.

    Steps, in graph-construction order:
      1. Initialise melt and read the batch size and GPU count from it.
      2. Build the training model and its tower losses plus an accuracy op.
      3. Mark the 'main' variable scope as reusing variables, then build a
         validation model and a prediction model inside that same scope.
      4. Hand the train ops, the validation callback and the inference
         callback to melt.apps.train_flow.

    The single positional argument is ignored.
    """
    # Number of training examples; only used to derive num_steps_per_epoch.
    num_train_examples = 45000
    melt.apps.train.init()

    batch_size = melt.batch_size()
    num_gpus = melt.num_gpus()

    # Read after melt.apps.train.init(). Per the note below, FLAGS.batch_size
    # is presumably already the per-GPU size at this point -- TODO confirm.
    batch_size_per_gpu = FLAGS.batch_size

    # batch_size is not changed, but FLAGS.batch_size will change to batch_size / num_gpus
    #print('--------------batch_size, FLAGS.batch_size, num_steps_per_epoch', batch_size, FLAGS.batch_size, num_train_examples // batch_size)

    # The name global_scope is first the scope name (FLAGS.algo) and is then
    # rebound to the scope object yielded by the with statement.
    global_scope = FLAGS.algo
    with tf.variable_scope(global_scope) as global_scope:
        # Model hyper-parameters shared by the train, valid and test models.
        data_format = 'channels_first'
        num_layers = 44
        batch_norm_decay = 0.997
        batch_norm_epsilon = 1e-05
        data_dir = './mount/data/cifar10/'
        with tf.variable_scope('main') as scope:
            # ---- training model -------------------------------------------
            model = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=True,
                data_format=data_format)

            dataset = cifar10.Cifar10DataSet(data_dir,
                                             subset='train',
                                             use_distortion=True)

            ## This is wrong: it makes every GPU read the same data, so convergence is slow (though the test result gets better)
            #_, image_batch, label_batch = dataset.make_batch(FLAGS.batch_size)
            def loss_function():
                # Making a fresh batch per call: with this, 2 GPUs give a result similar to 1 GPU (slightly better
                # valid result and slightly worse test result, which might be due to randomness).
                _, image_batch, label_batch = dataset.make_batch(
                    batch_size_per_gpu)
                return tower_loss(model, image_batch, label_batch)

            #loss_function = lambda: tower_loss(model, image_batch, label_batch)
            loss = melt.tower_losses(loss_function, num_gpus)
            # NOTE(review): accuracy is computed from model.predict() and
            # dataset.label_batch rather than from the batches created inside
            # loss_function; presumably these reflect the most recently built
            # tower/batch -- confirm.
            pred = model.predict()
            pred = pred['classes']
            label_batch = dataset.label_batch
            acc = tf.reduce_mean(tf.to_float(tf.equal(pred, label_batch)))

            #tf.summary.image('train/image', dataset.image_batch)
            # # Compute confusion matrix
            # matrix = tf.confusion_matrix(label_batch, pred, num_classes=10)
            # # Get an image tensor for summary usage
            # image_tensor = draw_confusion_matrix(matrix)
            # tf.summary.image('train/confusion_matrix', image_tensor)

            # Everything built in this scope from here on reuses existing
            # variables, which is what lets the validator and predictor below
            # (presumably) share the weights of the training model.
            scope.reuse_variables()
            ops = [loss, acc]

            # TODO multiple gpu validation and inference

            # ---- validation model (is_training=False) ---------------------
            validator = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=False,
                data_format=data_format)

            valid_dataset = cifar10.Cifar10DataSet(data_dir,
                                                   subset='valid',
                                                   use_distortion=False)
            # Unlike the train subset above, make_batch is used here as
            # returning an iterator whose get_next() yields (id, image, label).
            valid_iterator = valid_dataset.make_batch(batch_size)
            valid_id_batch, valid_image_batch, valid_label_batch = valid_iterator.get_next(
            )

            valid_loss = tower_loss(validator, valid_image_batch,
                                    valid_label_batch)
            valid_pred = validator.predict()
            valid_pred = valid_pred['classes']

            ## seems not to work with non-repeat mode..
            #tf.summary.image('valid/image', valid_image_batch)
            ## Compute confusion matrix
            #matrix = tf.confusion_matrix(valid_label_batch, valid_pred, num_classes=10)
            ## Get an image tensor for summary usage
            #image_tensor = draw_confusion_matrix(matrix)
            #tf.summary.image('valid/confusion_matrix', image_tensor)

            #loss_function = lambda: tower_loss(validator, val_image_batch, val_label_batch)
            #val_loss = melt.tower_losses(loss_function, FLAGS.num_gpus, is_training=False)
            #eval_ops = [val_loss]

            # Callback given to train_flow: runs evaluator.evaluate over the
            # validation iterator, optionally for a specific model_path.
            metric_eval_fn = lambda model_path=None: \
                                evaluator.evaluate([valid_id_batch, valid_loss, valid_pred, valid_label_batch, valid_image_batch],
                                                   valid_iterator,
                                                   model_path=model_path)

            # ---- prediction model for the test subset ---------------------
            predictor = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=False,
                data_format=data_format)

            predictor.init_predict()

            test_dataset = cifar10.Cifar10DataSet(data_dir,
                                                  subset='test',
                                                  use_distortion=False)
            test_iterator = test_dataset.make_batch(batch_size)
            test_id_batch, test_image_batch, test_label_batch = test_iterator.get_next(
            )

            # The test images are declared as 'channels_last' here while the
            # model itself was created with data_format='channels_first'.
            test_pred = predictor.predict(test_image_batch,
                                          input_data_format='channels_last')
            test_pred = test_pred['classes']

            # Callback given to train_flow: runs evaluator.inference over the
            # test iterator, optionally for a specific model_path.
            inference_fn = lambda model_path=None: \
                                evaluator.inference([test_id_batch, test_pred],
                                                    test_iterator,
                                                    model_path=model_path)

            # NOTE(review): eval_names is declared global but never assigned
            # in this function; this looks like a leftover statement.
            global eval_names
            # Display names for the entries of ops, in the same order.
            names = ['loss', 'acc']

        # Called inside the outer (FLAGS.algo) scope but outside 'main'.
        # One epoch is num_train_examples // batch_size steps.
        melt.apps.train_flow(ops,
                             names=names,
                             metric_eval_fn=metric_eval_fn,
                             inference_fn=inference_fn,
                             model_dir=FLAGS.model_dir,
                             num_steps_per_epoch=num_train_examples //
                             batch_size)