예제 #1
0
def train(model='fcn5'):
    """Train a fully-connected MNIST model on a single device.

    Args:
        model: Model selector; 'fcn5' builds models.model_fcn5, any other
            value falls back to models.model_fcn8.

    Side effects:
        Prints per-step timing/loss, periodic test accuracy, and final
        summary statistics to stdout.
    """
    config = tf.ConfigProto(allow_soft_placement=False,
                            log_device_placement=FLAGS.log_device_placement)
    # Non-negative device ids select a GPU; anything else runs on CPU.
    device_id = FLAGS.device_id
    if int(device_id) >= 0:
        device_str = '/gpu:%d' % int(device_id)
    else:
        device_str = '/cpu:0'

    with tf.Graph().as_default(), tf.device(device_str), tf.Session(config=config) as sess:
        feature_dim = models.feature_dim
        label_dim = models.label_dim
        images = tf.placeholder(tf.float32, [None, feature_dim])
        labels = tf.placeholder(tf.float32, [None, label_dim])

        if model == 'fcn5':
            logits = models.model_fcn5(images)
        else:
            logits = models.model_fcn8(images)
        loss = models.loss(logits, labels)

        # Batch accuracy: fraction of rows whose argmax prediction matches
        # the argmax of the one-hot label.
        predictionCorrectness = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))

        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss)

        init = tf.initialize_all_variables()
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)
        # Ceiling division: batches needed to cover one epoch.
        # (Renamed from batch_size_per_epoch — it counts batches, not sizes.)
        batches_per_epoch = int((EPOCH_SIZE + FLAGS.batch_size - 1) / FLAGS.batch_size)
        iterations = FLAGS.epochs * batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs, labs = get_real_batch_data(FLAGS.batch_size, 10)
            _, loss_value = sess.run([optimizer, loss],
                                     feed_dict={images: imgs, labels: labs})
            average_loss += loss_value
            duration = time.time() - start_time
            average_batch_time += float(duration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * batches_per_epoch) == 0:
                # Every FLAGS.eval_step epochs: report the averaged training
                # loss and the full test-set accuracy, then reset the average.
                average_loss /= FLAGS.eval_step * batches_per_epoch
                accuracy_value = accuracy.eval(feed_dict={images: mnist.test.images, labels: mnist.test.labels})
                print("test accuracy %g" % accuracy_value)
                epochs_info.append('%d:%g:%s' % (step / (FLAGS.eval_step * batches_per_epoch), accuracy_value, average_loss))
                average_loss = 0.0
        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement (SyntaxError under Python 3
        # and inconsistent with the print() calls above).
        print('average_batch_time: %s' % average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
예제 #2
0
def train(model='fcn5'):
    """Train fcn5 on MNIST across multiple GPUs with synchronous data parallelism.

    One model tower is built per GPU in FLAGS.device_ids; per-tower gradients
    are averaged on the CPU and applied in a single optimizer step.

    Args:
        model: Unused; every tower is models.model_fcn5.

    Side effects:
        Prints progress to stdout and saves a final checkpoint under
        FLAGS.train_dir.
    """
    if FLAGS.num_gpus < 2:
        print("The number of GPU should be 2 or more, if you use one GPU, please use fcn5_mnist.py to train")
        return

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=FLAGS.log_device_placement)

    with tf.Graph().as_default(), tf.device("/cpu:0"):
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        device_ids = FLAGS.device_ids.split(',')
        if len(device_ids) > FLAGS.num_gpus:
            print('The device_ids should have the same number of GPUs with num_gpus')
            return

        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9)

        tower_grads = []          # one compute_gradients result per tower
        feed_vars = []            # (images, labels) placeholder pair per tower
        average_loss_tensor = []  # per-tower total losses, averaged below
        # Fixed: xrange() is Python-2-only; range() behaves the same here.
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%s' % device_ids[i]):
                with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope:
                    feature_dim = models.feature_dim
                    label_dim = models.label_dim
                    images = tf.placeholder(tf.float32, [None, feature_dim], name='images')
                    labels = tf.placeholder(tf.float32, [None, label_dim], name='labels')
                    feed_vars.append((images, labels))

                    logits = models.model_fcn5(images)
                    loss = models.loss(logits, labels)
                    tf.add_to_collection('losses', loss)

                    # Only this tower's losses (filtered by name scope).
                    losses = tf.get_collection('losses', scope)
                    total_loss = tf.add_n(losses, name='total_loss')
                    average_loss_tensor.append(total_loss)

                    # Share model variables between towers.
                    tf.get_variable_scope().reuse_variables()
                    grads = optimizer.compute_gradients(total_loss)
                    tower_grads.append(grads)

        # Single-string prints behave identically under Python 2 and 3.
        print('tower_grads: %s\nlen: %d' % (tower_grads, len(tower_grads)))
        print('total_loss: %s' % total_loss)

        grads = average_gradients(tower_grads)
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        train_op = apply_gradient_op
        average_op = tf.reduce_mean(average_loss_tensor, 0)
        saver = tf.train.Saver(tf.all_variables())

        init = tf.initialize_all_variables()
        sess = tf.Session(config=config)
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        # The effective batch is split evenly across the GPUs each step.
        real_batch_size = FLAGS.batch_size * FLAGS.num_gpus
        num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1) / real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []

        step = 0
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs, labs = get_real_batch_data(real_batch_size, 10)
            feed_dict = {}
            # Slice the big batch into one sub-batch per tower.
            for i in range(FLAGS.num_gpus):
                feed_dict[feed_vars[i][0]] = imgs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size]
                feed_dict[feed_vars[i][1]] = labs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size]
            _, loss_value = sess.run([train_op, average_op], feed_dict=feed_dict)
            duration = time.time() - start_time
            average_batch_time += float(duration)
            average_loss += loss_value

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % FLAGS.log_step == 0:
                examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                # Report the running average loss every FLAGS.eval_step epochs.
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print('epoch: %d, loss: %.2f' % (step/(FLAGS.eval_step*num_batches_per_epoch), average_loss))
                epochs_info.append('%d:-:%s' % (step/(FLAGS.eval_step*num_batches_per_epoch), average_loss))
                average_loss = 0.0

        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement (SyntaxError under Python 3).
        print('average_batch_time: %s' % average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
예제 #3
0
def train(model='fcn5'):
    """Train fcn5 on MNIST on multiple GPUs, placing variables on a
    configurable parameter-server device (FLAGS.local_ps_device).

    Input comes either from per-tower placeholders or, when
    FLAGS.use_dataset is set, from a tf.contrib.data pipeline shared by
    all towers.

    Args:
        model: Unused; every tower is models.model_fcn5.

    Side effects:
        Prints progress and periodic test accuracy to stdout; saves a final
        checkpoint under FLAGS.train_dir.
    """
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=FLAGS.log_device_placement)

    if FLAGS.xla:
        # Turns on XLA.  XLA is not included in the standard build.  For single GPU this shows ~5% improvement
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Graph().as_default(), tf.device("/" + FLAGS.local_ps_device + ":0"):
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        device_ids = FLAGS.device_ids
        if not device_ids:
            device_ids = [str(i) for i in range(FLAGS.num_gpus)]
        else:
            device_ids = device_ids.split(',')

        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9)

        def assign_to_device(device, ps_device=FLAGS.local_ps_device):
            """Return a device function: variables go to the parameter-server
            device(s), balanced by element count; all other ops go to `device`."""
            worker_device = device
            ps_sizes = [0]
            # Fixed: the original compared the bound method object
            # `str.lower` to 'gpu' (always False), so multi-GPU parameter
            # servers never got more than one slot. The method must be called.
            if FLAGS.local_ps_device.lower() == 'gpu':
                ps_sizes = [0] * FLAGS.num_gpus
            def _assign(op):
                if op.device:
                    return op.device
                if op.type not in ['Variable', 'VariableV2']:
                    return worker_device
                # Greedy balancing: put the variable on the least-loaded
                # parameter-server slot.
                device_index, _ = min(enumerate(
                    ps_sizes), key=operator.itemgetter(1))
                device_name = '/' + FLAGS.local_ps_device + ':' + str(device_index)
                var_size = op.outputs[0].get_shape().num_elements()
                ps_sizes[device_index] += var_size
                return device_name
            return _assign

        images = None
        labels = None
        if FLAGS.use_dataset:
            with tf.device('/CPU:0'):
                d_features = mnist.train.images
                d_labels = mnist.train.labels
                dataset = tf.contrib.data.Dataset.from_tensor_slices((d_features, d_labels))
                dataset = dataset.shuffle(buffer_size=60000)
                dataset = dataset.repeat()
                dataset = dataset.batch(FLAGS.batch_size)
                # Trick to get datasets to buffer the next epoch.  This is needed because
                # the data loading is occuring outside DataSets in python.  Normally preprocessing
                # would occur in DataSets and this odd looking line is not needed.
                dataset = dataset.map(lambda x, y: (x, y),
                    num_threads=FLAGS.num_gpus,
                    output_buffer_size=FLAGS.num_gpus)
                iterator = dataset.make_initializable_iterator()
                images, labels = iterator.get_next()

        tower_grads = []
        feed_vars = []
        average_loss_tensor = []
        reuse_variables = False  # first tower creates the variables
        accuracy = None
        # Fixed: xrange() is Python-2-only; range() behaves the same here.
        for i in range(FLAGS.num_gpus):
            with tf.device(assign_to_device('/gpu:%s' % device_ids[i])):
                with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope:
                    if not FLAGS.use_dataset:
                        feature_dim = models.feature_dim
                        label_dim = models.label_dim
                        images = tf.placeholder(tf.float32, [None, feature_dim], name='images')
                        labels = tf.placeholder(tf.int64, [None, label_dim], name='labels')
                        feed_vars.append((images, labels))
                    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
                        logits = models.model_fcn5(images)
                    if i == 0:
                        # Prediction only on GPU:0
                        predictionCorrectness = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
                        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))
                    loss = models.loss(logits, labels)
                    reuse_variables = True
                    average_loss_tensor.append(loss)
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        train_op = apply_gradient_op
        average_op = tf.reduce_mean(average_loss_tensor)
        saver = tf.train.Saver(tf.global_variables())

        init = tf.global_variables_initializer()
        sess = tf.Session(config=config)
        sess.run(init)
        if FLAGS.use_dataset:
            sess.run(iterator.initializer)

        real_batch_size = FLAGS.batch_size * FLAGS.num_gpus
        num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1) / real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []

        step = 0
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            feed_dict = {}
            if not FLAGS.use_dataset:
                # Slice the big batch into one sub-batch per tower.
                imgs, labs = get_real_batch_data(real_batch_size, 10)
                for i in range(FLAGS.num_gpus):
                    feed_dict[feed_vars[i][0]] = imgs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size]
                    feed_dict[feed_vars[i][1]] = labs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size]
            _, loss_value = sess.run([train_op, average_op], feed_dict=feed_dict)
            duration = time.time() - start_time
            average_batch_time += float(duration)
            average_loss += loss_value

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % FLAGS.log_step == 0:
                examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                # Periodic report: averaged training loss plus test accuracy
                # evaluated on tower 0.
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print('epoch: %d, loss: %.2f' % (step/(FLAGS.eval_step*num_batches_per_epoch), average_loss))
                epochs_info.append('%d:-:%s' % (step/(FLAGS.eval_step*num_batches_per_epoch), average_loss))
                average_loss = 0.0
                feed_dict = { images: mnist.test.images, labels :mnist.test.labels }
                if not FLAGS.use_dataset:
                    feed_dict = {}
                    feed_dict[feed_vars[0][0]] = mnist.test.images
                    feed_dict[feed_vars[0][1]] = mnist.test.labels
                accuracy_value = accuracy.eval(session=sess, feed_dict=feed_dict)
                print("test accuracy %g" % accuracy_value)

        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement (SyntaxError under Python 3).
        print('average_batch_time: %s' % average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
예제 #4
0
def train(model='fcn5'):
    """Train an MNIST fully-connected model on a single device, optionally
    feeding input from a tf.contrib.data pipeline instead of placeholders.

    Args:
        model: 'fcn5' builds models.model_fcn5; any other value builds
            models.model_fcn8.

    Side effects:
        Prints per-step timing/loss, periodic test accuracy, and final
        summary statistics to stdout.
    """
    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    device_id = FLAGS.device_id
    device_str = ''
    if int(device_id) >= 0:
        device_str = '/gpu:%d' % int(device_id)
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 1
        config.inter_op_parallelism_threads = 0
    else:
        device_str = '/cpu:0'
        # On CPU, honour OMP_NUM_THREADS for intra-op parallelism.
        num_threads = os.getenv('OMP_NUM_THREADS', 1)
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=int(num_threads))

    if FLAGS.xla:
        # Turns on XLA.  XLA is not included in the standard build.  For single GPU this shows ~5% improvement
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Graph().as_default(), tf.device(device_str), tf.Session(
            config=config) as sess:
        feature_dim = models.feature_dim
        label_dim = models.label_dim
        images = None
        labels = None
        iterator = None
        if FLAGS.use_dataset:
            with tf.device('/CPU:0'):
                d_features = mnist.train.images
                d_labels = mnist.train.labels
                dataset = tf.contrib.data.Dataset.from_tensor_slices(
                    (d_features, d_labels))
                dataset = dataset.repeat()
                dataset = dataset.shuffle(buffer_size=60000)
                dataset = dataset.batch(FLAGS.batch_size)
                # Trick to get datasets to buffer the next epoch.  This is needed because
                # the data loading is occuring outside DataSets in python.  Normally preprocessing
                # would occur in DataSets and this odd looking line is not needed.
                dataset = dataset.map(lambda x, y: (x, y),
                                      num_threads=1,
                                      output_buffer_size=1)
                iterator = dataset.make_initializable_iterator()
                images, labels = iterator.get_next()

        else:
            images = tf.placeholder(tf.float32, [None, feature_dim],
                                    name="images_placeholder")
            labels = tf.placeholder(tf.int64, [None, label_dim],
                                    name="labels_placeholder")

        if model == 'fcn5':
            logits = models.model_fcn5(images)
        else:
            logits = models.model_fcn8(images)
        loss = models.loss(logits, labels)

        # Batch accuracy: fraction of argmax matches against one-hot labels.
        predictionCorrectness = tf.equal(tf.argmax(logits, 1),
                                         tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))

        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss)

        init = tf.global_variables_initializer()

        sess.run(init)
        if FLAGS.use_dataset:
            sess.run(iterator.initializer)
        # Ceiling division: batches needed to cover one epoch.
        batch_size_per_epoch = int(
            (EPOCH_SIZE + FLAGS.batch_size - 1) / FLAGS.batch_size)
        iterations = FLAGS.epochs * batch_size_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            if FLAGS.use_dataset:
                # Input comes from the dataset iterator already wired into
                # the graph, so no feed is required.
                _, loss_value = sess.run([optimizer, loss])
            else:
                imgs, labs = get_real_batch_data(FLAGS.batch_size, 10)
                _, loss_value = sess.run([optimizer, loss],
                                         feed_dict={
                                             images: imgs,
                                             labels: labs
                                         })
            duration = time.time() - start_time
            average_loss += loss_value
            average_batch_time += float(duration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step *
                                    batch_size_per_epoch) == 0:
                # Every FLAGS.eval_step epochs: report the averaged training
                # loss and the full test-set accuracy, then reset the average.
                average_loss /= FLAGS.eval_step * batch_size_per_epoch
                accuracy_value = accuracy.eval(feed_dict={
                    images: mnist.test.images,
                    labels: mnist.test.labels
                })
                print("test accuracy %g" % accuracy_value)
                epochs_info.append('%d:%g:%s' %
                                   (step /
                                    (FLAGS.eval_step * batch_size_per_epoch),
                                    accuracy_value, average_loss))
                average_loss = 0.0
        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement (SyntaxError under Python 3
        # and inconsistent with the print() calls above).
        print('average_batch_time: %s' % average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
예제 #5
0
def main():
    """Evaluate every saved checkpoint on the test queue, report the one
    with the lowest MSE, then write side-by-side inference patches for it."""
    state = tf.train.get_checkpoint_state(CHECKPOINTS_PATH)
    if not state or not state.model_checkpoint_path:
        print('No check point files are found!')
        return

    candidates = state.all_model_checkpoint_paths
    if len(candidates) < 1:
        print('No check point files are found!')
        return

    low_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, NUM_CHENNELS])
    high_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, LABEL_SIZE, LABEL_SIZE, NUM_CHENNELS])

    inferences = models.create_model(MODEL_NAME, low_res_holder)
    testing_loss = models.loss(inferences, high_res_holder,
                               name='testing_loss')

    low_res_batch, high_res_batch = batch_queue_for_testing(TESTING_DATA_PATH)

    sess = tf.Session()
    # Variables must be initialized even though Saver.restore overwrites them.
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(tf.global_variables())
    tf.train.start_queue_runners(sess=sess)

    best_mse, best_ckpt = 100000, ''
    for candidate in candidates:
        saver.restore(sess, candidate)
        # Average the testing loss over NUM_TESTING_STEPS queue batches.
        total = 0
        for _ in range(NUM_TESTING_STEPS):
            lo_imgs, hi_imgs = sess.run([low_res_batch, high_res_batch])
            total += sess.run(testing_loss,
                              feed_dict={low_res_holder: lo_imgs,
                                         high_res_holder: hi_imgs})
        mse = total / NUM_TESTING_STEPS
        print('Model: %s. MSE: %.3f' % (candidate, mse))

        if mse < best_mse:
            best_mse, best_ckpt = mse, candidate

    print('Best model: %s. MSE: %.3f' % (best_ckpt, best_mse))

    # now, we use the best model to generate some inference patches and compare with the ground truthes
    print('\ngenerating inference patches...')
    saver.restore(sess, best_ckpt)

    for batch_idx in range(4):
        lo_imgs, hi_imgs = sess.run([low_res_batch, high_res_batch])
        inference_patches = sess.run(inferences,
                                     feed_dict={low_res_holder: lo_imgs,
                                                high_res_holder: hi_imgs})

        if not os.path.exists(INFERENCES_SAVE_PATH):
            os.mkdir(INFERENCES_SAVE_PATH)

        for sample_idx in range(BATCH_SIZE):
            low_res_input = lo_imgs[sample_idx, ...]   # INPUT_SIZE x INPUT_SIZE
            ground_truth = hi_imgs[sample_idx, ...]    # LABEL_SIZE x LABEL_SIZE
            inference = inference_patches[sample_idx, ...]

            # Centre-crop the ground truth (and the bicubically upscaled
            # input) to the inference's spatial size so the three patches
            # can be stacked side by side.
            crop_begin = (ground_truth.shape[0] - inference.shape[0]) // 2
            crop_end = crop_begin + inference.shape[0]
            ground_truth = ground_truth[crop_begin:crop_end,
                                        crop_begin:crop_end, ...]
            low_res_input = cv.resize(low_res_input, (LABEL_SIZE, LABEL_SIZE),
                                      interpolation=cv.INTER_CUBIC)
            low_res_input = low_res_input[crop_begin:crop_end,
                                          crop_begin:crop_end, ...]
            patch_pair = np.hstack((low_res_input, inference, ground_truth))

            patch_pair = tf.image.convert_image_dtype(patch_pair, tf.uint8,
                                                      True)

            save_name = 'inference_%d_%d.png' % (batch_idx, sample_idx)
            cv.imwrite(join(INFERENCES_SAVE_PATH, save_name),
                       patch_pair.eval(session=sess))

    print('Test Finished!')
예제 #6
0
def main():
    """Train the super-resolution model, logging summaries and validation
    loss every 1000 steps and checkpointing every 10000 steps."""
    low_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, NUM_CHENNELS])
    high_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, LABEL_SIZE, LABEL_SIZE, NUM_CHENNELS])

    inferences = models.create_model(MODEL_NAME, low_res_holder)
    # weights_decay=0: the optimised loss carries no regularisation term.
    training_loss = models.loss(inferences,
                                high_res_holder,
                                name='training_loss',
                                weights_decay=0)
    validation_loss = models.loss(inferences,
                                  high_res_holder,
                                  name='validation_loss')
    tf.summary.scalar('training_loss', training_loss)
    tf.summary.scalar('validation_loss', validation_loss)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    # Inverse-time decay: lr = 0.001 / (1 + 2 * step / 10000).
    learning_rate = tf.train.inverse_time_decay(0.001, global_step, 10000, 2)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(
        training_loss, global_step=global_step)

    low_res_batch, high_res_batch = batch_queue_for_training(
        TRAINING_DATA_PATH)
    low_res_eval, high_res_eval = batch_queue_for_testing(VALIDATION_DATA_PATH)

    init = (tf.global_variables_initializer(),
            tf.local_variables_initializer())
    sess = tf.Session()
    sess.run(init)
    # Start the queue runners (make batches).
    tf.train.start_queue_runners(sess=sess)

    # the saver will restore all model's variables during training
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=MAX_CKPT_TO_KEEP)
    # Merge all the summaries and write them out to TRAINING_DIR
    merged_summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(TRAINING_SUMMARY_PATH, sess.graph)

    for step in range(1, NUM_TRAINING_STEPS + 1):
        start_time = time.time()
        low_res_images, high_res_images = sess.run(
            [low_res_batch, high_res_batch])
        feed_dict = {
            low_res_holder: low_res_images,
            high_res_holder: high_res_images
        }
        _, batch_loss = sess.run([train_step, training_loss],
                                 feed_dict=feed_dict)
        duration = time.time() - start_time
        assert not np.isnan(batch_loss), 'Model diverged with loss = NaN'

        if step % 100 == 0:  # show training status
            num_examples_per_step = BATCH_SIZE
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)

            format_str = 'step %d, batch_loss = %.3f (%.1f examples/sec; %.3f sec/batch)'
            print(format_str %
                  (step, batch_loss, examples_per_sec, sec_per_batch))

        if step % 1000 == 0:  # run validation and show its result
            low_res_images, high_res_images = sess.run(
                [low_res_eval, high_res_eval])
            feed_dict = {
                low_res_holder: low_res_images,
                high_res_holder: high_res_images
            }
            # Evaluate the loss and the merged summaries in one run so the
            # logged summary reflects exactly the reported batch (the
            # original ran two forward passes on the same feed).
            batch_loss, summary = sess.run([validation_loss, merged_summary],
                                           feed_dict=feed_dict)
            print('step %d, validation loss = %.3f' % (step, batch_loss))

            # Fixed: pass global_step so TensorBoard plots each point at its
            # step instead of stacking every summary at x=0.
            summary_writer.add_summary(summary, step)

        # Save the model checkpoint periodically.
        if step % 10000 == 0 or (step + 1) == NUM_TRAINING_STEPS:
            saver.save(sess,
                       join(CHECKPOINTS_PATH, 'model.ckpt'),
                       global_step=step)

    print('Training Finished!')
예제 #7
0
def train(training_files,
          testing_files,
          params=None,
          learning_rate=1e-4,
          save_dir=None,
          model_name=None,
          batch_size=128,
          device='/gpu:1',
          iterations=int(1e6),
          save_step=int(1e3),
          summary_step=int(1e2),
          N=int(1e9)):
    """Train a concrete-dropout emulator model and return its mean test loss.

    Args:
        training_files / testing_files: file lists passed to
            himawari.EmulatorData.make_dataset.
        params: dict with keys 'tau' and 'priorlengthscale'; defaults to
            {'tau': 1e-5, 'priorlengthscale': 1e1}. Overridden by the
            Bayesian-optimisation result file when one exists.
        learning_rate: Adam learning rate.
        save_dir: checkpoint/summary directory; derived from model_name
            when None.
        model_name: one of 'DCFC', 'DCCNN', 'DCResNet', 'DCVDSR'.
        batch_size: examples per step (used here for the throughput log).
        device: unused; kept for interface compatibility.
        iterations: number of training steps.
        save_step / summary_step: checkpoint and summary cadence, measured
            against ckpt.step.
        N: dataset-size constant forwarded to the model constructors.

    Returns:
        float: average test loss over 100 held-out batches.

    Raises:
        ValueError: if model_name is not one of the known models.
    """
    # Fixed: `params` previously used a mutable dict as its default value
    # (shared across all calls). Use the None-sentinel idiom instead.
    if params is None:
        params = {'tau': 1e-5, 'priorlengthscale': 1e1}

    # Prefer parameters found by a previous Bayesian-optimisation run.
    params_path = './default-checkpoint/bayes_opt/%s_best_parameters_Ax.txt' % model_name
    if os.path.exists(params_path):
        with open(params_path) as json_file:
            params = json.load(json_file)
            print('----loaded best parameters----')
    tau, priorlengthscale = params['tau'], params['priorlengthscale']

    if save_dir is None:
        save_dir = './default-checkpoint'
        save_dir = os.path.join(
            save_dir, "default-%s-tau-%.3E-pls-%s.ckpt" %
            (model_name, tau, priorlengthscale))

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    em = himawari.EmulatorData()
    train_set = em.make_dataset(training_files, batch_size=batch_size)
    test_set = em.make_dataset(testing_files, batch_size=batch_size)

    # Use CNN
    output_bands = 6
    if model_name == 'DCFC':
        # 1x1 filters: fully-connected behaviour expressed as a CNN.
        model = DCCNN(layer_sizes=[512] * 3 + [output_bands * 2 + 1],
                      filter_sizes=[1] * 4,
                      output_bands=output_bands,
                      N=N,
                      tau=tau,
                      priorlengthscale=priorlengthscale)
    elif model_name == 'DCCNN':
        model = DCCNN(layer_sizes=[512] * 3 + [output_bands * 2 + 1],
                      filter_sizes=[3] * 4,
                      output_bands=output_bands,
                      N=N,
                      tau=tau,
                      priorlengthscale=priorlengthscale)
    elif model_name == 'DCResNet':
        model = DCResNet(blocks=5,
                         output_bands=output_bands,
                         N=N,
                         tau=tau,
                         priorlengthscale=priorlengthscale)
    elif model_name == 'DCVDSR':
        model = DCVDSR(hidden_layers=[512] * 3,
                       output_bands=output_bands,
                       N=N,
                       tau=tau,
                       priorlengthscale=priorlengthscale)
    else:
        # Fixed: an unknown model_name previously fell through and raised a
        # confusing NameError when `model` was first referenced.
        raise ValueError('unknown model_name: %r' % model_name)

    optimizer = tf.compat.v2.keras.optimizers.Adam(learning_rate)
    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               optimizer=optimizer,
                               net=model)
    manager = tf.train.CheckpointManager(ckpt, save_dir, max_to_keep=3)
    if manager.latest_checkpoint:
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from checkpoint {}".format(manager.latest_checkpoint))

    summary_writer = tf.summary.create_file_writer(save_dir + '/log')

    with summary_writer.as_default():

        for _ in range(iterations):
            element = train_set.get_next()
            x_train, y_train, m_train = element['AHI05'], element[
                'AHI12'], element['mask']
            element = test_set.get_next()
            x_test, y_test, m_test = element['AHI05'], element[
                'AHI12'], element['mask']

            start_time = time.time()
            with tf.GradientTape() as tape:
                loc, logvar, probs, prediction, reg_losses, dropout_probs = model(
                    x_train, training=True)
                train_loss = loss(y_train,
                                  m_train,
                                  loc,
                                  logvar,
                                  probs,
                                  reg_losses=reg_losses,
                                  is_training=tf.constant(True),
                                  step=ckpt.step)

            grads = tape.gradient(train_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            duration = time.time() - start_time
            ckpt.step.assign_add(1)

            if int(ckpt.step) % save_step == 0:
                tf.saved_model.save(model,
                                    save_dir)  # only the latest model for now
                manager.save()

            if int(ckpt.step) % summary_step == 0:
                loc, logvar, probs, prediction, reg_losses, dropout_probs = model(
                    x_test, training=False)
                test_loss = loss(y_test,
                                 m_test,
                                 loc,
                                 logvar,
                                 probs,
                                 reg_losses=reg_losses,
                                 is_training=tf.constant(False),
                                 step=ckpt.step)
                print("Step: %d, Examples/sec: %0.5f, Training Loss: %2.4f, Test Loss: %2.4f" %  \
                      (int(ckpt.step), batch_size / duration, train_loss, test_loss))
                print("dropout probabilities: ", dropout_probs)

                # Fixed: the inner loop reused `i`, shadowing the outer
                # iteration variable; use a distinct index.
                for band in range(len(dropout_probs)):
                    tf.compat.v2.summary.scalar('concrete/dropout_prob_%s' % band,
                                                tf.reduce_mean(
                                                    dropout_probs[band]),
                                                step=int(ckpt.step))

                tf.summary.image('input-band0',
                                 tf.expand_dims(x_test[:, :, :, 0], -1),
                                 step=int(ckpt.step))
                tf.summary.image('label-band0',
                                 tf.expand_dims(tf.nn.relu(y_test[:, :, :, 0]),
                                                -1),
                                 step=int(ckpt.step))
                tf.summary.image('output-band0',
                                 tf.expand_dims(
                                     tf.nn.relu(prediction[:, :, :, 0]), -1),
                                 step=int(ckpt.step))

    # Final evaluation: mean loss over 100 fresh test batches.
    test_loss = 0
    for _ in range(100):
        element = test_set.get_next()
        x_test, y_test, m_test = element['AHI05'], element['AHI12'], element[
            'mask']
        # Fixed: model() returns six values (see the calls above); the
        # original five-name unpack would raise ValueError here.
        loc, logvar, probs, prediction, reg_losses, dropout_probs = model(
            x_test, training=False)
        test_loss += loss(y_test,
                          m_test,
                          loc,
                          logvar,
                          probs,
                          reg_losses=reg_losses,
                          step=ckpt.step,
                          is_training=tf.constant(False))
    return test_loss.numpy() / 100.
예제 #8
0
def train():
    """Train a RetinaNet detector using hyperparameters from ``Config``.

    Builds the model (optionally resuming from a saved state dict), trains
    with Adam + MultiStepLR over the configured number of epochs, and saves
    checkpoints every 10 epochs plus a final ``finally.pth`` snapshot under
    the configured weights directory.

    No parameters; no return value — all settings come from ``Config`` and
    results are written to disk.
    """
    config = Config()

    class_num = config.classnum
    batch = config.batch
    size = config.size
    epoches = config.epoches
    pre_train = config.preTrain
    train_weights = config.trainWeights
    weights = config.weightsSave

    start_lr = config.start_lr
    lr_change = config.lr_change
    lr_decay = config.lr_decay

    os.makedirs(weights, exist_ok=True)

    model = RetinaNet(weights=pre_train, classNum=class_num)
    if t.cuda.is_available():
        print("----GPU-Training----")
        model = model.cuda()

    # Resume from a previously saved state dict when one is configured.
    # (Fixed: the original used `not trainweights == None`, which is both
    # non-idiomatic and unsafe for objects overriding __eq__.)
    if train_weights is not None:
        print("trainWeights:", train_weights)
        model.load_state_dict(t.load(train_weights))

    model.train()
    optimer = Adam(model.parameters(), lr=start_lr)
    optimer.zero_grad()
    scheduler = lr_scheduler.MultiStepLR(optimer, lr_change, lr_decay)
    datasets = TrainDataset(img_road="datasets/train.txt", size=(size, size))
    dataloader = DataLoader(datasets,
                            batch_size=batch,
                            shuffle=True,
                            collate_fn=datasets.collate_fn,
                            drop_last=True)
    Loss = loss()

    for epoch in range(epoches):
        print("epoch-{}".format(epoch))
        for i, (imgs, labels, paths) in enumerate(dataloader):

            print("--epoch-{}-batch-{}--".format(epoch, i))
            if t.cuda.is_available():
                imgs = imgs.cuda()
                labels = labels.cuda()
            classify, regression, all_anchor = model(imgs)
            all_loss = Loss(classify, regression, labels, all_anchor)
            print("Loss:", all_loss)
            all_loss.backward()

            # Step/zero every batch (no gradient accumulation).
            optimer.step()
            optimer.zero_grad()
        # MultiStepLR: decay lr by lr_decay at the epochs listed in lr_change.
        scheduler.step()
        if (epoch + 1) % 10 == 0:
            # NOTE(review): checkpoint names start at epoch+49 — presumably
            # to continue numbering from an earlier 49-epoch run; confirm.
            t.save(model.state_dict(),
                   weights + "epoch{}.pth".format(epoch + 49))
    t.save(model.state_dict(), weights + "finally.pth")
def main():
    """CLI entry point: train a UNet-style heuristic predictor on map data.

    Parses command-line arguments, constructs the model, optimizer and the
    piecewise loss criterion, splits the maps dataset into train/validation
    loaders, writes the run configuration to ``output_dir/config.json``, and
    delegates training to ``train_net``.

    Raises:
        ValueError: if ``--model_type`` is not one of ``small`` / ``big``.
    """
    SEED = 42
    torch.manual_seed(SEED)

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--map_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="Folder containing maps")
    parser.add_argument(
        "--goal_dir",
        default=None,
        type=str,
        required=True,
        help="Folder containing goals for maps. See dataset class for info.")
    parser.add_argument(
        "--heuristic_dir",
        default=None,
        type=str,
        required=True,
        help=
        "Folder containing heurisctics for maps. See dataset class for info.")
    parser.add_argument(
        "--map_to_heuristic",
        default=None,
        type=str,
        required=True,
        help=
        "json file with maps names as keys and heuristic files as values. Note that goal and heuristic for one task should have the same names."
    )

    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: small, big")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    parser.add_argument('--alpha',
                        type=float,
                        default=0.0,
                        required=True,
                        help="Weight for gradient loss.")
    parser.add_argument(
        '--alpha1',
        type=float,
        default=1.0,
        required=True,
        help=
        "Weight for component of piece loss where output heuristic is less than minimal cost."
    )
    parser.add_argument(
        '--alpha2',
        type=float,
        default=0.0,
        required=True,
        help=
        "Weight for component of piece loss where output heuristic is more than target cost."
    )

    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--learning_rate",
                        default=1e-3,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--desired_batch_size',
        type=int,
        default=32,
        help=
        "Desired batch size to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--num_train_epochs",
                        default=10,
                        type=int,
                        help="Total number of training epochs to perform.")

    args = parser.parse_args()
    alpha = args.alpha
    alpha1 = args.alpha1
    alpha2 = args.alpha2

    if args.model_type == 'small':
        model = SmallUNet()
    elif args.model_type == 'big':
        model = UNet()
    else:
        # BUG FIX: the original `raise (ValueError, msg)` raises a tuple,
        # which is a TypeError in Python 3 ("exceptions must derive from
        # BaseException"); construct and raise the exception properly.
        raise ValueError('Model type should be in [small, big]')

    # Define the device before the criterion closure captures it; the
    # original defined it afterwards and relied on late binding.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the model is never moved to `device` here — presumably
    # train_net handles that; confirm against its implementation.
    learning_rate = args.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = lambda output, target_map, minimal_cost: loss(
        output, target_map, minimal_cost, device, alpha, alpha1, alpha2)

    exp_name = f'alpha_{alpha}_alpha1_{alpha1}_alpha2_{alpha2}'

    MAP_DIR = args.map_dir
    HEURISTIC_DIR = args.heuristic_dir
    GOAL_DIR = args.goal_dir
    map2heuristic_path = args.map_to_heuristic
    output_dir = args.output_dir

    with open(map2heuristic_path, 'r') as file:
        map2heuristic = json.load(file)

    batch_size = args.batch_size
    num_epochs = args.num_train_epochs
    # Accumulation target can never be smaller than the per-step batch size.
    desired_batch_size = max(args.desired_batch_size, batch_size)

    config = {
        'learning_rate': learning_rate,
        'alpha': alpha,
        'alpha1': alpha1,
        'alpha2': alpha2,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'desired_batch_size': desired_batch_size
    }

    # Race-free directory creation (the original exists()+mkdir pair could
    # fail if the directory appeared between the check and the call).
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, 'config.json'), 'w') as file:
        json.dump(config, file)

    dataset = MapsDataset(MAP_DIR,
                          HEURISTIC_DIR,
                          GOAL_DIR,
                          map2heuristic,
                          maps_size=(64, 64))
    # NOTE(review): split sizes are hard-coded to 40000/10000 — the dataset
    # must contain exactly 50000 items or random_split will raise; confirm.
    train_dataset, val_dataset = random_split(dataset, [40000, 10000])
    train_batch_gen = DataLoader(train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 pin_memory=True,
                                 num_workers=cpu_count())
    val_batch_gen = DataLoader(val_dataset,
                               batch_size=batch_size,
                               shuffle=True,
                               pin_memory=True,
                               num_workers=cpu_count())

    _ = train_net(model,
                  criterion,
                  optimizer,
                  train_batch_gen,
                  val_batch_gen,
                  device,
                  num_epochs=num_epochs,
                  output_dir=output_dir,
                  desired_batch_size=desired_batch_size,
                  exp_name=exp_name)
예제 #10
0
import shutil

from torch.utils.tensorboard import SummaryWriter

if __name__ == '__main__':
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=2)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--epoch', type=int, default=250)
    args = parser.parse_args()

    train_loader, test_loader = datasets.prepare(batch_size=args.batch_size)
    model = models.net(num_classes=datasets.num_classes).to(device)
    criterion = models.loss(num_classes=datasets.num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    log_dir = 'data/runs'
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
        os.makedirs(log_dir)
    else:
        os.makedirs(log_dir)
    writer = SummaryWriter(log_dir=log_dir)
    epoch_digit = len(list(str(args.epoch)))
    for epoch in range(args.epoch):
        model.train()
        train_loss = 0
        train_acc = 0
        train_number = 0