Example #1
def train(train_dir,
          examples_path,
          hparams,
          checkpoints_to_keep=5,
          num_steps=None,
          master='',
          task=0,
          num_ps_tasks=0):
    """Train loop."""
    tf.gfile.MakeDirs(train_dir)
    is_chief = task == 0

    _trial_summary(hparams, examples_path, train_dir)
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(num_ps_tasks,
                                                      merge_devices=True)):
            transcription_data = _get_data(examples_path,
                                           hparams,
                                           is_training=True)

            loss, losses, unused_labels, unused_predictions, images = model.get_model(
                transcription_data, hparams, is_training=True)

            tf.summary.scalar('loss', loss)
            for label, loss_collection in losses.items():
                loss_label = 'losses/' + label
                tf.summary.scalar(loss_label, tf.reduce_mean(loss_collection))
            for name, image in images.items():
                tf.summary.image(name, image)
            optimizer = tf.train.AdamOptimizer(
                learning_rate=hparams.learning_rate)

            train_op = slim.learning.create_train_op(
                loss,
                optimizer,
                clip_gradient_norm=hparams.clip_norm,
                summarize_gradients=True)

            logging_dict = {
                'global_step': tf.train.get_global_step(),
                'loss': loss
            }

            hooks = [
                tf.train.LoggingTensorHook(logging_dict, every_n_iter=100)
            ]
            if num_steps:
                hooks.append(tf.train.StopAtStepHook(num_steps))

            scaffold = tf.train.Scaffold(saver=tf.train.Saver(
                max_to_keep=checkpoints_to_keep))

            tf.contrib.training.train(train_op=train_op,
                                      logdir=train_dir,
                                      scaffold=scaffold,
                                      hooks=hooks,
                                      save_checkpoint_secs=300,
                                      master=master,
                                      is_chief=is_chief)
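A minimal sketch of how this train loop might be invoked under TF 1.x. The paths and hyperparameter values are hypothetical, and only the two hparams fields the loop itself reads (`learning_rate`, `clip_norm`) are shown; `_get_data` and `model.get_model` will expect additional fields.

import tensorflow as tf

# Hypothetical values; a real run would take these from flags or a config file.
hparams = tf.contrib.training.HParams(learning_rate=0.0006, clip_norm=3.0)
train(train_dir='/tmp/train_logs',
      examples_path='/tmp/train_examples.tfrecord',
      hparams=hparams,
      num_steps=10000)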
Example #2
    def run(self):
        """Run training."""
        is_chief = FLAGS.task_id == 0 or not FLAGS.supervisor
        sv = None

        def init_fn(sess, saver):
            ckpt = None
            if FLAGS.save_dir and sv is None:
                load_dir = FLAGS.save_dir
                ckpt = tf.train.get_checkpoint_state(load_dir)
            if ckpt and ckpt.model_checkpoint_path:
                logging.info('restoring from %s', ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            elif FLAGS.load_path:
                logging.info('restoring from %s', FLAGS.load_path)
                saver.restore(sess, FLAGS.load_path)

        if FLAGS.supervisor:
            with tf.device(
                    tf.train.replica_device_setter(FLAGS.ps_tasks,
                                                   merge_devices=True)):
                self.global_step = tf.contrib.framework.get_or_create_global_step(
                )
                tf.set_random_seed(FLAGS.tf_seed)
                self.controller = self.get_controller(self.env)
                self.model = self.controller.model
                self.controller.setup()
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    self.eval_controller = self.get_controller(self.eval_env)
                    self.eval_controller.setup(train=False)

                saver = tf.train.Saver(max_to_keep=10)
                step = self.model.global_step
                sv = tf.train.Supervisor(
                    logdir=FLAGS.save_dir,
                    is_chief=is_chief,
                    saver=saver,
                    save_model_secs=600,
                    summary_op=None,  # we define it ourselves
                    save_summaries_secs=60,
                    global_step=step,
                    init_fn=lambda sess: init_fn(sess, saver))
                sess = sv.PrepareSession(FLAGS.master)
        else:
            tf.set_random_seed(FLAGS.tf_seed)
            self.global_step = tf.contrib.framework.get_or_create_global_step()
            self.controller = self.get_controller(self.env)
            self.model = self.controller.model
            self.controller.setup()
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                self.eval_controller = self.get_controller(self.eval_env)
                self.eval_controller.setup(train=False)

            saver = tf.train.Saver(max_to_keep=10)
            sess = tf.Session()
            sess.run(tf.global_variables_initializer())
            init_fn(sess, saver)

        self.sv = sv
        self.sess = sess

        logging.info('hparams:\n%s', self.hparams_string())

        model_step = sess.run(self.model.global_step)
        if model_step >= self.num_steps:
            logging.info('training has reached final step')
            return

        losses = []
        rewards = []
        all_ep_rewards = []
        for step in range(1 + self.num_steps):

            if sv is not None and sv.ShouldStop():
                logging.info('stopping supervisor')
                break

            self.do_before_step(step)

            (loss, summary, total_rewards,
             episode_rewards) = self.controller.train(sess)
            _, greedy_episode_rewards = self.eval_controller.eval(sess)
            self.controller.greedy_episode_rewards = greedy_episode_rewards
            losses.append(loss)
            rewards.append(total_rewards)
            all_ep_rewards.extend(episode_rewards)

            if (random.random() < 0.1 and summary and episode_rewards
                    and is_chief and sv and sv._summary_writer):
                sv.summary_computed(sess, summary)

            model_step = sess.run(self.model.global_step)
            if is_chief and step % self.validation_frequency == 0:
                logging.info(
                    'at training step %d, model step %d: '
                    'avg loss %f, avg reward %f, '
                    'episode rewards: %f, greedy rewards: %f', step,
                    model_step, np.mean(losses), np.mean(rewards),
                    np.mean(all_ep_rewards), np.mean(greedy_episode_rewards))

                losses = []
                rewards = []
                all_ep_rewards = []

            if model_step >= self.num_steps:
                logging.info('training has reached final step')
                break

        if is_chief and sv is not None:
            logging.info('saving final model to %s', sv.save_path)
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
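The runner reads several command-line flags that are defined elsewhere in the file. A sketch of plausible definitions, using only the flag names the code above touches; the defaults and help strings are assumptions:

import tensorflow as tf

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer('task_id', 0, 'Task id of this replica; 0 is the chief.')
flags.DEFINE_boolean('supervisor', False, 'Whether to train under a tf.train.Supervisor.')
flags.DEFINE_integer('ps_tasks', 0, 'Number of parameter server tasks.')
flags.DEFINE_integer('tf_seed', 42, 'TensorFlow random seed.')
flags.DEFINE_string('save_dir', '', 'Directory for checkpoints and summaries.')
flags.DEFINE_string('load_path', '', 'Explicit checkpoint to restore when save_dir has none.')
flags.DEFINE_string('master', '', 'TensorFlow master to use.')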
Example #3
def Train(train_dir,
          model_str,
          train_data,
          max_steps,
          master='',
          task=0,
          ps_tasks=0,
          initial_learning_rate=0.001,
          final_learning_rate=0.001,
          learning_rate_halflife=160000,
          optimizer_type='Adam',
          num_preprocess_threads=1,
          reader=None):
    """Testable trainer with no dependence on FLAGS.

  Args:
    train_dir: Directory to write checkpoints.
    model_str: Network specification string.
    train_data: Training data file pattern.
    max_steps: Number of training steps to run.
    master: Name of the TensorFlow master to use.
    task: Task id of this replica running the training. (0 will be master).
    ps_tasks: Number of tasks in ps job, or 0 if no ps job.
    initial_learning_rate: Learning rate at start of training.
    final_learning_rate: Asymptotic minimum learning rate.
    learning_rate_halflife: Number of steps over which to halve the difference
      between initial and final learning rate.
    optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
    num_preprocess_threads: Number of input threads.
    reader: Function that returns an actual reader to read Examples from input
      files. If None, uses tf.TFRecordReader().
  """
    if master.startswith('local'):
        device = tf.train.replica_device_setter(ps_tasks)
    else:
        device = '/cpu:0'
    with tf.Graph().as_default():
        with tf.device(device):
            model = InitNetwork(train_data, model_str, 'train',
                                initial_learning_rate, final_learning_rate,
                                learning_rate_halflife, optimizer_type,
                                num_preprocess_threads, reader)

            # Create a Supervisor.  It will take care of initialization, summaries,
            # checkpoints, and recovery.
            #
            # When multiple replicas of this program are running, the first one,
            # identified by --task=0 is the 'chief' supervisor.  It is the only one
            # that takes care of initialization, etc.
            sv = tf.train.Supervisor(logdir=train_dir,
                                     is_chief=(task == 0),
                                     saver=model.saver,
                                     save_summaries_secs=10,
                                     save_model_secs=30,
                                     recovery_wait_secs=5)

            step = 0
            while step < max_steps:
                try:
                    # Get an initialized, and possibly recovered session.  Launch the
                    # services: Checkpointing, Summaries, step counting.
                    with sv.managed_session(master) as sess:
                        while step < max_steps:
                            _, step = model.TrainAStep(sess)
                            if sv.coord.should_stop():
                                break
                except tf.errors.AbortedError as e:
                    logging.error('Received error:%s', e)
                    continue
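The `learning_rate_halflife` argument describes an exponential approach from the initial toward the final learning rate. A small sketch of one plausible reading of that schedule; the values are illustrative, and this is an interpretation of the docstring, not necessarily how `InitNetwork` implements it:

def halflife_learning_rate(step, initial_lr=0.001, final_lr=0.0001, halflife=160000):
    """Halves the gap between the initial and final rate every `halflife` steps."""
    return final_lr + (initial_lr - final_lr) * 0.5 ** (step / float(halflife))

# At step == halflife the rate sits midway between the two:
# halflife_learning_rate(160000) == 0.00055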
Example #4
def RunComputation():
    """Main function which runs all TensorFlow computations."""

    # filename for saving file
    if FLAGS.architecture == '2 layer_stimulus':
        architecture_string = ('_architecture=' + str(FLAGS.architecture) +
                               '_stim_downsample_window=' +
                               str(FLAGS.stim_downsample_window) +
                               '_stim_downsample_stride=' +
                               str(FLAGS.stim_downsample_stride))
    else:
        architecture_string = ('_architecture=' + str(FLAGS.architecture))

    short_filename = ('model=' + str(FLAGS.model_id) + '_loss=' +
                      str(FLAGS.loss) + '_batch_sz=' + str(FLAGS.batchsz) +
                      '_lam_w=' + str(FLAGS.lam_w) + '_step_sz' +
                      str(FLAGS.step_sz) + '_tlen=' + str(FLAGS.train_len) +
                      '_window=' + str(FLAGS.window) + '_stride=' +
                      str(FLAGS.stride) + str(architecture_string) + '_jitter')

    # make a folder with name derived from parameters of the algorithm
    # it saves checkpoint files and summaries used in tensorboard
    parent_folder = FLAGS.save_location + FLAGS.folder_name + '/'
    # make folder if it does not exist
    if not gfile.IsDirectory(parent_folder):
        gfile.MkDir(parent_folder)
    FLAGS.save_location = parent_folder + short_filename + '/'
    print('Does the file exist?', gfile.IsDirectory(FLAGS.save_location))
    if not gfile.IsDirectory(FLAGS.save_location):
        gfile.MkDir(FLAGS.save_location)

    save_filename = FLAGS.save_location + short_filename
    """Main function which runs all TensorFlow computations."""
    with tf.Graph().as_default() as gra:
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            print(FLAGS.config_params)
            tf.logging.info(FLAGS.config_params)
            # set up training dataset
            tc_mean = get_data_mat.init_chunks(FLAGS.n_chunks)
            '''
      # plot histogram of a training dataset
      stim_train, resp_train, train_len = get_data_mat.get_stim_resp('train',
                                                                     num_chunks=FLAGS.num_chunks_to_load)
      plt.hist(np.ndarray.flatten(stim_train[:,:,0:]))
      plt.show()
      plt.draw()
      '''
            # Create computation graph.
            #
            # Graph should be fully constructed before you create supervisor.
            # Attempting to modify the graph after the supervisor is created
            # will cause an error.

            with tf.name_scope('model'):
                if FLAGS.architecture == '1 layer':
                    # single GPU model
                    if False:
                        global_step = tf.contrib.framework.create_global_step()
                        model, stim, resp = jitter_model.approximate_conv_jitter(
                            FLAGS.n_cells, FLAGS.lam_w, FLAGS.window,
                            FLAGS.stride, FLAGS.step_sz, tc_mean,
                            FLAGS.su_channels)

                    # multiGPU model
                    if True:
                        model, stim, resp, global_step = jitter_model.approximate_conv_jitter_multigpu(
                            FLAGS.n_cells, FLAGS.lam_w, FLAGS.window,
                            FLAGS.stride, FLAGS.step_sz, tc_mean,
                            FLAGS.su_channels, FLAGS.config_params)

                if FLAGS.architecture == '2 layer_stimulus':
                    # stimulus is first smoothened to lower dimensions, then same model is applied
                    print('Put stimulus to lower dimensions!')
                    model, stim, resp, global_step, stim_tuple = jitter_model.approximate_conv_jitter_multigpu_stim_lr(
                        FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                        FLAGS.step_sz, tc_mean, FLAGS.su_channels,
                        FLAGS.config_params, FLAGS.stim_downsample_window,
                        FLAGS.stim_downsample_stride)

            # Print the number of variables in graph
            print('Calculating model size')  # Hope we do not exceed memory
            PrintModelAnalysis(gra, max_depth=10)
            #import pdb; pdb.set_trace()

            # Builds our summary op.
            summary_op = model.merged_summary

            # Create a Supervisor.  It will take care of initialization, summaries,
            # checkpoints, and recovery.
            #
            # When multiple replicas of this program are running, the first one,
            # identified by --task=0 is the 'chief' supervisor.  It is the only one
            # that takes care of initialization, etc.
            is_chief = (FLAGS.task == 0)  # & (FLAGS.learn==1)
            print(save_filename)
            if FLAGS.learn == 1:
                # use supervisor only for learning,
                # otherwise it messes up data as it tries to store variables while you are doing analysis

                sv = tf.train.Supervisor(logdir=save_filename,
                                         is_chief=is_chief,
                                         saver=tf.train.Saver(),
                                         summary_op=None,
                                         save_model_secs=100,
                                         global_step=global_step,
                                         recovery_wait_secs=5)

                if (is_chief and FLAGS.learn == 1):
                    tf.train.write_graph(tf.get_default_graph().as_graph_def(),
                                         save_filename, 'graph.pbtxt')

                # Get an initialized, and possibly recovered session.  Launch the
                # services: Checkpointing, Summaries, step counting.
                #
                # When multiple replicas of this program are running the services are
                # only launched by the 'chief' replica.
                session_config = tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False)
                #import pdb; pdb.set_trace()
                sess = sv.PrepareSession(FLAGS.master, config=session_config)

                FitComputation(sv, sess, model, stim, resp, global_step,
                               summary_op, stim_tuple)
                sv.Stop()

            else:
                # if not learn, then analyse

                session_config = tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False)
                with tf.Session(config=session_config) as sess:
                    saver_var = tf.train.Saver(
                        tf.global_variables(),
                        keep_checkpoint_every_n_hours=float('inf'))
                    restore_file = tf.train.latest_checkpoint(save_filename)
                    print(restore_file)
                    start_iter = int(
                        restore_file.split('/')[-1].split('-')[-1])
                    saver_var.restore(sess, restore_file)

                    if FLAGS.architecture == '2 layer_stimulus':
                        AnalyseModel_lr(sess, model)
                    else:
                        AnalyseModel(sv, sess, model)
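The analysis branch recovers the training iteration from the checkpoint filename. With a hypothetical checkpoint path, the `start_iter` parse above behaves like this:

# Hypothetical path of the kind tf.train.latest_checkpoint(save_filename) returns.
restore_file = '/tmp/save_location/model.ckpt-120000'
start_iter = int(restore_file.split('/')[-1].split('-')[-1])
print(start_iter)  # 120000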
Example #5
def main(_):
    if not tf.gfile.Exists(FLAGS.train_log_dir):
        tf.gfile.MakeDirs(FLAGS.train_log_dir)

    config = configuration.Configuration()

    g = tf.Graph()
    with g.as_default():
        # If ps_tasks is zero, the local device is used. When using multiple
        # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
        # across the different devices.
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            if config.dataset == 'cub':
                images, gt_labels_list, _ = (cub_provider.provide_data(
                    'train',
                    FLAGS.batch_size,
                    split_type=config.split_type,
                    preprocess_options=config.preprocess_options,
                    image_resize=config.image_size,
                    shuffle_data=True,
                    use_inception_resnet_v2=config.cub_irv2_features,
                    box_cox_lambda=config.cub_box_cox_lambda,
                    categorical=config.cub_categorical,
                    classes_only=config.cub_classes_only,
                    skip_classes=config.cub_skip_classes,
                ))
                num_classes_per_attribute = config.num_classes_per_attribute
            else:
                images, gt_labels_list, _, _, num_classes_per_attribute = (
                    dataset_provider.provide_data(
                        'train',
                        FLAGS.batch_size,
                        split_type=config.split_type,
                        preprocess_options=config.preprocess_options,
                        grayscale=config.grayscale))

            attribute_label_map = label_map.LabelMap(config.label_map_json)

            assert attribute_label_map.count_labels == num_classes_per_attribute, (
                'Please check your label_map_file, to make sure it corresponds to '
                'the dataset being loaded.')

            # Define the model:
            if config.dataset == 'cub' and config.cub_irv2_features:
                logits_list = multi_attribute_net.mlp_multi_attribute_net(
                    images,
                    num_classes_per_attribute=num_classes_per_attribute,
                    attribute_names=attribute_label_map.attributes,
                    hidden_units=config.comprehensibility_hidden_units,
                    is_training=True)
            else:
                logits_list = multi_attribute_net.conv_multi_attribute_net(
                    images,
                    num_classes_per_attribute=num_classes_per_attribute,
                    attribute_names=attribute_label_map.attributes,
                    hidden_units=config.comprehensibility_hidden_units,
                    is_training=True)

            # Specify the loss function:
            for logits, sparse_gt_labels in zip(logits_list, gt_labels_list):
                tf.contrib.losses.add_loss(
                    tf.reduce_mean(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            labels=sparse_gt_labels, logits=logits)))
            total_loss = tf.contrib.losses.get_total_loss()
            tf.contrib.deprecated.scalar_summary('Total Loss', total_loss)

            # Specify the optimization scheme:
            optimizer = tf.train.GradientDescentOptimizer(config.learning_rate)

            # Set up training.
            train_op = slim.learning.create_train_op(total_loss, optimizer)

            # Run training.
            slim.learning.train(train_op=train_op,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                is_chief=FLAGS.task == 0,
                                number_of_steps=FLAGS.max_number_of_steps,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
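The loss section adds one sparse softmax cross-entropy term per attribute head and lets `tf.contrib.losses.get_total_loss` sum them. A minimal standalone sketch of the same pattern without the contrib loss collection; the batch size, class counts, and dummy tensors are hypothetical:

import tensorflow as tf

batch_size = 32
num_classes_per_attribute = [5, 3]  # hypothetical: two attribute heads

logits_list = [tf.random_normal([batch_size, n]) for n in num_classes_per_attribute]
gt_labels_list = [tf.zeros([batch_size], dtype=tf.int32) for _ in num_classes_per_attribute]

per_attribute_losses = [
    tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
    for logits, labels in zip(logits_list, gt_labels_list)
]
total_loss = tf.add_n(per_attribute_losses)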
Example #6
def RunComputation():

    # filename for saving files, derived from FLAGS.
    short_filename = get_filename()

    # make a folder with name derived from parameters of the algorithm
    # it saves checkpoint files and summaries used in tensorboard
    parent_folder = FLAGS.save_location + FLAGS.folder_name + '/'
    # make folder if it does not exist
    if not gfile.IsDirectory(parent_folder):
        gfile.MkDir(parent_folder)
    FLAGS.save_location = parent_folder + short_filename + '/'
    print('Does the file exist?', gfile.IsDirectory(FLAGS.save_location))
    if not gfile.IsDirectory(FLAGS.save_location):
        gfile.MkDir(FLAGS.save_location)
    save_filename = FLAGS.save_location + short_filename

    if FLAGS.learn == 0:
        # for analysis, use smaller batch sizes so that we can work with a single GPU.
        FLAGS.batchsz = 600

    #Set up tensorflow
    with tf.Graph().as_default() as gra:
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            print(FLAGS.config_params)
            tf.logging.info(FLAGS.config_params)

            # set up training dataset
            # tc_mean = get_data_mat.init_chunks(FLAGS.n_chunks) <- use this with old get_data_mat
            tc_mean = get_data_mat.init_chunks(FLAGS.batchsz)
            #plt.plot(tc_mean)
            #plt.show()
            #plt.draw()

            # Create computation graph.
            #
            # Graph should be fully constructed before you create supervisor.
            # Attempting to modify the graph after the supervisor is created
            # will cause an error.
            with tf.name_scope('model'):
                if FLAGS.architecture == '1 layer':
                    # single GPU model
                    if False:
                        global_step = tf.contrib.framework.create_global_step()
                        model, stim, resp = jitter_model.approximate_conv_jitter(
                            FLAGS.n_cells, FLAGS.lam_w, FLAGS.window,
                            FLAGS.stride, FLAGS.step_sz, tc_mean,
                            FLAGS.su_channels)

                    # multiGPU model
                    if True:
                        model, stim, resp, global_step = jitter_model.approximate_conv_jitter_multigpu(
                            FLAGS.n_cells, FLAGS.lam_w, FLAGS.window,
                            FLAGS.stride, FLAGS.step_sz, tc_mean,
                            FLAGS.su_channels, FLAGS.config_params)

                if FLAGS.architecture == '2 layer_stimulus':
                    # stimulus is first smoothened to lower dimensions, then same model is applied
                    print('First take a low resolution version of stimulus')
                    model, stim, resp, global_step, stim_tuple = (
                        jitter_model.approximate_conv_jitter_multigpu_stim_lr(
                            FLAGS.n_cells, FLAGS.lam_w, FLAGS.window,
                            FLAGS.stride, FLAGS.step_sz, tc_mean,
                            FLAGS.su_channels, FLAGS.config_params,
                            FLAGS.stim_downsample_window,
                            FLAGS.stim_downsample_stride))

                if FLAGS.architecture == 'complex':
                    print(' Multiple modifications over 2 layered model above')
                    model, stim, resp, global_step = (
                        jitter_model_2.
                        approximate_conv_jitter_multigpu_complex(
                            FLAGS.n_cells, FLAGS.lam_w, FLAGS.window,
                            FLAGS.stride, FLAGS.step_sz, tc_mean,
                            FLAGS.su_channels, FLAGS.config_params,
                            FLAGS.stim_downsample_window,
                            FLAGS.stim_downsample_stride))

            # Print the number of variables in graph
            print('Calculating model size')  # Hope we do not exceed memory
            PrintModelAnalysis(gra, max_depth=10)

            # Builds our summary op.
            summary_op = model.merged_summary

            # Create a Supervisor.  It will take care of initialization, summaries,
            # checkpoints, and recovery.
            #
            # When multiple replicas of this program are running, the first one,
            # identified by --task=0 is the 'chief' supervisor.  It is the only one
            # that takes care of initialization, etc.
            is_chief = (FLAGS.task == 0)  # & (FLAGS.learn==1)
            print(save_filename)

            if FLAGS.learn == 1:
                # use supervisor only for learning,
                # otherwise it messes up data as it tries to store variables while you are doing analysis

                sv = tf.train.Supervisor(logdir=save_filename,
                                         is_chief=is_chief,
                                         saver=tf.train.Saver(),
                                         summary_op=None,
                                         save_model_secs=100,
                                         global_step=global_step,
                                         recovery_wait_secs=5)

                if (is_chief and FLAGS.learn == 1):
                    # save graph only if task id =0 (is_chief) and learning the model
                    tf.train.write_graph(tf.get_default_graph().as_graph_def(),
                                         save_filename, 'graph.pbtxt')

                # Get an initialized, and possibly recovered session.  Launch the
                # services: Checkpointing, Summaries, step counting.
                #
                # When multiple replicas of this program are running the services are
                # only launched by the 'chief' replica.
                session_config = tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False)
                sess = sv.PrepareSession(FLAGS.master, config=session_config)

                # Finally, learn the parameters of the model
                FitComputation(sv, sess, model, stim, resp, global_step,
                               summary_op)
                sv.Stop()

            else:
                # Analyse the model

                session_config = tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False)
                with tf.Session(config=session_config) as sess:

                    # First, recover the model
                    saver_var = tf.train.Saver(
                        tf.global_variables(),
                        keep_checkpoint_every_n_hours=float('inf'))
                    restore_file = tf.train.latest_checkpoint(save_filename)
                    print(restore_file)
                    start_iter = int(
                        restore_file.split('/')[-1].split('-')[-1])
                    saver_var.restore(sess, restore_file)

                    # model specific analysis
                    if FLAGS.architecture == '2 layer_stimulus':
                        AnalyseModel_lr(sess, model)
                    elif FLAGS.architecture == 'complex':
                        AnalyseModel_complex(sess, model, stim, resp,
                                             save_filename)
                    else:
                        AnalyseModel(sv, sess, model)
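Examples #4 and #6 share the same chief/replica pattern: only the task-0 worker handles initialization, checkpoints, and summaries, while the other replicas wait for it. A stripped-down sketch of that pattern, assuming a hypothetical `build_model()` helper that returns a train op and a global-step tensor:

import tensorflow as tf

def run_training(build_model, logdir, master='', task=0, ps_tasks=0):
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(ps_tasks)):
            train_op, global_step = build_model()  # placeholder for model construction

            sv = tf.train.Supervisor(logdir=logdir,
                                     is_chief=(task == 0),
                                     saver=tf.train.Saver(),
                                     save_model_secs=100,
                                     global_step=global_step,
                                     recovery_wait_secs=5)
            # The chief initializes or recovers variables; other replicas wait for it.
            with sv.managed_session(master) as sess:
                while not sv.should_stop():
                    sess.run(train_op)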