def train(train_dir,
          examples_path,
          hparams,
          checkpoints_to_keep=5,
          num_steps=None,
          master='',
          task=0,
          num_ps_tasks=0):
  """Train loop."""
  tf.gfile.MakeDirs(train_dir)

  is_chief = task == 0

  _trial_summary(hparams, examples_path, train_dir)
  with tf.Graph().as_default():
    with tf.device(
        tf.train.replica_device_setter(num_ps_tasks, merge_devices=True)):
      transcription_data = _get_data(examples_path, hparams, is_training=True)

      loss, losses, unused_labels, unused_predictions, images = (
          model.get_model(transcription_data, hparams, is_training=True))

      tf.summary.scalar('loss', loss)
      for label, loss_collection in losses.items():
        loss_label = 'losses/' + label
        tf.summary.scalar(loss_label, tf.reduce_mean(loss_collection))
      for name, image in images.items():
        tf.summary.image(name, image)

      optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)

      train_op = slim.learning.create_train_op(
          loss,
          optimizer,
          clip_gradient_norm=hparams.clip_norm,
          summarize_gradients=True)

      logging_dict = {'global_step': tf.train.get_global_step(), 'loss': loss}
      hooks = [tf.train.LoggingTensorHook(logging_dict, every_n_iter=100)]
      if num_steps:
        hooks.append(tf.train.StopAtStepHook(num_steps))

      scaffold = tf.train.Scaffold(
          saver=tf.train.Saver(max_to_keep=checkpoints_to_keep))

      tf.contrib.training.train(
          train_op=train_op,
          logdir=train_dir,
          scaffold=scaffold,
          hooks=hooks,
          save_checkpoint_secs=300,
          master=master,
          is_chief=is_chief)

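
# A hedged driver sketch for the train() loop above, assuming a standalone
# script. The entry point, the /tmp paths and the HParams values are
# illustrative placeholders, not taken from the snippet; learning_rate and
# clip_norm are the only hparams fields visible above, and _get_data /
# model.get_model will read more in the real module.
import tensorflow as tf


def console_entry_point():
  hparams = tf.contrib.training.HParams(learning_rate=0.0006, clip_norm=3.0)
  train(train_dir='/tmp/onsets_frames_train',
        examples_path='/tmp/train_examples.tfrecord',
        hparams=hparams,
        num_steps=50000)


if __name__ == '__main__':
  console_entry_point()
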
def run(self):
  """Run training."""
  is_chief = FLAGS.task_id == 0 or not FLAGS.supervisor
  sv = None

  def init_fn(sess, saver):
    ckpt = None
    if FLAGS.save_dir and sv is None:
      load_dir = FLAGS.save_dir
      ckpt = tf.train.get_checkpoint_state(load_dir)
    if ckpt and ckpt.model_checkpoint_path:
      logging.info('restoring from %s', ckpt.model_checkpoint_path)
      saver.restore(sess, ckpt.model_checkpoint_path)
    elif FLAGS.load_path:
      logging.info('restoring from %s', FLAGS.load_path)
      saver.restore(sess, FLAGS.load_path)

  if FLAGS.supervisor:
    with tf.device(
        tf.train.replica_device_setter(FLAGS.ps_tasks, merge_devices=True)):
      self.global_step = tf.contrib.framework.get_or_create_global_step()
      tf.set_random_seed(FLAGS.tf_seed)
      self.controller = self.get_controller(self.env)
      self.model = self.controller.model
      self.controller.setup()
      with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        self.eval_controller = self.get_controller(self.eval_env)
        self.eval_controller.setup(train=False)

      saver = tf.train.Saver(max_to_keep=10)
      step = self.model.global_step
      sv = tf.train.Supervisor(
          logdir=FLAGS.save_dir,
          is_chief=is_chief,
          saver=saver,
          save_model_secs=600,
          summary_op=None,  # we define it ourselves
          save_summaries_secs=60,
          global_step=step,
          init_fn=lambda sess: init_fn(sess, saver))
      sess = sv.prepare_or_wait_for_session(FLAGS.master)
  else:
    tf.set_random_seed(FLAGS.tf_seed)
    self.global_step = tf.contrib.framework.get_or_create_global_step()
    self.controller = self.get_controller(self.env)
    self.model = self.controller.model
    self.controller.setup()
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      self.eval_controller = self.get_controller(self.eval_env)
      self.eval_controller.setup(train=False)

    saver = tf.train.Saver(max_to_keep=10)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess, saver)

  self.sv = sv
  self.sess = sess

  logging.info('hparams:\n%s', self.hparams_string())

  model_step = sess.run(self.model.global_step)
  if model_step >= self.num_steps:
    logging.info('training has reached final step')
    return

  losses = []
  rewards = []
  all_ep_rewards = []
  for step in range(1 + self.num_steps):
    if sv is not None and sv.should_stop():
      logging.info('stopping supervisor')
      break

    self.do_before_step(step)

    (loss, summary, total_rewards,
     episode_rewards) = self.controller.train(sess)
    _, greedy_episode_rewards = self.eval_controller.eval(sess)
    self.controller.greedy_episode_rewards = greedy_episode_rewards
    losses.append(loss)
    rewards.append(total_rewards)
    all_ep_rewards.extend(episode_rewards)

    if (random.random() < 0.1 and summary and episode_rewards and
        is_chief and sv and sv._summary_writer):
      sv.summary_computed(sess, summary)

    model_step = sess.run(self.model.global_step)
    if is_chief and step % self.validation_frequency == 0:
      logging.info(
          'at training step %d, model step %d: '
          'avg loss %f, avg reward %f, '
          'episode rewards: %f, greedy rewards: %f',
          step, model_step, np.mean(losses), np.mean(rewards),
          np.mean(all_ep_rewards), np.mean(greedy_episode_rewards))
      losses = []
      rewards = []
      all_ep_rewards = []

    if model_step >= self.num_steps:
      logging.info('training has reached final step')
      break

  if is_chief and sv is not None:
    logging.info('saving final model to %s', sv.save_path)
    sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

def Train(train_dir,
          model_str,
          train_data,
          max_steps,
          master='',
          task=0,
          ps_tasks=0,
          initial_learning_rate=0.001,
          final_learning_rate=0.001,
          learning_rate_halflife=160000,
          optimizer_type='Adam',
          num_preprocess_threads=1,
          reader=None):
  """Testable trainer with no dependence on FLAGS.

  Args:
    train_dir: Directory to write checkpoints.
    model_str: Network specification string.
    train_data: Training data file pattern.
    max_steps: Number of training steps to run.
    master: Name of the TensorFlow master to use.
    task: Task id of this replica running the training. (0 will be master).
    ps_tasks: Number of tasks in ps job, or 0 if no ps job.
    initial_learning_rate: Learning rate at start of training.
    final_learning_rate: Asymptotic minimum learning rate.
    learning_rate_halflife: Number of steps over which to halve the difference
      between initial and final learning rate.
    optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
    num_preprocess_threads: Number of input threads.
    reader: Function that returns an actual reader to read Examples from input
      files. If None, uses tf.TFRecordReader().
  """
  if master.startswith('local'):
    device = tf.train.replica_device_setter(ps_tasks)
  else:
    device = '/cpu:0'
  with tf.Graph().as_default():
    with tf.device(device):
      model = InitNetwork(train_data, model_str, 'train',
                          initial_learning_rate, final_learning_rate,
                          learning_rate_halflife, optimizer_type,
                          num_preprocess_threads, reader)

      # Create a Supervisor. It will take care of initialization, summaries,
      # checkpoints, and recovery.
      #
      # When multiple replicas of this program are running, the first one,
      # identified by --task=0, is the 'chief' supervisor. It is the only one
      # that takes care of initialization, etc.
      sv = tf.train.Supervisor(logdir=train_dir,
                               is_chief=(task == 0),
                               saver=model.saver,
                               save_summaries_secs=10,
                               save_model_secs=30,
                               recovery_wait_secs=5)

      step = 0
      while step < max_steps:
        try:
          # Get an initialized, and possibly recovered, session. Launch the
          # services: checkpointing, summaries, step counting.
          with sv.managed_session(master) as sess:
            while step < max_steps:
              _, step = model.TrainAStep(sess)
              if sv.coord.should_stop():
                break
        except tf.errors.AbortedError as e:
          logging.error('Received error: %s', e)
          continue

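
# A hedged sketch of the thin, FLAGS-based wrapper that a "testable trainer
# with no dependence on FLAGS" is typically paired with. The flag names and
# defaults below are assumptions for illustration, not taken from the snippet.
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('train_dir', '/tmp/street_train', 'Checkpoint directory.')
flags.DEFINE_string('model_str', '', 'Network specification string.')
flags.DEFINE_string('train_data', '', 'Training data file pattern.')
flags.DEFINE_integer('max_steps', 100000, 'Number of training steps to run.')
FLAGS = flags.FLAGS


def main(argv):
  del argv  # Unused.
  Train(FLAGS.train_dir, FLAGS.model_str, FLAGS.train_data, FLAGS.max_steps)


if __name__ == '__main__':
  tf.app.run()
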
def RunComputation():
  """Main function which runs all TensorFlow computations."""
  # Filename for saving checkpoints and summaries, derived from FLAGS.
  if FLAGS.architecture == '2 layer_stimulus':
    architecture_string = ('_architecture=' + str(FLAGS.architecture) +
                           '_stim_downsample_window=' +
                           str(FLAGS.stim_downsample_window) +
                           '_stim_downsample_stride=' +
                           str(FLAGS.stim_downsample_stride))
  else:
    architecture_string = '_architecture=' + str(FLAGS.architecture)
  short_filename = ('model=' + str(FLAGS.model_id) + '_loss=' +
                    str(FLAGS.loss) + '_batch_sz=' + str(FLAGS.batchsz) +
                    '_lam_w=' + str(FLAGS.lam_w) + '_step_sz' +
                    str(FLAGS.step_sz) + '_tlen=' + str(FLAGS.train_len) +
                    '_window=' + str(FLAGS.window) + '_stride=' +
                    str(FLAGS.stride) + str(architecture_string) + '_jitter')

  # Make a folder with a name derived from the parameters of the algorithm;
  # it stores checkpoint files and summaries used in TensorBoard.
  parent_folder = FLAGS.save_location + FLAGS.folder_name + '/'
  # Make the folder if it does not exist.
  if not gfile.IsDirectory(parent_folder):
    gfile.MkDir(parent_folder)
  FLAGS.save_location = parent_folder + short_filename + '/'
  print('Does the file exist?', gfile.IsDirectory(FLAGS.save_location))
  if not gfile.IsDirectory(FLAGS.save_location):
    gfile.MkDir(FLAGS.save_location)
  save_filename = FLAGS.save_location + short_filename

  with tf.Graph().as_default() as gra:
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      print(FLAGS.config_params)
      tf.logging.info(FLAGS.config_params)

      # Set up the training dataset.
      tc_mean = get_data_mat.init_chunks(FLAGS.n_chunks)

      # Plot a histogram of a training dataset (disabled).
      # stim_train, resp_train, train_len = get_data_mat.get_stim_resp(
      #     'train', num_chunks=FLAGS.num_chunks_to_load)
      # plt.hist(np.ndarray.flatten(stim_train[:, :, 0:]))
      # plt.show()
      # plt.draw()

      # Create the computation graph.
      #
      # The graph should be fully constructed before you create the
      # supervisor. An attempt to modify the graph after the supervisor is
      # created will cause an error.
      with tf.name_scope('model'):
        if FLAGS.architecture == '1 layer':
          # Single-GPU model.
          if False:
            global_step = tf.contrib.framework.create_global_step()
            model, stim, resp = jitter_model.approximate_conv_jitter(
                FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                FLAGS.step_sz, tc_mean, FLAGS.su_channels)

          # Multi-GPU model.
          if True:
            model, stim, resp, global_step = (
                jitter_model.approximate_conv_jitter_multigpu(
                    FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                    FLAGS.step_sz, tc_mean, FLAGS.su_channels,
                    FLAGS.config_params))

        if FLAGS.architecture == '2 layer_stimulus':
          # The stimulus is first smoothed to lower dimensions, then the same
          # model is applied.
          print('Put stimulus to lower dimensions!')
          model, stim, resp, global_step, stim_tuple = (
              jitter_model.approximate_conv_jitter_multigpu_stim_lr(
                  FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                  FLAGS.step_sz, tc_mean, FLAGS.su_channels,
                  FLAGS.config_params, FLAGS.stim_downsample_window,
                  FLAGS.stim_downsample_stride))

      # Print the number of variables in the graph.
      print('Calculating model size')  # Hope we do not exceed memory.
      PrintModelAnalysis(gra, max_depth=10)
      # import pdb; pdb.set_trace()

      # Build the summary op.
      summary_op = model.merged_summary

      # Create a Supervisor. It will take care of initialization, summaries,
      # checkpoints, and recovery.
      #
      # When multiple replicas of this program are running, the first one,
      # identified by --task=0, is the 'chief' supervisor. It is the only one
      # that takes care of initialization, etc.
      is_chief = (FLAGS.task == 0)  # & (FLAGS.learn == 1)
      print(save_filename)

      if FLAGS.learn == 1:
        # Use the supervisor only for learning; otherwise it messes up data
        # as it tries to store variables while you are doing analysis.
        sv = tf.train.Supervisor(logdir=save_filename,
                                 is_chief=is_chief,
                                 saver=tf.train.Saver(),
                                 summary_op=None,
                                 save_model_secs=100,
                                 global_step=global_step,
                                 recovery_wait_secs=5)

        if is_chief and FLAGS.learn == 1:
          tf.train.write_graph(tf.get_default_graph().as_graph_def(),
                               save_filename, 'graph.pbtxt')

        # Get an initialized, and possibly recovered, session. Launch the
        # services: checkpointing, summaries, step counting.
        #
        # When multiple replicas of this program are running, the services
        # are only launched by the 'chief' replica.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        # import pdb; pdb.set_trace()
        sess = sv.prepare_or_wait_for_session(FLAGS.master,
                                              config=session_config)
        FitComputation(sv, sess, model, stim, resp, global_step, summary_op,
                       stim_tuple)
        sv.stop()

      else:
        # If not learning, then analyse the fitted model.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        with tf.Session(config=session_config) as sess:
          saver_var = tf.train.Saver(
              tf.all_variables(), keep_checkpoint_every_n_hours=float('inf'))
          restore_file = tf.train.latest_checkpoint(save_filename)
          print(restore_file)
          start_iter = int(restore_file.split('/')[-1].split('-')[-1])
          saver_var.restore(sess, restore_file)

          if FLAGS.architecture == '2 layer_stimulus':
            AnalyseModel_lr(sess, model)
          else:
            AnalyseModel(sv, sess, model)

def main(_):
  if not tf.gfile.Exists(FLAGS.train_log_dir):
    tf.gfile.MakeDirs(FLAGS.train_log_dir)

  config = configuration.Configuration()

  g = tf.Graph()
  with g.as_default():
    # If ps_tasks is zero, the local device is used. When using multiple
    # (non-local) replicas, the replica device setter distributes the
    # variables across the different devices.
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      if config.dataset == 'cub':
        images, gt_labels_list, _ = cub_provider.provide_data(
            'train',
            FLAGS.batch_size,
            split_type=config.split_type,
            preprocess_options=config.preprocess_options,
            image_resize=config.image_size,
            shuffle_data=True,
            use_inception_resnet_v2=config.cub_irv2_features,
            box_cox_lambda=config.cub_box_cox_lambda,
            categorical=config.cub_categorical,
            classes_only=config.cub_classes_only,
            skip_classes=config.cub_skip_classes)
        num_classes_per_attribute = config.num_classes_per_attribute
      else:
        images, gt_labels_list, _, _, num_classes_per_attribute = (
            dataset_provider.provide_data(
                'train',
                FLAGS.batch_size,
                split_type=config.split_type,
                preprocess_options=config.preprocess_options,
                grayscale=config.grayscale))

      attribute_label_map = label_map.LabelMap(config.label_map_json)
      assert attribute_label_map.count_labels == num_classes_per_attribute, (
          'Please check your label_map_file, to make sure it corresponds to '
          'the dataset being loaded.')

      # Define the model:
      if config.dataset == 'cub' and config.cub_irv2_features:
        logits_list = multi_attribute_net.mlp_multi_attribute_net(
            images,
            num_classes_per_attribute=num_classes_per_attribute,
            attribute_names=attribute_label_map.attributes,
            hidden_units=config.comprehensibility_hidden_units,
            is_training=True)
      else:
        logits_list = multi_attribute_net.conv_multi_attribute_net(
            images,
            num_classes_per_attribute=num_classes_per_attribute,
            attribute_names=attribute_label_map.attributes,
            hidden_units=config.comprehensibility_hidden_units,
            is_training=True)

      # Specify the loss function:
      for logits, sparse_gt_labels in zip(logits_list, gt_labels_list):
        tf.contrib.losses.add_loss(
            tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=sparse_gt_labels, logits=logits)))

      total_loss = tf.contrib.losses.get_total_loss()
      tf.contrib.deprecated.scalar_summary('Total Loss', total_loss)

      # Specify the optimization scheme:
      optimizer = tf.train.GradientDescentOptimizer(config.learning_rate)

      # Set up training.
      train_op = slim.learning.create_train_op(total_loss, optimizer)

      # Run training.
      slim.learning.train(
          train_op=train_op,
          logdir=FLAGS.train_log_dir,
          master=FLAGS.master,
          is_chief=FLAGS.task == 0,
          number_of_steps=FLAGS.max_number_of_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs)

def RunComputation():
  """Main function which runs all TensorFlow computations."""
  # Filename for saving files, derived from FLAGS.
  short_filename = get_filename()

  # Make a folder with a name derived from the parameters of the algorithm;
  # it stores checkpoint files and summaries used in TensorBoard.
  parent_folder = FLAGS.save_location + FLAGS.folder_name + '/'
  # Make the folder if it does not exist.
  if not gfile.IsDirectory(parent_folder):
    gfile.MkDir(parent_folder)
  FLAGS.save_location = parent_folder + short_filename + '/'
  print('Does the file exist?', gfile.IsDirectory(FLAGS.save_location))
  if not gfile.IsDirectory(FLAGS.save_location):
    gfile.MkDir(FLAGS.save_location)
  save_filename = FLAGS.save_location + short_filename

  if FLAGS.learn == 0:
    # For analysis, use smaller batch sizes so that we can work with a single
    # GPU.
    FLAGS.batchsz = 600

  # Set up TensorFlow.
  with tf.Graph().as_default() as gra:
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      print(FLAGS.config_params)
      tf.logging.info(FLAGS.config_params)

      # Set up the training dataset.
      # tc_mean = get_data_mat.init_chunks(FLAGS.n_chunks)  # use with the old get_data_mat
      tc_mean = get_data_mat.init_chunks(FLAGS.batchsz)
      # plt.plot(tc_mean)
      # plt.show()
      # plt.draw()

      # Create the computation graph.
      #
      # The graph should be fully constructed before you create the
      # supervisor. An attempt to modify the graph after the supervisor is
      # created will cause an error.
      with tf.name_scope('model'):
        if FLAGS.architecture == '1 layer':
          # Single-GPU model.
          if False:
            global_step = tf.contrib.framework.create_global_step()
            model, stim, resp = jitter_model.approximate_conv_jitter(
                FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                FLAGS.step_sz, tc_mean, FLAGS.su_channels)

          # Multi-GPU model.
          if True:
            model, stim, resp, global_step = (
                jitter_model.approximate_conv_jitter_multigpu(
                    FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                    FLAGS.step_sz, tc_mean, FLAGS.su_channels,
                    FLAGS.config_params))

        if FLAGS.architecture == '2 layer_stimulus':
          # The stimulus is first smoothed to lower dimensions, then the same
          # model is applied.
          print('First take a low resolution version of stimulus')
          model, stim, resp, global_step, stim_tuple = (
              jitter_model.approximate_conv_jitter_multigpu_stim_lr(
                  FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                  FLAGS.step_sz, tc_mean, FLAGS.su_channels,
                  FLAGS.config_params, FLAGS.stim_downsample_window,
                  FLAGS.stim_downsample_stride))

        if FLAGS.architecture == 'complex':
          # Multiple modifications over the 2-layered model above.
          print('Multiple modifications over 2 layered model above')
          model, stim, resp, global_step = (
              jitter_model_2.approximate_conv_jitter_multigpu_complex(
                  FLAGS.n_cells, FLAGS.lam_w, FLAGS.window, FLAGS.stride,
                  FLAGS.step_sz, tc_mean, FLAGS.su_channels,
                  FLAGS.config_params, FLAGS.stim_downsample_window,
                  FLAGS.stim_downsample_stride))

      # Print the number of variables in the graph.
      print('Calculating model size')  # Hope we do not exceed memory.
      PrintModelAnalysis(gra, max_depth=10)

      # Build the summary op.
      summary_op = model.merged_summary

      # Create a Supervisor. It will take care of initialization, summaries,
      # checkpoints, and recovery.
      #
      # When multiple replicas of this program are running, the first one,
      # identified by --task=0, is the 'chief' supervisor. It is the only one
      # that takes care of initialization, etc.
      is_chief = (FLAGS.task == 0)  # & (FLAGS.learn == 1)
      print(save_filename)

      if FLAGS.learn == 1:
        # Use the supervisor only for learning; otherwise it messes up data
        # as it tries to store variables while you are doing analysis.
        sv = tf.train.Supervisor(logdir=save_filename,
                                 is_chief=is_chief,
                                 saver=tf.train.Saver(),
                                 summary_op=None,
                                 save_model_secs=100,
                                 global_step=global_step,
                                 recovery_wait_secs=5)

        if is_chief and FLAGS.learn == 1:
          # Save the graph only if the task id is 0 (chief) and we are
          # learning the model.
          tf.train.write_graph(tf.get_default_graph().as_graph_def(),
                               save_filename, 'graph.pbtxt')

        # Get an initialized, and possibly recovered, session. Launch the
        # services: checkpointing, summaries, step counting.
        #
        # When multiple replicas of this program are running, the services
        # are only launched by the 'chief' replica.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        sess = sv.prepare_or_wait_for_session(FLAGS.master,
                                              config=session_config)

        # Finally, learn the parameters of the model.
        FitComputation(sv, sess, model, stim, resp, global_step, summary_op)
        sv.stop()

      else:
        # Analyse the model.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        with tf.Session(config=session_config) as sess:
          # First, recover the model.
          saver_var = tf.train.Saver(
              tf.all_variables(), keep_checkpoint_every_n_hours=float('inf'))
          restore_file = tf.train.latest_checkpoint(save_filename)
          print(restore_file)
          start_iter = int(restore_file.split('/')[-1].split('-')[-1])
          saver_var.restore(sess, restore_file)

          # Model-specific analysis.
          if FLAGS.architecture == '2 layer_stimulus':
            AnalyseModel_lr(sess, model)
          elif FLAGS.architecture == 'complex':
            AnalyseModel_complex(sess, model, stim, resp, save_filename)
          else:
            AnalyseModel(sv, sess, model)
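
# Every snippet above builds its graph under tf.train.replica_device_setter.
# As a reference, here is a minimal, self-contained sketch (illustrative
# values only) of what that device function does when ps_tasks > 0: variables
# are assigned to the ps job round-robin, while ordinary ops stay on the
# worker job.
import tensorflow as tf

with tf.Graph().as_default():
  with tf.device(tf.train.replica_device_setter(ps_tasks=2)):
    v1 = tf.get_variable('v1', shape=[10])
    v2 = tf.get_variable('v2', shape=[10])
    total = v1 + v2
  print(v1.device)     # /job:ps/task:0
  print(v2.device)     # /job:ps/task:1
  print(total.device)  # /job:worker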