def tower_loss(scope): """Calculate the total loss on a single tower running the CIFAR model. Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build inference Graph. logits = cifar10.inference(images) # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = cifar10.loss(logits, labels) # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) # Calculate the total loss for the current tower. total_loss = tf.add_n(losses, name='total_loss') # Attach a scalar summary to all individual losses and the total loss; do the # same for the averaged version of the losses. for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) tf.contrib.deprecated.scalar_summary(loss_name, l) return total_loss
def tower_loss(scope): """Calculate the total loss on a single tower running the CIFAR model. Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build inference Graph. logits = cifar10.inference(images) # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = cifar10.loss(logits, labels) # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) # Calculate the total loss for the current tower. total_loss = tf.add_n(losses, name='total_loss') # Compute the moving average of all individual losses and the total loss. loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') loss_averages_op = loss_averages.apply(losses + [total_loss]) # Attach a scalar summary to all individual losses and the total loss; do the # same for the averaged version of the losses. for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. tf.scalar_summary(loss_name +' (raw)', l) tf.scalar_summary(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): total_loss = tf.identity(total_loss) return total_loss
def train():
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    images, labels = cifar10.distorted_inputs()
    logits = cifar10_resnet(images)
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)

    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()

    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    tf.train.start_queue_runners(sess=sess)

    for step in xrange(FLAGS.max_steps):
      _, loss_value = sess.run([train_op, loss])
      if step % 10 == 0:
        print('step %d, loss = %.3f' % (step, loss_value))
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference6(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def tower_loss(scope): """Calculate the total loss on a single tower running the CIFAR model. Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build inference Graph. logits = cifar10.inference(images) # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = cifar10.loss(logits, labels) # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) # Calculate the total loss for the current tower. total_loss = tf.add_n(losses, name='total_loss') # Compute the moving average of all individual losses and the total loss. loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') loss_averages_op = loss_averages.apply(losses + [total_loss]) # Attach a scalar summary to all individual losses and the total loss; do the # same for the averaged version of the losses. for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. tf.scalar_summary(loss_name + ' (raw)', l) tf.scalar_summary(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): total_loss = tf.identity(total_loss) return total_loss
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() print('Finished getting images & labels') # Build a Graph that computes the logits predictions from the # inference model. logits = modified_inference(images) print('Finished building inference graph') # Calculate loss. loss = cifar10.loss(logits, labels) print('Finished building loss graph') # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) print('Finished building train graph') class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 def before_run(self, run_context): self._step += 1 self._start_time = time.time() return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[ tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook() ], config=tf.ConfigProto(log_device_placement=FLAGS. log_device_placement)) as mon_sess: print('Hooks attached, starting training') while not mon_sess.should_stop(): mon_sess.run(train_op)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() #images, labels = cifar10.inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 def before_run(self, run_context): self._step += 1 self._start_time = time.time() return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook()], config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op)
def multilevel_train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    init = tf.initialize_all_variables()

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    # train_op = cifar10.train(loss, global_step)

    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Create a saver.
    # saver = tf.train.Saver(tf.all_variables())
    model_dir_exp = os.path.expanduser(
        "/home/chenz/Workspace/Analysis/Parameters")
    ckpt_file = "model.ckpt-1500"
    meta_file = "model.ckpt-1500.meta"
    saver = tf.train.import_meta_graph(
        os.path.join(model_dir_exp, meta_file))
    # saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))
    saver.restore(sess, os.path.join(model_dir_exp, ckpt_file))
    # saver = load_model("/home/chenz/Workspace/Analysis/Parameters",
    #                    "model.ckpt-1500.meta", "model.ckpt-1500")

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    # Start running operations on the Graph.
    # sess = tf.Session(config=tf.ConfigProto(
    #     log_device_placement=FLAGS.log_device_placement))
    # sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    loss_value = sess.run(loss)
    print(loss_value)
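# A minimal alternative sketch (an assumption, not the author's code): if the
# checkpoint "model.ckpt-1500" was written by this same model definition, the
# weights can be restored directly into the graph built above with a plain
# Saver, instead of importing a second copy of the graph from the .meta file.
saver = tf.train.Saver(tf.all_variables())
saver.restore(sess, os.path.join(model_dir_exp, "model.ckpt-1500"))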
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) images, labels = cifar10.distorted_inputs() logits = cifar10.inference(images) ## EDIT: Softmax activation softmax = tf.nn.softmax(logits) loss = cifar10.loss(softmax, labels) train_op = cifar10.train(loss, global_step) saver = tf.train.Saver(tf.all_variables()) summary_op = tf.merge_all_summaries() init = tf.initialize_all_variables() sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() print(labels) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 10 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') print("module save") saver.save(sess, checkpoint_path, global_step=step)
def test_read_cifar10():
  from tensorflow.models.image.cifar10 import cifar10

  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('my_train_dir', '../cifar10_model/model1',
                             """Directory where to write event logs """
                             """and checkpoint.""")

  cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.my_train_dir):
    tf.gfile.DeleteRecursively(FLAGS.my_train_dir)
  tf.gfile.MakeDirs(FLAGS.my_train_dir)

  with tf.Session() as sess:
    images, labels = cifar10.distorted_inputs()
    sess.run(tf.initialize_all_variables())
    # The input pipeline is fed by queue runners; without starting them the
    # sess.run() call below blocks forever.
    tf.train.start_queue_runners(sess=sess)
    a, b = sess.run([images, labels])
    print(len(a), len(a[0]))
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) #GETTING THE TRAINING IMAGES images, labels = cifar10.distorted_inputs() # DATA FOR GRAPH. logits = cifar10.inference(images) # LOSS FUNCTION loss = cifar10.loss(logits, labels) # CREATING AND RUNNING A TENSORBOARD GRAPH train_op = cifar10.train(loss, global_step) saver = tf.train.Saver(tf.all_variables()) summary_op = tf.merge_all_summaries() init = tf.initialize_all_variables() sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # START THE QUEUE tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # SAVE CHECKPOINT TO EVALUATE PERIODICALLY if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train():
  # ops
  global_step = tf.Variable(0, trainable=False)
  images, labels = cifar10.distorted_inputs()
  logits = cifar10.inference(
      tf.image.resize_images(images, cifar10.IMAGE_SIZE, cifar10.IMAGE_SIZE))
  loss = cifar10.loss(logits, labels)
  train_op = cifar10.train(loss, global_step)
  summary_op = tf.merge_all_summaries()

  with tf.Session() as sess:
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=21)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # restore or initialize variables
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
      saver.restore(sess, ckpt.model_checkpoint_path)
    else:
      sess.run(tf.initialize_all_variables())

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    start = sess.run(global_step)
    for step in xrange(start, FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      print('%d: %f (%.3f sec/batch)' % (step, loss_value, duration))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print('PS hosts are: %s' % ps_hosts)
  print('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  """Train CIFAR-10 for a number of steps."""
  cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  device_setter = tf.train.replica_device_setter(cluster=cluster)
  with tf.device(device_setter):
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)
    opt = tf.train.GradientDescentOptimizer(lr)

    # Track the moving averages of all trainable variables.
    exp_moving_averager = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (
        tf.trainable_variables() + tf.moving_average_variables())

    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=len(worker_hosts),
        replica_id=FLAGS.task_id,
        total_num_replicas=len(worker_hosts),
        variable_averages=exp_moving_averager,
        variables_to_average=variables_to_average)

    # Compute gradients with respect to the loss.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

    with tf.control_dependencies([apply_gradients_op]):
      train_op = tf.identity(loss, name='train_op')

    chief_queue_runners = [opt.get_chief_queue_runner()]
    init_tokens_op = opt.get_init_tokens_op()

    saver = tf.train.Saver()
    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             init_op=tf.initialize_all_variables(),
                             summary_op=tf.merge_all_summaries(),
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=60)
    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print('Started %d queues for processing input data.'
          % len(queue_runners))

    sv.start_queue_runners(sess, chief_queue_runners)
    sess.run(init_tokens_op)

    print('Start training')
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, gs = sess.run([train_op, loss, global_step])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d (global_step %d), loss = %.2f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (datetime.now(), step, gs, loss_value,
                            examples_per_sec, sec_per_batch))

  if is_chief:
    saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'),
               global_step=global_step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) if FLAGS.checkpoint_dir is not None: ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) print("checkpoint path is %s" % ckpt.model_checkpoint_path) tf.train.Saver().restore(sess, ckpt.model_checkpoint_path) # Start the queue runners. print("FLAGS.checkpoint_dir is %s" % FLAGS.checkpoint_dir) tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) cur_step = sess.run(global_step) print("current step is %s" % cur_step) interrupt_check_duration = 0.0 elapsed_time = time.time() flag = 0 for step in xrange(cur_step, FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time interrupt_check_duration += duration if float(interrupt_check_duration) > 5.0: print("checking for interruption: %s", interrupt_check_duration) if decision_for_migration(): print("have to migrate") checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') print("checkpoint path is %s" % checkpoint_path) saver.save(sess, checkpoint_path, global_step=step) random_id = generate_random_prefix() start_new_instance(checkpoint_path, step, random_id) upload_checkpoint_to_s3(checkpoint_path, step, "mj-bucket-1", random_id) break else: print("not interrupted") interrupt_check_duration = 0.0 assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) elapsed = (int(time.time() - elapsed_time)) if elapsed % 300 == 0 and flag == 0: print("uploading current status") uploading_current_status_to_rds(step) flag = 1 elif elapsed % 300 != 0 and flag == 1: flag = 0
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): with tf.variable_scope("model") as scope: global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() images_eval, labels_eval = cifar10.inputs(eval_data=True) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) scope.reuse_variables() logits_eval = cifar10.inference(images_eval) # Calculate loss. loss = cifar10.loss(logits, labels) # For evaluation top_k = tf.nn.in_top_k(logits, labels, 1) top_k_eval = tf.nn.in_top_k(logits_eval, labels_eval, 1) # Add precision summary summary_train_prec = tf.placeholder(tf.float32) summary_eval_prec = tf.placeholder(tf.float32) tf.scalar_summary('precision/train', summary_train_prec) tf.scalar_summary('precision/eval', summary_eval_prec) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan( loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) EVAL_STEP = 10 EVAL_NUM_EXAMPLES = 1024 if step % EVAL_STEP == 0: prec_train = evaluate_set(sess, top_k, EVAL_NUM_EXAMPLES) prec_eval = evaluate_set(sess, top_k_eval, EVAL_NUM_EXAMPLES) print('%s: precision train = %.3f' % (datetime.now(), prec_train)) print('%s: precision eval = %.3f' % (datetime.now(), prec_eval)) if step % 100 == 0: summary_str = sess.run(summary_op, feed_dict={ summary_train_prec: prec_train, summary_eval_prec: prec_eval }) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print('PS hosts are: %s' % ps_hosts)
  print('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  device_setter = tf.train.replica_device_setter(ps_tasks=1)
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    with tf.device(device_setter):
      global_step = tf.Variable(0, trainable=False)

      # Get images and labels for CIFAR-10.
      images, labels = cifar10.distorted_inputs()

      # Build a Graph that computes the logits predictions from the
      # inference model.
      logits = cifar10.inference(images)

      # Calculate loss.
      loss = cifar10.loss(logits, labels)
      train_op = cifar10.train(loss, global_step)
      saver = tf.train.Saver()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=tf.initialize_all_variables(),
                               summary_op=tf.merge_all_summaries(),
                               global_step=global_step,
                               saver=saver,
                               save_model_secs=60)
      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      print("Before session init")
      # Get a session.
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
      print("Session init done")

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      print('Started %d queues for processing input data.'
            % len(queue_runners))

      """Train CIFAR-10 for a number of steps."""
      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value, gs = sess.run([train_op, loss, global_step])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d (global_step %d), loss = %.2f '
                        '(%.1f examples/sec; %.3f sec/batch)')
          print(format_str % (datetime.now(), step, gs, loss_value,
                              examples_per_sec, sec_per_batch))

      if is_chief:
        saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
                     ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')

cifar10.maybe_download_and_extract()

with tf.Graph().as_default(), tf.device('/cpu:0'):
  opt = tf.train.AdamOptimizer(1e-4)

  # Calculate the gradients for each model tower.
  tower_grads = []
  for i in xrange(NUM_GPU):
    with tf.device('/gpu:%d' % i):
      with tf.name_scope('%s_%d' % ('tower', i)) as scope:
        # Includes the train-time augmentation code; it lives in
        # tensorflow/models/image/cifar10.py.
        images, labels = cifar10.distorted_inputs()

        # conv1. A short layer stack is used to keep the example easy to follow.
        with tf.variable_scope('h_conv1') as scope:
          W_conv1 = weight_variable([5, 5, 3, 64], 'W_conv1')
          b_conv1 = bias_variable([64], 'b_conv1')
          h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1,
                               name=scope.name)

        # pool1
        h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 3, 3, 1],
                                 strides=[1, 2, 2, 1], padding='SAME',
                                 name='h_pool1')
def main(unused_argv):
  cifar10.maybe_download_and_extract()
  if FLAGS.download_only:
    sys.exit(0)

  if FLAGS.job_name is None or FLAGS.job_name == "":
    raise ValueError("Must specify an explicit `job_name`")
  if FLAGS.task_index is None or FLAGS.task_index == "":
    raise ValueError("Must specify an explicit `task_index`")

  print("job name = %s" % FLAGS.job_name)
  print("task index = %d" % FLAGS.task_index)

  # Construct the cluster and start the server.
  ps_spec = FLAGS.ps_hosts.split(",")
  worker_spec = FLAGS.worker_hosts.split(",")

  # Approximation layers.
  approx_layers = FLAGS.layers_to_train.split(",")
  len_approx_layers = len(approx_layers)

  # Get the number of workers.
  num_workers = len(worker_spec)
  num_ps = len(ps_spec)

  cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

  if not FLAGS.existing_servers:
    # Not using existing servers. Create an in-process server.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
      server.join()

  is_chief = (FLAGS.task_index == 0)

  if FLAGS.num_gpus > 0:
    if FLAGS.num_gpus < num_workers:
      raise ValueError("number of gpus is less than number of workers")
    # Avoid gpu allocation conflict: now allocate task_num -> #gpu
    # for each worker in the corresponding machine.
    gpu = (FLAGS.task_index % FLAGS.num_gpus)
    worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
  elif FLAGS.num_gpus == 0:
    # Just allocate the CPU to the worker server.
    cpu = 0
    worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)

  # The device setter will automatically place Variables ops on separate
  # parameter servers (ps). The non-Variable ops will be placed on the workers.
  # The ps use CPU and workers use the corresponding GPU.
  with tf.device(
      tf.train.replica_device_setter(
          worker_device=worker_device,
          ps_device="/job:ps/cpu:0",
          cluster=cluster,
          ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
              num_ps, tf.contrib.training.byte_size_load_fn))):
    global_step = tf.Variable(0, name="global_step", trainable=False)
    # variables_to_update = tf.Placeholder(, name="variables_to_update")

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    # train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Variables that affect learning rate.
    num_batches_per_epoch = 50000 / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * 350)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(0.1,
                                    global_step,
                                    decay_steps,
                                    0.1,
                                    staircase=True)
    opt = tf.train.GradientDescentOptimizer(lr)

    if FLAGS.sync_replicas:
      if FLAGS.replicas_to_aggregate is None:
        replicas_to_aggregate = num_workers
      else:
        replicas_to_aggregate = FLAGS.replicas_to_aggregate

      opt = tf.train.SyncReplicasOptimizerV2(
          opt,
          replicas_to_aggregate=replicas_to_aggregate,
          total_num_replicas=num_workers,
          name="cifar10_sync_replicas")

    # trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    train_step = opt.minimize(loss, global_step=global_step)

    # Approximation Training
    var_list = []
    for i in range(len_approx_layers):
      var_list = var_list + tf.get_collection(
          tf.GraphKeys.TRAINABLE_VARIABLES, scope=approx_layers[i])
    train_step_approx = opt.minimize(loss,
                                     global_step=global_step,
                                     var_list=var_list)

    if FLAGS.sync_replicas:
      local_init_op = opt.local_step_init_op
      if is_chief:
        local_init_op = opt.chief_init_op
      ready_for_local_init_op = opt.ready_for_local_init_op

      # Initial token and chief queue runners required by the sync_replicas mode.
      chief_queue_runner = opt.get_chief_queue_runner()
      sync_init_op = opt.get_init_tokens_op()

    init_op = tf.global_variables_initializer()
    train_dir = tempfile.mkdtemp(dir="/mnt",
                                 suffix="data",
                                 prefix="cifar10_train")

    if FLAGS.sync_replicas:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          local_init_op=local_init_op,
          saver=None,
          summary_op=summary_op,
          save_summaries_secs=120,
          save_model_secs=600,
          checkpoint_basename='model.ckpt',
          ready_for_local_init_op=ready_for_local_init_op,
          recovery_wait_secs=1,
          global_step=global_step)
    else:
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=train_dir,
                               init_op=init_op,
                               saver=None,
                               summary_op=summary_op,
                               save_summaries_secs=120,
                               save_model_secs=600,
                               checkpoint_basename='model.ckpt',
                               recovery_wait_secs=1,
                               global_step=global_step)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps",
                        "/job:worker/task:%d" % FLAGS.task_index])

    # The chief worker (task_index==0) session will prepare the session,
    # while the remaining workers will wait for the preparation to complete.
    if is_chief:
      print("Worker %d: Initializing session..." % FLAGS.task_index)
    else:
      print("Worker %d: Waiting for session to be initialized..." %
            FLAGS.task_index)

    if FLAGS.existing_servers:
      server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
      print("Using existing server at: %s" % server_grpc_url)
      sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                            config=sess_config)
    else:
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

    print("Worker %d: Session initialization complete." % FLAGS.task_index)

    if FLAGS.sync_replicas and is_chief:
      # Chief worker will start the chief queue runner and call the init op.
      sess.run(sync_init_op)
      sv.start_queue_runners(sess, [chief_queue_runner])

    # Restore from Checkpoint
    if FLAGS.checkpoint_restore > 0:
      checkpoint_directory = FLAGS.checkpoint_dir + str(
          FLAGS.checkpoint_restore)
      ckpt = tf.train.get_checkpoint_state(checkpoint_directory)
      if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint.
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Assuming model_checkpoint_path looks something like:
        #   /my-favorite-path/cifar10_train/model.ckpt-0,
        # extract global_step from it.
        # global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
      else:
        print('No checkpoint file found')
        return

    # Perform training.
    time_begin = time.time()
    print("Training begins @ %f" % time_begin)

    local_step = 0
    num_examples_per_step = 128

    f = open('/mnt/train_output.log', 'w')
    # f.write("Training begins @ " + str(time_begin) + "\n")
    f.write("Duration\tWorker\tLocalStep\tGlobalStep\tLoss\tExamplesPerSec\n")
    f.close()

    last = time_begin
    while True:
      start_time = time.time()

      if local_step < FLAGS.approx_step:
        _, step, loss_value = sess.run([train_step, global_step, loss])
      else:
        if local_step % FLAGS.approx_interval == 0:
          _, step, loss_value = sess.run(
              [train_step_approx, global_step, loss])
        else:
          _, step, loss_value = sess.run([train_step, global_step, loss])

      duration = time.time() - start_time
      local_step += 1

      if local_step % 10 == 0:
        now = time.time()
        examples_per_sec = 10 * num_examples_per_step / (now - last)
        print("%f: Worker %d: step %d (global step: %d of %d) "
              "loss = %.2f examples_per_sec = %.2f \n" %
              (now - last, FLAGS.task_index, local_step, step,
               FLAGS.train_steps, loss_value, examples_per_sec))
        f = open('/mnt/train_output.log', 'a')
        f.write(str(now - last) + "\t" + str(FLAGS.task_index) + "\t" +
                str(local_step) + "\t" + str(step) + "\t" +
                str(loss_value) + "\t" + str(examples_per_sec) + "\n")
        f.close()
        last = now

      if step >= FLAGS.train_steps:
        break

      if sv.should_stop():
        print('Stopped due to abort')
        break

      # Save the model checkpoint periodically.
      # if is_chief and (step % 1000 == 0 or (step + 1) == FLAGS.train_steps):
      if step % 1000 == 0 or (step + 1) == FLAGS.train_steps:
        print('Taking a Checkpoint @ Global Step ' + str(step))
        checkpoint_dir = "/mnt/checkpoint" + str(step)
        if tf.gfile.Exists(checkpoint_dir):
          tf.gfile.DeleteRecursively(checkpoint_dir)
        tf.gfile.MakeDirs(checkpoint_dir)
        checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")
        saver.save(sess, checkpoint_path, global_step=step)

    time_end = time.time()
    print("Training ends @ %f" % time_end)

    f = open('/mnt/train_output.log', 'a')
    # f.write("Training ends @ " + str(time_end) + "\n")
    training_time = time_end - time_begin
    print("Training elapsed time: %f s" % training_time)
    f.write("Training elapsed time: " + str(training_time) + " s\n")
    f.close()
FLAGS = tf.app.flags.FLAGS

# import cifar10 data
from tensorflow.models.image.cifar10 import cifar10
cifar10.maybe_download_and_extract()

# global variable to select which (and how many) GPU's to use
# (tensorflow can be hungry with resources if not properly controlled)
gpus_to_use = [3]

# network input (data and correct labels)
# x = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
# y_ = tf.placeholder(tf.float32, shape=[None, 10])
train_images, train_labels = cifar10.distorted_inputs()
test_images, test_labels = cifar10.inputs(eval_data=True)

# select stream to use (train or test)
select_test = tf.placeholder(dtype=bool, shape=[], name='select_test')
x = tf.cond(select_test, lambda: test_images, lambda: train_images)
y_ = tf.cond(select_test, lambda: test_labels, lambda: train_labels)
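# A minimal usage sketch, assuming the rest of the graph defines `train_step`
# and `accuracy` ops (they are not part of the snippet above): the select_test
# placeholder switches the same graph between the two input streams at run time.
with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  tf.train.start_queue_runners(sess=sess)
  # Train on the distorted training stream.
  sess.run(train_step, feed_dict={select_test: False})
  # Evaluate on the clean test stream without rebuilding the graph.
  acc = sess.run(accuracy, feed_dict={select_test: True})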
def train(): print("\nSource code of training file {}:\n\n{}".format(__file__, open(__file__).read())) log('loading CIFAR') # Import data training_batch = cifar10.distorted_inputs() lm = LayerManager(forward_biased_estimate=False) batch = tf.Variable(0) with tf.name_scope('input'): fed_input_data = tf.placeholder(tf.float32, [None, IMAGE_SIZE, IMAGE_SIZE, 3]) fed_input_labels = tf.placeholder(tf.int32, [None]) drop_probs = [tf.Variable(tf.constant(DEFAULT_KEEP_PROB, shape=[1, 1, 1, ], dtype=tf.float32), trainable=False, collections=['Dropout']) for _ in range(NUM_DROPOUT_LAYERS)] with tf.name_scope('posterior'): training_batch_error, _, _, _ = full_model(lm, drop_probs, *training_batch) training_merged = lm.summaries.merge_all_summaries() lm.is_training = False tf.get_variable_scope().reuse_variables() lm.summaries.reset() with tf.name_scope('test'): _, test_percent_error, _, _ = full_model(lm, drop_probs, *cifar10.inputs(eval_data=True)) with tf.name_scope('forward'): _, _, forward_per_example_error, forward_incorrect_examples = full_model(lm, drop_probs, fed_input_data, fed_input_labels) def compute_test_percent_error(): return numpy.mean([sess.run([test_percent_error]) for _ in range(int(numpy.ceil(FLAGS.num_test_examples / FLAGS.batch_size)))]) saver = tf.train.Saver(tf.trainable_variables() + tf.get_collection('BatchNormInternal')) learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, batch, 5000, 0.8, staircase=True) train_step = tf.train.AdamOptimizer(learning_rate).minimize(training_batch_error, global_step=batch, var_list=lm.filter_factory.variables + lm.weight_factory.variables + lm.bias_factory.variables + lm.scale_factory.variables) fed_drop_probs = tf.placeholder(tf.float32, [None, None, None, None]) update_drop_probs = [tf.assign(drop_prob, fed_drop_probs, validate_shape=False) for drop_prob in drop_probs] with tf.Session() as sess: sess.run(tf.initialize_all_variables()) sess.run(tf.initialize_variables(tf.get_collection('BatchNormInternal'))) sess.run(tf.initialize_variables(tf.get_collection('Dropout'))) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) if TRAIN: train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph) # test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test') try: log('starting training') for i in range(FLAGS.max_steps): if i % 1000 == 999: # Do test set err = compute_test_percent_error() for j in range(NUM_DROPOUT_LAYERS): sess.run([update_drop_probs[j]], feed_dict={fed_drop_probs: [[[[1.0]]]]}) det_err = compute_test_percent_error() for j in range(NUM_DROPOUT_LAYERS): sess.run([update_drop_probs[j]], feed_dict={fed_drop_probs: [[[[DEFAULT_KEEP_PROB]]]]}) log('batch %s: Random test classification error = %s%%, deterministic test classification error = %s%%' % (i, err, det_err)) if i % 100 == 99: # Record a summary run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, _ = sess.run([training_merged, train_step], options=run_options, run_metadata=run_metadata) train_writer.add_summary(summary, i) train_writer.add_run_metadata(run_metadata, 'batch%d' % i) else: sess.run([train_step]) finally: log('saving') saver.save(sess, FLAGS.train_dir, global_step=batch) log('done') else: restore_latest(saver, sess, '/tmp/derandomizing_dropout', suffix='-100000') if DERANDOMIZE_DROPOUT: # NUM_RUNS = 10 # runs = [] # for _ in range(NUM_RUNS): # new_output_probs, = sess.run([forward_output], feed_dict={fed_input_data: 
mnist.train.images, fed_input_labels: mnist.train.labels}) # new_output = numpy.argmax(new_output_probs, 1) # runs.append(new_output) # # all_runs = numpy.vstack(runs).T # entropy = numpy.array([scipy.stats.entropy(numpy.bincount(row), base=2.0) for row in all_runs]) derandomized_drop_probs = [DEFAULT_KEEP_PROB * numpy.ones((1, HIDDEN_LAYER_SIZE)) for _ in range(NUM_DROPOUT_LAYERS)] num_tests_performed = 0 for pass_count in range(1): for j in range(HIDDEN_LAYER_SIZE): for i in range(NUM_DROPOUT_LAYERS): # range(NUM_DROPOUT_LAYERS-1,-1,-1): if derandomized_drop_probs[i][0, j] == 0.0 or derandomized_drop_probs[i][0, j] == 1.0: continue num_tests_performed += 1 for k in range(NUM_DROPOUT_LAYERS): if k == i: # curr_drop_probs = numpy.tile(derandomized_drop_probs[i], (BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE, 1)) # to_randomize = HIDDEN_LAYER_SIZE - j - 1 # randperms = numpy.argsort(numpy.random.rand(BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE, to_randomize), axis=1) # # to_keep = max(int(HIDDEN_LAYER_SIZE*DEFAULT_KEEP_PROB-derandomized_drop_probs[i][:j].sum()), 1) # curr_drop_probs[:, j+1:] = (randperms < to_keep) curr_drop_probs = (numpy.random.rand(BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE, HIDDEN_LAYER_SIZE) < derandomized_drop_probs[i]).astype(numpy.float32) curr_drop_probs[:, j] = 0.0 # curr_drop_probs[:, j+1:j+2] = 1.0 sess.run([update_drop_probs[i]], feed_dict={fed_drop_probs: curr_drop_probs}) else: sess.run([update_drop_probs[k]], feed_dict={fed_drop_probs: numpy.random.rand(BATCHES_PER_DERANDOMIZE_STEP * BATCH_SIZE, HIDDEN_LAYER_SIZE) < derandomized_drop_probs[k]}) #indices = numpy.argmax(entropy[:, numpy.newaxis] + -numpy.log(-numpy.log(numpy.random.rand(entropy.shape[0], BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE))), axis=0) # indices = [numpy.argmax(1000*entropy + -numpy.log(-numpy.log(numpy.random.rand(*entropy.shape)))) for _ in range(BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE)] # examples = mnist.train.images[indices, :] # labels = mnist.train.labels[indices] # Collect a bunch of 64-example batches together examples, labels = [numpy.concatenate(things, axis=0) for things in zip(*[sess.run(training_batch) for _ in range(BATCHES_PER_DERANDOMIZE_STEP)])] # Might want to use cross entropy, but why not not use percent error since we're not differentiating? 
# Using "test" expressions so we can manually feed in data, but we are feeding training data (same data for obj0 and obj1) err0, cross_entropies0 = sess.run([forward_incorrect_examples, forward_per_example_error], feed_dict={fed_input_data: examples, fed_input_labels: labels}) curr_drop_probs[:, j] = 1.0 # curr_drop_probs[:, j+1:] = (randperms < to_keep - 1) # curr_drop_probs[:, j+1:j+2] = 0.0 sess.run([update_drop_probs[i]], feed_dict={fed_drop_probs: curr_drop_probs}) err1, cross_entropies1 = sess.run([forward_incorrect_examples, forward_per_example_error], feed_dict={fed_input_data: examples, fed_input_labels: labels}) # One-sided paired-sample t-test cross_entropy_diff = cross_entropies0 - cross_entropies1 t = numpy.sqrt(BATCHES_PER_DERANDOMIZE_STEP * BATCH_SIZE)*cross_entropy_diff.mean()/cross_entropy_diff.std(ddof=1) p = scipy.stats.t.sf(-t, df=BATCHES_PER_DERANDOMIZE_STEP * BATCH_SIZE - 1) b = numpy.sum(err0 & ~err1) c = numpy.sum(err1 & ~err0) # if b + c < BINOMIAL_TEST_CUTOFF: # p = 0.5 # stat_message = "too small" # else: # # McNemar's test # if b + c >= CHI2_TEST_CUTOFF: # chi2 = (b-c)**2/(b+c) # p = scipy.stats.distributions.chi2.sf(chi2, df=1) # Two-sided # else: # p = scipy.stats.binom_test([b,c]) - scipy.stats.binom.pmf(b, b+c, 0.5) # Mid-p test # # Form one-sided p-value # if b > c: # p = 1-0.5*p # else: # p = 0.5*p # if b + c >= CHI2_TEST_CUTOFF: # stat_message = "p = %.4f, chi square test" % p # else: # stat_message = "p = %.4f, binomial mid-p test" % p if p < SIGNIFICANCE_LEVEL: # cross_entropies0.mean() <= cross_entropies1.mean(): # b <= c: new_drop_prob = 0.0 neuron_status = "drop" elif p > 1 - SIGNIFICANCE_LEVEL: new_drop_prob = 1.0 neuron_status = "keep" else: new_drop_prob = DEFAULT_KEEP_PROB neuron_status = "hmmm" #log(neuron_status + ' L{} N{}: b + c = {}, {}'.format(i, j, b+c, stat_message)) log(neuron_status + ' P{} L{} N{}: b = {}, c = {}, p = {}'.format(pass_count, i, j, b, c, p)) derandomized_drop_probs[i][0, j] = new_drop_prob for i in range(NUM_DROPOUT_LAYERS): num_dropped = (derandomized_drop_probs[i] == 0.0).sum() num_kept = (derandomized_drop_probs[i] == 1.0).sum() num_hmmm = HIDDEN_LAYER_SIZE - num_dropped - num_kept sess.run([update_drop_probs[i]], feed_dict={fed_drop_probs: numpy.ceil(derandomized_drop_probs[i])}) log('layer {}: {} neurons dropped, {} kept, {} undecided'.format(i, num_dropped, num_kept, num_hmmm)) log('Performed {} statistical tests'.format(num_tests_performed)) log('saving') saver.save(sess, FLAGS.train_dir, global_step=batch+1) log('done') else: restore_latest(saver, sess, '/tmp/derandomizing_dropout', suffix='-100001') err, = compute_test_percent_error() log('Test classification error = %s%%' % err) coord.request_stop() coord.join(threads) sess.close()
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir) summary_writer0 = tf.train.SummaryWriter(FLAGS.train_dir0) summary_writer1 = tf.train.SummaryWriter(FLAGS.train_dir1) summary_writer2 = tf.train.SummaryWriter(FLAGS.train_dir2) summary_writer3 = tf.train.SummaryWriter(FLAGS.train_dir3) summary_writer4 = tf.train.SummaryWriter(FLAGS.train_dir4) summary_writer5 = tf.train.SummaryWriter(FLAGS.train_dir5) summary_writer6 = tf.train.SummaryWriter(FLAGS.train_dir6) summary_writer7 = tf.train.SummaryWriter(FLAGS.train_dir7) summary_writer8 = tf.train.SummaryWriter(FLAGS.train_dir8) summary_writer9 = tf.train.SummaryWriter(FLAGS.train_dir9) summary_writer10 = tf.train.SummaryWriter(FLAGS.train_dir10) summary_writer11 = tf.train.SummaryWriter(FLAGS.train_dir11) summary_writer12 = tf.train.SummaryWriter(FLAGS.train_dir12) summary_writer13 = tf.train.SummaryWriter(FLAGS.train_dir13) summary_writer14 = tf.train.SummaryWriter(FLAGS.train_dir14) summary_writer15 = tf.train.SummaryWriter(FLAGS.train_dir15) summary_writer16 = tf.train.SummaryWriter(FLAGS.train_dir16) summary_writer17 = tf.train.SummaryWriter(FLAGS.train_dir17) summary_writer18 = tf.train.SummaryWriter(FLAGS.train_dir18) summary_writer19 = tf.train.SummaryWriter(FLAGS.train_dir19) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) summary_writer0.add_summary(summary_str, step) summary_writer1.add_summary(summary_str, step) summary_writer2.add_summary(summary_str, step) summary_writer3.add_summary(summary_str, step) summary_writer4.add_summary(summary_str, step) summary_writer5.add_summary(summary_str, step) summary_writer6.add_summary(summary_str, step) summary_writer7.add_summary(summary_str, step) summary_writer8.add_summary(summary_str, step) summary_writer9.add_summary(summary_str, step) summary_writer10.add_summary(summary_str, step) summary_writer11.add_summary(summary_str, step) summary_writer12.add_summary(summary_str, step) summary_writer13.add_summary(summary_str, step) 
summary_writer14.add_summary(summary_str, step) summary_writer15.add_summary(summary_str, step) summary_writer16.add_summary(summary_str, step) summary_writer17.add_summary(summary_str, step) summary_writer18.add_summary(summary_str, step) summary_writer19.add_summary(summary_str, step) # Save the model checkpoint periodically. # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: # checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') # saver.save(sess, checkpoint_path, global_step=step/100) # hard cord here!!! if step == 100: checkpoint_path = os.path.join(FLAGS.train_dir0, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 200: checkpoint_path = os.path.join(FLAGS.train_dir1, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 300: checkpoint_path = os.path.join(FLAGS.train_dir2, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 400: checkpoint_path = os.path.join(FLAGS.train_dir3, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 500: checkpoint_path = os.path.join(FLAGS.train_dir4, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 600: checkpoint_path = os.path.join(FLAGS.train_dir5, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 700: checkpoint_path = os.path.join(FLAGS.train_dir6, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 800: checkpoint_path = os.path.join(FLAGS.train_dir7, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 900: checkpoint_path = os.path.join(FLAGS.train_dir8, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1000: checkpoint_path = os.path.join(FLAGS.train_dir9, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1100: checkpoint_path = os.path.join(FLAGS.train_dir10, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1200: checkpoint_path = os.path.join(FLAGS.train_dir11, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1300: checkpoint_path = os.path.join(FLAGS.train_dir12, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1400: checkpoint_path = os.path.join(FLAGS.train_dir13, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1500: checkpoint_path = os.path.join(FLAGS.train_dir14, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1600: checkpoint_path = os.path.join(FLAGS.train_dir15, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1700: checkpoint_path = os.path.join(FLAGS.train_dir16, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1800: checkpoint_path = os.path.join(FLAGS.train_dir17, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 1900: checkpoint_path = os.path.join(FLAGS.train_dir18, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step == 2000: checkpoint_path = os.path.join(FLAGS.train_dir19, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir) summary_writer0 = tf.train.SummaryWriter(FLAGS.train_dir0) summary_writer1= tf.train.SummaryWriter(FLAGS.train_dir1) summary_writer2 = tf.train.SummaryWriter(FLAGS.train_dir2) summary_writer3 = tf.train.SummaryWriter(FLAGS.train_dir3) summary_writer4 = tf.train.SummaryWriter(FLAGS.train_dir4) summary_writer5 = tf.train.SummaryWriter(FLAGS.train_dir5) summary_writer6 = tf.train.SummaryWriter(FLAGS.train_dir6) summary_writer7 = tf.train.SummaryWriter(FLAGS.train_dir7) summary_writer8 = tf.train.SummaryWriter(FLAGS.train_dir8) summary_writer9 = tf.train.SummaryWriter(FLAGS.train_dir9) summary_writer10 = tf.train.SummaryWriter(FLAGS.train_dir10) summary_writer11 = tf.train.SummaryWriter(FLAGS.train_dir11) summary_writer12 = tf.train.SummaryWriter(FLAGS.train_dir12) summary_writer13 = tf.train.SummaryWriter(FLAGS.train_dir13) summary_writer14 = tf.train.SummaryWriter(FLAGS.train_dir14) summary_writer15 = tf.train.SummaryWriter(FLAGS.train_dir15) summary_writer16 = tf.train.SummaryWriter(FLAGS.train_dir16) summary_writer17 = tf.train.SummaryWriter(FLAGS.train_dir17) summary_writer18 = tf.train.SummaryWriter(FLAGS.train_dir18) summary_writer19 = tf.train.SummaryWriter(FLAGS.train_dir19) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) summary_writer0.add_summary(summary_str, step) summary_writer1.add_summary(summary_str, step) summary_writer2.add_summary(summary_str, step) summary_writer3.add_summary(summary_str, step) summary_writer4.add_summary(summary_str, step) summary_writer5.add_summary(summary_str, step) summary_writer6.add_summary(summary_str, step) summary_writer7.add_summary(summary_str, step) summary_writer8.add_summary(summary_str, step) summary_writer9.add_summary(summary_str, step) summary_writer10.add_summary(summary_str, step) summary_writer11.add_summary(summary_str, step) summary_writer12.add_summary(summary_str, step) summary_writer13.add_summary(summary_str, step) 
summary_writer14.add_summary(summary_str, step) summary_writer15.add_summary(summary_str, step) summary_writer16.add_summary(summary_str, step) summary_writer17.add_summary(summary_str, step) summary_writer18.add_summary(summary_str, step) summary_writer19.add_summary(summary_str, step) # Save the model checkpoint periodically. # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: # checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') # saver.save(sess, checkpoint_path, global_step=step/100) # hard cord here!!! if step==100: checkpoint_path = os.path.join(FLAGS.train_dir0, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==200: checkpoint_path = os.path.join(FLAGS.train_dir1, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==300: checkpoint_path = os.path.join(FLAGS.train_dir2, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==400: checkpoint_path = os.path.join(FLAGS.train_dir3, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==500: checkpoint_path = os.path.join(FLAGS.train_dir4, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==600: checkpoint_path = os.path.join(FLAGS.train_dir5, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==700: checkpoint_path = os.path.join(FLAGS.train_dir6, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==800: checkpoint_path = os.path.join(FLAGS.train_dir7, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==900: checkpoint_path = os.path.join(FLAGS.train_dir8, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1000: checkpoint_path = os.path.join(FLAGS.train_dir9, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1100: checkpoint_path = os.path.join(FLAGS.train_dir10, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1200: checkpoint_path = os.path.join(FLAGS.train_dir11, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1300: checkpoint_path = os.path.join(FLAGS.train_dir12, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1400: checkpoint_path = os.path.join(FLAGS.train_dir13, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1500: checkpoint_path = os.path.join(FLAGS.train_dir14, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1600: checkpoint_path = os.path.join(FLAGS.train_dir15, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1700: checkpoint_path = os.path.join(FLAGS.train_dir16, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1800: checkpoint_path = os.path.join(FLAGS.train_dir17, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1900: checkpoint_path = os.path.join(FLAGS.train_dir18, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==2000: checkpoint_path = os.path.join(FLAGS.train_dir19, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # #Adding dropout # keep_drop_prob = tf.placeholder(tf.float32) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # ###########Changes for visualization ############### with tf.variable_scope('conv1') as scope_conv: tf.get_variable_scope().reuse_variables() weights = tf.get_variable('weights') grid_x = grid_y = 8 # to get a square grid for 64 conv1 features grid = put_kernels_on_grid(weights, (grid_y, grid_x)) tf.image_summary('conv1/features', grid, max_images=1) # #################################################### # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in range(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print('PS hosts are: %s' % ps_hosts)
  print('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id)
  if FLAGS.job_name == 'ps':
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  device_setter = tf.train.replica_device_setter(ps_tasks=1)
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    with tf.device(device_setter):
      global_step = tf.Variable(0, trainable=False)

      # Get images and labels for CIFAR-10.
      images, labels = cifar10.distorted_inputs()

      # Build a Graph that computes the logits predictions from the
      # inference model.
      logits = cifar10.inference(images)

      # Calculate loss.
      loss = cifar10.loss(logits, labels)
      train_op = cifar10.train(loss, global_step)
      saver = tf.train.Saver()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=tf.initialize_all_variables(),
                               summary_op=tf.merge_all_summaries(),
                               global_step=global_step,
                               saver=saver,
                               save_model_secs=60)
      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      print("Before session init")
      # Get a session.
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
      print("Session init done")

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      print('Started %d queues for processing input data.' %
            len(queue_runners))

      # Train CIFAR-10 for a number of steps.
      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value, gs = sess.run([train_op, loss, global_step])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d (global_step %d), loss = %.2f '
                        '(%.1f examples/sec; %.3f sec/batch)')
          print(format_str % (datetime.now(), step, gs, loss_value,
                              examples_per_sec, sec_per_batch))

      if is_chief:
        saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
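# The distributed train() above assumes cluster-related command-line flags
# roughly like the following. This is only an illustrative sketch; the exact
# flag defaults in the original file are not shown here, and the non-cluster
# flags (train_dir, max_steps, batch_size, log_device_placement) are omitted.
tf.app.flags.DEFINE_string('ps_hosts', 'localhost:2222',
                           'Comma-separated list of parameter server host:port pairs.')
tf.app.flags.DEFINE_string('worker_hosts', 'localhost:2223,localhost:2224',
                           'Comma-separated list of worker host:port pairs.')
tf.app.flags.DEFINE_string('job_name', 'worker', "Either 'ps' or 'worker'.")
tf.app.flags.DEFINE_integer('task_id', 0, 'Index of this task within its job.')

# One process is then launched per cluster entry, e.g. (illustrative only):
#   python cifar10_distributed_train.py --job_name=ps --task_id=0
#   python cifar10_distributed_train.py --job_name=worker --task_id=0
#   python cifar10_distributed_train.py --job_name=worker --task_id=1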
# test_model.index = ii
# print test_model.weights
models.append(test_model)

with test_model.g.as_default():

  global_step = tf.Variable(0, trainable=False)

  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()
  test_images, test_labels = cifar10.inputs(eval_data='test')

  # Build a Graph that computes the logits predictions from the
  # inference model.
  logits = test_model.predict(images)
  logit_test = test_model.predict(test_images)

  # Calculate loss.
  loss = cifar10.loss(logits, labels)

  # Build a Graph that trains the model with one batch of examples and
  # updates the model parameters.
  train_op = cifar10.train(loss, global_step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() eval_data = FLAGS.eval_data == 'test' #timages, tlabels = cifar10.inputs(eval_data=eval_data) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) #tlogits = cifar10.inference(timages) # Calculate loss. top_k_op = tf.nn.in_top_k(logits, labels, 1) loss = cifar10.loss(logits, labels) #precision = tf.Variable(0.8, name='precision') #tf.scalar_summary('accuracy', precision) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) sess.graph.finalize() summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 100 == 0: # Build a Graph that computes the logits predictions from the # inference model. # Calculate predictions. num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) true_count = 0 # Counts the number of correct predictions. total_sample_count = num_iter * FLAGS.batch_size i_step = 0 while i_step < num_iter: predictions = sess.run([top_k_op]) true_count += np.sum(predictions) i_step += 1 #Compute precision @ 1. #sess.run(precision.assign(true_count / total_sample_count)) prec = true_count / total_sample_count print(prec) summary = tf.Summary() summary.ParseFromString(sess.run(summary_op)) summary.value.add(tag='accuracy', simple_value=prec) summary_writer.add_summary(summary, step) #summary_str = sess.run(summary_op) #summary_writer.add_summary(summary_str, step) #summary_writer.flush() # Save the model checkpoint periodically. if step % 100 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. #with tf.device('/gpu:%d' % FLAGS.gpu_number): images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) loss_per_batch = cifar10.loss_per_batch(logits, labels) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step, FLAGS.gpu_number) # Create a saver. saver = tf.train.Saver(tf.all_variables(), max_to_keep=None) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. config = tf.ConfigProto() config.gpu_options.allow_growth=True config.allow_soft_placement=True config.log_device_placement=FLAGS.log_device_placement sess = tf.Session(config=config) tf.train.write_graph(sess.graph_def, FLAGS.train_dir, "cifar10_train.pb", False) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) train_start_time = time.time() for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value, logits_value, loss_per_batch_value, labels_value = sess.run([train_op, loss, logits, loss_per_batch, labels]) duration = time.time() - start_time #logits_str = print_logits(logits_value, labels_value, loss_per_batch_value) #with open(os.path.join(FLAGS.train_dir, 'logits_%d.log' % step),'w') as f: # f.write("%s" % logits_str) assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') log_str = (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) print(log_str) with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f: f.write("%s\n" % log_str) if step % 500 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') save_path = saver.save(sess, checkpoint_path, global_step=step) train_duration = time.time() - train_start_time log_str = ("Finishing. Training %d batches of %d images took %fs\n" % (FLAGS.max_steps, FLAGS.batch_size, float(train_duration))) print(log_str) with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f: f.write("%s" % log_str)
def main(unused_argv):
  # mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
  cifar10.maybe_download_and_extract()

  if FLAGS.download_only:
    sys.exit(0)

  # cifar10.maybe_download_and_extract()
  if FLAGS.job_name is None or FLAGS.job_name == "":
    raise ValueError("Must specify an explicit `job_name`")
  if FLAGS.task_index is None or FLAGS.task_index == "":
    raise ValueError("Must specify an explicit `task_index`")

  print("job name = %s" % FLAGS.job_name)
  print("task index = %d" % FLAGS.task_index)

  # Construct the cluster and start the server.
  ps_spec = FLAGS.ps_hosts.split(",")
  worker_spec = FLAGS.worker_hosts.split(",")

  # Get the number of workers.
  num_workers = len(worker_spec)

  cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

  if not FLAGS.existing_servers:
    # Not using existing servers. Create an in-process server.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
      server.join()

  is_chief = (FLAGS.task_index == 0)

  if FLAGS.num_gpus > 0:
    if FLAGS.num_gpus < num_workers:
      raise ValueError("number of gpus is less than number of workers")
    # Avoid gpu allocation conflict: now allocate task_num -> #gpu
    # for each worker in the corresponding machine.
    gpu = (FLAGS.task_index % FLAGS.num_gpus)
    worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
  elif FLAGS.num_gpus == 0:
    # Just allocate the CPU to worker server.
    cpu = 0
    worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)

  # The device setter will automatically place Variables ops on separate
  # parameter servers (ps). The non-Variable ops will be placed on the
  # workers. The ps use CPU and workers use corresponding GPU.
  with tf.device(
      tf.train.replica_device_setter(
          worker_device=worker_device,
          ps_device="/job:ps/cpu:0",
          cluster=cluster)):
    cifar10.maybe_download_and_extract()
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # # Variables of the hidden layer
    # hid_w = tf.Variable(
    #     tf.truncated_normal(
    #         [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
    #         stddev=1.0 / IMAGE_PIXELS),
    #     name="hid_w")
    # hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

    # # Variables of the softmax layer
    # sm_w = tf.Variable(
    #     tf.truncated_normal(
    #         [FLAGS.hidden_units, 10],
    #         stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
    #     name="sm_w")
    # sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

    # # Ops: located on the worker specified with FLAGS.task_index
    # x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
    # y_ = tf.placeholder(tf.float32, [None, 10])
    # hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
    # hid = tf.nn.relu(hid_lin)
    # y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
    # cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    # train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Variables that affect learning rate.
    num_batches_per_epoch = 50000 / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * 350)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(0.1, global_step, decay_steps, 0.1,
                                    staircase=True)

    # Generate moving averages of all losses and associated summaries.
    # loss_averages_op = _add_loss_summaries(total_loss)

    opt = tf.train.GradientDescentOptimizer(lr)
    # opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

    if FLAGS.sync_replicas:
      if FLAGS.replicas_to_aggregate is None:
        replicas_to_aggregate = num_workers
      else:
        replicas_to_aggregate = FLAGS.replicas_to_aggregate

      opt = tf.train.SyncReplicasOptimizerV2(
          opt,
          replicas_to_aggregate=replicas_to_aggregate,
          total_num_replicas=num_workers,
          name="cifar10_sync_replicas")

    train_step = opt.minimize(loss, global_step=global_step)

    if FLAGS.sync_replicas:
      local_init_op = opt.local_step_init_op
      if is_chief:
        local_init_op = opt.chief_init_op

      ready_for_local_init_op = opt.ready_for_local_init_op

      # Initial token and chief queue runners required by the sync_replicas
      # mode.
      chief_queue_runner = opt.get_chief_queue_runner()
      sync_init_op = opt.get_init_tokens_op()

    init_op = tf.global_variables_initializer()
    train_dir = tempfile.mkdtemp(dir="/mnt")

    if FLAGS.sync_replicas:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          local_init_op=local_init_op,
          ready_for_local_init_op=ready_for_local_init_op,
          recovery_wait_secs=1,
          global_step=global_step)
    else:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          recovery_wait_secs=1,
          global_step=global_step)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps",
                        "/job:worker/task:%d" % FLAGS.task_index])

    # The chief worker (task_index==0) session will prepare the session,
    # while the remaining workers will wait for the preparation to complete.
    if is_chief:
      print("Worker %d: Initializing session..." % FLAGS.task_index)
    else:
      print("Worker %d: Waiting for session to be initialized..." %
            FLAGS.task_index)

    if FLAGS.existing_servers:
      server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
      print("Using existing server at: %s" % server_grpc_url)
      sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                            config=sess_config)
    else:
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

    print("Worker %d: Session initialization complete." % FLAGS.task_index)

    if FLAGS.sync_replicas and is_chief:
      # Chief worker will start the chief queue runner and call the init op.
      sess.run(sync_init_op)
      sv.start_queue_runners(sess, [chief_queue_runner])

    # Perform training.
    time_begin = time.time()
    print("Training begins @ %f" % time_begin)

    local_step = 0
    while True:
      start_time = time.time()
      _, step = sess.run([train_step, global_step])
      duration = time.time() - start_time
      local_step += 1

      # assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      # if step % 10 == 0:
      #   num_examples_per_step = FLAGS.batch_size
      #   examples_per_sec = num_examples_per_step / duration
      #   sec_per_batch = float(duration)
      #   loss_value = 0
      #   format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
      #                 'sec/batch)')
      #   print(format_str % (datetime.now(), local_step, loss_value,
      #                       examples_per_sec, sec_per_batch))

      now = time.time()
      print("%f: Worker %d: training step %d done (global step: %d)" %
            (now, FLAGS.task_index, local_step, step))

      if step >= FLAGS.train_steps:
        break

      # if step % 100 == 0:
      #   summary_str = sess.run(summary_op)
      #   summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      # if step % 1000 == 0 or (step + 1) == FLAGS.train_steps:
      #   checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      #   saver.save(sess, checkpoint_path, global_step=step)

    # local_step = 0
    # while True:
    #   # Training feed
    #   batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
    #   train_feed = {x: batch_xs, y_: batch_ys}
    #   _, step = sess.run([train_step, global_step], feed_dict=train_feed)
    #   local_step += 1
    #   now = time.time()
    #   print("%f: Worker %d: training step %d done (global step: %d)" %
    #         (now, FLAGS.task_index, local_step, step))
    #   if step >= FLAGS.train_steps:
    #     break

    time_end = time.time()
    print("Training ends @ %f" % time_end)
    training_time = time_end - time_begin
    print("Training elapsed time: %f s" % training_time)
def multilevel_train_1ord():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Accuracy.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      accuracy = sess.run(top_k_op)
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      output_list = []
      # Do something with intermediate data:
      # save the conv1 weights on iterations 0, 1000, 2000 and 3000.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        for v in tf.all_variables():
          if "conv1/weights:" in v.name:
            print(v.name)
            output_list.append(
                tf.get_default_graph().get_tensor_by_name(v.name))
            break
        if step == 0:
          conv1_data_0 = sess.run(output_list)
        if step == 1000:
          conv1_data_1000 = sess.run(output_list)
        if step == 2000:
          conv1_data_2000 = sess.run(output_list)
        if step == 3000:
          conv1_data_3000 = sess.run(output_list)
          (A, B, C, D, E) = np.array(conv1_data_3000).shape
          # do something.

      # Do experiments: combine the saved snapshots and write the result back
      # into the conv1 weights.
      if step == 3000 or (step + 1) == FLAGS.max_steps:
        print("************\n Chen process executing")
        _, new_data = process.exp_2_commMax(conv1_data_0, conv1_data_1000,
                                            conv1_data_2000, conv1_data_3000)
        for v in tf.all_variables():
          if "conv1/weights:" in v.name:
            print("start assign: ")
            sess.run(
                tf.assign(
                    tf.get_default_graph().get_tensor_by_name(v.name),
                    new_data[0]))
            break
        value = sess.run(loss)
        pred = process.Count(accuracy)
        print("new loss value is: " + str(value) + " accuracy: " + str(pred))

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        predict = process.Count(accuracy)

        format_str = ('%s: step %d, loss = %.2f, accu = %.2f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (datetime.now(), step, loss_value, predict,
                            examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
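# Note (a sketch, not from the original file): rather than looking the conv1
# weights up by tensor name, the same values could be fetched directly from
# the Variable object, e.g.:
#   conv1_var = [v for v in tf.all_variables() if 'conv1/weights:' in v.name][0]
#   conv1_data = sess.run(conv1_var)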
def SGDBead(self, bead, thresh, maxindex):
  finalerror = 0.
  # thresh = .05

  # Parameters
  learning_rate = 0.001
  training_epochs = 15
  batch_size = 100
  display_step = 1

  curWeights, curBiases = self.AllBeads[bead]
  # test_model = multilayer_perceptron(w=curWeights, b=curBiases)
  test_model = convnet(w=curWeights, b=curBiases)

  with test_model.g.as_default():

    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    test_images, test_labels = cifar10.inputs(eval_data='test')

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = test_model.predict(images)
    logit_test = test_model.predict(test_images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    top_k_op = tf.nn.in_top_k(logit_test, test_labels, 1)

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    # sess = tf.Session(config=tf.ConfigProto(
    #     log_device_placement=FLAGS.log_device_placement))
    with tf.Session(config=tf.ConfigProto(
        log_device_placement=False)) as sess:
      sess.run(init)
      tf.train.start_queue_runners(sess=sess)

      step = 0
      stopcond = True
      # max_steps and num_examples are assumed to be defined elsewhere in the
      # enclosing module (they are not parameters of this method).
      while step < max_steps and stopcond:
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), step, loss_value,
                              examples_per_sec, sec_per_batch))

        if step % 100 == 0:
          num_iter = int(math.ceil(num_examples / batch_size))
          true_count = 0  # Counts the number of correct predictions.
          total_sample_count = num_iter * batch_size
          stepp = 0
          while stepp < num_iter:
            predictions = sess.run([top_k_op])
            true_count += np.sum(predictions)
            stepp += 1

          # Compute precision @ 1.
          precision = true_count / total_sample_count
          print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

          if precision > 1 - thresh:
            stopcond = False
            test_model.params = (sess.run(test_model.weightslist),
                                 sess.run(test_model.biaseslist))
            self.AllBeads[bead] = test_model.params
            finalerror = 1 - precision
            print("Final bead error: ", str(finalerror))

        step += 1

      return finalerror