def main(argv=None):
    with tf.Graph().as_default():

        model_fn = select_model(FLAGS.model_type)
        # Open the metadata file and figure out nlabels, and size of epoch
        input_file = os.path.join(FLAGS.train_dir, 'md.json')
        print(input_file)
        with open(input_file, 'r') as f:
            md = json.load(f)

        images, labels, _ = distorted_inputs(FLAGS.train_dir, FLAGS.batch_size,
                                             FLAGS.image_size, FLAGS.num_preprocess_threads)
        logits = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)
        total_loss = loss(logits, labels)
        ini_global_step = 0

        train_op = optimizer(FLAGS.optim, FLAGS.eta, total_loss,
                             FLAGS.steps_per_decay, FLAGS.eta_decay_rate)
        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        tf.global_variables_initializer().run(session=sess)

        # This is total hackland, it only works to fine-tune iv3
        if FLAGS.pre_model:
            inception_variables = tf.get_collection(tf.GraphKeys.VARIABLES,
                                                    scope="InceptionV3")
            restorer = tf.train.Saver(inception_variables)
            restorer.restore(sess, FLAGS.pre_model)

        if FLAGS.pre_checkpoint_path:
            if tf.gfile.Exists(FLAGS.pre_checkpoint_path) is True:
                print('Trying to restore checkpoint from %s' % FLAGS.pre_checkpoint_path)
                restorer = tf.train.Saver()
                restorer.restore(
                    sess, tf.train.latest_checkpoint(FLAGS.pre_checkpoint_path))
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pre_checkpoint_path))
                ini_global_step = get_restored_step(FLAGS.pre_checkpoint_path)
                print('Initial Global Step is {}'.format(ini_global_step))

        run_dir = '%s/run-%d' % (FLAGS.train_dir, os.getpid())
        checkpoint_path = '%s/%s' % (run_dir, FLAGS.checkpoint)
        if tf.gfile.Exists(run_dir) is False:
            print('Creating %s' % run_dir)
            tf.gfile.MakeDirs(run_dir)

        tf.train.write_graph(sess.graph_def, run_dir, 'model.pb', as_text=True)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(run_dir, sess.graph)
        steps_per_train_epoch = int(md['train_counts'] / FLAGS.batch_size)
        num_steps = FLAGS.max_steps if FLAGS.epochs < 1 else FLAGS.epochs * steps_per_train_epoch
        print('Requested number of steps [%d]' % num_steps)

        for step in xrange(num_steps):
            # Offset the reported step by the step recovered from the checkpoint
            step += ini_global_step
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.3f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            # Loss only actually evaluated every 100 steps?
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 1000 == 0 or (step + 1) == num_steps:
                saver.save(sess, checkpoint_path, global_step=step)
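
# The resume logic above calls a get_restored_step() helper that is not shown here.
# A minimal sketch of such a helper, assuming the standard TensorFlow checkpoint
# naming scheme "<name>-<global_step>" produced by Saver.save(..., global_step=step);
# the original project's implementation may differ.
def get_restored_step(checkpoint_dir):
    """Return the global step encoded in the latest checkpoint filename, or 0."""
    ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    if ckpt is None:
        return 0
    try:
        # Checkpoint paths end in "-<step>", e.g. ".../checkpoint.ckpt-12000".
        return int(ckpt.split('-')[-1])
    except ValueError:
        return 0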
def main(argv=None):
    with tf.Graph().as_default():

        global_step = tf.Variable(0, trainable=False)
        model_fn = select_model(FLAGS.model_type)
        # Open the metadata file and figure out nlabels, and size of epoch
        input_file = os.path.join(FLAGS.train_dir, 'md.json')
        print(input_file)
        with open(input_file, 'r') as f:
            md = json.load(f)

        images, labels, _ = distorted_inputs(FLAGS.train_dir, FLAGS.batch_size,
                                             FLAGS.image_size, FLAGS.num_preprocess_threads)
        if not FLAGS.dual:
            logits = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)
            total_loss, accuracy = loss(logits, labels, global_step)
        else:
            with tf.variable_scope("net1") as scope:
                logits1 = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)
            with tf.variable_scope("net2") as scope:
                logits2 = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)

            pred1 = tf.argmax(logits1, 1)
            pred2 = tf.argmax(logits2, 1)
            # Update on examples where the two networks disagree, or unconditionally
            # during the initial warm-up iterations.
            update_step = tf.stop_gradient(
                tf.to_float(
                    tf.logical_or(tf.not_equal(pred1, pred2),
                                  global_step < FLAGS.init_iter)))

            with tf.variable_scope("net1") as scope:
                if FLAGS.min_batch_size == -1:
                    total_loss1, accuracy1 = loss(logits1, labels, global_step,
                                                  None, scope.name)
                else:
                    total_loss1, accuracy1 = loss(logits1, labels, global_step,
                                                  update_step, scope.name)
            with tf.variable_scope("net2") as scope:
                if FLAGS.min_batch_size == -1:
                    total_loss2, accuracy2 = loss(logits2, labels, global_step,
                                                  None, scope.name)
                else:
                    total_loss2, accuracy2 = loss(logits2, labels, global_step,
                                                  update_step, scope.name)

            disagree_rate = tf.reduce_mean(
                tf.to_float(tf.not_equal(pred1, pred2)))

        if not FLAGS.dual:
            train_op = optimizer(FLAGS.optim, FLAGS.eta, total_loss)
        else:
            with tf.variable_scope("net1") as scope:
                var_net1 = [
                    var for var in tf.all_variables()
                    if var.name.startswith("net1")
                ]
                train_op1 = optimizer(FLAGS.optim, FLAGS.eta, total_loss1,
                                      variables=var_net1, name=scope.name)
            with tf.variable_scope("net2") as scope:
                var_net2 = [
                    var for var in tf.all_variables()
                    if var.name.startswith("net2")
                ]
                train_op2 = optimizer(FLAGS.optim, FLAGS.eta, total_loss2,
                                      variables=var_net2, name=scope.name)

        saver = tf.train.Saver(tf.all_variables(), max_to_keep=151)
        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # This is total hackland, it only works to fine-tune iv3
        if FLAGS.pre_model:
            inception_variables = tf.get_collection(tf.GraphKeys.VARIABLES,
                                                    scope="InceptionV3")
            restorer = tf.train.Saver(inception_variables)
            restorer.restore(sess, FLAGS.pre_model)

        if FLAGS.pre_checkpoint_path:
            if tf.gfile.Exists(FLAGS.pre_checkpoint_path) is True:
                print('Trying to restore checkpoint from %s' % FLAGS.pre_checkpoint_path)
                restorer = tf.train.Saver()
                restorer.restore(
                    sess, tf.train.latest_checkpoint(FLAGS.pre_checkpoint_path))
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pre_checkpoint_path))

        run_dir = '%s/run-%d' % (FLAGS.train_dir, os.getpid())
        checkpoint_path = '%s/%s' % (run_dir, FLAGS.checkpoint)
        if tf.gfile.Exists(run_dir) is False:
            print('Creating %s' % run_dir)
            tf.gfile.MakeDirs(run_dir)

        tf.train.write_graph(sess.graph_def, run_dir, 'model.pb', as_text=True)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(run_dir, sess.graph)
        steps_per_train_epoch = int(md['train_counts'] / FLAGS.batch_size)
        num_steps = FLAGS.max_steps if FLAGS.epochs < 1 else FLAGS.epochs * steps_per_train_epoch
        print('Requested number of steps [%d]' % num_steps)

        trainable_buffer_img = None
        trainable_buffer_lbl = None
        for step in range(num_steps):
            start_time = time.time()
            if FLAGS.Qloss:
                _, loss_value, acc_value, q_val = sess.run(
                    [train_op, total_loss, accuracy, Q_GLOBAL],
                    feed_dict={global_step: step})
                print(q_val)
            elif not FLAGS.dual:
                _, loss_value, acc_value = sess.run(
                    [train_op, total_loss, accuracy],
                    feed_dict={global_step: step})
            elif FLAGS.dual and (step < FLAGS.init_iter
                                 or FLAGS.min_batch_size != -1):
                _, _, loss_value, acc_value1, acc_value2, drate = sess.run(
                    [train_op1, train_op2, total_loss1, accuracy1, accuracy2,
                     disagree_rate],
                    feed_dict={global_step: step})
            else:
                # Evaluate both networks without updating, then keep only the
                # examples they disagree on and buffer them until a full batch
                # is available.
                img, lbl, us, loss_value, acc_value1, acc_value2, drate = sess.run(
                    [images, labels, update_step, total_loss1, accuracy1,
                     accuracy2, disagree_rate],
                    feed_dict={global_step: step})
                rel_img = img[us == 1]
                rel_lbl = lbl[us == 1]
                if trainable_buffer_img is None:
                    trainable_buffer_img = rel_img
                    trainable_buffer_lbl = rel_lbl
                else:
                    print(np.shape(trainable_buffer_lbl), np.shape(rel_lbl))
                    trainable_buffer_img = np.vstack((trainable_buffer_img, rel_img))
                    trainable_buffer_lbl = np.hstack((trainable_buffer_lbl, rel_lbl))

                if trainable_buffer_img.shape[0] >= FLAGS.batch_size:
                    batch_img = trainable_buffer_img[:FLAGS.batch_size]
                    batch_lbl = trainable_buffer_lbl[:FLAGS.batch_size]
                    _, _, loss_value, acc_value1, acc_value2, drate = sess.run(
                        [train_op1, train_op2, total_loss1, accuracy1,
                         accuracy2, disagree_rate],
                        feed_dict={
                            global_step: step,
                            images: batch_img,
                            labels: batch_lbl
                        })
                    trainable_buffer_img = trainable_buffer_img[FLAGS.batch_size:]
                    trainable_buffer_lbl = trainable_buffer_lbl[FLAGS.batch_size:]

            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 1 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                if not FLAGS.dual:
                    format_str = ('%s: step %d, loss = %.3f, acc = %.3f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value, acc_value,
                                        examples_per_sec, sec_per_batch))
                else:
                    format_str = ('%s: step %d, loss = %.3f, acc1 = %.3f, acc2 = %.3f, '
                                  'disagree_rate = %.3f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value, acc_value1,
                                        acc_value2, drate, examples_per_sec,
                                        sec_per_batch))

            # Loss only actually evaluated every 100 steps?
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 200 == 0 or (step + 1) == num_steps:
                saver.save(sess, checkpoint_path, global_step=step)
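
# Standalone illustration (NumPy only) of the selection-and-buffering scheme used by
# the dual branch above: examples on which the two networks disagree are kept,
# accumulated, and trained on only once a full batch is available. The names
# select_disagreements and DisagreementBuffer are illustrative and do not appear in
# the original code.
import numpy as np


def select_disagreements(pred1, pred2, imgs, lbls):
    """Keep only the examples on which the two networks predict different classes."""
    mask = pred1 != pred2
    return imgs[mask], lbls[mask]


class DisagreementBuffer(object):
    """Accumulates selected examples until at least one full batch is available."""

    def __init__(self, batch_size):
        self.batch_size = batch_size
        self.imgs = None
        self.lbls = None

    def add(self, imgs, lbls):
        if self.imgs is None:
            self.imgs, self.lbls = imgs, lbls
        else:
            self.imgs = np.vstack((self.imgs, imgs))
            self.lbls = np.hstack((self.lbls, lbls))

    def pop_batch(self):
        # Return one full batch and drop it from the buffer, or None if not enough
        # examples have accumulated yet.
        if self.imgs is None or self.imgs.shape[0] < self.batch_size:
            return None
        batch = (self.imgs[:self.batch_size], self.lbls[:self.batch_size])
        self.imgs = self.imgs[self.batch_size:]
        self.lbls = self.lbls[self.batch_size:]
        return batch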
def main(argv=None):
    with tf.Graph().as_default():

        model_fn = select_model(FLAGS.model_type)
        # Open the metadata file and figure out nlabels, and size of epoch
        input_file = os.path.join(FLAGS.train_dir, 'md.json')
        print(input_file)
        with open(input_file, 'r') as f:
            md = json.load(f)

        images, labels, _ = distorted_inputs(FLAGS.train_dir, FLAGS.batch_size,
                                             FLAGS.image_size, FLAGS.num_preprocess_threads)
        logits = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)
        total_loss = loss(logits, labels)

        train_op = optimizer(FLAGS.optim, FLAGS.eta, total_loss)
        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        tf.global_variables_initializer().run(session=sess)

        # This is total hackland, it only works to fine-tune iv3
        if FLAGS.pre_model:
            inception_variables = tf.get_collection(tf.GraphKeys.VARIABLES,
                                                    scope="InceptionV3")
            restorer = tf.train.Saver(inception_variables)
            restorer.restore(sess, FLAGS.pre_model)

        if FLAGS.pre_checkpoint_path:
            if tf.gfile.Exists(FLAGS.pre_checkpoint_path) is True:
                print('Trying to restore checkpoint from %s' % FLAGS.pre_checkpoint_path)
                restorer = tf.train.Saver()
                restorer.restore(
                    sess, tf.train.latest_checkpoint(FLAGS.pre_checkpoint_path))
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pre_checkpoint_path))

        run_dir = '%s/run-%d' % (FLAGS.train_dir, os.getpid())
        checkpoint_path = '%s/%s' % (run_dir, FLAGS.checkpoint)
        if tf.gfile.Exists(run_dir) is False:
            print('Creating %s' % run_dir)
            tf.gfile.MakeDirs(run_dir)

        tf.train.write_graph(sess.graph_def, run_dir, 'model.pb', as_text=True)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(run_dir, sess.graph)
        steps_per_train_epoch = int(md['train_counts'] / FLAGS.batch_size)
        num_steps = FLAGS.max_steps if FLAGS.epochs < 1 else FLAGS.epochs * steps_per_train_epoch
        print('Requested number of steps [%d]' % num_steps)

        for step in xrange(num_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.3f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            # Loss only actually evaluated every 100 steps?
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 1000 == 0 or (step + 1) == num_steps:
                saver.save(sess, checkpoint_path, global_step=step)
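
# All of the main() variants above read <train_dir>/md.json for two fields:
# md['nlabels'] (number of output classes handed to model_fn) and md['train_counts']
# (training-set size, used to derive steps_per_train_epoch). A minimal sketch of
# producing such a file; the numeric values are placeholders and the real metadata
# file may carry additional keys.
import json
import os


def write_metadata(train_dir, nlabels, train_counts):
    """Write the minimal md.json consumed by the training scripts above."""
    with open(os.path.join(train_dir, 'md.json'), 'w') as f:
        json.dump({'nlabels': nlabels, 'train_counts': train_counts}, f)

# Example (placeholder values): write_metadata('/tmp/train_dir', 8, 10000)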