def main(_):
    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(60000 / cfg.batch_size)
        num_batches_test = int(10000 / cfg.batch_size)

        batch_x, batch_labels = create_inputs(is_train=False)
        output = net.build_arch(batch_x, is_train=False)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session() as sess:
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(cfg.test_logdir, graph=sess.graph)
            for epoch in range(cfg.epoch):
                ckpt = os.path.join(
                    cfg.logdir, 'model.ckpt-%d' % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)
                for i in range(num_batches_test):
                    summary_str = sess.run(summary_op)
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    step += 1
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size)

        batch_x, batch_labels = create_inputs()
        output = net.build_arch(batch_x, coord_add, is_train=False, num_classes=num_classes)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session() as sess:
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir, graph=None)  # graph=sess.graph is huge!
            for epoch in range(cfg.epoch):
                # A regex would be needed here to match the loss value embedded in the
                # checkpoint file name; for now the path is built directly.
                ckpt = os.path.join(
                    cfg.logdir, 'model.ckpt-%d' % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
def main(args):
    tf.set_random_seed(1234)
    # `dataset_name` and `ckpt` (the checkpoint directory) are assumed to be defined
    # at module scope in this variant.
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.5)

        batch_x, batch_labels = create_inputs()
        output, pose_out = net.build_arch(batch_x, coord_add, is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            model_file = tf.train.latest_checkpoint(ckpt)
            saver.restore(sess, model_file)

            accuracy_sum = 0
            for i in range(num_batches_test):
                batch_acc_v = sess.run([batch_acc])
                accuracy_sum += batch_acc_v[0]
                print(accuracy_sum)

            ave_acc = accuracy_sum / num_batches_test
            print('the average accuracy is %f' % ave_acc)
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]

    with tf.Graph().as_default():
        batch_x, batch_labels, datanum = utils.get_batch_data(is_training=False)
        num_batches_test = math.ceil(datanum / cfg.batch_size)  # get the ceiling int

        output = net.build_arch(batch_x, coord_add, is_train=False)
        predict = tf.argmax(output, axis=1)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        sess = tf.Session()
        tf.train.start_queue_runners(sess=sess)
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print(ckpt.model_checkpoint_path)
        summary_writer = tf.summary.FileWriter(cfg.test_logdir, graph=sess.graph)

        for epoch in range(cfg.test_epoch):
            accuracy_sum = 0
            for i in range(num_batches_test):
                y_pred, y, batch_acc_v, summary_str = sess.run(
                    [predict, batch_labels, batch_acc, summary_op])
                if i % 10 == 0:
                    print('%d/%d batches are tested.' % (step, num_batches_test))
                    # print("labels:\n", batch_labels)
                    print("Y:\n", y)
                    print("Y_prediction:", batch_acc_v, "\n", y_pred)
                summary_writer.add_summary(summary_str, step)
                accuracy_sum += batch_acc_v
                step += 1
                if i == 0:
                    y_pred1 = y_pred
                    label1 = y
                else:
                    y_pred1 = np.concatenate((y_pred1, y_pred), axis=0)
                    label1 = np.concatenate((label1, y), axis=0)
            # print("Label:", np.shape(label1), "\n", label1)

            ave_acc = accuracy_sum / num_batches_test
            # print("The last batch----Y:", np.shape(y), "\n", y)
            # print("Y_prediction:", batch_acc_v, "\n", y_pred)
            print(epoch, 'epoch: average accuracy is %f' % ave_acc)

            print(np.shape(y_pred1), ",", datanum)
            label1 = label1[:datanum]
            y_pred1 = y_pred1[:datanum]
            print("label:", np.shape(label1))
            trade_data.out_indi_data(cfg.test_dataset, y_pred1, datalen=cfg.image_size)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x, coord_add, is_train=False,
                                       num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x, is_train=False, num_classes=num_classes)
        else:
            # Raising a bare string is invalid in Python 3; raise an exception instead.
            raise ValueError(
                "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!")
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            coord.join(threads)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        output, _ = net.build_arch(batch_x, coord_add, is_train=False, num_classes=num_classes)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir, graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir)
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(cfg.logdir, __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    print('%f batch accuracy.' % batch_acc_v)  # accuracy is a float, so use %f
                    accuracy_sum += batch_acc_v
                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            coord.join(threads)
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]
    coord_add = np.array(coord_add, dtype=np.float32) / 28.

    data = utils.load_trade(is_training=True)
    datanum = data.num_examples

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        batch_x = tf.placeholder(tf.float32,
                                 [cfg.batch_size, cfg.image_size, cfg.image_size, 3])
        batch_labels = tf.placeholder(tf.int32, [cfg.batch_size])
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)
        opt = tf.train.AdamOptimizer()
        # batch_x, batch_labels, datanum = utils.get_shuffle_batch_data(is_training=True)
        num_batches_per_epoch = int(datanum / cfg.batch_size)
        print(datanum, num_batches_per_epoch)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)
        m_op = tf.placeholder(dtype=tf.float32, shape=())

        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
                accuracy = net.test_accuracy(output, batch_labels)
            grad = opt.compute_gradients(loss)
        loss_name = 'spread_loss'

        # Print trainable variable parameter statistics to stdout.
        # By default, statistics are associated with each graph node.
        param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
        # param_stats is a tensorflow.tfprof.TFGraphNodeProto proto; print the root below.
        sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

        summaries = []
        summaries.append(tf.summary.scalar(loss_name, loss))
        summaries.append(tf.summary.scalar("accuracy", accuracy))

        train_op = opt.apply_gradients(grad, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())

        # Add additional options to trace the session execution.
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)  # cfg.epoch

        # Restore from the checkpoint.
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            initial_step = int(ckpt.model_checkpoint_path.split('-')[1])
            print(ckpt, ckpt.model_checkpoint_path, initial_step)
        else:
            initial_step = 0

        m = 0.2
        summary_op = tf.summary.merge(summaries)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(cfg.logdir, graph=sess.graph)

        cal_num = 0
        for step in range(cfg.epoch):
            for i in range(num_batches_per_epoch):
                tic = time.time()
                x, y = data.next_batch(cfg.batch_size)
                _, loss_value, accuracy_val = sess.run(
                    [train_op, loss, accuracy],
                    feed_dict={batch_x: x, batch_labels: y, m_op: m})
                print('%d/%d, %d/%d iteration is finished in ' % (step, cfg.epoch, i, num_batches_per_epoch) +
                      '%f second' % (time.time() - tic) + ', m:', m,
                      ', loss: %f' % loss_value, ", accuracy:", accuracy_val)
                assert not np.isnan(loss_value), 'loss is nan'
                cal_num += 1

                if i % 30 == 0:
                    summary_str = sess.run(summary_op,
                                           feed_dict={batch_x: x, batch_labels: y, m_op: m},
                                           options=options, run_metadata=run_metadata)
                    summary_writer.add_run_metadata(run_metadata, 'step%d' % cal_num)
                    summary_writer.add_summary(summary_str, initial_step + cal_num)
                    # Print to stdout an analysis of the memory usage and the timing
                    # information broken down by operations.
                    # tf.contrib.tfprof.model_analyzer.print_model_analysis(
                    #     tf.get_default_graph(),
                    #     run_meta=run_metadata,
                    #     tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
                    # fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                    # chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    # with open('./time_line/timeline_02_step_%d.json' % i, 'w') as f:
                    #     f.write(chrome_trace)

                if cal_num % cfg.saveperiod == 0:
                    ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                    saver.save(sess, ckpt_path, global_step=initial_step + cal_num)

                if m < 0.9:
                    m += round((0.9 - 0.2) / num_batches_per_epoch, 5)
                else:
                    m = 0.9

        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
        saver.save(sess, ckpt_path, global_step=initial_step + cal_num)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x, coord_add, is_train=False,
                                       num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x, is_train=False, num_classes=num_classes)
        else:
            # Raising a bare string is invalid in Python 3; raise an exception instead.
            raise ValueError(
                "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!")
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            coord.join(threads)
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]
    coord_add = np.array(coord_add, dtype=np.float32) / 28.

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        batch_x, batch_labels = utils.get_shuffle_tfrecord(is_training=True)
        datanum = 272965
        num_batches_per_epoch = int(datanum / cfg.batch_size)
        print(datanum, num_batches_per_epoch)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Use exponential decay learning rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer(learning_rate=lrn_rate)

        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
                accuracy = net.test_accuracy(output, batch_labels)
                tf.summary.scalar("spread_loss", loss)
                tf.summary.scalar("accuracy", accuracy)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

            """Apply gradient."""
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        # Print trainable variable parameter statistics to stdout.
        # By default, statistics are associated with each graph node.
        param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
        # param_stats is a tensorflow.tfprof.TFGraphNodeProto proto; print the root below.
        print('total_params: %d\n' % param_stats.total_parameters)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=30)  # cfg.epoch

        # Restore from the checkpoint.
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            initial_step = int(ckpt.model_checkpoint_path.split('-')[1])
            print(ckpt, ckpt.model_checkpoint_path, initial_step)
            m = 0.9
        else:
            initial_step = 0
            m = 0.2

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer."""
        # if not os.path.exists(cfg.logdir):
        #     os.makedirs(cfg.logdir)
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=sess.graph)  # graph=sess.graph, huge!

        cal_num = 0
        for step in range(cfg.epoch):
            for i in range(num_batches_per_epoch):
                tic = time.time()
                """TF queue pops batches until no file is left."""
                try:
                    _, loss_value, accuracy_val = sess.run(
                        [train_op, loss, accuracy], feed_dict={m_op: m})
                    print('%d/%d, %d/%d iteration is finished in ' % (step, cfg.epoch, i, num_batches_per_epoch) +
                          '%f second' % (time.time() - tic) + ', m:', m,
                          ', loss: %f' % loss_value, ", accuracy:", accuracy_val)
                    cal_num += 1
                except tf.errors.InvalidArgumentError:
                    print('%d iteration contains NaN gradients. Discard.' % cal_num)
                    continue
                else:
                    """Write to summary."""
                    if i % 30 == 0:
                        summary_str = sess.run(summary_op, feed_dict={m_op: m})
                        summary_writer.add_summary(summary_str, initial_step + cal_num)

                    if cal_num % cfg.saveperiod == 0:
                        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                        saver.save(sess, ckpt_path, global_step=initial_step + cal_num)

                    if m < 0.9:
                        m += round((0.9 - 0.2) / num_batches_per_epoch, 5)
                    else:
                        m = 0.9

        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
        saver.save(sess, ckpt_path, global_step=initial_step + cal_num)

        """Join threads."""
        coord.join(threads)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add, is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)

        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(
            tf.reshape(recon_img_squash, shape=[cfg.batch_size, data_size, data_size, 1]), 255.)
        orig_img = tf.reshape(batch_x, shape=[cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)

        saver = tf.train.Saver()

        step = 0
        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(14, 15):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1

                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
def main(args):
    assert len(args) == 2 and isinstance(args[1], str)

    # Get dataset name
    dataset_name = args[1]  # mnist
    logger.info(f'Using dataset: {dataset_name}')

    # Set reproducible random seed
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)              # (3, 3, 2)
    dataset_size = get_dataset_size_train(dataset_name)  # 55,000
    num_classes = get_num_classes(dataset_name)          # 10
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get global_step
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        num_batches_per_epoch = dataset_size // cfg.batch_size  # 1100
        opt = tf.train.AdamOptimizer()

        # Get batch from data queue
        batch_x, batch_labels = create_inputs()  # (50, 28, 28, 1), (50,)

        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False, is_training=True, trainable=True)
                output, pose_out = net.build_arch(
                    batch_x, coord_add, is_train=True,
                    num_classes=num_classes)  # (50, 10), (50, 10, 18)
                tf.logging.debug(pose_out.get_shape())

                # Define loss = spread_loss + reconstruction loss
                loss, spread_loss, mse, _ = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)

            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

            # Apply gradient
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        # Set Session settings
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Set Saver
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        # Display parameters
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        # Set summary op
        summary_op = tf.summary.merge_all()

        # Start coord & queue
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Set summary writer
        if not os.path.exists(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + f"/caps/{dataset_name}/train_log/",
            graph=sess.graph)  # graph=sess.graph, huge!

        # Main loop
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            # TF queue pops batches until no file is left
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishes in ' % step +
                            '%f second' % (time.time() - tic) +
                            ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch-wise linear annealing."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    # Save model periodically
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)
def main(args):
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        """Set reproducible random seed."""
        tf.set_random_seed(1234)

        # `dataset_name`, `restore_model`, `ckpt` and `start_epoch` are assumed to be
        # defined at module scope in this variant.
        coord_add = cfg.get_coord_add(dataset_name)
        dataset_size = cfg.get_dataset_size_train(dataset_name)
        num_classes = cfg.get_num_classes(dataset_name)
        # Input creation must happen inside the same graph, otherwise it easily raises errors.
        create_inputs = cfg.get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay learning rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-1, global_step, num_batches_per_epoch, 0.95), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/gpu:0'):
                # batch_x = slim.batch_norm(batch_x, center=False, is_training=True, trainable=True)
                output, pose_out = net.build_arch(batch_x, coord_add, is_train=True,
                                                  num_classes=num_classes)
                # loss = net.cross_ent_loss(output, batch_labels)
                tf.logging.debug(pose_out.get_shape())
                loss, spread_loss, mse, _ = net.spread_loss(
                    output, pose_out, batch_x, batch_labels, m_op)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

            """Apply gradient."""
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        sess = tf.Session(config=session_config)
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=5)

        """Display parameters."""
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        #################### read snapshot ####################
        if restore_model:
            model_file = tf.train.latest_checkpoint(ckpt)
            saver.restore(sess, model_file)
        ####################       end      ####################

        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer."""
        if not os.path.exists(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/caps/{}/train_log/'.format(dataset_name),
            graph=sess.graph)  # graph=sess.graph, huge!

        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(start_epoch * num_batches_per_epoch,
                          cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            """TF queue pops batches until no file is left."""
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishes in ' % step +
                            '%f second' % (time.time() - tic) +
                            ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                """Write to summary."""
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch-wise linear annealing."""
                epoch_th = step / num_batches_per_epoch
                if ((epoch_th + 1) % 5) == 0:
                    print('epoch_th: %d ' % epoch_th)
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    """Save model periodically."""
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add, is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)

        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(
            tf.reshape(recon_img_squash, shape=[cfg.batch_size, data_size, data_size, 1]), 255.)
        orig_img = tf.reshape(batch_x, shape=[cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)

        saver = tf.train.Saver()

        step = 0
        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(45, 46):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                # Alternatively, comment out the line below to pin a fixed checkpoint:
                # ckpt = os.path.join(cfg.logdir, "caps/mnist/model-0.3764.ckpt-1718")
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1

                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay learning rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Define the dataflow graph."""
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_x_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False, is_training=True, trainable=True)
                output = net.build_arch_baseline(batch_x, is_train=True, num_classes=num_classes)
                loss, recon_loss, _ = net.cross_ent_loss(output, batch_x_squash, batch_labels)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('train_acc', acc)
                tf.summary.scalar('recon_loss', recon_loss)
                tf.summary.scalar('all_loss', loss)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

            """Apply gradient."""
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters."""
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer."""
        if not os.path.exists(cfg.logdir + '/cnn_baseline/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/cnn_baseline/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/cnn_baseline/{}/train_log/'.format(dataset_name),
            graph=sess.graph)

        """Main loop."""
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            """TF queue pops batches until no file is left."""
            try:
                _, loss_value, summary_str = sess.run([train_op, loss, summary_op])
                logger.info('%d iteration finishes in ' % step +
                            '%f second' % (time.time() - tic) +
                            ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                """Write to summary."""
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch-wise checkpointing."""
                if (step % num_batches_per_epoch) == 0:
                    """Save model periodically."""
                    ckpt_path = os.path.join(
                        cfg.logdir + '/cnn_baseline/{}'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        """Join threads."""
        coord.request_stop()
        coord.join(threads)
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    """Set reproducible random seed."""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)

    # Prepare training data
    (x_train, y_train), (x_test, y_test) = utils.load_mnist_excluded()

    with tf.Graph().as_default():  # , tf.device('/cpu:0'):
        # Placeholders for input data and the targets
        x_input = tf.placeholder(tf.float32, (None, *IMG_DIM), name='Input')
        y_target = tf.placeholder(tf.int32, [None, ], name='Target')

        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay learning rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable]):  # , device='/cpu:0'):
                sample_batch = tf.identity(x_input)
                batch_labels = tf.identity(y_target)
                batch_squash = tf.divide(sample_batch, 255.)
                batch_x = slim.batch_norm(sample_batch, center=False,
                                          is_training=True, trainable=True)
                output, pose_out = net.build_arch(batch_x, coord_add, is_train=True,
                                                  num_classes=num_classes)
                tf.logging.debug(pose_out.get_shape())

                loss, spread_loss, mse, reconstruction = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                sample_batch = tf.squeeze(sample_batch)
                decode_res_op = tf.concat([
                    sample_batch,
                    255 * tf.reshape(reconstruction, [cfg.batch_size, IMAGE_SIZE, IMAGE_SIZE])
                ], axis=0)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train__batch_acc', acc)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

            """Apply gradient."""
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters."""
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer."""
        if not os.path.exists(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/caps/{}/train_log/'.format(dataset_name),
            graph=sess.graph)  # graph=sess.graph, huge!
        if not os.path.exists(cfg.logdir + '/caps/{}/images/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/images/'.format(dataset_name))

        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        max_iter = cfg.epoch * num_batches_per_epoch + 1
        for step in range(max_iter):
            tic = time.time()
            """Feed a random training batch through the placeholders."""
            batch_x, batch_y = utils.get_random_mnist_batch(x_train, y_train, cfg.batch_size)
            try:
                _, loss_value, train_acc_val, summary_str, mse_value = sess.run(
                    [train_op, loss, acc, summary_op, mse],
                    feed_dict={m_op: m, x_input: batch_x, y_target: batch_y})
                sys.stdout.write(ERASE_LINE)
                sys.stdout.write('\r\r%d/%d iteration finishes in ' % (step, max_iter) +
                                 '%f second' % (time.time() - tic) +
                                 ' training accuracy = %f' % train_acc_val +
                                 ' loss=%f' % loss_value +
                                 '\treconstruction_loss=%f' % mse_value)
                sys.stdout.flush()
                time.sleep(0.001)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                """Write to summary."""
                if step % 10 == 0:
                    summary_writer.add_summary(summary_str, step)
                if step % 200 == 0:
                    images = sess.run(decode_res_op,
                                      feed_dict={m_op: m, x_input: batch_x, y_target: batch_y})
                    image = combine_images(images)
                    img_name = cfg.logdir + '/caps/{}/images/'.format(dataset_name) + \
                        "/step_{}.png".format(str(step))
                    Image.fromarray(image.astype(np.uint8)).save(img_name)

                """Epoch-wise linear annealing."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    """Save model periodically."""
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        ckpt_path = os.path.join(cfg.logdir + '/caps/{}/'.format(dataset_name),
                                 'final-model-{:.4f}.ckpt'.format(loss_value))
        saver.save(sess, ckpt_path, global_step=step)
        print('Training is finished!')
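# Minimal sketch of how these main(args) variants are usually wired up and invoked.
# This is an assumption, not part of the snippets above: tf.app.run() is the standard
# TF 1.x entry point that forwards sys.argv to main(), which matches args[1] being the
# dataset name and args[2] (where asserted) being the model name. The file names in the
# example invocations are hypothetical, except eval.py, which the error message mentions.
if __name__ == "__main__":
    tf.app.run()

# Example invocations (hypothetical script names):
#   python train.py mnist                 # train on MNIST
#   python eval.py mnist caps             # evaluate the capsule model
#   python eval.py mnist cnn_baseline     # evaluate the CNN baseline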