def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('resnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('resnet_depth={}\n'.format(FLAGS.resnet_depth))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    source = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    target = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    is_training = tf.placeholder('bool', [])
    par = tf.Variable(tf.constant(0.2), dtype=tf.float32)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    source_model = ResNetModel(source, is_training, depth=FLAGS.resnet_depth,
                               num_classes=FLAGS.num_classes)
    target_model = ResNetModel(target, is_training, reuse=True,
                               depth=FLAGS.resnet_depth,
                               num_classes=FLAGS.num_classes)

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=source_model.prob, labels=y)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([cross_entropy_mean] + regularization_losses)

    # domain_loss = tf.maximum(0.0001, KMMD(source_model.avg_pool, target_model.avg_pool))
    domain_loss = coral_loss(source_model.avg_pool, target_model.avg_pool)
    centers_update_op, discriminative_loss = CenterBased(source_model.avg_pool, y)
    # domain_loss = mmatch(source_model.avg_pool, target_model.avg_pool, 5)
    # domain_loss = log_coral_loss(source_model.adapt, target_model.adapt)
    loss = loss + 1 * par * domain_loss + 0.03 * discriminative_loss

    # train_op = model.optimize(FLAGS.learning_rate, train_layers)
    Varall = tf.trainable_variables()
    # print(Varall)
    trainable_var_names = ['weights', 'biases', 'beta', 'gamma']
    var_list = [
        v for v in tf.trainable_variables()
        if v.name.split(':')[0].split('/')[-1] in trainable_var_names
        and contains(v.name, train_layers)
    ]
    optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)  # .minimize(loss, var_list=var_list)

    # ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
    # tf.add_to_collection(UPDATE_OPS_COLLECTION, ema.apply([loss]))
    # batchnorm_updates = tf.get_collection(UPDATE_OPS_COLLECTION)
    # batchnorm_updates_op = tf.group(*batchnorm_updates)
    # train_op = tf.group(train_op, batchnorm_updates_op)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # total_op = tf.group(update_ops, centers_update_op)
    with tf.control_dependencies(update_ops):
        with tf.control_dependencies([centers_update_op]):
            train_op = optimizer.minimize(loss, var_list=var_list)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(source_model.prob, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()

    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None

    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.training_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[224, 224],
                                           horizontal_flip=False,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    target_preprocessor = BatchPreprocessor(dataset_file_path='../data/webcam.txt',
                                            num_classes=FLAGS.num_classes,
                                            output_size=[224, 224],
                                            shuffle=True)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[224, 224])

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    target_batches_per_epoch = np.floor(
        len(target_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    # train_batches_per_epoch = np.minimum(train_batches_per_epoch, target_batches_per_epoch)

    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        varall = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        source_model.load_original_weights(sess, skip_layers=train_layers)
        # target_model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(
            datetime.datetime.now(), tensorboard_dir))

        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1
            param = 2 / (1 + np.exp(-10 * epoch / FLAGS.num_epochs)) - 1
            print(param)
            sess.run(tf.assign(par, param))
            print(sess.run(par))

            # Start training
            while step < train_batches_per_epoch:
                if step % target_batches_per_epoch == 0:
                    target_preprocessor.reset_pointer()
                batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                batch_xt, batch_yt = target_preprocessor.next_batch(FLAGS.batch_size)
                sess.run(train_op, feed_dict={source: batch_xs,
                                              target: batch_xt,
                                              y: batch_ys,
                                              is_training: True})

                # Logging
                # if step % FLAGS.log_step == 0:
                #     s = sess.run(merged_summary, feed_dict={source: batch_xs, y: batch_ys, is_training: False})
                #     train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                step += 1

            # Epoch completed, start validation
            print("{} Start validation".format(datetime.datetime.now()))
            test_acc = 0.
            test_count = 0
            for _ in range(val_batches_per_epoch):
                batch_tx, batch_ty = val_preprocessor.next_batch(FLAGS.batch_size)
                acc = sess.run(accuracy, feed_dict={source: batch_tx,
                                                    y: batch_ty,
                                                    is_training: False})
                test_acc += acc
                test_count += 1
            test_acc /= test_count

            s = tf.Summary(value=[
                tf.Summary.Value(tag="validation_accuracy", simple_value=test_acc)
            ])
            val_writer.add_summary(s, epoch + 1)
            print("{} Validation Accuracy = {:.4f}".format(
                datetime.datetime.now(), test_acc))

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()
            target_preprocessor.reset_pointer()
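# `coral_loss` is referenced above but not defined in this excerpt. A minimal
# sketch, assuming the standard Deep CORAL formulation (Sun & Saenko, 2016):
# the squared Frobenius distance between the source and target feature
# covariances, scaled by 1 / (4 d^2). The repo's actual helper may differ.
def coral_loss(source_feature, target_feature):
    d = tf.cast(tf.shape(source_feature)[1], tf.float32)
    ns = tf.cast(tf.shape(source_feature)[0], tf.float32)
    nt = tf.cast(tf.shape(target_feature)[0], tf.float32)

    # Covariance of each feature batch (rows are samples).
    xs = source_feature - tf.reduce_mean(source_feature, axis=0, keepdims=True)
    xt = target_feature - tf.reduce_mean(target_feature, axis=0, keepdims=True)
    cov_s = tf.matmul(xs, xs, transpose_a=True) / (ns - 1)
    cov_t = tf.matmul(xt, xt, transpose_a=True) / (nt - 1)

    # Squared Frobenius norm of the covariance gap.
    return tf.reduce_sum(tf.square(cov_s - cov_t)) / (4.0 * d * d)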
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('vggnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    # flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    img_size = 256
    x = tf.placeholder(tf.float32, [FLAGS.batch_size, img_size, img_size, 3])
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    dropout_keep_prob = tf.placeholder(tf.float32)

    # Model
    # train_layers = FLAGS.train_layers.split(',')
    model = VggNetModel(num_classes=FLAGS.num_classes,
                        dropout_keep_prob=dropout_keep_prob)
    loss = model.loss(x, y)
    # train_op = model.optimize(FLAGS.learning_rate, train_layers)
    train_op = model.optimize(FLAGS.learning_rate)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()

    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Batch preprocessors
    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.training_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[img_size, img_size],
                                           horizontal_flip=True,
                                           shuffle=True)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[img_size, img_size])

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(
            datetime.datetime.now(), tensorboard_dir))

        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                sess.run(train_op, feed_dict={x: batch_xs,
                                              y: batch_ys,
                                              dropout_keep_prob: FLAGS.dropout_keep_prob})

                # Logging
                if step % FLAGS.log_step == 0:
                    s = sess.run(merged_summary, feed_dict={x: batch_xs,
                                                            y: batch_ys,
                                                            dropout_keep_prob: 1.})
                    train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                step += 1

            # Epoch completed, start validation
            print("{} Start validation".format(datetime.datetime.now()))
            test_acc = 0.
            test_count = 0
            for _ in range(val_batches_per_epoch):
                batch_tx, batch_ty = val_preprocessor.next_batch(FLAGS.batch_size, 1)
                acc = sess.run(accuracy, feed_dict={x: batch_tx,
                                                    y: batch_ty,
                                                    dropout_keep_prob: 1.})
                test_acc += acc
                test_count += 1
            test_acc /= test_count

            s = tf.Summary(value=[
                tf.Summary.Value(tag="validation_accuracy", simple_value=test_acc)
            ])
            val_writer.add_summary(s, epoch + 1)
            print("{} Validation Accuracy = {:.4f}".format(
                datetime.datetime.now(), test_acc))

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()

            print("{} Saving checkpoint of model...".format(datetime.datetime.now()))

            # Save checkpoint of the model
            checkpoint_path = os.path.join(
                checkpoint_dir, 'model_epoch' + str(epoch + 1) + '.ckpt')
            save_path = saver.save(sess, checkpoint_path)
            print("{} Model checkpoint saved at {}".format(
                datetime.datetime.now(), checkpoint_path))
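# The BatchPreprocessor instances throughout these scripts read plain-text
# dataset lists (e.g. FLAGS.training_file, '../data/webcam.txt'). Assuming the
# common "<image_path> <label_index>" one-line-per-sample convention used by
# this family of fine-tuning scripts (not verified against the preprocessor's
# source), such a list can be generated from a class-per-folder layout:
def write_dataset_file(image_root, out_path):
    # Each class lives in its own sub-folder; sorted folder order defines the label index.
    classes = sorted(os.listdir(image_root))
    with open(out_path, 'w') as f:
        for label, cls in enumerate(classes):
            cls_dir = os.path.join(image_root, cls)
            for fname in sorted(os.listdir(cls_dir)):
                f.write('{} {}\n'.format(os.path.join(cls_dir, fname), label))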
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('ft_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.train_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.train_root_dir):
        os.mkdir(FLAGS.train_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('model name: {}\n'.format(MODEL_NAME))
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('train_root_dir={}\n'.format(FLAGS.train_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    x = tf.placeholder(tf.float32, [None, 227, 227, 3], 'x')
    y = tf.placeholder(tf.float32, [None, NUM_CLASSES], 'y')
    decay_learning_rate = tf.placeholder(tf.float32)
    dropout_keep_prob = tf.placeholder(tf.float32)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = AlexNetModel(num_classes=NUM_CLASSES,
                         dropout_keep_prob=dropout_keep_prob)
    loss = model.get_loss(x, y)
    train_op = model.optimize(decay_learning_rate, train_layers)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1))
    correct = tf.reduce_sum(tf.cast(correct_pred, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initialize the FileWriter
    train_writer = tf.summary.FileWriter(tensorboard_dir + '/train')
    test_writer = tf.summary.FileWriter(tensorboard_dir + '/val')

    # Summaries
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None

    train_preprocessor = BatchPreprocessor(dataset_file_path=TRAINING_FILE,
                                           num_classes=NUM_CLASSES,
                                           output_size=[227, 227],
                                           horizontal_flip=True,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    val_preprocessor = BatchPreprocessor(dataset_file_path=VAL_FILE,
                                         num_classes=NUM_CLASSES,
                                         output_size=[227, 227],
                                         multi_scale=multi_scale,
                                         istraining=False)

    # Initialize a saver to store model checkpoints
    saver = tf.train.Saver()

    # Get the number of training steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    with tf.Session() as sess:
        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Add the model graph to TensorBoard
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        logger.info("Start training...")
        logger.info("tensorboard --logdir {}".format(tensorboard_dir))
        global_step = 0

        for epoch in range(FLAGS.num_epochs):
            # Reset the dataset pointers
            train_preprocessor.reset_pointer()
            step = 1

            while step < train_batches_per_epoch:
                global_step += 1
                rate = decay(FLAGS.learning_rate, global_step, MAX_STEP)
                batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                summary, loss_val, _ = sess.run(
                    [merged, model.loss, train_op],
                    feed_dict={x: batch_xs,
                               decay_learning_rate: rate,
                               y: batch_ys,
                               dropout_keep_prob: 0.5})
                train_writer.add_summary(summary, global_step)
                step += 1

                if global_step % 10 == 0:
                    logger.info("epoch {}, step {}, loss {:.6f}".format(
                        epoch, global_step, loss_val))

            test_acc = 0.
            test_count = 0
            for _ in range(len(val_preprocessor.labels)):
                batch_tx, batch_ty = val_preprocessor.next_batch(1)
                acc = sess.run(correct, feed_dict={x: batch_tx,
                                                   y: batch_ty,
                                                   dropout_keep_prob: 1.})
                test_acc += acc
                test_count += 1
            test_acc_ = test_acc / test_count

            s = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy", simple_value=test_acc_)
            ])
            test_writer.add_summary(s, global_step)
            logger.info("test accuracy: {:.4f}, {}/{}".format(test_acc_, test_acc, test_count))

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()

            # Save checkpoint of the model
            if global_step % 1000 == 0 and global_step > 0:
                logger.info("saving checkpoint of model")
                checkpoint_path = os.path.join(
                    checkpoint_dir, 'model_epoch' + str(global_step) + '.ckpt')
                saver.save(sess, checkpoint_path)
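# `decay` above is not defined in this excerpt. A minimal sketch, assuming the
# annealing schedule commonly paired with this kind of adversarial fine-tuning
# code, eta_p = eta_0 / (1 + 10 p)^0.75 with p = global_step / MAX_STEP; the
# repo's actual helper may use different constants.
def decay(start_rate, global_step, max_step):
    p = min(float(global_step) / max_step, 1.0)
    return start_rate / ((1.0 + 10.0 * p) ** 0.75)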
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('alexnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.train_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.train_root_dir):
        os.mkdir(FLAGS.train_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('train_root_dir={}\n'.format(FLAGS.train_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    x = tf.placeholder(tf.float32, [None, 227, 227, 3], 'x')
    xt = tf.placeholder(tf.float32, [None, 227, 227, 3], 'xt')
    y = tf.placeholder(tf.float32, [None, NUM_CLASSES], 'y')
    yt = tf.placeholder(tf.float32, [None, NUM_CLASSES], 'yt')
    adlamb = tf.placeholder(tf.float32)
    decay_learning_rate = tf.placeholder(tf.float32)
    dropout_keep_prob = tf.placeholder(tf.float32)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = AlexNetModel(num_classes=NUM_CLASSES,
                         dropout_keep_prob=dropout_keep_prob)
    loss = model.loss(x, y)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1))
    correct = tf.reduce_sum(tf.cast(correct_pred, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # G_loss, D_loss = model.wganloss(x, xt, FLAGS.batch_size, 10.0)
    G_loss, D_loss, sc, tc = model.adloss(x, xt, y, 10)
    target_correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(yt, 1))
    target_correct = tf.reduce_sum(tf.cast(target_correct_pred, tf.float32))
    target_accuracy = tf.reduce_mean(tf.cast(target_correct_pred, tf.float32))
    train_op = model.optimize(decay_learning_rate, train_layers, adlamb, sc, tc)
    D_op = model.adoptimize(decay_learning_rate, train_layers)
    optimizer = tf.group(train_op, D_op)

    train_writer = tf.summary.FileWriter('./log/tensorboard_restore')
    train_writer.add_graph(tf.get_default_graph())
    tf.summary.scalar('Testing Accuracy', target_accuracy)
    merged = tf.summary.merge_all()

    print('============================GLOBAL TRAINABLE VARIABLES ============================')
    print(tf.trainable_variables())
    # print('============================GLOBAL VARIABLES ======================================')
    # print(tf.global_variables())

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None
    print('==================== MULTI SCALE===================================================')
    print(multi_scale)

    train_preprocessor = BatchPreprocessor(dataset_file_path=TRAINING_FILE,
                                           num_classes=NUM_CLASSES,
                                           output_size=[227, 227],
                                           horizontal_flip=True,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    Ttrain_preprocessor = BatchPreprocessor(dataset_file_path=VAL_FILE,
                                            num_classes=NUM_CLASSES,
                                            output_size=[227, 227],
                                            horizontal_flip=True,
                                            shuffle=True,
                                            multi_scale=multi_scale)
    val_preprocessor = BatchPreprocessor(dataset_file_path=VAL_FILE,
                                         num_classes=NUM_CLASSES,
                                         output_size=[227, 227],
                                         multi_scale=multi_scale,
                                         istraining=False)

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    Ttrain_batches_per_epoch = np.floor(
        len(Ttrain_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        # model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same with checkpoint)
        saver.restore(sess, "./trained_mstn_model/10000.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(
            datetime.datetime.now(), tensorboard_dir))
        gs = 0
        gd = 0

        for epoch in range(FLAGS.num_epochs):
            # print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                gd += 1
                lamb = adaptation_factor(gd * 1.0 / MAX_STEP)
                rate = decay(FLAGS.learning_rate, gd, MAX_STEP)
                if gd % 1 == 0:
                    print("{} Start validation".format(datetime.datetime.now()))
                    test_acc = 0.
                    test_count = 0
                    for _ in range(len(val_preprocessor.labels)):
                        batch_tx, batch_ty = val_preprocessor.next_batch(1)
                        acc = sess.run(correct, feed_dict={x: batch_tx,
                                                           y: batch_ty,
                                                           dropout_keep_prob: 1.})
                        test_acc += acc
                        test_count += 1
                    print(test_acc, test_count)
                    test_acc /= test_count
                    print("{} Validation Accuracy = {:.4f}".format(
                        datetime.datetime.now(), test_acc))

                    # Reset the dataset pointers
                    val_preprocessor.reset_pointer()
                    return
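# `adaptation_factor` is also undefined in this excerpt. The first script in
# this section computes the same ramp-up inline, 2 / (1 + exp(-10 p)) - 1, so
# a sketch consistent with that usage:
def adaptation_factor(p):
    p = min(p, 1.0)
    return 2.0 / (1.0 + np.exp(-10.0 * p)) - 1.0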
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('resnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('resnet_depth={}\n'.format(FLAGS.resnet_depth))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    source = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    target = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    is_training = tf.placeholder('bool', [])
    dropout_rate = tf.placeholder(dtype=tf.float32, shape=None)
    domain_loss_param = tf.get_variable(name="domain_loss_param", dtype=tf.float32,
                                        initializer=tf.constant(1.0), trainable=False)
    target_loss_param = tf.get_variable(name='target_loss_param', dtype=tf.float32,
                                        initializer=tf.constant(0.0), trainable=False)
    logits_threshold = tf.get_variable(name='logits_threshold', dtype=tf.float32,
                                       initializer=tf.constant(0.0), trainable=False)
    ring_norm = tf.get_variable(name="fc/ring_norm", shape=None, dtype=tf.float32,
                                initializer=tf.constant(100.0), trainable=False)
    clustering_param = tf.get_variable(name='Ortho_loss_param', dtype=tf.float32,
                                       initializer=tf.constant(0.0), trainable=False)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    source_model = ResNetModel(source, is_training, depth=FLAGS.resnet_depth,
                               dropout_rate=dropout_rate, num_classes=FLAGS.num_classes)
    target_model = ResNetModel(target, is_training, reuse=True, depth=FLAGS.resnet_depth,
                               dropout_rate=dropout_rate, num_classes=FLAGS.num_classes)
    # fc_weights = tf.get_default_graph().get_tensor_by_name("fc/weights:0")
    # Orthogonal_regularizer = tf.reduce_mean(tf.norm(tf.matmul(tf.transpose(fc_weights), fc_weights) - tf.eye(FLAGS.num_classes), ord=2))
    # Grad_loss = GradRegularization(target_model.prob, target_model.avg_pool)

    ### Calculating the loss function
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=source_model.prob, labels=y)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    regularization_losses = tf.reduce_sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    source_loss = cross_entropy_mean + 1.0 * regularization_losses
    domain_loss = HoMM(source_model.adapt, target_model.adapt, order=4, num=300000)
    target_loss = Target_loss(tf.nn.softmax(target_model.prob), logits_threshold)
    centers_update_op, discriminative_loss, source_centers = CenterBased(
        source_model.adapt, y)
    target_feature, target_logits = SelectTargetSamples(
        target_model.adapt, target_model.prob, logits_threshold=0.75)
    target_predict_label = tf.argmax(target_logits, axis=1)
    target_pseudo_label = tf.one_hot(target_predict_label, FLAGS.num_classes)
    with tf.variable_scope('target'):
        centers_update_op_1, discriminative_clustering, target_centers = CenterBased(
            target_feature, target_pseudo_label)
    # class_domain_loss = AlignCenter(centers_update_op, centers_update_op_1)
    ring_loss = Cal_RingLoss(ring_norm, source_model.avg_pool, target_model.avg_pool)

    # office: 1000 0.01 0.0003  ## office-Home: 1000 0.01 0.001
    loss = source_loss + 200 * domain_loss_param * domain_loss \
        + clustering_param * discriminative_clustering

    # train_op = model.optimize(FLAGS.learning_rate, train_layers)
    Varall = tf.trainable_variables()
    # print(Varall)
    trainable_var_names = ['weights', 'biases', 'beta', 'gamma', 'adapt']
    # var_list_1 = [v for v in tf.trainable_variables() if v.name.split(':')[0].split('/')[-1] in trainable_var_names and contains(v.name, train_layers)]
    var_list_1 = [var for var in tf.trainable_variables() if 'scale5/block3' in var.name]
    var_list_2 = [var for var in tf.trainable_variables()
                  if 'fc' in var.name or 'adapt' in var.name]
    var_list_3 = [var for var in tf.trainable_variables() if 'scale5/block2' in var.name]
    Varall = tf.trainable_variables()

    optimizer1 = tf.train.AdamOptimizer(FLAGS.learning_rate)
    optimizer2 = tf.train.AdamOptimizer(learning_rate=0.0003)
    # optimizer3 = tf.train.AdamOptimizer(learning_rate=0.000005)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        with tf.control_dependencies([centers_update_op, centers_update_op_1]):
            op1 = optimizer1.minimize(loss, var_list=var_list_1)
            op2 = optimizer2.minimize(loss, var_list=var_list_2)
            # op3 = optimizer3.minimize(loss, var_list=var_list_3)
            train_op = tf.group(op1, op2)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(source_model.prob, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()

    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None

    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.training_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[224, 224],
                                           horizontal_flip=False,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    target_preprocessor = BatchPreprocessor(dataset_file_path='../data/webcam.txt',
                                            num_classes=FLAGS.num_classes,
                                            output_size=[224, 224],
                                            shuffle=True)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[224, 224])

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    target_batches_per_epoch = np.floor(
        len(target_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    # train_batches_per_epoch = np.minimum(train_batches_per_epoch, target_batches_per_epoch)

    with tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        varall = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        source_model.load_original_weights(sess, skip_layers=train_layers)
        # target_model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(
            datetime.datetime.now(), tensorboard_dir))

        Acc_convergency = []
        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                if step % target_batches_per_epoch == 0:
                    target_preprocessor.reset_pointer()
                batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                batch_xt, batch_yt = target_preprocessor.next_batch(FLAGS.batch_size)
                TotalLoss, SourceLoss, DomainLoss, TargetLoss, RingLoss, _ = sess.run(
                    fetches=[loss, source_loss, domain_loss, target_loss, ring_loss, train_op],
                    feed_dict={source: batch_xs, target: batch_xt, y: batch_ys,
                               is_training: True, dropout_rate: 1.0})

                # Print the losses
                print("Loss={} ### SourceLoss={} ### DomainLoss={} ### TargetLoss={} ### RingLoss={}".format(
                    TotalLoss, SourceLoss, DomainLoss, TargetLoss, RingLoss))

                # Logging
                # if step % FLAGS.log_step == 0:
                #     s = sess.run(merged_summary, feed_dict={source: batch_xs, y: batch_ys, is_training: False})
                #     train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                step += 1

            if epoch % 3 == 0:
                # Epoch completed, start validation
                print("{} Start validation".format(datetime.datetime.now()))
                test_acc = 0.
                test_count = 0
                for _ in range(val_batches_per_epoch):
                    batch_tx, batch_ty = val_preprocessor.next_batch(FLAGS.batch_size)
                    acc = sess.run(accuracy, feed_dict={source: batch_tx, y: batch_ty,
                                                        is_training: False, dropout_rate: 1.0})
                    test_acc += acc
                    test_count += 1
                test_acc /= test_count

                s = tf.Summary(value=[
                    tf.Summary.Value(tag="validation_accuracy", simple_value=test_acc)
                ])
                val_writer.add_summary(s, epoch + 1)
                print("{} Validation Accuracy = {:.4f}".format(
                    datetime.datetime.now(), test_acc))
                Acc_convergency.append(test_acc)
                print(Acc_convergency)

            if epoch == 100:
                sess.run(tf.assign(clustering_param, 0.0))

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()
            target_preprocessor.reset_pointer()

        # Log the convergency data
        savedata = np.array(Acc_convergency)
        np.save("AtoD_SDDA_Source.npy", savedata)
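# `SelectTargetSamples` above is not defined in this excerpt. A plausible
# sketch, assuming it keeps only target samples whose peak softmax confidence
# exceeds `logits_threshold` (the kept logits are then pseudo-labelled via
# argmax in the training code above); the repo's real helper may differ.
def SelectTargetSamples(features, logits, logits_threshold):
    probs = tf.nn.softmax(logits)
    mask = tf.reduce_max(probs, axis=1) > logits_threshold
    selected_features = tf.boolean_mask(features, mask)
    selected_logits = tf.boolean_mask(logits, mask)
    return selected_features, selected_logits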
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('vggnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Placeholders
    img_size = 256
    x = tf.placeholder(tf.float32, [FLAGS.batch_size, img_size, img_size, 3])
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    dropout_keep_prob = tf.placeholder(tf.float32)

    # Model
    # train_layers = FLAGS.train_layers.split(',')
    model = VggNetModel(num_classes=FLAGS.num_classes,
                        dropout_keep_prob=dropout_keep_prob)
    loss = model.loss(x, y)
    # train_op = model.optimize(FLAGS.learning_rate, train_layers)
    train_op = model.optimize(FLAGS.learning_rate)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()

    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Batch preprocessors
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[img_size, img_size])

    # Get the number of validation steps per epoch
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    test_accuracy = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Directly restore (your model should be exactly the same with checkpoint)
        saver.restore(sess, FLAGS.ckpt_path)

        print("{} Start testing...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(
            datetime.datetime.now(), tensorboard_dir))

        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Run the test set
            print("{} Start Test".format(datetime.datetime.now()))
            test_acc = 0.
            test_count = 0
            for _ in range(val_batches_per_epoch):
                batch_tx, batch_ty = val_preprocessor.next_batch(FLAGS.batch_size, 1)
                acc = sess.run(accuracy, feed_dict={x: batch_tx,
                                                    y: batch_ty,
                                                    dropout_keep_prob: 1.})
                test_acc += acc
                test_count += 1
            test_acc /= test_count
            print("{} Test Accuracy = {:.4f}".format(datetime.datetime.now(), test_acc))
            test_accuracy = test_acc

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.write('checkpoint_path={}\n'.format(FLAGS.ckpt_path))
    flags_file.write('test_accuracy={}'.format(test_accuracy))
    flags_file.close()
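# This evaluation-only variant restores FLAGS.ckpt_path and reports accuracy on
# FLAGS.val_file. A hypothetical invocation (the script name and exact flag
# spellings are assumed from the FLAGS referenced above, not taken from the
# repo):
#
#   python test_vggnet.py \
#       --ckpt_path training/vggnet_20200101_120000/checkpoint/model_epoch10.ckpt \
#       --val_file data/val.txt --num_classes 31 --batch_size 32 --num_epochs 1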
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('resnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('resnet_depth={}\n'.format(FLAGS.resnet_depth))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    x = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    is_training = tf.placeholder('bool', [])

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = ResNetModel(is_training, depth=FLAGS.resnet_depth,
                        num_classes=FLAGS.num_classes)
    loss = model.loss(x, y)
    train_op = model.optimize(FLAGS.learning_rate, train_layers)

    # Training accuracy of the model
    corr_pred = tf.argmax(model.prob, 1)
    correct_pred = tf.equal(tf.argmax(model.prob, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()

    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None

    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.training_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[224, 224],
                                           horizontal_flip=True,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[224, 224])

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print(" {} | Start training...".format(
            datetime.datetime.now().strftime("%H:%M:%S")))
        print(" {} | Open Tensorboard at --logdir {}".format(
            datetime.datetime.now().strftime("%H:%M:%S"), tensorboard_dir))

        # train_batches_per_epoch = 11
        # val_batches_per_epoch = 2

        for epoch in range(FLAGS.num_epochs):
            print(" {} | Epoch number: {}".format(
                datetime.datetime.now().strftime("%H:%M:%S"), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                batch_xs, batch_ys, ys = train_preprocessor.next_batch(FLAGS.batch_size)
                sess.run(train_op, feed_dict={x: batch_xs,
                                              y: batch_ys,
                                              is_training: True})

                # Logging
                if step % FLAGS.log_step == 0:
                    s = sess.run(merged_summary, feed_dict={x: batch_xs,
                                                            y: batch_ys,
                                                            is_training: False})
                    train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                    train_writer.flush()

                    batch_tx, batch_ty, ty = val_preprocessor.next_batch(FLAGS.batch_size)
                    v = sess.run(merged_summary, feed_dict={x: batch_tx,
                                                            y: batch_ty,
                                                            is_training: False})
                    val_writer.add_summary(v, epoch * train_batches_per_epoch + step)
                    val_writer.flush()
                step += 1

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()

            # Epoch completed, start validation
            print(" {} | Start validation".format(
                datetime.datetime.now().strftime("%H:%M:%S")))
            test_acc = 0.
            test_count = 0
            val_pred = []
            val_ty = []
            for _ in range(val_batches_per_epoch):
                batch_tx, batch_ty, ty = val_preprocessor.next_batch(FLAGS.batch_size)
                acc, pred = sess.run([accuracy, corr_pred],
                                     feed_dict={x: batch_tx,
                                                y: batch_ty,
                                                is_training: False})
                val_pred.extend(pred)
                val_ty.extend(ty)
                # Accumulate a confusion-matrix op per batch; the first batch creates it.
                try:
                    confusion += tf.confusion_matrix(labels=ty, predictions=pred,
                                                     num_classes=FLAGS.num_classes)
                except NameError:
                    confusion = tf.confusion_matrix(labels=ty, predictions=pred,
                                                    num_classes=FLAGS.num_classes)
                test_acc += acc
                test_count += 1
            test_acc /= test_count

            s = tf.Summary(value=[
                tf.Summary.Value(tag="val_accuracy", simple_value=test_acc)
            ])
            val_writer.add_summary(s, epoch * train_batches_per_epoch + step - 1)
            print(" {} | Validation Accuracy = {:.4f}".format(
                datetime.datetime.now().strftime("%H:%M:%S"), test_acc))

            # Confusion Matrix
            with tf.Session():
                conf_out = tf.Tensor.eval(confusion, feed_dict=None, session=None)
            conf_matrix = plot_confusion_matrix(correct_labels=val_ty,
                                                predict_labels=val_pred,
                                                labels=FLAGS.labels,
                                                tensor_name='Confusion Matrix')
            val_writer.add_summary(conf_matrix, epoch + 1)
            conf_matrix_norm = plot_confusion_matrix(correct_labels=val_ty,
                                                     predict_labels=val_pred,
                                                     labels=FLAGS.labels,
                                                     normalize=True,
                                                     tensor_name='Confusion Matrix Normalized')
            val_writer.add_summary(conf_matrix_norm, epoch + 1)

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()

            print(" {} | Saving checkpoint of model...".format(
                datetime.datetime.now().strftime("%H:%M:%S")))

            # Save checkpoint of the model
            checkpoint_path = os.path.join(
                checkpoint_dir, 'model_epoch' + str(epoch + 1) + '.ckpt')
            save_path = saver.save(sess, checkpoint_path)
            print(" {} | Model checkpoint saved at {}".format(
                datetime.datetime.now().strftime("%H:%M:%S"), checkpoint_path))
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('resnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('resnet_depth={}\n'.format(FLAGS.resnet_depth))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    x = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    y = tf.placeholder(tf.int32, [None, FLAGS.num_classes])
    y_label = tf.placeholder(tf.int32, [None])
    is_training = tf.placeholder('bool', [])
    # is_training = tf.constant(True, dtype=tf.bool)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = ResNetModel(is_training, depth=FLAGS.resnet_depth,
                        num_classes=FLAGS.num_classes)
    prob, avg_pool, _a, _b = model.inference(x)
    loss = model.loss(x, y)
    train_op = model.optimize(FLAGS.learning_rate, train_layers)

    # Training accuracy of the model
    softmax_prob = tf.nn.softmax(prob)
    predict_label = tf.argmax(softmax_prob, 1)
    correct_pred = tf.equal(tf.argmax(model.prob, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    # tf.summary.histogram('softmax_prob', softmax_prob)
    # tf.summary.histogram('fc_weights', _a)
    tf.summary.scalar('train_accuracy', accuracy)
    # tf.summary.histogram('avg_pool', avg_pool)
    merged_summary = tf.summary.merge_all()

    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver(max_to_keep=20)

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None
    # print(tf.trainable_variables())
    # assert 1 == 2

    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.train_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[224, 224],
                                           horizontal_flip=False,
                                           shuffle=True,
                                           multi_scale=multi_scale,
                                           alpha_label_smooth=FLAGS.alpha_label_smooth)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[224, 224],
                                         alpha_label_smooth=FLAGS.alpha_label_smooth)

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(
        len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(
        len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    # Assumed: a ConfigProto with GPU memory growth, matching the commented-out
    # gpu_options variant below; 'config' was otherwise undefined in this excerpt.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        if FLAGS.is_pretrain == '1':
            model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same with checkpoint)
        # saver.restore(sess, "../training/resnet_20200214_222540/checkpoint/model_epoch15.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(
            datetime.datetime.now(), tensorboard_dir))

        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                batch_xs, batch_ys, batch_y_label = train_preprocessor.next_batch(FLAGS.batch_size)
                ss_loss, _ = sess.run([loss, train_op],
                                      feed_dict={x: batch_xs, y: batch_ys,
                                                 is_training: True,
                                                 y_label: batch_y_label})

                # Logging
                if step % FLAGS.log_step == 0:
                    s = sess.run(merged_summary,
                                 feed_dict={x: batch_xs, y: batch_ys,
                                            is_training: False,
                                            y_label: batch_y_label})
                    train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                    print("{} Epoch number: {}, Step number: {}, Loss: {}".format(
                        datetime.datetime.now(), epoch + 1, step, ss_loss))
                step += 1

            # Epoch completed, start validation
            print("{} Start validation".format(datetime.datetime.now()))
            test_acc = 0.
            test_count = 0.001  # avoids division by zero when there are no batches
            test_y_list, test_y_pre_list = [], []
            for _ in range(val_batches_per_epoch):
                batch_tx, batch_ty, batch_ty_label = val_preprocessor.next_batch(FLAGS.batch_size)
                _prob, _predict, _softmax_prob, acc = sess.run(
                    [prob, predict_label, softmax_prob, accuracy],
                    feed_dict={x: batch_tx, y: batch_ty,
                               is_training: False, y_label: batch_ty_label})
                test_y_pre_list.extend(_predict)
                test_y_list.extend(batch_ty_label)
                # print(_predict, batch_ty_label, acc)
                test_acc += acc
                test_count += 1
            test_acc /= test_count

            s = tf.Summary(value=[
                tf.Summary.Value(tag="validation_total_accuracy", simple_value=test_acc),
                tf.Summary.Value(tag="validation_accuracy", simple_value=acc),
            ])
            val_writer.add_summary(s, epoch + 1)
            print("{} Validation Accuracy = {:.4f}".format(
                datetime.datetime.now(), test_acc))
            print("{} Validation F1 values:".format(datetime.datetime.now()),
                  classification_report(y_true=test_y_list, y_pred=test_y_pre_list))

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()

            print("{} Saving checkpoint of model...".format(datetime.datetime.now()))

            # Save checkpoint of the model
            checkpoint_path = os.path.join(
                checkpoint_dir, 'model_epoch' + str(epoch + 1) + '.ckpt')
            saver.save(sess, checkpoint_path)
            print("{} Model checkpoint saved at {}".format(
                datetime.datetime.now(), checkpoint_path))
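# The `alpha_label_smooth` argument above suggests the preprocessor smooths
# its one-hot targets. A sketch of the standard formulation (assumed, not
# taken from the preprocessor's source):
# y_smooth = (1 - alpha) * y_onehot + alpha / K.
def smooth_labels(one_hot_labels, alpha, num_classes):
    return (1.0 - alpha) * one_hot_labels + alpha / float(num_classes)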
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('resnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('resnet_depth={}\n'.format(FLAGS.resnet_depth))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    # x = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3], name='input')
    x = tf.placeholder(tf.float32, [None, 224, 224, 3], name='input')
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    is_training = tf.placeholder('bool', [], name='trainval')

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = ResNetModel(is_training, depth=FLAGS.resnet_depth, num_classes=FLAGS.num_classes)
    loss = model.loss(x, y)
    train_op = model.optimize(FLAGS.learning_rate, train_layers)

    # Link variable to model output
    predict = model.prob
    output = tf.nn.softmax(predict, name='output')

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.prob, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None

    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.training_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[224, 224],
                                           horizontal_flip=False,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[224, 224])

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        # model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same as the checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(datetime.datetime.now(), tensorboard_dir))

        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys, is_training: True})

                # Logging
                if step % FLAGS.log_step == 0:
                    train_loss, train_acc, s = sess.run(
                        [loss, accuracy, merged_summary],
                        feed_dict={x: batch_xs, y: batch_ys, is_training: False})
                    train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                    print("Iter {}/{}, training mini-batch loss = {:.5f}, training accuracy = {:.5f}".format(
                        step * FLAGS.batch_size, train_batches_per_epoch * FLAGS.batch_size,
                        train_loss, train_acc))
                step += 1

            # Epoch completed, start validation
            print("{} Start validation".format(datetime.datetime.now()))
            test_acc = 0.
            test_count = 0
            test_loss = 0
            t1 = time.time()
            for i in range(val_batches_per_epoch):
                batch_tx, batch_ty = val_preprocessor.next_batch(FLAGS.batch_size)
                val_loss, val_acc, val_out = sess.run(
                    [loss, accuracy, output],
                    feed_dict={x: batch_tx, y: batch_ty, is_training: False})
                test_acc += val_acc
                test_loss += val_loss
                test_count += 1
                y_true = np.argmax(batch_ty, 1)
                y_pre = np.argmax(val_out, 1)
                # print(len(y_true), len(y_pre))
                # for k in range(FLAGS.batch_size):
                #     if not (y_pre[k] == 0 or y_pre[k] == 1):
                #         y_pre[k] = 0
                # if i == 0:
                #     conf_matrix = confusion_matrix(y_true, y_pre)
                # else:
                #     conf_matrix += confusion_matrix(y_true, y_pre)
                # conf_matrix = confusion_matrix(y_true, y_pre)
                # print(i, conf_matrix)
                if i == 0:
                    all_pred_y = y_pre
                    all_real_y = y_true
                else:
                    all_pred_y = np.concatenate((all_pred_y, y_pre), axis=0)
                    all_real_y = np.concatenate((all_real_y, y_true), axis=0)
            test_acc /= test_count
            test_loss /= test_count
            t2 = time.time() - t1

            s = tf.Summary(value=[tf.Summary.Value(tag="validation_accuracy", simple_value=test_acc)])
            val_writer.add_summary(s, epoch + 1)
            print("{} Validation Accuracy = {:.4f}, loss = {:.4f}".format(
                datetime.datetime.now(), test_acc, test_loss))
            print("Test image {:.4f}ms per image".format(
                t2 * 1000 / (val_batches_per_epoch * FLAGS.batch_size)))

            conf_matrix = confusion_matrix(all_real_y, all_pred_y)
            print(conf_matrix.ravel())
            # y_batch_predict = np.zeros((val_batches_per_epoch * FLAGS.batch_size, FLAGS.num_classes))
            # for j in range(val_batches_per_epoch * FLAGS.batch_size):
            #     y_batch_predict[j][all_pred_y[j]] = 1
            class_report = classification_report(all_real_y, all_pred_y)
            print(class_report)

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()

            print("{} Saving checkpoint of model...".format(datetime.datetime.now()))
            # Save a checkpoint of the model
            checkpoint_path = os.path.join(checkpoint_dir, 'model_epoch' + str(epoch + 1) + '.ckpt')
            save_path = saver.save(sess, checkpoint_path)
            print("{} Model checkpoint saved at {}".format(datetime.datetime.now(), checkpoint_path))
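# ---------------------------------------------------------------------------
# Possible consolidation (not from the original scripts): every main() in this
# file repeats the same six isdir/mkdir guards. Under Python 3, os.makedirs
# with exist_ok collapses them into a loop; the helper name is illustrative
# and it assumes the module-level `import os` these scripts already use.
def make_train_dirs(train_dir):
    """Create the checkpoint/ and tensorboard/{train,val} layout for one run."""
    for sub in ('checkpoint',
                os.path.join('tensorboard', 'train'),
                os.path.join('tensorboard', 'val')):
        os.makedirs(os.path.join(train_dir, sub), exist_ok=True)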
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('alexnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.train_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.train_root_dir):
        os.mkdir(FLAGS.train_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('train_root_dir={}\n'.format(FLAGS.train_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.close()

    # Placeholders
    x = tf.placeholder(tf.float32, [None, 227, 227, 3], 'x')
    xt = tf.placeholder(tf.float32, [None, 227, 227, 3], 'xt')
    y = tf.placeholder(tf.float32, [None, NUM_CLASSES], 'y')
    yt = tf.placeholder(tf.float32, [None, NUM_CLASSES], 'yt')
    adlamb = tf.placeholder(tf.float32)
    decay_learning_rate = tf.placeholder(tf.float32)
    dropout_keep_prob = tf.placeholder(tf.float32)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = AlexNetModel(num_classes=NUM_CLASSES, dropout_keep_prob=dropout_keep_prob)
    loss = model.loss(x, y)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1))
    correct = tf.reduce_sum(tf.cast(correct_pred, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    G_loss, D_loss, sc, tc = model.adloss(x, xt, y, adlamb)
    target_correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(yt, 1))
    target_correct = tf.reduce_sum(tf.cast(target_correct_pred, tf.float32))
    target_accuracy = tf.reduce_mean(tf.cast(target_correct_pred, tf.float32))

    train_op = model.optimize(decay_learning_rate, train_layers, adlamb, sc, tc)
    D_op = model.adoptimize(decay_learning_rate, train_layers)
    optimizer = tf.group(train_op, D_op)

    train_writer = tf.summary.FileWriter('./log/tensorboard' + MODEL_NAME)
    train_writer.add_graph(tf.get_default_graph())
    tf.summary.scalar('Testing Accuracy', target_accuracy)
    merged = tf.summary.merge_all()

    print('============================GLOBAL TRAINABLE VARIABLES ============================')
    print(tf.trainable_variables())
    # print('============================GLOBAL VARIABLES ======================================')
    # print(tf.global_variables())

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None
    print('==================== MULTI SCALE===================================================')
    print(multi_scale)

    train_preprocessor = BatchPreprocessor(dataset_file_path=TRAINING_FILE,
                                           num_classes=NUM_CLASSES,
                                           output_size=[227, 227],
                                           horizontal_flip=True,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    Ttrain_preprocessor = BatchPreprocessor(dataset_file_path=VAL_FILE,
                                            num_classes=NUM_CLASSES,
                                            output_size=[227, 227],
                                            horizontal_flip=True,
                                            shuffle=True,
                                            multi_scale=multi_scale)
    val_preprocessor = BatchPreprocessor(dataset_file_path=VAL_FILE,
                                         num_classes=NUM_CLASSES,
                                         output_size=[227, 227],
                                         multi_scale=multi_scale,
                                         istraining=False)

    # Get the number of training/validation steps per epoch
    train_batches_per_epoch = np.floor(len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    Ttrain_batches_per_epoch = np.floor(len(Ttrain_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same as the checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(datetime.datetime.now(), tensorboard_dir))

        gs = 0  # global step over source/target batches
        gd = 0  # global number of generator/discriminator updates
        for epoch in range(FLAGS.num_epochs):
            # print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                gd += 1
                lamb = adaptation_factor(gd * 1.0 / MAX_STEP)
                rate = decay(FLAGS.learning_rate, gd, MAX_STEP)
                for it in range(1):
                    gs += 1
                    if gs % Ttrain_batches_per_epoch == 0:
                        Ttrain_preprocessor.reset_pointer()
                    if gs % train_batches_per_epoch == 0:
                        train_preprocessor.reset_pointer()
                    batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                    Tbatch_xs, Tbatch_ys = Ttrain_preprocessor.next_batch(FLAGS.batch_size)
                    summary, _ = sess.run([merged, optimizer],
                                          feed_dict={x: batch_xs, xt: Tbatch_xs, yt: Tbatch_ys,
                                                     adlamb: lamb, decay_learning_rate: rate,
                                                     y: batch_ys, dropout_keep_prob: 0.5})
                    train_writer.add_summary(summary, gd)
                    closs, gloss, dloss, gregloss, dregloss, floss, smloss = sess.run(
                        [model.loss, model.G_loss, model.D_loss, model.Gregloss,
                         model.Dregloss, model.F_loss, model.Semanticloss],
                        feed_dict={x: batch_xs, xt: Tbatch_xs, adlamb: lamb,
                                   decay_learning_rate: rate, y: batch_ys,
                                   dropout_keep_prob: 0.5})
                step += 1

                if gd % 50 == 0:
                    print('=================== Step {0:<10} ================='.format(gs))
                    print('Epoch {0:<5} Step {1:<5} Closs {2:<10} Gloss {3:<10} Dloss {4:<10} '
                          'Total_Loss {7:<10} Gregloss {5:<10} Dregloss {6:<10} Semloss {8:<10}'.format(
                              epoch, step, closs, gloss, dloss, gregloss, dregloss, floss, smloss))
                    print('lambda: ', lamb)
                    print('rate: ', rate)

                    # Validate every 50 training steps, one image at a time
                    print("{} Start validation".format(datetime.datetime.now()))
                    test_acc = 0.
                    test_count = 0
                    for _ in range(len(val_preprocessor.labels)):
                        batch_tx, batch_ty = val_preprocessor.next_batch(1)
                        acc = sess.run(correct, feed_dict={x: batch_tx, y: batch_ty,
                                                           dropout_keep_prob: 1.})
                        test_acc += acc
                        test_count += 1
                    print(test_acc, test_count)
                    test_acc /= test_count
                    print("{} Validation Accuracy = {:.4f}".format(datetime.datetime.now(), test_acc))

                    # Reset the dataset pointers
                    val_preprocessor.reset_pointer()
                    # train_preprocessor.reset_pointer()

                if gd % 5000 == 0 and gd > 0:
                    saver.save(sess, './log/mstnmodel_' + MODEL_NAME + str(gd) + '.ckpt')
                    print("{} Saving checkpoint of model...".format(datetime.datetime.now()))
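# ---------------------------------------------------------------------------
# Hedged sketch (not from the original scripts) of the schedule helpers the
# adversarial loop above calls. adaptation_factor is the standard
# 2 / (1 + exp(-10 p)) - 1 ramp used in GRL/DANN-style adaptation; decay() is
# an assumption modeled on the inverse-decay learning rate usually paired
# with it, and alpha/beta are illustrative defaults, not necessarily this
# repo's values. Assumes the module-level `import numpy as np`.
def adaptation_factor(p):
    """Ramp the adversarial loss weight from 0 toward 1 over training."""
    p = min(p, 1.0)
    return 2.0 / (1.0 + np.exp(-10.0 * p)) - 1.0

def decay(start_rate, global_step, max_step, alpha=10.0, beta=0.75):
    """Inverse-decay schedule: lr = lr0 / (1 + alpha * p) ** beta."""
    p = min(global_step * 1.0 / max_step, 1.0)
    return start_rate / (1.0 + alpha * p) ** beta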
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('resnet_%Y%m%d_%H%M%S')
    print(train_dir_name)
    train_dir = os.path.join(FLAGS.tensorboard_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    if not os.path.isdir(FLAGS.tensorboard_root_dir):
        os.mkdir(FLAGS.tensorboard_root_dir)
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    if not os.path.isdir(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    if not os.path.isdir(tensorboard_train_dir):
        os.mkdir(tensorboard_train_dir)
    if not os.path.isdir(tensorboard_val_dir):
        os.mkdir(tensorboard_val_dir)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    flags_file = open(flags_file_path, 'w')
    flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
    flags_file.write('resnet_depth={}\n'.format(FLAGS.resnet_depth))
    flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
    flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
    flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
    flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
    flags_file.write('tensorboard_root_dir={}\n'.format(FLAGS.tensorboard_root_dir))
    flags_file.write('log_step={}\n'.format(FLAGS.log_step))
    flags_file.write('gpus={}\n'.format(FLAGS.gpus))
    flags_file.close()

    # GPUs: restrict visibility before the session is created
    if FLAGS.gpus:
        print('Use Gpu: {}'.format(FLAGS.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpus

    # Placeholders
    x = tf.placeholder(tf.float32, [FLAGS.batch_size, 224, 224, 3])
    y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    is_training = tf.placeholder('bool', [])

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = ResNetModel(is_training, depth=FLAGS.resnet_depth, num_classes=FLAGS.num_classes)
    loss = model.loss(x, y)
    train_op = model.optimize(FLAGS.learning_rate, train_layers)

    # Training accuracy of the model
    correct_pred = tf.equal(tf.argmax(model.prob, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Summaries
    tf.summary.scalar('train_loss', loss)
    tf.summary.scalar('train_accuracy', accuracy)
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_train_dir)
    val_writer = tf.summary.FileWriter(tensorboard_val_dir)
    saver = tf.train.Saver()

    # Let the session grow GPU memory on demand
    gpu_options = tf.GPUOptions(allow_growth=True)

    # Batch preprocessors
    multi_scale = FLAGS.multi_scale.split(',')
    if len(multi_scale) == 2:
        multi_scale = [int(multi_scale[0]), int(multi_scale[1])]
    else:
        multi_scale = None

    train_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.training_file,
                                           num_classes=FLAGS.num_classes,
                                           output_size=[224, 224],
                                           horizontal_flip=True,
                                           shuffle=True,
                                           multi_scale=multi_scale)
    val_preprocessor = BatchPreprocessor(dataset_file_path=FLAGS.val_file,
                                         num_classes=FLAGS.num_classes,
                                         output_size=[224, 224])

    # Get the number of training/validation steps per epoch (rounded down)
    train_batches_per_epoch = np.floor(len(train_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)
    val_batches_per_epoch = np.floor(len(val_preprocessor.labels) / FLAGS.batch_size).astype(np.int16)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)

        # Load the pretrained weights
        # model.load_original_weights(sess, skip_layers=train_layers)

        # Directly restore (your model should be exactly the same as the checkpoint)
        # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt")

        print("{} Start training...".format(datetime.datetime.now()))
        print("{} Open Tensorboard at --logdir {}".format(datetime.datetime.now(), tensorboard_dir))

        for epoch in range(FLAGS.num_epochs):
            print("{} Epoch number: {}".format(datetime.datetime.now(), epoch + 1))
            step = 1

            # Start training
            while step < train_batches_per_epoch:
                batch_xs, batch_ys = train_preprocessor.next_batch(FLAGS.batch_size)
                sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys, is_training: True})

                # Logging
                if step % FLAGS.log_step == 0:
                    s = sess.run(merged_summary,
                                 feed_dict={x: batch_xs, y: batch_ys, is_training: False})
                    train_writer.add_summary(s, epoch * train_batches_per_epoch + step)
                if step % 10 == 0:
                    train_loss_num, train_acc_num = sess.run(
                        [loss, accuracy],
                        feed_dict={x: batch_xs, y: batch_ys, is_training: False})
                    print('{} step:{} loss: {} acc: {}'.format(
                        datetime.datetime.now(), step, train_loss_num, train_acc_num))
                step += 1

            # Epoch completed, start validation
            print("{} Start validation".format(datetime.datetime.now()))
            test_acc = 0.
            test_count = 0
            for _ in range(val_batches_per_epoch):
                batch_tx, batch_ty = val_preprocessor.next_batch(FLAGS.batch_size)
                acc = sess.run(accuracy, feed_dict={x: batch_tx, y: batch_ty, is_training: False})
                test_acc += acc
                test_count += 1
            test_acc /= test_count

            # Write the epoch-level validation accuracy as a manual summary
            # so `s` does not fall through from the last training summary
            s = tf.Summary(value=[
                tf.Summary.Value(tag="validation_accuracy", simple_value=test_acc)
            ])
            val_writer.add_summary(s, epoch + 1)
            print("{} Validation Accuracy = {:.4f}".format(datetime.datetime.now(), test_acc))

            # Reset the dataset pointers
            val_preprocessor.reset_pointer()
            train_preprocessor.reset_pointer()

            print("{} Saving checkpoint of model...".format(datetime.datetime.now()))
            # Save a checkpoint of the model
            checkpoint_path = os.path.join(checkpoint_dir, 'model_epoch' + str(epoch + 1) + '.ckpt')
            save_path = saver.save(sess, checkpoint_path)
            print("{} Model checkpoint saved at {}".format(datetime.datetime.now(), checkpoint_path))
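# ---------------------------------------------------------------------------
# Minimal restore sketch (not from the original scripts): loading one of the
# model_epochN.ckpt files written above and running inference. Assumes the
# same graph-building code as main() has already executed in this process,
# so `model`, `x`, and `is_training` exist as globals; the function name is
# illustrative.
def predict_batch(checkpoint_path, images):
    """Restore a checkpoint and return the predicted class per image."""
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        probs = sess.run(model.prob, feed_dict={x: images, is_training: False})
    return np.argmax(probs, axis=1)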