def main(model_config, train_config): os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu() # Create training directory which will be used to save: configurations, model files, TensorBoard logs train_dir = train_config['train_dir'] if not osp.isdir(train_dir): logging.info('Creating training directory: %s', train_dir) mkdir_p(train_dir) g = tf.Graph() with g.as_default(): # Set fixed seed for reproducible experiments random.seed(train_config['seed']) np.random.seed(train_config['seed']) tf.set_random_seed(train_config['seed']) # Build the training and validation model model = BiseNet(model_config, train_config, num_classes, mode="train") model.build() model_va = BiseNet(model_config, train_config, num_classes, mode="validation") model_va.build(reuse=True) # Save configurations for future reference save_cfgs(train_dir, model_config, train_config) learning_rate = _configure_learning_rate(train_config, model.global_step) optimizer = _configure_optimizer(train_config, learning_rate) tf.summary.scalar('learning_rate', learning_rate) # Set up the training ops update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = tf.contrib.layers.optimize_loss(loss=model.total_loss, global_step=model.global_step, learning_rate=learning_rate, optimizer=optimizer, clip_gradients=train_config['clip_gradients'], learning_rate_decay_fn=None, summaries=['learning_rate']) # t_vars = tf.trainable_variables() # r_vars = [var for var in t_vars if "regression" in var.name] # # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # gen_updates = [op for op in update_ops if "regression" in op.name] # # with tf.control_dependencies(gen_updates): # train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(model.loss, var_list=r_vars) saver = tf.train.Saver(tf.global_variables(), max_to_keep=train_config['max_checkpoints_to_keep']) summary_writer = tf.summary.FileWriter(train_dir, g) summary_op = tf.summary.merge_all() global_variables_init_op = tf.global_variables_initializer() local_variables_init_op = tf.local_variables_initializer() g.finalize() # Finalize graph to avoid adding ops by mistake # Dynamically allocate GPU memory gpu_options = tf.GPUOptions(allow_growth=True) sess_config = tf.ConfigProto(gpu_options=gpu_options) sess = tf.Session(config=sess_config) model_path = tf.train.latest_checkpoint(train_config['train_dir']) if not model_path: sess.run(global_variables_init_op) sess.run(local_variables_init_op) start_step = 0 if model_config['frontend_config']['pretrained_dir'] and model.init_fn: model.init_fn(sess) else: logging.info('Restore from last checkpoint: {}'.format(model_path)) sess.run(local_variables_init_op) saver.restore(sess, model_path) start_step = tf.train.global_step(sess, model.global_step.name) + 1 # Training loop data_config = train_config['train_data_config'] total_steps = int(data_config['epoch'] * data_config['num_examples_per_epoch'] / data_config['batch_size']) logging.info('Train for {} steps'.format(total_steps)) for step in range(start_step, total_steps): start_time = time.time() _, predict_loss, loss = sess.run([train_op, model.loss, model.total_loss]) duration = time.time() - start_time if step % 10 == 0: examples_per_sec = data_config['batch_size'] / float(duration) time_remain = data_config['batch_size'] * (total_steps - step) / examples_per_sec m, s = divmod(time_remain, 60) h, m = divmod(m, 60) format_str = ('%s: step %d, total loss = %.2f, predict loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch; %dh:%02dm:%02ds 
remains)')
                logging.info(format_str % (datetime.now(), step, loss, predict_loss,
                                           examples_per_sec, duration, h, m, s))

            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % train_config['save_model_every_n_step'] == 0 or (step + 1) == total_steps:
                checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
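# `_configure_learning_rate` and `_configure_optimizer` are called above but are not
# shown in this excerpt. A minimal sketch of what such helpers could look like,
# assuming the train config carries 'learning_rate', 'decay_steps', 'decay_factor'
# and 'optimizer' entries (these key names are illustrative, not the project's schema):
def _configure_learning_rate(train_config, global_step):
    # Exponentially decayed learning rate driven by the global step.
    return tf.train.exponential_decay(train_config['learning_rate'],
                                      global_step,
                                      train_config['decay_steps'],
                                      train_config['decay_factor'],
                                      staircase=True)


def _configure_optimizer(train_config, learning_rate):
    # Select an optimizer by name; an Optimizer instance is what gets passed to
    # tf.contrib.layers.optimize_loss above.
    name = train_config.get('optimizer', 'MOMENTUM').upper()
    if name == 'ADAM':
        return tf.train.AdamOptimizer(learning_rate)
    if name == 'SGD':
        return tf.train.GradientDescentOptimizer(learning_rate)
    return tf.train.MomentumOptimizer(learning_rate, momentum=0.9)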
def main(): prince = True parser = argparse.ArgumentParser() parser.add_argument('--z_dim', type=int, default=10, help='Noise dimension') parser.add_argument('--t_dim', type=int, default=64, help='Text feature dimension') parser.add_argument('--batch_size', type=int, default=64, help='Batch Size') parser.add_argument('--image_size', type=int, default=64, help='Image Size a, a x a') parser.add_argument('--gf_dim', type=int, default=44, help='Number of conv in the first layer gen.') parser.add_argument('--df_dim', type=int, default=54, help='Number of conv in the first layer discr.') parser.add_argument( '--gfc_dim', type=int, default=1024, help='Dimension of gen untis for for fully connected layer 1024') parser.add_argument('--caption_vector_length', type=int, default=556, help='Caption Vector Length') parser.add_argument('--data_dir', type=str, default="Data", help='Data Directory') parser.add_argument('--beta1', type=float, default=.5, help='Momentum for Adam Update') parser.add_argument('--data_set', type=str, default="flowers", help='Data set: MS-COCO, flowers') args = parser.parse_args() save_epoch = [10, 20] gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.54) cols = ['epoch_' + str(i) for i in save_epoch] df_loss = pd.DataFrame(index=cols, columns=cols) for i in range(len(save_epoch)): for j in range((len(save_epoch))): modelFile1 = "./Data/ModelEval/scratch_checkpoint_epoch_%d.ckpt" % ( save_epoch[i]) modelFile2 = "./Data/ModelEval/Feat_checkpoint_epoch_%d.ckpt" % ( save_epoch[j]) model_options = { 'z_dim': args.z_dim, 't_dim': args.t_dim, 'batch_size': args.batch_size, 'image_size': args.image_size, 'gf_dim': args.gf_dim, 'df_dim': args.df_dim, 'gfc_dim': args.gfc_dim, 'caption_vector_length': args.caption_vector_length } gan1 = model1.GAN(model_options) input_tensors1, _, _, outputs1 = gan1.build_model( args.beta1, .9, 1e-4) sess1 = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) if prince: sess1.run(tf.global_variables_initializer()) else: tf.initialize_all_variables().run() saver = tf.train.Saver() # Restore the first model: saver.restore(sess1, modelFile1) loaded_data = load_training_data(args.data_dir, args.data_set) batch_no = 0 _, _, caption_vectors, z_noise, _ = get_training_batch( batch_no, args.batch_size, args.image_size, args.z_dim, args.caption_vector_length, 'train', args.data_dir, args.data_set, loaded_data) # Get output image from first model g1_img3 = sess1.run(outputs1['img3'], feed_dict={ input_tensors1['t_real_caption']: caption_vectors, input_tensors1['t_z']: z_noise, input_tensors1['noise_indicator']: 0, input_tensors1['noise_gen']: 0 }) g1_d1_p_3_gen_img_logit, g1_d1_p_3_gen_txt_logit = sess1.run( [ outputs1['output_p_3_gen_img_logit'], outputs1['output_p_3_gen_txt_logit'] ], feed_dict={ input_tensors1['t_real_caption']: caption_vectors, input_tensors1['t_z']: z_noise, input_tensors1['noise_indicator']: 0, input_tensors1['gen_image1']: g1_img3, input_tensors1['noise_disc']: 0, input_tensors1['noise_gen']: 0 }) #print('g1_d1_p_3_gen_img_logit:') #print(g1_d1_p_3_gen_img_logit) g1_d1_loss = cross_entropy( g1_d1_p_3_gen_img_logit, np.ones( (args.batch_size, 1))) + cross_entropy( g1_d1_p_3_gen_txt_logit, np.ones((args.batch_size, 1))) tf.reset_default_graph() sess1.close() # Create second model model_options = { 'z_dim': args.z_dim, 't_dim': args.t_dim, 'batch_size': args.batch_size, 'image_size': args.image_size, 'gf_dim': 16, 'df_dim': 16, 'gfc_dim': args.gfc_dim, 'caption_vector_length': args.caption_vector_length } gan2 = 
model2.GAN(model_options) input_tensors2, _, _, outputs2 = gan2.build_model( args.beta1, .9, 1e-4) sess2 = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) if prince: sess2.run(tf.global_variables_initializer()) else: tf.initialize_all_variables().run() saver2 = tf.train.Saver() saver2.restore(sess2, modelFile2) # Get logits from the second model g1_d2_p_3_gen_img_logit, g1_d2_p_3_gen_txt_logit = sess2.run( [ outputs2['output_p_3_gen_img_logit'], outputs2['output_p_3_gen_txt_logit'] ], feed_dict={ input_tensors2['t_real_caption']: caption_vectors, input_tensors2['t_z']: z_noise, input_tensors2['noise_indicator']: 0, input_tensors2['gen_image1']: g1_img3, input_tensors2['noise_disc']: 0, input_tensors2['noise_gen']: 0 }) g1_d2_loss = cross_entropy( g1_d2_p_3_gen_img_logit, np.ones( (args.batch_size, 1))) + cross_entropy( g1_d2_p_3_gen_txt_logit, np.ones((args.batch_size, 1))) # Get output image from second model g2_img3 = sess2.run(outputs2['img3'], feed_dict={ input_tensors2['t_real_caption']: caption_vectors, input_tensors2['t_z']: z_noise, input_tensors2['noise_indicator']: 0, input_tensors2['noise_gen']: 0 }) # Get logits from the second model g2_d2_p_3_gen_img_logit, g2_d2_p_3_gen_txt_logit = sess2.run( [ outputs2['output_p_3_gen_img_logit'], outputs2['output_p_3_gen_txt_logit'] ], feed_dict={ input_tensors2['t_real_caption']: caption_vectors, input_tensors2['t_z']: z_noise, input_tensors2['noise_indicator']: 0, input_tensors2['gen_image1']: g2_img3, input_tensors2['noise_disc']: 0, input_tensors2['noise_gen']: 0 }) g2_d2_loss = cross_entropy( g2_d2_p_3_gen_img_logit, np.ones( (args.batch_size, 1))) + cross_entropy( g2_d2_p_3_gen_txt_logit, np.ones((args.batch_size, 1))) tf.reset_default_graph() sess2.close() model_options = { 'z_dim': args.z_dim, 't_dim': args.t_dim, 'batch_size': args.batch_size, 'image_size': args.image_size, 'gf_dim': args.gf_dim, 'df_dim': args.df_dim, 'gfc_dim': args.gfc_dim, 'caption_vector_length': args.caption_vector_length } gan1 = model1.GAN(model_options) input_tensors1, _, _, outputs1 = gan1.build_model( args.beta1, .9, 1e-4) sess1 = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) if prince: sess1.run(tf.global_variables_initializer()) else: tf.initialize_all_variables().run() saver = tf.train.Saver() saver.restore(sess1, modelFile1) # Get logits from the first model g2_d1_p_3_gen_img_logit, g2_d1_p_3_gen_txt_logit = sess1.run( [ outputs1['output_p_3_gen_img_logit'], outputs1['output_p_3_gen_txt_logit'] ], feed_dict={ input_tensors1['t_real_caption']: caption_vectors, input_tensors1['t_z']: z_noise, input_tensors1['noise_indicator']: 0, input_tensors1['gen_image1']: g2_img3, input_tensors1['noise_disc']: 0, input_tensors1['noise_gen']: 0 }) #print('g1_d1_p_3_gen_img_logit:') #print(g1_d1_p_3_gen_img_logit) #print('g2_d1_p_3_gen_img_logit:') #print(g2_d1_p_3_gen_img_logit) tf.reset_default_graph() sess1.close() g2_d1_loss = cross_entropy( g2_d1_p_3_gen_img_logit, np.ones( (args.batch_size, 1))) + cross_entropy( g2_d1_p_3_gen_txt_logit, np.ones((args.batch_size, 1))) g1_wins_on_d2 = 0 g2_wins_on_d1 = 0 for idx in range(g2_d1_loss.shape[0]): # Compare loss on disc 1 if g1_d1_loss[idx][0] > g2_d1_loss[idx][0]: g2_wins_on_d1 += 1 print(g2_d1_loss[idx][0], '<', g1_d1_loss[idx][0], 'g2 wins on d1') else: print(g2_d1_loss[idx][0], '>', g1_d1_loss[idx][0], 'g1 wins on d1') # Compare loss on disc 2 if g1_d2_loss[idx][0] < g2_d2_loss[idx][0]: g1_wins_on_d2 += 1 print(g1_d2_loss[idx][0], '<', g2_d2_loss[idx][0], 'g1 wins on d2') else: 
                    print(g1_d2_loss[idx][0], '>', g2_d2_loss[idx][0],
                          'g2 wins on d2')

            df_loss.loc[cols[i], cols[j]] = str(g2_wins_on_d1) + '/' + str(g1_wins_on_d2)

    df_loss.to_csv('scratch_Feat.csv')
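# `cross_entropy` above is a NumPy helper that is not defined in this excerpt. A
# minimal sketch of what it might look like, assuming it computes element-wise
# sigmoid cross-entropy between raw discriminator logits and the target labels
# (it reuses the `np` alias this script already imports):
def cross_entropy(logits, labels):
    # Numerically stable form, matching tf.nn.sigmoid_cross_entropy_with_logits:
    # max(x, 0) - x * z + log(1 + exp(-|x|))
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(labels, dtype=np.float64)
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))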
    'filter_sizes': [2, 3, 4, 5],
    'num_filters': 128,
    'keep_prob': 0.9,
    'l2_reg_lambda': 0.1,
    'batch_size': 64,
    'num_epochs': 40,
    'allow_soft_placement': True,
    'log_device_placement': False
}

if __name__ == '__main__':
    trainx, trainy, testx, testy, vocab = data_helpers.data_process(FLAGS)
    conf = tf.ConfigProto(allow_soft_placement=True,
                          log_device_placement=False,
                          gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Graph().as_default() as graph, tf.Session(config=conf) as sess:
        model = TextCNN(vocab.embedding, FLAGS)
        saver = tf.train.Saver(max_to_keep=20)
        sess.run(tf.global_variables_initializer())
        stime = time.time()
        eval_acc = []
        for epoch in range(FLAGS['num_epochs']):
            # train
            for batch in data_helpers.batch_iter(list(zip(trainx, trainy)),
                                                 FLAGS['batch_size'], 1):
                x_batch, y_batch = zip(*batch)
                step, loss, accuracy = model.train_step(sess, x_batch, y_batch)
                if (step + 1) % 200 == 0:
def model(self, train_file, log_path): trainX, trainY, testX, testY = self.read_data_disease(train_file) print("trainX.shape: ", trainX.shape, trainY.shape, testX.shape, testY.shape) trainX, trainY = self.next_batch(trainX, trainY, trainX.shape[0], trainX.shape[0]) # test small size train set. # trainX = trainX[0:300] # trainY = trainY[0:300] cnt = 0 for i in testY: if i == 0: cnt += 1 print("minimal precision: ", float(cnt) / testY.shape[0]) testX, testY = self.next_batch(testX, testY, testX.shape[0], testY.shape[0]) self.keep_prob = tf.placeholder(tf.float32) self.x_ = tf.placeholder(tf.float32, [None, 700 * 700 * 3]) # shape [None, 128] self.x = tf.reshape(self.x_, [-1, 700, 700, 3]) self.W_conv1 = self.weight_variable([5, 5, 3, 8]) self.b_conv1 = self.bias_variable([8]) self.W_conv2 = self.weight_variable([3, 3, 8, 32]) self.b_conv2 = self.bias_variable([32]) self.W_conv3 = self.weight_variable([5, 5, 32, 32]) self.b_conv3 = self.bias_variable([32]) # attr self.W_conv4_attr = self.weight_variable([3, 3, 32, 16]) self.b_conv4_attr = self.bias_variable([16]) self.W_conv5_attr = self.weight_variable([3, 3, 16, 8]) self.b_conv5_attr = self.bias_variable([8]) self.W_fc1_attr = self.weight_variable([7 * 7 * 8, 36]) # 288 * 36 self.b_fc1_attr = self.bias_variable([36]) self.W_fc2 = self.weight_variable([36, 3]) self.b_fc2 = self.bias_variable([3]) self.y_ = tf.placeholder(tf.float32, [None, 3]) # shape [None, 250] self.y_conv = tf.nn.softmax(self.my_image_filter(self.x)) # self.cross_entropy = tf.reduce_mean(-tf.reduce_sum(self.y_ * tf.log(self.y_conv + 1e-10), reduction_indices=[1])) self.cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=self.y_conv, labels=self.y_)) tf.summary.scalar("cross_entropy", self.cross_entropy) self.learning_rate = tf.placeholder(tf.float32, shape=[]) tf.summary.scalar("learning_rate", self.learning_rate) # self.train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.cross_entropy) self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize( self.cross_entropy) self.correct_prediction = tf.equal(tf.arg_max(self.y_conv, 1), tf.arg_max(self.y_, 1)) self.accuracy = tf.reduce_mean( tf.cast(self.correct_prediction, tf.float32)) tf.summary.scalar("accuracy", self.accuracy) # init_op = tf.initialize_all_variables() init_op = tf.global_variables_initializer() self.merged_summary_op = tf.summary.merge_all() drops = [1.0] # overfitting. global_step = 101 for d in range(len(drops)): drop = drops[d] log_path = log_path + str(d) print("log_path: ", log_path, " drop:", drop) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) # sess = tf.Session() sess.run(init_op) summary_writer_train = tf.summary.FileWriter( log_path + "train", graph=tf.get_default_graph()) summary_writer_test = tf.summary.FileWriter( log_path + "test", graph=tf.get_default_graph()) num_examples = trainX.shape[0] minibatch = 60 maxc = -1.0 for epoch in range(global_step): print("iter:", epoch) avg_cost = 0. 
total_batch = int(num_examples / minibatch) # total_batch = 1 rate = 0.001 * math.pow(0.7, int(epoch / 3)) print("learning rate: ", rate) for i in range(total_batch): batch_xs, batch_ys = self.next_batch_train( trainX, trainY, minibatch, num_examples) # print(batch_xs.shape, batch_ys.shape) # _, c, summary = sess.run([self.train_step, self.cross_entropy, self.merged_summary_op], feed_dict={self.x_: batch_xs, self.y_: batch_ys, self.learning_rate: rate, self.keep_prob: drop}) _, c, summary = sess.run( [ self.train_step, self.cross_entropy, self.merged_summary_op ], feed_dict={ self.x_: batch_xs, self.y_: batch_ys, self.learning_rate: rate, self.keep_prob: drop }) summary_writer_train.add_summary(summary, epoch * total_batch + i) avg_cost += c / total_batch if i % 1 == 0: print("i/tot: ", i, "/", total_batch, " current c:", c, " ave_cost:", avg_cost) # # test batch_xs, batch_ys = self.next_batch_train( testX, testY, minibatch, len(testX)) pre, summary = sess.run( [self.accuracy, self.merged_summary_op], feed_dict={ self.x_: batch_xs, self.y_: batch_ys, self.learning_rate: rate, self.keep_prob: drop }) summary_writer_test.add_summary(summary, epoch * total_batch + i) if pre > maxc: maxc = pre # test # if epoch % 1 == 0: # batch_xs, batch_ys = self.next_batch_train(testX, testY, minibatch, len(testX)) # pre, summary = sess.run([self.accuracy, self.merged_summary_op], feed_dict={self.x_: batch_xs, self.y_: batch_ys, self.learning_rate: rate, self.keep_prob: drop}) # summary_writer_test.add_summary(summary, epoch * total_batch) # # print("precision: ", pre) # if pre > maxc: # maxc = pre print("max precision: ", maxc)
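# Note on the loss defined above: `tf.nn.softmax_cross_entropy_with_logits` expects raw,
# unnormalized logits, but `self.y_conv` has already been passed through `tf.nn.softmax`,
# so the probabilities are normalized twice; `tf.arg_max` is also a deprecated alias of
# `tf.argmax`. A minimal, self-contained sketch of the usual pattern (illustrative names,
# not this class's attributes):
def classification_loss(logits, labels):
    """Cross-entropy on raw logits; softmax is applied only for prediction."""
    probabilities = tf.nn.softmax(logits)  # used for reporting / argmax only
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    correct = tf.equal(tf.argmax(probabilities, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    return loss, accuracy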
def main(): #--------------------------------------------------------------------------- # Parse the commandline #--------------------------------------------------------------------------- parser = argparse.ArgumentParser(description='Train the SSD') parser.add_argument('--name', default='test', help='project name') parser.add_argument('--gpu', type=bool, default=True, help='gpu visibility to env') parser.add_argument('--data-dir', default='pascal-voc', help='data directory') parser.add_argument('--vgg-dir', default='vgg_graph', help='directory for the VGG-16 model') parser.add_argument('--epochs', type=int, default=200, help='number of training epochs') parser.add_argument('--batch-size', type=int, default=8, help='batch size') parser.add_argument('--tensorboard-dir', default="tb", help='name of the tensorboard data directory') parser.add_argument('--checkpoint-interval', type=int, default=5, help='checkpoint interval') parser.add_argument('--lr-values', type=str, default='0.00075;0.0001;0.00001', help='learning rate values') parser.add_argument('--lr-boundaries', type=str, default='320000;400000', help='learning rate chage boundaries (in batches)') parser.add_argument('--momentum', type=float, default=0.9, help='momentum for the optimizer') parser.add_argument('--weight-decay', type=float, default=0.0005, help='L2 normalization factor') parser.add_argument('--continue-training', type=str2bool, default='False', help='continue training from the latest checkpoint') parser.add_argument('--num-workers', type=int, default=mp.cpu_count(), help='number of parallel generators') args = parser.parse_args() print('[i] Project name: ', args.name) print('[i] GPU visibility ', args.gpu) print('[i] Data directory: ', args.data_dir) print('[i] VGG directory: ', args.vgg_dir) print('[i] # epochs: ', args.epochs) print('[i] Batch size: ', args.batch_size) print('[i] Tensorboard directory:', args.tensorboard_dir) print('[i] Checkpoint interval: ', args.checkpoint_interval) print('[i] Learning rate values: ', args.lr_values) print('[i] Learning rate boundaries: ', args.lr_boundaries) print('[i] Momentum: ', args.momentum) print('[i] Weight decay: ', args.weight_decay) print('[i] Continue: ', args.continue_training) print('[i] Number of workers: ', args.num_workers) #--------------------------------------------- # Set GPU Visibility #--------------------------------------------- if args.gpu == True: gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) else: os.environ["CUDA_VISIBLE_DEVICES"] = "-1" #--------------------------------------------------------------------------- # Find an existing checkpoint #--------------------------------------------------------------------------- start_epoch = 0 if args.continue_training: state = tf.train.get_checkpoint_state(args.name) if state is None: print('[!] No network state found in ' + args.name) return 1 ckpt_paths = state.all_model_checkpoint_paths if not ckpt_paths: print('[!] No network state found in ' + args.name) return 1 last_epoch = None checkpoint_file = None for ckpt in ckpt_paths: ckpt_num = os.path.basename(ckpt).split('.')[0][1:] try: ckpt_num = int(ckpt_num) except ValueError: continue if last_epoch is None or last_epoch < ckpt_num: last_epoch = ckpt_num checkpoint_file = ckpt if checkpoint_file is None: print('[!] No checkpoints found, cannot continue!') return 1 metagraph_file = checkpoint_file + '.meta' if not os.path.exists(metagraph_file): print('[!] 
Cannot find metagraph', metagraph_file) return 1 start_epoch = last_epoch #--------------------------------------------------------------------------- # Create a project directory #--------------------------------------------------------------------------- else: try: print('[i] Creating directory {}...'.format(args.name)) os.makedirs(args.name) except (IOError) as e: print('[!]', str(e)) return 1 print('[i] Starting at epoch: ', start_epoch + 1) #--------------------------------------------------------------------------- # Configure the training data #--------------------------------------------------------------------------- print('[i] Configuring the training data...') try: td = TrainingData(args.data_dir) print('[i] # training samples: ', td.num_train) print('[i] # validation samples: ', td.num_valid) print('[i] # classes: ', td.num_classes) print('[i] Image size: ', td.preset.image_size) except (AttributeError, RuntimeError) as e: print('[!] Unable to load training data:', str(e)) return 1 #--------------------------------------------------------------------------- # Create the network #--------------------------------------------------------------------------- with tf.Session() as sess: print('[i] Creating the model...') n_train_batches = int(math.ceil(td.num_train / args.batch_size)) n_valid_batches = int(math.ceil(td.num_valid / args.batch_size)) global_step = None if start_epoch == 0: lr_values = args.lr_values.split(';') try: lr_values = [float(x) for x in lr_values] except ValueError: print('[!] Learning rate values must be floats') sys.exit(1) lr_boundaries = args.lr_boundaries.split(';') try: lr_boundaries = [int(x) for x in lr_boundaries] except ValueError: print('[!] Learning rate boundaries must be ints') sys.exit(1) ret = compute_lr(lr_values, lr_boundaries) learning_rate, global_step = ret net = SSDVGG(sess, td.preset) if start_epoch != 0: net.build_from_metagraph(metagraph_file, checkpoint_file) net.build_optimizer_from_metagraph() else: net.build_from_vgg(args.vgg_dir, td.num_classes) net.build_optimizer(learning_rate=learning_rate, global_step=global_step, weight_decay=args.weight_decay, momentum=args.momentum) initialize_uninitialized_variables(sess) #----------------------------------------------------------------------- # Create various helpers #----------------------------------------------------------------------- summary_writer = tf.summary.FileWriter(args.tensorboard_dir, sess.graph) saver = tf.train.Saver(max_to_keep=20) anchors = get_anchors_for_preset(td.preset) training_ap_calc = APCalculator() validation_ap_calc = APCalculator() #----------------------------------------------------------------------- # Summaries #----------------------------------------------------------------------- restore = start_epoch != 0 training_ap = PrecisionSummary(sess, summary_writer, 'training', td.lname2id.keys(), restore) validation_ap = PrecisionSummary(sess, summary_writer, 'validation', td.lname2id.keys(), restore) training_imgs = ImageSummary(sess, summary_writer, 'training', td.label_colors, restore) validation_imgs = ImageSummary(sess, summary_writer, 'validation', td.label_colors, restore) training_loss = LossSummary(sess, summary_writer, 'training', td.num_train, restore) validation_loss = LossSummary(sess, summary_writer, 'validation', td.num_valid, restore) #----------------------------------------------------------------------- # Get the initial snapshot of the network #----------------------------------------------------------------------- net_summary_ops = 
net.build_summaries(restore) if start_epoch == 0: net_summary = sess.run(net_summary_ops) summary_writer.add_summary(net_summary, 0) summary_writer.flush() #----------------------------------------------------------------------- # Cycle through the epoch #----------------------------------------------------------------------- print('[i] Training...') for e in range(start_epoch, args.epochs): training_imgs_samples = [] validation_imgs_samples = [] #------------------------------------------------------------------- # Train #------------------------------------------------------------------- generator = td.train_generator(args.batch_size, args.num_workers) description = '[i] Train {:>2}/{}'.format(e + 1, args.epochs) for x, y, gt_boxes in tqdm(generator, total=n_train_batches, desc=description, unit='batches'): if len(training_imgs_samples) < 3: saved_images = np.copy(x[:3]) feed = {net.image_input: x, net.labels: y} result, loss_batch, _ = sess.run( [net.result, net.losses, net.optimizer], feed_dict=feed) if math.isnan(loss_batch['confidence']): print('[!] Confidence loss is NaN.') training_loss.add(loss_batch, x.shape[0]) if e == 0: continue for i in range(result.shape[0]): boxes = decode_boxes(result[i], anchors, 0.5, td.lid2name) boxes = suppress_overlaps(boxes) training_ap_calc.add_detections(gt_boxes[i], boxes) if len(training_imgs_samples) < 3: training_imgs_samples.append((saved_images[i], boxes)) #------------------------------------------------------------------- # Validate #------------------------------------------------------------------- generator = td.valid_generator(args.batch_size, args.num_workers) description = '[i] Valid {:>2}/{}'.format(e + 1, args.epochs) for x, y, gt_boxes in tqdm(generator, total=n_valid_batches, desc=description, unit='batches'): feed = {net.image_input: x, net.labels: y} result, loss_batch = sess.run([net.result, net.losses], feed_dict=feed) validation_loss.add(loss_batch, x.shape[0]) if e == 0: continue for i in range(result.shape[0]): boxes = decode_boxes(result[i], anchors, 0.5, td.lid2name) boxes = suppress_overlaps(boxes) validation_ap_calc.add_detections(gt_boxes[i], boxes) if len(validation_imgs_samples) < 3: validation_imgs_samples.append((np.copy(x[i]), boxes)) #------------------------------------------------------------------- # Write summaries #------------------------------------------------------------------- training_loss.push(e + 1) validation_loss.push(e + 1) net_summary = sess.run(net_summary_ops) summary_writer.add_summary(net_summary, e + 1) APs = training_ap_calc.compute_aps() mAP = APs2mAP(APs) training_ap.push(e + 1, mAP, APs) APs = validation_ap_calc.compute_aps() mAP = APs2mAP(APs) validation_ap.push(e + 1, mAP, APs) training_ap_calc.clear() validation_ap_calc.clear() training_imgs.push(e + 1, training_imgs_samples) validation_imgs.push(e + 1, validation_imgs_samples) summary_writer.flush() #------------------------------------------------------------------- # Save a checktpoint #------------------------------------------------------------------- if (e + 1) % args.checkpoint_interval == 0: checkpoint = '{}/e{}.ckpt'.format(args.name, e + 1) saver.save(sess, checkpoint) print('[i] Checkpoint saved:', checkpoint) checkpoint = '{}/final.ckpt'.format(args.name) saver.save(sess, checkpoint) print('[i] Checkpoint saved:', checkpoint) return 0
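# `str2bool` (used when parsing --continue-training) and `compute_lr` are referenced
# above but defined elsewhere in the project. Minimal sketches of what they might look
# like (assumptions, not the project's exact code):
def str2bool(v):
    # argparse-friendly boolean parser
    return str(v).lower() in ('yes', 'true', 't', '1')


def compute_lr(lr_values, lr_boundaries):
    # Piecewise-constant learning rate schedule driven by a fresh global step counter.
    global_step = tf.Variable(0, trainable=False, name='global_step')
    learning_rate = tf.train.piecewise_constant(global_step, lr_boundaries, lr_values)
    return learning_rate, global_step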
def train(): with tf.device("/cpu:0"): if FLAGS.load_model is not None: if FLAGS.savefile is not None: checkpoints_dir = FLAGS.savefile + "/checkpoints/" + FLAGS.load_model.lstrip( "checkpoints/") else: checkpoints_dir = "checkpoints/" + FLAGS.load_model.lstrip( "checkpoints/") else: current_time = datetime.now().strftime("%Y%m%d-%H%M") if FLAGS.savefile is not None: checkpoints_dir = FLAGS.savefile + "/checkpoints/{}".format( current_time) else: checkpoints_dir = "checkpoints/{}".format(current_time) try: os.makedirs(checkpoints_dir + "/samples") except os.error: pass for attr, value in FLAGS.flag_values_dict().items(): logging.info("%s\t:\t%s" % (attr, str(value))) graph = tf.Graph() with graph.as_default(): gan = VAE_GAN(FLAGS.image_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.ngf) input_shape = [ int(FLAGS.batch_size / 4), FLAGS.image_size[0], FLAGS.image_size[1], FLAGS.image_size[2] ] G_optimizer, D_optimizer = gan.optimize() G_grad_list = [] D_grad_list = [] with tf.variable_scope(tf.get_variable_scope()): with tf.device("/gpu:0"): with tf.name_scope("GPU_0"): m_0 = tf.placeholder(tf.float32, shape=input_shape) s_0 = tf.placeholder(tf.float32, shape=input_shape) loss_list_0, image_list_0, j_list_0 = gan.model( s_0, m_0) variables_list_0 = gan.get_variables() G_grad_0 = G_optimizer.compute_gradients( loss_list_0[0], var_list=variables_list_0[0]) D_grad_0 = D_optimizer.compute_gradients( loss_list_0[1], var_list=variables_list_0[1]) G_grad_list.append(G_grad_0) D_grad_list.append(D_grad_0) with tf.device("/gpu:1"): with tf.name_scope("GPU_1"): m_1 = tf.placeholder(tf.float32, shape=input_shape) s_1 = tf.placeholder(tf.float32, shape=input_shape) loss_list_1, image_list_1, j_list_1 = gan.model( s_1, m_1) variables_list_1 = gan.get_variables() G_grad_1 = G_optimizer.compute_gradients( loss_list_1[0], var_list=variables_list_1[0]) D_grad_1 = D_optimizer.compute_gradients( loss_list_1[1], var_list=variables_list_1[1]) G_grad_list.append(G_grad_1) D_grad_list.append(D_grad_1) with tf.device("/gpu:2"): with tf.name_scope("GPU_2"): m_2 = tf.placeholder(tf.float32, shape=input_shape) s_2 = tf.placeholder(tf.float32, shape=input_shape) loss_list_2, image_list_2, j_list_2 = gan.model( s_2, m_2) variables_list_2 = gan.get_variables() G_grad_2 = G_optimizer.compute_gradients( loss_list_2[0], var_list=variables_list_2[0]) D_grad_2 = D_optimizer.compute_gradients( loss_list_2[1], var_list=variables_list_2[1]) G_grad_list.append(G_grad_2) D_grad_list.append(D_grad_2) with tf.device("/gpu:3"): with tf.name_scope("GPU_3"): m_3 = tf.placeholder(tf.float32, shape=input_shape) s_3 = tf.placeholder(tf.float32, shape=input_shape) loss_list_3, image_list_3, j_list_3 = gan.model( s_3, m_3) tensor_name_dirct = gan.tenaor_name variables_list_3 = gan.get_variables() G_grad_3 = G_optimizer.compute_gradients( loss_list_3[0], var_list=variables_list_3[0]) D_grad_3 = D_optimizer.compute_gradients( loss_list_3[1], var_list=variables_list_3[1]) G_grad_list.append(G_grad_3) D_grad_list.append(D_grad_3) D_ave_grad = average_gradients(D_grad_list) G_ave_grad = average_gradients(G_grad_list) G_optimizer_op = G_optimizer.apply_gradients(G_ave_grad) D_optimizer_op = D_optimizer.apply_gradients(D_ave_grad) optimizers = [G_optimizer_op, D_optimizer_op] saver = tf.train.Saver() with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True, gpu_options=tf.GPUOptions( allow_growth=True))) as sess: if FLAGS.load_model is not None: logging.info("restore model:" + FLAGS.load_model) if FLAGS.checkpoint is not 
None: model_checkpoint_path = checkpoints_dir + "/model.ckpt-" + FLAGS.checkpoint latest_checkpoint = model_checkpoint_path else: checkpoint = tf.train.get_checkpoint_state(checkpoints_dir) model_checkpoint_path = checkpoint.model_checkpoint_path latest_checkpoint = tf.train.latest_checkpoint( checkpoints_dir) logging.info("model checkpoint path:" + model_checkpoint_path) meta_graph_path = model_checkpoint_path + ".meta" restore = tf.train.import_meta_graph(meta_graph_path) restore.restore(sess, latest_checkpoint) if FLAGS.step_clear == True: step = 0 else: step = int(meta_graph_path.split("-")[2].split(".")[0]) else: sess.run(tf.global_variables_initializer()) step = 0 sess.graph.finalize() logging.info("start step:" + str(step)) try: logging.info("tensor_name_dirct:\n" + str(tensor_name_dirct)) s_train_siles = read_silename(FLAGS.S) index = 0 epoch = 0 while epoch <= FLAGS.epoch: train_true_m = [] train_true_s = [] for b in range(FLAGS.batch_size): train_m_arr = read_sile(FLAGS.M, s_train_siles, index) train_s_arr = read_sile(FLAGS.S, s_train_siles, index) train_true_m.append(train_m_arr) train_true_s.append(train_s_arr) epoch = int(index / len(s_train_siles)) index = index + 1 logging.info("-----------train epoch " + str(epoch) + ", step " + str(step) + ": start-------------") sess.run(optimizers, feed_dict={ m_0: np.asarray(train_true_m) [0 * int(FLAGS.batch_size / 4):1 * int(FLAGS.batch_size / 4), :, :, :], m_1: np.asarray(train_true_m)[ 1 * int(FLAGS.batch_size / 4):2 * int(FLAGS.batch_size / 4), :, :, :], m_2: np.asarray(train_true_m)[ 2 * int(FLAGS.batch_size / 4):3 * int(FLAGS.batch_size / 4), :, :, :], m_3: np.asarray(train_true_m)[ 3 * int(FLAGS.batch_size / 4):4 * int(FLAGS.batch_size / 4), :, :, :], s_0: np.asarray(train_true_s)[ 0 * int(FLAGS.batch_size / 4):1 * int(FLAGS.batch_size / 4), :, :, :], s_1: np.asarray(train_true_s)[ 1 * int(FLAGS.batch_size / 4):2 * int(FLAGS.batch_size / 4), :, :, :], s_2: np.asarray(train_true_s)[ 2 * int(FLAGS.batch_size / 4):3 * int(FLAGS.batch_size / 4), :, :, :], s_3: np.asarray(train_true_s)[ 3 * int(FLAGS.batch_size / 4):4 * int(FLAGS.batch_size / 4), :, :, :], }) logging.info("-----------train epoch " + str(epoch) + ", step " + str(step) + ": end-------------") step += 1 except Exception as e: logging.info("ERROR:" + str(e)) save_path = saver.save(sess, checkpoints_dir + "/model.ckpt", global_step=step) logging.info("Model saved in file: %s" % save_path) finally: save_path = saver.save(sess, checkpoints_dir + "/model.ckpt", global_step=step) logging.info("Model saved in file: %s" % save_path)
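# `average_gradients` above merges the per-GPU gradient lists but is not shown in this
# file. A minimal sketch in the style of the classic multi-tower TensorFlow examples
# (an assumption about its behaviour, not necessarily the project's implementation):
def average_gradients(tower_grads):
    """Average a list of (gradient, variable) lists, one inner list per GPU."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars looks like ((grad0_gpu0, var0), (grad0_gpu1, var0), ...)
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads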
                                       name='batch_norm10')
output = tf.nn.relu(output, name='relu10')
output_final = tf.layers.dense(output, 1, name='dense11')

# Loss
ground_truth = tf.placeholder(tf.float32, [None, 1], name='labels')
loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=ground_truth,
                                               logits=output_final)
batch_loss = tf.reduce_mean(loss)

################################################################################
# Training loop
################################################################################
# Now that the network is built, start a session for training.
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
    allow_growth=True)))
saver = tf.train.Saver(max_to_keep=None)  # a Saver to save trained models
saver.restore(sess, TRAINED_MODEL)

total_loss = 0
correct = 0
num_batch = int(np.ceil(num_sample / N))
for n_batch in range(num_batch):
    idx_begin = n_batch * N
    idx_end = (n_batch + 1) * N  # OK if off-the-end
    LOADED_DATA = all_image_array[idx_begin:idx_end]
    LOADED_GT = all_label_array[idx_begin:idx_end]
    score, loss_value = sess.run((output_final, batch_loss), {
        data: LOADED_DATA,
def train(self, dataManager, train_samples, val_samples, batch_size, num_train_steps_per_epoch, num_epochs, num_val_steps, save_path, log_path, log_period=10, val_period=200, save_period=250, max_ckpts_to_keep=10, patience=0, is_load=False, model_load_dir=''): global_step = 0 init_op = tf.global_variables_initializer() gpu_options = tf.GPUOptions(allow_growth=True) sess_config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=sess_config) as sess: # if is_load: # saver = tf.train.Saver() # logger.info("Getting latest checkpoint in {}".format(model_load_dir)) # last_checkpoint = tf.train.latest_checkpoint(model_load_dir) # logger.info("Attempting to load checkpoint at {}".format(last_checkpoint)) # saver.restore(sess, last_checkpoint) # logger.info("Successfully loaded {}!".format(last_checkpoint)) # # else: sess.run(init_op) # Set up a Saver for periodically serializing the model. saver = tf.train.Saver(max_to_keep=max_ckpts_to_keep) # Set up the classes for logging to Tensorboard. train_writer = tf.summary.FileWriter(log_path + "/train", sess.graph) val_writer = tf.summary.FileWriter(log_path + "/val", sess.graph) epoch_validation_losses = [] # Iterate over a generator that returns batches. for epoch in tqdm(range(num_epochs), desc="Epochs Completed"): # Get a generator of train batches train_batch_gen = dataManager.get_next_batch # Iterate over the generated batches for it in tqdm(np.arange(num_train_steps_per_epoch)): global_step = sess.run(self.global_step) + 1 q1, q2, targets = train_batch_gen(train_samples, batch_index=it) inputs = [] inputs.append(q1) inputs.append(q2) train_batch = (inputs, targets) feed_dict = self._get_train_feed_dict(train_batch) # Do a gradient update, and log results to Tensorboard # if necessary. if global_step % log_period == 0: # Record summary with gradient update train_loss, _, train_summary = sess.run( [self.loss, self.training_op, self.summary_op], feed_dict=feed_dict) train_writer.add_summary(train_summary, global_step) else: # Do a gradient update without recording anything. train_loss, _ = sess.run([self.loss, self.training_op], feed_dict=feed_dict) #val_period if global_step % val_period == 0: # Evaluate on validation data val_acc, val_loss, val_summary = self._evaluate_on_validation( dataManager, val_samples=val_samples, batch_size=batch_size, num_val_steps=num_val_steps, session=sess) val_writer.add_summary(val_summary, global_step) # Write a model checkpoint if necessary. if global_step % save_period == 0: saver.save(sess, save_path, global_step=global_step) # End of the epoch, so save the model and check validation loss, # stopping if applicable. saver.save(sess, save_path, global_step=global_step) val_acc, val_loss, val_summary = self._evaluate_on_validation( dataManager, val_samples=val_samples, batch_size=batch_size, num_val_steps=num_val_steps, session=sess) val_writer.add_summary(val_summary, global_step) epoch_validation_losses.append(val_loss) # Get the lowest validation loss, with regards to the patience # threshold. patience_val_losses = epoch_validation_losses[:-(patience + 1)] if patience_val_losses: min_patience_val_loss = min(patience_val_losses) else: min_patience_val_loss = math.inf if min_patience_val_loss <= val_loss: # past loss was lower, so stop logger.info("Validation loss of {} in last {} " "epochs, which is lower than current " "epoch validation loss of {}; stopping " "early.".format(min_patience_val_loss, patience, val_loss)) break # Done training! logger.info("Finished {} epochs!".format(epoch + 1))
import sys
import random as rn

import setGPU
import tensorflow as tf
import keras

gpu_config = tf.GPUOptions(allow_growth=True,
                           per_process_gpu_memory_fraction=0.2)
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_config))
keras.backend.tensorflow_backend.set_session(session)

from keras.layers import Input
from keras.layers import Activation
from keras.models import Model
from keras.optimizers import Adadelta
from keras import backend as K
import numpy as np
import cv2
from keras.callbacks import TensorBoard, Callback

from custom_model import get_unet
import config

rn.seed(2018)
np.random.seed(2018)

# Pipeline v1: mask in range [0,1] as single pixel + sigmoid + bce loss
# Pipeline v2: mask in range [0,1] as blob + sigmoid + bce loss
RADIUS = 5

if RADIUS == 1:
def main(args): sleep(random.random()) output_dir = os.path.expanduser(args.output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) # Store some git revision info in a text file in the log directory src_path, _ = os.path.split(os.path.realpath(__file__)) facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv)) dataset = facenet.get_dataset(args.input_dir, max_items_per_class=99999) print('Creating networks and loading parameters') total = 0 for cls in dataset: total += len(cls.image_paths) print(total) with tf.Graph().as_default(): gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_memory_fraction) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) with sess.as_default(): pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None) minsize = 50 # minimum size of face threshold = [0.6, 0.7, 0.7] # three steps's threshold factor = 0.709 # scale factor # Add a random key to the filename to allow alignment using multiple processes random_key = np.random.randint(0, high=99999) bounding_boxes_filename = os.path.join( output_dir, 'bounding_boxes_%05d.txt' % random_key) with open(bounding_boxes_filename, "w") as text_file: nrof_images_total = 0 nrof_successfully_aligned = 0 if args.random_order: random.shuffle(dataset) for cls in dataset: output_class_dir = os.path.join(output_dir, cls.name) if not os.path.exists(output_class_dir): os.makedirs(output_class_dir) if args.random_order: random.shuffle(cls.image_paths) for image_path in cls.image_paths: nrof_images_total += 1 filename = os.path.splitext(os.path.split(image_path)[1])[0] output_filename = os.path.join(output_class_dir, filename + '.jpg') #print(image_path) if not os.path.exists(output_filename): try: #img = misc.imread(image_path) img = Image.open(image_path) img = np.asarray(img) except (IOError, ValueError, IndexError) as e: errorMessage = '{}: {}'.format(image_path, e) print(errorMessage) else: if img.ndim < 2: print('Not RGB, Unable to align "%s"' % image_path) #text_file.write('%s\n' % (output_filename)) continue if img.ndim == 2: img = facenet.to_rgb(img) img = img[:, :, 0:3] #minsize = 0.5*img.shape[0] bounding_boxes, _ = align.detect_face.detect_face( img, minsize, pnet, rnet, onet, threshold, factor) nrof_faces = bounding_boxes.shape[0] if nrof_faces > 0: det = bounding_boxes[:, 0:4] img_size = np.asarray(img.shape)[0:2] if nrof_faces > 1: bounding_box_size = (det[:, 2] - det[:, 0]) * ( det[:, 3] - det[:, 1]) img_center = img_size / 2 offsets = np.vstack([ (det[:, 0] + det[:, 2]) / 2 - img_center[1], (det[:, 1] + det[:, 3]) / 2 - img_center[0] ]) offset_dist_squared = np.sum( np.power(offsets, 2.0), 0) index = np.argmax( bounding_box_size - offset_dist_squared * 2.0) # some extra weight on the centering det = det[index, :] det = np.squeeze(det) bb = np.zeros(4, dtype=np.int32) # width margin = np.minimum(det[2] - det[0], det[3] - det[1]) * args.margin bb[0] = np.maximum(det[0] - margin, 0) bb[1] = np.maximum(det[1] - margin, 0) bb[2] = np.minimum(det[2] + margin, img_size[1]) bb[3] = np.minimum(det[3] + margin, img_size[0]) # change the bouinding to square but aspect ratio doesn't change diff = (bb[3] - bb[1]) - (bb[2] - bb[0]) side = 0 if diff > 0: # height greater than width side = bb[3] - bb[1] bb[0] = np.maximum(bb[0] - diff / 2, 0) bb[1] = bb[1] else: # height less than width side = bb[2] - bb[0] bb[0] = bb[0] bb[1] = np.maximum(bb[1] - diff / 2, 0) bb[3] = bb[1] + side bb[2] = bb[0] + side cropped = img[bb[1]:bb[3], bb[0]:bb[2], :] 
                            scaled = cropped
                            nrof_successfully_aligned += 1
                            misc.imsave(output_filename, scaled)
                        else:
                            print('Unable to align "%s"' % image_path)

    print('Total number of images: %d' % nrof_images_total)
    print('Number of successfully aligned images: %d' % nrof_successfully_aligned)
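# `misc.imsave` has been removed from newer SciPy releases. Since this script already
# imports PIL's `Image`, an equivalent save helper could look like this (a sketch,
# assuming `scaled` is a uint8 RGB array):
def save_image(path, array):
    Image.fromarray(np.asarray(array, dtype=np.uint8)).save(path)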
def main(lr, batch_size, alpha, beta, image_size, K, T, num_iter, gpu): os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu[0]) data_path = "../data/vimeo_interp_test/target/" f = open(data_path+"tri_testlist.txt","r") trainfiles = [l[:-1] for l in f.readlines()] margin = 0.3 updateD = True updateG = True iters = 0 prefix = ("KTH_MCNET" + "_image_size="+str(image_size) + "_K="+str(K) + "_T="+str(T) + "_batch_size="+str(batch_size) + "_alpha="+str(alpha) + "_beta="+str(beta) + "_lr="+str(lr)) print("\n"+prefix+"\n") checkpoint_dir = "../models/"+prefix+"/" samples_dir = "../samples/"+prefix+"/" summary_dir = "../logs/"+prefix+"/" if not exists(checkpoint_dir): makedirs(checkpoint_dir) if not exists(samples_dir): makedirs(samples_dir) if not exists(summary_dir): makedirs(summary_dir) with tf.device("/gpu:%d"%gpu[0]): model = MCNET(image_size=[image_size,image_size], c_dim=3, K=K, batch_size=batch_size, T=T, checkpoint_dir=checkpoint_dir) d_optim = tf.train.AdamOptimizer(lr, beta1=0.5).minimize( model.d_loss, var_list=model.d_vars ) g_optim = tf.train.AdamOptimizer(lr, beta1=0.5).minimize( alpha*model.L_img+beta*model.L_GAN, var_list=model.g_vars ) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, gpu_options=gpu_options)) as sess: tf.global_variables_initializer().run() if model.load(sess, checkpoint_dir): print(" [*] Load SUCCESS") else: print(" [!] Load failed...") g_sum = tf.summary.merge([model.L_p_sum, model.L_gdl_sum, model.loss_sum]) d_sum = tf.summary.merge([model.d_loss_real_sum, model.d_loss_sum, model.d_loss_fake_sum]) writer = tf.summary.FileWriter(summary_dir, sess.graph) counter = iters+1 start_time = time.time() # IPython.embed() with Parallel(n_jobs=batch_size) as parallel: while iters < num_iter: mini_batches = get_minibatches_idx(len(trainfiles), batch_size, shuffle=True) for _, batchidx in mini_batches: if len(batchidx) == batch_size: seq_batch = np.zeros((batch_size, image_size, image_size, K+T, 3), dtype="float32") diff_batch = np.zeros((batch_size, image_size, image_size, K-1, 3), dtype="float32") t0 = time.time() Ts = np.repeat(np.array([T]),batch_size,axis=0) Ks = np.repeat(np.array([K]),batch_size,axis=0) paths = np.repeat(data_path, batch_size,axis=0) tfiles = np.array(trainfiles)[batchidx] shapes = np.repeat(np.array([image_size]),batch_size,axis=0) output = parallel(delayed(load_vimeo_data)(f, p,img_sze, k, t) for f,p,img_sze,k,t in zip(tfiles, paths, shapes, Ks, Ts)) for i in range(batch_size): seq_batch[i] = output[i][0] diff_batch[i] = output[i][1] if updateD: _, summary_str = sess.run([d_optim, d_sum], feed_dict={model.diff_in: diff_batch, model.xt: seq_batch[:,:,:,K-1], model.target: seq_batch}) writer.add_summary(summary_str, counter) if updateG: _, summary_str = sess.run([g_optim, g_sum], feed_dict={model.diff_in: diff_batch, model.xt: seq_batch[:,:,:,K-1], model.target: seq_batch}) writer.add_summary(summary_str, counter) errD_fake = model.d_loss_fake.eval({model.diff_in: diff_batch, model.xt: seq_batch[:,:,:,K-1], model.target: seq_batch}) errD_real = model.d_loss_real.eval({model.diff_in: diff_batch, model.xt: seq_batch[:,:,:,K-1], model.target: seq_batch}) errG = model.L_GAN.eval({model.diff_in: diff_batch, model.xt: seq_batch[:,:,:,K-1], model.target: seq_batch}) errL_img = model.L_img.eval({model.diff_in: diff_batch, model.xt: seq_batch[:,:,:,K-1], model.target: seq_batch}) if errD_fake < margin or errD_real < margin: updateD = False if errD_fake > 
(1.-margin) or errD_real > (1.-margin): updateG = False if not updateD and not updateG: updateD = True updateG = True counter += 1 if counter % 50 == 0: print( "Iters: [%2d] time: %4.4f, d_loss: %.8f, L_GAN: %.8f, img_loss:%.8f" % (iters, time.time() - start_time, errD_fake+errD_real,errG,errL_img) ) # if np.mod(counter, 10) == 1: # samples = sess.run([model.G], # feed_dict={model.diff_in: diff_batch, # model.xt: seq_batch[:,:,:,K-1], # model.target: seq_batch})[0] # # IPython.embed() # samples = samples[0].swapaxes(0,2).swapaxes(1,2) # # IPython.embed() # sbatch = seq_batch[0,:,:,:].swapaxes(0,2).swapaxes(1,2) # sbatch2 = sbatch.copy() # # IPython.embed() # sbatch2[K:,:,:] = samples # # IPython.embed() # samples = np.concatenate((sbatch2,sbatch), axis=0) # # IPython.embed() # print("Saving sample ...") # save_images(samples, [2, K+T], # samples_dir+"train_%s.png" % (iters)) if np.mod(counter, 10000) == 2: model.save(sess, checkpoint_dir, counter) iters += 1
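# `get_minibatches_idx` is used above but defined elsewhere. A minimal sketch of the
# usual implementation (shuffled index chunks of size `minibatch_size`), reusing the
# `np` alias this script already imports:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    idx_list = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = [idx_list[i:i + minibatch_size]
                   for i in range(0, n, minibatch_size)]
    return zip(range(len(minibatches)), minibatches)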
def main(args): global curr_model_dir args = parse_arguments() if not os.path.exists(models_dir): os.makedirs(models_dir) curr_model_dir = f"{models_dir}/{args.model_name}/" if not os.path.exists(curr_model_dir): os.makedirs(curr_model_dir) if not os.path.exists(logs_dir): os.makedirs(logs_dir) #start logging settings os.environ['TZ'] = 'EST+05EDT,M4.1.0,M10.5.0' time.tzset() time_str = time.strftime('%Y_%m_%d_%H_%M_%S') log_file_name = f'{logs_dir}/{args.model_name}_{time_str}.log' hdlr = logging.FileHandler(log_file_name) hdlr.setFormatter(formatter) logger.addHandler(hdlr) ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) #end logging settings # Setting the session to allow growth, so it doesn't allocate all GPU memory. gpu_ops = tf.GPUOptions(allow_growth=True) config = tf.ConfigProto(gpu_options=gpu_ops) sess = tf.Session(config=config) # Setting this as the default tensorflow session. keras.backend.tensorflow_backend.set_session(sess) #assert(args.env in ["CartPole-v0", "MountainCar-v0", "LunarLander-v2"]) assert(args.alg in ["a2c", "dqn", "ddpg"]) logger.info(f"Command line args: {args}") logger.info(f"Log saving to {log_file_name}") logger.info(f"Alg: {args.alg}") if args.hindsight_replay: if args.env_name == "MountainCar-v0" or args.env_name == "MountainCarContinuous-v0": default_goal = np.array([[0.5]]) # flag at x = 0.5 elif args.env_name == "LunarLander-v2" or args.env_name == "LunarLanderContinuous-v2": default_goal = np.array([[0.0, 0.0]]) # landing pad at x,y = (0,0) elif args.env_name == "Pendulum-v0": default_goal = np.array([[1.0, 0.0]]) # cos(theta), sin(theta), theta dot # theta is normalized between pi and -pi else: raise ValueError("Hindsight not enabled for this env") else: default_goal = None if args.seed != None: time_seed = args.seed else: #set consistent seed based on time time_seed = int(''.join(time_str.split('_'))) % (2 ** 32) np.random.seed(time_seed) logger.info(f"Numpy random seed {time_seed}") env = gym.make(args.env_name) num_inputs = env.observation_space.shape[0] num_outputs = None try: num_outputs = env.action_space.n except AttributeError: pass print(f"Num env inputs (state space): {num_inputs}") print(f"Num env outputs (actions): {num_outputs}") if args.alg == "a2c": agent = A2C(env, args.model_name, args.actor_model_path, args.actor_lr, args.critic_model_path, args.critic_lr, N=args.N, logger=logger) elif args.alg == "dqn": agent, use_episodes, num_train_episodes, num_train_steps = create_dqn(logger, args, env, default_goal, curr_model_dir, time_seed) elif args.alg == "ddpg": agent = DDPG(env, args.critic_lr, args.actor_lr, args.gamma, tau=args.tau, batch_size=args.replay_batch, default_goal=default_goal) if args.record_video_only: agent.test(record_video=True) return if args.test_only: agent.test() else: if args.alg == "a2c": assert not args.priority_replay, "NYI" assert not args.combined_replay, "NYI" assert not args.hindsight_replay, "NYI" agent.train(args.num_episodes, gamma=args.gamma, report_interval=args.train_mod, test_interval=args.test_mod, render=args.render) elif args.alg == "dqn": agent.train(use_episodes, num_train_episodes, num_train_steps, rep_batch_size = False if args.replay_batch == 0 else args.replay_batch, print_episode_mod=args.train_mod, test_episode_mod=args.test_mod, replay_mem_size=args.memory_size, default_goal=default_goal) elif args.alg == "ddpg": agent.train(env, args.num_episodes, default_goal=default_goal, hindsight_replay=args.hindsight_replay, 
                        priority_replay=args.priority_replay,
                        combined_replay=args.combined_replay,
                        train_mod=args.train_mod,
                        test_mod=args.test_mod)

    logger.info(f"Log saved to {log_file_name}")
def get_faces(img, model_path='./models/20180402-114759/20180402-114759.pb'): args = {'gpu_memory_fraction': 0.25, 'margin': 44, 'image_size': 160} minsize = 20 # minimum size of face threshold = [0.6, 0.7, 0.7] # three steps's threshold factor = 0.709 # scale factor det_threshold = 0.95 global pnet, rnet, onet if pnet is None: print('Creating networks and loading parameters for mtnn') with tf.Graph().as_default(): gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args['gpu_memory_fraction']) sess = tf.Session(config=tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False)) with sess.as_default(): pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None) img_size = np.asarray(img.shape)[0:2] bounding_boxes, _ = align.detect_face.detect_face(img, minsize, pnet, rnet, onet, threshold, factor) # here thresholding only images with high confidence values det = [bb[0:4] for bb in bounding_boxes if bb[4] > det_threshold] det_arr = [np.squeeze(det[i]) for i in range(len(det))] faces = [] for i, det in enumerate(det_arr): det = np.squeeze(det) bb = np.zeros(4, dtype=np.int32) bb[0] = np.maximum(det[0] - args['margin'] / 2, 0) bb[1] = np.maximum(det[1] - args['margin'] / 2, 0) bb[2] = np.minimum(det[2] + args['margin'] / 2, img_size[1]) bb[3] = np.minimum(det[3] + args['margin'] / 2, img_size[0]) cropped = img[bb[1]:bb[3], bb[0]:bb[2], :] scaled = misc.imresize(cropped, (args['image_size'], args['image_size']), interp='bilinear') faces.append(scaled) # If you want to see a face: misc.imshow(faces[-4]) with tf.Graph().as_default(): with tf.Session() as sess: #np.random.seed(seed=666) ## Saket - add a randm seed here; not using random # Load the model print('Loading feature extraction model') facenet.load_model(model_path) # Get input and output tensors images_placeholder = tf.get_default_graph().get_tensor_by_name( "input:0") embeddings = tf.get_default_graph().get_tensor_by_name( "embeddings:0") phase_train_placeholder = tf.get_default_graph( ).get_tensor_by_name("phase_train:0") embedding_size = embeddings.get_shape()[1] # Run forward pass to calculate embeddings print('Calculating features for images') nrof_images = len(faces) emb_array = np.zeros((nrof_images, embedding_size)) feed_dict = { images_placeholder: faces, phase_train_placeholder: False } emb_array = sess.run(embeddings, feed_dict=feed_dict) return (det_arr, emb_array)
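# A minimal usage sketch for `get_faces`, assuming an RGB image already loaded as a
# NumPy array (the file name is an illustrative placeholder):
#
#     example_img = misc.imread('group_photo.jpg')
#     det_boxes, face_embeddings = get_faces(example_img)
#     print(len(det_boxes), 'faces detected,', face_embeddings.shape, 'embedding matrix')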
    return output


# Build the SAE graph
config = Modelconfig()
with tf.Graph().as_default():
    image = tf.placeholder(tf.float32, [None, config.input_size])
    sae = SAEnoplhV4(input_data=image,
                     n_input=config.input_size,
                     stack_size=config.stack_size,
                     hidden_size=config.hidden_size,
                     optimizer=tf.train.AdamOptimizer(learning_rate=0.00005))
    # Defining the weights and biases on the CPU gives roughly a 10% performance gain
    init = tf.global_variables_initializer()
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)  # required, otherwise an error is raised
    saver = tf.train.Saver(tf.global_variables())
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # If not retraining, reload the saved model
        if config.is_retrain == 0:
            saver.restore(sess, checkpoint_path)
            print("Model restored")
        # Otherwise train the model from scratch
        else:
            sess.run(init)
        # variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        # a = sess.run(variables)
        # summary_writer = tf.summary.FileWriter(log_dir, sess.graph)  # create the summary writer and write the graph
        coord = tf.train.Coordinator()
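        # A Coordinator is normally paired with queue runners. A sketch of the usual
        # continuation (an assumption; the rest of this script's training loop is not
        # shown here):
        #     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        #     ...  # training loop
        #     coord.request_stop()
        #     coord.join(threads)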
def main(): parser = argparse.ArgumentParser( description='Train a neural imaging pipeline') parser.add_argument('--cam', dest='camera', action='store', help='camera') parser.add_argument('--nip', dest='nips', action='append', choices=["INet", "UNet", "DNet", "OctUNet", "UNet3D"], help='add NIP for training (repeat if needed)') parser.add_argument('--out', dest='out_dir', action='store', default='./checkpoint/nip_model_snapshots', help='output directory for storing trained NIP models') parser.add_argument( '--data', dest='data_dir', action='store', default='../../datasets/raw/nip_training_data/', help='input directory with training data (.npy and .png pairs)') parser.add_argument('--patch', dest='patch_size', action='store', default=512, type=int, help='training patch size (RGB)') parser.add_argument('--epochs', dest='epochs', action='store', default=25000, type=int, help='maximum number of training epochs') parser.add_argument('--batch', dest='batch_size', action='store', default=20, type=int, help='training batch size') parser.add_argument( '--params', dest='nip_params', default=None, help='Extra parameters for NIP constructor (JSON string)') parser.add_argument( '--resume', dest='resume', action='store_true', default=False, help='Resume training from last checkpoint, if possible') parser.add_argument( '--split', dest='split', action='store', default='270:30:1', help= 'data split with #training:#validation:#validation_patches - e.g., 120:30:1' ) parser.add_argument('--ext', dest='extension', action='store', default='png', help='file extension of rgb images - e.g., png, JPG') args = parser.parse_args() if not args.camera: print('A camera needs to be specified!') parser.print_usage() sys.exit(1) if not args.nips: print('At least one NIP needs to be specified!') parser.print_usage() sys.exit(1) data_directory = os.path.join(args.data_dir, args.camera) out_directory_root = args.out_dir try: if args.nip_params is not None: args.nip_params = json.loads(args.nip_params.replace('\'', '"')) except json.decoder.JSONDecodeError: print('WARNING', 'JSON parsing error for: ', args.nip_params.replace('\'', '"')) sys.exit(2) print('## Parameters summary') print('Camera : {}'.format(args.camera)) print('NIPs : {}'.format(args.nips)) print('Params : {}'.format(args.nip_params)) print('Input : {}'.format(data_directory)) print('Output : {}'.format(out_directory_root)) print('Resume : {}'.format(args.resume)) # Load training and validation data training_spec = { 'seed': 1234, 'n_images': int(args.split.split(':')[0]), 'v_images': int(args.split.split(':')[1]), 'valid_patches': int(args.split.split(':')[2]), 'valid_patch_size': 512, } np.random.seed(training_spec['seed']) # Load and summarize the training data data = dataset.IPDataset( data_directory, n_images=training_spec['n_images'], v_images=training_spec['v_images'], load='xy', val_rgb_patch_size=training_spec['valid_patch_size'], val_n_patches=training_spec['valid_patches'], rgb_extension=args.extension) for key in ['Training', 'Validation']: print('{:>16s} [{:5.1f} GB] : X -> {}, Y -> {} '.format( '{} data'.format(key), coreutils.mem(data[key.lower()]['x']) + coreutils.mem(data[key.lower()]['y']), data[key.lower()]['x'].shape, data[key.lower()]['y'].shape), flush=True) # Lazy loading to prevent delays in basic CLI interaction from models import pipelines import tensorflow as tf # Train the Desired NIP Models for pipe in args.nips: if not issubclass(getattr(pipelines, pipe), pipelines.NIPModel): supported_nips = [ x for x in dir(pipelines) if x != 
'NIPModel' and type(getattr(pipelines, x)) is type and issubclass(getattr(pipelines, x), pipelines.NIPModel) ] raise ValueError( 'Invalid NIP model ({})! Available NIPs: ({})'.format( pipe, supported_nips)) args.nip_params = args.nip_params or {} tf.reset_default_graph() gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1) config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options) config.gpu_options.allow_growth = True sess = tf.Session(config=config) model = getattr(pipelines, pipe)(sess, tf.get_default_graph(), loss_metric='L1', **args.nip_params) model.sess.run(tf.global_variables_initializer()) train_nip_model(model, args.camera, args.epochs, validation_loss_threshold=1e-5, patch_size=args.patch_size, resume=args.resume, sampling_rate=1000, batch_size=args.batch_size, learning_rate=1e-4, data=data, out_directory_root=args.out_dir) sess.close() return
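# --- A minimal sketch of the per-model session setup used in the loop above: each NIP
# is trained in a fresh default graph with its own session so that successive models do
# not share state. `build_model` below is a hypothetical stand-in for
# getattr(pipelines, pipe)(...).
import tensorflow as tf

def fresh_session(memory_fraction=1.0):
    """Reset the default graph and return a new session with growth-enabled GPU options."""
    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=memory_fraction)
    config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
    config.gpu_options.allow_growth = True
    return tf.Session(config=config)

# Usage sketch:
# for name in ['UNet', 'INet']:
#     sess = fresh_session()
#     model = build_model(name, sess)   # hypothetical constructor
#     ...train and checkpoint...
#     sess.close()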
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False, memory_size=250, random_state=None, batch_size=32, learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3, epochs=200, embedding_size=20, save_vocab=False, load_vocab=False): self.data_dir = data_dir self.task_id = task_id self.model_dir = model_dir # self.isTrain=isTrain self.isInteractive = isInteractive self.OOV = OOV self.memory_size = memory_size self.random_state = random_state self.batch_size = batch_size self.learning_rate = learning_rate self.epsilon = epsilon self.max_grad_norm = max_grad_norm self.evaluation_interval = evaluation_interval self.hops = hops self.epochs = epochs self.embedding_size = embedding_size self.save_vocab = save_vocab self.load_vocab = load_vocab candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id) self.n_cand = len(candidates) print("Candidate Size", self.n_cand) self.indx2candid = dict( (self.candid2indx[key], key) for key in self.candid2indx) # task data self.trainData, self.testData, self.valData = load_dialog_task( self.data_dir, self.task_id, self.candid2indx, self.OOV) data = self.trainData + self.testData + self.valData self.build_vocab(data, candidates, self.save_vocab, self.load_vocab) # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx) self.candidates_vec = vectorize_candidates( candidates, self.word_idx, self.candidate_sentence_size) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, epsilon=self.epsilon) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1, allow_growth=True) config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options) self.sess = tf.Session(config=config) self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand, self.sentence_size, self.embedding_size, self.candidates_vec, session=self.sess, hops=self.hops, max_grad_norm=self.max_grad_norm, optimizer=optimizer, task_id=task_id) self.saver = tf.train.Saver(max_to_keep=50) # self.summary_writer = tf.train.SummaryWriter(self.model.root_dir, self.model.graph_output.graph) self.summary_writer = tf.summary.FileWriter( self.model.root_dir, self.model.graph_output.graph)
def main(args): # 模型,定义在inception_resnet_v1 V2里(), --model_def models.inception_resnet_v1 network = importlib.import_module(args.model_def) image_size = (args.image_size, args.image_size) subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S') log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir) if not os.path.isdir(log_dir): # Create the log directory if it doesn't exist os.makedirs(log_dir) model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir) if not os.path.isdir(model_dir): # Create the model directory if it doesn't exist os.makedirs(model_dir) stat_file_name = os.path.join(log_dir, 'stat.h5') # Write arguments to a text file facenet.write_arguments_to_file(args, os.path.join(log_dir, 'arguments.txt')) # Store some git revision info in a text file in the log directory src_path,_ = os.path.split(os.path.realpath(__file__)) facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv)) np.random.seed(seed=args.seed) random.seed(args.seed) dataset = facenet.get_dataset(args.data_dir) if args.filter_filename: dataset = filter_dataset(dataset, os.path.expanduser(args.filter_filename), args.filter_percentile, args.filter_min_nrof_images_per_class) if args.validation_set_split_ratio>0.0: train_set, val_set = facenet.split_dataset(dataset, args.validation_set_split_ratio, args.min_nrof_val_images_per_class, 'SPLIT_IMAGES') else: train_set, val_set = dataset, [] nrof_classes = len(train_set) print('Model directory: %s' % model_dir) print('Log directory: %s' % log_dir) pretrained_model = None if args.pretrained_model: pretrained_model = os.path.expanduser(args.pretrained_model) print('Pre-trained model: %s' % pretrained_model) if args.lfw_dir: print('LFW directory: %s' % args.lfw_dir) # Read the file containing the pairs used for testing pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs)) # Get the paths for the corresponding images lfw_paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs) with tf.Graph().as_default(): tf.set_random_seed(args.seed) global_step = tf.Variable(0, trainable=False) # Get a list of image paths and their labels # 训练数据 image_list, label_list = facenet.get_image_paths_and_labels(train_set) assert len(image_list)>0, 'The training set should not be empty' # 测试数据 val_image_list, val_label_list = facenet.get_image_paths_and_labels(val_set) # Create a queue that produces indices into the image_list and label_list # tf.convert_to_tensor用于将不同数据变成张量:比如可以让数组变成张量、也可以让列表变成张量。 labels = ops.convert_to_tensor(label_list, dtype=tf.int32) range_size = array_ops.shape(labels)[0] # 多线程读取数据,shuffle=True表示不是按顺序存储,可以随机获取,并一直循环。 # https://blog.csdn.net/lyg5623/article/details/69387917 index_queue = tf.train.range_input_producer(range_size, num_epochs=None, shuffle=True, seed=None, capacity=32) # epoch 大数据时迭代完一轮时次数,少量数据应该epoch = 全部数据个数/batch index_dequeue_op = index_queue.dequeue_many(args.batch_size*args.epoch_size, 'index_dequeue') learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate') batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size') phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train') image_paths_placeholder = tf.placeholder(tf.string, shape=(None,1), name='image_paths') labels_placeholder = tf.placeholder(tf.int32, shape=(None,1), name='labels') control_placeholder = tf.placeholder(tf.int32, shape=(None,1), name='control') nrof_preprocess_threads = 4 input_queue = data_flow_ops.FIFOQueue(capacity=2000000, dtypes=[tf.string, tf.int32, tf.int32], 
shapes=[(1,), (1,), (1,)], shared_name=None, name=None) enqueue_op = input_queue.enqueue_many([image_paths_placeholder, labels_placeholder, control_placeholder], name='enqueue_op') image_batch, label_batch = facenet.create_input_pipeline(input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder) image_batch = tf.identity(image_batch, 'image_batch') image_batch = tf.identity(image_batch, 'input') label_batch = tf.identity(label_batch, 'label_batch') print('Number of classes in training set: %d' % nrof_classes) print('Number of examples in training set: %d' % len(image_list)) print('Number of classes in validation set: %d' % len(val_set)) print('Number of examples in validation set: %d' % len(val_image_list)) print('Building training graph') # Build the inference graph prelogits, _ = network.inference(image_batch, args.keep_probability, phase_train=phase_train_placeholder, bottleneck_layer_size=args.embedding_size, weight_decay=args.weight_decay) # 因为模型输出的(bottleneck_layer_size)没有计算最后一层(映射到图片类型),这里计算最后一层 logits = slim.fully_connected(prelogits, len(train_set), activation_fn=None, weights_initializer=slim.initializers.xavier_initializer(), weights_regularizer=slim.l2_regularizer(args.weight_decay), scope='Logits', reuse=False) # 按行进行泛化,行的平方求和再求平方根,得到的值按行除每个行的元素,对深度层面泛化? interface里最后一层输出为128个节点,slim.fully_connected(net, bottleneck_layer_size, activation_fn=None, #https://blog.csdn.net/abiggg/article/details/79368982 embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings') # 计算loss函数,当然还有其它训练参数也会加到这里来,通过比训练过程中一个weight加到正则化参数里来tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, weight) # 模型中最后会把这个加到优化的loss中来。 #L= L_softmax + λL_cneter = Softmax(W_i + b_yj) + λ1/2||f(x_i) - c_yj ||_2^2 # Norm for the prelogits eps = 1e-4 prelogits_norm = tf.reduce_mean(tf.norm(tf.abs(prelogits)+eps, ord=args.prelogits_norm_p, axis=1)) # 模型中最后输出(bottleneck_layer_size每个类型的输出值的个数)的平均值加到正则化loss中,但prelogits_norm_loss_factor貌似为0 tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, prelogits_norm * args.prelogits_norm_loss_factor) # 计算中心损失及增加的正则化loss中 # Add center loss prelogits_center_loss, _ = facenet.center_loss(prelogits, label_batch, args.center_loss_alfa, nrof_classes) tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, prelogits_center_loss * args.center_loss_factor) learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step, args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=True) tf.summary.scalar('learning_rate', learning_rate) # Calculate the average cross entropy loss across the batch # 计算预测损失,和上面框架的Softmax(W_i + b_yj) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=label_batch, logits=logits, name='cross_entropy_per_example') cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') # 预测损失平均值加到losses变量中 tf.add_to_collection('losses', cross_entropy_mean) correct_prediction = tf.cast(tf.equal(tf.argmax(logits, 1), tf.cast(label_batch, tf.int64)), tf.float32) accuracy = tf.reduce_mean(correct_prediction) #计算总损失,cross_entropy_mean + 前面增加的一些正则化损失(包括模型中增加的),通过tf.GraphKeys.REGULARIZATION_LOSSES获取出来 # Calculate the total losses regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) total_loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss') # Build a Graph that trains the model with one batch of examples and updates the model parameters train_op = facenet.train(total_loss, global_step, args.optimizer, learning_rate, 
args.moving_average_decay, tf.global_variables(), args.log_histograms) # Create a saver saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.summary.merge_all() # Start running operations on the Graph. gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) summary_writer = tf.summary.FileWriter(log_dir, sess.graph) coord = tf.train.Coordinator() tf.train.start_queue_runners(coord=coord, sess=sess) with sess.as_default(): if pretrained_model: print('Restoring pretrained model: %s' % pretrained_model) saver.restore(sess, pretrained_model) # Training and validation loop print('Running training') nrof_steps = args.max_nrof_epochs*args.epoch_size nrof_val_samples = int(math.ceil(args.max_nrof_epochs / args.validate_every_n_epochs)) # Validate every validate_every_n_epochs as well as in the last epoch stat = { 'loss': np.zeros((nrof_steps,), np.float32), 'center_loss': np.zeros((nrof_steps,), np.float32), 'reg_loss': np.zeros((nrof_steps,), np.float32), 'xent_loss': np.zeros((nrof_steps,), np.float32), 'prelogits_norm': np.zeros((nrof_steps,), np.float32), 'accuracy': np.zeros((nrof_steps,), np.float32), 'val_loss': np.zeros((nrof_val_samples,), np.float32), 'val_xent_loss': np.zeros((nrof_val_samples,), np.float32), 'val_accuracy': np.zeros((nrof_val_samples,), np.float32), 'lfw_accuracy': np.zeros((args.max_nrof_epochs,), np.float32), 'lfw_valrate': np.zeros((args.max_nrof_epochs,), np.float32), 'learning_rate': np.zeros((args.max_nrof_epochs,), np.float32), 'time_train': np.zeros((args.max_nrof_epochs,), np.float32), 'time_validate': np.zeros((args.max_nrof_epochs,), np.float32), 'time_evaluate': np.zeros((args.max_nrof_epochs,), np.float32), 'prelogits_hist': np.zeros((args.max_nrof_epochs, 1000), np.float32), } for epoch in range(1,args.max_nrof_epochs+1): step = sess.run(global_step, feed_dict=None) # Train for one epoch t = time.time() # 训练模型 cont = train(args, sess, epoch, image_list, label_list, index_dequeue_op, enqueue_op, image_paths_placeholder, labels_placeholder, learning_rate_placeholder, phase_train_placeholder, batch_size_placeholder, control_placeholder, global_step, total_loss, train_op, summary_op, summary_writer, regularization_losses, args.learning_rate_schedule_file, stat, cross_entropy_mean, accuracy, learning_rate, prelogits, prelogits_center_loss, args.random_rotate, args.random_crop, args.random_flip, prelogits_norm, args.prelogits_hist_max, args.use_fixed_image_standardization) stat['time_train'][epoch-1] = time.time() - t if not cont: break # 在测试数据上计算正确率 t = time.time() if len(val_image_list)>0 and ((epoch-1) % args.validate_every_n_epochs == args.validate_every_n_epochs-1 or epoch==args.max_nrof_epochs): validate(args, sess, epoch, val_image_list, val_label_list, enqueue_op, image_paths_placeholder, labels_placeholder, control_placeholder, phase_train_placeholder, batch_size_placeholder, stat, total_loss, regularization_losses, cross_entropy_mean, accuracy, args.validate_every_n_epochs, args.use_fixed_image_standardization) stat['time_validate'][epoch-1] = time.time() - t # Save variables and the metagraph if it doesn't exist already save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, epoch) # Evaluate on LFW t = time.time() if 
args.lfw_dir: evaluate(sess, enqueue_op, image_paths_placeholder, labels_placeholder, phase_train_placeholder, batch_size_placeholder, control_placeholder, embeddings, label_batch, lfw_paths, actual_issame, args.lfw_batch_size, args.lfw_nrof_folds, log_dir, step, summary_writer, stat, epoch, args.lfw_distance_metric, args.lfw_subtract_mean, args.lfw_use_flipped_images, args.use_fixed_image_standardization) stat['time_evaluate'][epoch-1] = time.time() - t print('Saving statistics') with h5py.File(stat_file_name, 'w') as f: for key, value in stat.items(): f.create_dataset(key, data=value) return model_dir
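# --- Sketch of the loss assembly used in the graph above: auxiliary penalties (center
# loss, prelogits norm) are pushed into tf.GraphKeys.REGULARIZATION_LOSSES and summed
# with the mean cross entropy. Shapes, names, and weights here are illustrative only.
import tensorflow as tf

logits = tf.placeholder(tf.float32, [None, 10], name='demo_logits')
labels = tf.placeholder(tf.int32, [None], name='demo_labels')

# any extra penalty can be registered as a "regularization" loss
aux_penalty = tf.reduce_mean(tf.norm(tf.abs(logits) + 1e-4, ord=1.0, axis=1))
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, 5e-4 * aux_penalty)

xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
xent_mean = tf.reduce_mean(xent, name='cross_entropy')
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
total_loss = tf.add_n([xent_mean] + reg_losses, name='total_loss')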
def train(train_dir, val_dir, labels_file, word2vec_path, batch_size, max_steps, log_step, val_step, snapshot, out_dir): ''' Training... :param train_dir: training data directory :param val_dir: validation data directory :param labels_file: path to the labels file :param word2vec_path: word2vec model file :param batch_size: batch size :param max_steps: maximum number of iterations :param log_step: logging interval :param val_step: validation interval :param snapshot: model checkpoint interval :param out_dir: output directory for the model ckpt and summaries :return: ''' max_sentence_length = 300 embedding_dim = 128 filter_sizes = [3, 4, 5, 6] num_filters = 200 # Number of filters per filter size base_lr = 0.001 # learning rate dropout_keep_prob = 0.5 l2_reg_lambda = 0.0 # L2 regularization lambda (default: 0.0) allow_soft_placement = True # if the requested device does not exist, let TF pick one automatically log_device_placement = True # whether to log device placement print("Loading data...") w2vModel = create_word2vec.load_wordVectors(word2vec_path) labels_set = fileprocessing.read_txt(labels_file) labels_nums = len(labels_set) # generators implemented with yield train_file_list = create_batch_data.get_file_list(file_dir=train_dir, postfix='*.npy') train_batch = create_batch_data.get_data_batch(train_file_list, labels_nums=labels_nums, batch_size=batch_size, shuffle=False, one_hot=True) val_file_list = create_batch_data.get_file_list(file_dir=val_dir, postfix='*.npy') val_batch = create_batch_data.get_data_batch(val_file_list, labels_nums=labels_nums, batch_size=batch_size, shuffle=False, one_hot=True) print("train data info *****************************") train_nums = create_word2vec.info_npy(train_file_list) print("val data info *****************************") val_nums = create_word2vec.info_npy(val_file_list) print("labels_set info *****************************") fileprocessing.info_labels_set(labels_set) # Training with tf.Graph().as_default(): #tf.device("/gpu:0") gpu_options = tf.GPUOptions(allow_growth=True, per_process_gpu_memory_fraction=0.7) session_conf = tf.ConfigProto( allow_soft_placement=allow_soft_placement, log_device_placement=log_device_placement, gpu_options=gpu_options) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=max_sentence_length, num_classes=labels_nums, embedding_size=embedding_dim, filter_sizes=filter_sizes, num_filters=num_filters, l2_reg_lambda=l2_reg_lambda) # Define Training procedure # define variables global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(learning_rate=base_lr) # optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries
dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) # Initialize all variables sess.run(tf.global_variables_initializer()) def train_step(x_batch, y_batch): """ A single training step """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) if step % log_step == 0: print("training: step {}, loss {:g}, acc {:g}".format( step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def dev_step(x_batch, y_batch, writer=None): """ Evaluates model on a dev set """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summaries, loss, accuracy = sess.run( [global_step, dev_summary_op, cnn.loss, cnn.accuracy], feed_dict) if writer: writer.add_summary(summaries, step) return loss, accuracy for i in range(max_steps): train_batch_data, train_batch_label = create_batch_data.get_next_batch( train_batch) train_batch_data = create_word2vec.indexMat2vector_lookup( w2vModel, train_batch_data) train_step(train_batch_data, train_batch_label) current_step = tf.train.global_step(sess, global_step) if current_step % val_step == 0: val_losses = [] val_accs = [] # for k in range(int(val_nums/batch_size)): for k in range(100): val_batch_data, val_batch_label = create_batch_data.get_next_batch( val_batch) val_batch_data = create_word2vec.indexMat2vector_lookup( w2vModel, val_batch_data) val_loss, val_acc = dev_step(val_batch_data, val_batch_label, writer=dev_summary_writer) val_losses.append(val_loss) val_accs.append(val_acc) mean_loss = np.array(val_losses, dtype=np.float32).mean() mean_acc = np.array(val_accs, dtype=np.float32).mean() print("--------Evaluation:step {}, loss {:g}, acc {:g}". format(current_step, mean_loss, mean_acc)) if current_step % snapshot == 0: path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("Saved model checkpoint to {}\n".format(path))
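# --- Compact sketch of the gradient-summary bookkeeping used above: gradients are
# computed explicitly so that a histogram and a sparsity scalar can be logged for each
# variable before they are applied. The tiny linear model is a placeholder for TextCNN.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
w = tf.get_variable('demo_w', [4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

grad_summaries = []
for g, v in grads_and_vars:
    if g is not None:
        grad_summaries.append(tf.summary.histogram('{}/grad/hist'.format(v.name), g))
        grad_summaries.append(tf.summary.scalar('{}/grad/sparsity'.format(v.name),
                                                tf.nn.zero_fraction(g)))
grad_summary_op = tf.summary.merge(grad_summaries)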
def main(): # 导入模型 network = importlib.import_module(Config.model_def) # 相当于导入 .py 文件 # 用时间命名 subdir = datetime.strftime(datetime.now(),'%Y%m%d-%H%M%S') model_dir = os.path.join(os.path.expanduser(Config.models_base_dir),subdir) if not os.path.isdir(model_dir): os.makedirs(model_dir) # 读取数据 train_set = data_process.get_data_set(Config.data_dir) # 类别总数 nrof_classes = len(train_set) pretrained_model = None if Config.pretrained_model: pretrained_model = os.path.expanduser(Config.pretrained_model) print('Pre-trained model: %s'%pretrained_model) with tf.Graph().as_default(): global_step = tf.Variable(0,trainable=False) image_list, label_list = data_process.get_image_paths_and_labels(train_set) assert len(image_list)>0,'The dataset should not empty' labels = ops.convert_to_tensor(label_list,dtype=tf.int32) range_size = array_ops.shape(labels)[0] index_queue = tf.train.range_input_producer(range_size,num_epochs=None,shuffle=True,seed = None,capacity=32) index_dequeue_op = index_queue.dequeue_many(Config.batch_size*Config.epoch_size,'index_dequeue') learning_rate_placeholder = tf.placeholder(tf.float32,name='learning_rate') batch_size_placeholder = tf.placeholder(tf.int32,name='batch_size') train_flag = tf.placeholder(tf.bool,name='phase_train') image_paths_placeholder = tf.placeholder(tf.string,shape=(None,1),name='image_paths') labels_placeholder = tf.placeholder(tf.int64,shape=(None,1),name='labels') input_queue = data_flow_ops.FIFOQueue(capacity=500000, dtypes=[tf.string,tf.int64], shapes=[(1,),(1,)], shared_name=None,name=None) enqueue_op = input_queue.enqueue_many([image_paths_placeholder,labels_placeholder],name='enqueue_op') nrof_preprocess_threads = 4 images_and_labels = [] for _ in range(nrof_preprocess_threads): filenames, label = input_queue.dequeue() images = [] for filename in tf.unstack(filenames): file_contents = tf.read_file(filename) image = tf.image.decode_image(file_contents, channels=3) if Config.random_rotate: image = tf.py_func(data_process.random_rotate_image, [image], tf.uint8) if Config.random_crop: image = tf.random_crop(image, [Config.image_size, Config.image_size, 3]) else: image = tf.image.resize_image_with_crop_or_pad(image, Config.image_size, Config.image_size) if Config.random_flip: image = tf.image.random_flip_left_right(image) # pylint: disable=no-member image.set_shape((Config.image_size, Config.image_size, 3)) images.append(tf.image.per_image_standardization(image)) images_and_labels.append([images, label]) image_batch,label_batch = tf.train.batch_join( images_and_labels,batch_size=batch_size_placeholder, shapes=[(Config.image_size,Config.image_size,3),()],enqueue_many=True, capacity=4*nrof_preprocess_threads*Config.batch_size, allow_smaller_final_batch=True) image_batch = tf.identity(image_batch,'image_batch') image_batch = tf.identity(image_batch,'input') label_batch = tf.identity(label_batch,'label_batch') print('Total number of classes: %d'%nrof_classes) print('Total number of examples: %d'%len(image_list)) print('Building training graph') prelogits = network.inference(image_batch,Config.keep_prob, phase_train = train_flag,bottleneck_layer_size = Config.embedding_size, weight_decay = Config.weight_decay) logits = slim.fully_connected(prelogits, len(train_set), activation_fn=None, weights_initializer=tf.truncated_normal_initializer(stddev=0.1), weights_regularizer=slim.l2_regularizer(Config.weight_decay), scope='Logits', reuse=False) embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings') # 添加中心损失 if Config.center_loss_weight >0.0: 
prelogits_center_loss,_ = utils.center_loss(prelogits,label_batch,Config.center_loss_alfa,nrof_classes) tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,prelogits_center_loss*Config.center_loss_weight) learning_rate = tf.train.exponential_decay(learning_rate_placeholder,global_step, Config.learning_rate_decay_epochs*Config.epoch_size, Config.learning_rate_decay_factor,staircase=True) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_batch,logits=logits, name='cross_entropy_per_example') cross_entropy_mean = tf.reduce_mean(cross_entropy,name='cross_entropy') tf.add_to_collection('losses',cross_entropy_mean) # 把中心损失加到交叉softmax损失上 regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) total_loss = tf.add_n([cross_entropy_mean]+regularization_losses,name='total_loss') # 一个batch 训练操作并更新模型参数 train_op = train_batch(total_loss,global_step,Config.optimizer,learning_rate, Config.moving_average_decay,tf.global_variables()) # 创建一个保存器 saver = tf.train.Saver(tf.trainable_variables(),max_to_keep=3) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = Config.gpu_memory_fraction) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,log_device_placement=False)) sess.run(tf.global_variables_initializer()) # 获得线程坐标,启动填充队列的线程 coord = tf.train.Coordinator() tf.train.start_queue_runners(coord=coord,sess=sess) with sess.as_default(): sess.run(tf.local_variables_initializer()) if pretrained_model: print('Restoring pretrained model: %s'%pretrained_model) meta_file, ckpt_file = utils.get_model_filenames(Config.pretrained_model) saver = tf.train.import_meta_graph(os.path.join(Config.pretrained_model, meta_file)) saver.restore(sess, os.path.join(Config.pretrained_model, ckpt_file)) print('Running training') epoch = 0 while epoch < Config.max_nrof_epochs: step = sess.run(global_step,feed_dict=None) utils.save_variables_and_metagraph(sess, saver, model_dir, subdir, step) print('++++++++++save done++++++++++') epoch = step // Config.epoch_size # 训练一个epoch train(sess,epoch,image_list,label_list,index_dequeue_op,enqueue_op,image_paths_placeholder,labels_placeholder, learning_rate_placeholder,train_flag,batch_size_placeholder,global_step, total_loss,train_op,regularization_losses) utils.save_variables_and_metagraph(sess,saver,model_dir,subdir,step) return model_dir
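# --- Sketch of the decayed learning rate used by both facenet-style scripts above: a
# base rate fed through a placeholder decays by a fixed factor every
# decay_epochs * epoch_size steps. The numbers below are examples, not the projects'
# actual settings.
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
base_lr = tf.placeholder(tf.float32, name='learning_rate')
learning_rate = tf.train.exponential_decay(base_lr, global_step,
                                           decay_steps=100 * 1000,  # decay_epochs * epoch_size
                                           decay_rate=0.96,
                                           staircase=True)
tf.summary.scalar('learning_rate', learning_rate)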
def __init__(self, config, rng): self.config = config self.rng = rng self.task = config.task self.model_dir = config.model_dir self.gpu_memory_fraction = config.gpu_memory_fraction self.log_step = config.log_step self.max_step = config.max_step self.K_d = config.K_d self.K_g = config.K_g self.initial_K_d = config.initial_K_d self.initial_K_g = config.initial_K_g self.checkpoint_secs = config.checkpoint_secs DataLoader = { 'gaze': gaze_data.DataLoader, 'hand': hand_data.DataLoader, }[config.data_set] self.data_loader = DataLoader(config, rng=self.rng) self.model = Model(config, self.data_loader) self.history_buffer = Buffer(config, self.rng) self.summary_ops = { 'test_synthetic_images': { 'summary': tf.summary.image("test_synthetic_images", self.model.resized_x, max_outputs=config.max_image_summary), 'output': self.model.resized_x, }, 'test_refined_images': { 'summary': tf.summary.image("test_refined_images", self.model.denormalized_R_x, max_outputs=config.max_image_summary), 'output': self.model.denormalized_R_x, } } self.saver = tf.train.Saver() self.summary_writer = tf.summary.FileWriter(self.model_dir) sv = tf.train.Supervisor(logdir=self.model_dir, is_chief=True, saver=self.saver, summary_op=None, summary_writer=self.summary_writer, save_summaries_secs=300, save_model_secs=self.checkpoint_secs, global_step=self.model.discrim_step) gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=self.gpu_memory_fraction, allow_growth=True) # seems to be not working sess_config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options) self.sess = sv.prepare_or_wait_for_session(config=sess_config)
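# --- Brief sketch of the Supervisor-managed session created above: the Supervisor owns
# the saver and summary writer and checkpoints on a timer, so the training loop never
# calls saver.save() itself. The logdir and save interval below are placeholders.
import tensorflow as tf

global_step = tf.Variable(0, trainable=False, name='global_step')
saver = tf.train.Saver()
sv = tf.train.Supervisor(logdir='/tmp/demo_logdir',
                         is_chief=True,
                         saver=saver,
                         summary_op=None,        # summaries are written manually
                         save_model_secs=600,
                         global_step=global_step)
sess = sv.prepare_or_wait_for_session(
    config=tf.ConfigProto(allow_soft_placement=True))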
def train(args): if args.word_level: vocab = Vocab('data/'+args.dataset+'/vocab_word',max_size=args.vocab_size) else: vocab = Vocab('data/'+args.dataset+'/vocab',max_size=args.vocab_size) data_generator = Batcher(args,vocab) batcher = data_generator.get_batcher() if args.use_glove: if args.word_level: wemb = np.load('data/'+args.dataset+'/Wemb_word.npy') wemb = wemb.astype('float32') else: wemb = None model = VisualDialogRetrieval( vocab_size=args.vocab_size, hidden_dim=args.hidden_dim, max_video_enc_steps=args.max_video_enc_steps, max_context_enc_steps=args.max_context_enc_steps, max_response_enc_steps=args.max_response_enc_steps, emb_dim=args.emb_dim, img_dim=args.img_dim, num_layers=args.num_layers, rand_unif_init_mag=args.rand_unif_init_mag, trunc_norm_init_std=args.trunc_norm_init_std, cell_type=args.cell_type, optimizer_type = args.optimizer_type, learning_rate = args.lr, max_grad_clip_norm = args.max_grad_clip_norm, enable_video_context = args.video_context, enable_chat_context = args.chat_context, loss_function = args.loss_function, wemb = wemb, enable_dropout=False, is_training=True) gpu_options = tf.GPUOptions(allow_growth=True) config = tf.ConfigProto(gpu_options=gpu_options) sess = tf.Session(config=config) # print the variables for var in tf.global_variables(): print var #create a summary handler summary_handler = SummaryHandler(os.path.join(args.summary_save_path,args.model_name), ['LOSS','recall@1in10','recall@2in10','recall@5in10']) saver = tf.train.Saver(max_to_keep=50) sess.run(tf.global_variables_initializer()) if args.load_model != 'None': saver.restore(sess,os.path.join(args.model_save_path,args.load_model)) iteration = args.start_iter while(True): batch = batcher.next() if batch is None: batch = batcher.next() start = time.time() loss,debugger,_ = sess.run([model.loss,model.debugger,model.train_op], feed_dict={ model.video_enc_batch:batch.get('video_batch'), model.video_enc_mask_batch:batch.get('video_mask_batch'), model.context_enc_batch:batch.get('chat_context_batch'), model.context_enc_mask_batch:batch.get('chat_context_mask_batch'), model.response_enc_batch:batch.get('response_batch'), model.response_enc_mask_batch:batch.get('response_mask_batch'), model.target_label_batch:batch.get('label_batch') }) summary = {} summary['LOSS'] = loss summary['ITERATION'] = iteration summary_handler.write_summaries(sess,summary) iteration +=1 print 'iteration:',iteration,'computational time:',time.time()-start,'loss:',loss if iteration > args.max_iter: break if iteration%args.check_point == 0: # get validation loss and perplexity scores = eval_valset(args,sess,model) summary = scores summary['ITERATION'] = iteration summary_handler.write_summaries(sess,summary) saver.save(sess, os.path.join(args.model_save_path, args.model_name+'-'+str(iteration)))
def train(N): root_path = str.format('../{0}x{0}/', N) play_cmd = '../bin/go_zero' model_path = root_path + 'models/' data_path = root_path + "data/" mini_batch_size = config.mini_batch_size dm = data_manager(data_path) #training_data = dm.load_data2(10) channels = 17 from datetime import datetime random.seed(datetime.now()) batch_size = config.self_play_file_batch_size gpu_options = tf.GPUOptions() gpu_options.allow_growth = True with tf.Session(graph=tf.Graph(), config=tf.ConfigProto(gpu_options=gpu_options)) as sess: #sess = tf_debug.TensorBoardDebugWrapperSession(sess, "127.0.0.1:7006") #best_network = Network("best_network", sess, N, channels) training_network = Network("training_network", sess, N, channels, True, "./log/") sess.run(training_network.init_all_vars_op) # restore from previous checkpoint last_model, generation = get_last_model(model_path) if last_model != "": training_network.restore_model('./checkpoints/') #tf.saved_model.loader.load(sess, ['SERVING'], last_model) else: #code below is to create an initial model print("no model was found. create an initial model") export_dir = model_path + 'metagraph-00000000' builder = tf.saved_model.builder.SavedModelBuilder(export_dir) builder.add_meta_graph_and_variables(sess, ['SERVING']) builder.save(as_text=False) last_model, generation = get_last_model(model_path) #return #filename = training_network.save_model("./model/training.ckpt") #training_to_best_op = copy_src_to_dst("training_network", "best_network") #sess.run([training_to_best_op]) #trainables = tf.trainable_variables("training_network") reps = 0 nupdates = 700 lr = lambda f: 1e-3 if f < 500 else 1e-4 cliprange = lambda f: 0.2 if f < 500 else 0.1 #lr = lambda f: 1e-4 #cliprange = lambda f: 0.1 first = True for update in range(generation + 1, nupdates + 1): # using current generation model to sample batch_size files. each file has 100 games file_list, training_data = dm.load_data2(batch_size, generation, N) if training_data is None or len(training_data) == 0: dm.sample(1, generation, N) file_list, training_data = dm.load_data2( batch_size, generation, N) while training_data is None or len(training_data) == 0: import time print("not enough training data. 
sleep...") time.sleep(config.sleep_seconds) file_list, training_data = dm.load_data2( batch_size, generation, N) #frac = 1.0 - (update - 1.0) / nupdates frac = update #frac = 1.0 * 0.996 ** (update - 1.0) print("learning rate:{} Clip Range {}".format( lr(frac), cliprange(frac))) inds = np.arange(len(training_data)) for _ in range(config.batch_epochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, len(training_data), mini_batch_size): end = start + mini_batch_size if end >= len(training_data) - 1: end = len(training_data) - 1 mbinds = inds[start:end] if len(mbinds) < mini_batch_size / 2: break mini_batch = [] for k in range(len(mbinds)): position = mbinds[k] items = [] state, player, action, pi, reward, oldvalue = training_data[ position] while state != '0' * N * N and len(items) < 7: items.append(training_data[position]) position = position - 1 state, player, action, pi, reward, oldvalue = training_data[ position] items.append(training_data[position]) mini_batch.append(items) states = [] actions = [] actions_pi = [] rewards = [] old_values = [] for s in mini_batch: c = None _, player, action, pi, reward, oldvalue = s[0] #prevent policy becomes zero #pi = [p if p != 0 else 1e-5 for p in pi] for x in s: state, _, _, _, _, _ = x a = np.array([ int(ltr == str(player)) for i, ltr in enumerate(state) ]).reshape((N, N)) b = np.array([ int(ltr == str(3 - player)) for i, ltr in enumerate(state) ]).reshape((N, N)) if c is None: c = np.array([a, b]) else: c = np.append( c, [np.array(a), np.array(b)], axis=0) for i in range((channels - 1) // 2 - len(s)): c = np.append(c, [np.zeros([N, N]), np.zeros([N, N])], axis=0) c = np.append(c, [np.full([N, N], player % 2, dtype=int)], axis=0) states.append(c) actions.append(action) actions_pi.append(pi) rewards.append(reward) old_values.append(oldvalue) feed = { training_network.states: np.array(states), training_network.actions: actions, training_network.actions_pi: actions_pi, training_network.rewards: np.vstack(rewards), training_network.old_values: np.vstack(old_values), training_network.learning_rate: lr(frac), training_network.clip_range: cliprange(frac), } global_step, summary, _, action_loss, value_loss, entropy = sess.run( [ training_network.global_step, training_network.summary_op, training_network.apply_gradients, training_network.actor_loss, training_network.value_loss, training_network.entropy_loss ], feed) print(global_step, action_loss, value_loss, entropy) if global_step % 10 == 0: training_network.summary_writer.add_summary( summary, global_step) print("saving checkpoint...") filename = training_network.save_model( "./checkpoints/training.ckpt") generation = generation + 1 builder = tf.saved_model.builder.SavedModelBuilder( model_path + 'metagraph-' + str(generation).zfill(8)) builder.add_meta_graph_and_variables(sess, ['SERVING']) builder.save(as_text=False) last_model, generation = get_last_model(model_path) print(last_model + " is saved") # if global_step % config.training_repetition == 0: # print("saving checkpoint...") # filename = training_network.save_model("./checkpoints/training.ckpt") # # if os.path.exists(model_path+'temp/'): # shutil.rmtree(model_path+'temp/') # # builder = tf.saved_model.builder.SavedModelBuilder(model_path+'temp/') # builder.add_meta_graph_and_variables(sess, ['SERVING']) # builder.save(as_text=False) # # need_evaluate = False # if( need_evaluate ): # import evaluate2 as evaluate # import math # print("evaluating checkpoint ...") # # 
evaluate.play_cmd = play_cmd # evaluate.model1 = last_model.split('/')[-1] # evaluate.model2 = 'temp' # # old_win, new_win = evaluate.evaluator(4, config.number_eval_games) # if new_win >= config.number_eval_games * (0.5 + math.sqrt(config.number_eval_games) / config.number_eval_games): # generation = generation + 1 # os.rename(model_path+'temp', model_path+'metagraph-'+str(generation).zfill(8)) # last_model, generation = get_last_model(model_path) # print("checkpoint is better. saved to " + 'metagraph-'+str(generation).zfill(8)) # else: # #shutil.rmtree(model_path+'temp/') # print("checkpoint is discarded") # else: # generation = generation + 1 # os.rename(model_path + 'temp', model_path + 'metagraph-' + str(generation).zfill(8)) # last_model, generation = get_last_model(model_path) # print("checkpoint is saved") #for i in range(config.self_play_file_increment): # base = os.path.splitext(file_list[i])[0] # os.rename(file_list[i], base+".done") for f in file_list: base = os.path.splitext(f)[0] os.rename(f, base + ".done")
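# --- Small sketch of the SavedModel export/reload cycle used above: a SavedModelBuilder
# writes the current graph and variables under the 'SERVING' tag, and the same tag loads
# it back in a fresh session. The export path is a placeholder and must not already exist.
import tensorflow as tf

export_dir = '/tmp/models/metagraph-00000001'

export_graph = tf.Graph()
with export_graph.as_default(), tf.Session(graph=export_graph) as sess:
    tf.Variable([1.0], name='dummy')             # stand-in for the real network
    sess.run(tf.global_variables_initializer())
    builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
    builder.add_meta_graph_and_variables(sess, ['SERVING'])
    builder.save(as_text=False)

load_graph = tf.Graph()
with load_graph.as_default(), tf.Session(graph=load_graph) as sess:
    tf.saved_model.loader.load(sess, ['SERVING'], export_dir)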
import math import sys import os import multiprocessing as mp import tensorflow as tf import numpy as np from average_precision import APCalculator, APs2mAP from training_data import TrainingData from ssdutils import get_anchors_for_preset, decode_boxes, suppress_overlaps from ssdvgg import SSDVGG from utils import * from tqdm import tqdm gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) if sys.version_info[0] < 3: print("This is a Python 3 program. Use Python 3 or higher.") sys.exit(1) #------------------------------------------------------------------------------- def compute_lr(lr_values, lr_boundaries): with tf.variable_scope('learning_rate'): global_step = tf.Variable(0, trainable=False, name='global_step') lr = tf.train.piecewise_constant(global_step, lr_boundaries, lr_values) return lr, global_step
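# --- Usage sketch for the compute_lr helper defined above (values are examples only):
# boundaries are global-step thresholds, and there must be exactly one more
# learning-rate value than boundary.
lr_values = [1e-3, 1e-4, 1e-5]      # one more value than boundaries
lr_boundaries = [60000, 80000]      # global-step thresholds
learning_rate, global_step = compute_lr(lr_values, lr_boundaries)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)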
def train(): print 'loading dataset...' dataset, img_feature, train_data = get_data() num_train = train_data['question'].shape[0] vocabulary_size = len(dataset['ix_to_word'].keys()) print 'vocabulary_size : ' + str(vocabulary_size) print 'constructing model...' model = Answer_Generator( rnn_size = rnn_size, rnn_layer = rnn_layer, batch_size = batch_size, input_embedding_size = input_embedding_size, dim_image = img_feature[0].shape, dim_hidden = dim_hidden, dim_attention = dim_attention, max_words_q = max_words_q, vocabulary_size = vocabulary_size, drop_out_rate = 0.5) tf_loss, tf_image, tf_question, tf_label = model.build_model() gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) with tf.device('/cpu:0'): saver = tf.train.Saver(max_to_keep=100) tvars = tf.trainable_variables() lr = tf.Variable(learning_rate) opt = tf.train.AdamOptimizer(learning_rate=lr) # gradient clipping gvs = opt.compute_gradients(tf_loss,tvars) with tf.device('/cpu:0'): clipped_gvs = [(tf.clip_by_value(grad, -10.0, 10.0), var) for grad, var in gvs if grad is not None] train_op = opt.apply_gradients(clipped_gvs) sess.run(tf.global_variables_initializer()) print 'start training...' for itr in range(max_itr): tStart = time.time() # shuffle the training data index = np.random.random_integers(0, num_train-1, batch_size) current_question = train_data['question'][index,:] current_length_q = train_data['length_q'][index] current_answers = train_data['answers'][index] current_img_list = train_data['img_list'][index] current_img = img_feature[current_img_list,:] # do the training process!!! _, loss = sess.run( [train_op, tf_loss], feed_dict={ tf_image: current_img, tf_question: current_question, tf_label: current_answers }) current_learning_rate = lr*decay_factor lr.assign(current_learning_rate).eval(session=sess) tStop = time.time() if np.mod(itr, 100) == 0: print "Iteration: ", itr, " Loss: ", loss, " Learning Rate: ", lr.eval(session=sess) print ("Time Cost:", round(tStop - tStart,2), "s") if np.mod(itr, 5000) == 0: print "Iteration ", itr, " is done. Saving the model ..." saver.save(sess, os.path.join(checkpoint_path, 'model'), global_step=itr)
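# --- Sketch of the element-wise gradient clipping used above: gradients are computed
# explicitly, clipped to [-10, 10], and only then applied. The toy loss stands in for
# the VQA model's loss; a schedule such as tf.train.exponential_decay is a common
# alternative to reassigning the learning-rate variable every iteration.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8])
w = tf.get_variable('demo_w_clip', [8, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

lr = tf.Variable(0.0003, trainable=False)
opt = tf.train.AdamOptimizer(learning_rate=lr)
gvs = opt.compute_gradients(loss, tf.trainable_variables())
clipped_gvs = [(tf.clip_by_value(g, -10.0, 10.0), v) for g, v in gvs if g is not None]
train_op = opt.apply_gradients(clipped_gvs)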
def __init__(self, act_dim, obs_dim, depth_dim, lr_actor, lr_value, gamma, tau, buffers, alpha=0.2, name=None, seed=1): # tf.reset_default_graph() self.act_dim = act_dim self.obs_dim = obs_dim self.lr_actor = lr_actor self.lr_value = lr_value self.gamma = gamma self.tau = tau self.name = name self.replay_buffer = [] self.buffers = buffers # self.obs_dim = obs_dim self.depth_dim = depth_dim # batch_img_shape = [ 512, 240*320] # self.depth_dim = depth_dim # self.obs_dim.extend(self.depth_dim) for i in range(buffers): b = ReplayBuffer(capacity=int(1e6), name=self.name+'buffer'+str(i)) self.replay_buffer.append(b) self.OBS0 = tf.placeholder(tf.float32, [None, self.obs_dim], name=self.name+"observations0") self.OBS1 = tf.placeholder(tf.float32, [None, self.obs_dim], name=self.name+"observations1") self.ACT = tf.placeholder(tf.float32, [None, self.act_dim], name=self.name+"action") self.RWD = tf.placeholder(tf.float32, [None,], name=self.name+"reward") self.DONE = tf.placeholder(tf.float32, [None,], name=self.name+"done") self.EPRWD = tf.placeholder(tf.int32, [], name=self.name+"ep_reward") # self.DEPTH = tf.placeholder(tf.float32, [None, 32*24], name =self.name+'depth_gif') self.policy_loss = tf.placeholder(tf.float32, [None, 1], name=self.name+"policy_loss") self.q_value1_loss = tf.placeholder(tf.float32, [None, 1], name=self.name+"q_value1_loss") self.q_value2_loss = tf.placeholder(tf.float32, [None, 1], name=self.name+"q_value2_loss") self.value_loss = tf.placeholder(tf.float32, [None, 1], name=self.name+"value_loss") self.total_value_loss = tf.placeholder(tf.float32, [None, 1], name=self.name+"total_value_loss") policy = ActorNetwork(self.act_dim, self.name+'Actor') q_value_net_1 = QValueNetwork(self.name+'Q_value1') q_value_net_2 = QValueNetwork(self.name+'Q_value2') value_net = ValueNetwork(self.name+'Value') target_value_net = ValueNetwork(self.name+'Target_Value') mu, self.pi, logp_pi = policy.evaluate(self.OBS0) q_value1 = q_value_net_1.get_q_value(self.OBS0, self.ACT, reuse=False) q_value1_pi = q_value_net_1.get_q_value(self.OBS0, self.pi, reuse=True) q_value2 = q_value_net_2.get_q_value(self.OBS0, self.ACT, reuse=False) q_value2_pi = q_value_net_2.get_q_value(self.OBS0, self.pi, reuse=True) # value = value_net.get_value(self.OBS0) value = value_net.get_value(self.OBS0) target_value = target_value_net.get_value(self.OBS1) min_q_value_pi = tf.minimum(q_value1_pi, q_value2_pi) next_q_value = tf.stop_gradient(self.RWD + self.gamma * (1 - self.DONE) * target_value) next_value = tf.stop_gradient(min_q_value_pi - alpha * logp_pi) self.policy_loss = tf.reduce_mean(alpha * logp_pi - q_value1_pi) self.q_value1_loss = tf.reduce_mean(tf.squared_difference(next_q_value, q_value1)) self.q_value2_loss = tf.reduce_mean(tf.squared_difference(next_q_value, q_value2)) self.value_loss = tf.reduce_mean(tf.squared_difference(next_value, value)) self.total_value_loss = self.q_value1_loss + self.q_value2_loss + self.value_loss actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.lr_actor) actor_train_op = actor_optimizer.minimize(self.policy_loss, var_list=tf.global_variables(self.name+'Actor')) value_optimizer = tf.train.AdamOptimizer(learning_rate=self.lr_value) value_params = tf.global_variables(self.name+'Q_value1') + tf.global_variables(self.name+'Q_value2') + tf.global_variables(self.name+'Value') #who is Q_value with tf.control_dependencies([actor_train_op]): value_train_op = value_optimizer.minimize(self.total_value_loss, var_list=value_params) with tf.control_dependencies([value_train_op]): 
self.target_update = [tf.assign(tv, self.tau * tv + (1 - self.tau) * v) for v, tv in zip(tf.global_variables(self.name+'Value'), tf.global_variables(self.name+'Target_Value'))] target_init = [tf.assign(tv, v) for v, tv in zip(tf.global_variables(self.name+'Value'), tf.global_variables(self.name+'Target_Value'))] # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4) # gpu_options = tf.GPUOptions(allow_growth=True) gpu_options = tf.GPUOptions(allow_growth=True) self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) # self.sess.gpu_options.allow_growth = True # new_graph = tf.Graph() # new_graph.seed = SEED[seed] # self.sess = tf.Session() tf.summary.scalar(self.name+'policy_loss', self.policy_loss) tf.summary.scalar(self.name+'q_value1_loss', self.q_value1_loss) tf.summary.scalar(self.name+'q_value2_loss', self.q_value2_loss) tf.summary.scalar(self.name+'value_loss', self.value_loss) tf.summary.scalar(self.name+'total_value_loss', self.total_value_loss) tf.summary.scalar(self.name+'ep_reward', self.EPRWD) tf.summary.scalar(self.name+'rwd', self.RWD[0]) self.merged = tf.summary.merge_all() self.writer = tf.summary.FileWriter('/home/yue/no_image/collision_surroding/src/collision_surouding/Collision_Avoidance/train/logs/'+NAME+'/'+self.name+'/', self.sess.graph) self.saver = tf.train.Saver() self.path = '/home/yue/no_image/collision_surroding/src/collision_surouding/Collision_Avoidance/train/weights/'+ NAME +'/'+ self.name self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init)
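# --- Sketch of the soft ("Polyak") target update assembled above: every target variable
# is pulled toward its online counterpart, keeping a fraction tau of its old value.
# Scope names and tau are illustrative.
import tensorflow as tf

tau = 0.995
with tf.variable_scope('online'):
    tf.get_variable('w', shape=[4])
with tf.variable_scope('target'):
    tf.get_variable('w', shape=[4])

online_vars = tf.global_variables('online')
target_vars = tf.global_variables('target')
target_init = [tf.assign(t, o) for o, t in zip(online_vars, target_vars)]
target_update = [tf.assign(t, tau * t + (1 - tau) * o)
                 for o, t in zip(online_vars, target_vars)]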
def test(): print 'loading dataset...' dataset, img_feature, test_data = get_data_test() num_test = test_data['question'].shape[0] vocabulary_size = len(dataset['ix_to_word'].keys()) print 'vocabulary_size : ' + str(vocabulary_size) model = Answer_Generator( rnn_size = rnn_size, rnn_layer = rnn_layer, batch_size = batch_size, input_embedding_size = input_embedding_size, dim_image = img_feature[0].shape, dim_hidden = dim_hidden, dim_attention = dim_attention, max_words_q = max_words_q, vocabulary_size = vocabulary_size, drop_out_rate = 0) tf_answer, tf_image, tf_question, tf_prob_att1, tf_prob_att2 = model.build_generator() gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5, allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) with tf.device('/cpu:0'): saver = tf.train.Saver() saver.restore(sess, os.path.join(checkpoint_path, 'model-60000')) tStart_total = time.time() result = [] for current_batch_start_idx in xrange(0,num_test-1,batch_size): #for current_batch_start_idx in xrange(0,3,batch_size): tStart = time.time() # set data into current* if current_batch_start_idx + batch_size < num_test: current_batch_file_idx = range(current_batch_start_idx,current_batch_start_idx+batch_size) else: current_batch_file_idx = range(current_batch_start_idx,num_test) current_question = test_data['question'][current_batch_file_idx,:] current_length_q = test_data['length_q'][current_batch_file_idx] current_img_list = test_data['img_list'][current_batch_file_idx] current_ques_id = test_data['ques_id'][current_batch_file_idx] current_img = img_feature[current_img_list,:] # (batch_size, dim_image) # deal with the last batch if(len(current_img)<batch_size): pad_img = np.zeros((batch_size-len(current_img),7,7,dim_image),dtype=np.int) pad_q = np.zeros((batch_size-len(current_img),max_words_q),dtype=np.int) pad_q_len = np.zeros(batch_size-len(current_length_q),dtype=np.int) pad_q_id = np.zeros(batch_size-len(current_length_q),dtype=np.int) pad_ques_id = np.zeros(batch_size-len(current_length_q),dtype=np.int) pad_img_list = np.zeros(batch_size-len(current_length_q),dtype=np.int) current_img = np.concatenate((current_img, pad_img)) current_question = np.concatenate((current_question, pad_q)) current_length_q = np.concatenate((current_length_q, pad_q_len)) current_ques_id = np.concatenate((current_ques_id, pad_q_id)) current_img_list = np.concatenate((current_img_list, pad_img_list)) generated_ans, prob_att1, prob_att2 = sess.run( [tf_answer, tf_prob_att1, tf_prob_att2], feed_dict={ tf_image: current_img, tf_question: current_question }) top_ans = np.argmax(generated_ans, axis=1) # initialize json list for i in xrange(0,batch_size): ans = dataset['ix_to_ans'][str(top_ans[i]+1)] if(current_ques_id[i] == 0): continue result.append({u'answer': ans, u'question_id': str(current_ques_id[i])}) tStop = time.time() print ("Testing batch: ", current_batch_file_idx[0]) print ("Time Cost:", round(tStop - tStart,2), "s") print ("Testing done.") tStop_total = time.time() print ("Total Time Cost:", round(tStop_total - tStart_total,2), "s") # Save to JSON print 'Saving result...' my_list = list(result) dd = json.dump(my_list,open('san_cnn_att.json','w'))
def collect_data(self): output_dir = os.path.expanduser(self.output_datadir) if not os.path.exists(output_dir): os.makedirs(output_dir) dataset = facenet.get_dataset(self.input_datadir) with tf.Graph().as_default(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.85) sess = tf.Session(config=tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False)) with sess.as_default(): pnet, rnet, onet = detect_face.create_mtcnn(sess, './npy') minsize = 20 # minimum size of face threshold = [0.6, 0.7, 0.7] # three steps's threshold factor = 0.709 # scale factor margin = 44 image_size = 182 # Add a random key to the filename to allow alignment using multiple processes random_key = np.random.randint(0, high=99999) bounding_boxes_filename = os.path.join( output_dir, 'bounding_boxes_%05d.txt' % random_key) with open(bounding_boxes_filename, "w") as text_file: nrof_images_total = 0 nrof_successfully_aligned = 0 for cls in dataset: output_class_dir = os.path.join(output_dir, cls.name) if not os.path.exists(output_class_dir): os.makedirs(output_class_dir) for image_path in cls.image_paths: nrof_images_total += 1 filename = os.path.splitext( os.path.split(image_path)[1])[0] output_filename = os.path.join(output_class_dir, filename + '.png') print("Image: %s" % image_path) if not os.path.exists(output_filename): try: img = misc.imread(image_path) except (IOError, ValueError, IndexError) as e: errorMessage = '{}: {}'.format(image_path, e) print(errorMessage) else: if img.ndim < 2: print('Unable to align "%s"' % image_path) text_file.write('%s\n' % (output_filename)) continue if img.ndim == 2: img = facenet.to_rgb(img) print('to_rgb data dimension: ', img.ndim) img = img[:, :, 0:3] bounding_boxes, _ = detect_face.detect_face( img, minsize, pnet, rnet, onet, threshold, factor) nrof_faces = bounding_boxes.shape[0] print('No of Detected Face: %d' % nrof_faces) if nrof_faces > 0: det = bounding_boxes[:, 0:4] img_size = np.asarray(img.shape)[0:2] if nrof_faces > 1: bounding_box_size = ( det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]) img_center = img_size / 2 offsets = np.vstack([ (det[:, 0] + det[:, 2]) / 2 - img_center[1], (det[:, 1] + det[:, 3]) / 2 - img_center[0] ]) offset_dist_squared = np.sum( np.power(offsets, 2.0), 0) index = np.argmax( bounding_box_size - offset_dist_squared * 2.0 ) # some extra weight on the centering det = det[index, :] det = np.squeeze(det) bb_temp = np.zeros(4, dtype=np.int32) bb_temp[0] = det[0] bb_temp[1] = det[1] bb_temp[2] = det[2] bb_temp[3] = det[3] cropped_temp = img[bb_temp[1]:bb_temp[3], bb_temp[0]:bb_temp[2], :] scaled_temp = misc.imresize( cropped_temp, (image_size, image_size), interp='bilinear') nrof_successfully_aligned += 1 misc.imsave(output_filename, scaled_temp) text_file.write( '%s %d %d %d %d\n' % (output_filename, bb_temp[0], bb_temp[1], bb_temp[2], bb_temp[3])) else: print('Unable to align "%s"' % image_path) text_file.write('%s\n' % (output_filename)) return (nrof_images_total, nrof_successfully_aligned)
def main(args): # loading configurations with open(args.config) as f: config = yaml.safe_load(f)["configuration"] work_space = config["workspace"] name = config["Name"] # Construct or load embeddings print("Initializing embeddings ...") vocab_size = config["embeddings"]["vocab_size"] embed_size = config["embeddings"]["embed_size"] vocab_file = '%s/data/%s-%s' % (work_space, "vocab", vocab_size) print("\tDone.") (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir, attn_num_units, dec_num_layers, dec_num_units, dec_cell_type, state_pass, infer_max_iter, l2_regularize, learning_rate) = get_model_config(config) (is_beam_search, beam_size, batch_size, infer_source_file, infer_source_max_length, output_path, gpu_fraction, gpu_id) = get_infer_config(config) print("Building model architecture ...") infer_model = Seq2SeqModel(mode='infer', model_name=name, vocab_size=vocab_size, embedding_size=embed_size, enc_num_layers=enc_num_layers, enc_num_units=enc_num_units, enc_cell_type=enc_cell_type, enc_bidir=enc_bidir, attn_num_units=attn_num_units, dec_num_layers=dec_num_layers, dec_num_units=dec_num_units, dec_cell_type=dec_cell_type, batch_size=batch_size, beam_search=is_beam_search, beam_size=beam_size, infer_max_iter=infer_max_iter, l2_regularize=l2_regularize, learning_rate=learning_rate) print("\tDone.") # Set up session restore_from = '%s/nn_models/' % work_space gpu_fraction = config["training"]["gpu_fraction"] gpu_id = config["training"]["gpu_id"] gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, visible_device_list=gpu_id) sess = tf.Session(config=tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)) init = tf.global_variables_initializer() sess.run(init) # print('global_variables:\n') # glob_var = tf.global_variables() # pprint(glob_var) try: saved_global_step = load(infer_model.saver, sess, restore_from) if saved_global_step is None: raise ValueError("Cannot find the checkpoint to restore from.") except Exception: print("Something went wrong while restoring checkpoint. ") raise # ##### Inference ##### # Load data print("Loading inference data ...") # Load vocabularies. vocab_table, reverse_vocab_table = create_vocab_tables(vocab_file) src_dataset = prepare_infer_data(infer_source_file, vocab_table, max_length=infer_source_max_length) print("\tDone.") # Inference print("Start inferring ...") final_result = [] for ith in range(int(len(src_dataset) / batch_size)): start = ith end = ith + 1 batch = get_infer_batch(src_dataset, start, end, infer_source_max_length) sentence = token_to_str(batch[0][0], reverse_vocab_table) start_time = time.time() result = infer_model.infer(sess, batch) duration = round((time.time() - start_time), 3) print("sentence:%s, cost:%s s" % (ith, duration)) res = "src:{}\n".format(sentence) if is_beam_search is True: for idx, i in enumerate(result[0][0]): reply = token_to_str(i, reverse_vocab_table) res += "\tpred %s:%s\n" % (idx, reply) res += "\n" else: reply = result[0][0] reply = token_to_str(reply, reverse_vocab_table) res += "\tpred:%s\n\n" % reply print(res) final_result.append(res) with open(config["inference"]["output_path"], 'w') as f: for i in final_result: f.write(i + '\n') print("\tDone.")
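# --- Sketch of the session configuration used for inference above: restrict TensorFlow
# to one visible GPU and cap how much of its memory it may claim. The fraction and
# device id are examples of the values read from the YAML config in the original.
import tensorflow as tf

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4,
                            visible_device_list='0')
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                        gpu_options=gpu_options))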
def __init__(self, net_class: AbstractNetClass, dataset_loader: AbstractDatasetLoader,
             optimizer: AbstractOptimizerClass, num_gpus: int = 1, seed=1337,
             train_data_size: int = 45000, batch_size: int = 100,
             dataset_path: str = "./Datasets/cifarDataset.npy", work_path: str = "../",
             experiment_name: str = "model0", is_calc_angle=False):
    self.optimizer = optimizer
    self.dataset_path = dataset_path
    self.model_dir = work_path + "./models/" + experiment_name + "_" + str(NetFrame.static_instance_counter) + "/"
    self.plot_dir = self.model_dir + "plots/" + experiment_name + "/"
    self.default_checkpoints_path = self.model_dir + "checkpoints/convNet.ckp"
    self.default_log_path = self.model_dir + "log/"
    self.experimentName = experiment_name
    self.batch_size = batch_size
    self.is_calc_angle = is_calc_angle

    u.check_and_create_path(self.model_dir)
    u.check_and_create_path(self.default_log_path)
    u.check_and_create_path(self.plot_dir)

    # Delete all existing plots and logs
    if u.check_and_create_path(self.plot_dir):
        for files in os.listdir(self.plot_dir):
            os.remove(os.path.join(self.plot_dir, files))
    if u.check_and_create_path(self.default_log_path):
        for files in os.listdir(self.default_log_path):
            os.remove(os.path.join(self.default_log_path, files))

    # Bump the class-level counter so the next instance gets a new model_dir
    # (self.static_instance_counter += 1 would only create an instance attribute)
    NetFrame.static_instance_counter += 1

    # Set random seeds
    np.random.seed(seed)
    tf.set_random_seed(seed)

    self.__sess = tf.Session(graph=tf.get_default_graph(),
                             config=tf.ConfigProto(allow_soft_placement=True,
                                                   gpu_options=tf.GPUOptions(allow_growth=True),
                                                   log_device_placement=False))
    # TODO change to is debug
    self.__writer = tf.summary.FileWriter(self.default_log_path, filename_suffix=".event", flush_secs=10)

    # Note: the same iterator has to be shared across all GPUs
    self.__iterator, self.inference_mode_var, train_size, eval_size, test_size = dataset_loader.get_iterator(
        self.__sess, self.dataset_path, train_data_size, self.batch_size, num_gpus)

    # Iterations per epoch (ceil division: one extra iteration for a partial last batch)
    self.__num_train_it_per_epoch = train_size // self.batch_size
    self.__num_train_it_per_epoch += 1 if train_size % self.batch_size != 0 else 0
    self.__num_eval_it_per_epoch = eval_size // self.batch_size
    self.__num_eval_it_per_epoch += 1 if eval_size % self.batch_size != 0 else 0
    self.__num_test_it_per_epoch = test_size // self.batch_size
    self.__num_test_it_per_epoch += 1 if test_size % self.batch_size != 0 else 0

    # with tf.device('/cpu:0'):
    print("loading Network: " + net_class.get_name())
    # self.__grad_op, self.__loss_reg_op, self.__loss_op, self.__acc_op, self.__acc_update_op, \
    #     self.batch_assign_ops, self.reuse_binary_tensor = net_class.get_model(
    #         self.__iterator, self.inference_mode_var, batch_size, num_gpus)
    self.__grad_op, self.__loss_reg_op, _, self.__acc_op, self.__acc_update_op, \
        self.batch_assign_ops, self.reuse_binary_tensor = net_class.get_model(
            self.__iterator, self.inference_mode_var, batch_size, num_gpus)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        self._loss_tensor_update_ops = tf.identity(self.__loss_reg_op)

    # Get gradient, calc mean gradient, update gradient.
    # Build gradient variables for angle determination.
    if self.is_calc_angle:
        with tf.variable_scope("grad_vars"):
            grad_vars = []
            train_vars = [e[1] for e in self.__grad_op]
            gradient_tensors = [e[0] for e in self.__grad_op]
            for var in train_vars:
                new_var = tf.Variable(tf.zeros(var.shape), trainable=False, name=var.name[0:-2])
                grad_vars.append(new_var)

            # Ops that copy the current gradients into grad_vars
            ass_grads = []
            for grad_var, grad in zip(grad_vars, gradient_tensors):
                assign = tf.assign(grad_var, grad)
                ass_grads.append(assign)
            # Gate the gradients on the copies so the snapshots are written
            # before the optimizer consumes the gradients
            with tf.control_dependencies(ass_grads):
                gradient_tensors = [tf.identity(g) for g in gradient_tensors]
            self.__grad_op = list(zip(gradient_tensors, train_vars))

    self.optimizer.initialize(self.__sess, self.__grad_op, self.__loss_reg_op, None, self.plot_dir,
                              self.reuse_binary_tensor)  # , batch_assign_ops=self.batch_assign_ops)

    if self.is_calc_angle:
        # Note: `vars` is only defined for PAOptimizerSuper or a Momentum-based TfOptimizer
        if isinstance(self.optimizer, PAOptimizerSuper):
            vars = self.optimizer.step_direction_variables
        elif isinstance(self.optimizer, TfOptimizer):
            if isinstance(self.optimizer.optimizer, tf.train.MomentumOptimizer):
                vars = [self.optimizer.optimizer.get_slot(t_var, "momentum")
                        for t_var in tf.trainable_variables()]
        self.step_direction_norm_op = u.get_calc_norm_op(vars)
        self.step_direction_angle_op = u.get_calc_angel_op(vars, self.__grad_op)

    self.__sess.run(tf.global_variables_initializer())
    # Since parameter (weight) variables are added before optimizer variables, all weights get the
    # same g._last_id with different optimizers -> same weight initialization.

    self.metric_variables_initializer = [x.initializer for x in tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)]

    # Count the number of trainable parameters
    sum_ = 0
    for train_var in tf.trainable_variables():
        prod = 1
        for e in train_var.get_shape():
            prod = e * prod
        sum_ += prod
    print("amount parameters: ", sum_)

    # The saver has to be created after the model is built and all variables are defined
    self.__saver = tf.train.Saver()

    # save graph for tensorboard
    # self.__writer.add_graph(self.__sess.graph)
    # self.__writer.flush()
    sys.stdout.flush()
    return
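# The angle-computation branch above uses a common TF1 pattern: snapshot each gradient
# into a non-trainable variable and gate the gradient tensors on those assigns with
# tf.control_dependencies, so the copies are guaranteed to run before the optimizer
# consumes the gradients. Below is a self-contained toy sketch of that pattern
# (hypothetical two-variable loss, not the NetFrame graph itself).
def gradient_snapshot_sketch():
    import tensorflow as tf  # TF 1.x graph-mode API, matching the surrounding code

    # Toy loss over two variables standing in for the real network's weights
    w = tf.Variable(tf.random_normal([3, 3]), name="w")
    b = tf.Variable(tf.zeros([3]), name="b")
    loss = tf.reduce_sum(tf.square(w)) + tf.reduce_sum(tf.square(b))
    grads_and_vars = list(zip(tf.gradients(loss, [w, b]), [w, b]))

    # Non-trainable copies that keep the gradient of the current step
    with tf.variable_scope("grad_vars"):
        grad_vars = [tf.Variable(tf.zeros(v.shape), trainable=False) for _, v in grads_and_vars]
    assign_ops = [tf.assign(gv, g) for gv, (g, _) in zip(grad_vars, grads_and_vars)]

    # Whoever consumes `gated_grads` implicitly forces the snapshot assigns to run first
    with tf.control_dependencies(assign_ops):
        gated_grads = [tf.identity(g) for g, _ in grads_and_vars]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        grad_values = sess.run(gated_grads)   # triggers the snapshot assigns
        snapshots = sess.run(grad_vars)       # snapshots now hold the gradients just computed
        return grad_values, snapshots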
def main():
    start = time.time()

    SSTrainConfig = namedtuple(
        'SSTrainConfig',
        'data_path out_path image_size class_num epochs batch_size')
    t_conf = SSTrainConfig('../../data/CamVid/', '../../saved/segmentation/',
                           (360, 480, 3), 12, 12, 10)

    # Per-class loss weights, one per CamVid class
    class_weighting = [
        0.2595, 0.1826, 4.5640, 0.1417, 0.5051, 0.3826,
        9.6446, 1.8418, 6.6823, 6.2478, 3.0, 7.3614
    ]

    config = tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True, per_process_gpu_memory_fraction=0.8))
    session = tf.Session(config=config)
    keras.backend.tensorflow_backend.set_session(session)

    print('====loading data====')
    ds = dataset.DataSet(class_num=t_conf.class_num,
                         data_shape=t_conf.image_size,
                         train_file='train2.txt', test_file='test2.txt')
    train_data, train_labels = ds.load_data(mode='train',
                                            data_path=t_conf.data_path,
                                            data_shape=t_conf.image_size,
                                            class_num=t_conf.class_num)
    train_data = ds.preprocess_inputs(train_data)
    train_labels = ds.reshape_labels(train_labels)
    print('input data shape...', train_data.shape)
    print('input label shape...', train_labels.shape)

    test_data, test_labels = ds.load_data(mode='test',
                                          data_path=t_conf.data_path,
                                          data_shape=t_conf.image_size,
                                          class_num=t_conf.class_num)
    test_data = ds.preprocess_inputs(test_data)
    test_labels = ds.reshape_labels(test_labels)

    tb_cb = keras.callbacks.TensorBoard(log_dir=t_conf.out_path,
                                        histogram_freq=1,
                                        write_graph=True,
                                        write_images=True)

    print("creating model...")
    model = seg_model.SegNet(input_shape=t_conf.image_size,
                             classes=t_conf.class_num)
    model.compile(loss="categorical_crossentropy",
                  optimizer='adadelta', metrics=["accuracy"])

    # Resume from the epoch-9 weights and continue training up to t_conf.epochs
    model.load_weights(t_conf.out_path + 'seg_9.h5')
    model.fit(train_data, train_labels,
              initial_epoch=9,
              batch_size=t_conf.batch_size,
              epochs=t_conf.epochs,
              verbose=1,
              class_weight=class_weighting,
              validation_data=(test_data, test_labels),
              shuffle=True,
              callbacks=[tb_cb])
    model.save(t_conf.out_path + 'seg_12.h5')

    elapsed_time = time.time() - start
    print("elapsed_time:{0}".format(elapsed_time) + "[sec]")
    backend.clear_session()
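# SegNet-style Keras models trained with categorical_crossentropy typically expect the label
# masks flattened to (N, H*W, class_num) one-hot targets; ds.reshape_labels presumably performs
# this conversion, but its code is not shown here. Below is a hypothetical stand-in illustrating
# that assumed conversion; the function name and shapes are assumptions, not the project's API.
def reshape_labels_sketch(label_masks, class_num=12):
    """Turn integer masks of shape (N, H, W) into one-hot targets of shape (N, H*W, class_num)."""
    import numpy as np
    from keras.utils import to_categorical

    n, h, w = label_masks.shape
    flat = label_masks.reshape(n, h * w)                 # (N, H*W) integer class ids
    return to_categorical(flat, num_classes=class_num)   # (N, H*W, class_num) one-hot

# Example with random CamVid-sized masks (360x480, 12 classes):
#   dummy = np.random.randint(0, 12, size=(2, 360, 480))
#   reshape_labels_sketch(dummy).shape  -> (2, 172800, 12)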