def eval(model, name, dataset, sample_shape=[4, 4], load_all_ckpt=True):
    """Save a grid of generated samples for every checkpoint of a model.

    The output directory ``eval/<dataset>/<name>`` is deleted if it exists and
    then recreated. Each checkpoint returned by ``get_all_checkpoints`` is
    restored in turn and sampled with one fixed latent batch, so the grids
    written for different checkpoints share the same ``z``.

    Args:
        model: Model object; this function reads ``model.name``,
            ``model.z_dim``, ``model.z`` and ``model.fake_sample``.
        name: Run name used in the checkpoint and output paths. ``None``
            falls back to ``model.name``.
        dataset: Dataset name used as a path component.
        sample_shape: Two-element grid shape; the product of its entries is
            the number of samples drawn, and it is forwarded to
            ``utils.merge`` as ``size``.
        load_all_ckpt: Forwarded to ``get_all_checkpoints`` as ``force``.
    """
    if name is None:
        name = model.name

    # Start from an empty output directory on every run.
    dir_name = os.path.join('eval', dataset, name)
    if tf.gfile.Exists(dir_name):
        tf.gfile.DeleteRecursively(dir_name)
    tf.gfile.MakeDirs(dir_name)

    restorer = tf.train.Saver(slim.get_model_variables())

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    config.gpu_options.visible_device_list = str(best_gpu)

    with tf.Session(config=config) as sess:
        ckpt_path = os.path.join('checkpoints', dataset, name)
        ckpts = get_all_checkpoints(ckpt_path, force=load_all_ckpt)

        size = sample_shape[0] * sample_shape[1]
        # Sampled once, outside the loop, so every checkpoint sees the same z.
        z_ = sample_z([size, model.z_dim])

        for v in ckpts:
            print("Evaluating {} ...".format(v))
            restorer.restore(sess, v)
            # The step is the text after the last '-' of the checkpoint's
            # file name (e.g. ".../<name>-12000" -> 12000).
            global_step = int(os.path.basename(v).rsplit('-', 1)[-1])

            fake_samples = sess.run(model.fake_sample, {model.z: z_})
            # inverse transform: [-1, 1] => [0, 1]
            fake_samples = (fake_samples + 1.) / 2.
            merged_samples = utils.merge(fake_samples, size=sample_shape)

            fn = "{:0>6d}.png".format(global_step)
            scipy.misc.imsave(os.path.join(dir_name, fn), merged_samples)
def set_device(self):
    """Record the system type and choose the torch device for this object.

    Sets ``self.sys_type``, ``self.device_str`` and ``self.device``, and
    mirrors the first two into ``self.dict`` under the keys ``"sys_type"``
    and ``"device"``. If ``get_best_gpu()`` raises, ``"cuda:0"`` is used.
    """
    self.sys_type = self.dict["sys_type"] = get_sys_type()
    try:
        self.device_str = get_best_gpu()
    except Exception:
        # GPU probing can fail (e.g. a time-out exception); fall back to the
        # first CUDA device. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        self.device_str = "cuda:0"
    if self.verbose:
        print("Options: Using device: %s"%(self.device_str))
    self.dict["device"] = self.device_str
    self.device = torch.device(self.device_str)
def eval(model, name, sample_shape=[1, 1], load_all_ckpt=True,
         image_path='/home/xujinchang/share/project/GAN/tf.gans-comparison/111000.jpg'):
    """Run every checkpoint of a model on one input image and save the output.

    The output directory ``eval/<name>`` is deleted if it exists and then
    recreated. The image at ``image_path`` is read as RGB, passed through
    ``pre_precess_LR`` with size ``[128, 128]``, rescaled with
    ``x / 127.5 - 1.0`` and fed to ``model.z`` for each checkpoint.

    Args:
        model: Model object; this function reads ``model.name``, ``model.z``
            and ``model.fake_sample``.
        name: Run name used in the checkpoint and output paths. ``None``
            falls back to ``model.name``.
        sample_shape: Grid shape forwarded to ``utils.merge`` as ``size``.
        load_all_ckpt: Forwarded to ``get_all_checkpoints`` as ``force``.
        image_path: Path of the input image. Defaults to the location that
            was previously hard-coded, so existing callers are unaffected.
    """
    if name is None:
        name = model.name

    # Start from an empty output directory on every run.
    dir_name = os.path.join('eval', name)
    if tf.gfile.Exists(dir_name):
        tf.gfile.DeleteRecursively(dir_name)
    tf.gfile.MakeDirs(dir_name)

    # training=False => generator only
    restorer = tf.train.Saver(slim.get_model_variables())

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    # Works same as CUDA_VISIBLE_DEVICES!
    config.gpu_options.visible_device_list = str(best_gpu)

    with tf.Session(config=config) as sess:
        ckpts = get_all_checkpoints(os.path.join('.', 'checkpoints', name),
                                    force=load_all_ckpt)

        im = scipy.misc.imread(image_path, mode='RGB')
        z_ = pre_precess_LR(im, [128, 128])
        # Presumably maps 8-bit pixel values [0, 255] to [-1, 1]; this depends
        # on what pre_precess_LR returns -- TODO confirm.
        z_ = z_ / 127.5 - 1.0

        for v in ckpts:
            print("Evaluating {} ...".format(v))
            restorer.restore(sess, v)
            # The step is the text after the last '-' of the checkpoint's
            # file name (e.g. ".../<name>-12000" -> 12000).
            global_step = int(os.path.basename(v).rsplit('-', 1)[-1])

            fake_samples = sess.run(model.fake_sample, {model.z: z_})
            # inverse transform: [-1, 1] => [0, 1]
            fake_samples = (fake_samples + 1.) / 2.
            merged_samples = utils.merge(fake_samples, size=sample_shape)

            fn = "{:0>5d}.png".format(global_step)
            scipy.misc.imsave(os.path.join(dir_name, fn), merged_samples)
def eval_individual(model, name, dataset, num=100, rep=4, step=35000):
    """Generate samples one at a time from a single checkpoint.

    The output directory ``eval/<dataset>/<name>`` is deleted if it exists and
    then recreated. ``num * rep`` latent vectors are drawn, paired row by row
    with the conditions returned by ``CSampler().sample_c(rep)``, and each
    generated image is written to ``<tokind>.<rep_ind>.png``.

    Args:
        model: Model object; this function reads ``model.name``,
            ``model.z_dim``, ``model.z``, ``model.c`` and
            ``model.fake_sample``.
        name: Run name used in the checkpoint and output paths. ``None``
            falls back to ``model.name``.
        dataset: Dataset name used as a path component.
        num: Together with ``rep`` fixes the number of samples
            (``num * rep``). Presumably the number of distinct conditions the
            sampler yields -- TODO confirm against ``CSampler``.
        rep: Forwarded to ``sampler.sample_c``.
        step: Checkpoint step forwarded to ``get_all_checkpoints``.
            Previously this argument was ignored and 30000 was hard-coded.
    """
    if name is None:
        name = model.name

    # Start from an empty output directory on every run.
    dir_name = os.path.join('eval', dataset, name)
    if tf.gfile.Exists(dir_name):
        tf.gfile.DeleteRecursively(dir_name)
    tf.gfile.MakeDirs(dir_name)

    sampler = CSampler()
    restorer = tf.train.Saver(slim.get_model_variables())

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    config.gpu_options.visible_device_list = str(best_gpu)
    # Request the ON_2 global JIT level for this session's graph.
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2

    with tf.Session(config=config) as sess:
        ckpt_path = os.path.join('checkpoints', dataset, name)
        # Honor the caller-supplied step; only the first match is used.
        ckpt = get_all_checkpoints(ckpt_path, step=step, force=True)[0]
        print("Evaluating {} ...".format(ckpt))
        restorer.restore(sess, ckpt)

        size = num * rep
        z_ = sample_z([size, model.z_dim])
        c, inds = sampler.sample_c(rep)

        for ind in range(size):
            # np.newaxis keeps a leading batch axis of length 1 on each feed.
            fake_sample = sess.run(model.fake_sample, {
                model.z: z_[np.newaxis, ind, :],
                model.c: c[np.newaxis, ind, :]
            })
            # inverse transform: [-1, 1] => [0, 1]
            img = (fake_sample + 1.0) / 2.0
            tokind, rep_ind = inds[ind]
            scipy.misc.imsave(
                os.path.join(dir_name, "{}.{}.png".format(tokind, rep_ind)),
                img[0, ...])
def train(model, dataset, input_op, num_epochs, batch_size, n_examples,
          ckpt_step, renew=False):
    """Train a GAN model, writing summaries and periodic checkpoints.

    Summaries go to ``./summary/<dataset>/<model.name>`` and checkpoints to
    ``./checkpoints/<dataset>/<model.name>``. If a checkpoint already exists
    there (and ``renew`` is false) training resumes from its global step.
    Each iteration runs one discriminator update followed by one generator
    update on the same latent batch.

    Args:
        model: Model object; this function reads ``model.name``,
            ``model.args``, ``model.z_dim``, ``model.X``, ``model.z``,
            ``model.global_step``, ``model.D_train_op``, ``model.G_train_op``,
            ``model.summary_op`` and ``model.all_summary_op``.
        dataset: Dataset name used as a path component and in the config
            summary.
        input_op: Op evaluated with ``sess.run`` to obtain each real batch.
        num_epochs: Number of epochs; with ``n_examples`` and ``batch_size``
            it fixes the total number of steps.
        batch_size: Size of each latent batch and divisor for step counts.
        n_examples: Number of training examples in the dataset.
        ckpt_step: A checkpoint is saved whenever the global step is a
            multiple of this value.
        renew: If true, existing summary and checkpoint directories are
            deleted before training starts.
    """
    # n_examples = 202599 # same as util.num_examples_from_tfrecords(glob.glob('./data/celebA_tfrecords/*.tfrecord'))
    # 1 epoch = 1583 steps
    print("\n# of examples: {}".format(n_examples))
    print("steps per epoch: {}\n".format(n_examples // batch_size))

    summary_path = os.path.join('.', 'summary', dataset, model.name)
    ckpt_path = os.path.join('.', 'checkpoints', dataset, model.name)
    if renew:
        if os.path.exists(summary_path):
            tf.gfile.DeleteRecursively(summary_path)
        if os.path.exists(ckpt_path):
            tf.gfile.DeleteRecursively(ckpt_path)
    if not os.path.exists(ckpt_path):
        tf.gfile.MakeDirs(ckpt_path)

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    # Works same as CUDA_VISIBLE_DEVICES!
    config.gpu_options.visible_device_list = str(best_gpu)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())  # for epochs

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        # https://github.com/tensorflow/tensorflow/issues/10972
        # TensorFlow 1.2 has many bugs in text summaries; build the config
        # summary before the summary writer is defined to work around a
        # TensorBoard bug.
        # It seems that batch_size should have been contained in the model
        # config ...
        total_steps = int(np.ceil(n_examples * num_epochs / float(batch_size)))  # total global step
        config_list = [('num_epochs', num_epochs),
                       ('total_iteration', total_steps),
                       ('batch_size', batch_size),
                       ('dataset', dataset)]
        model_config_list = [[k, str(w)]
                             for k, w in sorted(model.args.items()) + config_list]
        model_config_summary_op = tf.summary.text(
            model.name + '/config',
            tf.convert_to_tensor(model_config_list),
            collections=[])
        model_config_summary = sess.run(model_config_summary_op)

        # print to console
        print("\n====== Process info =======")
        print("argv: {}".format(' '.join(sys.argv)))
        print("PID: {}".format(os.getpid()))
        print("====== Model configs ======")
        for k, v in model_config_list:
            print("{}: {}".format(k, v))
        print("===========================\n")

        summary_writer = tf.summary.FileWriter(summary_path, flush_secs=30,
                                               graph=sess.graph)
        summary_writer.add_summary(model_config_summary)
        pbar = tqdm(total=total_steps, desc='global_step')
        saver = tf.train.Saver(max_to_keep=9999)  # save all checkpoints

        global_step = 0
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        if ckpt:
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = sess.run(model.global_step)
            print('\n[!] Restore from {} ... starting global step is {}\n'.
                  format(ckpt.model_checkpoint_path, global_step))
            # Advance the progress bar to the restored step.
            pbar.update(global_step)

        try:
            # If training process was resumed from checkpoints, input pipeline
            # cannot detect when training should stop. So we need
            # `global_step < total_step` condition.
            while not coord.should_stop() and global_step < total_steps:
                # model.all_summary_op contains histogram summary and image
                # summary which are heavy ops, so it runs only every 100th
                # step; the lighter model.summary_op runs on all other steps.
                # (The two branches were previously swapped.)
                if global_step % 100 == 0:
                    summary_op = model.all_summary_op
                else:
                    summary_op = model.summary_op

                batch_X = sess.run(input_op)
                batch_z = sample_z([batch_size, model.z_dim])

                # Discriminator update, then generator update on the same z.
                _, summary = sess.run([model.D_train_op, summary_op], {
                    model.X: batch_X,
                    model.z: batch_z
                })
                _, global_step = sess.run(
                    [model.G_train_op, model.global_step],
                    {model.z: batch_z})

                summary_writer.add_summary(summary, global_step=global_step)
                pbar.update()

                if global_step % ckpt_step == 0:
                    saver.save(sess, os.path.join(ckpt_path, model.name),
                               global_step=global_step)
        except tf.errors.OutOfRangeError:
            print('\nDone -- epoch limit reached\n')
        finally:
            coord.request_stop()
            coord.join(threads)
            summary_writer.close()
            pbar.close()