Python get_best_gpu 예제들, utils.get_best_gpu Python 예제들

예제 #1

0

파일 보기

파일: eval.py 프로젝트: xiaoye77/tf.gans-comparison

def eval(model, name, dataset, sample_shape=[4, 4], load_all_ckpt=True):
    """Render one grid of generated samples per checkpoint of `model`.

    Every checkpoint returned by `get_all_checkpoints` is restored in turn,
    the generator is run on a single batch of latent vectors that is sampled
    once and reused, and the merged grid is written to
    ``eval/<dataset>/<name>/<global_step>.png`` (step zero-padded to six
    digits). The output directory is deleted and recreated on every call.

    Args:
        model: Object exposing `name`, `z_dim`, the feed tensor `z` and the
            output tensor `fake_sample`.
        name: Directory name used for both checkpoints and outputs;
            `model.name` is used when this is None.
        dataset: Dataset name, used as a path component.
        sample_shape: Two-element grid shape; the number of generated
            samples is the product of its entries. The list default is only
            read in this function.
        load_all_ckpt: Forwarded to `get_all_checkpoints` as `force`.
    """
    if name is None:
        name = model.name
    dir_name = os.path.join('eval', dataset, name)
    # Start from an empty output directory on every run.
    if tf.gfile.Exists(dir_name):
        tf.gfile.DeleteRecursively(dir_name)
    tf.gfile.MakeDirs(dir_name)

    restorer = tf.train.Saver(slim.get_model_variables())

    # Restrict the session to the GPU chosen by utils.get_best_gpu().
    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    config.gpu_options.visible_device_list = str(best_gpu)
    with tf.Session(config=config) as sess:
        ckpt_path = os.path.join('checkpoints', dataset, name)
        ckpts = get_all_checkpoints(ckpt_path, force=load_all_ckpt)
        size = sample_shape[0] * sample_shape[1]

        # Sampled once so that every checkpoint is fed the same latents.
        z_ = sample_z([size, model.z_dim])

        for v in ckpts:
            print("Evaluating {} ...".format(v))
            restorer.restore(sess, v)
            # The step is the text after the last '-' of the file name;
            # basename() copes with whatever separator the platform uses.
            global_step = int(os.path.basename(v).rsplit('-', 1)[-1])

            fake_samples = sess.run(model.fake_sample, {model.z: z_})

            # inverse transform: [-1, 1] => [0, 1]
            fake_samples = (fake_samples + 1.) / 2.
            merged_samples = utils.merge(fake_samples, size=sample_shape)
            fn = "{:0>6d}.png".format(global_step)
            scipy.misc.imsave(os.path.join(dir_name, fn), merged_samples)

예제 #2

0

파일 보기

 def set_device(self):
     """Choose the torch device and record it on the instance and in `self.dict`.

     Stores the result of `get_sys_type()` as `self.sys_type` and under
     `self.dict["sys_type"]`, then asks `get_best_gpu()` for a device
     string. If that call raises, the device falls back to ``"cuda:0"``.
     The chosen string is stored as `self.device_str` and under
     `self.dict["device"]`, and the matching `torch.device` as
     `self.device`.
     """
     self.sys_type = self.dict["sys_type"] = get_sys_type()
     try:
         self.device_str = get_best_gpu()
     except Exception:
         # The original author notes a time-out exception here. Catching
         # Exception rather than using a bare `except:` keeps the fallback
         # while letting KeyboardInterrupt / SystemExit propagate.
         self.device_str = "cuda:0"
     if self.verbose:
         print("Options: Using device: %s"%(self.device_str))
     self.dict["device"] = self.device_str
     self.device = torch.device(self.device_str)

예제 #3

0

파일 보기

def eval(model, name, sample_shape=[1, 1], load_all_ckpt=True,
         image_path='/home/xujinchang/share/project/GAN/tf.gans-comparison/111000.jpg'):
    """Run every checkpoint of `model` on one input image and save the outputs.

    The image at `image_path` is read as RGB, passed through
    `pre_precess_LR(im, [128, 128])`, rescaled with ``x / 127.5 - 1.0`` and
    fed to `model.z`. For each checkpoint returned by `get_all_checkpoints`
    the merged result is written to ``eval/<name>/<global_step>.png`` (step
    zero-padded to five digits). The output directory is deleted and
    recreated on every call.

    Args:
        model: Object exposing `name`, the feed tensor `z` and the output
            tensor `fake_sample`.
        name: Directory name used for both checkpoints and outputs;
            `model.name` is used when this is None.
        sample_shape: Grid shape forwarded to `utils.merge` as `size`. The
            list default is only read in this function.
        load_all_ckpt: Forwarded to `get_all_checkpoints` as `force`.
        image_path: Path of the input image. Defaults to the location that
            was previously hard-coded, so existing callers are unaffected.
    """
    if name is None:
        name = model.name
    dir_name = 'eval/' + name
    # Start from an empty output directory on every run.
    if tf.gfile.Exists(dir_name):
        tf.gfile.DeleteRecursively(dir_name)
    tf.gfile.MakeDirs(dir_name)

    # training=False => generator only
    restorer = tf.train.Saver(slim.get_model_variables())

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    config.gpu_options.visible_device_list = str(
        best_gpu)  # Works same as CUDA_VISIBLE_DEVICES!
    with tf.Session(config=config) as sess:
        ckpts = get_all_checkpoints('./checkpoints/' + name,
                                    force=load_all_ckpt)

        # The network input is an image here, not a sampled latent vector.
        im = scipy.misc.imread(image_path, mode='RGB')
        z_ = pre_precess_LR(im, [128, 128])
        z_ = z_ / 127.5 - 1.0
        for v in ckpts:
            print("Evaluating {} ...".format(v))
            restorer.restore(sess, v)
            # The step is the text after the last '-' of the file name;
            # basename() copes with whatever separator the platform uses.
            global_step = int(os.path.basename(v).rsplit('-', 1)[-1])

            fake_samples = sess.run(model.fake_sample, {model.z: z_})

            # inverse transform: [-1, 1] => [0, 1]
            fake_samples = (fake_samples + 1.) / 2.
            merged_samples = utils.merge(fake_samples, size=sample_shape)
            fn = "{:0>5d}.png".format(global_step)
            scipy.misc.imsave(os.path.join(dir_name, fn), merged_samples)

예제 #4

0

파일 보기

파일: eval.py 프로젝트: villa1233/mkbe

def eval_individual(model, name, dataset, num=100, rep=4, step=35000):
    """Generate samples one at a time from a single checkpoint and save each.

    Restores the first checkpoint returned by `get_all_checkpoints`, then
    runs the generator ``num * rep`` times, each time on a single latent
    row and a single condition row. Every image is written to
    ``eval/<dataset>/<name>/<tokind>.<rep_ind>.png``, where the pair
    ``(tokind, rep_ind)`` comes from `CSampler.sample_c`. The output
    directory is deleted and recreated on every call.

    Args:
        model: Object exposing `name`, `z_dim`, the feed tensors `z` and
            `c`, and the output tensor `fake_sample`.
        name: Directory name used for both checkpoints and outputs;
            `model.name` is used when this is None.
        dataset: Dataset name, used as a path component.
        num: Together with `rep`, sets the number of samples (``num * rep``).
        rep: Passed to `CSampler.sample_c`.
        step: Currently not used by the body; see the NOTE(review) below.
    """
    if name is None:
        name = model.name
    dir_name = os.path.join('eval', dataset, name)
    # Start from an empty output directory on every run.
    if tf.gfile.Exists(dir_name):
        tf.gfile.DeleteRecursively(dir_name)
    tf.gfile.MakeDirs(dir_name)

    sampler = CSampler()

    restorer = tf.train.Saver(slim.get_model_variables())

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    config.gpu_options.visible_device_list = str(best_gpu)
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2
    with tf.Session(config=config) as sess:
        ckpt_path = os.path.join('checkpoints', dataset, name)
        # NOTE(review): the `step` argument is ignored and 30000 is
        # hard-coded here, which also disagrees with the 35000 default.
        # Left untouched because passing `step` through would change which
        # checkpoint is evaluated by default -- confirm the intent.
        ckpt = get_all_checkpoints(ckpt_path, step=30000, force=True)[0]

        print("Evaluating {} ...".format(ckpt))
        restorer.restore(sess, ckpt)
        size = num * rep

        z_ = sample_z([size, model.z_dim])
        # presumably `c` and `inds` hold one entry per sample (num * rep
        # in total) -- verify against CSampler.sample_c.
        c, inds = sampler.sample_c(rep)

        for ind in range(size):
            # np.newaxis keeps a leading batch axis of length 1.
            fake_sample = sess.run(model.fake_sample, {
                model.z: z_[np.newaxis, ind, :],
                model.c: c[np.newaxis, ind, :]
            })
            # inverse transform: [-1, 1] => [0, 1]
            img = (fake_sample + 1.0) / 2.0
            tokind, rep_ind = inds[ind]
            scipy.misc.imsave(
                os.path.join(dir_name, "{}.{}.png".format(tokind, rep_ind)),
                img[0, ...])

예제 #5

0

파일 보기

def train(model,
          dataset,
          input_op,
          num_epochs,
          batch_size,
          n_examples,
          ckpt_step,
          renew=False):
    """Train `model`, writing summaries and periodic checkpoints.

    Each iteration fetches a batch from `input_op`, runs one discriminator
    update and one generator update on the same latent batch, and records
    a summary. Training resumes from the latest checkpoint found under
    ``./checkpoints/<dataset>/<model.name>`` and stops once the global step
    reaches ``ceil(n_examples * num_epochs / batch_size)``, the coordinator
    asks to stop, or the input pipeline raises `OutOfRangeError`.

    Args:
        model: Object exposing `name`, `args` (a dict), `z_dim`, the feed
            tensors `X` and `z`, the ops `D_train_op` and `G_train_op`, the
            `global_step` tensor, and the summary ops `summary_op` and
            `all_summary_op`.
        dataset: Dataset name, used as a path component and logged in the
            config summary.
        input_op: Op that yields one batch of real data per `sess.run`.
        num_epochs: Number of passes over the data.
        batch_size: Number of examples per step.
        n_examples: Number of examples in the dataset.
        ckpt_step: A checkpoint is saved whenever the global step is a
            multiple of this value.
        renew: When true, existing summary and checkpoint directories are
            deleted before training starts.
    """
    # n_examples = 202599 # same as util.num_examples_from_tfrecords(glob.glob('./data/celebA_tfrecords/*.tfrecord'))
    # 1 epoch = 1583 steps
    print("\n# of examples: {}".format(n_examples))
    print("steps per epoch: {}\n".format(n_examples // batch_size))

    summary_path = os.path.join('.', 'summary', dataset, model.name)
    ckpt_path = os.path.join('.', 'checkpoints', dataset, model.name)
    if renew:
        if os.path.exists(summary_path):
            tf.gfile.DeleteRecursively(summary_path)
        if os.path.exists(ckpt_path):
            tf.gfile.DeleteRecursively(ckpt_path)
    if not os.path.exists(ckpt_path):
        tf.gfile.MakeDirs(ckpt_path)

    config = tf.ConfigProto()
    best_gpu = utils.get_best_gpu()
    config.gpu_options.visible_device_list = str(
        best_gpu)  # Works same as CUDA_VISIBLE_DEVICES!
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())  # for epochs

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        # https://github.com/tensorflow/tensorflow/issues/10972
        # TensorFlow 1.2 has many bugs around text summaries, so the config
        # summary is built before the summary writer is defined to work
        # around a TensorBoard bug.

        # It seems that batch_size should have been part of the model config ...
        total_steps = int(np.ceil(n_examples * num_epochs /
                                  float(batch_size)))  # total global step
        config_list = [('num_epochs', num_epochs),
                       ('total_iteration', total_steps),
                       ('batch_size', batch_size), ('dataset', dataset)]
        model_config_list = [[k, str(w)]
                             for k, w in sorted(model.args.items()) +
                             config_list]
        model_config_summary_op = tf.summary.text(
            model.name + '/config',
            tf.convert_to_tensor(model_config_list),
            collections=[])
        model_config_summary = sess.run(model_config_summary_op)

        # print to console
        print("\n====== Process info =======")
        print("argv: {}".format(' '.join(sys.argv)))
        print("PID: {}".format(os.getpid()))
        print("====== Model configs ======")
        for k, v in model_config_list:
            print("{}: {}".format(k, v))
        print("===========================\n")

        summary_writer = tf.summary.FileWriter(summary_path,
                                               flush_secs=30,
                                               graph=sess.graph)
        summary_writer.add_summary(model_config_summary)
        pbar = tqdm(total=total_steps, desc='global_step')
        saver = tf.train.Saver(max_to_keep=9999)  # save all checkpoints
        global_step = 0

        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        if ckpt:
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = sess.run(model.global_step)
            print('\n[!] Restore from {} ... starting global step is {}\n'.
                  format(ckpt.model_checkpoint_path, global_step))
            # Advance the progress bar to the restored position.
            pbar.update(global_step)

        try:
            # If training process was resumed from checkpoints, input pipeline cannot detect
            # when training should stop. So we need `global_step < total_step` condition.
            while not coord.should_stop() and global_step < total_steps:
                # model.all_summary_op contains histogram and image summaries,
                # which are heavy ops, so it is only run on every 100th step;
                # all other steps use the light model.summary_op. (The
                # condition used to be `== 0`, which ran the heavy op on
                # 99 steps out of every 100.)
                summary_op = model.summary_op if global_step % 100 != 0 else model.all_summary_op

                batch_X = sess.run(input_op)
                batch_z = sample_z([batch_size, model.z_dim])

                # Discriminator step first, then a generator step on the
                # same latent batch; the generator run also returns the
                # updated global step.
                _, summary = sess.run([model.D_train_op, summary_op], {
                    model.X: batch_X,
                    model.z: batch_z
                })
                _, global_step = sess.run(
                    [model.G_train_op, model.global_step], {model.z: batch_z})

                summary_writer.add_summary(summary, global_step=global_step)

                pbar.update()

                if global_step % ckpt_step == 0:
                    saver.save(sess,
                               os.path.join(ckpt_path, model.name),
                               global_step=global_step)

        except tf.errors.OutOfRangeError:
            print('\nDone -- epoch limit reached\n')
        finally:
            coord.request_stop()

        coord.join(threads)
        summary_writer.close()
        pbar.close()