Example #1
def main(argv):
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-l', '--level', dest='level', type=int, help='Provide number of the level to solve', default=0)
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action="store_true", help='Set verbosity level to DEBUG', default=False)
    args = arg_parser.parse_args(argv)

    logger = setup_logger(verbose=args.verbose)
    level = args.level
    _valid_level(level, logger)
    _solve(level, logger)
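
Presumably this entry point is called with the command-line arguments, along the lines of the sketch below (the __main__ guard and the sys import are assumptions, not part of the original snippet):

import sys

if __name__ == '__main__':
    main(sys.argv[1:])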
Example #2
def train(solved_score, population_size, elite_size, num_proc, log_video_rate):
    setup_logger()
    manager = mp.Manager()
    work_queue = manager.Queue()
    results_queue = manager.Queue()

    # Random Search 1st generation
    start_time = time.time()
    env = create_environment()
    population = create_population(env, population_size)
    print(population[0])
    elite, top_scores = get_top_performers_from_random_population(
        env, population, elite_size)
    elapsed_time = time.time() - start_time
    log_generation_stats(1, top_scores, elapsed_time)

    # 2nd -> inf generation: Mutate Top Performers (classic GA)
    ma_reward = 0
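    # ma_reward tracks an exponentially-weighted moving average of the elite's mean score; training stops once it reaches solved_score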
    spawn_processes(num_proc,
                    work_fn=mutate_and_evaluate_task,
                    args=(elite, work_queue, results_queue))
    for generation in count(start=2, step=1):
        start_time = time.time()
        spawn_mutation_work(work_queue, elite_size, population_size)

        evaluated_population = collect_results(results_queue,
                                               size=population_size)
        top_scores = get_top_performers(evaluated_population, elite,
                                        elite_size)
        elapsed_time = time.time() - start_time
        if generation % log_video_rate == 0:
            record_evaluation_video(elite[0], env)
        log_generation_stats(generation, top_scores, elapsed_time)

        ma_reward = 0.7 * ma_reward + 0.3 * top_scores.mean()
        if ma_reward >= solved_score:
            print(f"Solved in {generation} generations")
            kill_processes(work_queue, num_proc)
            break
Example #3
def __init__(self, url=None):
    self.soup = None
    self.url = url

    self.logger = setup_logger()
    self.logger.info('Solving %s located at %s' % (self.__class__.__name__.lower(), url))

    cache_path = '/tmp/%s' % self.__class__.__name__
    self.logger.debug('Setting up HTTP cache: %s.sqlite' % cache_path)
    requests_cache.install_cache(cache_path, extension='.sqlite')
    self.logger.debug('Requesting HTTP and dumping to BeautifulSoup')
    self.requests_result = requests.get(self.url)
    self.soup = BeautifulSoup(self.requests_result.text, 'html.parser')
Example #4
def main():
    logger = setup_logger("eval_stats",
                          __file__,
                          FLAGS.dir,
                          filename="summary.log")
    lbs = []
    for seed in range(1234, 1234 + 10):
        filename = os.path.join(FLAGS.dir, "eval.log.{}".format(seed))
        with open(filename, "rb") as f:
            text = f.read().decode("utf-8")
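            # each eval log is expected to end with a line like "... = <lower bound>"; the value after the last '=' is parsed below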
            lb = float(text.strip().split("\n")[-1].split("=")[-1].strip())
            logger.info(str(lb))
            lbs.append(lb)

    logger.info("{}+-{}".format(np.mean(lbs), np.std(lbs)))
Example #5
def main(model_path, backbone, scale, path, save_path, gpu_id):
    device = torch.device("cuda:" + str(gpu_id))
    logger = setup_logger(os.path.join(config.output_dir, 'test_log'))
    logger.info(config.print())
    if os.path.exists(save_path):
        shutil.rmtree(save_path, ignore_errors=True)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_img_folder = os.path.join(save_path, 'img')
    if not os.path.exists(save_img_folder):
        os.makedirs(save_img_folder)
    save_txt_folder = os.path.join(save_path, 'result')
    if not os.path.exists(save_txt_folder):
        os.makedirs(save_txt_folder)
    img_paths = [os.path.join(path, x) for x in os.listdir(path)]
    net = PSENet(backbone=backbone,
                 pretrained=config.pretrained,
                 result_num=config.n)
    model = Pytorch_model(model_path, net=net, scale=scale, gpu_id=gpu_id)
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    recall, precision, f1 = merge_eval(model=model,
                                       save_path=os.path.join(
                                           config.output_dir, 'output'),
                                       test_path=config.testroot,
                                       device=device,
                                       base_path=config.base_path,
                                       use_sub=config.use_sub)
    logger.info('test: recall: {:.6f}, precision: {:.6f}, f1: {:.6f}'.format(
        recall, precision, f1))

    # total_frame = 0.0
    # total_time = 0.0
    # for img_path in tqdm(img_paths):
    #     img_name = os.path.basename(img_path).split('.')[0]
    #     save_name = os.path.join(save_txt_folder, 'res_' + img_name + '.txt')
    #     _, boxes_list, t = model.predict(img_path)
    #     total_frame += 1
    #     total_time += t
    #     # img = draw_bbox(img_path, boxes_list, color=(0, 0, 255))
    #     # cv2.imwrite(os.path.join(save_img_folder, '{}.jpg'.format(img_name)), img)
    #     np.savetxt(save_name, boxes_list.reshape(-1, 8), delimiter=',', fmt='%d')
    # print('fps:{}'.format(total_frame / total_time))
    return save_txt_folder
Example #6
def main():

    config_path = sys.argv[1]
    opt = util.load_yaml(config_path)

    if opt['path']['resume_state']:  # resuming training
        resume_state = torch.load(opt['path']['resume_state'])

    else:
        resume_state = None
        util.mkdir(opt['path']['log'])

    util.setup_logger(None,
                      opt['path']['log'],
                      'train',
                      level=logging.INFO,
                      screen=True)
    util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO)
    logger = logging.getLogger('base')

    set_random_seed(0)

    # tensorboard log
    writer = SummaryWriter(log_dir=opt['path']['tb_logger'])

    torch.backends.cudnn.benchmark = True

    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = data.create_dataset(dataset_opt, phase)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                len(train_set), train_size))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                total_epochs, total_iters))
            train_loader = data.create_dataloader(train_set, dataset_opt,
                                                  phase)
        elif phase == 'valid':
            val_set = data.create_dataset(dataset_opt, phase)
            val_loader = data.create_dataloader(val_set, dataset_opt, phase)
            logger.info('Number of validation images in [{:s}]: {:d}'.format(
                dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    # create model

    model = Model(opt)

    # resume training
    if resume_state:
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.load_model(current_step)
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    # training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))

    for epoch in range(start_epoch, total_epochs):
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            # update learning rate
            model.update_learning_rate()

            # training
            model.train(train_data, current_step)

            # log
            if current_step % opt['train']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                    epoch, current_step, model.get_current_learning_rate())
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    writer.add_scalar(k, v, current_step)
                logger.info(message)

            if current_step % opt['train']['val_freq'] == 0:
                psnr, ssim = model.validate(val_loader, current_step)

                # log
                logger.info('# Validation # PSNR: {:.4e} SSIM: {:.4e}'.format(
                    psnr, ssim))
                logger_val = logging.getLogger('val')  # validation logger
                logger_val.info(
                    '<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e} ssim: {:.4e}'.
                    format(epoch, current_step, psnr, ssim))
                # tensorboard logger
                writer.add_scalar('VAL_PSNR', psnr, current_step)
                writer.add_scalar('VAL_SSIM', ssim, current_step)

            # save models and training states
            if current_step % opt['train']['save_step'] == 0:
                logger.info('Saving models and training states.')
                model.save_model(epoch, current_step)
Example #7
import datetime as DT
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import utils.utils as utils

# TODO: the calc_daily mean (and related) functions are almost identical; generalize them.
# TODO: outliers need to be removed before calculating the mean.

pd.set_option('display.max_rows', 200000)  # so pandas prints more rows

# --- logging - always cleans the log when importing and executing this file
import logging

utils.setup_logger('logger_clean', r'logs/clean.log')
logger = logging.getLogger('logger_clean')

# --- global variables
# Global variables are used because of the .apply() call inside calc_stats; otherwise these two would be normal parameters. They are assigned inside main().
global start_year
global end_year


# --- START Functions
def clean_datetime_site_daily(df):
    """
        Cleans the datetime and site fields of a given pandas dataframe.
        site becomes 'site', containing only the site code.
        datetime becomes 'date', containing only year-month-day.
        value is repeated.
Example #8
from __future__ import print_function
from pdnn.run_DNN import run_DNN
from pdnn.run_RBM import run_RBM
from pdnn.run_SDA import run_SDA
from pdnn.eval_DNN import eval_DNN
import json
from utils.utils import setup_logger

MNIST_CONF = json.load(open("configs/unittest_mnist.json"))
MAX_ITERS = 2
setup_logger(None)


def banner(s):
    print("***********************" + s + "*************************")


def test_hi():
    print("hi")


def test_rbm_dnn():
    banner("rbm dnn")
    mnist_conf = MNIST_CONF.copy()

    mnist_conf["train_rbm"]["max_iters"] = MAX_ITERS
    run_RBM(mnist_conf)

    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
    mnist_conf["init_dnn"] = {
        "filename": "temp/rbm/final.nnet",
Example #9
def main():
    tf.set_random_seed(1234)
    np.random.seed(1234)

    # Load celebA
    data_path = os.path.join('data', 'celebA', 'img_align_celeba.zip')
    celeba = dataset.CelebADataset(data_path)

    x = tf.placeholder(tf.float32, shape=[None] + celeba.data_dims, name='x')
    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    n = tf.shape(x)[0]

    def log_joint(observed):
        model, _ = vae(observed, n, n_z, n_particles)
        log_pz, log_px_z = model.local_log_prob(['z', 'x'])
        return log_pz + log_px_z

    variational = q_net(x, n_z, n_particles)
    qz_samples, log_qz = variational.query('z',
                                           outputs=True,
                                           local_log_prob=True)
    lower_bound = zs.variational.elbo(log_joint,
                                      observed={'x': x},
                                      latent={'z': [qz_samples, log_qz]},
                                      axis=0)
    cost = tf.reduce_mean(lower_bound.sgvb())
    lower_bound = tf.reduce_mean(lower_bound)

    model, _ = vae({'z': qz_samples}, n, n_z, n_particles)
    log_pz = model.local_log_prob('z')
    kl_term = tf.reduce_mean(log_qz - log_pz)
    # cost = kl_term

    optimizer = tf.train.AdamOptimizer(3e-4)
    infer_op = optimizer.minimize(cost)

    # Generate images
    n_gen = 100
    _, x_mean = vae({}, n_gen, n_z, None)
    x_gen = tf.reshape(x_mean, [-1] + celeba.data_dims)

    # Interpolation
    # [n, n_z]
    x_start = x[:8]
    x_end = x[8:16]
    z_start = qz_samples[0, :8, :]
    z_end = qz_samples[0, 8:16, :]
    # [1, 8, 1]
    alpha = tf.reshape(tf.linspace(0., 1., 8), [1, 8, 1])
    # [n, 1, n_z]
    z_start = tf.expand_dims(z_start, 1)
    z_end = tf.expand_dims(z_end, 1)
    # [n, 8, n_z]
    z_interp = alpha * z_start + (1. - alpha) * z_end
    z_interp = tf.reshape(z_interp, [-1, n_z])
    _, x_interp = vae({'z': z_interp}, 64, n_z, None)
    x_interp = tf.reshape(x_interp, [-1] + celeba.data_dims)

    # Define training parameters
    epochs = 25
    batch_size = 64
    iters = celeba.train_size // batch_size
    save_image_freq = 1
    print_freq = 100
    save_model_freq = 5
    test_freq = 1
    test_batch_size = 500
    test_iters = celeba.test_size // test_batch_size
    result_path = "results/vae_celeba_" + time.strftime("%Y%m%d_%H%M%S")

    saver = tf.train.Saver(max_to_keep=10)
    logger = setup_logger('vae_celeba', __file__, result_path)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore from the latest checkpoint
        ckpt_file = tf.train.latest_checkpoint(result_path)
        begin_epoch = 1
        if ckpt_file is not None:
            logger.info('Restoring model from {}...'.format(ckpt_file))
            begin_epoch = int(ckpt_file.split('.')[-2]) + 1
            saver.restore(sess, ckpt_file)

        for epoch in range(begin_epoch, epochs + 1):
            lbs = []
            kls = []
            time_iters = []
            for t in range(1, iters + 1):
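                # store the negated start time; adding time.time() after the update gives this iteration's wall-clock seconds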
                time_iter = -time.time()
                x_batch = celeba.next_batch(batch_size)
                _, lb, kl = sess.run([infer_op, lower_bound, kl_term],
                                     feed_dict={
                                         x: x_batch,
                                         n_particles: 1
                                     })
                # logger.info('Iter {}: lb = {}'.format(t, lb))
                lbs.append(lb)
                kls.append(kl)
                time_iter += time.time()
                time_iters.append(time_iter)

                if t % print_freq == 0:
                    logger.info(
                        'Epoch={} Iter={} ({}s): lb = {}, kl = {}'.format(
                            epoch, t, np.mean(time_iters),
                            np.mean(lbs[-print_freq:]),
                            np.mean(kls[-print_freq:])))
                    time_iters = []

            logger.info('>> Epoch {}: Lower bound = {}, kl = {}'.format(
                epoch, np.mean(lbs), np.mean(kls)))

            interp_images = []
            start_images = []
            end_images = []
            if epoch % test_freq == 0:
                time_test = -time.time()
                test_lbs = []
                for t in range(test_iters):
                    test_x_batch = celeba.next_test_batch(test_batch_size)
                    test_lb, interp_image, start_image, end_image = sess.run(
                        [lower_bound, x_interp, x_start, x_end],
                        feed_dict={
                            x: test_x_batch,
                            n_particles: 1
                        })
                    test_lbs.append(test_lb)
                    interp_images.append(interp_image)
                    start_images.append(start_image)
                    end_images.append(end_image)
                time_test += time.time()
                logger.info('>>> TEST ({:.1f}s)'.format(time_test))
                logger.info('>> Test lower bound = {}'.format(
                    np.mean(test_lbs)))

                logger.info('Saving interpolations...')
                interp_name = os.path.join(result_path,
                                           "interp.epoch.{}.png".format(epoch))
                save_image_collections(interp_images[0],
                                       interp_name,
                                       scale_each=True,
                                       shape=(8, 8))
                if epoch == 1:
                    save_image_collections(start_images[0],
                                           interp_name + ".start.png",
                                           scale_each=True,
                                           shape=(8, 1))
                    save_image_collections(end_images[0],
                                           interp_name + ".end.png",
                                           scale_each=True,
                                           shape=(8, 1))

            if epoch % save_image_freq == 0:
                logger.info('Saving images...')
                images = sess.run(x_gen)
                name = os.path.join(result_path,
                                    "vae.epoch.{}.png".format(epoch))
                save_image_collections(images, name, scale_each=True)

            if epoch % save_model_freq == 0:
                logger.info('Saving model...')
                save_path = os.path.join(result_path,
                                         "vae.epoch.{}.ckpt".format(epoch))
                if not os.path.exists(os.path.dirname(save_path)):
                    os.makedirs(os.path.dirname(save_path))
                saver.save(sess, save_path)
                logger.info('Done')
Example #10
        image_name = img_path.split('/')[-1].split('.')[0]
        write_result_as_txt(image_name, boxes_list, save_path)

    #  recall precision f1
    gt_path = os.path.join(test_path, 'gt/Test')
    fid_path = os.path.join(workspace, 'res_tt.txt')
    shutil.rmtree(fid_path, ignore_errors=True)
    precision, recall, hmean = evl_totaltext(save_path, gt_path, fid_path)
    # f_score_new = getresult(save_path,config.gt_name)
    return precision, recall, hmean


if __name__ == "__main__":
    config.workspace = os.path.join(config.workspace_dir, config.exp_name)
    logger = setup_logger(os.path.join(config.workspace, 'test_log'))
    logger.info(config.print())

    # best_save_path = '{}/Best_model_0.632154.pth'.format(config.workspace)
    best_save_path = "/data/glusterfs_cv_04/11121171/CVPR_Text/PSENet_file/Total_Text/Best_model_0.787389.pth"
    # writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.kernel_num,
                   scale=config.scale)
    num_gpus = torch.cuda.device_count()
    device = torch.device("cuda:0")
    # if num_gpus > 1:
    model = nn.DataParallel(model)
    model = model.to(device)
Example #11
    ais = AIS(ais_log_prior,
              log_joint, {'z': pz_samples},
              hmc,
              observed={'x': x_obs},
              latent={'z': z},
              n_chains=test_n_chains,
              n_temperatures=test_n_temperatures)

    model_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope="model")
    variational_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="variational")
    saver = tf.train.Saver(max_to_keep=10,
                           var_list=model_var_list + variational_var_list)
    logger = setup_logger("vae_eval",
                          __file__,
                          result_path,
                          filename="eval.log.{}".format(seed))

    # Run the evaluation
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore from the latest checkpoint
        ckpt_file = tf.train.latest_checkpoint(result_path)
        if ckpt_file is not None:
            logger.info('Restoring model from {}...'.format(ckpt_file))
            saver.restore(sess, ckpt_file)

            # AIS evaluation
            logger.info('Start evaluation...')
            time_ais = -time.time()
Example #12
from pdnn.run_DNN import run_DNN
from pdnn.run_RBM import run_RBM
from pdnn.run_SDA import run_SDA
from pdnn.eval_DNN import eval_DNN
import json
from utils.utils import setup_logger

MNIST_CONF = json.load(open("configs/unittest_mnist.json"))
MAX_ITERS = 2
setup_logger(None)

def banner(s):
    print "***********************" + s + "*************************"

def test_hi():
    print "hi"

def test_rbm_dnn():
    banner("rbm dnn")
    mnist_conf = MNIST_CONF.copy()

    mnist_conf["train_rbm"]["max_iters"] = MAX_ITERS
    run_RBM(mnist_conf)

    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
    mnist_conf["init_dnn"] = {
        "filename": "temp/rbm/final.nnet",
        "num_hidden_layers": -1,
        "with_final": 1
    }
    run_DNN(mnist_conf)
Example #13
def main_worker(gpus, ngpus_per_node, args, final_output_dir, tb_log_dir):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    #os.environ['CUDA_VISIBLE_DEVICES']=gpus

    # Parallel setting
    print("Use GPU: {} for training".format(gpus))

    update_config(cfg, args)

    #test(cfg, args)

    # logger setting
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # model initilization
    model = eval(cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=True)

    # load pretrained model before DDP initialization
    checkpoint_file = os.path.join(final_output_dir, 'model_best.pth.tar')

    if cfg.AUTO_RESUME:
        if os.path.exists(checkpoint_file):
            checkpoint = torch.load(checkpoint_file, map_location='cpu')
            state_dict = checkpoint['state_dict']

            for key in list(state_dict.keys()):
                new_key = key.replace("module.", "")
                state_dict[new_key] = state_dict.pop(key)
            model.load_state_dict(state_dict)
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))

    elif cfg.MODEL.HRNET_PRETRAINED:
        logger.info("=> loading a pretrained model '{}'".format(
            cfg.MODEL.PRETRAINED))
        checkpoint = torch.load(cfg.MODEL.HRNET_PRETRAINED, map_location='cpu')

        state_dict = checkpoint['state_dict']
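        # strip the 'module.' prefix that (Distributed)DataParallel adds so the keys match the bare model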
        for key in list(state_dict.keys()):
            new_key = key.replace("module.", "")
            state_dict[new_key] = state_dict.pop(key)

        model.load_state_dict(state_dict)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # copy configuration file
    config_dir = args.cfg
    shutil.copy2(os.path.join(args.cfg), final_output_dir)

    # calculate GFLOPS
    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0]))

    logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    #ops, params = get_model_complexity_info(
    #    model, (3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0]),
    #    as_strings=True, print_per_layer_stat=True, verbose=True)
    # FP16 SETTING
    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print(
                "Warning:  if --fp16 is not used, static_loss_scale will be ignored."
            )

    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if cfg.MODEL.SYNC_BN and not cfg.DISTRIBUTED:
        print(
            'Warning: Sync BatchNorm is only supported in distributed training.'
        )

    # Distributed Computing
    master = True
    if cfg.DISTRIBUTED:  # This block is not available
        args.local_rank += int(gpus[0])
        print('This process is using GPU', args.local_rank)
        device = args.local_rank
        master = device == int(gpus[0])
        dist.init_process_group(backend='nccl')
        if cfg.MODEL.SYNC_BN:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if gpus is not None:
            torch.cuda.set_device(device)
            model.cuda(device)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # workers = int(workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[device],
                output_device=device,
                find_unused_parameters=True)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:  # implement this block
        gpu_ids = eval('[' + gpus + ']')
        device = gpu_ids[0]
        print('This process is using GPU', str(device))
        model = torch.nn.DataParallel(model, gpu_ids).cuda(device)

    # Prepare loss functions
    criterion = {}
    if cfg.LOSS.WITH_HEATMAP_LOSS:
        criterion['heatmap_loss'] = HeatmapLoss().cuda()
    if cfg.LOSS.WITH_POSE2D_LOSS:
        criterion['pose2d_loss'] = JointsMSELoss().cuda()
    if cfg.LOSS.WITH_BONE_LOSS:
        criterion['bone_loss'] = BoneLengthLoss().cuda()
    if cfg.LOSS.WITH_JOINTANGLE_LOSS:
        criterion['jointangle_loss'] = JointAngleLoss().cuda()

    best_perf = 1e9
    best_model = False
    last_epoch = -1

    # optimizer must be initilized after model initilization
    optimizer = get_optimizer(cfg, model)

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE,
            verbose=False)

    begin_epoch = cfg.TRAIN.BEGIN_EPOCH

    if not cfg.AUTO_RESUME and cfg.MODEL.HRNET_PRETRAINED:
        optimizer.load_state_dict(checkpoint['optimizer'])

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['loss']
        optimizer.load_state_dict(checkpoint['optimizer'])

        if 'train_global_steps' in checkpoint.keys() and \
        'valid_global_steps' in checkpoint.keys():
            writer_dict['train_global_steps'] = checkpoint[
                'train_global_steps']
            writer_dict['valid_global_steps'] = checkpoint[
                'valid_global_steps']

    if cfg.FP16.ENABLED:
        logger.info("=> Using FP16 mode")
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer,
            cfg.TRAIN.LR_STEP,
            cfg.TRAIN.LR_FACTOR,
            last_epoch=begin_epoch)
    elif cfg.TRAIN.LR_SCHEDULE == 'warmup':
        from utils.utils import get_linear_schedule_with_warmup
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=cfg.TRAIN.WARMUP_EPOCHS,
            num_training_steps=cfg.TRAIN.END_EPOCH - cfg.TRAIN.BEGIN_EPOCH,
            last_epoch=begin_epoch)
    elif cfg.TRAIN.LR_SCHEDULE == 'multi_step':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            cfg.TRAIN.LR_STEP,
            cfg.TRAIN.LR_FACTOR,
            last_epoch=begin_epoch)
    else:
        print('Unknown learning rate schedule!')
        exit()

    # Data loading code
    train_loader_dict = make_dataloader(cfg,
                                        is_train=True,
                                        distributed=cfg.DISTRIBUTED)
    valid_loader_dict = make_dataloader(cfg,
                                        is_train=False,
                                        distributed=cfg.DISTRIBUTED)

    for i, (dataset_name,
            train_loader) in enumerate(train_loader_dict.items()):
        logger.info(
            'Training Loader {}/{}:\n'.format(i + 1, len(train_loader_dict)) +
            str(train_loader.dataset))
    for i, (dataset_name,
            valid_loader) in enumerate(valid_loader_dict.items()):
        logger.info('Validation Loader {}/{}:\n'.format(
            i + 1, len(valid_loader_dict)) + str(valid_loader.dataset))

    #writer_dict['writer'].add_graph(model, (dump_input, ))
    """
    Start training
    """
    start_time = time.time()

    with torch.autograd.set_detect_anomaly(True):
        for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1):
            epoch_start_time = time.time()
            # shuffle datasets with the sample random seed
            if cfg.DISTRIBUTED:
                for data_loader in train_loader_dict.values():
                    data_loader.sampler.set_epoch(epoch)
            # train for one epoch
            # get_last_lr() returns a list
            logger.info('Start training [{}/{}] lr: {:.4e}'.format(
                epoch, cfg.TRAIN.END_EPOCH - cfg.TRAIN.BEGIN_EPOCH,
                lr_scheduler.get_last_lr()[0]))
            train(cfg,
                  args,
                  master,
                  train_loader_dict,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  final_output_dir,
                  tb_log_dir,
                  writer_dict,
                  logger,
                  fp16=cfg.FP16.ENABLED,
                  device=device)

            # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`.
            lr_scheduler.step()

            # evaluate on validation set

            if not cfg.WITHOUT_EVAL:
                logger.info('Start evaluating [{}/{}]'.format(
                    epoch, cfg.TRAIN.END_EPOCH - 1))
                with torch.no_grad():
                    recorder = validate(cfg,
                                        args,
                                        master,
                                        valid_loader_dict,
                                        model,
                                        criterion,
                                        final_output_dir,
                                        tb_log_dir,
                                        writer_dict,
                                        logger,
                                        device=device)

                val_total_loss = recorder.avg_total_loss

                best_model = False
                if val_total_loss < best_perf:
                    logger.info(
                        'This epoch yielded a better model with total loss {:.4f} < {:.4f}.'
                        .format(val_total_loss, best_perf))
                    best_perf = val_total_loss
                    best_model = True

            else:
                val_total_loss = 0
                best_model = True

            if master:
                logger.info(
                    '=> saving checkpoint to {}'.format(final_output_dir))
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'model': cfg.EXP_NAME + '.' + cfg.MODEL.NAME,
                        'state_dict': model.state_dict(),
                        'loss': val_total_loss,
                        'optimizer': optimizer.state_dict(),
                        'train_global_steps':
                        writer_dict['train_global_steps'],
                        'valid_global_steps': writer_dict['valid_global_steps']
                    }, best_model, final_output_dir)

            print('\nEpoch {} spent {:.2f} hours\n'.format(
                epoch, (time.time() - epoch_start_time) / 3600))

            #if epoch == 3:break
    if master:
        final_model_state_file = os.path.join(
            final_output_dir, 'final_state{}.pth.tar'.format(gpus))
        logger.info(
            '=> saving final model state to {}'.format(final_model_state_file))
        torch.save(model.state_dict(), final_model_state_file)
        writer_dict['writer'].close()

        print(
            '\n[Training Accomplished] {} epochs spent {:.2f} hours\n'.format(
                cfg.TRAIN.END_EPOCH - begin_epoch + 1,
                (time.time() - start_time) / 3600))
Example #14
    def __init__(self,
                 config,
                 model,
                 criterion,
                 metric_cls,
                 train_loader,
                 validate_loader,
                 post_process=None):
        config['trainer']['output_dir'] = os.path.join(
            str(pathlib.Path(os.path.abspath(__name__)).parent),
            config['trainer']['output_dir'])
        config['name'] = config['name'] + '_' + model.name
        self.save_dir = os.path.join(config['trainer']['output_dir'],
                                     config['name'])
        self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')

        if config['trainer']['resume_checkpoint'] == '' and config['trainer'][
                'finetune_checkpoint'] == '':
            shutil.rmtree(self.save_dir, ignore_errors=True)
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.global_step = 0
        self.start_epoch = 0
        self.config = config
        self.model = model
        self.criterion = criterion
        self.metric_cls = metric_cls
        # logger and tensorboard
        self.epochs = self.config['trainer']['epochs']
        self.log_iter = self.config['trainer']['log_iter']
        self.tensorboard_enable = self.config['trainer']['tensorboard']
        if config['local_rank'] == 0:
            anyconfig.dump(config, os.path.join(self.save_dir, 'config.yaml'))
            self.logger = setup_logger(os.path.join(self.save_dir,
                                                    'train.log'))
            self.logger_info(pformat(self.config))

        # device
        torch.manual_seed(self.config['trainer']['seed'])  # set the random seed for the CPU
        if torch.cuda.device_count() > 0 and torch.cuda.is_available():
            self.with_cuda = True
            torch.backends.cudnn.benchmark = True
            self.device = torch.device("cuda")
            torch.cuda.manual_seed(
                self.config['trainer']['seed'])  # set the random seed for the current GPU
            torch.cuda.manual_seed_all(
                self.config['trainer']['seed'])  # set the random seed for all GPUs
        else:
            self.with_cuda = False
            self.device = torch.device("cpu")
        self.logger_info('train with device {} and pytorch {}'.format(
            self.device, torch.__version__))

        self.optimizer = self._initialize('optimizer', torch.optim,
                                          model.parameters())

        # resume or finetune
        if self.config['trainer']['resume_checkpoint'] != '':
            self._load_checkpoint(self.config['trainer']['resume_checkpoint'],
                                  resume=True)
        elif self.config['trainer']['finetune_checkpoint'] != '':
            self._load_checkpoint(
                self.config['trainer']['finetune_checkpoint'], resume=False)

        if self.config['lr_scheduler']['type'] != 'WarmupPolyLR':
            self.scheduler = self._initialize('lr_scheduler',
                                              torch.optim.lr_scheduler,
                                              self.optimizer)
        self.metrics = {
            'recall': 0,
            'precision': 0,
            'hmean': 0,
            'train_loss': float('inf'),
            'best_model_epoch': 0
        }
        self.model.to(self.device)

        # distributed training
        if torch.cuda.device_count() > 1:
            local_rank = config['local_rank']
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[local_rank],
                output_device=local_rank,
                broadcast_buffers=False,
                find_unused_parameters=True)

        self.show_images_iter = self.config['trainer']['show_images_iter']
        self.train_loader = train_loader
        if validate_loader is not None:
            assert post_process is not None
        self.validate_loader = validate_loader
        self.post_process = post_process
        self.train_loader_len = len(train_loader)
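        # WarmupPolyLR works in iterations, so warmup_epoch and the resume point are converted from epochs to iteration counts below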
        if self.config['lr_scheduler']['type'] == 'WarmupPolyLR':
            warmup_iters = config['lr_scheduler']['args'][
                'warmup_epoch'] * self.train_loader_len
            if self.start_epoch > 1:
                self.config['lr_scheduler']['args']['last_epoch'] = (
                    self.start_epoch - 1) * self.train_loader_len
            self.scheduler = WarmupPolyLR(self.optimizer,
                                          max_iters=self.epochs *
                                          self.train_loader_len,
                                          warmup_iters=warmup_iters,
                                          **config['lr_scheduler']['args'])
        if self.validate_loader is not None:
            self.logger_info(
                'train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader'
                .format(len(self.train_loader.dataset), self.train_loader_len,
                        len(self.validate_loader.dataset),
                        len(self.validate_loader)))
        else:
            self.logger_info(
                'train dataset has {} samples,{} in dataloader'.format(
                    len(self.train_loader.dataset), self.train_loader_len))

        if self.tensorboard_enable and config['local_rank'] == 0:
            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(self.save_dir)
            try:
                dummy_input = torch.zeros(1, 3, 640, 640).to(self.device)
                self.writer.add_graph(self.model, dummy_input)
                torch.cuda.empty_cache()
            except Exception:
                import traceback
                self.logger.error(traceback.format_exc())
                self.logger.warning('add graph to tensorboard failed')
Example #15
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier, plot_importance

from sklearn.preprocessing import MinMaxScaler

# np.set_printoptions(threshold=np.nan)
pd.set_option('display.max_rows', 200000)


# --- logging - always cleans the log when importing and executing this file
import logging
utils.setup_logger('logger_feat_extract', r'logs/feat_extract.log')
logger = logging.getLogger('logger_feat_extract')

# --- measuring time
import time

# --- global variables
global start_year
global end_year


def timeit(method):

    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
Example #16
File: refine.py  Project: ukanf/TCC
"""
    This file contains methods to interpolate and to handle some outliers.
"""
# --- general imports
import datetime as DT
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import utils.utils as utils

pd.set_option('display.max_rows', 200000)

# --- logging - always cleans the log when importing and executing this file
import logging
utils.setup_logger('logger_refine', r'logs/refine.log')
logger = logging.getLogger('logger_refine')

# --- global variables
global start_year
global end_year

# --- START Functions


def handle_outliers(df):
    """
        There is no silver bullet for this issue. TODO: outliers should perhaps be handled before calculating the averages.
        Removes negative values from readings that should never be negative.
    """
    logger.warning('Removing outliers (replacing negative values with 0)')
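
The function body is cut off by the listing. Below is a minimal, hypothetical sketch of the step the docstring and the warning describe (clipping negative readings to 0); the 'value' column name is an assumption, not taken from the project:

def clip_negative_readings(df, column='value'):
    # hedged sketch, not the original implementation: zero out readings that should never be negative
    df.loc[df[column] < 0, column] = 0
    return df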
Example #17
def main_worker(gpu, ngpus_per_node, args, final_output_dir, tb_log_dir):

    args.gpu = gpu
    args.rank = args.rank * ngpus_per_node + gpu
    print('Init process group: dist_url: {}, world_size: {}, rank: {}'.format(cfg.DIST_URL, args.world_size, args.rank))
    dist.init_process_group(backend=cfg.DIST_BACKEND, init_method=cfg.DIST_URL, world_size=args.world_size, rank=args.rank)

    update_config(cfg, args)

    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(cfg, is_train=True)
    logger.info(get_model_summary(model, torch.zeros(1, 3, *cfg.MODEL.IMAGE_SIZE)))

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        dump_input = torch.rand((1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
        writer_dict['writer'].add_graph(model, (dump_input, ))
        # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    if cfg.MODEL.SYNC_BN:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    
    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda(args.gpu)

    # Data loading code
    train_dataset = eval('dataset.'+cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    )
    valid_dataset = eval('dataset.'+cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    )
    
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
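    # DistributedSampler already shards and shuffles the data each epoch, so DataLoader-level shuffling is enabled only when no sampler is given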
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
        shuffle=(train_sampler is None),
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY,
        sampler=train_sampler
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY
    )
    logger.info(train_loader.dataset)

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`.
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(
            args, cfg, valid_loader, valid_dataset, model, criterion,
            final_output_dir, tb_log_dir, writer_dict
        )

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        if not cfg.MULTIPROCESSING_DISTRIBUTED or (
                cfg.MULTIPROCESSING_DISTRIBUTED
                and args.rank == 0
        ):
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state{}.pth.tar'.format(gpu)
    )

    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #18
def main():

    config.workspace = os.path.join(config.workspace_dir, config.exp_name)
    if config.restart_training:
        shutil.rmtree(config.workspace, ignore_errors=True)
    if not os.path.exists(config.workspace):
        os.makedirs(config.workspace)

    shutil.rmtree(os.path.join(config.workspace, 'train_log'),
                  ignore_errors=True)
    logger = setup_logger(os.path.join(config.workspace, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(
            config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = ICDAR17(config.trainroot,
                         data_shape=config.data_shape,
                         n=config.kernel_num,
                         m=config.min_scale)
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=config.train_batch_size,
                                   shuffle=True,
                                   num_workers=int(config.workers))

    # writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.kernel_num,
                   scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    criterion = PSELoss(Lambda=config.Lambda,
                        ratio=config.OHEM_ratio,
                        reduction='mean')
    # optimizer = torch.optim.SGD(models.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and not config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device,
                                      optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            config.lr_decay_step,
            gamma=config.lr_gamma,
            last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         config.lr_decay_step,
                                                         gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(
        train_data.__len__(), all_step))
    epoch = 0
    f1 = 0
    try:
        for epoch in range(start_epoch, config.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler,
                                         train_loader, device, criterion,
                                         epoch, all_step, logger)
            logger.info(
                '[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                    epoch, config.epochs, train_loss,
                    time.time() - start, lr))

            save_path = '{}/epoch_model.pth'.format(config.workspace)
            save_checkpoint(save_path, model, optimizer, epoch, logger)

            if epoch >= 50 and epoch % 10 == 0:
                f_score_new = eval(model,
                                   os.path.join(config.workspace, 'output'),
                                   config.testroot, device)
                logger.info('  ---------------------------------------')
                logger.info('     test: f_score : {:.6f}'.format(f_score_new))
                logger.info('  ---------------------------------------')

                if f_score_new > f1:
                    f1 = f_score_new
                    best_save_path = '{}/Best_model_{:.6f}.pth'.format(
                        config.workspace, f1)

                    save_checkpoint(best_save_path, model, optimizer, epoch,
                                    logger)

                # writer.add_scalar(tag='Test/recall', scalar_value=recall, global_step=epoch)
                # writer.add_scalar(tag='Test/precision', scalar_value=precision, global_step=epoch)
                # writer.add_scalar(tag='Test/f1', scalar_value=f1, global_step=epoch)
        # writer.close()
    except KeyboardInterrupt:
        save_checkpoint('{}/final.pth'.format(config.workspace), model,
                        optimizer, epoch, logger)
Example #19
def main():
    tf.set_random_seed(1234)
    np.random.seed(1234)

    # Load celebA
    data_path = os.path.join('data', 'celebA', 'img_align_celeba.zip')
    celeba = dataset.CelebADataset(data_path)

    x = tf.placeholder(tf.float32, shape=[None] + celeba.data_dims, name='x')
    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    n = tf.shape(x)[0]

    qz_samples = q_net(x, n_z, n_particles)
    # Use a single particle for the reconstruction term
    observed = {'x': x, 'z': qz_samples[:1]}
    model, z, _ = vae(observed, n, n_z, 1)
    # [1, n]
    log_px_qz = model.local_log_prob('x')
    eq_ll = tf.reduce_mean(log_px_qz)
    # [n_particles, n]
    log_p_qz = z.log_prob(qz_samples)
    eq_joint = eq_ll + tf.reduce_mean(log_p_qz)

    if FLAGS.estimator == "stein":
        estimator = SteinScoreEstimator(eta=eta)
    elif FLAGS.estimator == "spectral":
        estimator = SpectralScoreEstimator(n_eigen=None, eta=None,
                                           n_eigen_threshold=0.99)
    else:
        raise ValueError("The chosen estimator is not recognized.")

    optimizer = tf.train.AdamOptimizer(3e-4)

    entropy_grads_and_vars = entropy_gradients(
        optimizer, estimator, tf.transpose(qz_samples, [1, 0, 2]))
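    # gradients of the entropy term come from the chosen score estimator; combine_grads below merges them with the gradients of -eq_joint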
    entropy_dict = dict([(v, g) for g, v in entropy_grads_and_vars
                         if g is not None])

    joint_grads_and_vars = optimizer.compute_gradients(-eq_joint)
    joint_dict = dict([(v, g) for g, v in joint_grads_and_vars
                       if g is not None])

    def combine_grads(v):
        ret = 0.
        if v in entropy_dict:
            ret += -entropy_dict[v]
        if v in joint_dict:
            ret += joint_dict[v]
        return ret

    grads_and_vars = [(combine_grads(v), v) for v in tf.trainable_variables()]
    infer_op = optimizer.apply_gradients(grads_and_vars)

    # Generate images
    n_gen = 100
    _, _, x_mean = vae({}, n_gen, n_z, None)
    x_gen = tf.reshape(x_mean, [-1] + celeba.data_dims)

    # Interpolation
    # [n, n_z]
    x_start = x[:8]
    x_end = x[8:16]
    z_start = qz_samples[0, :8, :]
    z_end = qz_samples[0, 8:16, :]
    # [1, 8, 1]
    alpha = tf.reshape(tf.linspace(0., 1., 8), [1, 8, 1])
    # [n, 1, n_z]
    z_start = tf.expand_dims(z_start, 1)
    z_end = tf.expand_dims(z_end, 1)
    # [n, 8, n_z]
    z_interp = alpha * z_start + (1. - alpha) * z_end
    z_interp = tf.reshape(z_interp, [-1, n_z])
    _, _, x_interp = vae({'z': z_interp}, 64, n_z, None)
    x_interp = tf.reshape(x_interp, [-1] + celeba.data_dims)

    # Define training parameters
    epochs = 25
    batch_size = 64
    iters = celeba.train_size // batch_size
    save_image_freq = 1
    print_freq = 100
    save_model_freq = 5
    test_freq = 1
    test_batch_size = 500
    test_iters = celeba.test_size // test_batch_size
    result_path = "results/vae_celeba_" + FLAGS.estimator + \
        time.strftime("_%Y%m%d_%H%M%S")

    saver = tf.train.Saver(max_to_keep=10)
    logger = setup_logger('vae_celeba_' + FLAGS.estimator, __file__,
                          result_path)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore from the latest checkpoint
        ckpt_file = tf.train.latest_checkpoint(result_path)
        begin_epoch = 1
        if ckpt_file is not None:
            logger.info('Restoring model from {}...'.format(ckpt_file))
            begin_epoch = int(ckpt_file.split('.')[-2]) + 1
            saver.restore(sess, ckpt_file)

        for epoch in range(begin_epoch, epochs + 1):
            eq_joints = []
            time_iters = []
            for t in range(1, iters + 1):
                time_iter = -time.time()
                x_batch = celeba.next_batch(batch_size)
                _, eq_joint_ = sess.run(
                    [infer_op, eq_joint],
                    feed_dict={x: x_batch, n_particles: n_est})
                # logger.info('Iter {}: lb = {}, kl = {}, true_kl = {}'
                #             .format(t, lb, kl, true_kl))
                eq_joints.append(eq_joint_)
                time_iter += time.time()
                time_iters.append(time_iter)

                if t % print_freq == 0:
                    logger.info(
                        'Epoch={} Iter={} ({}s): log joint = {}'
                        .format(epoch, t, np.mean(time_iters),
                                np.mean(eq_joints[-print_freq:])))

            logger.info(
                'Epoch {}: log joint = {}'
                .format(epoch, np.mean(eq_joints)))

            interp_images = []
            start_images = []
            end_images = []
            if epoch % test_freq == 0:
                time_test = -time.time()
                test_eq_joints = []
                for t in range(test_iters):
                    test_x_batch = celeba.next_test_batch(test_batch_size)
                    test_eq_joint, interp_image, start_image, end_image = \
                        sess.run(
                            [eq_joint, x_interp, x_start, x_end],
                            feed_dict={x: test_x_batch, n_particles: n_est})
                    test_eq_joints.append(test_eq_joint)
                    interp_images.append(interp_image)
                    start_images.append(start_image)
                    end_images.append(end_image)
                time_test += time.time()
                logger.info('>>> TEST ({:.1f}s)'.format(time_test))
                logger.info('>> Test log joint = {}'
                            .format(np.mean(test_eq_joints)))

                logger.info('Saving interpolations...')
                interp_name = os.path.join(result_path,
                                           "interp.epoch.{}.png".format(epoch))
                save_image_collections(interp_images[0], interp_name,
                                       scale_each=True, shape=(8, 8))
                if epoch == 1:
                    save_image_collections(
                        start_images[0], interp_name + ".start.png",
                        scale_each=True, shape=(8, 1))
                    save_image_collections(
                        end_images[0], interp_name + ".end.png",
                        scale_each=True, shape=(8, 1))

            if epoch % save_image_freq == 0:
                logger.info('Saving images...')
                images = sess.run(x_gen)
                name = os.path.join(result_path,
                                    "vae.epoch.{}.png".format(epoch))
                save_image_collections(images, name, scale_each=True)

            if epoch % save_model_freq == 0:
                logger.info('Saving model...')
                save_path = os.path.join(result_path,
                                         "vae.epoch.{}.ckpt".format(epoch))
                if not os.path.exists(os.path.dirname(save_path)):
                    os.makedirs(os.path.dirname(save_path))
                saver.save(sess, save_path)
                logger.info('Done')
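
Taken on its own, the interpolation block above is just a convex combination of two batches of latent codes; a minimal NumPy sketch of the same computation (n_z and the random endpoints are stand-in assumptions, shapes mirror the [8, n_z] slices used above):

import numpy as np

n_z = 32                                     # assumed latent dimensionality
z_start = np.random.randn(8, n_z)            # stand-ins for the encoded endpoints
z_end = np.random.randn(8, n_z)
alpha = np.linspace(0., 1., 8).reshape(1, 8, 1)              # [1, 8, 1]
# broadcast to [8, 8, n_z]; row i walks from z_end[i] (alpha=0) to z_start[i] (alpha=1)
z_interp = alpha * z_start[:, None, :] + (1. - alpha) * z_end[:, None, :]
z_interp = z_interp.reshape(-1, n_z)         # [64, n_z], what the decoder receives above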
Exemplo n.º 20
0
def main():
    tf.set_random_seed(1234)
    np.random.seed(1234)

    # Load MNIST
    data_path = os.path.join('data', 'mnist.pkl.gz')
    x_train, t_train, x_valid, t_valid, x_test, t_test = \
        dataset.load_mnist_realval(data_path)
    x_train = np.vstack([x_train, x_valid])
    x_test = np.random.binomial(1, x_test, size=x_test.shape)
    n_x = x_train.shape[1]
    n_z = FLAGS.n_z

    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    x_input = tf.placeholder(tf.float32, shape=[None, n_x], name='x')
    x = tf.to_int32(tf.random_uniform(tf.shape(x_input)) <= x_input)
    learning_rate_ph = tf.placeholder(tf.float32, shape=[], name='lr')
    optimizer = tf.train.AdamOptimizer(learning_rate_ph, beta1=0.5)

    def build_tower_graph(x, id_):
        tower_x = x[id_ * tf.shape(x)[0] // FLAGS.num_gpus:(id_ + 1) *
                    tf.shape(x)[0] // FLAGS.num_gpus]
        n = tf.shape(tower_x)[0]

        # qz_samples: [n_particles, n, n_z]
        qz_samples = q_net(tower_x, n_z, n_particles)
        # Use a single particle for the reconstruction term
        observed = {'x': tower_x, 'z': qz_samples[:1]}
        model, z, _ = vae(observed, n, n_x, n_z, 1)
        # log_px_qz: [1, n]
        log_px_qz = model.local_log_prob('x')
        eq_ll = tf.reduce_mean(log_px_qz)
        # log_p_qz: [n_particles, n]
        log_p_qz = z.log_prob(qz_samples)
        eq_joint = eq_ll + tf.reduce_mean(log_p_qz)

        if FLAGS.estimator == "stein":
            estimator = SteinScoreEstimator(eta=eta)
        elif FLAGS.estimator == "spectral":
            estimator = SpectralScoreEstimator(n_eigen=None,
                                               eta=None,
                                               n_eigen_threshold=0.99)
        else:
            raise ValueError("The chosen estimator is not recognized.")

        qzs = tf.transpose(qz_samples, [1, 0, 2])
        dlog_q = estimator.compute_gradients(qzs)
        entropy_surrogate = tf.reduce_mean(
            tf.reduce_sum(tf.stop_gradient(-dlog_q) * qzs, -1))
        cost = -eq_joint - entropy_surrogate
        grads_and_vars = optimizer.compute_gradients(cost)

        return grads_and_vars, eq_joint

    tower_losses = []
    tower_grads = []
    for i in range(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i):
                grads, tower_eq_joint = build_tower_graph(x, i)
                tower_losses.append([tower_eq_joint])
                tower_grads.append(grads)

    eq_joint = average_losses(tower_losses)[0]
    grads = average_gradients(tower_grads)
    infer_op = optimizer.apply_gradients(grads)

    # Generate images
    n_gen = 100
    _, _, x_logits = vae({}, n_gen, n_x, n_z, 1)
    x_gen = tf.reshape(tf.sigmoid(x_logits), [-1, 28, 28, 1])

    # Define training parameters
    learning_rate = 1e-4
    epochs = 3000
    batch_size = 128
    iters = x_train.shape[0] // batch_size
    save_image_freq = 10
    save_model_freq = 100
    test_freq = 10
    test_batch_size = 400
    test_iters = x_test.shape[0] // test_batch_size
    result_path = "results/vae_conv_{}_{}".format(
        n_z, FLAGS.estimator) + time.strftime("_%Y%m%d_%H%M%S")

    saver = tf.train.Saver(max_to_keep=10)
    logger = setup_logger('vae_conv_' + FLAGS.estimator, __file__, result_path)

    with create_session(FLAGS.log_device_placement) as sess:
        sess.run(tf.global_variables_initializer())

        # Restore from the latest checkpoint
        ckpt_file = tf.train.latest_checkpoint(result_path)
        begin_epoch = 1
        if ckpt_file is not None:
            logger.info('Restoring model from {}...'.format(ckpt_file))
            begin_epoch = int(ckpt_file.split('.')[-2]) + 1
            saver.restore(sess, ckpt_file)

        for epoch in range(begin_epoch, epochs + 1):
            time_epoch = -time.time()
            np.random.shuffle(x_train)
            eq_joints = []
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                _, eq_joint_ = sess.run(
                    [infer_op, eq_joint],
                    feed_dict={
                        x_input: x_batch,
                        learning_rate_ph: learning_rate,
                        n_particles: n_est
                    },
                )

                eq_joints.append(eq_joint_)

            time_epoch += time.time()
            logger.info('Epoch {} ({:.1f}s): log joint = {}'.format(
                epoch, time_epoch, np.mean(eq_joints)))

            if epoch % test_freq == 0:
                time_test = -time.time()
                test_eq_joints = []
                for t in range(test_iters):
                    test_x_batch = x_test[t * test_batch_size:(t + 1) *
                                          test_batch_size]
                    test_eq_joint = sess.run(eq_joint,
                                             feed_dict={
                                                 x: test_x_batch,
                                                 n_particles: n_est
                                             })
                    test_eq_joints.append(test_eq_joint)
                time_test += time.time()
                logger.info('>>> TEST ({:.1f}s)'.format(time_test))
                logger.info('>> Test log joint = {}'.format(
                    np.mean(test_eq_joints)))

            if epoch % save_image_freq == 0:
                logger.info('Saving images...')
                images = sess.run(x_gen)
                name = os.path.join(result_path,
                                    "vae.epoch.{}.png".format(epoch))
                save_image_collections(images, name)

            if epoch % save_model_freq == 0:
                logger.info('Saving model...')
                save_path = os.path.join(result_path,
                                         "vae.epoch.{}.ckpt".format(epoch))
                if not os.path.exists(os.path.dirname(save_path)):
                    os.makedirs(os.path.dirname(save_path))
                saver.save(sess, save_path)
                logger.info('Done')
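
The multi-GPU example above calls an average_gradients helper that is not reproduced on this page; a minimal sketch, assuming every tower returns its (gradient, variable) pairs in the same order, could look like:

import tensorflow as tf

def average_gradients(tower_grads):
    # tower_grads: one list of (grad, var) pairs per GPU tower.
    # Returns a single list with each gradient averaged across towers.
    averaged = []
    for grads_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grads_and_vars if g is not None]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        averaged.append((grad, grads_and_vars[0][1]))
    return averaged

average_losses would follow the same pattern, averaging the scalar losses collected per tower.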
Exemplo n.º 21
0
def main():

    config.workspace = os.path.join(config.workspace_dir, config.exp_name)
    # if config.restart_training:
    #     shutil.rmtree(config.workspace, ignore_errors=True)
    if not os.path.exists(config.workspace):
        os.makedirs(config.workspace)

    logger = setup_logger(os.path.join(config.workspace, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(
            config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = ICDAR15(config.trainroot,
                         config.is_pseudo,
                         data_shape=config.data_shape,
                         n=config.kernel_num,
                         m=config.m)
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=config.train_batch_size,
                                   shuffle=True,
                                   num_workers=int(config.workers))

    # writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.kernel_num,
                   scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)

    criterion = PSELoss(Lambda=config.Lambda,
                        ratio=config.OHEM_ratio,
                        reduction='mean')

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and not config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device,
                                      optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            config.lr_decay_step,
            gamma=config.lr_gamma,
            last_epoch=start_epoch)
        logger.info('resume from {}, epoch={}'.format(config.checkpoint,
                                                      start_epoch))
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         config.lr_decay_step,
                                                         gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(
        train_data.__len__(), all_step))
    epoch = 0
    f1 = 0

    for epoch in range(start_epoch, config.epochs):
        start = time.time()
        train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader,
                                     device, criterion, epoch, all_step,
                                     logger)
        logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
            epoch, config.epochs, train_loss,
            time.time() - start, lr))

        if epoch % config.save_interval == 0:
            save_path = '{}/epoch_{}.pth'.format(config.workspace, epoch)
            latest_path = '{}/latest.pth'.format(config.workspace)
            save_checkpoint(save_path, model, optimizer, epoch, logger)
            save_checkpoint(latest_path, model, optimizer, epoch, logger)
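
Several of the PyTorch training loops here call load_checkpoint/save_checkpoint helpers whose definitions are not shown; a minimal sketch consistent with the call sites above (exactly what the checkpoint stores is an assumption) might be:

import torch

def save_checkpoint(path, model, optimizer, epoch, logger):
    # Hypothetical helper: bundle epoch, weights and optimizer state into one file.
    torch.save({
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, path)
    logger.info('saved checkpoint to {}'.format(path))

def load_checkpoint(path, model, logger, device, optimizer=None):
    # Hypothetical counterpart: restore weights (and optionally the optimizer state)
    # and return the stored epoch so the caller can resume from epoch + 1.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    logger.info('loaded checkpoint from {}'.format(path))
    return checkpoint['epoch']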
Exemplo n.º 22
0
def main():
    if config.output_dir is None:
        config.output_dir = 'output'
    if config.restart_training:
        shutil.rmtree(config.output_dir, ignore_errors=True)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    logger = setup_logger(os.path.join(config.output_dir, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = MyDataset(config.trainroot, config.MIN_LEN, config.MAX_LEN, transform=transforms.ToTensor())
    train_loader = Data.DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True,
                                   num_workers=int(config.workers))

    writer = SummaryWriter(config.output_dir)
    model = CTPN_Model(pretrained=config.pretrained)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    dummy_input = torch.zeros(1, 3, 600, 800).to(device)
    writer.add_graph(model=model, input_to_model=dummy_input)
    criterion = CTPNLoss(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and not config.restart_training:
        print('Loading Checkpoint...')
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma,
                                                         last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(train_data.__len__(), all_step))
    epoch = 0
    best_model = {'loss': float('inf')}
    try:
        for epoch in range(start_epoch, config.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device, criterion, epoch, all_step,
                                         writer, logger)
            logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                epoch, config.epochs, train_loss, time.time() - start, lr))
            # if (0.3 < train_loss < 0.4 and epoch % 1 == 0) or train_loss < 0.3:
            if epoch % 10 == 0 or train_loss < best_model['loss']:
                net_save_path = '{}/PSENet_{}_loss{:.6f}.pth'.format(config.output_dir, epoch, train_loss)
                save_checkpoint(net_save_path, model, optimizer, epoch, logger)
                if train_loss < best_model['loss']:
                    best_model['loss'] = train_loss
                    if 'model' in best_model:
                        os.remove(best_model['model'])
                    best_model['model'] = net_save_path
                    shutil.copy(best_model['model'],
                                '{}/best_loss{:.6f}.pth'.format(config.output_dir, best_model['loss']))
        writer.close()
    except KeyboardInterrupt:
        pass
    finally:
        if best_model['model']:
            shutil.copy(best_model['model'], '{}/best_loss{:.6f}.pth'.format(config.output_dir, best_model['loss']))
            logger.info(best_model)
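
model.apply(weights_init) appears in most of these examples without the initializer itself; one plausible, purely illustrative sketch (Kaiming init for convolutions, constants for batch norm) is:

import torch.nn as nn

def weights_init(m):
    # Applied recursively by model.apply(); initialise each module by type.
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)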
Exemplo n.º 23
0
def main():
    parser = argparse.ArgumentParser(description='AN')
    parser.add_argument('--name', default='bn_smaller_batch', type=str)

    ## data setting
    parser.add_argument('--root',
                        default='/scratch/local/ssd/datasets',
                        type=str)
    parser.add_argument('--train_dataset', default='synthtext', type=str)
    parser.add_argument('--test_dataset', default='ic03', type=str)
    parser.add_argument('--vis_gt', default=False, type=bool)
    parser.add_argument('--vis_gt_path',
                        default='/users/czhang/data/vis',
                        type=str)
    parser.add_argument('--load_width', default=256, type=int)
    parser.add_argument('--load_height', default=32, type=int)
    parser.add_argument("--gpus", dest="gpu", default="0", type=str)
    parser.add_argument('--min_gt_len', default=3, type=int)
    parser.add_argument("--aug", dest="aug", action='store_true')
    parser.add_argument("--RA", dest="repeated_aug", default='1', type=int)

    ## model setting
    parser.add_argument('--alphabet',
                        default=' 0123456789abcdefghijklmnopqrstuvwxyz',
                        type=str)
    #parser.add_argument('--ignore_case', default=True, type=bool)
    parser.add_argument('--max_len', default=65, type=int)
    parser.add_argument("--cv", dest="context_vector", action='store_true')

    ## optim setting
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--resume_i', default=0, type=int)
    parser.add_argument('--resume_j', default=0, type=int)

    parser.add_argument('--cl_weight',
                        default=1,
                        type=int,
                        help='center loss weight')
    parser.add_argument('--num_workers', default=64, type=int)
    parser.add_argument('--lr', default=1.0, type=float)
    parser.add_argument('--beta1',
                        type=float,
                        default=0.5,
                        help='beta1 for adam. default=0.5')
    parser.add_argument('--momentum', default=0.9, type=float)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--gamma', default=0.1, type=float)
    parser.add_argument('--optim',
                        default='adadelta',
                        type=str,
                        help='sgd, adam, adadelta')
    # parser.add_argument('--clip_grad', default=False, type=bool)
    parser.add_argument('--max_norm',
                        default=400,
                        type=int,
                        help='Norm cutoff to prevent explosion of gradients')
    parser.add_argument('--max_epoches', default=1000, type=int)
    # parser.add_argument('--adjust_lr', default='800, 1600', type=str)

    ## output setting
    parser.add_argument('--log_iter', default=10, type=int)
    parser.add_argument('--eval_iter', default=2500, type=int)
    parser.add_argument('--save_iter', default=2500, type=int)
    parser.add_argument('--save_folder',
                        default='/users/czhang/data/FAN/',
                        type=str)
    parser.add_argument('--tbx_folder',
                        default='/users/czhang/data/FAN/tbx',
                        type=str)

    parser.add_argument('--eval_vis_num', default=15, type=int)
    parser.add_argument('--max_iter', default=2000000, type=int)

    args = parser.parse_args()
    args.save_folder = osp.join(args.save_folder, args.name)
    if osp.exists(args.save_folder) == False:
        os.mkdir(args.save_folder)

    tbx_dir = osp.join(args.tbx_folder, args.name)
    if osp.exists(args.tbx_folder) == False:
        os.mkdir(args.tbx_folder)

    if osp.exists(tbx_dir) == False:
        os.mkdir(tbx_dir)
    writer = SummaryWriter(tbx_dir)
    log_file_path = args.save_folder + '/' + time.strftime(
        '%Y%m%d_%H%M%S') + '.log'
    ##
    args.nClasses = len(args.alphabet)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device("cuda:0")

    setup_logger(log_file_path)
    print_args(args)
    torch.set_default_tensor_type('torch.FloatTensor')

    ## setup converter
    converter = strLabelConverter(args.alphabet)

    ## setup dataset
    logging.info('model will be trained on %s' % (args.train_dataset))
    trainset = SynthLoader(args, args.train_dataset, converter, aug=args.aug)
    logging.info('%d training samples' % (trainset.__len__()))
    train_loader = data.DataLoader(trainset,
                                   args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=text_collate,
                                   pin_memory=True)

    logging.info('model will be evaluated on %s' % (args.test_dataset))
    testset = SceneLoader(args, args.test_dataset, False)
    logging.info('%d test samples' % (testset.__len__()))
    test_loader = data.DataLoader(testset,
                                  1,
                                  num_workers=args.num_workers,
                                  shuffle=False,
                                  pin_memory=True)

    ## setup model
    net = AN(args)
    net = torch.nn.DataParallel(net).to(device)
    centers = None

    if args.resume_i != 0 or args.resume_j != 0:
        resume_file = osp.join(
            args.save_folder,
            str(args.resume_i) + '_' + str(args.resume_j) + '.pth')
        logging.info('Resuming training, loading {}...'.format(resume_file))
        checkpoint = torch.load(resume_file)
        #net.load_state_dict(checkpoint)
        net.load_state_dict(checkpoint['model_state_dict'])
        centers = checkpoint['class_centers']

    ## setup criterion
    criterion = nn.CrossEntropyLoss()
    criterion2 = CenterLoss(device, centers)

    ## setup optimizer
    if args.cl_weight != 0:
        parameters = list(net.parameters()) + list(criterion2.parameters())
    else:
        parameters = net.parameters()

    if args.optim == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
        logging.info('model will be optimized by sgd')
    elif args.optim == 'adam':
        optimizer = optim.Adam(parameters,
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        logging.info('model will be optimized by adam')
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(parameters,
                                   lr=args.lr,
                                   weight_decay=args.weight_decay)
        logging.info('model will be optimized by adadelta')
    else:
        optimizer = optim.Adam(parameters,
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        logging.info('model will be optimized by adam')

    ## train model
    cudnn.benchmark = True
    net.train()
    iter_counter = args.resume_j + 1
    acc_max = 0
    running_loss, running_cenloss, running_croloss = 0., 0., 0.

    for i in range(args.max_epoches):
        i = args.resume_i + i
        t0 = time.time()
        for j, batch_samples in enumerate(train_loader):
            j = args.resume_j + j + 1
            imgs, labels, paths = batch_samples
            imgs = Variable(imgs.float()).to(device)
            labels = Variable(labels.long()).to(device)  #[batch*len]
            if args.context_vector or args.cl_weight != 0:
                preds, gts = net(imgs, labels)  #[batch,len,classes]
                masks = mask(args, labels.view(args.batch_size, args.max_len),
                             device)
                center_loss = criterion2(gts, labels, masks)
                running_cenloss += center_loss.item()

            else:
                preds = net(imgs, labels)
                center_loss = 0

            ce_loss = criterion(preds.view(-1, args.nClasses), labels.view(-1))
            loss = ce_loss + 0.01 * args.cl_weight * center_loss

            optimizer.zero_grad()
            loss.backward()
            if args.cl_weight != 0:
                for param in criterion2.parameters():
                    # update class centers
                    # remove the effect of lambda on updating centers
                    # lr of center loss set to 0.5 of the model lr
                    param.grad.data *= (0.5 / (0.01 * args.cl_weight))

            torch.nn.utils.clip_grad_norm_(net.parameters(), args.max_norm)
            optimizer.step()
            running_loss += loss.item()
            running_croloss += ce_loss.item()

            if iter_counter % args.log_iter == 0:
                t1 = time.time()
                acc, pred_samples, label_samples = lex_free_acc(
                    preds, labels, converter)
                print(
                    'epoch:%3d  iter:%6d  loss:%4.6f  acc:%4.6f  %4.6fs/batch'
                    % (i, j, running_loss / args.log_iter, acc,
                       (t1 - t0) / args.log_iter))
                writer.add_scalar('train/train_word_accuracy', acc, j)
                writer.add_scalar('train/train_loss',
                                  running_loss / args.log_iter, j)
                if args.cl_weight != 0:
                    writer.add_scalar('train/train_ce_loss',
                                      running_croloss / args.log_iter, j)
                    writer.add_scalar('train/train_center_loss',
                                      running_cenloss / args.log_iter, j)

                if iter_counter % (100 * args.log_iter) == 0:
                    visual_img = imgs[0, :, :, :].unsqueeze(0)
                    writer.add_image('train/train_im', visual_img, j)
                    visual_txt = 'gt: ' + str(
                        label_samples[0]) + ' ----- pred: ' + str(
                            pred_samples[0])
                    writer.add_text('train/train_txt', visual_txt, j)
                t0 = time.time()
                running_loss, running_cenloss, running_croloss = 0., 0., 0.

            if iter_counter % args.save_iter == 0:
                print('Saving state, epoch: %d iter:%d' % (i, j))
                torch.save(
                    {
                        'model_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'class_centers': criterion2.centers
                    },
                    args.save_folder + '/' + repr(i) + '_' + repr(j) + '.pth')

            if iter_counter % args.eval_iter == 0:
                ## eval model
                net.eval()
                n_correct = 0
                skip_counter = 0
                for index, sample in enumerate(test_loader):
                    imgs, gt_strs, lexicon50, lexicon1k, lexiconfull, img_paths = sample

                    gt_str = gt_strs[0]
                    if len(gt_str) < args.min_gt_len or not gt_str.isalnum():
                        skip_counter += 1
                        continue
                    imgs = Variable(imgs).cuda()
                    gt_ind, _ = converter.encode(gt_str)
                    gt_ind = torch.IntTensor(
                        (gt_ind + [0] * args.max_len)[:args.max_len])
                    if args.context_vector or args.cl_weight != 0:
                        preds, _ = net(imgs, gt_ind)
                    else:
                        preds = net(imgs, gt_ind)

                    correct, pred_str, _ = lex_free_acc(
                        preds, gt_ind, converter)
                    n_correct += correct

                acc = n_correct * 1.0 / (testset.__len__() - skip_counter)
                if acc > acc_max:
                    acc_max = acc
                logging.info('accuracy=%f   acc_max=%f' % (acc, acc_max))
                writer.add_scalar('val/val_word_accuracy', acc, j)

                net.train()

            if iter_counter > args.max_iter:
                break
            iter_counter += 1

    torch.save(net.state_dict(), args.save_folder + '/final_0.pth')
    logging.info('The training stage on %s is over!!!' % (args.train_dataset))
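
The gradient rescaling inside the training loop above follows from how the loss is weighted: center_loss enters the total loss with a factor of 0.01 * cl_weight, so multiplying the centre parameters' gradients by 0.5 / (0.01 * cl_weight) cancels that factor and leaves the centres with an effective learning rate of half the model's. A one-line numeric check using the parser's default cl_weight:

cl_weight = 1                              # default from the argument parser above
loss_factor = 0.01 * cl_weight             # weight applied to center_loss in `loss`
grad_rescale = 0.5 / (0.01 * cl_weight)    # factor applied to the centre gradients
assert abs(loss_factor * grad_rescale - 0.5) < 1e-12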
Exemplo n.º 24
0
def main():
    if config.output_dir is None:
        config.output_dir = 'output'
    if config.restart_training:
        shutil.rmtree(config.output_dir, ignore_errors=True)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    logger = setup_logger(os.path.join(config.output_dir, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(
            config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = MyDataset(args.train_dir,
                           data_shape=config.data_shape,
                           n=config.n,
                           m=config.m,
                           transform=transforms.ToTensor())
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=int(config.workers))

    writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.n,
                   scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    if args.resume_model:
        resume_model(model, args.resume_model)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    # dummy_input = torch.autograd.Variable(torch.Tensor(1, 3, 600, 800).to(device))
    # writer.add_graph(models=models, input_to_model=dummy_input)
    criterion = PSELoss(Lambda=config.Lambda,
                        ratio=config.OHEM_ratio,
                        reduction='mean')
    # optimizer = torch.optim.SGD(models.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    if config.checkpoint != '' and not config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device,
                                      optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            config.lr_decay_step,
            gamma=config.lr_gamma,
            last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         config.lr_decay_step,
                                                         gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(
        train_data.__len__(), all_step))
    epoch = 0
    best_model = {'recall': 0, 'precision': 0, 'f1': 0, 'models': ''}
    try:
        for epoch in range(start_epoch, args.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler,
                                         train_loader, device, criterion,
                                         epoch, all_step, writer, logger)
            logger.info(
                '[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                    epoch, config.epochs, train_loss,
                    time.time() - start, lr))

            if epoch % args.save_per_epoch == 0:
                save_model(model, epoch)
        writer.close()

    except KeyboardInterrupt:
        save_checkpoint('{}/final.pth'.format(config.output_dir), model,
                        optimizer, epoch, logger)
    finally:
        if best_model['models']:
            logger.info(best_model)
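
All of these loops step a MultiStepLR scheduler once per epoch; a tiny self-contained illustration of the resulting schedule (the milestones and gamma below are made-up values, not the config's):

import torch

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.Adam(params, lr=1e-3)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[200, 400], gamma=0.1)
for epoch in range(600):
    optimizer.step()        # train_epoch(...) would run here
    scheduler.step()
# lr is 1e-3 up to epoch 199, 1e-4 from epoch 200 to 399, and 1e-5 afterwards.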
Exemplo n.º 25
0
    def __init__(self, config, model, criterion):
        config['trainer']['output_dir'] = os.path.join(
            str(pathlib.Path(os.path.abspath(__name__)).parent),
            config['trainer']['output_dir'])
        config['name'] = config['name'] + '_' + model.name
        self.save_dir = os.path.join(config['trainer']['output_dir'],
                                     config['name'])
        self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')

        if config['trainer']['resume_checkpoint'] == '' and config['trainer'][
                'finetune_checkpoint'] == '':
            shutil.rmtree(self.save_dir, ignore_errors=True)
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.global_step = 0
        self.start_epoch = 0
        self.config = config
        self.model = model
        self.criterion = criterion
        # logger and tensorboard
        self.tensorboard_enable = self.config['trainer']['tensorboard']
        self.epochs = self.config['trainer']['epochs']
        self.log_iter = self.config['trainer']['log_iter']

        anyconfig.dump(config, os.path.join(self.save_dir, 'config.yaml'))
        self.logger = setup_logger(os.path.join(self.save_dir, 'train.log'))
        self.logger_info(pformat(self.config))

        # device
        torch.manual_seed(self.config['trainer']['seed'])  # set the random seed for the CPU
        if torch.cuda.device_count() > 0 and torch.cuda.is_available():
            self.with_cuda = True
            torch.backends.cudnn.benchmark = True
            self.device = torch.device("cuda")
            torch.cuda.manual_seed(
                self.config['trainer']['seed'])  # set the random seed for the current GPU
            torch.cuda.manual_seed_all(
                self.config['trainer']['seed'])  # set the random seed for all GPUs
        else:
            self.with_cuda = False
            self.device = torch.device("cpu")
        self.logger_info('train with device {} and pytorch {}'.format(
            self.device, torch.__version__))
        # metrics
        self.metrics = {
            'recall': 0,
            'precision': 0,
            'hmean': 0,
            'train_loss': float('inf')
        }
        self.lr = config['optimizer']['args']['lr']
        self.optimizer = self._initialize('optimizer', torch.optim,
                                          model.parameters())

        # resume or finetune
        if self.config['trainer']['resume_checkpoint'] != '':
            self._laod_checkpoint(self.config['trainer']['resume_checkpoint'],
                                  resume=True)
        elif self.config['trainer']['finetune_checkpoint'] != '':
            self._laod_checkpoint(
                self.config['trainer']['finetune_checkpoint'], resume=False)

        if self.config['lr_scheduler']['type'] != 'WarmupPolyLR':
            self.scheduler = self._initialize('lr_scheduler',
                                              torch.optim.lr_scheduler,
                                              self.optimizer)

        self.model.to(self.device)

        if self.tensorboard_enable and config['local_rank'] == 0:
            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(self.save_dir)
            try:
                # add graph
                dummy_input = torch.zeros(1, 3, 640, 640).to(self.device)
                self.writer.add_graph(self.model, dummy_input)
                torch.cuda.empty_cache()
            except:
                import traceback
                self.logger.error(traceback.format_exc())
                self.logger.warn('add graph to tensorboard failed')
        # distributed training
        if torch.cuda.device_count() > 1:
            local_rank = config['local_rank']
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[local_rank],
                output_device=local_rank,
                broadcast_buffers=False,
                find_unused_parameters=True)
        # make inverse Normalize
        self.UN_Normalize = False
        if 'transforms' in self.config['dataset']['train']['dataset'][
                'args'].keys():
            for t in self.config['dataset']['train']['dataset']['args'][
                    'transforms']:
                if t['type'] == 'Normalize':
                    self.normalize_mean = t['args']['mean']
                    self.normalize_std = t['args']['std']
                    self.UN_Normalize = True
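
The trainer above only records normalize_mean/normalize_std so it can undo transforms.Normalize before logging images; a minimal sketch of that inverse transform (the [N, C, H, W] layout is an assumption) could be:

import torch

def inverse_normalize(batch, mean, std):
    # Undo transforms.Normalize channel-wise so images are viewable in tensorboard.
    mean = torch.tensor(mean).view(1, -1, 1, 1)
    std = torch.tensor(std).view(1, -1, 1, 1)
    return batch * std + mean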
Exemplo n.º 26
0
def main():
    config.workspace = os.path.join(config.workspace_dir, config.exp_name)
    if not os.path.exists(config.workspace):
        os.makedirs(config.workspace)

    logger = setup_logger(os.path.join(config.workspace, 'train_log'))
    logger.info(config.pprint())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    torch.backends.cudnn.benchmark = True
    logger.info('train with gpu {} and pytorch {}'.format(
        config.gpu_id, torch.__version__))
    device = torch.device("cuda:0")
    torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
    torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs

    train_data = TotalTextoader(config.train_data_dir,
                                config.train_gt_dir,
                                config.test_data_dir,
                                config.test_gt_dir,
                                split='train',
                                is_transform=True,
                                img_size=config.data_shape,
                                kernel_num=config.kernel_num,
                                min_scale=config.min_scale)
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=config.train_batch_size,
                                   shuffle=True,
                                   num_workers=int(config.workers))

    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.kernel_num,
                   scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    # if num_gpus > 1:
    model = nn.DataParallel(model)
    model = model.to(device)

    criterion = dice_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and config.restart_training == True:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device,
                                      optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            config.lr_decay_step,
            gamma=config.lr_gamma,
            last_epoch=start_epoch)
        logger.info('resume from {}, epoch={}'.format(config.checkpoint,
                                                      start_epoch))
    else:
        start_epoch = 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         config.lr_decay_step,
                                                         gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} iters in dataloader'.format(
        train_data.__len__(), all_step))
    for epoch in range(start_epoch, config.epochs + 1):
        start = time.time()
        train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader,
                                     device, criterion, epoch, all_step,
                                     logger)
        logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
            epoch, config.epochs, train_loss,
            time.time() - start, lr))

        if epoch % config.save_interval == 0:
            save_path = '{}/epoch_{}.pth'.format(config.workspace, epoch)
            latest_path = '{}/latest.pth'.format(config.workspace)
            save_checkpoint(save_path, model, optimizer, epoch, logger)
            save_checkpoint(latest_path, model, optimizer, epoch, logger)
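
This example plugs dice_loss in as the criterion without showing it; one common formulation, given here only as a hedged sketch (the sigmoid and eps are assumptions), computes 1 - 2*|P∩G| / (|P| + |G|) per sample:

import torch

def dice_loss_sketch(pred, target, eps=1e-6):
    # pred: raw logits [N, H, W]; target: binary mask of the same shape.
    pred = torch.sigmoid(pred).reshape(pred.size(0), -1)
    target = target.reshape(target.size(0), -1).float()
    inter = (pred * target).sum(dim=1)
    union = pred.sum(dim=1) + target.sum(dim=1)
    return (1 - (2 * inter + eps) / (union + eps)).mean()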
Exemplo n.º 27
0
def main_worker(
        gpu, ngpus_per_node, args, final_output_dir, tb_log_dir
):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if cfg.MULTIPROCESSING_DISTRIBUTED:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            # compute this process's global rank from the node index
            args.rank = args.rank * ngpus_per_node + gpu
        print('Init process group: dist_url: {}, world_size: {}, rank: {}'.
              format(args.dist_url, args.world_size, args.rank))
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    update_config(cfg, args)

    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(
        cfg, is_train=True
    )

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(
            os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
            final_output_dir
        )

    # visualize results with tensorboard
    writer_dict = {
        'writer': SummaryWriter(logdir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }


    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu]
            )
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        dump_input = torch.rand(
            (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE)
        ).cuda()
        #writer_dict['writer'].add_graph(model, (dump_input, ))
        logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    # define loss function (criterion) and optimizer
    loss_factory = MultiLossFactory(cfg).cuda()

    # Data loading code
    train_loader = make_dataloader(
        cfg, is_train=True, distributed=args.distributed
    )
Exemplo n.º 28
0
def main():
    parser = argparse.ArgumentParser(description='AN')
    parser.add_argument('--name', default='second_training_bn', type=str)

    ## data setting
    parser.add_argument('--root', default='/users/czhang/data/', type=str)
    parser.add_argument('--load_folder',
                        default='/users/czhang/data/FAN/',
                        type=str)
    parser.add_argument('--test_dataset', default='ic13', type=str)
    parser.add_argument('--load_width', default=256, type=int)
    parser.add_argument('--load_height', default=32, type=int)
    parser.add_argument('--batch_size', default=1, type=int)
    parser.add_argument('--num_workers', default=32, type=int)
    parser.add_argument("--gpus", dest="gpu", default="1", type=str)
    parser.add_argument('--min_gt_len', default=3, type=int)
    parser.add_argument('--max_len', default=65, type=int)
    parser.add_argument("--cv", dest="context_vector", action='store_true')
    parser.add_argument('--lexicon', default=None, type=str)
    parser.add_argument('--max_ed', default=3, type=int)
    parser.add_argument('--tbx_folder',
                        default='/users/czhang/data/FAN/tbx',
                        type=str)

    ## model setting
    parser.add_argument('--load_epoch', default=0, type=int)
    parser.add_argument('--load_iter', default=0, type=int)

    parser.add_argument('--alphabet',
                        default=' 0123456789abcdefghijklmnopqrstuvwxyz',
                        type=str)
    ## output setting
    parser.add_argument('--out_dir',
                        default='/users/czhang/data/FAN/',
                        type=str)

    args = parser.parse_args()

    args.nClasses = len(args.alphabet)
    args.load_folder = osp.join(args.load_folder, args.name)
    args.out_dir = osp.join(args.out_dir, args.name, 'tests')
    if not osp.exists(args.out_dir):
        os.mkdir(args.out_dir)

    tbx_dir = osp.join(args.tbx_folder, args.name, 'tests')
    if osp.exists(args.tbx_folder) == False:
        os.mkdir(args.tbx_folder)

    if osp.exists(tbx_dir) == False:
        os.mkdir(tbx_dir)

    writer = SummaryWriter(tbx_dir)

    log_path = os.path.join(args.out_dir, args.test_dataset + '.txt')

    setup_logger(log_path)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device("cuda:0")

    logging.info('model will be evaluated on %s' % (args.test_dataset))
    testset = SceneLoader(args, args.test_dataset, False)
    logging.info('%d test samples' % (testset.__len__()))
    test_loader = data.DataLoader(testset,
                                  args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=False,
                                  pin_memory=True)

    ## model
    net = AN(args)
    net = torch.nn.DataParallel(net).to(device)
    checkpoint = str(args.load_epoch) + '_' + str(args.load_iter) + '.pth'

    load_file = torch.load(osp.join(args.load_folder, checkpoint))
    net.load_state_dict(load_file['model_state_dict'])

    #net.load_state_dict(torch.load(load_file))
    net.eval()
    n_correct = 0
    skip_counter = 0
    converter = strLabelConverter(args.alphabet)

    for index, sample in enumerate(test_loader):
        imgs, gt_strs, lexicon50, lexicon1k, lexiconfull, img_paths = sample

        gt_str = gt_strs[0]
        if args.test_dataset != 'iiit5k':
            if len(gt_str) < args.min_gt_len or not gt_str.isalnum():
                print('skipping: %s' % gt_str)
                skip_counter += 1
                continue
        else:
            if not gt_str.isalnum():
                print('skipping: %s' % gt_str)
                skip_counter += 1
                continue
        imgs = Variable(imgs).cuda()
        gt_ind, _ = converter.encode(gt_str)
        gt_ind = torch.IntTensor((gt_ind + [0] * args.max_len)[:args.max_len])
        preds = net(imgs, gt_ind)

        if args.lexicon is None:
            correct, pred_str, _ = lex_free_acc(preds, gt_ind, converter)
            pred_lex = []

        # lexicon decoding
        if args.lexicon is not None:
            if args.lexicon == '50': lexicon = lexicon50
            if args.lexicon == '1k': lexicon = lexicon1k
            if args.lexicon == 'full': lexicon = lexiconfull
            correct, pred_str = lex_acc(args, lexicon, preds, gt_str,
                                        converter)
        ## decode
        if correct == 0:
            writer.add_image('test_im', imgs[0, :, :, :].unsqueeze(0), index)
            writer.add_text('pred', pred_str, index)
            writer.add_text('gt', gt_str, index)

            logging.info('pred: %s gt:%s ' % (pred_str, gt_str))
        n_correct += correct

    acc = n_correct * 1.0 / (testset.__len__() - skip_counter)
    print(testset.__len__() - skip_counter)
    logging.info('accuracy=%f' % (acc))
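
The lexicon branch above snaps each prediction to a dictionary word through lex_acc, which is not reproduced on this page; a rough, hypothetical sketch of that kind of constrained decoding (the editdistance package and the max_ed cutoff behaviour are assumptions) is:

import editdistance  # assumed dependency; any Levenshtein implementation works

def lexicon_decode(pred_str, lexicon, max_ed=3):
    # Return the closest lexicon word if it is within max_ed edits,
    # otherwise keep the unconstrained prediction.
    best_word, best_dist = None, max_ed + 1
    for word in lexicon:
        d = editdistance.eval(pred_str.lower(), word.lower())
        if d < best_dist:
            best_word, best_dist = word, d
    return best_word if best_word is not None else pred_str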
Exemplo n.º 29
0
if not osp.exists(args.out_dir):
    os.mkdir(args.out_dir)

#tbx_dir =osp.join(args.tbx_folder,args.name,'tests')
tbx_dir = args.tbx_folder
if osp.exists(args.tbx_folder) == False:
    os.mkdir(args.tbx_folder)

if osp.exists(tbx_dir) == False:
    os.mkdir(tbx_dir)

writer = SummaryWriter(tbx_dir)

log_path = os.path.join(args.out_dir, args.test_dataset + '.txt')

setup_logger(log_path)

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
device = torch.device("cuda:0")

logging.info('model will be evaluated on %s' % (args.test_dataset))

net = AN(args)
net = torch.nn.DataParallel(net).to(device)
checkpoint = '../attention_net/0_480000.pth'

load_file = torch.load(checkpoint)
net.load_state_dict(load_file['model_state_dict'])

#net.load_state_dict(torch.load(load_file))
net.eval()
Exemplo n.º 30
0
def main_worker(
        gpu, ngpus_per_node, args, final_output_dir, tb_log_dir
):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print("Warning:  if --fp16 is not used, static_loss_scale will be ignored.")

    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if cfg.MULTIPROCESSING_DISTRIBUTED:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        print('Init process group: dist_url: {}, world_size: {}, rank: {}'.
              format(args.dist_url, args.world_size, args.rank))
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    update_config(cfg, args)

    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(
        cfg, is_train=True
    )

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(
            os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
            final_output_dir
        )

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        dump_input = torch.rand(
            (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE)
        )
        writer_dict['writer'].add_graph(model, (dump_input, ))
        # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu]
            )
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    loss_factory = MultiLossFactory(cfg).cuda()

    # Data loading code
    train_loader = make_dataloader(
        cfg, is_train=True, distributed=args.distributed
    )
    logger.info(train_loader.dataset)

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE
        )

    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(
        final_output_dir, 'checkpoint.pth.tar')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    if cfg.FP16.ENABLED:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train one epoch
        do_train(cfg, model, train_loader, loss_factory, optimizer, epoch,
                 final_output_dir, tb_log_dir, writer_dict, fp16=cfg.FP16.ENABLED)

        perf_indicator = epoch
        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        if not cfg.MULTIPROCESSING_DISTRIBUTED or (
                cfg.MULTIPROCESSING_DISTRIBUTED
                and args.rank == 0
        ):
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state{}.pth.tar'.format(gpu)
    )

    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Exemplo n.º 31
0
def main_worker(gpus, ngpus_per_node, args, final_output_dir, tb_log_dir):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    #os.environ['CUDA_VISIBLE_DEVICES']=gpus

    # if len(gpus) == 1:
    #     gpus = int(gpus)

    update_config(cfg, args)

    #test(cfg, args)

    # logger setting
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # model initialization: pick the network class by name and instantiate it with the config
    model = {
        "ransac": RANSACTriangulationNet,
        "alg": AlgebraicTriangulationNet,
        "vol": VolumetricTriangulationNet,
        "vol_CPM": VolumetricTriangulationNet_CPM,
        "FTL": FTLMultiviewNet
    }[cfg.MODEL.NAME](cfg)

    discriminator = Discriminator(cfg)
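    # the discriminator is trained adversarially alongside the pose network;
    # it gets its own optimizer (D_optimizer) further down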

    # load pretrained model before DDP initialization
    if cfg.AUTO_RESUME:
        checkpoint_file = os.path.join(final_output_dir, 'model_best.pth.tar')
        if os.path.exists(checkpoint_file):
            checkpoint = torch.load(checkpoint_file,
                                    map_location=torch.device('cpu'))
            state_dict = checkpoint['state_dict']
            D_state_dict = checkpoint['D_state_dict']

            # strip the "module." prefix added by DataParallel/DistributedDataParallel
            # so the weights load into the not-yet-wrapped models
            for key in list(state_dict.keys()):
                new_key = key.replace("module.", "")
                state_dict[new_key] = state_dict.pop(key)
            for key in list(D_state_dict.keys()):
                new_key = key.replace("module.", "")
                D_state_dict[new_key] = D_state_dict.pop(key)

            model.load_state_dict(state_dict)
            discriminator.load_state_dict(D_state_dict)
            logger.info("=> Loading checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))
        else:
            print('[Warning] Checkpoint file not found! Wrong path: {}'.format(
                checkpoint_file))

    elif cfg.MODEL.HRNET_PRETRAINED:
        logger.info("=> loading a pretrained model '{}'".format(
            cfg.MODEL.PRETRAINED))
        checkpoint = torch.load(cfg.MODEL.HRNET_PRETRAINED)

        state_dict = checkpoint['state_dict']
        for key in list(state_dict.keys()):
            new_key = key.replace("module.", "")
            state_dict[new_key] = state_dict.pop(key)

        model.load_state_dict(state_dict)

    # initialize the optimizer
    # (the optimizer must be created after the model so its parameter groups exist)
    if cfg.MODEL.TRIANGULATION_MODEL_NAME == "vol":
        optimizer = torch.optim.Adam([{
            'params': model.backbone.parameters(),
            'initial_lr': cfg.TRAIN.LR
        }, {
            'params':
            model.process_features.parameters(),
            'initial_lr':
            cfg.TRAIN.PROCESS_FEATURE_LR
            if hasattr(cfg.TRAIN, "PROCESS_FEATURE_LR") else cfg.TRAIN.LR
        }, {
            'params':
            model.volume_net.parameters(),
            'initial_lr':
            cfg.TRAIN.VOLUME_NET_LR
            if hasattr(cfg.TRAIN, "VOLUME_NET_LR") else cfg.TRAIN.LR
        }],
                                     lr=cfg.TRAIN.LR)
    else:
        optimizer = torch.optim.Adam(
            [{
                'params': filter(lambda p: p.requires_grad,
                                 model.parameters()),
                'initial_lr': cfg.TRAIN.LR
            }],
            lr=cfg.TRAIN.LR)

    D_optimizer = torch.optim.RMSprop([{
        'params':
        filter(lambda p: p.requires_grad, discriminator.parameters()),
        'initial_lr':
        cfg.TRAIN.LR
    }],
                                      lr=cfg.TRAIN.LR)
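    # the discriminator has its own RMSprop optimizer; both optimizers are handed
    # to train() together as [optimizer, D_optimizer]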

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(os.path.join(this_dir, '../lib/models', 'triangulation.py'),
                 final_output_dir)
    # copy configuration file
    shutil.copy2(args.cfg, final_output_dir)

    # calculate GFLOPS
    # dump_input = torch.rand(
    #     (1, 4, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0])
    # )

    # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    # FP16 SETTING
    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print(
                "Warning:  if --fp16 is not used, static_loss_scale will be ignored."
            )

    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if cfg.MODEL.SYNC_BN and not cfg.DISTRIBUTED:
        print(
            'Warning: Sync BatchNorm is only supported in distributed training.'
        )

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE,
            verbose=False)

    # Distributed Computing
    master = True
    if cfg.DISTRIBUTED:  # this branch is not exercised in this setup
        args.local_rank += int(gpus[0])
        print('This process is using GPU', args.local_rank)
        device = args.local_rank
        master = device == int(gpus[0])
        dist.init_process_group(backend='nccl')
        if cfg.MODEL.SYNC_BN:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if gpus is not None:
            torch.cuda.set_device(device)
            model.cuda(device)
            discriminator.cuda(device)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # workers = int(workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[device],
                output_device=device,
                find_unused_parameters=True)
            discriminator = torch.nn.parallel.DistributedDataParallel(
                discriminator,
                device_ids=[device],
                output_device=device,
                find_unused_parameters=True)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:  # the single-process (DataParallel) branch actually used here
        # parse the comma-separated GPU string, e.g. "0,1" -> [0, 1], without eval()
        gpu_ids = [int(g) for g in str(gpus).split(',')]
        device = gpu_ids[0]
        print('This process is using GPU', str(device))
        model = torch.nn.DataParallel(model, gpu_ids).cuda(device)
        discriminator = torch.nn.DataParallel(discriminator,
                                              gpu_ids).cuda(device)

    # Prepare loss functions
    criterion = {}
    if cfg.LOSS.WITH_HEATMAP_LOSS:
        criterion['heatmap_loss'] = HeatmapLoss().cuda(device)
    if cfg.LOSS.WITH_POSE2D_LOSS:
        criterion['pose2d_loss'] = JointsMSELoss().cuda(device)
    if cfg.LOSS.WITH_POSE3D_LOSS:
        criterion['pose3d_loss'] = Joints3DMSELoss().cuda(device)
    if cfg.LOSS.WITH_VOLUMETRIC_CE_LOSS:
        criterion['volumetric_ce_loss'] = VolumetricCELoss().cuda(device)
    if cfg.LOSS.WITH_BONE_LOSS:
        criterion['bone_loss'] = BoneLengthLoss().cuda(device)
    if cfg.LOSS.WITH_TIME_CONSISTENCY_LOSS:
        criterion['time_consistency_loss'] = Joints3DMSELoss().cuda(device)
    if cfg.LOSS.WITH_KCS_LOSS:
        criterion['KCS_loss'] = None
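        # no KCS loss implementation is wired up in this snippet; the entry is a placeholder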
    if cfg.LOSS.WITH_JOINTANGLE_LOSS:
        criterion['jointangle_loss'] = JointAngleLoss().cuda(device)

    best_perf = 1e9  # validation loss is the metric here, so lower is better
    best_model = False
    last_epoch = -1

    # load history
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        begin_epoch = checkpoint['epoch'] + 1
        best_perf = checkpoint['loss']
        optimizer.load_state_dict(checkpoint['optimizer'])
        D_optimizer.load_state_dict(checkpoint['D_optimizer'])

        if 'train_global_steps' in checkpoint.keys() and \
        'valid_global_steps' in checkpoint.keys():
            writer_dict['train_global_steps'] = checkpoint[
                'train_global_steps']
            writer_dict['valid_global_steps'] = checkpoint[
                'valid_global_steps']

    # Floating point 16 mode
    if cfg.FP16.ENABLED:
        logger.info("=> Using FP16 mode")
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer,
            cfg.TRAIN.LR_STEP,
            cfg.TRAIN.LR_FACTOR,
            last_epoch=begin_epoch)
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            cfg.TRAIN.LR_STEP,
            cfg.TRAIN.LR_FACTOR,
            last_epoch=begin_epoch)

    # Data loading code
    train_loader_dict = make_dataloader(cfg,
                                        is_train=True,
                                        distributed=cfg.DISTRIBUTED)
    valid_loader_dict = make_dataloader(cfg,
                                        is_train=False,
                                        distributed=cfg.DISTRIBUTED)

    for i, (dataset_name,
            train_loader) in enumerate(train_loader_dict.items()):
        logger.info(
            'Training Loader {}/{}:\n'.format(i + 1, len(train_loader_dict)) +
            str(train_loader.dataset))
    for i, (dataset_name,
            valid_loader) in enumerate(valid_loader_dict.items()):
        logger.info('Validation Loader {}/{}:\n'.format(
            i + 1, len(valid_loader_dict)) + str(valid_loader.dataset))

    #writer_dict['writer'].add_graph(model, (dump_input, ))
    """
    Start training
    """
    start_time = time.time()

    with torch.autograd.set_detect_anomaly(True):
        for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
            epoch_start_time = time.time()
            # shuffle the datasets with the same random seed across processes
            if cfg.DISTRIBUTED:
                for data_loader in train_loader_dict.values():
                    data_loader.sampler.set_epoch(epoch)
            # train for one epoch
            logger.info('Start training [{}/{}]'.format(
                epoch, cfg.TRAIN.END_EPOCH - 1))
            train(epoch,
                  cfg,
                  args,
                  master,
                  train_loader_dict, [model, discriminator],
                  criterion, [optimizer, D_optimizer],
                  final_output_dir,
                  tb_log_dir,
                  writer_dict,
                  logger,
                  device,
                  fp16=cfg.FP16.ENABLED)

            # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`.
            lr_scheduler.step()

            # evaluate on validation set
            if not cfg.WITHOUT_EVAL:
                logger.info('Start evaluating [{}/{}]'.format(
                    epoch, cfg.TRAIN.END_EPOCH - 1))
                with torch.no_grad():
                    recorder = validate(cfg, args, master, valid_loader_dict,
                                        [model, discriminator], criterion,
                                        final_output_dir, tb_log_dir,
                                        writer_dict, logger, device)

                val_total_loss = recorder.avg_total_loss

                if val_total_loss < best_perf:
                    logger.info(
                        'This epoch yielded a better model with total loss {:.4f} < {:.4f}.'
                        .format(val_total_loss, best_perf))
                    best_perf = val_total_loss
                    best_model = True
                else:
                    best_model = False

            else:
                val_total_loss = 0
                best_model = True
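                # with evaluation disabled, every epoch's checkpoint is treated as the best model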

            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint(
                {
                    'epoch': epoch,
                    'model': cfg.EXP_NAME + '.' + cfg.MODEL.NAME,
                    'state_dict': model.state_dict(),
                    'D_state_dict': discriminator.state_dict(),
                    'loss': val_total_loss,
                    'optimizer': optimizer.state_dict(),
                    'D_optimizer': D_optimizer.state_dict(),
                    'train_global_steps': writer_dict['train_global_steps'],
                    'valid_global_steps': writer_dict['valid_global_steps']
                }, best_model, final_output_dir)

            print('\nEpoch {} took {:.2f} hours\n'.format(
                epoch, (time.time() - epoch_start_time) / 3600))

            #if epoch == 3:break
    if master:
        final_model_state_file = os.path.join(
            final_output_dir, 'final_state{}.pth.tar'.format(gpus))
        logger.info(
            '=> saving final model state to {}'.format(final_model_state_file))
        torch.save(model.state_dict(), final_model_state_file)
        writer_dict['writer'].close()

        print(
            '\n[Training Accomplished] {} epochs took {:.2f} hours\n'.format(
                cfg.TRAIN.END_EPOCH - begin_epoch,
                (time.time() - start_time) / 3600))
Exemplo n.º 32
0
def main():
    if config.output_dir is None:
        config.output_dir = 'output'
    if config.restart_training:
        shutil.rmtree(config.output_dir, ignore_errors=True)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    logger = setup_logger(os.path.join(config.output_dir, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(
            config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = TibetanDataset(config.json_path,
                                data_shape=config.data_shape,
                                n=config.n,
                                m=config.m,
                                transform=transforms.ToTensor(),
                                base_path=config.base_path)
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=config.train_batch_size,
                                   shuffle=True,
                                   num_workers=int(config.workers))

    writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.n,
                   scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    # dummy_input = torch.autograd.Variable(torch.Tensor(1, 3, 600, 800).to(device))
    # writer.add_graph(model=model, input_to_model=dummy_input)
    criterion = PSELoss(Lambda=config.Lambda,
                        ratio=config.OHEM_ratio,
                        reduction='mean')
    # optimizer = torch.optim.SGD(model.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and not config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device,
                                      optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            config.lr_decay_step,
            gamma=config.lr_gamma,
            last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         config.lr_decay_step,
                                                         gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples, {} batches per epoch'.format(
        len(train_data), all_step))
    epoch = 0
    best_model = {'recall': 0, 'precision': 0, 'f1': 0, 'models': ''}
    try:
        for epoch in range(start_epoch, config.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler,
                                         train_loader, device, criterion,
                                         epoch, all_step, writer, logger)
            logger.info(
                '[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                    epoch, config.epochs, train_loss,
                    time.time() - start, lr))
            # net_save_path = '{}/PSENet_{}_loss{:.6f}.pth'.format(config.output_dir, epoch, train_loss)
            # save_checkpoint(net_save_path, model, optimizer, epoch, logger)
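            # run the (slow) evaluation only when training loss is low enough:
            # every 4th epoch while 0.3 < loss < 0.4, and every epoch once loss < 0.3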
            if (0.3 < train_loss < 0.4 and epoch % 4 == 0) or train_loss < 0.3:
                recall, precision, f1 = merge_eval(model=model,
                                                   save_path=os.path.join(
                                                       config.output_dir,
                                                       'output'),
                                                   test_path=config.testroot,
                                                   device=device,
                                                   base_path=config.base_path,
                                                   use_sub=config.use_sub)
                logger.info(
                    'test: recall: {:.6f}, precision: {:.6f}, f1: {:.6f}'.
                    format(recall, precision, f1))

                net_save_path = '{}/PSENet_{}_loss{:.6f}_r{:.6f}_p{:.6f}_f1{:.6f}.pth'.format(
                    config.output_dir, epoch, train_loss, recall, precision,
                    f1)
                save_checkpoint(net_save_path, model, optimizer, epoch, logger)
                if f1 > best_model['f1']:
                    best_path = glob.glob(config.output_dir + '/Best_*.pth')
                    for b_path in best_path:
                        if os.path.exists(b_path):
                            os.remove(b_path)

                    best_model['recall'] = recall
                    best_model['precision'] = precision
                    best_model['f1'] = f1
                    best_model['models'] = net_save_path

                    best_save_path = '{}/Best_{}_r{:.6f}_p{:.6f}_f1{:.6f}.pth'.format(
                        config.output_dir, epoch, recall, precision, f1)
                    if os.path.exists(net_save_path):
                        shutil.copyfile(net_save_path, best_save_path)
                    else:
                        save_checkpoint(best_save_path, model, optimizer,
                                        epoch, logger)

                    pse_path = glob.glob(config.output_dir + '/PSENet_*.pth')
                    for p_path in pse_path:
                        if os.path.exists(p_path):
                            os.remove(p_path)

                writer.add_scalar(tag='Test/recall',
                                  scalar_value=recall,
                                  global_step=epoch)
                writer.add_scalar(tag='Test/precision',
                                  scalar_value=precision,
                                  global_step=epoch)
                writer.add_scalar(tag='Test/f1',
                                  scalar_value=f1,
                                  global_step=epoch)
        writer.close()
    except KeyboardInterrupt:
        save_checkpoint('{}/final.pth'.format(config.output_dir), model,
                        optimizer, epoch, logger)
    finally:
        if best_model['models']:
            logger.info(best_model)
Exemplo n.º 33
0
def main():
    tf.set_random_seed(1234)
    np.random.seed(1234)

    # Load MNIST
    data_path = os.path.join('data', 'mnist.pkl.gz')
    x_train, t_train, x_valid, t_valid, x_test, t_test = \
        dataset.load_mnist_realval(data_path)
    x_train = np.vstack([x_train, x_valid])
    x_test = np.random.binomial(1, x_test, size=x_test.shape)
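    # the test set is binarized once here; training batches are re-binarized
    # on the fly inside the graph (see the tf.to_int32 comparison below)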
    n_x = x_train.shape[1]
    n_z = FLAGS.n_z

    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    x_input = tf.placeholder(tf.float32, shape=[None, n_x], name='x')
    x = tf.to_int32(tf.random_uniform(tf.shape(x_input)) <= x_input)
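    # dynamic binarization: each pixel is resampled as a Bernoulli draw from
    # x_input every time the graph is run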
    n = tf.shape(x)[0]

    qz = q_net(x, n_z, n_particles)
    # log_qz = qz.log_prob(qz)
    model, _ = vae({'x': x, 'z': qz}, n, n_x, n_z, n_particles)
    log_px_qz = model.local_log_prob('x')
    eq_ll = tf.reduce_mean(log_px_qz)

    kl = kl_normal_normal(
        qz.distribution.mean, qz.distribution.logstd, 0., 0.)
    kl_term = tf.reduce_mean(tf.reduce_sum(kl, -1))
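    # analytic ELBO: E_q[log p(x|z)] minus the closed-form KL between q(z|x)
    # and the standard normal prior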
    lower_bound = eq_ll - kl_term
    cost = -lower_bound

    # log_pz = model.local_log_prob('z')
    # kl_term_est = tf.reduce_mean(log_qz - log_pz)
    # cost = kl_term

    learning_rate_ph = tf.placeholder(tf.float32, shape=[], name='lr')
    optimizer = tf.train.AdamOptimizer(learning_rate_ph, beta1=0.5)
    infer_op = optimizer.minimize(cost)

    # Generate images
    n_gen = 100
    _, x_logits = vae({}, n_gen, n_x, n_z, 1)
    x_gen = tf.reshape(tf.sigmoid(x_logits), [-1, 28, 28, 1])

    # Define training parameters
    lb_samples = 1
    learning_rate = 1e-4
    epochs = 3000
    batch_size = 128
    iters = x_train.shape[0] // batch_size
    save_image_freq = 10
    save_model_freq = 100
    test_freq = 10
    test_batch_size = 400
    test_iters = x_test.shape[0] // test_batch_size
    result_path = "results/vae_conv_{}_".format(n_z) + \
        time.strftime("%Y%m%d_%H%M%S")

    saver = tf.train.Saver(max_to_keep=10)
    logger = setup_logger('vae_conv', __file__, result_path)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore from the latest checkpoint
        ckpt_file = tf.train.latest_checkpoint(result_path)
        begin_epoch = 1
        if ckpt_file is not None:
            logger.info('Restoring model from {}...'.format(ckpt_file))
            begin_epoch = int(ckpt_file.split('.')[-2]) + 1
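            # checkpoints are named 'vae.epoch.<n>.ckpt', so the epoch number is
            # the second-to-last dot-separated field of the filename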
            saver.restore(sess, ckpt_file)

        for epoch in range(begin_epoch, epochs + 1):
            time_epoch = -time.time()
            np.random.shuffle(x_train)
            lbs = []
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                _, lb = sess.run(
                    [infer_op, lower_bound],
                    feed_dict={x_input: x_batch,
                               learning_rate_ph: learning_rate,
                               n_particles: lb_samples})
                lbs.append(lb)

            time_epoch += time.time()
            logger.info(
                'Epoch {} ({:.1f}s): Lower bound = {}'
                .format(epoch, time_epoch, np.mean(lbs)))

            if epoch % test_freq == 0:
                time_test = -time.time()
                test_lbs = []
                for t in range(test_iters):
                    test_x_batch = x_test[t * test_batch_size:
                                          (t + 1) * test_batch_size]
                    test_lb = sess.run(lower_bound,
                                       feed_dict={x: test_x_batch,
                                                  n_particles: lb_samples})
                    test_lbs.append(test_lb)
                time_test += time.time()
                logger.info('>>> TEST ({:.1f}s)'.format(time_test))
                logger.info('>> Test lower bound = {}'
                            .format(np.mean(test_lbs)))

            if epoch % save_image_freq == 0:
                logger.info('Saving images...')
                images = sess.run(x_gen)
                name = os.path.join(result_path,
                                    "vae.epoch.{}.png".format(epoch))
                save_image_collections(images, name)

            if epoch % save_model_freq == 0:
                logger.info('Saving model...')
                save_path = os.path.join(result_path,
                                         "vae.epoch.{}.ckpt".format(epoch))
                if not os.path.exists(os.path.dirname(save_path)):
                    os.makedirs(os.path.dirname(save_path))
                saver.save(sess, save_path)
                logger.info('Done')