def train(cfg):
    n_class = int(cfg["data"]["n_class"])
    img_h = int(cfg["data"]["img_h"])
    img_w = int(cfg["data"]["img_w"])
    batch_size = int(cfg["training"]["batch_size"])
    epochs = int(cfg["training"]["epochs"])
    lr = float(cfg["training"]["optimizer"]["lr"])
    momentum = float(cfg["training"]["optimizer"]["momentum"])
    w_decay = float(cfg["training"]["optimizer"]["weight_decay"])
    step_size = int(cfg["training"]["lr_schedule"]["step_size"])
    gamma = float(cfg["training"]["lr_schedule"]["gamma"])
    configs = "FCNs-BCEWithLogits_batch{}_epoch{}_RMSprop_scheduler-step{}-gamma{}_lr{}_momentum{}_w_decay{}_input_size{}_03091842".format(
        batch_size, epochs, step_size, gamma, lr, momentum, w_decay, img_h)
    print("Configs:", configs)

    root_dir = cfg["data"]["root_dir"]
    train_filename = cfg["data"]["train_file"]
    val_filename = cfg["data"]["val_file"]
    mean_filename = cfg["data"]["mean_file"]
    class_weight_filename = cfg["data"]["class_weight_file"]

    train_file = os.path.join(root_dir, train_filename)
    print(train_file)
    val_file = os.path.join(root_dir, val_filename)
    mean_file = os.path.join(root_dir, mean_filename)
    class_weight_file = os.path.join(root_dir, class_weight_filename)
    model_dir = cfg["training"]["model_dir"]
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_path = os.path.join(model_dir, configs)

    use_gpu = torch.cuda.is_available()
    num_gpu = list(range(torch.cuda.device_count()))

    continue_train = False
    #MeanRGB_train = ComputeMeanofInput(train_file)
    #MeanRGB_train = np.load(mean_file)
    MeanRGB_train = np.array([0.0, 0.0, 0.0])
    print("MeanRGB_train: {}".format(MeanRGB_train))
    train_data = ScanNet2d(csv_file=train_file,
                           phase='train',
                           trainsize=(img_h, img_w),
                           MeanRGB=MeanRGB_train)
    val_data = ScanNet2d(csv_file=val_file,
                         phase='val',
                         trainsize=(img_h, img_w),
                         MeanRGB=MeanRGB_train)

    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=1)
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=1)

    #class_weight = trainer.computer_class_weights(train_file)
    class_weight = np.load(class_weight_file)
    print("class_weight: {}".format(class_weight))
    class_weight = torch.from_numpy(class_weight)
    print("shape of class weight {}".format(class_weight.shape))
    vgg_model = VGGNet(requires_grad=True, remove_fc=True)
    fcn_model = FCN8s(encoder_net=vgg_model, n_class=n_class)

    if use_gpu:
        ts = time.time()
        vgg_model = vgg_model.cuda()
        fcn_model = fcn_model.cuda()
        fcn_model = nn.DataParallel(fcn_model, device_ids=num_gpu)
        class_weight = class_weight.cuda()
        print("Finish cuda loading, tme elapsed {}".format(time.time() - ts))

    L = nn.BCEWithLogitsLoss(reduction='none')
    optimizer = optim.RMSprop(fcn_model.parameters(),
                              lr=lr,
                              momentum=momentum,
                              weight_decay=w_decay)
    #optimizer = optim.SGD(fcn_model.parameters(), lr=lr, momentum= momentum, weight_decay=w_decay)
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=step_size,
                                    gamma=gamma)
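    # StepLR multiplies the learning rate by `gamma` every `step_size` epochs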

    score_dir = os.path.join("scores", configs)
    if not os.path.exists(score_dir):
        os.makedirs(score_dir)

    log_headers = [
        'epoch', 'train/loss', 'train/acc', 'train/acc_cls', 'train/mean_iu',
        'train/fwavacc', 'val/loss', 'val/acc', 'val/acc_cls', 'val/mean_iu',
        'val/fwavacc', 'elapsed_time'
    ]
    if not os.path.exists(os.path.join(score_dir, 'log.csv')):
        with open(os.path.join(score_dir, 'log.csv'), 'w') as f:
            f.write(','.join(log_headers) + '\n')

    IU_scores = np.zeros((epochs, n_class + 1))
    pixel_scores = np.zeros(epochs)
    writer = SummaryWriter()
    # color_mapping = util.GenerateColorMapping(n_class)
    best_mean_iu = 0
    epoch_loss = 0.0
    if continue_train:
        model_path = "C:\\Users\\ji\\Documents\\FCN-VGG16\\models\\FCNs-BCEWithLogits_batch1_epoch500_RMSprop_scheduler-step50-gamma0.5_lr0.0001_momentum0.0_w_decay1e-05"
        fcn_model = torch.load(model_path)
        fcn_model.train()
    for epoch in range(epochs):

        fcn_model.train()
        ts = time.time()
        running_loss = 0.0
        ######
        label_preds = []
        label_trues = []
        ######
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            if use_gpu:
                inputs = Variable(batch['X'].cuda())
                labels = Variable(batch['Y'].cuda())
            else:
                inputs, labels = Variable(batch['X']), Variable(batch['Y'])

            outputs = fcn_model(inputs)
            #print("out: {}".format(outputs.shape))
            #print("label: {}".format(labels.shape))
            #print(outputs)
            #print(labels)
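            # class-weighted BCE: flatten to (pixels, n_class+1), average over
            # pixels per class, scale by the class weights, then average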
            loss = L(outputs, labels)
            # print(loss.shape)
            loss = loss.permute(0, 2, 3,
                                1).reshape(-1,
                                           n_class + 1)  #.view(-1,n_class+1)
            # print(loss.shape)
            loss = torch.mean(loss, dim=0)
            # print(loss.shape)
            loss = torch.mul(loss, class_weight)
            # print(loss.shape)
            loss = torch.mean(loss)
            # print(loss)
            loss.backward()
            # print("grad")
            # print(fcn_model.outp.weight.grad)
            # print(fcn_model.embs[0].weight.grad)
            optimizer.step()
            #scheduler.step()

            if i == 0 and epoch == 0:
                # count= util.count_parameters(fcn_model)
                # print("number of parameters in model {}".format(count))
                visIn = inputs[:3]
                #print('shape of in {}'.format(visIn[:5].shape))
                visLabel = batch['l'][:3]
            epoch_loss += loss.item()
            running_loss += loss.item()
            # print("loss: {}".format(loss.data))
            if i % 10 == 9 and i != 0:
                print("epoch{}, iter{}, Iterloss: {}".format(
                    epoch, i, running_loss / 10))
                writer.add_scalar('train/iter_loss', running_loss / 10,
                                  epoch * len(train_loader) + i)
                running_loss = 0.0
                # N, _, h, w = outputs.shape
                # targets = batch['l'].cpu().numpy().reshape(N,h,w)
                # outputs = outputs.data.cpu().numpy()
                # preds_v, targets_v = util.visulaize_output(outputs,targets,color_mapping,n_class)
                # writer.add_images('train/predictions',torch.from_numpy(preds_v),dataformats='NHWC')

                # writer.add_images('train/targets',torch.from_numpy(targets_v),dataformats='NHWC')
            #####################################
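            # convert logits to hard per-pixel class predictions for metrics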
            outputs = outputs.data.cpu().numpy()
            N, _, h, w = outputs.shape
            pred = outputs.transpose(0, 2, 3, 1).reshape(
                -1, n_class + 1).argmax(axis=1).reshape(N, h, w)
            target = batch['l'].cpu().numpy().reshape(N, h, w)
            #########
            for lt, lp in zip(target, pred):
                label_trues.append(lt)
                label_preds.append(lp)

        metrics = util.label_accuracy_score(label_trues, label_preds,
                                            n_class + 1)
        with open(os.path.join(score_dir, "log.csv"), 'a') as f:
            log = [epoch] + [epoch_loss] + list(metrics) + [''] * 7
            log = map(str, log)
            f.write(','.join(log) + '\n')
        ########################################
        scheduler.step()  # step once per epoch, after the optimizer updates

        writer.add_scalar('train/epoch_loss', epoch_loss, epoch)
        print("Finish epoch{}, epoch loss {}, time eplapsed {}".format(
            epoch, epoch_loss,
            time.time() - ts))
        epoch_loss = 0.0
        ####################
        writer.add_scalar('train/mean_iu', metrics[2], epoch)
        writer.add_scalar('train/acc', metrics[0], epoch)
        writer.add_scalar('train/acc_cls', metrics[1], epoch)
        ######################
        # Visualize training progress
        with torch.no_grad():
            visOut = fcn_model(visIn)
        preds_v, targets_v = util.visulaize_output(visOut, visLabel, n_class)
        writer.add_images('train/predictions',
                          torch.from_numpy(preds_v),
                          global_step=epoch,
                          dataformats='NHWC')
        writer.add_images('train/targets',
                          torch.from_numpy(targets_v),
                          global_step=epoch,
                          dataformats='NHWC')

        if not os.path.exists(model_path):
            os.makedirs(model_path)

        torch.save(fcn_model, os.path.join(model_path, str(epoch)))
        best_mean_iu = val_model(epoch, val_loader, fcn_model, use_gpu,
                                 n_class, IU_scores, pixel_scores, score_dir,
                                 writer, best_mean_iu, model_path, L)

    writer.flush()
    writer.close()
Example #2
def get_summary_writer():
    name = str(datetime.datetime.now())[:19]
    utils.make_dir(PATH['TF_LOGS'])
    logs_path = os.path.join(PATH['TF_LOGS'], name)
    return SummaryWriter(logs_path)
Example #3
    def step(self, action):
        observation, reward, done, info = super(MicroRTSStatsRecorder,
                                                self).step(action)
        self.raw_rewards += [info["raw_rewards"]]
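        # on episode end, sum the per-step raw reward vectors into
        # per-reward-function totals and expose them via `info`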
        if done:
            raw_rewards = np.array(self.raw_rewards).sum(0)
            raw_names = [str(rf) for rf in self.rfs]
            info['microrts_stats'] = dict(zip(raw_names, raw_rewards))
            self.raw_rewards = []
        return observation, reward, done, info


# TRY NOT TO MODIFY: setup the environment
experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
writer = SummaryWriter(f"runs/{experiment_name}")
writer.add_text(
    'hyperparameters', "|param|value|\n|-|-|\n%s" %
    ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))
if args.prod_mode:
    import wandb
    run = wandb.init(project=args.wandb_project_name,
                     entity=args.wandb_entity,
                     sync_tensorboard=True,
                     config=vars(args),
                     name=experiment_name,
                     monitor_gym=True,
                     save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
Example #4
def test_discrete_bcq(args=get_args()):
    # envs
    env = make_atari_env(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    test_envs = ShmemVectorEnv(
        [lambda: make_atari_env_watch(args) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    feature_net = DQN(*args.state_shape,
                      args.action_shape,
                      device=args.device,
                      features_only=True).to(args.device)
    policy_net = Actor(feature_net,
                       args.action_shape,
                       device=args.device,
                       hidden_sizes=args.hidden_sizes,
                       softmax_output=False).to(args.device)
    imitation_net = Actor(feature_net,
                          args.action_shape,
                          device=args.device,
                          hidden_sizes=args.hidden_sizes,
                          softmax_output=False).to(args.device)
    optim = torch.optim.Adam(list(policy_net.parameters()) +
                             list(imitation_net.parameters()),
                             lr=args.lr)
    # define policy
    policy = DiscreteBCQPolicy(policy_net, imitation_net, optim, args.gamma,
                               args.n_step, args.target_update_freq,
                               args.eps_test, args.unlikely_action_threshold,
                               args.imitation_logits_penalty)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # buffer
    assert os.path.exists(args.load_buffer_name), \
        "Please run atari_dqn.py first to get expert's data buffer."
    if args.load_buffer_name.endswith('.pkl'):
        buffer = pickle.load(open(args.load_buffer_name, "rb"))
    elif args.load_buffer_name.endswith('.hdf5'):
        buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
    else:
        print(f"Unknown buffer format: {args.load_buffer_name}")
        exit(0)

    # collector
    test_collector = Collector(policy, test_envs, exploration_noise=True)

    # log
    log_path = os.path.join(
        args.logdir, args.task, 'bcq',
        f'seed_{args.seed}_{datetime.datetime.now().strftime("%m%d-%H%M%S")}')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer, update_interval=args.log_interval)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return False

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        print("Testing agent ...")
        test_collector.reset()
        result = test_collector.collect(n_episode=args.test_num,
                                        render=args.render)
        pprint.pprint(result)
        rew = result["rews"].mean()
        print(f'Mean reward (over {result["n/ep"]} episodes): {rew}')

    if args.watch:
        watch()
        exit(0)

    result = offline_trainer(policy,
                             buffer,
                             test_collector,
                             args.epoch,
                             args.update_per_epoch,
                             args.test_num,
                             args.batch_size,
                             stop_fn=stop_fn,
                             save_fn=save_fn,
                             logger=logger)

    pprint.pprint(result)
    watch()
def initialize():
    # Training settings
    parser = argparse.ArgumentParser(
        description='PyTorch ImageNet Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-dir',
                        default='/home/cys/datas/data_back/imagenet12/train/',
                        help='path to training data')
    parser.add_argument('--val-dir',
                        default='/home/cys/datas/data_back/imagenet12/val/',
                        help='path to validation data')
    parser.add_argument('--log-dir',
                        default='./logs',
                        help='tensorboard/checkpoint log directory')
    parser.add_argument('--checkpoint-format',
                        default='checkpoint-{epoch}.pth.tar',
                        help='checkpoint file format')
    parser.add_argument('--fp16-allreduce',
                        action='store_true',
                        default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('--batches-per-allreduce',
                        type=int,
                        default=1,
                        help='number of batches processed locally before '
                        'executing allreduce across workers; it multiplies '
                        'total batch size.')

    # Default settings from https://arxiv.org/abs/1706.02677.
    parser.add_argument(
        '--model',
        default='resnet50',
        help=
        'Model (resnet35, resnet50, resnet101, resnet152, resnext50, resnext101)'
    )
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        help='input batch size for training')
    parser.add_argument('--val-batch-size',
                        type=int,
                        default=32,
                        help='input batch size for validation')
    parser.add_argument('--epochs',
                        type=int,
                        default=90,
                        help='number of epochs to train')
    parser.add_argument('--base-lr',
                        type=float,
                        default=0.0125,
                        help='learning rate for a single GPU')
    parser.add_argument('--lr-decay',
                        nargs='+',
                        type=int,
                        default=[30, 60, 80],
                        help='epoch intervals to decay lr')
    parser.add_argument('--warmup-epochs',
                        type=float,
                        default=5,
                        help='number of warmup epochs')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        help='SGD momentum')
    parser.add_argument('--wd',
                        type=float,
                        default=0.00005,
                        help='weight decay')
    parser.add_argument('--label-smoothing',
                        type=float,
                        default=0.1,
                        help='label smoothing (default 0.1)')

    # KFAC Parameters
    parser.add_argument(
        '--kfac-update-freq',
        type=int,
        default=10,
        help='iters between kfac inv ops (0 = no kfac) (default: 10)')
    parser.add_argument('--kfac-cov-update-freq',
                        type=int,
                        default=1,
                        help='iters between kfac cov ops (default: 1)')
    parser.add_argument('--kfac-update-freq-alpha',
                        type=float,
                        default=10,
                        help='KFAC update freq multiplier (default: 10)')
    parser.add_argument('--kfac-update-freq-decay',
                        nargs='+',
                        type=int,
                        default=None,
                        help='KFAC update freq schedule (default None)')
    parser.add_argument(
        '--stat-decay',
        type=float,
        default=0.95,
        help='Alpha value for covariance accumulation (default: 0.95)')
    parser.add_argument('--damping',
                        type=float,
                        default=0.002,
                        help='KFAC damping factor (default: 0.002)')
    parser.add_argument('--damping-alpha',
                        type=float,
                        default=0.5,
                        help='KFAC damping decay factor (default: 0.5)')
    parser.add_argument('--damping-decay',
                        nargs='+',
                        type=int,
                        default=[40, 80],
                        help='KFAC damping decay schedule (default [40, 80])')
    parser.add_argument('--kl-clip',
                        type=float,
                        default=0.001,
                        help='KL clip (default: 0.001)')
    parser.add_argument(
        '--diag-blocks',
        type=int,
        default=1,
        help='Number of blocks to approx layer factor with (default: 1)')
    parser.add_argument(
        '--diag-warmup',
        type=int,
        default=0,
        help='Epoch to start diag block approximation at (default: 0)')
    parser.add_argument(
        '--distribute-layer-factors',
        action='store_true',
        default=None,
        help='Compute A and G for a single layer on different workers. '
        'None to determine automatically based on worker and '
        'layer count.')

    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--single-threaded',
                        action='store_true',
                        default=False,
                        help='disables multi-threaded dataloading')
    parser.add_argument('--seed', type=int, default=42, help='random seed')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    hvd.init()
    torch.manual_seed(args.seed)
    args.verbose = 1 if hvd.rank() == 0 else 0
    if args.verbose:
        print(args)

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    cudnn.benchmark = True

    args.log_dir = os.path.join(
        args.log_dir, "imagenet_resnet50_kfac{}_gpu_{}_{}".format(
            args.kfac_update_freq, hvd.size(),
            datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
    args.checkpoint_format = os.path.join(args.log_dir, args.checkpoint_format)
    os.makedirs(args.log_dir, exist_ok=True)

    # If set > 0, will resume training from a given checkpoint.
    args.resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            args.resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    args.resume_from_epoch = hvd.broadcast(torch.tensor(
        args.resume_from_epoch),
                                           root_rank=0,
                                           name='resume_from_epoch').item()

    # Horovod: write TensorBoard logs on first worker.
    try:
        if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
            from torch.utils.tensorboard import SummaryWriter
        else:
            from tensorboardX import SummaryWriter
        args.log_writer = SummaryWriter(
            args.log_dir) if hvd.rank() == 0 else None
    except ImportError:
        args.log_writer = None

    return args
Example #6
    def _setup_writer(self):
        self.writer = SummaryWriter(log_dir=self.log_dir)

    def __del__(self):
        self._remove()
Example #7
[MODEL]
- {cfg.model['type']}

[OPTIMIZER]
- {optimizer.__class__.__name__}
- hyper params : {optimizer.defaults}

[SCHEDULER]
- {scheduler.__class__.__name__}
- state_dict : {scheduler.state_dict()}

<-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><->
''')
min_val_loss = 99999
with SummaryWriter(log_dir=log_dir) as writer:
    for epoch in range(initial_epoch, cfg.runtime['epochs'] + initial_epoch):
        losses = {
            'train': defaultdict(lambda: 0),
            'val': defaultdict(lambda: 0)
        }
        counts = {'train': 0, 'val': 0}
        result = []
        for phase, (images, image_metas, gt_bboxes, gt_labels) in tqdm(
                chain(dataloaders),
                total=sum(len(dl) for dl in dataloaders.values()),
                desc=f'[Epoch {epoch:3}]'):

            if phase == 'train':
                model.train()
                optimizer.zero_grad()
def calc_qvals(rewards):
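    # discounted returns computed back-to-front; the mean is subtracted as a
    # constant baseline to reduce the variance of the policy gradient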
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    res = list(reversed(res))
    mean_q = np.mean(res)
    return [q - mean_q for q in res]


if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-reinforce-baseline")

    net = PGN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    agent = ptan.agent.PolicyAgent(
        net, preprocessor=ptan.agent.float32_preprocessor, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=GAMMA)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    step_idx = 0
    done_episodes = 0
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every N steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the sampling window over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output path')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='TensorBoard log path')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at word level')
    parser.add_argument('--bpe_token', action='store_true', help='use subword (BPE) tokenization')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡

    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without fp16 support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer, min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                                          t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
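        # visit the tokenized pieces in a random order each epoch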
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
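            # slice the token stream into n_ctx-length windows, advancing by `stride`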
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point: start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size: (step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
                if (overall_step + 1) % log_step == 0:
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
Example #10
File: train.py  Project: techzizou/yolov5
    # Hyperparameters
    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps
        if 'box' not in hyp:
            warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' %
                 (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120'))
            hyp['box'] = hyp.pop('giou')

    # Train
    logger.info(opt)
    if not opt.evolve:
        tb_writer = None  # init loggers
        if opt.global_rank in [-1, 0]:
            logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/')
            tb_writer = SummaryWriter(opt.save_dir)  # Tensorboard
        train(hyp, opt, device, tb_writer, wandb)

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
Example #11
# load dataset
train_dataset = datasets.MNIST(root='dataset/',
                               train=True,
                               transform=transforms.ToTensor(),
                               download=True)
train_loader = DataLoader(dataset=train_dataset,
                          shuffle=True,
                          batch_size=batch_size)

model = CNN(in_channels=in_channels, num_classes=num_classes)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0)
writer = SummaryWriter('runs/MNIST/tryingout_tensorboard')

step = 0
for epoch in range(epochs):
    losses = []
    accuracies = []
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(device)
        targets = targets.to(device)

        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
# ============================ step 3/5 loss function ============================
criterion = nn.CrossEntropyLoss()  # choose the loss function

# ============================ step 4/5 optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)  # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10,
                                            gamma=0.1)  # set the LR decay schedule

# ============================ step 5/5 training ============================
train_curve = list()
valid_curve = list()

iter_count = 0

# build the SummaryWriter
writer = SummaryWriter(comment='test_your_comment',
                       filename_suffix="_test_your_filename_suffix")

for epoch in range(MAX_EPOCH):

    loss_mean = 0.
    correct = 0.
    total = 0.

    net.train()
    for i, data in enumerate(train_loader):

        iter_count += 1

        # forward
        inputs, labels = data
        outputs = net(inputs)
Example #13
print("\n\tLoad Complete !")

#load pretrained model and reset fully connected layer
model = resnet50(pretrained = True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 2)
model.cuda()

# Observe that all parameters are being optimized
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

#start training
summary = SummaryWriter()
training_accuracy = []
validation_accuracy = []
for epoch in range(num_epochs):

    # training set -- perform model training
    epoch_training_loss = 0.0
    num_batches = 0

    for batch_num, training_batch in enumerate(train_loader):        # 'enumerate' is a super helpful function
        # split training data into inputs and labels
        inputs, labels = training_batch                              # 'training_batch' is a list
        # wrap data in 'Variable'
        inputs, labels = torch.autograd.Variable(inputs.cuda()), torch.autograd.Variable(labels.cuda())
        # Make gradients zero for parameters 'W', 'b'
        optimizer.zero_grad()
Example #14
parser.add_argument('--prefix', type=str, default='', help='log prefix')

args = parser.parse_args()

print("\n==> Arguments passed in were:\n")
for key, value in args.__dict__.items():
    print(f"{key}: {value}")
print("")

torch.manual_seed(args.seed)
np.random.seed(args.seed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
log_dir = f'logs/paper/{args.prefix}{args.optim}-{args.initial_lr}-seed{args.seed}/'
writer = SummaryWriter(log_dir=log_dir)
os.mkdir(log_dir + 'samples')

#########################################
#### Prepare data #######################
#########################################

print("\n==> Downloading CIFAR-10 dataset...\n")

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])  # normalizes pixels to be in range (-1,1)

trainset = torchvision.datasets.CIFAR10(root='./data',
Example #15
File: train.py  Project: mmrebuttal/code
parser.add_argument("--model_path", default="")
parser.add_argument("--texture", default="")


if __name__ == '__main__':
    opt = parser.parse_args()
    # ===========================================================
    # Set train dataset & test dataset
    # ===========================================================
    opt.main_path, main_filename = mkdir_path(opt,os.path.basename(__file__))
    UPSCALE_FACTOR = opt.upscale_factor
    NUM_EPOCHS = opt.num_epochs
    opt.GPU_IN_USE = torch.cuda.is_available()
    opt.device = torch.device('cuda' if opt.GPU_IN_USE else 'cpu')
    zipDir(os.getcwd(), opt.main_path+"code.zip")
    writer = SummaryWriter(opt.log_path)

    print('===> Loading datasets')
    train_loader = create_data_loader(opt) #create_old_loader(args)   #create_data_loader(args)
    val_loader = create_test_loader(opt)


    print('===> Construct network')
    netG = SKINET(in_channels=3, out_channels=3, nf=64, scale_factor=1).to(opt.device)     # netG = Generator(UPSCALE_FACTOR)
    netD = Discriminator()
    generator_criterion = GeneratorLoss()
    bce_loss = torch.nn.BCELoss()
    print('# generator parameters:', sum(param.numel() for param in netG.parameters()))
    print('# discriminator parameters:', sum(param.numel() for param in netD.parameters()))
    # print(netG)
    # print(netD)
Example #16
        if 'box' not in hyp:
            warn(
                'Compatibility: %s missing "box" which was renamed from "giou" in %s'
                % (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120'))
            hyp['box'] = hyp.pop('giou')

    # Train
    logger.info(opt)
    if not opt.evolve:
        tb_writer, wandb = None, None  # init loggers
        if opt.global_rank in [-1, 0]:
            # Tensorboard
            logger.info(
                f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/'
            )
            tb_writer = SummaryWriter(opt.save_dir)  # runs/train/exp

            # W&B
            try:
                import wandb

                assert os.environ.get('WANDB_DISABLED') != 'true'
            except (ImportError, AssertionError):
                logger.info(
                    "Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)"
                )
                wandb = None
        train(hyp, opt, device, tb_writer, wandb)

    # Evolve hyperparameters (optional)
    else:
Example #17
trainset = trainset[:cutoff]

if p.shuffle_dataset:
    trainset = trainset.shuffle()
n_features = trainset.get(0).x.shape[1]
print('Setting up model...')
model = MultiScaleFeaStNet(4, heads=p.heads).to(device)
optimizer = torch.optim.Adam(model.parameters(),
                             lr=learn_rate,
                             weight_decay=p.weight_decay)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
#                                                       factor=p.lr_decay,
#                                                       patience=p.patience)

writer = SummaryWriter(
    comment='model:{}_lr:{}_lr_decay:{}_shuffle:{}_seed:{}'.format(
        p.version, learn_rate, p.lr_decay, p.shuffle_dataset, p.random_seed))

# axes = [0, 1, 2]
max_roc_auc = 0

# ---- Training ----
print('Training...')
for epoch in range(1, epochs + 1):

    train_loader = DataLoader(trainset,
                              shuffle=p.shuffle_dataset,
                              batch_size=p.batch_size)
    val_loader = DataLoader(validset,
                            shuffle=False,
                            batch_size=p.test_batch_size)
def train(args):
    ## Set up training parameters
    mode = args.mode
    train_continue = args.train_continue

    lr = args.lr
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    data_dir = args.data_dir
    ckpt_dir = args.ckpt_dir
    log_dir = args.log_dir
    result_dir = args.result_dir

    task = args.task
    opts = [args.opts[0], np.asarray(args.opts[1:]).astype(float)]

    ny = args.ny
    nx = args.nx
    nch = args.nch
    nker = args.nker

    wgt_cycle = args.wgt_cycle
    wgt_ident = args.wgt_ident
    norm = args.norm

    network = args.network
    learning_type = args.learning_type

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("mode: %s" % mode)
    print("norm: %s" % norm)

    print("learning rate: %.4e" % lr)
    print("batch size: %d" % batch_size)
    print("number of epoch: %d" % num_epoch)

    print("task: %s" % task)
    print("opts: %s" % opts)

    print("network: %s" % network)
    print("learning type: %s" % learning_type)

    print("data dir: %s" % data_dir)
    print("ckpt dir: %s" % ckpt_dir)
    print("log dir: %s" % log_dir)
    print("result dir: %s" % result_dir)

    print("device: %s" % device)

    ## Create directories
    result_dir_train = os.path.join(result_dir, 'train')

    if not os.path.exists(result_dir_train):
        os.makedirs(os.path.join(result_dir_train, 'png', 'a2b'))
        os.makedirs(os.path.join(result_dir_train, 'png', 'b2a'))

    ## Train the network
    if mode == 'train':
        transform_train = transforms.Compose([
            Resize(shape=(286, 286, nch)),
            RandomCrop((ny, nx)),
            Normalization(mean=MEAN, std=STD)
        ])

        dataset_train = Dataset(data_dir=os.path.join(data_dir, 'train'),
                                transform=transform_train,
                                task=task,
                                data_type='both')
        loader_train = DataLoader(dataset_train,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=NUM_WORKER)

        # set up other auxiliary variables
        num_data_train = len(dataset_train)
        num_batch_train = np.ceil(num_data_train / batch_size)

    ## Create the networks
    if network == "CycleGAN":
        netG_a2b = CycleGAN(in_channels=nch,
                            out_channels=nch,
                            nker=nker,
                            norm=norm,
                            nblk=9).to(device)
        netG_b2a = CycleGAN(in_channels=nch,
                            out_channels=nch,
                            nker=nker,
                            norm=norm,
                            nblk=9).to(device)

        netD_a = Discriminator(in_channels=nch,
                               out_channels=1,
                               nker=nker,
                               norm=norm).to(device)
        netD_b = Discriminator(in_channels=nch,
                               out_channels=1,
                               nker=nker,
                               norm=norm).to(device)

        init_weights(netG_a2b, init_type='normal', init_gain=0.02)
        init_weights(netG_b2a, init_type='normal', init_gain=0.02)

        init_weights(netD_a, init_type='normal', init_gain=0.02)
        init_weights(netD_b, init_type='normal', init_gain=0.02)

    ## Define the loss functions
    fn_cycle = nn.L1Loss().to(device)
    fn_gan = nn.BCELoss().to(device)
    fn_ident = nn.L1Loss().to(device)

    ## Set up the optimizers
    optimG = torch.optim.Adam(itertools.chain(netG_a2b.parameters(),
                                              netG_b2a.parameters()),
                              lr=lr,
                              betas=(0.5, 0.999))
    optimD = torch.optim.Adam(itertools.chain(netD_a.parameters(),
                                              netD_b.parameters()),
                              lr=lr,
                              betas=(0.5, 0.999))

    ## Set up other auxiliary functions
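    # fn_tonumpy: (N, C, H, W) tensor -> (N, H, W, C) numpy array
    # fn_denorm: undo the Normalization(mean=MEAN, std=STD) transform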
    fn_tonumpy = lambda x: x.to('cpu').detach().numpy().transpose(0, 2, 3, 1)
    fn_denorm = lambda x: (x * STD) + MEAN

    cmap = None

    ## Set up a SummaryWriter for TensorBoard logging
    writer_train = SummaryWriter(log_dir=os.path.join(log_dir, 'train'))

    ## Train the network
    st_epoch = 0

    # TRAIN MODE
    if mode == 'train':
        if train_continue == "on":
            netG_a2b, netG_b2a, \
            netD_a, netD_b, \
            optimG, optimD, st_epoch = load(ckpt_dir=ckpt_dir,
                                            netG_a2b=netG_a2b, netG_b2a=netG_b2a,
                                            netD_a=netD_a, netD_b=netD_b,
                                            optimG=optimG, optimD=optimD)

        for epoch in range(st_epoch + 1, num_epoch + 1):
            netG_a2b.train()
            netG_b2a.train()
            netD_a.train()
            netD_b.train()

            loss_G_a2b_train = []
            loss_G_b2a_train = []
            loss_D_a_train = []
            loss_D_b_train = []
            loss_cycle_a_train = []
            loss_cycle_b_train = []
            loss_ident_a_train = []
            loss_ident_b_train = []

            for batch, data in enumerate(loader_train, 1):
                input_a = data['data_a'].to(device)
                input_b = data['data_b'].to(device)

                # forward netG
                output_b = netG_a2b(input_a)
                output_a = netG_b2a(input_b)

                recon_b = netG_a2b(output_a)
                recon_a = netG_b2a(output_b)

                # backward netD
                set_requires_grad([netD_a, netD_b], True)
                optimD.zero_grad()

                # backward netD_a
                pred_real_a = netD_a(input_a)
                pred_fake_a = netD_a(output_a.detach())

                loss_D_a_real = fn_gan(pred_real_a,
                                       torch.ones_like(pred_real_a))
                loss_D_a_fake = fn_gan(pred_fake_a,
                                       torch.zeros_like(pred_fake_a))
                loss_D_a = 0.5 * (loss_D_a_real + loss_D_a_fake)

                # backward netD_b
                pred_real_b = netD_b(input_b)
                pred_fake_b = netD_b(output_b.detach())

                loss_D_b_real = fn_gan(pred_real_b,
                                       torch.ones_like(pred_real_b))
                loss_D_b_fake = fn_gan(pred_fake_b,
                                       torch.zeros_like(pred_fake_b))
                loss_D_b = 0.5 * (loss_D_b_real + loss_D_b_fake)

                loss_D = loss_D_a + loss_D_b
                loss_D.backward()
                optimD.step()

                # backward netG
                set_requires_grad([netD_a, netD_b], False)
                optimG.zero_grad()

                pred_fake_a = netD_a(output_a)
                pred_fake_b = netD_b(output_b)

                loss_G_a2b = fn_gan(pred_fake_a, torch.ones_like(pred_fake_a))
                loss_G_b2a = fn_gan(pred_fake_b, torch.ones_like(pred_fake_b))

                loss_cycle_a = fn_cycle(input_a, recon_a)
                loss_cycle_b = fn_cycle(input_b, recon_b)

                ident_a = netG_b2a(input_a)
                ident_b = netG_a2b(input_b)

                loss_ident_a = fn_ident(input_a, ident_a)
                loss_ident_b = fn_ident(input_b, ident_b)

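                # total generator loss: adversarial terms plus weighted
                # cycle-consistency and identity terms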
                loss_G = (loss_G_a2b + loss_G_b2a) + \
                         wgt_cycle * (loss_cycle_a + loss_cycle_b) + \
                         wgt_cycle * wgt_ident * (loss_ident_a + loss_ident_b)

                loss_G.backward()
                optimG.step()

                # accumulate the loss values
                loss_G_a2b_train += [loss_G_a2b.item()]
                loss_G_b2a_train += [loss_G_b2a.item()]

                loss_D_a_train += [loss_D_a.item()]
                loss_D_b_train += [loss_D_b.item()]

                loss_cycle_a_train += [loss_cycle_a.item()]
                loss_cycle_b_train += [loss_cycle_b.item()]

                loss_ident_a_train += [loss_ident_a.item()]
                loss_ident_b_train += [loss_ident_b.item()]

                print(
                    "TRAIN: EPOCH %04d / %04d | BATCH %04d / %04d | "
                    "GEN a2b %.4f b2a %.4f | "
                    "DISC a %.4f b %.4f | "
                    "CYCLE a %.4f b %.4f | "
                    "IDENT a %.4f b %.4f | " %
                    (epoch, num_epoch, batch, num_batch_train,
                     np.mean(loss_G_a2b_train), np.mean(loss_G_b2a_train),
                     np.mean(loss_D_a_train), np.mean(loss_D_b_train),
                     np.mean(loss_cycle_a_train), np.mean(loss_cycle_b_train),
                     np.mean(loss_ident_a_train), np.mean(loss_ident_b_train)))

                if batch % 20 == 0:
                    # save images to TensorBoard
                    input_a = fn_tonumpy(fn_denorm(input_a)).squeeze()
                    input_b = fn_tonumpy(fn_denorm(input_b)).squeeze()
                    output_a = fn_tonumpy(fn_denorm(output_a)).squeeze()
                    output_b = fn_tonumpy(fn_denorm(output_b)).squeeze()

                    input_a = np.clip(input_a, a_min=0, a_max=1)
                    input_b = np.clip(input_b, a_min=0, a_max=1)
                    output_a = np.clip(output_a, a_min=0, a_max=1)
                    output_b = np.clip(output_b, a_min=0, a_max=1)

                    id = num_batch_train * (epoch - 1) + batch

                    plt.imsave(os.path.join(result_dir_train, 'png', 'a2b',
                                            '%04d_input_a.png' % id),
                               input_a[0],
                               cmap=cmap)
                    plt.imsave(os.path.join(result_dir_train, 'png', 'a2b',
                                            '%04d_output_b.png' % id),
                               output_b[0],
                               cmap=cmap)

                    plt.imsave(os.path.join(result_dir_train, 'png', 'b2a',
                                            '%04d_input_b.png' % id),
                               input_b[0],
                               cmap=cmap)
                    plt.imsave(os.path.join(result_dir_train, 'png', 'b2a',
                                            '%04d_output_a.png' % id),
                               output_a[0],
                               cmap=cmap)

                    writer_train.add_image('input_a',
                                           input_a,
                                           id,
                                           dataformats='NHWC')
                    writer_train.add_image('input_b',
                                           input_b,
                                           id,
                                           dataformats='NHWC')
                    writer_train.add_image('output_a',
                                           output_a,
                                           id,
                                           dataformats='NHWC')
                    writer_train.add_image('output_b',
                                           output_b,
                                           id,
                                           dataformats='NHWC')

            writer_train.add_scalar('loss_G_a2b', np.mean(loss_G_a2b_train),
                                    epoch)
            writer_train.add_scalar('loss_G_b2a', np.mean(loss_G_b2a_train),
                                    epoch)

            writer_train.add_scalar('loss_D_a', np.mean(loss_D_a_train), epoch)
            writer_train.add_scalar('loss_D_b', np.mean(loss_D_b_train), epoch)

            writer_train.add_scalar('loss_cycle_a',
                                    np.mean(loss_cycle_a_train), epoch)
            writer_train.add_scalar('loss_cycle_b',
                                    np.mean(loss_cycle_b_train), epoch)

            writer_train.add_scalar('loss_ident_a',
                                    np.mean(loss_ident_a_train), epoch)
            writer_train.add_scalar('loss_ident_b',
                                    np.mean(loss_ident_b_train), epoch)

            if epoch % 2 == 0 or epoch == num_epoch:
                save(ckpt_dir=ckpt_dir,
                     epoch=epoch,
                     netG_a2b=netG_a2b,
                     netG_b2a=netG_b2a,
                     netD_a=netD_a,
                     netD_b=netD_b,
                     optimG=optimG,
                     optimD=optimD)

        writer_train.close()
Example #19
    D_net.load_state_dict(checkpoint['D_state_dict'])

# update the step_idx
if LOAD_NET:
    step_idx = common.find_stepidx(load_fileName, "-", r"\.")
else:
    step_idx = 0

# create the target net (stable)
tgt_D_net = common.TargetNet(D_net)

# define the net_processor
net_processor = common.GANPreprocessor(G_net, D_net, tgt_D_net.target_model)

# define the writer
writer = SummaryWriter(log_dir="../runs/GAN/" + dt_string, comment="GAN_stock_trading")

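# track running GAN losses; stop_loss=np.inf means the tracker never halts training early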
with common.gan_lossTracker(writer, stop_loss=np.inf, mean_size=1000) as loss_tracker:
    while True:
        step_idx += 1
        net_processor.train_mode(batch_size=BATCH_SIZE)
        # generate the training set
        X_v, K_v, x_v, k_v = train_container.generate_batch(BATCH_SIZE)
        input_real = data.D_preprocess(X_v, K_v, x_v, k_v)

        # train D by input_real
        D_W = D_net(input_real)

        # train D for input_fake
        optimizerD.zero_grad()
        x_v_, k_v_ = G_net(X_v, K_v)
Example #20
# Model
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter(f"runs/Loss_plot")
step = 0

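# BucketIterator batches sentences of similar source length to minimize padding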
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size,
                      num_layers, enc_dropout).to(device)

decoder_net = Decoder(
    input_size_decoder,
Example #21
batch_size = 32

# Model hyperparameters
src_vocab_size = len(lithuanian.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 256
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
src_pad_idx = lithuanian.vocab.stoi["<pad>"]  # pad index should come from the source-language vocab

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0
step_valid = 0

# build into torchtext iterators
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

# Initiate transformer model
model = Transformer(
    embedding_size,
Example #22
        "WARNING: You have a CUDA device, so you should probably run with using cuda"
    )

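# a list of gpu ids requests DataParallel; a single int pins training to one device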
is_data_parallel = False
if isinstance(config['cuda']['gpu_id'], list):
    is_data_parallel = True
    cuda_str = 'cuda:' + str(config['cuda']['gpu_id'][0])
elif isinstance(config['cuda']['gpu_id'], int):
    cuda_str = 'cuda:' + str(config['cuda']['gpu_id'])
else:
    raise ValueError('Check out gpu id in config')

device = torch.device(cuda_str if config['cuda']['using_cuda'] else "cpu")

# tensorboard
summary_writer = SummaryWriter(os.path.join(config['model']['exp_path'],
                                            'log'))

# Data
target_classes = utils.read_txt(config['params']['classes'])
num_classes = len(target_classes)
img_size = config['params']['image_size'].split('x')
img_size = (int(img_size[0]), int(img_size[1]))

print('==> Preparing data..')
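# boxes are in Pascal VOC format; boxes less than 30% visible after augmentation are dropped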
bbox_params = A.BboxParams(format='pascal_voc', min_visibility=0.3)
train_transforms = A.Compose(
    [
        A.Resize(height=img_size[0], width=img_size[1], p=1.0),
        A.HorizontalFlip(p=0.5),
        # A.OneOf([
        #     A.Sequential([
Example #23
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

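    # derive the total number of optimization steps from max_steps or from the epoch count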
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for e in train_iterator:
        if args.local_rank != -1:
            train_dataloader.sampler.set_epoch(e)
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

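            # SQuAD-style inputs: token ids, attention mask, segment ids, and answer span positions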
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert", "bart"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
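            # step the optimizer only once every gradient_accumulation_steps mini-batches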
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #24
    def _set_writer(self, **kwargs):
        if not kwargs:
            kwargs = {'log_dir': os.path.join(self.workdir, 'tensorboard')}
        self.summary_writer_path = kwargs['log_dir']
        self.writer = SummaryWriter(**kwargs)
Example #25
storage_client = storage.Client()
bucket_name = opt.job_dir[:-1] if opt.job_dir.endswith('/') else opt.job_dir
bucket_name = bucket_name.replace('gs://', '')
bucket = storage_client.bucket(bucket_name)


def upload_file(src: str):
    blob = bucket.blob(os.path.basename(src))
    blob.upload_from_filename(src)


# ------------
# Tensorboard configuration
# ------------

writer = SummaryWriter(opt.job_dir)

# ----------
#  Training
# ----------


def sample_image(writer: SummaryWriter, samples_per_class: int,
                 iterations: int):
    z = torch.randn(samples_per_class * n_classes, opt.latent_dim).to(device)
    # one label per sample: each of the n_classes class ids repeated samples_per_class times
    labels = np.array(
        [num for num in range(n_classes) for _ in range(samples_per_class)])
    labels = torch.Tensor(labels).to(device)
    gen_imgs = generator(z, labels)
    writer.add_images('gan_grid', gen_imgs, iterations)
Example #26
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

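    # pad each batch to its longest sequence, using the tokenizer's pad id when one exists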
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
            args.model_name_or_path
            and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #27
        "batch_size": BATCH_SIZE,
        "layer_size": LAYER_SIZE,
        "nStep": nstep,
        "gamma": GAMMA,
        "tau": TAU,
        "learningRate": LR,
        "epsilon": EPS,
        "updateEvery": UPDATE_EVERY,
        "nUpdate": NUPDATES
    }

    np.random.seed(seed)
    env = BipedalWalker()

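    # timestamped TensorBoard run directory; hyperparameters are logged via add_hparams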
    now = datetime.now()
    writer = SummaryWriter('logdir/' + now.strftime("%Y%m%d-%H%M%S") + "/")
    writer.add_hparams(paramDict, {})

    env.seed(seed)
    action_size = env.action_space.shape[0]
    state_size = env.observation_space.shape[0]

    agent = DQN_Agent(state_size=state_size,
                      action_size=action_size,
                      layer_size=LAYER_SIZE,
                      BATCH_SIZE=BATCH_SIZE,
                      BUFFER_SIZE=BUFFER_SIZE,
                      PER=per,
                      LR=LR,
                      EPS=EPS,
                      Nstep=nstep,
Example #28
    'pin_memory': True
}
val_params = {
    'batch_size': args.batch,
    'shuffle': False,
    'num_workers': 20,
    'pin_memory': True
}

num_epochs = args.epochs

save_path = os.path.join(args.save_root, args.expt)
utils.create_dirs(save_path)

## tensorboard summary logger
writer = SummaryWriter(log_dir=os.path.join(save_path, 'logs'))

## configure runtime logging
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(save_path, 'logs', 'logfile.log'),
                    format='%(asctime)s - %(message)s',
                    filemode='w')
# logger=logging.getLogger()#.setLevel(logging.INFO)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
logging.getLogger('').addHandler(console)
logging.info(args)

## dataloaders using hdf5 file
data_path = None
Example #29
def main():
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch'],
                        default='none',
                        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            # rename the experiment folder if it already exists
            util.mkdir_and_rename(opt['path']['experiments_root'])
            util.mkdirs(
                (path for key, path in opt['path'].items()
                 if not key == 'experiments_root'
                 and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base',
                          opt['path']['log'],
                          'train_' + opt['name'],
                          level=logging.INFO,
                          screen=True,
                          tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        util.setup_logger('base',
                          opt['path']['log'],
                          'train',
                          level=logging.INFO,
                          screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if dataset_opt['use_text']:
                from lib.datasets.dataset import LmdbDataset as genTextDataset
                print("Start Load IIIT5K dataset!")
                text_dataset = genTextDataset(dataset_opt['dataset_dir'],
                                              dataset_opt['voc_type'],
                                              dataset_opt['max_len'],
                                              dataset_opt['num_sample'])
                print("Load IIIT5K dataset success!")
                text_dataloader = DataLoader(
                    text_dataset,
                    batch_size=dataset_opt['batch_size'],
                    num_workers=dataset_opt['n_workers'],
                    shuffle=True,
                    pin_memory=True,
                    drop_last=True,
                    collate_fn=AlignCollate(imgH=height,
                                            imgW=width,
                                            keep_ratio=keep_ratio))

            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info(
                    'Number of train images: {:,d}, iters: {:,d}'.format(
                        len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError(
                'Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None
    assert len(train_loader) == len(
        text_dataloader), "text dataloader must match the training dataloader length"

    #### create model
    model = create_model(opt, text_dataset)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
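        # iterate image batches and text batches in lockstep (their lengths are asserted equal above)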
        for _, (train_data,
                input_dict) in enumerate(zip(train_loader, text_dataloader)):
            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate
            model.update_learning_rate(current_step,
                                       warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(
                    epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        if rank <= 0:
                            tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)
            #### validation
            if opt['datasets'].get(
                    'val',
                    None) and current_step % opt['train']['val_freq'] == 0:
                if opt['model'] in [
                        'sr', 'srgan'
                ] and rank <= 0:  # image restoration validation
                    # does not support multi-GPU validation
                    pbar = util.ProgressBar(len(val_loader))
                    avg_psnr = 0.
                    idx = 0
                    for val_data in val_loader:
                        idx += 1
                        img_name = os.path.splitext(
                            os.path.basename(val_data['LQ_path'][0]))[0]
                        img_dir = os.path.join(opt['path']['val_images'],
                                               img_name)
                        util.mkdir(img_dir)

                        model.feed_data(val_data)
                        model.test()

                        visuals = model.get_current_visuals()
                        sr_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8

                        # Save SR images for reference
                        save_img_path = os.path.join(
                            img_dir,
                            '{:s}_{:d}.png'.format(img_name, current_step))
                        util.save_img(sr_img, save_img_path)

                        # calculate PSNR
                        sr_img, gt_img = util.crop_border([sr_img, gt_img],
                                                          opt['scale'])
                        avg_psnr += util.calculate_psnr(sr_img, gt_img)
                        pbar.update('Test {}'.format(img_name))

                    avg_psnr = avg_psnr / idx

                    # log
                    logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        tb_logger.add_scalar('psnr', avg_psnr, current_step)
                else:  # video restoration validation
                    if opt['dist']:
                        # multi-GPU testing
                        psnr_rlt = {}  # with border and center frames
                        if rank == 0:
                            pbar = util.ProgressBar(len(val_set))
                        for idx in range(rank, len(val_set), world_size):
                            val_data = val_set[idx]
                            val_data['LQs'].unsqueeze_(0)
                            val_data['GT'].unsqueeze_(0)
                            folder = val_data['folder']
                            idx_d, max_idx = val_data['idx'].split('/')
                            idx_d, max_idx = int(idx_d), int(max_idx)
                            if psnr_rlt.get(folder, None) is None:
                                psnr_rlt[folder] = torch.zeros(
                                    max_idx,
                                    dtype=torch.float32,
                                    device='cuda')
                            # tmp = torch.zeros(max_idx, dtype=torch.float32, device='cuda')
                            model.feed_data(val_data)
                            model.test()
                            visuals = model.get_current_visuals()
                            rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                            gt_img = util.tensor2img(visuals['GT'])  # uint8
                            # calculate PSNR
                            psnr_rlt[folder][idx_d] = util.calculate_psnr(
                                rlt_img, gt_img)

                            if rank == 0:
                                for _ in range(world_size):
                                    pbar.update('Test {} - {}/{}'.format(
                                        folder, idx_d, max_idx))
                        # collect the per-folder PSNR tensors from all ranks onto rank 0
                        for _, v in psnr_rlt.items():
                            dist.reduce(v, 0)
                        dist.barrier()

                        if rank == 0:
                            psnr_rlt_avg = {}
                            psnr_total_avg = 0.
                            for k, v in psnr_rlt.items():
                                psnr_rlt_avg[k] = torch.mean(v).cpu().item()
                                psnr_total_avg += psnr_rlt_avg[k]
                            psnr_total_avg /= len(psnr_rlt)
                            log_s = '# Validation # PSNR: {:.4e}:'.format(
                                psnr_total_avg)
                            for k, v in psnr_rlt_avg.items():
                                log_s += ' {}: {:.4e}'.format(k, v)
                            logger.info(log_s)
                            if opt['use_tb_logger'] and 'debug' not in opt[
                                    'name']:
                                tb_logger.add_scalar('psnr_avg',
                                                     psnr_total_avg,
                                                     current_step)
                                for k, v in psnr_rlt_avg.items():
                                    tb_logger.add_scalar(k, v, current_step)
                    else:
                        pbar = util.ProgressBar(len(val_loader))
                        psnr_rlt = {}  # with border and center frames
                        psnr_rlt_avg = {}
                        psnr_total_avg = 0.
                        for val_data in val_loader:
                            folder = val_data['folder'][0]
                            idx_d = val_data['idx'].item()
                            # border = val_data['border'].item()
                            if psnr_rlt.get(folder, None) is None:
                                psnr_rlt[folder] = []

                            model.feed_data(val_data)
                            model.test()
                            visuals = model.get_current_visuals()
                            rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                            gt_img = util.tensor2img(visuals['GT'])  # uint8

                            # calculate PSNR
                            psnr = util.calculate_psnr(rlt_img, gt_img)
                            psnr_rlt[folder].append(psnr)
                            pbar.update('Test {} - {}'.format(folder, idx_d))
                        for k, v in psnr_rlt.items():
                            psnr_rlt_avg[k] = sum(v) / len(v)
                            psnr_total_avg += psnr_rlt_avg[k]
                        psnr_total_avg /= len(psnr_rlt)
                        log_s = '# Validation # PSNR: {:.4e}:'.format(
                            psnr_total_avg)
                        for k, v in psnr_rlt_avg.items():
                            log_s += ' {}: {:.4e}'.format(k, v)
                        logger.info(log_s)
                        if opt['use_tb_logger'] and 'debug' not in opt['name']:
                            tb_logger.add_scalar('psnr_avg', psnr_total_avg,
                                                 current_step)
                            for k, v in psnr_rlt_avg.items():
                                tb_logger.add_scalar(k, v, current_step)

            #### save models and training states
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            tb_logger.close()
Example #30
        else:
            self.early_stop_count += 1

    def evaluate(self):
        pass


def main():
    parser = argparse.ArgumentParser(description='Parse the config path')
    parser.add_argument(
        "-c",
        "--config",
        dest="path",
        default='./configs/train.json',
        help=
        'The path to the config file. e.g. python train.py --config configs/dc_config.json'
    )

    config = parser.parse_args()
    with open(config.path) as f:
        args = json.load(f)
        args = AttrDict(args)
    t = trainer(args)
    t.run()


if __name__ == "__main__":
    writer = SummaryWriter("./tensorboard/speakerbeam")
    main()
    writer.close()