def train(args, exp_num, data=None):
    #assert args.load, 'Model name not provided'
    #assert os.path.isfile(args.load), 'Model file not found'
    if args.load and os.path.isfile(args.load):
        load_pretrained_model = True
    else:
        load_pretrained_model = False
    args_subset = ['exp', 'cpk', 'model', 'time', 'chunks']
    book = BookKeeper(args,
                      args_subset,
                      args_dict_update={
                          'chunks': args.chunks,
                          'batch_size': args.batch_size,
                          'model': args.model,
                          's2v': args.s2v,
                          'cuda': args.cuda,
                          'save_dir': args.save_dir,
                          'early_stopping': args.early_stopping,
                          'debug': args.debug,
                          'stop_thresh': args.stop_thresh,
                          'desc': args.desc,
                          'curriculum': args.curriculum,
                          'lr': args.lr
                      },
                      tensorboard=args.tb,
                      load_pretrained_model=load_pretrained_model)
    ## load_pretrained_model makes sure that the model is loaded, old save files are not updated and _new_exp is called to assign new filename

    args = book.args

    ## Start Log
    book._start_log()

    ## Training parameters
    path2data = args.path2data
    dataset = args.dataset
    lmksSubset = args.lmksSubset
    desc = args.desc
    split = (args.train_frac, args.dev_frac)
    idx_dependent = args.idx_dependent
    batch_size = args.batch_size
    time = args.time
    global chunks
    chunks = args.chunks
    offset = args.offset
    mask = args.mask
    feats_kind = args.feats_kind
    s2v = args.s2v
    f_new = args.f_new
    curriculum = args.curriculum

    if args.debug:
        shuffle = False
    else:
        shuffle = True

    ## Load data iterables
    if data is None:
        data = Data(path2data,
                    dataset,
                    lmksSubset,
                    desc,
                    split,
                    batch_size=batch_size,
                    time=time,
                    chunks=chunks,
                    offset=offset,
                    shuffle=shuffle,
                    mask=mask,
                    feats_kind=feats_kind,
                    s2v=s2v,
                    f_new=f_new)
        print('Data Loaded')
    else:
        print('Data already loaded! Yesss!!')

    train = data.train
    dev = data.dev
    test = data.test

    ## Create a model
    device = torch.device('cuda:{}'.format(
        args.cuda)) if args.cuda >= 0 else torch.device('cpu')
    input_shape = data.input_shape
    kwargs_keys = ['pose_size', 'trajectory_size']
    modelKwargs = {key: input_shape[key] for key in kwargs_keys}
    modelKwargs.update(args.modelKwargs)

    ## TODO input_size is hardcoded to the w2v input size. can be extracted from Data
    if args.s2v:
        input_size = 300
    elif args.desc:
        input_size = len(args.desc)
    else:
        input_size = 0
    model = eval(args.model)(chunks,
                             input_size=input_size,
                             Seq2SeqKwargs=modelKwargs,
                             load=args.load)
    model.to(device).double()
    book._copy_best_model(model)
    print('Model Created')

    ## would have to skip this way of loading model
    #if args.load:
    #  print('Loading Model')
    #  book._load_model(model)

    ## Loss function
    criterion = Loss(args.losses, args.lossKwargs)

    ## Optimizers
    optim = torch.optim.Adam(model.parameters(), lr=args.lr)

    ## LR scheduler
    scheduler = lr_scheduler.ExponentialLR(optim, gamma=0.99)

    ## Transforms
    columns = get_columns(feats_kind, data)
    pre = Transforms(args.transforms, columns, args.seed, mask, feats_kind,
                     dataset, f_new)

    def loop(model, data, pre, desc='train', epoch=0):
        running_loss = 0
        running_internal_loss = 0
        running_count = 0

        if desc == 'train':
            model.train(True)
        else:
            model.eval()

        Tqdm = tqdm(data,
                    desc=desc + ' {:.4f}'.format(0),
                    leave=False,
                    ncols=20)
        for count, batch in enumerate(Tqdm):
            model.zero_grad()
            optim.zero_grad()
            X, Y, s2v = batch['input'], batch['output'], batch['desc']
            pose, trajectory, start_trajectory = X
            pose_gt, trajectory_gt, start_trajectory_gt = Y

            x = torch.cat((trajectory, pose), dim=-1)
            y = torch.cat((trajectory_gt, pose_gt), dim=-1)

            x = x.to(device)
            y = y.to(device)
            if isinstance(s2v, torch.Tensor):
                s2v = s2v.to(device)

            ## Transform before the model
            x = pre.transform(x)
            y = pre.transform(y)

            if desc == 'train':
                y_cap, internal_losses = model(x, s2v, train=True)
            else:
                y_cap, internal_losses = model(x, s2v, train=False)

            loss = 0
            loss_ = 0
            if y_cap is not None:  ## if model returns only internal losses
                loss = criterion(y_cap, y)
                loss_ = loss.item()

            for i_loss in internal_losses:
                loss += i_loss
                loss_ += i_loss.item()
                running_internal_loss += i_loss.item()

            running_count += np.prod(y.shape)
            running_loss += loss_
            ## update tqdm
            Tqdm.set_description(desc + ' {:.4f} {:.4f}'.format(
                running_loss / running_count, running_internal_loss /
                running_count))
            Tqdm.refresh()

            if desc == 'train':
                loss.backward()
                optim.step()

            x = x.detach()
            y = y.detach()
            loss = loss.detach()
            if y_cap is not None:
                y_cap = y_cap.detach()
            internal_losses = [i.detach() for i in internal_losses]
            if count >= 0 and args.debug:  ## debugging by overfitting
                break

        return running_loss / running_count

    num_epochs = args.num_epochs

    ## set up curriculum learning for training
    time_list = []
    time_list_idx = 0
    if curriculum:
        for power in range(1, int(np.log2(time - 1)) + 1):
            time_list.append(2**power)
        data.update_dataloaders(time_list[0])
    time_list.append(time)
    tqdm.write('Training up to time: {}'.format(time_list[time_list_idx]))

    ## Training Loop
    for epoch in tqdm(range(num_epochs), ncols=20):
        train_loss = loop(model, train, pre, 'train', epoch)
        dev_loss = loop(model, dev, pre, 'dev', epoch)
        test_loss = loop(model, test, pre, 'test', epoch)
        scheduler.step()  ## Change the Learning Rate

        ## save results
        book.update_res({
            'train': train_loss,
            'dev': dev_loss,
            'test': test_loss
        })
        book._save_res()

        ## update tensorboard
        book.update_tb({
            'scalar': [[f'{args.cpk}/train', train_loss, epoch],
                       [f'{args.cpk}/dev', dev_loss, epoch],
                       [f'{args.cpk}/test', test_loss, epoch]]
        })

        # 'histogram':[[f'{args.cpk}/'+name, param.clone().cpu().detach().numpy(), epoch]
        #             for name, param in model.named_parameters()]})

        ## print results
        book.print_res(epoch,
                       key_order=['train', 'dev', 'test'],
                       exp=exp_num,
                       lr=scheduler.get_lr())

        if book.stop_training(model, epoch):
            ## if early_stopping criterion is met,
            ## start training with more time steps
            time_list_idx += 1
            book.stop_count = 0  ## reset the threshold counter
            book.best_dev_score = np.inf
            model.load_state_dict(copy.deepcopy(book.best_model))
            if len(time_list) > time_list_idx:
                time_ = time_list[time_list_idx]
                data.update_dataloaders(time_)
                tqdm.write('Training up to time: {}'.format(time_))
            else:
                break

    ## Sample
    print('Loading the best model and running the sample loop')
    args.__dict__.update({
        'load':
        book.name(book.weights_ext[0], book.weights_ext[1], args.save_dir)
    })
    sample(args, exp_num, data)

    ## Render (on a cpu only node)
    # feats_kind_dict = {'rifke':'fke'}
    # print('Rendering')
    # render = Slurm('render', slurm_kwargs={'partition':'cpu_long', 'time':'10-00:00', 'n':10})
    # python_cmd = ['source activate torch',
    #               'python render.py -dataset {} -load {} -feats_kind {} -render_list {}'.format(
    #                 args.dataset,
    #                 args.load,
    #                 feats_kind_dict[args.feats_kind],
    #                 args.render_list)]
    # render.run('\n'.join(python_cmd))

    ## Render new sentences
    print('Rendering New Sentences')
    render_new_sentences(args, exp_num, data)

    # End Log
    book._stop_log()
示例#2
0
def train(args, exp_num):
  args_subset = ['exp', 'cpk', 'model', 'time']
  book = BookKeeper(args, args_subset, args_dict_update={},
                    tensorboard=args.tb)
  args = book.args
  global ARGS
  ARGS = args

  ## Start Log
  book._start_log()
  
  ## Training parameters
  path2data = args.path2data
  dataset = args.dataset
  lmksSubset = args.lmksSubset
  desc = args.desc
  split = (args.train_frac, args.dev_frac)
  idx_dependent = args.idx_dependent
  batch_size = args.batch_size
  time = args.time
  chunks = args.chunks
  offset = args.offset
  mask = args.mask
  feats_kind = args.feats_kind
  s2v = args.s2v
  f_new = args.f_new
  curriculum = args.curriculum
  kl_anneal = args.kl_anneal
  
  ## Load data iterables
  data = Data(path2data, dataset, lmksSubset, desc,
              split, batch_size=batch_size,
              time=time,
              chunks=chunks,
              offset=offset,
              shuffle=True,
              mask=mask,
              feats_kind=feats_kind,
              s2v=s2v,
              f_new=f_new)

  print('Data Loaded')
  
  ## Create a model
  device = torch.device('cuda:{}'.format(args.cuda)) if args.cuda>=0 else torch.device('cpu')
  input_shape = data.input_shape
  kwargs_keys = ['pose_size', 'trajectory_size']
  modelKwargs = {key:input_shape[key] for key in kwargs_keys}
  modelKwargs.update(args.modelKwargs)

  model = eval(args.model)(**modelKwargs)
  model.to(device).double()

  book._copy_best_model(model)
  print('Model Created')
    
  ## Load model
  if args.load:
    print('Loading Model')
    book._load_model(model)

  ## Loss function
  criterion = Loss(args.losses, args.lossKwargs)
  
  ## Optimizers
  optim = torch.optim.Adam(model.parameters(), lr=args.lr)
  #optim = torch.optim.RMSprop(model.parameters(), lr=args.lr)
  ## LR scheduler
  scheduler = lr_scheduler.ExponentialLR(optim, gamma=0.99)
  
  ## Transforms
  columns = get_columns(feats_kind, data)
  pre = Transforms(args.transforms, columns, args.seed, mask, feats_kind, dataset, f_new)

  def loop(model, data, pre, desc='train', epoch=0):
    running_loss = 0
    running_internal_loss = 0
    running_count = 0
    # if kl_anneal > 0:
    #   kl_weight = lambda x: min((x+1)/(kl_anneal+1.), 2)
    # else:
    #   kl_weight = lambda x: 1
    count = 0
    if desc == 'train':
      model.train(True)
    else:
      model.eval()
    
    Tqdm = tqdm(data, desc=desc+' {:.4f}'.format(running_loss/(count+1.)), leave=False, ncols=20)
    for count, batch in enumerate(Tqdm):
      model.zero_grad()
      optim.zero_grad()
      
      X, Y = batch['input'], batch['output']
      pose, trajectory, start_trajectory = X
      pose_gt, trajectory_gt, start_trajectory_gt = Y

      x = torch.cat((trajectory, pose), dim=-1)
      y = torch.cat((trajectory_gt, pose_gt), dim=-1)
      
      x = x.to(device)
      y = y.to(device)

      ## Transform before the model
      x = pre.transform(x)
      y = pre.transform(y)

      if desc=='train':
        y_cap, internal_losses = model(x, train=True)
      else:
        y_cap, internal_losses = model(x, train=False)

      loss = 0
      loss_ = 0
      if y_cap is not None: ## if model returns only internal losses
        loss = criterion(y_cap, y)
        loss_ = loss.item()

      for i_loss in internal_losses:
        loss += i_loss
        loss_ += i_loss.item()
        running_internal_loss += i_loss.item()

      running_count +=  np.prod(y.shape)    
      running_loss += loss_
        
      # loss = criterion(y_cap, y)
      # loss_= loss.item()
      # #if count == 0 and desc == 'train':
      # #  pdb.set_trace()
      # for i_loss in internal_losses:
      #   loss += kl_weight(epoch) * i_loss
      #   loss_ += i_loss.item()
      #   running_internal_loss += i_loss
      
      # #running_loss += loss.item()
      # running_loss += loss_

      ## update tqdm
      Tqdm.set_description(desc+' {:.4f} {:.4f}'.format(running_loss/running_count, running_internal_loss/running_count))
      Tqdm.refresh()
      
      if desc == 'train':
        loss.backward()
        optim.step()
        # if kl_anneal == 0:
        #   y_cap, internal_losses = model(x, train=True, epoch=epoch)
        #   sum(internal_losses).backward()
        #   optim.step()

      x = x.detach()
      y = y.detach()
      loss = loss.detach()
      if y_cap is not None:
        y_cap = y_cap.detach()
      internal_losses = [i.detach() for i in internal_losses]

      if count>=0 and args.debug: ## debugging by overfitting
        break

    return running_loss/running_count

  num_epochs = args.num_epochs

  ## set up curriculum learning for training
  time_list = []
  time_list_idx = 0
  if curriculum:
    for power in range(1, int(np.log2(time-1)) + 1):
      time_list.append(2**power)
    data.update_dataloaders(time_list[0])
  time_list.append(time)
  
  ## Training Loop
  for epoch in tqdm(range(num_epochs), ncols=20):
    train_loss = loop(model, data.train, pre, 'train', epoch)
    dev_loss = loop(model, data.dev, pre, 'dev')
    test_loss = loop(model, data.test, pre, 'test')
    scheduler.step() ## Change the Learning Rate
    
    ## save results
    book.update_res({'train':train_loss,
                     'dev':dev_loss,
                     'test':test_loss})
    book._save_res()

    ## update tensorboard
    book.update_tb({'scalar':[[f'{args.cpk}/train', train_loss, epoch],
                              [f'{args.cpk}/dev', dev_loss, epoch],
                              [f'{args.cpk}/test', test_loss, epoch]]})
                   
                   # 'histogram':[[f'{args.cpk}/'+name, param.clone().cpu().detach().numpy(), epoch]
                    #             for name, param in model.named_parameters()]})

    ## print results
    book.print_res(epoch, key_order=['train','dev','test'], exp=exp_num, lr=scheduler.get_lr())

    ## ignore increasing dev loss till the annealing occurs
    # if epoch < kl_anneal:
    #   book.stop_count = 0
    
    if book.stop_training(model, epoch):
      ## if early_stopping criterion is met,
      ## start training with more time steps
      time_list_idx += 1
      book.stop_count = 0 ## reset the threshold counter
      book.best_dev_score = np.inf
      model.load_state_dict(copy.deepcopy(book.best_model))
      if len(time_list) > time_list_idx:
        time_ = time_list[time_list_idx]
        data.update_dataloaders(time_)
        tqdm.write('Training up to time: {}'.format(time_))
      else:
        break
    
  # End Log
  book._stop_log()

  # ## Sample
  print('Loading the best model and training with language input as well')
  args.__dict__.update({'load':book.name(book.weights_ext[0], book.weights_ext[1], args.save_dir),
                        'model':'Seq2SeqConditioned10'})
  train_wordConditioned(args, exp_num, data)