def forward(self, y_z, mu_phi, log_var_phi, mu_theta, log_var_theta):
    std_theta = std(log_var_theta)
    std_phi = std(log_var_phi)
    N = Normal(mu_theta, std_theta)

    # if the input holds more than one sample per series, keep only the
    # last time step of each window
    y_z = y_z.view(-1, self.n_samples_y, self.input_size)
    y_z = y_z[:, -1, :]

    # two clusters: duplicate the observation so it can be scored under both
    y_expanded = th.cat([y_z, y_z], dim=1)
    pdf_y = N.log_prob(y_expanded)
    pdf_y = reshape(pdf_y)

    # sample z to build an empirical mean over z for the likelihood;
    # using one sample at a time from the mixture, the likelihood is
    # simply the normal log-likelihood
    loglikelihood = 0
    for _ in range(self.n_samples_z):
        # randn_like keeps eps on the same device/dtype as the activations
        eps = th.randn_like(y_expanded)
        # use z_y as a selector/weight (z_i is a three-dimensional Gaussian,
        # so we can also measure uncertainty)
        z_y = eps * std_phi + mu_phi
        z_y = reshape(z_y)
        z_y = F.softmax(z_y, dim=2)
        # log of the mixture, weighted by z
        loglikelihood += th.sum(pdf_y * z_y, dim=2)
    loglikelihood /= self.n_samples_z
    # reduce: sum over the lidar dimensions, mean over the batch
    loglikelihood = th.sum(loglikelihood, dim=1)
    loglikelihood = th.mean(loglikelihood)
    loglikelihood = move_to_cuda(loglikelihood)

    # KL divergence in closed form for q(z|x) Gaussian against N(0, I);
    # k = 1 per element, since th.sum over dim=1 already runs over the
    # latent dimensions
    k = 1
    kld = 0.5 * ((log_var_phi.exp() + mu_phi.pow(2) - log_var_phi) - k)
    kld = th.sum(kld, dim=1)
    kld = th.mean(kld)

    # we want to maximize the ELBO, so negate it for minimization
    elbo = loglikelihood - kld
    return -elbo, kld, loglikelihood
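# The loss above leans on two small helpers, `std` and `reshape`, whose
# definitions are not in this file. The sketches below are assumptions
# consistent with how they are used here, not the canonical implementations.
def std(log_var):
    # standard deviation from a log-variance: sigma = exp(log_var / 2)
    return th.exp(0.5 * log_var)

def reshape(t, n_clusters=2):
    # regroup the block-concatenated per-cluster values (from th.cat along
    # dim=1) so the cluster axis is last: (batch, input_size, n_clusters),
    # matching the dim=2 softmax and sum above
    return t.view(t.size(0), n_clusters, -1).permute(0, 2, 1)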
def train(args):
    ckpt = ckpt_utc()

    train_set = PandaDataSetImg(root_dir=args.data_dir, split=args.split)
    train_loader = DataLoader(dataset=train_set, batch_size=args.batch_size, shuffle=True)

    loss_fn = LossReconstruction()
    loss_fn = move_to_cuda(loss_fn)

    model = Autoencoder()
    model = move_to_cuda(model)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    epoch_loss_history = []
    for epoch in range(args.epochs):
        loss_batch_history = []
        for itr, x in enumerate(train_loader):
            x = tensor_to_variable(x)
            depth_pred, _ = model(x)
            loss = loss_fn(x, depth_pred)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_batch_history.append(loss.item())
        epoch_loss = np.mean(loss_batch_history)
        epoch_loss_history.append(epoch_loss)
        print("train epoch: {} avg. loss: {:.4f}".format(epoch, epoch_loss))

    plot_eval(
        np.arange(len(epoch_loss_history)),
        np.array(epoch_loss_history),
        save_to=osp.join(args.result_dir, "train_loss.png"),
        title="train loss",
        xlabel="epochs",
        ylabel="loss",
    )
    th.save(model.state_dict(), osp.join(args.ckpt_dir, ckpt))
def test(args):
    test_set = PandaDataSetImg(root_dir=args.data_dir, split=args.split)
    test_loader = DataLoader(dataset=test_set, batch_size=args.batch_size, shuffle=False)

    model = Autoencoder()
    model = move_to_cuda(model)
    model.eval()

    saved_state_dict = th.load(args.ckpt_dir + args.ckpt_test)
    model.load_state_dict(saved_state_dict)

    MSE, T = 0, 0
    with th.no_grad():
        for itr, x in enumerate(test_loader):
            x = move_to_cuda(x)
            x_pred, _ = model(x)
            b, _, n, m = x.size()
            T += n * m * b
            MSE += th.sum((x - x_pred) ** 2).item()
    print("RMSE: {:.4f}".format(np.sqrt(MSE / T)))
def predict(args, model, eval_dataloader, device, logger):
    model.eval()
    num_correct = 0
    num_total = 0.0
    rrs = []  # reciprocal ranks
    for batch in tqdm(eval_dataloader):
        batch_to_feed = move_to_cuda(batch)
        with torch.no_grad():
            outputs = model(batch_to_feed)
            q = outputs['q']
            c = outputs['c']
            neg_c = outputs['neg_c']

            # score each query against all in-batch contexts plus its own
            # hard negative
            product_in_batch = torch.mm(q, c.t())
            product_neg = (q * neg_c).sum(-1).unsqueeze(1)
            product = torch.cat([product_in_batch, product_neg], dim=-1)

            target = torch.arange(product.size(0)).to(product.device)
            ranked = product.argsort(dim=1, descending=True)
            prediction = product.argmax(-1)

            # MRR: argsort of the ranking gives each column's rank
            idx2rank = ranked.argsort(dim=1)
            for idx, t in enumerate(target.tolist()):
                rrs.append(1 / (idx2rank[idx][t].item() + 1))

            pred_res = prediction == target
            num_total += pred_res.size(0)
            num_correct += pred_res.sum(0).item()

    acc = num_correct / num_total
    mrr = np.mean(rrs)
    logger.info(f"evaluated {num_total} examples...")
    logger.info(f"avg. Acc: {acc}")
    logger.info(f"MRR: {mrr}")
    model.train()
    return mrr
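# A tiny worked example of the double-argsort rank trick used above, with
# made-up scores (illustrative only).
import torch

scores = torch.tensor([[0.1, 0.9, 0.4]])
ranked = scores.argsort(dim=1, descending=True)  # [[1, 2, 0]]: columns by score
idx2rank = ranked.argsort(dim=1)                 # [[2, 0, 1]]: rank of each column
rr = 1 / (idx2rank[0][1].item() + 1)             # target column 1 has rank 0 -> RR = 1.0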
logger.info("Encoding claims and searching") questions = [_["claim"] for _ in ds_items] metrics = [] retrieval_outputs = [] for b_start in tqdm(range(0, len(questions), args.batch_size)): with torch.no_grad(): batch_q = questions[b_start:b_start + args.batch_size] batch_ann = ds_items[b_start:b_start + args.batch_size] bsize = len(batch_q) batch_q_encodes = tokenizer.batch_encode_plus( batch_q, max_length=args.max_q_len, pad_to_max_length=True, return_tensors="pt") batch_q_encodes = move_to_cuda(dict(batch_q_encodes)) q_embeds = model.encode_q( batch_q_encodes["input_ids"], batch_q_encodes["attention_mask"], batch_q_encodes.get("token_type_ids", None)) q_embeds_numpy = q_embeds.cpu().contiguous().numpy() D, I = index.search(q_embeds_numpy, args.topk) for b_idx in range(bsize): topk_docs = [] for _, doc_id in enumerate(I[b_idx]): doc = id2doc[str(doc_id)] topk_docs.append({"title": doc[0], "text": doc[1]}) # saving when there's no annotations
def train(args):
    ckpt = ckpt_utc()

    loss_fn = nELBO(args.batch_size, args.n_samples_z, args.n_samples_y)

    model = VAE(
        encoder_layer_sizes=args.encoder_layer_sizes,
        decoder_layer_sizes=args.decoder_layer_sizes,
        latent_size=args.latent_size,
        batch_size=args.batch_size,
        conditional=args.conditional,
        num_labels=args.num_labels,
    )
    model = move_to_cuda(model)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()

    # randomize an auxiliary index because we sample windows of the
    # time-series (10 time steps) that have to stay intact
    dataset = Loader(split=args.split, samples=args.n_samples_y)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=False)

    loss_list = []
    for epoch in range(args.epochs):
        dataset.generate_index()
        print("Epoch: ", epoch)
        L = []
        for itr, batch in enumerate(data_loader):
            # observables
            y, x = batch
            y = tensor_to_variable(y)
            x = tensor_to_variable(x)
            # skip the last, incomplete batch
            if y.size(0) != args.batch_size:
                continue

            mu_phi, log_var_phi, mu_theta, log_var_theta = model(y, x)
            loss, kld, ll = loss_fn(y, mu_phi, log_var_phi, mu_theta, log_var_theta)

            if args.split == "train":
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            L.append(loss.cpu().data.numpy())
            print("negative likelihood: ", -ll.cpu().data.numpy())
            print("kl: ", kld.cpu().data.numpy())
            print("loss:", loss.cpu().data.numpy())
        # np.mean already averages over the batches; do not divide again
        loss_list.append(np.mean(L))

    plt.plot(np.array(loss_list))
    plt.grid()
    plt.show()

    path_exists(args.ckpt_dir)
    th.save(model.state_dict(), args.ckpt_dir + ckpt)
    print("done!")
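# `generate_index` is a method of the Loader dataset and is not shown in
# this file. Per the comment above, it presumably reshuffles which windows
# of the time-series the loader yields each epoch while keeping every
# window of `samples` consecutive steps intact. A hypothetical sketch
# (attribute names are assumptions), kept as a comment so this file stays
# runnable:
#
#     def generate_index(self):
#         # valid window starts leave room for one full window
#         starts = np.arange(self.series_length - self.samples + 1)
#         np.random.shuffle(starts)
#         self.index = starts  # __getitem__ reads window i from index[i]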
def train(args):
    ckpt = ckpt_utc()

    loss_fn = Loss(
        args.batch_size,
        args.n_samples_y,
        args.lidar_input_size,
        args.n_clusters,
        model_type=args.model_type,
        is_entropy=args.is_entropy,
        lmbda=args.lmbda,
    )
    model = Model(
        encoder_layer_sizes=args.encoder_layer_sizes,
        latent_size=args.latent_size,
        n_clusters=args.n_clusters,
        batch_size=args.batch_size,
        model_type=args.model_type,
        is_multimodal=args.is_multimodal,
    )
    model = move_to_cuda(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()

    # randomize an auxiliary index because we use random windows of the
    # time-series (10 time steps), but each window has to stay intact
    dataset = Dataset(path=args.data_dir, split=args.split, n_samples=args.n_samples_y)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=False)
    dataset_val = Dataset(path=args.data_dir, split="val", n_samples=args.n_samples_y)
    data_loader_val = DataLoader(dataset=dataset_val, batch_size=args.batch_size, shuffle=False)

    loss_train, loss_val = [], []
    for epoch in range(args.epochs):
        model.train()
        dataset.generate_index()
        print("Epoch: ", epoch)
        loss_epoch = []
        for itr, batch in enumerate(data_loader):
            # observables
            y, x, depth = batch
            y = tensor_to_variable(y)
            x = tensor_to_variable(x)
            depth = tensor_to_variable(depth)

            mu_c, std_c, clusters = model(x)
            loss = loss_fn(y, mu_c, std_c, clusters)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_epoch.append(loss.cpu().data.numpy())
        print("train loss:", np.mean(loss_epoch))
        loss_train.append(np.mean(loss_epoch))

        if epoch % args.test_every_n_epochs == 0:
            model.eval()
            loss_epoch = []
            with th.no_grad():
                # evaluate on the validation loader, not the training one
                for itr, batch in enumerate(data_loader_val):
                    y, x, depth = batch
                    y = tensor_to_variable(y)
                    x = tensor_to_variable(x)
                    mu_c, std_c, clusters = model(x)
                    loss = loss_fn(y, mu_c, std_c, clusters)
                    loss_epoch.append(loss.cpu().data.numpy())
            print("val loss:", np.mean(loss_epoch))
            loss_val.append(np.mean(loss_epoch))

    plt.plot(np.array(loss_train))
    plt.plot(np.array(loss_val))
    plt.grid()
    plt.show()

    path_exists(args.ckpt_dir)
    th.save(model.state_dict(), args.ckpt_dir + ckpt)
    print("done!")
def train(args):
    ckpt = ckpt_utc()

    loss_fn = Loss()
    model = Model(layer_sizes=args.encoder_layer_sizes, latent_size=args.latent_size, is_uq=False)
    model = move_to_cuda(model)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()

    # randomize an auxiliary index because we use random windows of the
    # time-series (10 time steps), but each window has to stay intact
    dataset = Dataset(
        path=args.data_dir,
        path_images=args.data_dir + "TRAIN_DATA/DEPTH/",
        split=args.split,
        n_samples=args.n_samples_y,
        is_label_y=args.is_label_y,
        is_multimodal=args.is_multimodal,
    )
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=False)
    dataset_val = Dataset(
        path=args.data_dir,
        path_images=args.data_dir + "TRAIN_DATA/DEPTH/",
        split="val",
        n_samples=args.n_samples_y,
        is_label_y=args.is_label_y,
        is_multimodal=args.is_multimodal,
    )
    data_loader_val = DataLoader(dataset=dataset_val, batch_size=args.batch_size, shuffle=False)

    loss_train, loss_val = [], []
    for epoch in range(args.epochs):
        model.train()
        dataset.generate_index()
        print("Epoch: ", epoch)
        loss_epoch = []
        for itr, batch in enumerate(data_loader):
            # observables
            y, x, lbl = batch
            y = tensor_to_variable(y)
            x = tensor_to_variable(x)
            lbl = tensor_to_variable(lbl)

            state = th.cat([y, x], dim=1)
            pred = model(state)
            pred = pred.reshape(-1, args.n_clusters, args.lidar_input_size)
            loss = loss_fn(pred, lbl)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_epoch.append(loss.cpu().data.numpy())
        print("train loss:", np.mean(loss_epoch))
        loss_train.append(np.mean(loss_epoch))

        if epoch % args.test_every_n_epochs == 0:
            model.eval()
            loss_epoch = []
            with th.no_grad():
                # evaluate on the validation loader, not the training one
                for itr, batch in enumerate(data_loader_val):
                    y, x, lbl = batch
                    y = tensor_to_variable(y)
                    x = tensor_to_variable(x)
                    lbl = tensor_to_variable(lbl)
                    state = th.cat([y, x], dim=1)
                    pred = model(state)
                    pred = pred.reshape(-1, args.n_clusters, args.lidar_input_size)
                    loss = loss_fn(pred, lbl)
                    loss_epoch.append(loss.cpu().data.numpy())
            print("val loss:", np.mean(loss_epoch))
            loss_val.append(np.mean(loss_epoch))

    plt.plot(np.array(loss_train))
    plt.plot(np.array(loss_val))
    plt.grid()
    plt.show()

    path_exists(args.ckpt_dir)
    th.save(model.state_dict(), args.ckpt_dir + ckpt)
    print("done!")
def main():
    args = train_args()
    if args.fp16:
        apex.amp.register_half_function(torch, 'einsum')

    date_curr = date.today().strftime("%m-%d-%Y")
    model_name = (
        f"{args.prefix}-seed{args.seed}-bsz{args.train_batch_size}"
        f"-fp16{args.fp16}-lr{args.learning_rate}-decay{args.weight_decay}"
        f"-warm{args.warmup_ratio}-{args.model_name}")
    args.output_dir = os.path.join(args.output_dir, date_curr, model_name)
    tb_logger = SummaryWriter(os.path.join(args.output_dir.replace("logs", "tflogs")))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print(f"output directory {args.output_dir} already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
        handlers=[
            logging.FileHandler(os.path.join(args.output_dir, "log.txt")),
            logging.StreamHandler()
        ])
    logger = logging.getLogger(__name__)
    logger.info(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    bert_config = AutoConfig.from_pretrained(args.model_name)
    if args.momentum:
        model = MomentumRetriever(bert_config, args)
    elif "roberta" in args.model_name:
        model = RobertaRetrieverSingle(bert_config, args)
    else:
        model = BertRetrieverSingle(bert_config, args)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    collate_fc = partial(sp_collate, pad_id=tokenizer.pad_token_id)

    if args.do_train and args.max_c_len > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_c_len, bert_config.max_position_embeddings))

    if "fever" in args.predict_file:
        eval_dataset = FeverSingleDataset(tokenizer, args.predict_file, args.max_q_len, args.max_c_len)
    else:
        eval_dataset = SPDataset(tokenizer, args.predict_file, args.max_q_len, args.max_c_len)
    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=args.predict_batch_size,
        collate_fn=collate_fc,
        pin_memory=True,
        num_workers=args.num_workers)
    logger.info(f"Num of dev batches: {len(eval_dataloader)}")

    if args.init_checkpoint != "":
        model = load_saved(model, args.init_checkpoint)
    model.to(device)
    print(f"number of trainable parameters: "
          f"{sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    if args.do_train:
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = Adam(optimizer_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        if args.fp16:
            model, optimizer = apex.amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    else:
        if args.fp16:
            model = apex.amp.initialize(model, opt_level=args.fp16_opt_level)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        global_step = 0  # gradient update steps
        batch_step = 0  # forward batches seen
        best_mrr = 0
        train_loss_meter = AverageMeter()
        model.train()

        if "fever" in args.predict_file:
            train_dataset = FeverSingleDataset(tokenizer, args.train_file, args.max_q_len, args.max_c_len, train=True)
        else:
            train_dataset = SPDataset(tokenizer, args.train_file, args.max_q_len, args.max_c_len, train=True)
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=args.train_batch_size,
            pin_memory=True,
            collate_fn=collate_fc,
            num_workers=args.num_workers,
            shuffle=True)

        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        warmup_steps = t_total * args.warmup_ratio
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

        logger.info('Start training....')
        for epoch in range(int(args.num_train_epochs)):
            for batch in tqdm(train_dataloader):
                batch_step += 1
                batch = move_to_cuda(batch)
                loss = loss_single(model, batch, args.momentum)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss_meter.update(loss.item())

                if (batch_step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            apex.amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                    tb_logger.add_scalar('batch_train_loss', loss.item(), global_step)
                    tb_logger.add_scalar('smoothed_train_loss', train_loss_meter.avg, global_step)

                    if args.eval_period != -1 and global_step % args.eval_period == 0:
                        mrr = predict(args, model, eval_dataloader, device, logger)
                        logger.info("Step %d Train loss %.2f MRR %.2f on epoch=%d" %
                                    (global_step, train_loss_meter.avg, mrr * 100, epoch))
                        if best_mrr < mrr:
                            logger.info("Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" %
                                        (best_mrr * 100, mrr * 100, epoch))
                            torch.save(model.state_dict(),
                                       os.path.join(args.output_dir, "checkpoint_best.pt"))
                            model = model.to(device)
                            best_mrr = mrr

            # end-of-epoch evaluation
            mrr = predict(args, model, eval_dataloader, device, logger)
            logger.info("Step %d Train loss %.2f MRR %.2f on epoch=%d" %
                        (global_step, train_loss_meter.avg, mrr * 100, epoch))
            tb_logger.add_scalar('dev_mrr', mrr * 100, epoch)
            if best_mrr < mrr:
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "checkpoint_last.pt"))
                logger.info("Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" %
                            (best_mrr * 100, mrr * 100, epoch))
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "checkpoint_best.pt"))
                model = model.to(device)
                best_mrr = mrr

        logger.info("Training finished!")

    elif args.do_predict:
        acc = predict(args, model, eval_dataloader, device, logger)
        logger.info(f"test performance {acc}")
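# `loss_single` is defined elsewhere; a plausible sketch is a cross-entropy
# over in-batch negatives plus one hard negative per query, mirroring the
# scoring layout in `predict` above. This is an assumption, not the actual
# implementation (the `momentum` path in particular is not covered here).
import torch
import torch.nn.functional as F

def loss_single(model, batch, momentum=False):
    outputs = model(batch)
    q, c, neg_c = outputs['q'], outputs['c'], outputs['neg_c']
    scores_in_batch = torch.mm(q, c.t())            # (B, B) in-batch scores
    scores_neg = (q * neg_c).sum(-1).unsqueeze(1)   # (B, 1) hard-negative score
    scores = torch.cat([scores_in_batch, scores_neg], dim=-1)
    # the gold context for query i is column i of the score matrix
    target = torch.arange(scores.size(0), device=scores.device)
    return F.cross_entropy(scores, target)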