def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model, step = get_model(word_vectors, char_vectors, log, args)
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # Earlier optimizer/scheduler experiments, kept for reference:
    # optimizer = get_opt(model, args)
    # optimizer = optim.Adam(model.parameters(), lr=1, betas=[0.8, 0.999],
    #                        eps=1e-7, weight_decay=args.l2_wd)
    # warmup = args.warmup
    # cooldown = args.hidden_size * 100
    # scheduler = sched.LambdaLR(
    #     optimizer,
    #     lambda step: min(0.001, 0.001 * (step / warmup) ** 0.5) if step <= cooldown
    #     else max(1e-4, 1e-3 * (1 - min(1, (step - cooldown) / cooldown)) ** 0.5))
    # scheduler = sched.LambdaLR(
    #     optimizer, lambda step: min(0.001, 0.001 * (step / warmup) ** 0.5))
    # cr = 1.0 / math.log(warmup)
    # scheduler = sched.LambdaLR(
    #     optimizer,
    #     lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warmup else 1)
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda step: 1)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    if args.model_name == "KnowGQA":
        train_dataset = SQUADwithGraph(args.train_record_file_graph,
                                       args.use_squad_v2)
        dev_dataset = SQUADwithGraph(args.dev_record_file_graph,
                                     args.use_squad_v2)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate_fn_graph)
        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=4,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn_graph)
    else:
        train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
        dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate_fn)
        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            if args.model_name == "KnowGQA":
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, co_idxs, adjs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    cc_idxs = cc_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    co_idxs = co_idxs.to(device)
                    adjs = adjs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                                           co_idxs, adjs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, "KnowGQA",
                                                      dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
            elif args.model_name == "BiDAF_nochar":
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, "BiDAF_nochar",
                                                      dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
            else:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    cc_idxs = cc_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, "BiDAF",
                                                      dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
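# A minimal sketch of the `util.EMA` helper these scripts rely on, reconstructed
# from the usage above (`ema(model, num_updates)`, `ema.assign(model)`,
# `ema.resume(model)`) and the shadow-average update quoted in the next
# script's comments: new_average = (1.0 - decay) * param.data + decay * shadow[name].
# This is an assumption about the helper, not necessarily the repo's exact code.
class EMA:
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {}
        self.original = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def __call__(self, model, num_updates):
        # Ramp decay up early in training so the average warms up quickly.
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, param in model.named_parameters():
            if param.requires_grad:
                new_average = (1.0 - decay) * param.data + decay * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def assign(self, model):
        # Swap the EMA weights in (used right before evaluation).
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.original[name] = param.data.clone()
                param.data = self.shadow[name]

    def resume(self, model):
        # Swap the live training weights back (used right after evaluation).
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data = self.original[name]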
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    # Writes entries directly to event files in the logdir,
    # to be consumed by TensorBoard.
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))  # one GPU: batch_size = 64; adjust to the actual setup

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:  # default=None
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    # ema_decay = 0.999; EMA core update:
    # new_average = (1.0 - decay) * param.data + decay * self.shadow[name]
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    # metric_name: NLL, EM, or F1
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,  # max_checkpoints = 5
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # lr: default=0.5; l2_wd: default=0
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    # train_record_file = './data/train.npz'
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    # shuffle=True uses sampler = RandomSampler(dataset) with
    # batch_sampler = BatchSampler(sampler, batch_size, drop_last);
    # collate_fn merges a list of samples to form a mini-batch.
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,  # 64
                                   shuffle=True,
                                   num_workers=args.num_workers,  # 4
                                   collate_fn=collate_fn)
    # dev_record_file = './data/dev.npz'
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    # collate_fn merges examples of different lengths by padding all examples
    # to the maximum length in the batch.
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps  # 50000
    epoch = step // len(train_dataset)  # epoch = 0 on a fresh run
    while epoch != args.num_epochs:  # 30
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)  # 64
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)  # max_grad_norm: default=5.0
                optimizer.step()  # one optimizer update
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)  # EMA.__call__(self, model, num_updates)

                # Log info
                step += batch_size  # step starts at 0; batch_size = 64
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)  # add scalar data to the summary
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps  # 50000

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,  # './data/dev_eval.json'
                                                  args.max_ans_len,  # 15
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
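# What `collate_fn` above is doing, per its inline comments: merging examples
# of different lengths into one mini-batch by padding every sequence to the
# longest one in the batch. A self-contained sketch with illustrative names
# (`pad_collate`, `pad_idx`), not the project's actual implementation:
import torch

def pad_collate(batch, pad_idx=0):
    # batch: list of 1-D LongTensors with varying lengths
    max_len = max(x.size(0) for x in batch)
    padded = torch.full((len(batch), max_len), pad_idx, dtype=torch.long)
    for i, x in enumerate(batch):
        padded[i, :x.size(0)] = x
    return padded

# Example: sequences of lengths 3, 1, and 2 become one (3, 3) tensor.
# pad_collate([torch.tensor([4, 5, 6]), torch.tensor([7]), torch.tensor([8, 9])])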
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    model = None  # TODO: YOUR MODEL HERE
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = MyDataset(args.train_record_file)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = MyDataset(args.dev_record_file)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for features, ys, ids in train_loader:
                # Setup for forward
                features = features.to(device)
                batch_size = features.size(0)  # TODO: adjust if your batch layout differs
                optimizer.zero_grad()

                # Forward
                outputs = model(features)
                ys = ys.to(device)
                loss = loss_fn(outputs, ys)  # TODO: define loss_fn for your task
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
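# A minimal skeleton for the `MyDataset` placeholder in the template above,
# assuming (as in the sibling scripts here) that each record file is an .npz
# with one array per field; the field names below are hypothetical.
import numpy as np
import torch
from torch.utils import data

class MyDataset(data.Dataset):
    def __init__(self, record_file):
        dataset = np.load(record_file)
        self.features = torch.from_numpy(dataset['features']).float()
        self.ys = torch.from_numpy(dataset['ys']).long()
        self.ids = torch.from_numpy(dataset['ids']).long()

    def __getitem__(self, idx):
        return self.features[idx], self.ys[idx], self.ids[idx]

    def __len__(self):
        return len(self.ids)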
def main(args):
    # Set up logging and devices
    name = "train_exp2"
    args.save_dir = util.get_save_dir(args.logging_dir, name, training=True)
    log = get_logger(args.save_dir, name)
    tbx = SummaryWriter(args.save_dir)
    device, gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.random_seed}...")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Get embeddings
    log.info(f"Loading embeddings from {args.word_emb_file}...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.learning_rate,
                               weight_decay=args.learning_rate_decay)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    scheduler = sched.ReduceLROnPlateau(optimizer=optimizer,
                                        mode="min",
                                        factor=0.1,
                                        patience=2,
                                        verbose=True,
                                        cooldown=0,
                                        min_lr=0.0005)

    for epoch in range(args.num_epochs):
        log.info(f"Starting epoch {epoch}...")
        for i in range(args.num_train_chunks):
            # Get data loader
            train_rec_file = f"{args.train_record_file_exp2}_{i}.npz"
            log.info(f'Building dataset from {train_rec_file} ...')
            train_dataset = SQuAD(train_rec_file,
                                  args.exp2_train_topic_contexts,
                                  use_v2=True)
            train_loader = data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_fn)

            # Train
            log.info('Training...')
            steps_till_eval = args.eval_steps
            # torch.set_num_threads(7)
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = qw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    # NOTE: ReduceLROnPlateau.step() expects the monitored
                    # metric, not an update count; see the sketch below.
                    scheduler.step(step // batch_size)
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f"Evaluating at step {step}...")
                        ema.assign(model)
                        all_pred_dicts = {}
                        all_results = OrderedDict()
                        for j in range(args.num_dev_chunks):
                            # Get data loader
                            dev_rec_file = f"{args.dev_record_file_exp2}_{j}.npz"
                            log.info(f'Building evaluating dataset from {dev_rec_file} ...')
                            dev_dataset = SQuAD(dev_rec_file,
                                                args.exp2_dev_topic_contexts,
                                                use_v2=True)
                            dev_loader = data.DataLoader(dev_dataset,
                                                         batch_size=args.batch_size,
                                                         shuffle=True,
                                                         num_workers=args.num_workers,
                                                         collate_fn=collate_fn)
                            results, pred_dict = evaluate(model, dev_loader,
                                                          device,
                                                          args.dev_eval_file,
                                                          args.max_ans_len,
                                                          use_squad_v2=True)
                            all_results.update(results)
                            all_pred_dicts.update(pred_dict)

                            del dev_dataset
                            del dev_loader
                            del results
                            del pred_dict
                            torch.cuda.empty_cache()
                        saver.save(step, model,
                                   all_results[args.metric_name], device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in all_results.items())
                        log.info(f"Dev {results_str}")

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in all_results.items():
                            tbx.add_scalar(f"dev/{k}", v, step)
                        util.visualize(tbx,
                                       pred_dict=all_pred_dicts,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
                        torch.cuda.empty_cache()
            del train_dataset
            del train_loader
            torch.cuda.empty_cache()
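# As flagged in the loop above: `ReduceLROnPlateau.step()` interprets its
# argument as the monitored quantity (mode="min" here), so feeding it
# `step // batch_size` hands it a number that only grows and the scheduler
# will keep cutting the LR toward `min_lr`. A sketch of the intended usage,
# assuming the dev NLL is what should be monitored once per evaluation:
import torch
import torch.optim as optim
import torch.optim.lr_scheduler as sched

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adadelta(params, lr=0.5)
scheduler = sched.ReduceLROnPlateau(optimizer, mode="min", factor=0.1,
                                    patience=2, cooldown=0, min_lr=0.0005)
for dev_nll in [3.2, 2.9, 2.9, 2.9, 2.9, 2.9]:  # dev loss stalls after the 2nd eval
    scheduler.step(dev_nll)
    print(optimizer.param_groups[0]['lr'])  # drops 10x once patience runs out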
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # ######################################
    # BERT feature extraction experiment, kept for reference:
    # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
    # train_examples = None
    # train_examples = read_squad_examples(
    #     input_file=args.train_file, is_training=True,
    #     version_2_with_negative=args.version_2_with_negative)
    # train_features = convert_examples_to_features(
    #     examples=train_examples,
    #     tokenizer=tokenizer,
    #     max_seq_length=args.max_seq_length,
    #     doc_stride=args.doc_stride,
    #     max_query_length=args.max_query_length,
    #     is_training=True)
    # if args.local_rank == -1 or torch.distributed.get_rank() == 0:
    #     logger.info("Saving train features into cached file %s", cached_train_features_file)
    #     with open(cached_train_features_file, "wb") as writer:
    #         pickle.dump(train_features, writer)
    # all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    # x = all_input_ids
    # ######################################

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                # added_flag: move char-level indices to the device as well
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(cw_idxs, qw_idxs)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
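# A tiny worked example of the span loss used by all of these training loops:
# the model emits log-probabilities over context positions for the answer's
# start and end pointers, and the objective is the sum of two NLL terms.
# Toy shapes only; in the real loaders the context length varies per batch.
import torch
import torch.nn.functional as F

log_p1 = torch.log_softmax(torch.randn(4, 50), dim=-1)  # start ptr: (batch, c_len)
log_p2 = torch.log_softmax(torch.randn(4, 50), dim=-1)  # end ptr:   (batch, c_len)
y1 = torch.randint(0, 50, (4,))  # gold start indices
y2 = torch.randint(0, 50, (4,))  # gold end indices
loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)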
def main(course_dir, text_embedding_size, audio_embedding_size,
         image_embedding_size, hidden_size, drop_prob, max_text_length,
         out_heatmaps_dir, args, batch_size=3, num_epochs=100):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Create Dataset objects
    text_dataset = TextDataset(course_dir, max_text_length)
    audio_dataset = AudioDataset(course_dir)
    target_dataset = TargetDataset(course_dir)
    # Preprocess the images in the prescribed format
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.RandomResizedCrop(256),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    image_dataset = ImageDataset(course_dir, transform)

    assert len(text_dataset) == len(audio_dataset) \
        and len(audio_dataset) == len(image_dataset) \
        and len(image_dataset) == len(target_dataset), "Unequal dataset lengths"

    # Creating data indices for training and validation splits
    train_indices, val_indices = gen_train_val_indices(text_dataset)

    # Creating PyTorch data samplers and loaders
    train_sampler = torch.utils.data.SequentialSampler(train_indices)
    val_sampler = torch.utils.data.SequentialSampler(val_indices)

    # Get sentence embeddings
    train_text_loader = torch.utils.data.DataLoader(text_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=False,
                                                    num_workers=2,
                                                    collate_fn=collator,
                                                    sampler=train_sampler)
    val_text_loader = torch.utils.data.DataLoader(text_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=2,
                                                  collate_fn=collator,
                                                  sampler=val_sampler)

    # Get audio embeddings
    train_audio_loader = torch.utils.data.DataLoader(audio_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=2,
                                                     collate_fn=collator,
                                                     sampler=train_sampler)
    val_audio_loader = torch.utils.data.DataLoader(audio_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=collator,
                                                   sampler=val_sampler)

    # Get images
    train_image_loader = torch.utils.data.DataLoader(image_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=2,
                                                     collate_fn=collator,
                                                     sampler=train_sampler)
    val_image_loader = torch.utils.data.DataLoader(image_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=collator,
                                                   sampler=val_sampler)

    # Load target text
    train_target_loader = torch.utils.data.DataLoader(target_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=2,
                                                      collate_fn=target_collator,
                                                      sampler=train_sampler)
    val_target_loader = torch.utils.data.DataLoader(target_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=False,
                                                    num_workers=2,
                                                    collate_fn=target_collator,
                                                    sampler=val_sampler)

    # print("lens - train_text_loader {}, val_text_loader {}".format(len(train_text_loader), len(val_text_loader)))
    # print("lens - train_audio_loader {}, val_audio_loader {}".format(len(train_audio_loader), len(val_audio_loader)))
    # print("lens - train_image_loader {}, val_image_loader {}".format(len(train_image_loader), len(val_image_loader)))
    # print("lens - train_target_loader {}, val_target_loader {}".format(len(train_target_loader), len(val_target_loader)))

    # Create model
    model = MMBiDAF(hidden_size, text_embedding_size, audio_embedding_size,
                    image_embedding_size, device, drop_prob, max_text_length)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)  # for exponential moving average

    # Get saver
    # TODO: may need to change the metric name
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Let's do this!
    loss = 0
    eps = 1e-8
    log.info("Training...")
    steps_till_eval = args.eval_steps
    epoch = step // len(text_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f"Starting epoch {epoch}...")
        count_item = 0
        loss_epoch = 0
        with torch.enable_grad(), \
                tqdm(total=len(train_text_loader.dataset)) as progress_bar:
            for (batch_text, original_text_lengths), \
                    (batch_audio, original_audio_lengths), \
                    (batch_images, original_img_lengths), \
                    (batch_target_indices, batch_source_paths,
                     batch_target_paths, original_target_len) in zip(
                         train_text_loader, train_audio_loader,
                         train_image_loader, train_target_loader):
                loss = 0
                # TODO check error: max decoder timesteps for each batch
                max_dec_len = torch.max(original_target_len)

                # Transfer tensors to GPU
                batch_text = batch_text.to(device)
                log.info("Loaded batch text")
                batch_audio = batch_audio.to(device)
                log.info("Loaded batch audio")
                batch_images = batch_images.to(device)
                log.info("Loaded batch image")
                batch_target_indices = batch_target_indices.to(device)
                log.info("Loaded batch targets")

                # Setup for forward
                batch_size = batch_text.size(0)
                optimizer.zero_grad()

                log.info("Starting forward pass")
                # Forward
                batch_out_distributions, loss = model(
                    batch_text, original_text_lengths,
                    batch_audio, original_audio_lengths,
                    batch_images, original_img_lengths,
                    batch_target_indices, original_target_len,
                    max_dec_len)
                loss_val = loss.item()  # numerical value of loss
                loss_epoch = loss_epoch + loss_val
                count_item += 1

                log.info("Starting backward")
                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)  # to tackle exploding gradients
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    # TODO: add dev evaluation
                    # scores, results = evaluate(model, dev_loader, device,
                    #                            args.dev_eval_file,
                    #                            args.max_ans_len,
                    #                            args.use_squad_v2)
                    saver.save(step, model, device)
                    ema.resume(model)

                # Generate summary
                print('Generated summary for iteration {}: '.format(epoch))
                summaries = get_generated_summaries(batch_out_distributions,
                                                    original_text_lengths,
                                                    batch_source_paths)
                print(summaries)

                # Evaluation
                # rouge = Rouge()
                # rouge_scores = rouge.get_scores(batch_source_paths, batch_target_paths, avg=True)
                # print('Rouge score at iteration {} is {}: '.format(epoch, rouge_scores))

                # Generate output heatmaps
                # sns.set()
                # for idx in range(len(out_distributions)):
                #     # Convert each timestep distribution to a numpy array
                #     out_distributions[idx] = out_distributions[idx].squeeze(0).detach().numpy()
                # out_distributions = np.asarray(out_distributions)  # convert the timestep list to an array
                # ax = sns.heatmap(out_distributions)
                # fig = ax.get_figure()
                # fig.savefig(out_heatmaps_dir + str(epoch) + '.png')

        print("Epoch loss is : {}".format(loss_epoch / count_item))
def main(args):
    if args.large:
        args.train_record_file += '_large'
        args.dev_eval_file += '_large'
        model_name = "albert-xlarge-v2"
    else:
        model_name = "albert-base-v2"
    if args.xxlarge:
        args.train_record_file += '_xxlarge'
        args.dev_eval_file += '_xxlarge'
        model_name = "albert-xxlarge-v2"

    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    if args.bidaf:
        char_vectors = util.torch_from_json(args.char_emb_file)
    if args.model_name == 'albert_highway':
        model = models.albert_highway(model_name)
    elif args.model_name == 'albert_lstm_highway':
        model = models.LSTM_highway(model_name, hidden_size=args.hidden_size)
    elif args.model_name == 'albert_bidaf':
        model = models.BiDAF(char_vectors=char_vectors,
                             hidden_size=args.hidden_size,
                             drop_prob=args.drop_prob)
    elif args.model_name == 'albert_bidaf2':
        model = models.BiDAF2(model_name=model_name,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    else:
        model = AlbertForQuestionAnswering.from_pretrained(args.model_name)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2, args.bidaf)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    dev_dataset = SQuAD(args.dev_eval_file, args.use_squad_v2, args.bidaf)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)
    with open(args.dev_gold_file) as f:
        gold_dict = json.load(f)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch in train_loader:
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    'start_positions': batch[3],
                    'end_positions': batch[4],
                }
                if args.bidaf:
                    inputs['char_ids'] = batch[6]
                y1 = batch[3]
                y2 = batch[4]

                # Setup for forward
                batch_size = inputs["input_ids"].size(0)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(**inputs)
                y1, y2 = y1.to(device), y2.to(device)
                outputs = model(**inputs)
                loss = outputs[0]
                loss = loss.mean()
                # loss_fct = nn.CrossEntropyLoss()
                # loss = loss_fct(log_p1, y1) + loss_fct(log_p2, y2)
                # loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(args, model, dev_dataset,
                                                  dev_loader, gold_dict,
                                                  tokenizer, device,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
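# For reference, the Hugging Face QA head used above returns the loss as the
# first output when start/end positions are supplied, which is why the loop
# reads `outputs[0]` and then averages over DataParallel replicas. A minimal
# forward call under that assumption (downloads pretrained weights; the
# question/context strings and positions below are made up for illustration):
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
qa_model = AlbertForQuestionAnswering.from_pretrained("albert-base-v2")
enc = tokenizer("Who wrote Hamlet?",
                "Hamlet was written by Shakespeare.",
                return_tensors="pt")
outputs = qa_model(**enc,
                   start_positions=torch.tensor([10]),
                   end_positions=torch.tensor([10]))
loss = outputs[0]  # the QA loss; averaged across GPUs under nn.DataParallel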
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get size of char vocab
    with open(args.char2idx_file, 'r') as fh:
        char_vocab_size = len(json_load(fh))

    # Get model
    log.info('Building model...')
    model = QANet(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  char_vocab_size=char_vocab_size,
                  char_emb_size=args.char_emb_size,
                  word_char_emb_size=args.word_char_emb_size,
                  drop_prob=args.drop_prob,
                  num_blocks_embd=args.num_blocks_embd,
                  num_conv_embd=args.num_conv_embd,
                  kernel_size=args.kernel_size,
                  num_heads=args.num_heads,
                  num_blocks_model=args.num_blocks_model,
                  num_conv_model=args.num_conv_model,
                  dropout_char=args.dropout_char,
                  dropout_word=args.dropout_word,
                  survival_prob=args.survival_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=1,
                           betas=(args.beta1, args.beta2),
                           eps=args.adam_eps,
                           weight_decay=args.l2_wd,
                           params=model.parameters())
    cr = args.lr / math.log2(args.warm_up)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < args.warm_up else args.lr)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    torch.autograd.set_detect_anomaly(True)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
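# The scheduler above is QANet-style logarithmic warmup: the optimizer's base
# lr is fixed at 1, so the LambdaLR multiplier *is* the effective learning
# rate, rising as cr * log2(step + 1) until `warm_up` and then holding at
# args.lr. A quick numeric check of that lambda, assuming lr=1e-3 and
# warm_up=1000 (illustrative values):
import math

lr, warm_up = 1e-3, 1000
cr = lr / math.log2(warm_up)
for ee in [0, 10, 100, 999, 1000, 5000]:
    mult = cr * math.log2(ee + 1) if ee < warm_up else lr
    print(f"update {ee:5d}: lr = {mult:.6f}")  # ramps up to 1e-3, then flat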
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # Set device
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:{}".format(args.gpu_ids[0]))
        args.batch_size *= max(1, len(args.gpu_ids))
        print(f"device is cuda: gpu_ids = {args.gpu_ids}")
    else:
        device = torch.device("cpu")
        print("device is cpu")

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    # Wouldn't a large max_count lead to meaningless probabilities?
    model = NAQANet(device,
                    word_vectors,
                    char_vectors,
                    c_max_len=args.context_limit,
                    q_max_len=args.question_limit,
                    answering_abilities=['passage_span_extraction', 'counting'],
                    max_count=args.max_count)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    lr = args.lr
    base_lr = 1.0
    warm_up = args.lr_warm_up_num
    params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(lr=base_lr,
                                 betas=(args.beta1, args.beta2),
                                 eps=1e-7,
                                 weight_decay=3e-7,
                                 params=params)
    cr = lr / math.log(warm_up)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warm_up else lr)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = DROP(args.train_record_file)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = DROP(args.dev_record_file)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, \
                    qw_idxs, qc_idxs, \
                    start_idxs, end_idxs, \
                    counts, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                start_idxs = start_idxs.to(device)
                end_idxs = end_idxs.to(device)
                counts = counts.to(device)
                ids = ids.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                output_dict = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                                    ids, start_idxs, end_idxs, counts)
                loss = output_dict["loss"]
                loss = torch.sum(loss, dim=0) / len(args.gpu_ids)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
                optimizer.step()
                scheduler.step()
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

    # Save the model
    print("Saving the model ...")
    torch.save(model.state_dict(), args.model_dir)
    print("Done!")
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    # Earlier baselines, kept for reference:
    # model = BiDAF(word_vectors=word_vectors,
    #               hidden_size=args.hidden_size,
    #               drop_prob=args.drop_prob)
    # model = charBiDAF(word_vectors=word_vectors,
    #                   char_vectors=char_vectors,
    #                   emb_size=char_vectors.size(1),
    #                   hidden_size=args.hidden_size,
    #                   drop_prob=args.drop_prob)
    model = QANet(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  emb_size=char_vectors.size(1),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    log.info('Entering QANet model training...')
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    # print("Model status - ", cuda_or_cpu(model))
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    # torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 300
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                # print(torch.cuda.memory_allocated(device=None))
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)
                # torch.cuda.empty_cache()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    # print('Memory 1: ', torch.cuda.memory_allocated())
                    ema.assign(model)
                    # print('Memory 2: ', torch.cuda.memory_allocated())
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, type="train")
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get your model
    log.info('Building model...')
    model, step = get_model(log, args)
    model = model.to(device)
    model.train()

    # Exponential moving average
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           betas=[0.8, 0.999], eps=1e-7,
                           weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda step: 1)

    # Get loss computer
    cri = FocalLoss(alpha=torch.tensor([args.alpha, 1]).to(device),
                    gamma=args.gamma)

    # Get data loader
    log.info('Building dataset...')
    dev_dataset = util.load_dataset(args.dev_file,
                                    args.PPI_dir,
                                    args.PPI_gene_feature_dir,
                                    args.PPI_gene_query_dict_dir,
                                    args.max_nodes,
                                    train=False)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=util.collate_fn)
    train_dataset = util.load_dataset(args.train_file,
                                      args.PPI_dir,
                                      args.PPI_gene_feature_dir,
                                      args.PPI_gene_query_dict_dir,
                                      args.max_nodes)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=util.collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch_a, batch_bio_a, batch_A, batch_b, batch_bio_b, batch_B, batch_y in train_loader:
                # Setup for forward
                batch_a = batch_a.to(device)
                batch_bio_a = batch_bio_a.to(device)
                batch_A = batch_A.to(device)
                batch_bio_b = batch_bio_b.to(device)
                batch_b = batch_b.to(device)
                batch_B = batch_B.to(device)
                batch_y = batch_y.to(device)
                batch_y = batch_y.long()
                batch_size = batch_bio_a.size(0)
                optimizer.zero_grad()

                # Forward
                output = model(batch_a, batch_bio_a, batch_A,
                               batch_b, batch_bio_b, batch_B)
                loss = cri(output, batch_y)
                # loss = F.nll_loss(output, batch_y)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/Loss', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results = evaluate(model, dev_loader, cri, device)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.5f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
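# A minimal sketch of a two-class focal loss matching the call signature used
# above (`FocalLoss(alpha=..., gamma=...)` applied to logits and integer
# labels): alpha is a per-class weight and gamma down-weights easy examples.
# This is one common formulation, not necessarily the project's exact code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma=2.0):
        super().__init__()
        self.alpha = alpha  # tensor of per-class weights, e.g. [alpha, 1]
        self.gamma = gamma

    def forward(self, logits, target):
        # logits: (batch, num_classes); target: (batch,) class indices
        log_pt = F.log_softmax(logits, dim=-1).gather(1, target.unsqueeze(1)).squeeze(1)
        pt = log_pt.exp()
        at = self.alpha.gather(0, target)
        return (-at * (1.0 - pt) ** self.gamma * log_pt).mean()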
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) log.info('Loading word2Idx...') word2Idx = json.loads(open(args.word2idx_file).read()) Idx2Word = {v: k for (k, v) in word2Idx.items()} vocab_size = len(word2Idx) print(f"Vocab Size is : {vocab_size}") def getWords(idxList): words = [] for i in idxList: words.append(Idx2Word[i]) return words def create_new_model(): if args.model_type == "seq2seq": return Seq2Seq(word_vectors=word_vectors, hidden_size=args.hidden_size, output_size=vocab_size, device=device) elif args.model_type == "seq2seq_attn": return Seq2SeqAttn(word_vectors=word_vectors, hidden_size=args.hidden_size, output_size=vocab_size, device=device) elif args.model_type == "transformer": return TransformerModel(vocab_size, device, num_encoder_layers=2, num_decoder_layers=2, dropout=0.1) #return make_model(vocab_size, vocab_size, N=2, dropout=0.0) # Get model log.info('Building model...') model = create_new_model() model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() # Get saver saver = util.CheckpointSaver(args.save_dir, args.best_model_name, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) model_save_path = os.path.join(args.save_dir, args.best_model_name) # Initialize optimizer and loss function optimizer = NoamOpt( model.module.src_embed[0].d_model, 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)) criterion = nn.NLLLoss(ignore_index=PAD, reduction='sum') loss_compute = SimpleLossCompute(criterion, optimizer) # Default project starter code uses Adadelta, but we're going to use Adam # optimizer = torch.optim.Adam(model.parameters(), lr=float(args.lr)) num_trial = 0 train_iter = patience = total_loss = report_loss = total_words = report_words = 0 total_examples = report_examples = epoch = valid_num = 0 train_time = begin_time = time.time() # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: train_iter += 1 cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Setup for forward src_idxs = cw_idxs src_idxs = torch.cat( 
(torch.zeros((batch_size, 1), device=device, dtype=torch.long), src_idxs, torch.zeros((batch_size, 1), device=device, dtype=torch.long)), dim=-1) src_idxs[:, 0] = SOS src_idxs[:, -1] = EOS tgt_idxs = qw_idxs[:, :-1] tgt_idxs_y = qw_idxs[:, 1:] src_mask = src_idxs == PAD tgt_mask = tgt_idxs == PAD # Forward if args.model_type in ['seq2seq', 'seq2seq_attn']: log_p = model(src_idxs, tgt_idxs) # (batch_size, q_len, vocab_size) elif args.model_type == 'transformer': log_p = model(src_idxs, tgt_idxs, src_mask, tgt_mask) # (batch_size, q_len, vocab_size) log_p = log_p.contiguous().view(-1, log_p.size(-1)) tgt_idxs_y = tgt_idxs_y.contiguous().view(-1) tgt_no_pad = tgt_idxs_y != PAD tgt_len = tgt_no_pad.sum(-1) batch_words = torch.sum(tgt_len).item() report_words += batch_words total_words += batch_words report_examples += batch_size total_examples += batch_size batch_loss = loss_compute(log_p, tgt_idxs_y, batch_words, model.training) report_loss += batch_loss total_loss += batch_loss # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=batch_loss) if train_iter % args.log_every == 0: log.info('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_words, math.exp(report_loss / report_words), total_examples, report_words / (time.time() - train_time), time.time() - begin_time)) train_time = time.time() report_loss = report_words = report_examples = 0. # Perform validation if train_iter % args.valid_niter == 0: print('begin validation ...', file=sys.stderr) # Compute dev metrics results = evaluate(model, dev_loader, device, args.use_squad_v2) log.info( 'validation: iter %d, dev avg.
loss %.2f, dev. ppl %f' % (train_iter, results['NLL'], results['PPL'])) if saver.is_best(results[args.metric_name]): log.info('saving the best model so far to [%s]' % model_save_path) saver.save(step, model, results[args.metric_name], device)
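# The NoamOpt wrapper used above (constructed from
# model.module.src_embed[0].d_model, factor 1, and 2000 warmup steps) is not
# shown in this file. Below is a minimal sketch of the schedule such a wrapper
# conventionally implements -- the inverse-square-root warmup from "Attention
# Is All You Need". Treat the exact class as an assumption, not the project's
# util code.
class NoamOptSketch:
    """Scales the wrapped optimizer's LR as factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)."""

    def __init__(self, d_model, factor, warmup, optimizer):
        self.d_model, self.factor, self.warmup = d_model, factor, warmup
        self.optimizer = optimizer
        self._step = 0

    def rate(self, step=None):
        step = self._step if step is None else step
        return self.factor * self.d_model ** -0.5 * min(step ** -0.5, step * self.warmup ** -1.5)

    def step(self):
        self._step += 1  # increment first so rate() never sees step 0
        for group in self.optimizer.param_groups:
            group['lr'] = self.rate()
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()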
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # NEW : load the tag embeddings pos_vectors = util.torch_from_json(args.pos_emb_file) ner_vectors = util.torch_from_json(args.ner_emb_file) # Add loss if 'loss' in args.name: distance_criterion = DistanceFromAnswerLoss(coefficient=.5, device=device, normalization=True, penalization_type='quadratic', reduction='mean') # Choose model log.info('Building model {}...'.format(args.name)) if 'baseline' in args.name: model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'BiDAF_char': model = BiDAF_char(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_loss'): model = BiDAF_tag(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_unfrozen_loss'): model = BiDAF_tag(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob, freeze_tag=False) elif args.name == 'BiDAF_tag_ext': model = BiDAF_tag_ext(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'BiDAF_tag_ext_unfrozen': model = BiDAF_tag_ext(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob, freeze_tag=False) elif args.name == 'coattn': model = CoattentionModel(hidden_dim=args.hidden_size, embedding_matrix=word_vectors, train_word_embeddings=False, dropout=0.35, pooling_size=16, number_of_iters=4, number_of_layers=2, device=device) else: raise NameError('No model named ' + args.name) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) 
# Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, y1, y2, ids in train_loader: # NEW # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward if 'baseline' in args.name: log_p1, log_p2 = model(cw_idxs, qw_idxs) elif args.name == 'BiDAF_char': # Additional setup for forward cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_loss') or (args.name == 'BiDAF_tag_unfrozen_loss'): # Additional setup for forward cc_idxs = cc_idxs.to(device) cpos_idxs = cpos_idxs.to(device) cner_idxs = cner_idxs.to(device) qc_idxs = qc_idxs.to(device) qpos_idxs = qpos_idxs.to(device) qner_idxs = qner_idxs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs) elif (args.name == 'BiDAF_tag_ext') or (args.name == 'BiDAF_tag_ext_unfrozen'): # Additional setup for forward cc_idxs = cc_idxs.to(device) cpos_idxs = cpos_idxs.to(device) cner_idxs = cner_idxs.to(device) cw_ems = cw_ems.to(device) cw_tfs = cw_tfs.to(device) qc_idxs = qc_idxs.to(device) qpos_idxs = qpos_idxs.to(device) qner_idxs = qner_idxs.to(device) qw_ems = qw_ems.to(device) qw_tfs = qw_tfs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs, cw_ems, qw_ems, cw_tfs, qw_tfs) elif args.name == 'coattn': max_c_len = cw_idxs.size(1) max_q_len = qw_idxs.size(1) c_len = [] q_len = [] for i in range(cw_idxs.size(0)): if len((cw_idxs[i] == 0).nonzero()) != 0: c_len_i = (cw_idxs[i] == 0).nonzero()[0].item() else: c_len_i = cw_idxs.size(1) if len((qw_idxs[i] == 0).nonzero()) != 0: q_len_i = (qw_idxs[i] == 0).nonzero()[0].item() else: q_len_i = qw_idxs.size(1) c_len.append(int(c_len_i)) q_len.append(int(q_len_i)) c_len = torch.Tensor(c_len).int() q_len = torch.Tensor(q_len).int() num_examples = int(cw_idxs.size(0) / len(args.gpu_ids)) log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs, c_len, q_len, num_examples, True, True) else: raise NameError('No model named ' + args.name) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) # Add distance penalization if 'loss' in args.name: loss += distance_criterion(log_p1, y1) + distance_criterion(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) 
progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, args.name, args.gpu_ids) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
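# Nearly every script in this file calls util.EMA(model, decay),
# ema(model, num_updates), ema.assign(model), and ema.resume(model), but the
# helper itself is never shown. The sketch below follows the convention from
# the SQuAD starter code these scripts derive from (shadow weights updated
# with a ramped decay, swapped in only for evaluation); treat the details as
# assumptions rather than the project's exact util code.
class EMA:
    """Exponential moving average of model parameters."""

    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
        self.original = {}

    def __call__(self, model, num_updates):
        # Ramp the decay early in training so the average warms up quickly
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = (1.0 - decay) * param.data + decay * self.shadow[name]

    def assign(self, model):
        # Swap the averaged weights in for evaluation
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.original[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def resume(self, model):
        # Restore the live training weights afterwards
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.original[name])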
def main(args): IMAGE_TRAIN_PIXEL_FILE = "./data/train.csv" MODEL_SAVED_DIR = "./save" MODEL_SAVED_DIR = util.get_save_dir(MODEL_SAVED_DIR, args[1], True) LOG_DIR = "./log" dr = digit_recognizer() log = util.get_basic_logger(LOG_DIR, "train") # Process the data before giving it to the model images_data, label_data = load_image_binary_data(IMAGE_TRAIN_PIXEL_FILE) load_first_image = images_data.iloc[:1, :] load_first_image = np.array(load_first_image).reshape(28, 28) pdf = PdfPages( './data/first_image_pass_first_cnnlayer_with_relu_maxpool_outchannel_5_with_topmost_original_image.pdf' ) log.info("Display original image") display_image(load_first_image, pdf) load_first_image = torch.from_numpy(load_first_image).type( torch.FloatTensor) load_first_image = load_first_image.unsqueeze(0).unsqueeze(0) #output = dr(load_first_image) # Display the first image after passing through one CNN layer with ReLU and maxpool # for filter_no in range(OUT_CHANNEL): # display_image(output[0][filter_no].detach().numpy(),pdf) # pdf.close() # Divide the train data into train data and dev eval data # It seems adding 1 as an extra dimension does not have any impact images_data = np.array(images_data).reshape(-1, 1, 28, 28) label_data = np.array(label_data) x_train, x_valid, y_train, y_valid = train_test_split(images_data, label_data, test_size=0.2) log.info('The shape of training data: {}'.format(x_train.shape)) log.info("The shape of testing data: {}".format(x_valid.shape)) log.info("The shape of training label data: {}".format(y_train.shape)) log.info("The shape of testing label data: {}".format(y_valid.shape)) # Initialize the model model = digit_recognizer() device, gpu_ids = util.get_available_devices() log.info("The device on which the model is running is: {}".format(device)) model.to(device) train_dataset = torch.utils.data.TensorDataset( torch.from_numpy(x_train).float(), torch.from_numpy(y_train).long()) eval_dataset = torch.utils.data.TensorDataset( torch.from_numpy(x_valid).float(), torch.from_numpy(y_valid).long()) # Create train dataloader object train_loader = data.DataLoader(dataset=train_dataset, batch_size=150, shuffle=True) # Create eval dataloader object eval_loader = data.DataLoader(dataset=eval_dataset, batch_size=150, shuffle=False) # Initialize the optimizer optimizer = optim.Adam(model.parameters(), lr=0.0001) # Initialize the cross entropy loss loss_fn = nn.CrossEntropyLoss() # Declare the CheckpointSaver to save the model saver = util.CheckpointSaver(MODEL_SAVED_DIR, max_checkpoints=4, metric_name='Accuracy', maximize_metric=True, log=log) # Initialize TensorBoard with the path of the model save dir tbx = SummaryWriter(MODEL_SAVED_DIR) epoch = int(args[2]) global_step_train = 0 global_step_eval = 0 for i in range(epoch): log.info("Starting training") global_step_train = train(train_loader, model, loss_fn, device, optimizer, tbx, log, global_step_train) log.info("Starting evaluation") accuracy, global_step_eval = evaluate(eval_loader, model, loss_fn, device, tbx, log, global_step_eval) log.info("Completed epoch {}".format(i)) saver.save(i, model, accuracy, device)
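# The digit-recognizer script above delegates each epoch to train(...) and
# evaluate(...) helpers defined elsewhere. A plausible sketch of the train()
# pass, matching the call site's signature; the body is an assumption, not
# the project's actual helper.
import torch

def train(loader, model, loss_fn, device, optimizer, tbx, log, global_step):
    model.train()
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(images)           # (batch, 10) class scores
        loss = loss_fn(logits, labels)   # CrossEntropyLoss supplied by the caller
        loss.backward()
        optimizer.step()
        global_step += images.size(0)
        tbx.add_scalar('train/CE', loss.item(), global_step)
    log.info('Finished training pass at step {}'.format(global_step))
    return global_step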
def train(args): args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) if args.gpu_ids == 'cpu': device, args.gpu_ids = torch.device('cpu'), [] else: device, args.gpu_ids = util.get_available_devices() log.info('training on device {} with gpu_id {}'.format(str(device), str(args.gpu_ids))) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) log.info('Building model...') if args.task == 'tag': model = SummarizerLinear() # model = SummarizerLinearAttended(128, 256) # model = SummarizerRNN(128, 256) else: model = SummarizerAbstractive(128, 256, device) if len(args.gpu_ids) > 0: model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ## get a saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.l2_wd) log.info('Building dataset...') data_path = PROCESSED_DATA_SUPER_TINY if args.split == 'super_tiny' else PROCESSED_DATA with open(data_path, 'rb') as f: all_data = pickle.load(f) if 'tiny' in args.split: train_split = all_data['tiny'] dev_split = all_data['tiny'] else: train_split = all_data['train'] dev_split = all_data['dev'] train_dataset = SummarizationDataset( train_split['X'], train_split['y'], train_split['gold']) dev_dataset = SummarizationDataset( dev_split['X'], dev_split['y'], dev_split['gold']) collate_fn = tag_collate_fn if args.task == 'tag' else decode_collate_fn train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, collate_fn=collate_fn) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False, collate_fn=collate_fn) ## Train! 
log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) batch_num = 0 with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for X, y, _ in train_loader: batch_size = X.size(0) batch_num += 1 X = X.to(device) y = y.float().to(device) # (batch_size, max_len) for tag, (batch_size, 110) for decode optimizer.zero_grad() if args.task == 'tag': logits = model(X) # (batch_size, max_len) mask = (X != PAD_VALUE).float() # 1 for real data, 0 for pad, size of (batch_size, max_len) loss = (F.binary_cross_entropy_with_logits(logits, y, reduction='none') * mask).mean() loss_val = loss.item() else: logits = model(X, y[:, :-1]) # (batch_size, 109, max_len) loss = sum(F.cross_entropy(logits[i], y[i, 1:], ignore_index=-1, reduction='mean')\ for i in range(batch_size)) / batch_size loss_val = loss.item() loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() # scheduler.step(step // batch_size) # Log info step += args.batch_size progress_bar.update(args.batch_size) progress_bar.set_postfix(epoch=epoch, Loss=loss_val) tbx.add_scalar('train/Loss', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) results, pred_dict = evaluate(args, model, dev_loader, device) if results is None: log.info('Selected predicted no select for all in batch') continue saver.save(step, model, results[args.metric_name], device) # # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step)
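# Why the tagging branch above multiplies the per-token BCE by a mask:
# positions where X == PAD_VALUE are padding and must not contribute to the
# loss. A self-contained toy version of the same pattern (PAD_VALUE = 0 here
# is just for illustration):
import torch
import torch.nn.functional as F

PAD_VALUE = 0
X = torch.tensor([[5, 9, 0, 0], [3, 7, 2, 0]])          # token ids, 0 = pad
logits = torch.randn(2, 4)                              # per-token tag scores
y = torch.tensor([[1., 0., 0., 0.], [0., 1., 1., 0.]])  # per-token gold tags
mask = (X != PAD_VALUE).float()                         # 1 for real tokens, 0 for pad
per_token = F.binary_cross_entropy_with_logits(logits, y, reduction='none')
loss = (per_token * mask).mean()  # as in the script; dividing by mask.sum()
                                  # instead would average over real tokens only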
def main(args): # Set up faulthandler faulthandler.enable() # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model_params = { 'word_vectors': word_vectors, 'char_vectors': char_vectors, 'args': args } model = get_model(args.model, model_params) print('Model size: {:f} MB'.format( sum(p.nelement() * p.element_size() for p in model.parameters()) / (1024 * 1024))) # model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: progress_bar.set_description( 'Batch data_loading finished'.ljust(30)) progress_bar.refresh() # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() progress_bar.set_description( 'Batch initialization finished'.ljust(30)) progress_bar.refresh() # Forward faulthandler.dump_traceback_later(timeout=3) log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) faulthandler.cancel_dump_traceback_later() progress_bar.set_description( 'Batch forward finished'.ljust(30)) progress_bar.refresh() y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward faulthandler.dump_traceback_later(timeout=3) loss.backward() faulthandler.cancel_dump_traceback_later() progress_bar.set_description( 'Batch backward finished'.ljust(30)) progress_bar.refresh() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() progress_bar.set_description('Optimization finished'.ljust(30)) 
progress_bar.refresh() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) progress_bar.set_description( 'Evaluation finished'.ljust(30)) progress_bar.refresh() saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
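# The script above brackets the forward and backward passes with
# faulthandler's watchdog: dump_traceback_later(timeout=3) arms a timer that
# dumps every thread's traceback to stderr if the bracketed call is still
# running after 3 seconds, and cancel_dump_traceback_later() disarms it --
# a cheap stdlib way to locate hangs. The pattern in isolation:
import faulthandler
import time

faulthandler.enable()
faulthandler.dump_traceback_later(timeout=3)  # arm the watchdog
time.sleep(1)                                 # a "fast" step: no dump fires
faulthandler.cancel_dump_traceback_later()    # disarm once the step returns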
def main(data, flags): # Set up logging and devices log_dir = data.logging_dir log = util.get_logger(log_dir, "toy") tbx = SummaryWriter(data.logging_dir) device, data.gpu_ids = util.get_available_devices() log.info('Config: {}'.format(dumps(vars(data), indent=4, sort_keys=True))) data.batch_size *= max(1, len(data.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(data.random_seed)) random.seed(data.random_seed) np.random.seed(data.random_seed) torch.manual_seed(data.random_seed) torch.cuda.manual_seed_all(data.random_seed) if flags[1] == "toy": word_emb_file = data.toy_word_emb_file training_data = data.toy_record_file_exp3 test_data = data.dev_record_file_exp3 eval_file = data.toy_eval_exp3 elif flags[1] == "train": word_emb_file = data.word_emb_file training_data = data.train_record_file_exp3 test_data = data.dev_record_file_exp3 eval_file = data.train_eval_exp3 elif flags[1] == "dev": word_emb_file = data.word_emb_file training_data = data.dev_record_file_exp3 test_data = data.toy_record_file_exp3 eval_file = data.dev_eval_exp3 # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=data.hidden_size, drop_prob=data.drop_prob) model = nn.DataParallel(model, data.gpu_ids) if data.load_path: log.info('Loading checkpoint from {}...'.format(data.load_path)) model, step = util.load_model(model, data.load_path, data.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, data.ema_decay) # Get saver saver = util.CheckpointSaver(data.logging_dir, max_checkpoints=10, metric_name=data.metric_name, maximize_metric=data.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), data.learning_rate, weight_decay=data.learning_weight_decay) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) 
# Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD3(training_data, use_v2=True) train_loader = torchdata.DataLoader(train_dataset, batch_size=data.batch_size, shuffle=True, num_workers=data.num_workers, collate_fn=collate_fn) test_dataset = SQuAD3(test_data, use_v2=True) test_loader = torchdata.DataLoader(test_dataset, batch_size=data.batch_size, shuffle=False, num_workers=data.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = data.eval_steps epoch = step // len(train_dataset) while epoch != data.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log.info("cw_idxs length: {}".format(len(cw_idxs))) log.info("qw_idxs length: {}".format(len(qw_idxs))) log.info("cw_idxs bytes: {}".format(cw_idxs.element_size() * cw_idxs.nelement())) log.info("qw_idxs bytes: {}".format(qw_idxs.element_size() * qw_idxs.nelement())) log.info("cw_idxs shape: {}".format(cw_idxs.shape)) log.info("qw_idxs shape: {}".format(qw_idxs.shape)) log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), data.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('toy/NLL', loss_val, step) tbx.add_scalar('toy/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = data.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, test_loader, device, eval_path=eval_file, max_len=sys.maxsize, use_squad_v2=True) saver.save(step, model, results[data.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=step, split='dev', num_visuals=data.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed # To make the data generation of every experiment the same log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) print(word_vectors.size()) print(char_vectors.size()) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader # Each record holds: (context_idxs (context_len,): indices of the words in the context, # context_char_idxs (context_len, max_word_len): indices of the characters in the context, # question_idxs (question_len,): indices of the words in the question, # question_char_idxs (question_len, max_word_len): indices of the characters in the question, # y1: answer start index, -1 if no answer, # y2: answer end index, -1 if no answer, # id: ID of the example) log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward # cw_idxs: indices of the words in the context # cc_idxs: indices of the characters in the context # qw_idxs: indices of the words in the query # qc_idxs: indices of the characters in the query cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) # cw_idxs has shape (batch_size, context_len) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) # L(theta) = - 1/N * sum(log(P1_yi_1) + log(P2_yi_2)) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step()
scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
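# The loss used in the loop above is the span objective written out in the
# comment: the mean negative log-likelihood of the gold start index under the
# predicted start distribution, plus the same for the end index. A tiny
# worked example with made-up shapes:
import torch
import torch.nn.functional as F

log_p1 = torch.log_softmax(torch.randn(3, 50), dim=-1)  # start dist: batch=3, c_len=50
log_p2 = torch.log_softmax(torch.randn(3, 50), dim=-1)  # end dist
y1 = torch.tensor([4, 17, 0])                           # gold start indices
y2 = torch.tensor([6, 17, 0])                           # gold end indices
loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)  # each term averages over the batch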
def __init__(self, args, gpu_num: int, train_loader, val_loader, crop_sz=125, output_sz=121, lambda0=1e-4, padding=2.0, output_sigma_factor=0.1): self.crop_sz = crop_sz self.output_sz = output_sz self.lambda0 = lambda0 self.padding = padding output_sigma = crop_sz / (1 + padding) * output_sigma_factor self.args = args self.gpu_num = gpu_num self.train_loader = train_loader self.val_loader = val_loader self.batch_size = args.batch_size * gpu_num self.best_loss = 1e6 # shape: 121, 121 self.y = torch.tensor( util.gaussian_shaped_labels( output_sigma, [self.output_sz, self.output_sz]).astype(np.float32)).cuda() # shape: 1, 1, 121, 61, 2 self.yf = fft.rfftn(self.y.view(1, 1, self.output_sz, self.output_sz), dim=[-2, -1]) # Shape: 121, 121 self.initial_y = self.y.clone() # Shape: batch, 1, 121, 61 self.label = self.yf.repeat(self.batch_size, 1, 1, 1) self.model = DCFNet(lambda0=self.lambda0).cuda() print('GPU NUM: {:2d}'.format(gpu_num)) if gpu_num > 1: self.model = torch.nn.DataParallel(self.model, list(range(gpu_num))).cuda() self.criterion = nn.MSELoss(reduction='sum').cuda() self.optimizer = torch.optim.SGD(self.model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( self.optimizer, gamma=util.compute_lr_gamma(args.lr, 1e-5, args.epochs)) # Bring the lr scheduler to the first epoch for epoch in range(args.start_epoch): self.lr_scheduler.step() # for training self.target = self.y.unsqueeze(0).unsqueeze(0).repeat( args.batch_size * gpu_num, 1, 1, 1) # optionally resume from a checkpoint if args.resume: if isfile(args.resume): print(f"=> loading checkpoint '{args.resume}'") checkpoint = torch.load(args.resume) self.args.start_epoch = checkpoint['epoch'] self.best_loss = checkpoint['best_loss'] self.model.load_state_dict(checkpoint['state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer']) print( f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})" ) else: print(f"=> no checkpoint found at '{args.resume}'") cudnn.benchmark = True checkpoint_path = args.save if args.save else config.checkpoint_root self.checkpoint_saver = util.CheckpointSaver(save_path=os.path.join( checkpoint_path, f'crop_{args.input_sz:d}_{args.padding:1.1f}'), verbose=True)
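# util.gaussian_shaped_labels(sigma, [H, W]) above builds the regression
# target for the correlation filter: a 2-D Gaussian whose peak is circularly
# shifted to the top-left corner, so the filter response aligns with zero
# shift. A sketch under those assumptions (the project's util may differ in
# details):
import numpy as np

def gaussian_shaped_labels(sigma, sz):
    h, w = sz
    ys, xs = np.mgrid[1:h + 1, 1:w + 1].astype(np.float64)
    g = np.exp(-0.5 * (((xs - w // 2) ** 2 + (ys - h // 2) ** 2) / sigma ** 2))
    g = np.roll(g, -w // 2 + 1, axis=1)      # move the peak's column to 0
    return np.roll(g, -h // 2 + 1, axis=0)   # move the peak's row to 0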
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) max_len = 10 # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) ch_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, ch_vectors=ch_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) ans_lens = y2 - y1 loss = 0 for i in range(max_len): mask = ((torch.ones_like(y1) * i) == ans_lens).long() y = y1 * mask loss += F.nll_loss(log_p[:, :, i], y) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) if step % (50 * batch_size) == 0: print(loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model)
results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
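# The loss loop above treats span prediction as max_len parallel classifiers:
# slice i of log_p (batch, c_len, max_len) scores start positions for answers
# of exactly length i, and each example should train only the slice matching
# its gold length. One caveat: y1 * mask maps every non-matching example to
# index 0, so each slice also receives noisy supervision pushing those
# examples toward position 0. A variant that skips them instead (a sketch,
# not the author's code):
import torch
import torch.nn.functional as F

def length_bucketed_loss(log_p, y1, y2, max_len):
    ans_lens = y2 - y1
    loss = log_p.new_zeros(())
    for i in range(max_len):
        sel = ans_lens == i              # examples whose answer has length i
        if sel.any():
            loss = loss + F.nll_loss(log_p[sel, :, i], y1[sel])
    return loss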
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) # Comment out to only use 1 GPU on nv12 args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = None max_context_len, max_question_len = args.para_limit, args.ques_limit if (args.model_type == "bidaf" or args.model_type == "bert-bidaf"): model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif (args.model_type == "dcn" or args.model_type == "bert-dcn"): model = DCN(word_vectors=word_vectors, hidden_size=args.hidden_size, max_context_len=max_context_len, max_question_len=max_question_len, drop_prob=args.drop_prob) elif (args.model_type == "bert-basic"): model = BERT(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) if model is None: raise ValueError('Model is unassigned. Please ensure --model_type \ chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic} ') model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) 
# Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) count_skip = 0 while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: batch_size = cw_idxs.size(0) count_skip += 1 if (args.skip_examples == True and (count_skip % 5 == 1 or count_skip % 5 == 2 or count_skip % 5 == 3 or count_skip % 5 == 4)): step += batch_size progress_bar.update(batch_size) steps_till_eval -= batch_size continue # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() ## Additions for BERT ## max_context_len, max_question_len = args.para_limit, args.ques_limit if "bert" in args.model_type: bert_train_embeddings = get_embeddings( "train", ids, args.para_limit, args.ques_limit) else: bert_train_embeddings = None # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings, \ max_context_len, max_question_len, device) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, args) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vec = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') if args.name == 'baseline': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'charembeddings': model = BiDAFChar(word_vectors=word_vectors, char_vec=char_vec, word_len=16, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'charembeddings2': model = BiDAFChar2(word_vectors=word_vectors, char_vec=char_vec, word_len=16, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'qanet': model = QANet(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, total_prob=args.total_drop, final_prob=args.final_prob) elif args.name == 'qanet2': model = QANet2(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, rel=args.rel_att, total_prob=args.total_drop, final_prob=args.final_prob, freeze=args.freeze_emb) elif args.name == 'qanet3': model = QANet3(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, rel=args.rel_att, total_prob=args.total_drop, final_prob=args.final_prob, freeze=args.freeze_emb) elif args.name == 'qanet4': model = QANet4(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, rel=args.rel_att, total_prob=args.total_drop, final_prob=args.final_prob, freeze=args.freeze_emb) else: raise ValueError('Wrong model name') model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler if args.name == 'qanet': optimizer = optim.Adam(model.parameters(), args.lr, betas=(0.8, 0.999), weight_decay=3 * 1e-7, eps=1e-7) scheduler = warmup(optimizer, 1, 2000) elif args.opt == 'adam': if args.grad_cent: optimizer = AdamWGC(model.parameters(), args.lr, betas=(0.9, 0.999), 
weight_decay=3 * 1e-7, eps=1e-7, use_gc=True) else: optimizer = AdamW(model.parameters(), args.lr, betas=(0.8, 0.999), weight_decay=3 * 1e-7, eps=1e-7) scheduler = warmup(optimizer, 1, 2000) elif args.opt == 'adadelta': optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=3 * 1e-7) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR elif args.opt == 'sgd': optimizer = optim.SGD(model.parameters(), args.lr, weight_decay=3 * 1e-7) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) i = 0 while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() i += 1 loss /= args.acc_step # Backward loss.backward() if i % args.acc_step == 0: nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(i // (args.acc_step)) ema(model, i // (args.acc_step)) optimizer.zero_grad() # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0 and i % args.acc_step == 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
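# The loop above accumulates gradients: the loss is divided by args.acc_step
# and backward() runs every batch, but clipping, the optimizer step, the
# EMA/scheduler updates, and zero_grad() run only every acc_step batches --
# simulating a batch acc_step times larger than what fits in memory. The
# pattern in isolation (names here are illustrative):
import torch

def train_with_accumulation(model, loader, optimizer, loss_fn, acc_step=4, max_grad_norm=5.0):
    optimizer.zero_grad()
    for i, (x, y) in enumerate(loader, start=1):
        loss = loss_fn(model(x), y) / acc_step  # scale so accumulated grads average
        loss.backward()                         # grads sum across micro-batches
        if i % acc_step == 0:                   # one real update per acc_step batches
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()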
def main(): torch.backends.cudnn.enabled = False # Make save dir for logfiles and state_dicts run_name = "optflow_nvidia" save_path = osp.join("save", run_name) save_dir = util.get_save_dir(save_path, run_name, training=True) # unique save dir log = util.get_logger(save_dir, run_name) # logger saver = util.CheckpointSaver( save_dir, # save model max_checkpoints=10, maximize_metric=False, metric_name="MSE", log=log) # Data, batches & epochs max_epoch = 25 batch_size = 32 window = 7 train_loader, val_loader = create_datasplit(batch_size=batch_size, window=window) # Model Creation log.info("Building model") # model = resnet18(sample_size=(240, 640), sample_duration=2*window+1, shortcut_type="A", num_classes=1) # model = C3D() model = nvidia() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = nn.DataParallel( model ) # If loading state dict throws KeyError ie. unexpected keys "module.convX.X" model.to(device) model.train() # Possibly load model state_dict here step = 0 # load_path = 'c3d.pickle' # log.info('Loading checkpoint from {}...'.format(load_path)) # model.load_state_dict(torch.load(load_path)) # model = util.load_model(model, load_path, 0) # uses the saved step num # Loss & Optimizer criterion = nn.MSELoss() lr = 1e-4 weight_decay = 1e-5 # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=.9, weight_decay=weight_decay) optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) # Log args # log.info('Args: {}'.format(dumps(vars({"lr": lr, "max_epoch": max_epoch, "batch_size": batch_size, # "window_size": window}), indent=4, sort_keys=True))) # Initialize epoch and step_to_eval counters steps_till_eval = int(.4 * len(train_loader.dataset)) epoch = step // len(train_loader.dataset) while epoch != max_epoch: epoch += 1 log.info("=============Epoch %i=============" % epoch) with torch.enable_grad(), tqdm( total=len(train_loader.dataset)) as progress_bar: for sample_batch in train_loader: model.zero_grad() # Zero gradients after each batch x, y = sample_batch x = x.to(device) y = y.to(device) f = model(x) loss = criterion(f, y) loss.backward() optimizer.step() step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, MSE=loss.item()) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = int(.4 * len(train_loader.dataset)) # Eval on validation set val_mse = evalu(model, val_loader, device) # Save checkpoint saver.save(step, model, val_mse, device) # Print to console results_str = "MSE: " + str(val_mse) log.info('Dev {}'.format(results_str))
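# evalu(model, val_loader, device) above is expected to return a scalar
# validation MSE. The helper is not shown in this file; a plausible
# implementation under that contract (an assumption):
import torch
import torch.nn as nn

def evalu(model, val_loader, device):
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    total, count = 0.0, 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            total += criterion(model(x), y).item()
            count += y.numel()
    model.train()
    return total / count  # mean squared error over all targets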
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') if args.model_name == 'sketchy': model = SketchyReader(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, char_embed_drop_prob=args.char_embed_drop_prob, num_heads=args.num_heads, drop_prob=args.drop_prob) # SKETCHY elif args.model_name == 'intensive': model = IntensiveReader(word_vectors=word_vectors, char_vectors=char_vectors, num_heads=args.num_heads, char_embed_drop_prob=args.char_embed_drop_prob, hidden_size=args.hidden_size, drop_prob=args.drop_prob) # INTENSIVE elif args.model_name == 'retro': model = RetroQANet(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, num_heads=args.num_heads, char_embed_drop_prob=args.char_embed_drop_prob, intensive_path=args.load_path_i, sketchy_path=args.load_path_s, gpu_ids=args.gpu_ids, drop_prob=args.drop_prob) # Outer model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # setup losses bceLoss = nn.BCELoss() # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) if args.optim == "adam": optimizer = optim.Adam( model.parameters(), 0.001, betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) 
# Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward y1, y2 = y1.to(device), y2.to(device) if args.model_name == 'sketchy': yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) loss = bceLoss(yi, torch.where(y1 == 0, 0, 1).float()) elif args.model_name == 'intensive': yi, log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) loss = args.alpha_1 * bceLoss(yi, torch.where(y1 == 0, 0, 1).float()) + args.alpha_2 * (F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)) elif args.model_name == 'retro': log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) else: raise ValueError('invalid --model_name: expected sketchy, intensive, or retro') loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_( model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/' + args.model_name, loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, model_name=args.model_name, a1=args.alpha_1, a2=args.alpha_2) saver.save( step, model, results[args.metric_name], device, model_name=args.model_name) ema.resume(model) # Log to console results_str = ', '.join( f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
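# The intensive branch above trains two heads jointly, Retro-Reader style: a
# binary answerability probability yi against the indicator (y1 != 0), plus
# the usual span NLL, mixed with weights alpha_1 and alpha_2. The same
# objective in isolation (a sketch; yi is assumed to already be a sigmoid
# probability, matching the nn.BCELoss used above):
import torch
import torch.nn.functional as F

def intensive_loss(yi, log_p1, log_p2, y1, y2, alpha_1=0.5, alpha_2=0.5):
    has_answer = (y1 != 0).float()                # index 0 marks "no answer"
    cls = F.binary_cross_entropy(yi, has_answer)  # answerability head
    span = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
    return alpha_1 * cls + alpha_2 * span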
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Load embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Build QA model
    log.info('Building model...')
    model = QA_Model(word_vectors=word_vectors, hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob, attention_type=args.attention_type,
                     train_embeddings=args.train_embeddings)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        # Load QA model from file
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    # Get saver
    saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric, log=log)

    # Get optimizer
    optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd)
    # optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size,
                                 shuffle=False, num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file, args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')
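# A small worked example of the step/epoch bookkeeping used throughout these
# scripts, where step counts examples (it advances by batch_size each
# iteration) and the epoch is recovered after loading a checkpoint as
# step // len(train_dataset). The numbers below are illustrative.
dataset_size = 100
batch_size = 32
step = 0
for _ in range(2):  # two passes over the data
    for start in range(0, dataset_size, batch_size):
        step += min(batch_size, dataset_size - start)  # last batch is smaller
assert step == 200
assert step // dataset_size == 2  # epoch recovered from a checkpointed step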
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    # word_vectors: word vector tensor of dimension [vocab_size * wemb_dim]
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = QANet(word_vectors, char_vectors, args.para_limit, args.ques_limit,
                  args.f_model, num_head=args.num_head,
                  train_cemb=(not args.pretrained_char))
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric, log=log)

    # Get optimizer and scheduler: Adam with a logarithmic LR warm-up
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(params=parameters, lr=args.lr,
                           betas=(args.beta1, args.beta2), eps=1e-8,
                           weight_decay=3e-7)
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else 1)
    # Note: CrossEntropyLoss expects raw logits (it applies log_softmax itself)
    # and already averages over the batch, so the torch.mean below is a no-op
    # on the resulting scalar.
    loss_f = torch.nn.CrossEntropyLoss()

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size,
                                 shuffle=False, num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2))
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file, args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx, pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file, step=step,
                                   split='dev', num_visuals=args.num_visuals)
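# The QANet scheduler above warms the LR up logarithmically: the LambdaLR
# multiplier is cr * log(ee + 1) with cr = 1 / log(lr_warm_up_num), rising
# smoothly from 0 to 1 and then holding at 1. A standalone sketch with an
# illustrative warm-up of 1000 "epochs" (really batch counts, as passed in
# via scheduler.step above):
import math

lr_warm_up_num = 1000
cr = 1.0 / math.log(lr_warm_up_num)
for ee in (0, 10, 100, 999, 1000):
    mult = cr * math.log(ee + 1) if ee < lr_warm_up_num else 1
    print(ee, round(mult, 3))  # 0.0, 0.347, 0.668, 1.0, 1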
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric, log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size,
                                 shuffle=False, num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            optimizer.zero_grad()

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss_val = loss.item()

            # Backward
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step(step // batch_size)
            ema(model, step // batch_size)

            # Log info
            step += batch_size
            # Note: step advances by batch_size, so this only fires when
            # batch_size divides 1000 evenly
            if step % 1000 == 0 and step > 0:
                log.info(f'Step {step}: training loss {loss_val}...')
            steps_till_eval -= batch_size
            if steps_till_eval <= 0:
                steps_till_eval = args.eval_steps

                # Evaluate and save checkpoint
                log.info(f'Evaluating at step {step}...')
                ema.assign(model)
                results, pred_dict = evaluate(model, dev_loader, device,
                                              args.dev_eval_file, args.max_ans_len,
                                              args.use_squad_v2)
                saver.save(step, model, results[args.metric_name], device)
                ema.resume(model)

                # Log to console
                results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                log.info(f'Dev {results_str}')

                # Log to TensorBoard
                log.info('Visualizing in TensorBoard...')
                for k, v in results.items():
                    tbx.add_scalar(f'dev/{k}', v, step)
                util.visualize(tbx, pred_dict=pred_dict,
                               eval_path=args.dev_eval_file, step=step,
                               split='dev', num_visuals=args.num_visuals)
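# util.EMA's internals are not shown in this file; the sketch below is one
# plausible implementation of the interface the loops above rely on --
# ema(model, num_updates) after each optimizer step, ema.assign(model) before
# evaluation, ema.resume(model) afterwards -- not necessarily the actual class.
import torch

class EMASketch:
    """Exponential moving average of model parameters."""
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {n: p.data.clone() for n, p in model.named_parameters()
                       if p.requires_grad}
        self.backup = {}

    def __call__(self, model, num_updates):
        # Ramp the decay up early in training so the average is not stale
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = (1.0 - decay) * p.data + decay * self.shadow[n]

    def assign(self, model):
        # Swap the averaged weights in for evaluation
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.data
                p.data = self.shadow[n]

    def resume(self, model):
        # Restore the raw training weights
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data = self.backup[n]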
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(vectors=(word_vectors, char_vectors),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob,
                  p_sdd=args.p_sdd,
                  char_limit=args.char_limit,
                  use_transformer=args.use_transformer,
                  inter_size=args.inter_size,
                  heads=args.heads,
                  c2w_size=args.c2w_size,
                  enc_blocks=args.enc_blocks,
                  enc_convs=args.enc_convs,
                  mod_blocks=args.mod_blocks,
                  mod_convs=args.mod_convs,
                  use_GRU=args.use_GRU)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        # Uses the saved step number
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric, log=log)

    # Get optimizer and scheduler
    # optimizer = optim.Adadelta(model.parameters(), args.lr,
    #                            weight_decay=args.l2_wd)
    # The scheduler MULTIPLIES the base LR, it does NOT replace it, so the
    # Adam base LR is set to 1. and the lambda returns the absolute LR.
    optimizer = optim.Adam(model.parameters(), 1., betas=(.9, .98), eps=1e-9,
                           weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(
        optimizer,
        lambda s: 0.001 * math.log(s + 1) / math.log(1000 - 1) if s < 1000 else 0.001)
    # Chute warm-up (must use math.log, else TypeError)
    # scheduler = sched.LambdaLR(optimizer,
    #                            lambda s: (args.hidden_size ** (-.5)) *
    #                            min((s + 1e-9) ** (-.5), s * (4000 ** (-1.5))))
    # From Vaswani et al., 2017

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size,
                                 shuffle=False, num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                optimizer.zero_grad()
                batch_size = cw_idxs.size(0)
                cc_idxs = cc_idxs.to(device)  # (batch, c_limit, char_limit)
                qc_idxs = qc_idxs.to(device)
                cw_idxs = cw_idxs.to(device)  # (batch, c_limit)
                qw_idxs = qw_idxs.to(device)
                c_idxs, q_idxs = (cw_idxs, cc_idxs), (qw_idxs, qc_idxs)

                # Forward
                log_p1, log_p2 = model(c_idxs, q_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                # LambdaLR schedules per epoch by default, so pass the batch
                # count in as the "epoch"
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                # step counts examples; step is usually the number of (mini-)batches
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file, args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx, pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file, step=step,
                                   split='dev', num_visuals=args.num_visuals)
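# The base-LR comment above is easy to verify: LambdaLR multiplies the
# optimizer's base LR by the lambda's return value, so with a base LR of 1.
# the lambda's output is the effective LR. A tiny standalone check:
import torch
from torch import nn, optim
from torch.optim import lr_scheduler as sched

param = nn.Parameter(torch.zeros(1))
opt = optim.Adam([param], lr=1.0)                 # base LR = 1
scheduler = sched.LambdaLR(opt, lambda s: 0.001)  # effective LR = 1 * 0.001
print(opt.param_groups[0]['lr'])                  # 0.001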
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    # TODO: check whether all of the embedding files are needed
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric, log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size,
                                 shuffle=False, num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file, args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx, pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file, step=step,
                                   split='dev', num_visuals=args.num_visuals)
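# Every loop in this file calls nn.utils.clip_grad_norm_ before the optimizer
# step. A tiny standalone demonstration of what it does: gradients are
# rescaled in place so their global norm does not exceed max_norm.
import torch
from torch import nn

p = nn.Parameter(torch.ones(4))
(10.0 * p).sum().backward()            # grad = [10, 10, 10, 10], norm = 20
nn.utils.clip_grad_norm_([p], max_norm=5.0)
print(p.grad)                          # [2.5, 2.5, 2.5, 2.5], norm = 5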
def main(args):
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    tbx = SummaryWriter(args.save_dir)

    # The checkpoint saver lets us save the model
    saver = util.CheckpointSaver(args.save_dir, max_checkpoints=15,
                                 metric_name='accuracy', maximize_metric=True,
                                 log=log)

    # Build model here
    log.info('Building model')
    # model = VGGLinear()
    # model = Baseline(8 * 96 * 64)
    # model = VGGLSTM()
    # model = Resnet()
    model = TimeCNN()
    model = model.double()
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoints')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(.9, .999), eps=1e-08)
    # These are the parameters used with VGGLinear:
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(.9, .999),
    #                        eps=1e-08, weight_decay=.001)
    log.info('Building Dataset')
    # These are the parameters that worked with the best TimeCNN model
    optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(.9, .999),
                           eps=1e-08, weight_decay=.005)
    train_dataset = Shots("videos/train.h5py", "labels/train.npy")
    train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                   shuffle=True, num_workers=4,
                                   collate_fn=collate_fn)
    dev_dataset = Shots("videos/dev.h5py", "labels/dev.npy")
    dev_loader = data.DataLoader(dev_dataset, batch_size=BATCH_SIZE,
                                 shuffle=False, num_workers=4,
                                 collate_fn=collate_fn)

    log.info('Training')
    steps_til_eval = 2000
    for epoch in range(30):
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for frames, ys in train_loader:
                batch_size = frames.shape[0]
                step += batch_size
                frames = frames.to(device)
                ys = ys.to(device)
                optimizer.zero_grad()

                # Forward pass
                scores = model(frames)
                loss = F.cross_entropy(scores, ys)
                loss_val = loss.item()

                # Backward pass
                loss.backward()
                optimizer.step()

                # Some logging
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                steps_til_eval -= batch_size
                if steps_til_eval <= 0:
                    steps_til_eval = 2000
                    # dev_loss avoids shadowing the training loss tensor above
                    results, dev_loss = evaluate(model, dev_loader, device)

                    # Save checkpoint
                    saver.save(step, model, results, device)
                    log.info('Dev Accuracy ' + str(results))
                    log.info('Dev loss ' + str(dev_loss))

                    # Log to TensorBoard
                    tbx.add_scalar('dev_accuracy', results, step)
                    tbx.add_scalar('dev_loss', dev_loss, step)
    tbx.close()
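# evaluate() for the video classifier is not defined in this file; given how
# its return values are used above (a scalar accuracy and a dev loss), a
# minimal version could look like the sketch below. The body is an assumption,
# not the actual implementation.
import torch
import torch.nn.functional as F

def evaluate_sketch(model, loader, device):
    model.eval()
    correct, total, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for frames, ys in loader:
            frames, ys = frames.to(device), ys.to(device)
            scores = model(frames)
            loss_sum += F.cross_entropy(scores, ys, reduction='sum').item()
            correct += (scores.argmax(dim=1) == ys).sum().item()
            total += ys.size(0)
    model.train()
    return correct / total, loss_sum / total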