def main():
    """Train a TEMlp classifier and report accuracy after every epoch.

    Builds train/test splits with ``gen_seq_data``, standardises both splits
    with a scaler fitted on the training split only, moves the model to the
    GPU and runs 60 epochs of ``train`` followed by ``validate``.
    """
    window = 3  # 1, 3 or 5
    hidden_units = 30
    # classes = [1, 2, 6, 7, 8]  # case1
    # classes = [3, 4, 5, 9, 10, 11, 12]  # case2
    classes = list(range(1, 22))  # total

    x_train, y_train = gen_seq_data(classes, window, is_train=True)
    x_test, y_test = gen_seq_data(classes, window, is_train=False)

    # The scaler only ever sees training data; the test split reuses its
    # statistics.
    standardizer = preprocessing.StandardScaler().fit(x_train)
    x_train = standardizer.transform(x_train)
    x_test = standardizer.transform(x_test)

    def as_loader(features, labels, shuffle):
        """Wrap numpy arrays in a TensorDataset-backed loader (batch size 32)."""
        tensors = tchdata.TensorDataset(torch.from_numpy(features),
                                        torch.from_numpy(labels))
        return tchdata.DataLoader(tensors, batch_size=32, shuffle=shuffle)

    train_loader = as_loader(x_train, y_train, True)
    test_loader = as_loader(x_test, y_test, False)

    # Input width is 52 * window -- presumably 52 features per sample; confirm
    # against gen_seq_data.
    net = TEMlp(52 * window, hidden_units, len(classes))
    net.cuda()
    torch.backends.cudnn.benchmark = True
    adam = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=0.005)

    report = '{}\tepoch = {}\ttrain accuracy: {:0.3f}\ttest accuracy: {:0.3f}'
    for epoch in range(60):
        train_acc = train(net, adam, train_loader)
        test_acc = validate(net, test_loader)
        print(report.format(datetime.now(), epoch, train_acc, test_acc))
def main():
    """Train the caption decoder with BLEU-based early stopping, then save.

    Builds the Flickr train/validation datasets, creates the encoder, decoder
    and embedding modules, and trains for up to ``epochs`` epochs. Training
    stops at the first epoch whose validation BLEU does not improve on the best
    score so far. The three modules are then saved with ``torch.save``.
    """
    ### Create the torch datasets and get the size of the 'on-the-fly'
    ### created vocabulary and the length of the longest caption
    train_dataset = loadData.FlickrTrainDataset(images_folder, captions_folder,
                                                trans, 'TRAIN')
    val_dataset = loadData.FlickrValDataset(images_folder, captions_folder,
                                            trans, 'VAL')
    voc_size = train_dataset.getVocabSize()
    max_capt = train_dataset.getMaxCaptionsLength()

    ### Create the models
    encoder = model.Encoder()
    decoder = model.Decoder(encoder_dim=2048,
                            decoder_dim=512,
                            attention_dim=256,
                            vocab_size=voc_size)
    embedding = model.Embedding(vocab_size=voc_size, embedding_dim=128)

    ### Set the optimizer for the decoder (the only component that is
    ### actually trained) and the device for the model tensors.
    # The learning rate was written as ``e - 3``, which is either a NameError
    # or (with math.e in scope) a negative rate; 1e-3 is the intended value.
    decoder_optimizer = torch.optim.Adam(
        params=(p for p in decoder.parameters() if p.requires_grad),
        lr=1e-3)
    encoder.to(device)
    decoder.to(device)
    embedding.to(device)

    ### Create the data loaders for training and evaluation
    loader_train = DataLoader(train_dataset,
                              32,
                              sampler=sampler.SubsetRandomSampler(
                                  range(30000)))
    # NOTE(review): the validation sampler also draws from indices 0..29999;
    # this assumes the validation set has at least 30000 items -- confirm.
    val_loader = DataLoader(val_dataset,
                            32,
                            sampler=sampler.SubsetRandomSampler(range(30000)))

    best_bleu = 0  # The best BLEU score so far
    for _ in range(epochs):
        ## One epoch's training
        train.train(data_loader=loader_train,
                    encoder=encoder,
                    decoder=decoder,
                    embedding=embedding,
                    max_caption_length=max_capt,
                    optim=decoder_optimizer)

        ## One epoch's validation
        # The original passed ``max_capt`` positionally after keyword
        # arguments, which is a SyntaxError.
        # NOTE(review): the keyword name mirrors train.train(); confirm it
        # matches train.validate()'s signature.
        new_bleu = train.validate(data_loader=val_loader,
                                  encoder=encoder,
                                  decoder=decoder,
                                  embedding=embedding,
                                  max_caption_length=max_capt)

        if new_bleu <= best_bleu:
            ## No improvement since last time, so we don't train more
            break
        best_bleu = new_bleu

    ## Save the model for deploying
    torch.save(encoder, 'Encoder')
    torch.save(decoder, 'Decoder')
    torch.save(embedding, 'Embedding')
def main():
    """Evaluate a checkpointed model on the "test" split and print its MAE.

    Raises:
        ValueError: if ``args.resume`` does not point to an existing file.
    """
    args = get_args()

    if args.opts:
        cfg.merge_from_list(args.opts)
    cfg.freeze()

    # create model
    arch = cfg.MODEL.ARCH
    print("=> creating model '{}'".format(arch))
    net = get_model(model_name=arch, pretrained=None)
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    net = net.to(device)

    # load checkpoint (fail early when it is missing)
    ckpt_path = args.resume
    if not Path(ckpt_path).is_file():
        raise ValueError("=> no checkpoint found at '{}'".format(ckpt_path))
    print("=> loading checkpoint '{}'".format(ckpt_path))
    state = torch.load(ckpt_path, map_location="cpu")
    net.load_state_dict(state['state_dict'])
    print("=> loaded checkpoint '{}'".format(ckpt_path))

    if use_cuda:
        cudnn.benchmark = True

    face_test_set = FaceDataset(args.data_dir,
                                "test",
                                img_size=cfg.MODEL.IMG_SIZE,
                                augment=False)
    # The worker count is taken from the TRAIN section of the config, as in
    # the original.
    face_test_loader = DataLoader(face_test_set,
                                  batch_size=cfg.TEST.BATCH_SIZE,
                                  shuffle=False,
                                  num_workers=cfg.TRAIN.WORKERS,
                                  drop_last=False)

    print("=> start testing")
    _, _, test_mae = validate(face_test_loader, net, None, 0, device)
    print(f"test mae: {test_mae:.3f}")
def main():
    """Evaluate a checkpointed model on the age test set and print its MAE.

    The model is placed on ``cuda:5`` when CUDA is available (CPU otherwise).
    With more than one visible GPU it is wrapped in ``DataParallel`` over GPUs
    5, 3 and 4.

    Raises:
        ValueError: if ``args.resume`` does not point to an existing file.
    """
    args = get_args()

    if args.opts:
        cfg.merge_from_list(args.opts)
    cfg.freeze()

    # create model
    print("=> creating model ")
    model = get_model()
    device = "cuda:5" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # load checkpoint
    resume_path = args.resume
    if not Path(resume_path).is_file():
        raise ValueError("=> no checkpoint found at '{}'".format(resume_path))
    # The closing quote was missing from this message in the original.
    print("=> loading checkpoint '{}'".format(resume_path))
    checkpoint = torch.load(resume_path, map_location="cpu")
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}'".format(resume_path))

    if 'cuda' in device:
        cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        # DataParallel requires the module's parameters to live on
        # device_ids[0]. The model sits on cuda:5, so GPU 5 must be listed
        # first; the original order [3, 4, 5] raises a RuntimeError at forward.
        model = torch.nn.DataParallel(model, device_ids=[5, 3, 4])

    age_test_dataset = dataset.AgeDataset(args.age_dir,
                                          'test',
                                          cfg.MODEL.IMG_SIZE,
                                          augment=False)
    age_test_loader = DataLoader(dataset=age_test_dataset,
                                 batch_size=cfg.TEST.BATCH_SIZE,
                                 shuffle=True)

    # gender_test_dataset = dataset.GenderDataset(args.gender_dir,'test',cfg.MODEL.IMG_SIZE,augment=False)
    # gender_test_loader = DataLoader(dataset=gender_test_dataset,batch_size=cfg.TEST.BATCH_SIZE,shuffle=True,
    #                                 num_workers=cfg.TEST.WORKERS)
    #
    # hat_test_dataset = dataset.HatDataset(args.hat_dir,'test',cfg.MODEL.IMG_SIZE,augment=False)
    # hat_test_loader = DataLoader(dataset=hat_test_dataset,batch_size=cfg.TEST.BATCH_SIZE,shuffle=True,
    #                              num_workers=cfg.TEST.WORKERS)
    #
    # glasses_test_dataset = dataset.GlassDataset(args.glasses_dir,'test',cfg.MODEL.IMG_SIZE,augment=False)
    # glasses_test_loader = DataLoader(dataset=glasses_test_dataset,batch_size=cfg.TEST.BATCH_SIZE,shuffle=True,
    #                                  num_workers=cfg.TEST.WORKERS)

    print("=> start testing")
    _, _, test_mae = validate(age_test_loader, model, None, 0, device, 'age')
    #gender_loss,gender_acc = validate(gender_test_loader,model,None,0,device,'gender')
    #hat_loss,hat_acc = validate(hat_test_loader,model,None,0,device,'hat')
    #glasses_loss,glasses_acc = validate(glasses_test_loader,model,None,0,device,'glasses')

    print(f"age test mae: {test_mae:.3f}")
def test(test_dataloader, args, writer, log):
    """Score the saved best NER model on the test set and report its F1.

    Loads ``global_best_model.bin`` from ``args.output_dir`` into a fresh
    ``train.MyNERModel`` on the GPU, evaluates it with ``train.validate`` and
    writes the F1 score to stdout and to ``log``. ``writer`` is accepted for
    signature compatibility but is not used.
    """
    ner_model = train.MyNERModel(args).cuda()

    weights_path = os.path.join(args.output_dir, 'global_best_model.bin')
    ner_model.load_state_dict(torch.load(weights_path))
    ner_model.eval()

    f1 = train.validate(ner_model, test_dataloader, args, log, is_test=True)

    print('-----------Test set, F1-Score: %.4f-----------' % f1)
    log.write('-----------Test set, F1-Score: %.4f----------- \n' % f1)
def run_ctc():
    """Main function for running the program.

    Dispatches on the parsed command-line flags:

    * ``feature_transform`` -- compute features from the TIMIT directory and
      store them in ``<feature_dir>/features.pickle``;
    * ``train_unirnn`` -- train from the stored features and write the model
      under ``<train_output_dir>/inference_model``;
    * otherwise -- validate a trained model on one audio/phoneme file pair.

    Every missing-input condition prints a message and exits with status 1.
    Previously the validation branch only printed a message for a missing
    phoneme or model file and then ran validation anyway, and the other checks
    exited with the success status 0.
    """
    args = parse_args()

    # for the feature transforming
    if args.feature_transform:
        timit_dir = args.timit_dir
        feature_dir = args.feature_dir
        if not os.path.exists(timit_dir):
            print("TIMIT directory doesnot exist!")
            sys.exit(1)
        if not os.path.exists(feature_dir):
            os.makedirs(feature_dir)
        featpickle_path = os.path.join(feature_dir, "features.pickle")
        featext.compute_and_store_features(timit_dir, featpickle_path)
        return

    # for training
    if args.train_unirnn:
        outputdir = args.train_output_dir
        featdir = args.feature_dir
        if not os.path.exists(outputdir):
            os.makedirs(outputdir)
        featpickle_file = os.path.join(featdir, "features.pickle")
        if not os.path.exists(featpickle_file):
            print("features not exist. Please run feature transform first")
            sys.exit(1)
        model_path = os.path.join(outputdir, "inference_model")
        train.train(model_path, featpickle_file)
        return

    # for validation
    model_path = os.path.join(args.model_input_dir, "inference_model.meta")
    audio_path = args.validation_audio_path
    phoneme_path = args.validation_phoneme_path
    required_files = (
        (audio_path, "audio file not exist!"),
        (phoneme_path, "phoneme file not exist!"),
        (model_path, "model not exist! please train first"),
    )
    for required_path, message in required_files:
        if not os.path.isfile(required_path):
            print(message)
            # Stop here: validating with a missing input cannot succeed.
            sys.exit(1)
    train.validate(audio_path, phoneme_path, model_path)
def main():
    """Evaluate a checkpointed model on the "test" split.

    Prints the age MAE and the gender accuracy returned by ``validate``.

    With more than one visible GPU the model is wrapped in ``DataParallel``
    over GPUs 1, 2, 4 and 5 and ``device`` is switched to the first of them.

    Raises:
        ValueError: if ``args.resume`` does not point to an existing file.
    """
    args = get_args()

    if args.opts:
        cfg.merge_from_list(args.opts)
    cfg.freeze()

    # create model
    print("=> creating model '{}'".format(cfg.MODEL.ARCH))
    model = get_model(model_name=cfg.MODEL.ARCH, pretrained=None)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # TODO: delete
    gpu_ids = [1, 2, 4, 5]
    if torch.cuda.device_count() > 1:
        print("Let's use [1,2,4,5] GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        # DataParallel requires the module's parameters on device_ids[0].
        # The original left the model on the default "cuda" device (GPU 0)
        # while device_ids started at 1, which raises a RuntimeError at
        # forward time. Inputs and the criterion follow `device` too.
        device = "cuda:{}".format(gpu_ids[0])
        model = nn.DataParallel(model.to(device), device_ids=gpu_ids)
    else:
        model = model.to(device)

    # load checkpoint
    # NOTE(review): when wrapped in DataParallel the state-dict keys are
    # expected to carry the "module." prefix -- confirm against how the
    # checkpoint was saved.
    resume_path = args.resume
    if not Path(resume_path).is_file():
        raise ValueError("=> no checkpoint found at '{}'".format(resume_path))
    print("=> loading checkpoint '{}'".format(resume_path))
    checkpoint = torch.load(resume_path, map_location="cpu")
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}'".format(resume_path))

    if device.startswith("cuda"):
        cudnn.benchmark = True

    test_dataset = FaceDataset(args.data_dir,
                               "test",
                               img_size=cfg.MODEL.IMG_SIZE,
                               augment=False)
    test_loader = DataLoader(test_dataset,
                             batch_size=cfg.TEST.BATCH_SIZE,
                             shuffle=False,
                             num_workers=cfg.TRAIN.WORKERS,
                             drop_last=False)
    criterion = nn.CrossEntropyLoss().to(device)

    print("=> start testing")
    _, _, test_mae, gen_acc = validate(test_loader, model, criterion, 0,
                                       device)
    print(f"Test age mae: {test_mae:.3f}")
    print(f"Test gender accuracy: {gen_acc:.2f}")
def main():
    """Run validation of a checkpointed SPVSE model on the COCO "val" split.

    Raises:
        ValueError: if ``args.dataset`` is not ``"coco"``. Previously this
            surfaced later as a NameError on the undefined ``val_dset``.
        AssertionError: if no checkpoint path was given.
    """
    args = parse_args()

    # Fail fast: "coco" is the only dataset this script knows how to build.
    if args.dataset != "coco":
        raise ValueError("unsupported dataset: {!r} (only 'coco' is "
                         "supported)".format(args.dataset))

    transform = transforms.Compose([
        transforms.Resize(args.imsize_pre),
        transforms.CenterCrop(args.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    val_dset = CocoDataset(
        root=args.root_path,
        split="val",
        transform=transform,
    )
    val_loader = DataLoader(
        val_dset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_cpu,
        collate_fn=collater,
    )

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    model = SPVSE(
        len(vocab),
        args.emb_size,
        args.out_size,
        args.max_len,
        args.cnn_type,
        args.rnn_type,
        pad_idx=vocab.padidx,
        bos_idx=vocab.bosidx,
    )

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    model = model.to(device)

    assert args.checkpoint is not None, "a checkpoint path is required"
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint),
          flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    model.load_state_dict(ckpt["model_state"])

    # The first argument (1000) is passed through unchanged.
    # NOTE(review): its meaning is defined by validate(); confirm there.
    validate(1000, val_loader, model, vocab, args)
def main():
    """Train a TextCNN on the Sina dataset, checkpointing after every epoch.

    Each epoch runs ``train_one_epoch`` and ``validate``, accumulates the
    returned loss/accuracy sequences and saves the model weights to
    ``./saved_models/epoch<N>.pkl``. Once ``args.max_epoch`` is reached, the
    accumulated curves are dumped to ``train-result.json`` and the loop ends.
    If ``args.max_epoch`` is falsy the loop never terminates.
    """

    def make_loader(filename):
        """Build a shuffled, drop-last loader over one JSON split."""
        split = SinaDataset(path.join(args.source, filename), input_dim)
        return DataLoader(split,
                          batch_size=args.bs,
                          shuffle=True,
                          drop_last=True)

    train_loader = make_loader('train.json')
    test_loader = make_loader('test.json')

    net = TextCNN(input_dim, 200)
    # net = MyLSTM(input_dim, hidden_dim=8)
    net = net.to(device)
    adam = optim.Adam(net.parameters(), args.lr, weight_decay=args.wd)

    # Per-epoch results are concatenated into these running curves.
    curves = {
        'train-loss': [],
        'train-accu': [],
        'valid-loss': [],
        'valid-accu': [],
    }

    epoch = 0
    while True:
        epoch += 1
        tr_loss, tr_accu = train_one_epoch(epoch, net, adam, train_loader,
                                           device, args.bs)
        va_loss, va_accu = validate(net, test_loader, device, args.bs)
        curves['train-loss'] += tr_loss
        curves['train-accu'] += tr_accu
        curves['valid-loss'] += va_loss
        curves['valid-accu'] += va_accu

        print('saving...')
        torch.save(net.state_dict(),
                   './saved_models/epoch' + str(epoch) + '.pkl')
        print()

        finished = args.max_epoch and epoch >= args.max_epoch
        if not finished:
            continue

        summary = {'batch-size': args.bs}
        summary.update(curves)
        with open('train-result.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f)
        break
def main():
    """Build the model, optionally restore a checkpoint, and print validation.

    Parses the command line into the module-level ``args``, creates the train
    and validation loaders, the model, the loss and an SGD optimizer, restores
    ``args.resume`` when it names an existing file, and finally prints
    whatever ``validate`` returns for the validation loader.
    """
    global args
    args = parser.parse_args()
    print('dataset:', args.root_path)
    print('end2end?:', args.end2end)

    # load image
    train_transform = transforms.Compose([
        transforms.CenterCrop(128),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ])
    val_transform = transforms.Compose([
        transforms.CenterCrop(128),
        transforms.ToTensor(),
    ])
    # The training loader is built (as in the original) even though only
    # validation runs below.
    train_loader = load_image(args.train_list, train_transform, True, True)
    val_loader = load_image(args.val_list, val_transform, False, True)

    # prepare model
    model = create_model(args.end2end)
    params = create_model_parameters(args, model)
    criterion = nn.CrossEntropyLoss().cuda()  # loss function
    optimizer = torch.optim.SGD(params,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if not os.path.isfile(args.resume):
            print("=> no checkpoint found at '{}'".format(args.resume))
        else:
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

    cudnn.benchmark = True

    result = validate(val_loader, model, criterion)
    print(result)
def check_ans(self, word, lvl, label, *args):
    """Show feedback for an answer and move on.

    Validates ``word`` with ``train.validate(word, *args)`` and colours
    ``label`` green (correct) or red (incorrect), with the matching feedback
    text from ``train.correct`` / ``train.incorrect``.

    * Correct: the label is shown for two seconds, then the homepage opens.
    * Incorrect: every tracked Button is destroyed and replaced by a single
      "Done" button that opens the homepage when clicked.

    Args:
        word: the answer being checked.
        lvl: level passed through to the feedback helpers.
        label: the Tk label that displays the feedback.
        *args: extra positional arguments forwarded to ``train.validate``.
    """
    result = tk.StringVar()
    correct = bool(train.validate(word, *args))
    if correct:
        result.set(train.correct(word, lvl))
        label.config(fg='green')
    else:
        result.set(train.incorrect(word, lvl))
        label.config(fg='red')
    label.config(textvariable=result)
    self.elements.append(label)

    if not correct:
        # Collect the buttons first, then remove them. The original removed
        # items from self.elements while iterating over it, which skips the
        # element after each removal and can leave buttons behind.
        stale_buttons = [
            el for el in self.elements if el.winfo_class() == 'Button'
        ]
        for button in stale_buttons:
            button.destroy()
            self.elements.remove(button)

        bt_review = tk.Button(text="Done",
                              width=10,
                              height=2,
                              bg="black",
                              font=(subtext_font, subtext_size),
                              fg="white",
                              anchor="center",
                              highlightthickness=0,
                              command=self.homepage)
        self.elements.append(bt_review)
        bt_review.pack(pady=(subtext_size, 0))
        label.pack()
        return

    label.pack()
    # Flush pending redraws so the feedback is visible during the pause.
    self.root.update_idletasks()
    time.sleep(2)
    self.homepage()
def train(args, trainer, task, epoch_itr, epoch_aux_itr):
    """Train the model for one epoch.

    Every step first runs ``trainer.train_step`` on a batch group from the
    auxiliary iterator with ``update_params=False``, calls
    ``trainer.optimizer.save_constraints()``, and only then runs the regular
    ``trainer.train_step`` on the main batch group.

    Args:
        args: run configuration; this function reads ``update_freq``,
            ``fix_batches_to_gpus``, ``valid_subset``, ``max_update`` and
            ``save_interval_updates``.
        trainer: object providing ``train_step``, ``get_meter``,
            ``get_num_updates`` and an ``optimizer``.
        task: forwarded unchanged to ``validate``.
        epoch_itr: epoch iterator over the main training data.
        epoch_aux_itr: epoch iterator over the auxiliary data.
    """
    # Update parameters every N batches
    # (past the end of the update_freq list, its last entry is reused)
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus)
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )

    # Auxiliary iterator
    # (restart_when_done=True is requested so that next() below keeps
    # yielding for as long as the main iterator runs -- presumably; confirm
    # in GroupedIterator)
    aux_itr = epoch_aux_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus)
    aux_itr = iterators.GroupedIterator(aux_itr,
                                        update_freq,
                                        restart_when_done=True)

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    # Only the first listed validation subset is used for mid-epoch checkpoints.
    first_valid = args.valid_subset.split(',')[0]
    # A falsy max_update means "no limit".
    max_update = args.max_update or math.inf
    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        # Record gradients from auxiliary data
        aux_samples = next(aux_itr)
        trainer.train_step(aux_samples, update_params=False)
        # if hasattr(trainer.optimizer, "save_constraints"):
        # NOTE(review): the guard above is commented out, so the call below is
        # unconditional and requires the optimizer to define save_constraints().
        trainer.optimizer.save_constraints()
        log_output = trainer.train_step(samples)
        if log_output is None:
            continue

        # log mid-epoch stats
        stats = get_training_stats(trainer)
        for k, v in log_output.items():
            if k in [
                    'loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size'
            ]:
                continue  # these are already logged above
            if 'loss' in k:
                # loss-like values are weighted by the sample size
                extra_meters[k].update(v, log_output['sample_size'])
            else:
                extra_meters[k].update(v)
            stats[k] = extra_meters[k].avg
        progress.log(stats)

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_meter('wps').reset()

        num_updates = trainer.get_num_updates()
        # Periodic mid-epoch validation + checkpoint every
        # save_interval_updates updates.
        if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0 and num_updates > 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    [first_valid])
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if num_updates >= max_update:
            break

    # log end-of-epoch stats
    stats = get_training_stats(trainer)
    for k, meter in extra_meters.items():
        stats[k] = meter.avg
    progress.print(stats)

    # reset training meters
    for k in [
            'train_loss',
            'train_nll_loss',
            'wps',
            'ups',
            'wpb',
            'bsz',
            'gnorm',
            'clip',
    ]:
        meter = trainer.get_meter(k)
        if meter is not None:
            meter.reset()
def evaluate_val_acc(model, data):
    """Return the accuracy ``validate`` reports under a cross-entropy loss.

    ``validate`` is expected to return a ``(loss, accuracy)`` pair; the loss
    is discarded.
    """
    _, accuracy = validate(model, data, nn.CrossEntropyLoss())
    return accuracy
def run(opts):
    """Set up model, baseline and optimizer, then train or evaluate.

    Seeds the RNGs (offset by the process rank), optionally configures wandb
    on rank 0, selects the device, optionally loads a checkpoint, builds the
    model (wrapped in SyncBatchNorm + DDP when CUDA is used), the baseline,
    the optimizer and the LR scheduler. It then runs ``validate`` when
    ``opts.eval_only`` is set, or ``train_epoch`` for ``opts.n_epochs`` epochs
    otherwise.

    Args:
        opts: run options. ``opts.device`` is always written, and
            ``opts.epoch_start`` is overwritten when resuming.

    Raises:
        AssertionError: if both ``load_path`` and ``resume`` are given, if
            ``opts.model`` or ``opts.baseline`` is unknown, or if a critic
            baseline is requested for a non-TSP problem.
    """
    rank = opts.local_rank if torch.cuda.device_count() > 1 else 0

    # Set the random seed
    torch.manual_seed(opts.seed + rank)
    random.seed(opts.seed + rank)
    np.random.seed(opts.seed + rank)

    if not os.path.exists(opts.save_dir) and rank == 0:
        os.makedirs(opts.save_dir)

    # Optionally configure wandb
    if not opts.no_wandb and rank == 0:
        # SECURITY NOTE(review): an API key is hard-coded here. It should be
        # rotated and supplied through the environment / wandb configuration
        # instead of living in source control.
        wandb.login('never', '31ce01e4120061694da54a54ab0dafbee1262420')
        wandb.init(dir=opts.save_dir,
                   config=opts,
                   project='large_scale_tsp',
                   name=opts.run_name,
                   sync_tensorboard=True,
                   save_code=True)

    # Set the device
    if opts.use_cuda:
        torch.cuda.set_device(rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        opts.device = torch.device("cuda", rank)
    else:
        opts.device = torch.device("cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        if rank == 0:
            print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the original formatted `model_class`, which
    # is always None when this assertion fires.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model: torch.nn.Module = model_class(
        opts.embedding_dim,
        opts.hidden_dim,
        problem,
        attention_type=opts.attention_type,
        n_encode_layers=opts.n_encode_layers,
        n_heads=opts.n_heads,
        feed_forward_dim=opts.feed_forward_dim,
        encoding_knn_size=opts.encoding_knn_size,
        decoding_knn_size=opts.decoding_knn_size,
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping,
        checkpoint_encoder=opts.checkpoint_encoder,
        shrink_size=opts.shrink_size).to(opts.device)

    if opts.init_normalization_parameters:
        for m in model.modules():
            if isinstance(m, Normalization):
                m.init_parameters()

    if opts.use_cuda:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(
            opts.device)
        model = DDP(model, device_ids=[rank])

    # Overwrite model parameters by parameters to load
    # (keys missing from the checkpoint keep their freshly initialised values)
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer (the critic gets its own learning rate, if any)
    param_groups = [{'params': model.parameters(), 'lr': opts.lr_model}]
    if len(baseline.get_learnable_parameters()) > 0:
        param_groups.append({
            'params': baseline.get_learnable_parameters(),
            'lr': opts.lr_critic
        })
    optimizer = optim.Adam(param_groups)
    scaler = torch.cuda.amp.GradScaler() if opts.precision == 16 else None

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        # The epoch number is parsed from the checkpoint file name: the
        # second "-"-separated field of the name without its extension.
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        if rank == 0:
            print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, scaler, baseline, lr_scheduler,
                        epoch, val_dataset, problem, opts)
def run(opts):
    """Set up model, baseline and optimizer, then train or evaluate.

    In training mode the per-epoch value returned by ``train_epoch`` is
    collected. Each time the elapsed wall-clock time passes one of the hour
    marks in ``opts.save_hrs``, the history, a state checkpoint and the full
    model are written under ``../models/att``.

    Args:
        opts: run options. ``opts.save_hrs`` is sorted in place and emptied as
            marks are reached; ``opts.device`` is always written, and
            ``opts.epoch_start`` is overwritten when resuming.

    Raises:
        AssertionError: if both ``load_path`` and ``resume`` are given, if
            ``opts.model`` or ``opts.baseline`` is unknown, or if a critic
            baseline is requested for a non-TSP problem.
        OSError: from ``os.makedirs`` if ``opts.save_dir`` already exists.
    """
    # start time
    start_time = time()
    train_run = []
    opts.save_hrs.sort()
    run_name = opts.run_name

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the original formatted `model_class`, which
    # is always None when this assertion fires.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    # (keys missing from the checkpoint keep their freshly initialised values)
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer (the critic gets its own learning rate, if any)
    param_groups = [{'params': model.parameters(), 'lr': opts.lr_model}]
    if len(baseline.get_learnable_parameters()) > 0:
        param_groups.append({
            'params': baseline.get_learnable_parameters(),
            'lr': opts.lr_critic
        })
    optimizer = optim.Adam(param_groups)

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        # The epoch number is parsed from the checkpoint file name: the
        # second "-"-separated field of the name without its extension.
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    # NOTE(review): assumed to run regardless of --resume; the flattened
    # source does not show the original nesting.
    torch.save(model, os.path.join('.', 'empty.pt'))

    if opts.eval_only:
        validate(model, val_dataset, opts)
        return

    for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
        avg_time = train_epoch(model, optimizer, baseline, lr_scheduler,
                               epoch, val_dataset, problem, tb_logger, opts,
                               start_time)
        train_run.append(avg_time)

        # Iterate over a snapshot: the original removed entries from
        # opts.save_hrs while iterating over it, which skips the mark that
        # follows each removed one.
        for hr in list(opts.save_hrs):
            if (time() - start_time) <= hr * 3600:
                continue
            opts.save_hrs.remove(hr)
            print('Saving model and state...')
            hr_time = int(round((time() - start_time) / 3600))
            with open(
                    '../models/att/hist_{}_{}hr.pickle'.format(
                        run_name, hr_time), 'wb') as handle:
                pickle.dump(train_run,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            torch.save(
                {
                    'model': get_inner_model(model).state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'rng_state': torch.get_rng_state(),
                    'cuda_rng_state': torch.cuda.get_rng_state_all(),
                    'baseline': baseline.state_dict()
                },
                os.path.join(
                    '../models/att',
                    '{}_{}hr-model-att-only.pt'.format(run_name, hr_time)))
            torch.save(
                model,
                os.path.join('../models/att',
                             '{}_{}hr-model.pt'.format(run_name, hr_time)))
def main():
    """Evaluate saved FPN checkpoints on the validation set.

    Normalises the module-level ``args``, derives the experiment name and
    cache directory, builds the anchors, the validation dataset and the
    network. For every iteration in ``args.eval_iters`` it loads
    ``model_<iteration>.pth``, runs ``validate`` (or ``validate_coco`` for
    COCO) and writes the per-class AP strings and the mean AP to stdout and to
    ``testing-<iteration>.log`` in the cache directory.
    """
    args.eval_iters = [int(val) for val in args.eval_iters.split(',')]
    # args.loss_reset_step = 10
    args.log_step = 10
    args.dataset = args.dataset.lower()
    args.basenet = args.basenet.lower()
    args.bn = abs(args.bn)  # 0 freeze or else use bn
    if args.bn > 0:
        args.bn = 1  # update bn layer set the flag to 1

    args.exp_name = 'FPN{:d}-{:s}sh{:02d}-{:s}-bs{:02d}-{:s}-lr{:05d}-bn{:d}'.format(
        args.input_dim, args.anchor_type, args.shared_heads, args.dataset,
        args.batch_size, args.basenet, int(args.lr * 100000), args.bn)

    args.save_root += args.dataset + '/'
    args.save_root = args.save_root + 'cache/' + args.exp_name + '/'

    if not os.path.isdir(
            args.save_root):  # if save directory doesn't exist create it
        os.makedirs(args.save_root)

    source_dir = args.save_root + '/source/'  # where to save the source
    utils.copy_source(source_dir)

    # Anchors are constants, so build them without autograd tracking.
    # NOTE(review): the original extent of this no_grad block is ambiguous in
    # the flattened source; it is scoped to anchor construction here.
    with torch.no_grad():
        if args.anchor_type == 'kmeans':
            anchorbox = kanchorBoxes(input_dim=args.input_dim,
                                     dataset=args.dataset)
        else:
            anchorbox = anchorBox(args.anchor_type,
                                  input_dim=args.input_dim,
                                  dataset=args.dataset)
        anchors = anchorbox.forward()
        args.ar = anchorbox.ar
        args.num_anchors = anchors.size(0)
        anchors = anchors.cuda(0, non_blocking=True)

    if args.dataset == 'coco':
        args.train_sets = ['train2017']
        args.val_sets = ['val2017']
    else:
        args.train_sets = ['train2007', 'val2007', 'train2012', 'val2012']
        args.val_sets = ['test2007']

    args.means = [0.485, 0.456, 0.406]
    args.stds = [0.229, 0.224, 0.225]
    val_dataset = Detection(args,
                            train=False,
                            image_sets=args.val_sets,
                            transform=BaseTransform(args.input_dim,
                                                    args.means, args.stds),
                            full_test=False)
    print('Done Loading Dataset Validation Dataset :::>>>\n',
          val_dataset.print_str)

    args.data_dir = val_dataset.root
    args.num_classes = len(val_dataset.classes) + 1
    args.classes = val_dataset.classes
    args.bias_heads = args.bias_heads > 0
    args.head_size = 256

    build_net = (build_fpn_shared_heads
                 if args.shared_heads > 0 else build_fpn_unshared)
    net = build_net(args.basenet,
                    args.model_dir,
                    ar=args.ar,
                    head_size=args.head_size,
                    num_classes=args.num_classes,
                    bias_heads=args.bias_heads)
    net = net.cuda()

    if args.ngpu > 1:
        print('\nLets do dataparallel\n')
        net = torch.nn.DataParallel(net)
    net.eval()

    # The loader does not depend on the checkpoint, so build it once.
    val_data_loader = data_utils.DataLoader(val_dataset,
                                            int(args.batch_size / 2),
                                            num_workers=args.num_workers,
                                            shuffle=False,
                                            pin_memory=True,
                                            collate_fn=custum_collate)
    # Pick the dataset-specific evaluation routine once.
    evaluate = validate_coco if args.dataset == 'coco' else validate

    for iteration in args.eval_iters:
        args.det_itr = iteration
        log_path = "{:s}/testing-{:d}.log".format(args.save_root, iteration)
        # Line-buffered log; the context manager closes it even when loading
        # or evaluation raises (the original leaked the handle in that case).
        with open(log_path, "w", 1) as log_file:
            log_file.write(args.exp_name + '\n')

            args.model_path = (args.save_root + '/model_' + repr(iteration) +
                               '.pth')
            log_file.write(args.model_path + '\n')

            net.load_state_dict(torch.load(args.model_path))
            print('Finished loading model %d !' % iteration)

            # evaluation
            torch.cuda.synchronize()
            tt0 = time.perf_counter()
            log_file.write('Testing net \n')
            net.eval()  # switch net to evaluation mode
            mAP, ap_all, ap_strs, det_boxes = evaluate(
                args,
                net,
                anchors,
                val_data_loader,
                val_dataset,
                iteration,
                iou_thresh=args.iou_thresh)

            for ap_str in ap_strs:
                print(ap_str)
                log_file.write(ap_str + '\n')
            ptr_str = '\nMEANAP:::=>' + str(mAP) + '\n'
            print(ptr_str)
            log_file.write(ptr_str)

            torch.cuda.synchronize()
            print('Complete set time {:0.2f}'.format(time.perf_counter() -
                                                     tt0))
np.savetxt(logdir + "test_set.txt", sorted([fi for fi in test_set]), fmt="%s") # Ensure that test set file list is complete chs = ("g", "r", "i") test_set = [ (fi, ch) for fi, ch in zip(np.repeat(test_set, len(chs)), itertools.cycle(chs)) if isfile("./data/gals/{}-{}.fits".format(fi, ch)) and isfile("./data/sbs_gri_noise/{}_{}.txt".format(fi, ch)) ] # Init and load Pix2Prof encoder = ResNet18(num_classes=args.encoding_len).to(cuda) decoder = GRUNet(input_dim=1, hidden_dim=args.encoding_len, output_dim=1, n_layers=3).to(cuda) criterion = nn.MSELoss() encoder_op = optim.Adam(encoder.parameters(), lr=0.0002) decoder_op = optim.Adam(decoder.parameters(), lr=0.0002) encoder.load_state_dict(checkpoint["encoder"]) decoder.load_state_dict(checkpoint["decoder"]) decoder_op.load_state_dict(checkpoint["decoder_op"]) encoder_op.load_state_dict(checkpoint["encoder_op"]) # Validate validate(test_set, encoder, decoder, chk_epoch, criterion, logdir=logdir) plot_validation_set("{}/{:04d}".format(logdir, chk_epoch))
def main(args):
    """Train and validate the EfficientNet baseline (no SSL), checkpointing each epoch.

    Builds the labeled/validation loaders, the model, an Adam optimizer and a
    cross-entropy criterion, then runs ``args.epochs`` epochs of
    ``train_no_ssl`` followed by ``validate``. Per-epoch statistics go to both
    the text ``Logger`` and the ``SummaryWriter``; a checkpoint is saved after
    every epoch and flagged as best when validation accuracy improves.

    Args:
        args: run configuration. Attributes read here: ``efficient_version``,
            ``lr``, ``out``, ``resume``, ``transfer_learning``, ``unfreeze``,
            ``epochs``, ``batch_size``, ``device``.
    """
    # create labeled, validation, and test data loader
    # unlabeled data loader not needed for baseline training
    train_loader, val_loader, args = get_data_loaders_no_ssl(args)

    # create models
    model = create_model(args,
                         model='efficient',
                         efficient_version=args.efficient_version)

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()

    start_epoch, best_acc = 0, 0

    # stats and logger
    logger = Logger(os.path.join(args.out, 'log.txt'))
    writer = None
    try:
        logger.set_names(['Train Loss', 'Valid Loss',
                          'Valid Acc.', 'Train Acc.'])

        # load from checkpoints
        if args.resume:
            print('==> Resuming from checkpoint.')
            # NOTE(review): the return value of load_checkpoint is discarded,
            # so start_epoch and best_acc keep their initial values (0) after
            # a resume -- confirm whether they should be restored here.
            load_checkpoint(args, model, optimizer, ema_model=None)
            if args.transfer_learning and start_epoch > args.unfreeze:
                print('Unfreezing layers of model.')
                model = unfreeze_layer(model)

        # tensorboard writer
        writer = SummaryWriter(args.out)

        # train and val
        for epoch in range(start_epoch, args.epochs):
            # transfer learning approach for the efficientNet model
            # First run only the last layers while keeping pre-trained frozen
            # after args.unfreeze epochs, fine-tune the whole network
            if args.transfer_learning and epoch == args.unfreeze:
                model = unfreeze_layer(model)

            print(f'\nEpoch: [{epoch+1} | {args.epochs}] LR: {args.lr}')
            train_loss, train_acc = train_no_ssl(model=model,
                                                 optimizer=optimizer,
                                                 criterion=criterion,
                                                 train_loader=train_loader,
                                                 args=args)

            # get validation loss and accuracy
            val_loss, val_acc = validate(val_loader,
                                         model=model,
                                         criterion=criterion,
                                         epoch=epoch,
                                         mode='Validating',
                                         device=args.device)

            # x-axis for tensorboard: batch_size * batches per epoch * epochs done
            step = args.batch_size * len(train_loader) * (epoch + 1)

            # logging stats
            writer.add_scalar('losses/train_loss', train_loss, step)
            writer.add_scalar('losses/valid_loss', val_loss, step)
            writer.add_scalar('accuracy/train_acc', train_acc, step)
            writer.add_scalar('accuracy/val_acc', val_acc, step)

            # append logger file
            logger.append([train_loss, val_loss, val_acc, train_acc])

            # save model
            is_best = val_acc > best_acc
            best_acc = max(val_acc, best_acc)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'acc': val_acc,
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                checkpoint=args.out)
    finally:
        # Release the log file and the event writer even when training fails.
        logger.close()
        if writer is not None:
            writer.close()

    print('Best acc:')
    print(best_acc)
def _run_sl(opts):
    """Run supervised-learning training (or evaluation only) for the 'tspsl' problem.

    Seeds torch, optionally sets up tensorboard, creates ``opts.save_dir`` and
    dumps the options there, builds the model/optimizer/scheduler, optionally
    restores state from ``opts.load_path`` or ``opts.resume``, and then either
    validates once (``opts.eval_only``) or calls ``train_epoch_sl`` for
    ``opts.n_epochs`` epochs starting at ``opts.epoch_start``.

    Args:
        opts: parsed run options. Mutated in place: ``device``, ``epoch_size``,
            ``val_size`` and (when resuming) ``epoch_start`` are set here.

    Raises:
        AssertionError: if the problem is not 'tspsl', if both ``load_path``
            and ``resume`` are given, or if ``opts.model`` / ``opts.encoder``
            is not a known name.
        OSError: from ``os.makedirs`` if ``opts.save_dir`` already exists.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    # (done before opts.device is set, since vars(opts) is dumped as JSON)
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)
    assert opts.problem == 'tspsl', "Only TSP is supported for supervised learning"

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {'attention': AttentionModel}.get(opts.model, None)
    # Report the requested name: the looked-up class is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        opts.encoder)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        use_cuda=opts.use_cuda).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    # (keys missing from the loaded data keep their freshly initialised values)
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }])

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # Loaded state was read via torch_load_cpu; move tensors to the run device
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    train_dataset = problem.make_dataset(size=opts.graph_size,
                                         filename=opts.train_dataset)
    opts.epoch_size = train_dataset.size
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        # The epoch number is parsed from the checkpoint file name:
        # the text after the first '-' in the base name, without extension.
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch_sl(model, optimizer, lr_scheduler, epoch,
                           train_dataset, val_dataset, problem, tb_logger,
                           opts)
print("Dataset and model ready. Starting training ...") cur_var_not_best = 0 for epoch in range(start_epoch, epochs): plot_data['epoch'] = epoch # Train for one epoch plot_data = train.train(train_loader, model, criterion, optimizer, epoch, print_freq, plot_data, gpu, margin, train_iters, variance) # Evaluate on validation set plot_data = train.validate(val_loader, model, criterion, optimizer, epoch, print_freq, plot_data, gpu, margin, val_iters, variance) # Remember best model and save checkpoint is_best = plot_data['val_loss'][epoch] < best_loss if is_best: best_model = 1 else: best_model = 0 cur_var_not_best += 1 if is_best: print("New best model by loss. Val Loss = " + str(plot_data['val_loss'][epoch])) best_loss = plot_data['val_loss'][epoch] filename = dataset + '/models/' + training_id + '_epoch_' + str(
def run(opts):
    """Train (or only evaluate) a routing model with a REINFORCE baseline.

    Seeds torch, optionally sets up tensorboard, creates ``opts.save_dir`` and
    dumps the options there, builds the model, the baseline selected by
    ``opts.baseline``, the optimizer and the scheduler, optionally restores
    state from ``opts.load_path`` or ``opts.resume``, and then either
    validates once (``opts.eval_only``) or calls ``train_epoch`` for
    ``opts.n_epochs`` epochs. After training, one summary line is appended to
    ``experiments.log`` in the current working directory.

    Args:
        opts: parsed run options. Mutated in place: ``device`` and (when
            resuming) ``epoch_start`` are set here.

    Raises:
        AssertionError: if both ``load_path`` and ``resume`` are given, if
            ``opts.model`` or ``opts.baseline`` is unknown, or if the chosen
            critic baseline does not match the problem.
        OSError: from ``os.makedirs`` if ``opts.save_dir`` already exists.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    # (done before opts.device is set, since vars(opts) is dumped as JSON)
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name: the looked-up class is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        steps=opts.awe_steps,
                        graph_size=opts.graph_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    # (keys missing from the loaded data keep their freshly initialised values)
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'constant':
        baseline = ConstantBaseline()
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim,
                 opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    elif opts.baseline == 'critic_lp':
        assert problem.NAME == 'lp'
        # Critic input size per awe_steps value; a step count outside 2..8
        # raises KeyError. (The values coincide with the Bell numbers B_n --
        # presumably intentional, confirm with the problem definition.)
        dim_vocab = {2: 2, 3: 5, 4: 15, 5: 52, 6: 203, 7: 877, 8: 4140}
        baseline = CriticBaseline(
            (CriticNetworkLP(dim_vocab[opts.awe_steps], opts.embedding_dim,
                             opts.hidden_dim, opts.n_encode_layers,
                             opts.normalization)).to(opts.device))
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer (second param group only if the baseline is learnable)
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # Loaded state was read via torch_load_cpu; move tensors to the run device
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution,
                                       size=opts.graph_size,
                                       degree=opts.degree,
                                       steps=opts.awe_steps,
                                       awe_samples=opts.awe_samples)

    if opts.resume:
        # The epoch number is parsed from the checkpoint file name:
        # the text after the first '-' in the base name, without extension.
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        # Mutable record passed to every train_epoch call and read back below.
        extra = {'updates': 0, 'avg_reward': 10**8, "best_epoch": -1}
        start = time.time()
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts, extra)
        finish = time.time()

        # One summary line per run: dataset tag, avg reward, best epoch, seconds
        with open("experiments.log", "a+") as f:
            f.write("{} {:.4f} {} {:.2f}\n".format(
                '-'.join(opts.train_dataset.split('/')[-2:]),
                extra["avg_reward"], extra["best_epoch"], finish - start))
        print("Took {:.2f} sec for {} epochs".format(finish - start,
                                                     opts.n_epochs))
def main(args, init_distributed=False):
    """Set up task, model and trainer, then train with an auxiliary data stream.

    The model is either restored from ``args.restore_file`` or built from
    scratch. A second batch iterator over ``task.dataset(args.train_subset,
    idx=1)`` supplies the auxiliary data passed to ``train``. When
    ``args.inverse_fisher`` is set or ``args.ewc > 0``, a diagonal Fisher
    estimate is computed first and stored on the trainer. Training then loops
    until the learning rate drops to ``args.min_lr`` or the epoch/update
    limits are reached.

    Args:
        args: parsed command-line namespace; may be modified in place when a
            model is restored (architecture-related attributes are copied
            from the checkpoint's arguments).
        init_distributed: must be falsy; distributed training is rejected.

    Raises:
        AssertionError: if neither ``max_tokens`` nor ``max_sentences`` is set.
        ValueError: if ``init_distributed`` is truthy.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        raise ValueError("Distibuted training not supported by multiobj "
                         "training")

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest
    # checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    if args.restore_file is not None:
        # Load from checkpoint
        print('| loading model from {}'.format(args.restore_file))
        # NOTE(review): eval() executes args.model_overrides as Python code;
        # this is only safe while that value comes from a trusted source.
        [model], _model_args = checkpoint_utils.load_model_ensemble(
            [args.restore_file],
            arg_overrides=eval(args.model_overrides),
            task=task,
        )
        # Overwrite architecture arguments
        # (this is very hacky but I don't know a better way)
        # Only "arch" and attributes prefixed encoder_/decoder_/share_/adaptive_
        # are copied, and only when args already has an attribute of that name.
        for k, v in _model_args.__dict__.items():
            is_model_argument = k == "arch"
            is_model_argument |= k.startswith("encoder_")
            is_model_argument |= k.startswith("decoder_")
            is_model_argument |= k.startswith("share_")
            is_model_argument |= k.startswith("adaptive_")
            if hasattr(args, k) and is_model_argument:
                setattr(args, k, v)
    else:
        # Or build model from scratch
        model = task.build_model(args)
    # Training criterion
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Load auxiliary data
    epoch_aux_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset, idx=1),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            trainer.model.max_positions(),
        ),
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
        epoch=0,
    )

    # Estimate fisher if needed
    if args.inverse_fisher or args.ewc > 0:
        # Same dataset as the auxiliary iterator, but one sentence per batch
        fisher_itr = task.get_batch_iterator(
            dataset=task.dataset(args.train_subset, idx=1),
            max_tokens=args.max_tokens,
            max_sentences=1,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.model.max_positions(),
            ),
            ignore_invalid_inputs=True,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
            epoch=0,
        )
        fim = estimate_diagonal_fisher(args,
                                       trainer,
                                       fisher_itr,
                                       args.n_fisher_samples,
                                       precomputed=args.precomputed_fisher)
        trainer.fim = fim
    # EWC
    if args.ewc > 0.0:
        trainer.prepare_ewc(args.ewc)

    # Train until the learning rate gets too small
    # (a falsy max_epoch / max_update means "no limit")
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr, epoch_aux_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        # (None is passed in the validation-loss position even when a
        # validation loss is available)
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, None)

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def train(args, trainer, task, epoch_itr, epoch_aux_itr, fim=None):
    """Run one training epoch, pairing every primary step with an auxiliary one.

    For each group of primary batches, one group of auxiliary batches is run
    through ``trainer.train_step`` with ``update_params=False``; the optimizer
    is then asked to stash those gradients (``save_auxiliary``) before the
    primary update is applied. Mid-epoch validation and checkpointing happen
    every ``args.save_interval_updates`` updates.

    Args:
        args: parsed command-line namespace.
        trainer: object driving the optimisation steps and meters.
        task: task object, forwarded to ``validate``.
        epoch_itr: epoch iterator over the primary training data.
        epoch_aux_itr: epoch iterator over the auxiliary data.
        fim: accepted but not used inside this function.
    """
    # Update parameters every N batches; epochs beyond the end of the
    # schedule reuse its last entry.
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]
    print(update_freq)

    # Primary data iterator, grouped for gradient accumulation
    primary_itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.epoch >= args.curriculum),
    )
    primary_itr = iterators.GroupedIterator(primary_itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        primary_itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )

    # Auxiliary iterator (bottomless: it keeps yielding for the whole epoch)
    aux_itr = iterators.GroupedIterator(
        epoch_aux_itr.next_epoch_itr(
            fix_batches_to_gpus=args.fix_batches_to_gpus),
        update_freq,
        bottomless=True,
    )

    extra_meters = collections.defaultdict(AverageMeter)
    valid_subsets = args.valid_subset.split(',')
    max_update = args.max_update or math.inf
    # Keys the trainer's own stats already report
    already_logged = ('loss', 'nll_loss', 'ntokens', 'nsentences',
                      'sample_size')

    for i, samples in enumerate(progress,
                                start=epoch_itr.iterations_in_epoch):
        # Record gradients from auxiliary data without updating parameters
        trainer.train_step(next(aux_itr), update_params=False)

        # Fisher
        if not hasattr(trainer.optimizer, "save_auxiliary"):
            print("Warning, the optimizer is ignoring the auxiliary gradients")
        else:
            trainer.optimizer.save_auxiliary()

        # Take a step on the primary task
        log_output = trainer.train_step(samples, apply_ewc=args.ewc > 0)
        if log_output is None:
            continue

        # log mid-epoch stats
        stats = get_training_stats(trainer)
        for key, value in log_output.items():
            if key in already_logged:
                continue
            meter = extra_meters[key]
            if 'loss' in key:
                # loss-like entries are weighted by the sample size
                meter.update(value, log_output['sample_size'])
            else:
                meter.update(value)
            stats[key] = meter.avg
        progress.log(stats, tag='train', step=stats['num_updates'])

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_meter('wps').reset()

        num_updates = trainer.get_num_updates()
        if not args.disable_validation and args.save_interval_updates > 0:
            if num_updates > 0 and num_updates % args.save_interval_updates == 0:
                # validate() is called for its side effects; its result is
                # not forwarded to the checkpoint call.
                validate(args, trainer, task, epoch_itr, valid_subsets)
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                                 None)

        if num_updates >= max_update:
            break

    # log end-of-epoch stats
    stats = get_training_stats(trainer)
    for key, meter in extra_meters.items():
        stats[key] = meter.avg
    progress.print(stats, tag='train', step=stats['num_updates'])

    # reset training meters
    for meter_name in (
            'train_loss',
            'train_nll_loss',
            'wps',
            'ups',
            'wpb',
            'bsz',
            'gnorm',
            'clip',
    ):
        meter = trainer.get_meter(meter_name)
        if meter is None:
            continue
        meter.reset()
dropout=0.3, lr=args.learning_rate, activation_fn=nn.LeakyReLU(0.2)).to(device) print("========== Encoder ==========\n{}".format(enc)) print("========== Decoder ==========\n{}".format(dec)) print("========== Discriminator ==========\n{}".format(disc)) for epoch in range(1, args.num_epochs + 1): print("========== Start epoch {} at {} ==========".format( epoch, datetime.now().strftime("%H:%M:%S"))) train(epoch, enc, dec, disc, prior_size, train_dl, TEXT.vocab, device) validate(epoch, enc, dec, disc, prior_size, valid_dl, TEXT.vocab, device) print_decoded(enc, dec, gen_dl, vocab=TEXT.vocab, device=device) print_sample(dec, sample_size=prior_size, max_seq_len=41, vocab=TEXT.vocab, style_vocab=LABEL.vocab, device=device) torch.save(enc.state_dict(), 'rcaae.enc.pt') torch.save(dec.state_dict(), 'rcaae.dec.pt') torch.save(disc.state_dict(), 'rcaae.disc.pt')
batch_size=32, shuffle=False, num_workers=4) model.to(device) optimizer = optim.Adam(model.parameters(), lr=1e-3) criterion = nn.CrossEntropyLoss().to(device) if opt.train: epoch_num = opt.num_epochs best_val_acc = 0 total_loss_val, total_acc_val = [], [] for epoch in tqdm(range(1, epoch_num + 1)): loss_train, acc_train, total_loss_train, total_acc_train = train( train_loader, model, criterion, optimizer, epoch, device) loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch, device) total_loss_val.append(loss_val) total_acc_val.append(acc_val) if acc_val > best_val_acc: best_val_acc = acc_val print('*****************************************************') print('best record: [epoch %d], [val loss %.5f], [val acc %.5f]' % (epoch, loss_val, acc_val)) print('*****************************************************') fig = plt.figure(num=2) fig1 = fig.add_subplot(2, 1, 1) fig2 = fig.add_subplot(2, 1, 2) fig1.plot(total_loss_train, label='training loss') fig1.plot(total_acc_train, label='training accuracy') fig2.plot(total_loss_val, label='validation loss')
def main(args):
    """Set up task, model and trainer, then train with an auxiliary task stream.

    Requires CUDA. Two batch iterators are built over ``args.train_subset``:
    the primary one and one over ``task.dataset(args.train_subset, idx=1)``
    for the auxiliary task. Training loops until the learning rate drops to
    ``args.min_lr`` or the epoch/update limits are reached.

    Args:
        args: parsed command-line namespace; ``max_tokens`` is defaulted to
            6000 in place when it is None.

    Raises:
        NotImplementedError: if CUDA is not available.
    """
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {},'.format(args.arch,
                                             criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) as a
    # placeholder DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))
    print('| Optimizer {}'.format(trainer.optimizer.__class__.__name__))

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Dataloader for the auxiliary task
    # (same settings as above; only the dataset differs: idx=1)
    epoch_aux_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset, idx=1),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    # (otherwise run one step on the dummy batch)
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    # (a falsy max_epoch / max_update means "no limit")
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr, epoch_aux_itr)

        # NOTE(review): on epochs that skip validation, valid_losses keeps the
        # value from the last validated epoch (or [None] before the first),
        # so lr_step and save_checkpoint below see a stale loss -- confirm
        # this is intended.
        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
if not os.path.exists(tensorboard_dir): os.makedirs(tensorboard_dir) configure(tensorboard_dir) best_valid_acc, patience_counter = 0, 0 for epoch in range(0, hp['EPOCHS']): print('\nEpoch: {}/{} - LR: {:.6f}'.format(epoch + 1, hp['EPOCHS'], hp['LEARNING_RATE'])) # train for 1 epoch train_loss, train_acc = train_one_epoch(model, optimizer, train_loader, epoch, hp) # evaluate on validation set valid_loss, valid_acc = validate(model, valid_loader, epoch, hp) # # reduce lr if validation loss plateaus # self.scheduler.step(valid_loss) is_best = valid_acc > best_valid_acc msg1 = "train loss: {:.3f} - train acc: {:.3f} " msg2 = "- val loss: {:.3f} - val acc: {:.3f}" if is_best: patience_counter = 0 msg2 += " [*]" msg = msg1 + msg2 print(msg.format(train_loss, train_acc, valid_loss, valid_acc)) # check for improvement if not is_best:
nodes in its output layer. For example, a 2-4-3 network has 2 input nodes, one hidden layer with 4 nodes, and 3 output nodes.""" train1 = 'examples/train1.txt' train2 = 'examples/train2.txt' train3 = 'examples/train3.txt' validate1 = 'examples/validation1.txt' validate2 = 'examples/validation2.txt' validate3 = 'examples/validation3.txt' set3Labels = ['upper_left', 'upper_right', 'lower_left', 'lower_right'] p1 = Perceptron(2) p2 = Perceptron(2) print("\nPerceptrons with constant learning rates, datasets 1 and 2:") train(p1, train1, constantLearningRate(1)) validate(p1, validate1) train(p2, train2, constantLearningRate(1)) validate(p2, validate2) p1 = Perceptron(2) p2 = Perceptron(2) print("\nPerceptrons with inverse time learning rates, datasets 1 and 2:") train(p1, train1, inverseTimeLearningRate(1)) validate(p1, validate1) train(p2, train2, inverseTimeLearningRate(1)) validate(p2, validate2) p1 = Perceptron(2, bias=True) p2 = Perceptron(2, bias=True) print("\nPerceptrons with exponential learning rates and bias, " + "datasets 1 and 2:")
if 'baseline' in load_data: baseline.load_state_dict(load_data['baseline']) # Initialize optimizer optimizer = optim.Adam([{ 'params': model.parameters(), 'lr': float(opts.lr_model) }] + ([{ 'params': baseline.get_learnable_parameters(), 'lr': float(opts.lr_critic) }] if len(baseline.get_learnable_parameters()) > 0 else [])) # Load optimizer state if 'optimizer' in load_data: optimizer.load_state_dict(load_data['optimizer']) # Initialize learning rate scheduler, decay by lr_decay once per epoch! lr_scheduler = optim.lr_scheduler.LambdaLR( optimizer, lambda epoch: opts.lr_decay**epoch) # Start the actual training loop val_dataset = problem.make_dataset(size=opts.graph_size, num_samples=opts.val_size) if opts.eval_only: validate(model, val_dataset, opts) else: for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs): train_epoch(model, optimizer, baseline, lr_scheduler, epoch, val_dataset, problem, opts)
pin_memory=True, num_workers=8) test_loader = torch.utils.data.DataLoader(test_dataset_full, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=8) if args.PRETRAIN: print('pretraining...') net = VGG_small().cuda() loss_func = torch.nn.CrossEntropyLoss().cuda() optimizer = torch.optim.Adam(net.parameters(), lr=5e-2) get_accuracy(net, train_loader, loss_func) val_accuracy = validate(net, val_loader, loss_func) best_acc = val_accuracy[0] test(net, test_loader, loss_func) save_model_ori(args.model_ori, net, optimizer) for epoch in range(100): if epoch % 30 == 0: optimizer.param_groups[0]['lr'] *= 0.2 train_fullprecision(net, train_loader, loss_func, optimizer, epoch) val_accuracy = validate(net, val_loader, loss_func) if val_accuracy[0] > best_acc: best_acc = val_accuracy[0] test(net, test_loader, loss_func) save_model_ori(args.model_ori, net, optimizer) if args.ALQ:
def _run_rl(opts):
    """Train (or only evaluate) a routing model with REINFORCE and a baseline.

    Seeds torch, optionally sets up tensorboard, creates ``opts.save_dir`` and
    dumps the options there, builds the model with the selected encoder, the
    baseline selected by ``opts.baseline``, the optimizer and the scheduler,
    optionally restores state from ``opts.load_path`` or ``opts.resume``, and
    then either validates once (``opts.eval_only``) or calls ``train_epoch``
    for ``opts.n_epochs`` epochs starting at ``opts.epoch_start``.

    Args:
        opts: parsed run options. Mutated in place: ``device``, ``val_size``
            and (when resuming) ``epoch_start`` are set here.

    Raises:
        AssertionError: if both ``load_path`` and ``resume`` are given, if
            ``opts.model``, ``opts.encoder`` or ``opts.baseline`` is unknown,
            or if a critic baseline is requested for a non-TSP problem.
        OSError: from ``os.makedirs`` if ``opts.save_dir`` already exists.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    # (done before opts.device is set, since vars(opts) is dumped as JSON)
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name: the looked-up class is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        opts.encoder)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    # (keys missing from the loaded data keep their freshly initialised values)
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 encoder_class, 2, opts.embedding_dim, opts.hidden_dim,
                 opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer (second param group only if the baseline is learnable)
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # Loaded state was read via torch_load_cpu; move tensors to the run device
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        # The epoch number is parsed from the checkpoint file name:
        # the text after the first '-' in the base name, without extension.
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts)
def main():
    """Run evaluation or training, depending on ``args.test``.

    Common setup: create the logger under ``args.save_path``, build the model
    with ``getModel(args)``, an SGD optimizer, and two validation loaders
    (source and target, batch size 1, unshuffled).

    Test mode (``args.test`` truthy): open one ``<save_path>/<split>.txt``
    file per entry of ``splits``, run ``test`` on the source and target
    validation loaders, then return.

    Training mode: build the fused training set, optionally initialise the
    latent ``M`` (only when ``args.shapeWeight > ref.eps``), then for each
    epoch in ``1..args.epochs`` adjust the learning rate, train, validate on
    both domains, reshuffle the target dataset, optionally update ``M`` every
    ``args.intervalUpdateM`` epochs, log the metrics and save a checkpoint
    every 10 epochs.

    The logger is closed on every exit path, and the per-split output files
    are closed once testing finishes (or fails).
    """
    now = datetime.datetime.now()
    logger = Logger(args.save_path + '/logs_{}'.format(now.isoformat()))
    try:
        model = getModel(args)
        cudnn.benchmark = True
        optimizer = torch.optim.SGD(model.parameters(), args.LR,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

        def make_loader(dataset, batch_size, shuffle, num_workers):
            # All loaders in this script share pin_memory and collate_fn.
            return torch.utils.data.DataLoader(
                dataset, batch_size=batch_size, shuffle=shuffle,
                num_workers=num_workers, pin_memory=True,
                collate_fn=collate_fn_cat)

        valSource_dataset = SourceDataset('test', ref.nValViews)
        valTarget_dataset = TargetDataset('test', ref.nValViews)
        valSource_loader = make_loader(valSource_dataset, 1, False, 1)
        valTarget_loader = make_loader(valTarget_dataset, 1, False, 1)

        if args.test:
            outputs = {}
            try:
                for split in splits:
                    outputs['{}'.format(split)] = open(
                        '{}/{}.txt'.format(args.save_path, split), 'w')
                test(args, valSource_loader, model, None,
                     outputs['valSource'], 'valSource')
                test(args, valTarget_loader, model, None,
                     outputs['valTarget'], 'valTarget')
            finally:
                # Close every file that was opened, even if a later open()
                # or one of the test() calls raised.
                for handle in outputs.values():
                    handle.close()
            return

        train_dataset = Fusion(SourceDataset, TargetDataset,
                               nViews=args.nViews,
                               targetRatio=args.targetRatio,
                               totalTargetIm=args.totalTargetIm)
        trainTarget_dataset = train_dataset.targetDataset
        # args.test is known to be falsy past the early return above, so the
        # training loader always shuffles and both loaders use args.workers.
        train_loader = make_loader(train_dataset, args.batchSize, True,
                                   args.workers)
        trainTarget_loader = make_loader(trainTarget_dataset, args.batchSize,
                                         False, args.workers)

        use_shape = args.shapeWeight > ref.eps
        M = None
        Y = None
        if use_shape:
            # Single-argument print(...) prints the same text on Python 2
            # (parenthesised expression) and Python 3 (function call).
            print('getY...')
            Y = getY(train_dataset.sourceDataset)
            M = initLatent(trainTarget_loader, model, Y, nViews=args.nViews,
                           S=args.sampleSource, AVG=args.AVG)

        print('Start training...')
        for epoch in range(1, args.epochs + 1):
            adjust_learning_rate(optimizer, epoch, args.dropLR)
            train_mpjpe, train_loss, train_unSuploss = train(
                args, train_loader, model, optimizer, M, epoch)
            valSource_mpjpe, valSource_loss, valSource_unSuploss = validate(
                args, 'Source', valSource_loader, model, None, epoch)
            valTarget_mpjpe, valTarget_loss, valTarget_unSuploss = validate(
                args, 'Target', valTarget_loader, model, None, epoch)

            train_loader.dataset.targetDataset.shuffle()
            if use_shape and epoch % args.intervalUpdateM == 0:
                M = stepLatent(trainTarget_loader, model, M, Y,
                               nViews=args.nViews, lamb=args.lamb,
                               mu=args.mu, S=args.sampleSource)

            logger.write('{} {} {}\n'.format(
                train_mpjpe, valSource_mpjpe, valTarget_mpjpe))
            # NOTE: the tag 'valTarget_loss' was previously misspelled
            # 'valTatget_loss'; it now matches the other valTarget_* tags.
            scalars = (
                ('train_mpjpe', train_mpjpe),
                ('valSource_mpjpe', valSource_mpjpe),
                ('valTarget_mpjpe', valTarget_mpjpe),
                ('train_loss', train_loss),
                ('valSource_loss', valSource_loss),
                ('valTarget_loss', valTarget_loss),
                ('train_unSuploss', train_unSuploss),
                ('valSource_unSuploss', valSource_unSuploss),
                ('valTarget_unSuploss', valTarget_unSuploss),
            )
            for tag, value in scalars:
                logger.scalar_summary(tag, value, epoch)

            if epoch % 10 == 0:
                # The stored 'epoch' is one past the epoch just completed.
                torch.save({
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, args.save_path + '/checkpoint_{}.pth.tar'.format(epoch))
    finally:
        logger.close()