def _submit_job(self, exp_dict, command, reset, submit_dict=None):
    """Submit one job, or skip it when a usable job already exists.

    The decision is driven by the job file located under the experiment's
    save directory, checked in this order:

    * no job file yet: launch a new job;
    * ``reset`` is truthy: kill the recorded job, delete/back up the
      experiment and launch a new job;
    * the recorded job is alive or ``SUCCEEDED``: do nothing;
    * the recorded job is ``FAILED`` or ``CANCELLED``: relaunch it.

    Args:
        exp_dict: Experiment configuration; its hash names the save directory.
        command: Command forwarded to ``self.launch_job``.
        reset: When truthy and a job file exists, restart the experiment
            from scratch.
        submit_dict: Optional dict updated in place with
            ``{job_id: message}``. A fresh dict is created when omitted, so
            results no longer accumulate in a default shared across calls.

    Returns:
        The dict that received the outcome (``submit_dict`` when provided).

    Raises:
        ValueError: If the recorded job is in a state not handled above.
    """
    if submit_dict is None:
        submit_dict = {}

    # NOTE(review): add_job_utils is defined elsewhere; presumably it makes
    # haven_jobs_utils importable -- confirm.
    add_job_utils()
    import haven_jobs_utils as hju

    # Define paths
    savedir = os.path.join(self.savedir_base, hu.hash_dict(exp_dict))
    fname = hju.get_job_fname(savedir)

    if not os.path.exists(fname):
        # No job was ever recorded for this experiment
        job_dict = self.launch_job(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Launching"

    elif reset:
        # A job is recorded but the caller wants a clean restart
        job_id = hu.load_json(fname).get("job_id")
        hju.kill_job(self.api, job_id)
        hc.delete_and_backup_experiment(savedir)

        job_dict = self.launch_job(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Resetting"

    else:
        job_id = hu.load_json(fname).get("job_id")
        job = hju.get_job(self.api, job_id)

        if job.alive or job.state == 'SUCCEEDED':
            # If the job is alive (or already done), do nothing
            message = 'IGNORED: Job %s' % job.state

        elif job.state in ["FAILED", "CANCELLED"]:
            message = "SUBMITTED: Retrying %s Job" % job.state
            job_dict = self.launch_job(exp_dict, savedir, command, job=job)
            job_id = job_dict['job_id']

        else:
            # This shouldn't happen; report the state to ease debugging
            raise ValueError('Unexpected state %r for job %s' %
                             (job.state, job_id))

    submit_dict[job_id] = message
    return submit_dict
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    """Run one BCD experiment, reusing the cached score list when present.

    The experiment directory is ``savedir_base/<hash of exp_dict>``. If
    ``score_list.pkl`` already exists there it is loaded and returned;
    otherwise ``train`` is called and its result is pickled to that file.

    Args:
        exp_dict: Experiment configuration. Reads ``dataset.name``,
            ``dataset.loss``, ``block_size``, ``partition``, ``selection``,
            ``update``, ``max_iters`` and the optional ``l1`` (default 0).
        savedir_base: Directory under which the experiment folder lives.
        datadir: Passed to ``train`` as ``datasets_path``.
        reset: When truthy, delete/back up the experiment folder first.
        num_workers: Accepted for signature compatibility; not used here.

    Returns:
        The score list, or ``None`` when ``ut.is_valid_exp`` rejects the
        configuration.
    """
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)

    # Fixed: the original tested the truthiness of the joined path string
    # (always true), so exp_dict.json was never written.
    exp_dict_fname = os.path.join(savedir, "exp_dict.json")
    if not os.path.exists(exp_dict_fname):
        hu.save_json(exp_dict_fname, exp_dict)
    print("Experiment saved in %s" % savedir)

    # BCD train
    # ==================
    # Ignore the combinations flagged as invalid
    if not ut.is_valid_exp(exp_dict):
        return

    score_list_fname = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_fname):
        # Results already computed for this configuration
        score_list = hu.load_pkl(score_list_fname)
    else:
        score_list = train(dataset_name=exp_dict['dataset']['name'],
                           loss_name=exp_dict['dataset']['loss'],
                           block_size=exp_dict['block_size'],
                           partition_rule=exp_dict['partition'],
                           selection_rule=exp_dict['selection'],
                           update_rule=exp_dict['update'],
                           n_iters=exp_dict['max_iters'],
                           L1=exp_dict.get('l1', 0),
                           L2=0,
                           datasets_path=datadir)
        hu.save_pkl(score_list_fname, score_list)

    print('Experiment completed.')
    return score_list
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    """Train, validate and finally test a model on the HE dataset.

    The experiment directory is ``savedir_base/<hash of exp_dict>``. After
    every epoch the function writes ``model.pth`` and ``score_list.pkl``
    (used to resume), and refreshes ``model_best.pth`` /
    ``score_list_best.pkl`` when the validation score improves. Once the
    epoch loop ends, the best model is reloaded, evaluated on the test
    loader, and the result is stored in ``test_iou.pkl``.

    Args:
        exp_dict: Experiment configuration. Reads ``n_classes``, ``obj``,
            ``patch_size``, ``bkg``, ``model``, ``batch_size`` and
            ``max_epoch``.
        savedir_base: Directory under which the experiment folder lives.
        datadir: Dataset root handed to the dataset classes.
        reset: When truthy, delete/back up the experiment folder first.
        num_workers: Worker count for every ``DataLoader``.

    Returns:
        None.
    """
    # bookkeeping stuff
    # ==================
    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    os.makedirs(savedir, exist_ok=True)

    if reset:
        hc.delete_and_backup_experiment(savedir)
        # NOTE(review): the backup helper presumably removes/moves the
        # folder; recreate it so later writes have a destination.
        os.makedirs(savedir, exist_ok=True)

    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    data_transform = A.Compose(
        [
            A.Flip(p=0.3),
            A.IAAAffine(p=0.3),
            A.Rotate(p=0.3),
            A.HueSaturationValue(hue_shift_limit=10,
                                 sat_shift_limit=15,
                                 val_shift_limit=10,
                                 p=0.3),
            A.GaussianBlur(3, p=0.3),
            A.GaussNoise(30, p=0.3),
        ],
        keypoint_params=A.KeypointParams(format='xy'),
        additional_targets={
            'mask0': 'mask',
            'mask1': 'mask',
            'mask2': 'mask',
            'keypoints0': 'keypoints',
            'keypoints1': 'keypoints',
            'keypoints2': 'keypoints',
            'keypoints3': 'keypoints',
            'keypoints4': 'keypoints',
            'keypoints5': 'keypoints',
        })

    # A new seed is drawn on every call, so runs are not reproducible
    random_seed = random.randint(0, 20201009)
    train_set = HEDataset_Fast(data_dir=datadir,
                               n_classes=exp_dict["n_classes"],
                               transform=data_transform,
                               option="Train",
                               random_seed=random_seed,
                               obj_option=exp_dict["obj"],
                               patch_size=exp_dict["patch_size"],
                               bkg_option=exp_dict["bkg"])

    test_transform = A.Compose([A.Resize(1024, 1024)],
                               keypoint_params=A.KeypointParams(format='xy'),
                               additional_targets={
                                   'mask0': 'mask',
                                   'mask1': 'mask'
                               })

    # val set
    val_set = HEDataset(data_dir=datadir,
                        transform=test_transform,
                        option="Validation")
    val_loader = DataLoader(val_set, batch_size=1, num_workers=num_workers)

    # test set
    test_set = HEDataset(data_dir=datadir,
                         transform=test_transform,
                         option="Test")
    test_loader = DataLoader(test_set, batch_size=1, num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    train_loader = DataLoader(train_set,
                              batch_size=exp_dict["batch_size"],
                              shuffle=True,
                              num_workers=num_workers)

    # Fixed: keep `e` defined when no epoch is left to run (e.g. resuming a
    # finished experiment); it then names the last epoch already completed.
    e = s_epoch - 1
    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(
                                           savedir, "images"),
                                       n_images=7)
        score_dict.update(val_dict)

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint: always on epoch 0, afterwards only when the
        # new score beats every earlier one (missing scores count as 0)
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    # Evaluate the best checkpoint on the test set
    model.load_state_dict(
        hu.torch_load(os.path.join(savedir, "model_best.pth")))
    test_dict = model.test_on_loader(test_loader)
    hu.save_pkl(os.path.join(savedir, 'test_iou.pkl'), test_dict)
    print('Test IoU:{}'.format(test_dict["test_iou"]))
    print('Experiment completed et epoch %d' % e)
def launch_or_ignore_exp_dict(self,
                              exp_dict,
                              command,
                              reset,
                              savedir,
                              submit_dict=None):
    """Launch a job for an experiment, or ignore it if one is usable.

    The decision is driven by the job file found for ``savedir``, checked in
    this order:

    * no job file yet: launch a new job;
    * ``reset`` is truthy: kill the recorded job, delete/back up the
      experiment and launch a new job;
    * recorded job state is ``SUCCEEDED``, ``RUNNING``, ``PENDING``,
      ``COMPLETED`` or ``COMPLETING``: do nothing;
    * recorded job state is ``FAILED``, ``CANCELLED``, ``INTERRUPTED`` or
      ``TIMEOUT``: relaunch it.

    Args:
        exp_dict: Experiment configuration.
        command: Command forwarded to ``self.launch_exp_dict``.
        reset: When truthy and a job file exists, restart the experiment
            from scratch.
        savedir: Experiment directory.
        submit_dict: Optional dict updated in place with
            ``{job_id: {'exp_dict': ..., 'message': ...}}``. A fresh dict is
            created when omitted, so results no longer accumulate in a
            default shared across calls.

    Returns:
        The dict that received the outcome (``submit_dict`` when provided).

    Raises:
        ValueError: If the recorded job is in a state not handled above.
    """
    if submit_dict is None:
        submit_dict = {}

    # Define paths
    fname = get_job_fname(savedir)

    if not os.path.exists(fname):
        # No job was ever recorded for this experiment
        job_dict = self.launch_exp_dict(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Launching"

    elif reset:
        # A job is recorded but the caller wants a clean restart
        job_id = hu.load_json(fname).get("job_id")
        self.kill_job(job_id)
        hc.delete_and_backup_experiment(savedir)

        job_dict = self.launch_exp_dict(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Resetting"

    else:
        job_id = hu.load_json(fname).get("job_id")
        job = self.get_job(job_id)

        if job['state'] in [
                'SUCCEEDED', 'RUNNING', 'PENDING', 'COMPLETED', 'COMPLETING'
        ]:
            # If the job is alive (or already done), do nothing
            message = 'IGNORED: Job %s' % job['state']

        elif job['state'] in [
                "FAILED", "CANCELLED", "INTERRUPTED", "TIMEOUT"
        ]:
            message = "SUBMITTED: Retrying %s Job" % job['state']
            job_dict = self.launch_exp_dict(exp_dict,
                                            savedir,
                                            command,
                                            job=job)
            job_id = job_dict['job_id']

        else:
            # This shouldn't happen; report the state to ease debugging
            raise ValueError('Unexpected state %r for job %s' %
                             (job['state'], job_id))

    submit_dict[job_id] = {'exp_dict': exp_dict, 'message': message}
    return submit_dict
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    """Train with early stopping, evaluating on val and test splits.

    The experiment directory is ``savedir_base/<hash of exp_dict>``. Each
    epoch trains on a with-replacement sample of the train set, validates,
    and writes ``model.pth`` / ``score_list.pkl`` (used to resume). When the
    validation score is at least the best seen so far, the test loader is
    evaluated and ``model_best.pth`` / ``score_list_best.pkl`` are written.
    Training stops once ``model.waiting`` exceeds 100.

    Args:
        exp_dict: Experiment configuration. Reads ``dataset``,
            ``dataset_size``, ``model``, ``batch_size`` and ``max_epoch``.
        savedir_base: Directory under which the experiment folder lives.
        datadir: Dataset root handed to ``datasets.get_dataset``.
        reset: When truthy, delete/back up the experiment folder first.
        num_workers: Worker count for every ``DataLoader``.

    Returns:
        None.
    """
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])
    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])

    val_loader = DataLoader(val_set,
                            batch_size=1,
                            collate_fn=ut.collate_fn,
                            num_workers=num_workers)
    test_loader = DataLoader(test_set,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    # NOTE(review): the early-stopping state is re-initialised even when
    # resuming, so the first resumed epoch always counts as a new best and
    # overwrites model_best.pth -- confirm whether that is intended.
    model.waiting = 0
    model.val_score_best = -np.inf

    # Each epoch draws 2 * len(test_set) training samples with replacement
    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 *
                                                   len(test_set))
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              collate_fn=ut.collate_fn,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    # Fixed: keep `e` defined when no epoch is left to run (e.g. resuming a
    # finished experiment); it then names the last epoch already completed.
    e = s_epoch - 1
    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # NOTE(review): this start-of-epoch test pass only has an effect
        # through val_on_loader itself (e.g. the images it is asked to
        # save); its returned dict is discarded unless the epoch below turns
        # out to be a new best, where it is recomputed anyway.
        test_dict = model.val_on_loader(test_loader,
                                        savedir_images=os.path.join(
                                            savedir, "images"),
                                        n_images=3)

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)
        score_dict["val_score"] = val_dict["val_score"]

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save Best Checkpoint
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                            savedir_images=os.path.join(
                                                savedir, "images"),
                                            n_images=3)
            score_dict.update(test_dict)
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Early stopping: too many epochs without a new best score
        if model.waiting > 100:
            break

    print('Experiment completed et epoch %d' % e)
def trainval(exp_dict, savedir_base, datadir_base, reset=False):
    """Run an active-learning experiment made of labelling cycles.

    Every cycle seeds the RNGs with the cycle index, asks the active set to
    label a new batch (unless the cycle is being resumed midway), and then
    trains for up to ``exp_dict['max_epoch']`` epochs. ``model.pth``,
    ``score_list.pkl`` and ``active_set.pkl`` in the experiment folder are
    the checkpoints used to resume.

    Args:
        exp_dict: Experiment configuration. Reads ``dataset``, ``model``,
            ``batch_size``, ``sampler``, ``max_cycle`` and ``max_epoch``.
        savedir_base: Directory under which the experiment folder lives.
        datadir_base: Dataset root handed to ``datasets.get_dataset``.
        reset: When truthy, delete/back up the experiment folder first.
    """
    # ------------------------------------------------------------------
    # Bookkeeping
    # ------------------------------------------------------------------
    pprint.pprint(exp_dict)
    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # ------------------------------------------------------------------
    # Data: train pool wrapped for active learning, plus a validation set
    # ------------------------------------------------------------------
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     datadir_base=datadir_base,
                                     exp_dict=exp_dict)
    active_set = ActiveLearningDataset(train_set, random_state=42)

    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   datadir_base=datadir_base,
                                   exp_dict=exp_dict)
    val_loader = DataLoader(val_set, batch_size=exp_dict["batch_size"])

    # ------------------------------------------------------------------
    # Model and checkpoint locations
    # ------------------------------------------------------------------
    model = models.get_model(model_name=exp_dict['model']['name'],
                             exp_dict=exp_dict).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")
    active_set_path = os.path.join(savedir, "active_set.pkl")
    images_dir = os.path.join(savedir, "images")

    if not os.path.exists(score_list_path):
        # Nothing on disk: start from the first cycle
        score_list = []
        inner_s_epoch = 0
        s_cycle = 0
    else:
        # Pick up model, labelled pool and scores where the last run ended
        model.set_state_dict(hu.torch_load(model_path))
        active_set.load_state_dict(hu.load_pkl(active_set_path))
        score_list = hu.load_pkl(score_list_path)
        last_scores = score_list[-1]
        inner_s_epoch = last_scores['inner_epoch'] + 1
        s_cycle = last_scores['cycle']

    # ------------------------------------------------------------------
    # Train & Val
    # ------------------------------------------------------------------
    print("Starting experiment at cycle %d epoch %d" %
          (s_cycle, inner_s_epoch))

    for cycle in range(s_cycle, exp_dict['max_cycle']):
        # One seed per cycle
        np.random.seed(cycle)
        torch.manual_seed(cycle)
        torch.cuda.manual_seed_all(cycle)

        # A cycle that starts at its first epoch needs fresh labels
        if inner_s_epoch == 0:
            active_set.label_next_batch(model)
            hu.save_pkl(active_set_path, active_set.state_dict())

        cycle_sampler = samplers.get_sampler(exp_dict['sampler']['train'],
                                             active_set)
        train_loader = DataLoader(active_set,
                                  sampler=cycle_sampler,
                                  batch_size=exp_dict["batch_size"])

        # NOTE(review): vis_loader is not defined in this function; it must
        # come from the enclosing module -- confirm.
        model.vis_on_loader(vis_loader, savedir=images_dir)

        for epoch in range(inner_s_epoch, exp_dict['max_epoch']):
            score_dict = {}

            # Validation only happens on the first epoch of a cycle
            if epoch == 0:
                val_scores = model.val_on_loader(val_loader)
                score_dict.update(val_scores)

            train_scores = model.train_on_loader(train_loader)
            score_dict.update(train_scores)

            # Bookkeeping columns
            score_dict["epoch"] = len(score_list)
            score_dict["inner_epoch"] = epoch
            score_dict["cycle"] = cycle
            score_dict['n_ratio'] = active_set.n_labelled_ratio
            score_dict["n_train"] = len(train_loader.dataset)
            score_dict["n_pool"] = len(train_loader.dataset.pool)

            score_list.append(score_dict)

            # Report and checkpoint
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail(), "\n")
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

        # Later cycles always begin at their first epoch
        inner_s_epoch = 0
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    """Train and validate a model, checkpointing after every epoch.

    The experiment directory is ``savedir_base/<hash of exp_dict>``. Each
    epoch trains on a with-replacement sample of the train set, validates,
    and writes ``model.pth`` / ``score_list.pkl`` (used to resume).
    ``model_best.pth`` / ``score_list_best.pkl`` are refreshed on epoch 0
    and whenever the validation score beats all earlier ones.

    Args:
        exp_dict: Experiment configuration. Reads ``dataset``,
            ``dataset_size``, ``model``, ``batch_size`` and ``max_epoch``.
        savedir_base: Directory under which the experiment folder lives.
        datadir: Dataset root handed to ``datasets.get_dataset``.
        reset: When truthy, delete/back up the experiment folder first.
        num_workers: Worker count for every ``DataLoader``.

    Returns:
        None.
    """
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=1,
                            num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    # Each epoch draws 2 * len(val_set) training samples with replacement
    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 *
                                                   len(val_set))
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    # Fixed: keep `e` defined when no epoch is left to run (e.g. resuming a
    # finished experiment); it then names the last epoch already completed.
    e = s_epoch - 1
    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(
                                           savedir, "images"),
                                       n_images=3)
        score_dict.update(val_dict)

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint: always on epoch 0, afterwards only when the
        # new score beats every earlier one (missing scores count as 0)
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    print('Experiment completed et epoch %d' % e)
def trainval(exp_dict, savedir_base, reset):
    """Train and validate an MLP on MNIST, checkpointing every epoch.

    The experiment directory is ``savedir_base/<hash of exp_dict>``; MNIST
    itself is downloaded into ``savedir_base``. After every epoch
    ``model.pth`` and ``score_list.pkl`` are rewritten so an interrupted
    run resumes from the number of epochs already recorded.

    Args:
        exp_dict: Experiment configuration. Reads ``batch_size``, ``lr`` and
            the optional ``max_epoch`` (defaults to 100, the value that was
            previously hard-coded).
        savedir_base: Directory holding the dataset and experiment folders.
        reset: When truthy, delete/back up the experiment folder first.

    Returns:
        None.
    """
    # ==================
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    # os.path.join avoids the trailing and doubled slashes that the former
    # string concatenation produced in every derived path.
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # ==================
    # Dataset
    # ==================
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, ), (0.5, ))
    ])

    # train set
    train_set = torchvision.datasets.MNIST(savedir_base,
                                           train=True,
                                           download=True,
                                           transform=transform)
    train_loader = DataLoader(train_set,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # val set
    val_set = torchvision.datasets.MNIST(savedir_base,
                                         train=False,
                                         download=True,
                                         transform=transform)
    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=exp_dict["batch_size"])

    # ==================
    # Model
    # ==================
    model = MLP(n_classes=10).cuda()
    model.opt = torch.optim.Adam(model.parameters(), lr=exp_dict["lr"])

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = len(score_list)
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # ==================
    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % s_epoch)

    max_epoch = exp_dict.get("max_epoch", 100)
    for e in range(s_epoch, max_epoch):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate the model
        score_dict.update(model.val_on_loader(val_loader))
        score_dict["epoch"] = e

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail()[["epoch", "train_loss", "val_acc"]], "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        # Fixed: report the experiment folder the checkpoint was written
        # to, not the base directory.
        print("Checkpoint Saved: %s" % savedir)