def kill_jobs(self):
    add_job_utils()
    import haven_jobs_utils as hju

    hu.check_duplicates(self.exp_list)

    pr = hu.Parallel()
    submit_dict = {}

    for exp_dict in self.exp_list:
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(self.savedir_base, exp_id)
        fname = hju.get_job_fname(savedir)

        if os.path.exists(fname):
            job_id = hu.load_json(fname)['job_id']
            pr.add(hju.kill_job, self.api, job_id)
            submit_dict[exp_id] = 'KILLED'
        else:
            submit_dict[exp_id] = 'NON-EXISTENT'

    pr.run()
    pr.close()

    pprint.pprint(submit_dict)
    print("%d/%d experiments killed." %
          (len([s for s in submit_dict.values() if 'KILLED' in s]),
           len(submit_dict)))
    return submit_dict
def __init__(self, datadir, split, supervision, exp_dict=None, sbd=False):
    self.split = split
    self.exp_dict = exp_dict
    self.supervision = supervision
    self.n_classes = 21
    self.datadir = datadir

    if split == "train":
        if sbd:
            # berkeley addition of images
            dataset = torchvision.datasets.SBDataset(
                os.path.join(datadir, 'sbdataset'),
                image_set='train',
                download=False)
        else:
            dataset = torchvision.datasets.VOCSegmentation(
                datadir, year='2012', image_set='train', download=False)
    elif split in ["val", "test"]:
        dataset = torchvision.datasets.VOCSegmentation(
            datadir, image_set='val', download=False)

    self.point_dict = hu.load_json(
        os.path.join(datadir, 'VOCdevkit', 'pascal2012_trainval_main.json'))

    self.dataset = dataset
    self.transforms = None
def __init__(self, datadir, split, exp_dict=None):
    self.split = split
    self.exp_dict = exp_dict
    self.split_number = exp_dict["dataset"]["%s_split_number" % split]
    self.stratification = exp_dict["dataset"]["stratification"]

    meta = hu.load_json(
        os.path.join(datadir, "splits", self.stratification,
                     "%s_%d.json" % (split, self.split_number)))

    self.labels = []
    self.meta_list = []
    for path, attributes in meta.items():
        label_dict = attributes["points"]
        if len(label_dict) == 0:
            point_list = []
        else:
            point_list = list(list(label_dict.values())[0].values())

        fname = os.path.join(datadir, path)
        meta_dict = {
            'fname': fname,
            'point_list': point_list,
            'count': len(point_list)
        }
        self.labels.append(int(len(point_list) > 0))
        self.meta_list.append(meta_dict)

    self.labels = np.array(self.labels)
    print('Foreground Ratio: %.3f' % self.labels.mean())

    self.transforms = None
def __init__(self, rm, vars=None, show_jobs=True, wide_display=True):
    self.rm_original = rm

    if vars is None:
        # Fall back to the saved dashboard history when no vars are given
        fname = os.path.join(rm.savedir_base, '.dashboard_history.json')
        if os.path.exists(fname):
            self.vars = hu.load_json(fname)
        else:
            self.vars = {}
    else:
        self.vars = vars

    self.show_jobs = show_jobs
    self.wide_display = wide_display

    self.layout = widgets.Layout(width='100px')
    self.layout_label = widgets.Layout(width='200px')
    self.layout_dropdown = widgets.Layout(width='200px')
    self.layout_button = widgets.Layout(width='200px')

    self.t_savedir_base = widgets.Text(
        value=str(self.vars['savedir_base']),
        layout=widgets.Layout(width='600px'),
        disabled=False)

    self.t_filterby_list = widgets.Text(
        value=str(self.vars.get('filterby_list')),
        layout=widgets.Layout(width='1200px'),
        description='filterby_list:',
        disabled=False)
def launch_exp_dict(self, exp_dict, savedir, command, job=None):
    """Submit a job and save the job dict and exp_dict."""
    # Check for duplicates
    # if job is not None:
    #     assert self._assert_no_duplicates(job)

    fname_exp_dict = os.path.join(savedir, "exp_dict.json")
    hu.save_json(fname_exp_dict, exp_dict)
    assert hu.hash_dict(hu.load_json(fname_exp_dict)) == hu.hash_dict(exp_dict)

    # Define paths
    workdir_job = os.path.join(savedir, "code")

    # Copy the experiment code into the experiment folder
    hu.copy_code(self.workdir + "/", workdir_job, verbose=0)

    # Run command
    job_id = self.submit_job(command, workdir_job, savedir_logs=savedir)

    # Verbose
    if self.verbose:
        print("Job_id: %s command: %s" % (job_id, command))

    job_dict = {"job_id": job_id, "command": command}
    hu.save_json(get_job_fname(savedir), job_dict)

    return job_dict
def __init__(self, model, nclasses, exp_dict):
    """ Constructor
    Args:
        model: architecture to train
        nclasses: number of output classes
        exp_dict: reference to dictionary with the hyperparameters
    """
    super().__init__()
    self.model = model
    self.exp_dict = exp_dict
    self.ngpu = self.exp_dict["ngpu"]

    self.embedding_propagation = EmbeddingPropagation()
    self.label_propagation = LabelPropagation()
    self.model.add_classifier(nclasses, modalities=0)
    self.nclasses = nclasses

    if self.exp_dict["rotation_weight"] > 0:
        self.model.add_classifier(4, "classifier_rot")

    best_accuracy = -1
    if self.exp_dict["pretrained_weights_root"] is not None:
        # Pick the best matching pretraining checkpoint
        for exp_hash in os.listdir(self.exp_dict['pretrained_weights_root']):
            base_path = os.path.join(self.exp_dict['pretrained_weights_root'], exp_hash)
            exp_dict_path = os.path.join(base_path, 'exp_dict.json')
            if not os.path.exists(exp_dict_path):
                continue
            loaded_exp_dict = haven.load_json(exp_dict_path)
            pkl_path = os.path.join(base_path, 'score_list_best.pkl')
            if (loaded_exp_dict["model"]["name"] == 'pretraining'
                    and loaded_exp_dict["dataset_train"].split('_')[-1] == exp_dict["dataset_train"].split('_')[-1]
                    and loaded_exp_dict["model"]["backbone"] == exp_dict['model']["backbone"]
                    # and loaded_exp_dict["labelprop_alpha"] == exp_dict["labelprop_alpha"]
                    # and loaded_exp_dict["labelprop_scale"] == exp_dict["labelprop_scale"]
                    and os.path.exists(pkl_path)):
                accuracy = haven.load_pkl(pkl_path)[-1]["val_accuracy"]
                try:
                    self.model.load_state_dict(
                        torch.load(os.path.join(base_path, 'checkpoint_best.pth'))['model'],
                        strict=False)
                    if accuracy > best_accuracy:
                        best_path = os.path.join(base_path, 'checkpoint_best.pth')
                        best_accuracy = accuracy
                except Exception:
                    continue
        assert best_accuracy > 0.1
        print("Finetuning %s with original accuracy: %f" % (best_path, best_accuracy))
        self.model.load_state_dict(torch.load(best_path)['model'], strict=False)

    # Add optimizers here
    self.optimizer = torch.optim.SGD(self.model.parameters(),
                                     lr=self.exp_dict["lr"],
                                     momentum=0.9,
                                     weight_decay=self.exp_dict["weight_decay"],
                                     nesterov=True)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        mode="min" if "loss" in self.exp_dict["target_loss"] else "max",
        patience=self.exp_dict["patience"])

    self.model.cuda()
    if self.ngpu > 1:
        self.parallel_model = torch.nn.DataParallel(self.model,
                                                    device_ids=list(range(self.ngpu)))
def _submit_job(self, exp_dict, command, reset, submit_dict={}):
    """Submit one job.

    It checks if the experiment exists and manages the special cases, e.g.,
    new experiment, reset, failed, job already running, completed.
    """
    add_job_utils()
    import haven_jobs_utils as hju

    # Define paths
    savedir = os.path.join(self.savedir_base, hu.hash_dict(exp_dict))
    fname = hju.get_job_fname(savedir)

    if not os.path.exists(fname):
        # The job has never been launched
        job_dict = self.launch_job(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Launching"

    elif reset:
        # Kill the existing job, back up the experiment, and relaunch
        job_id = hu.load_json(fname).get("job_id")
        hju.kill_job(self.api, job_id)
        hc.delete_and_backup_experiment(savedir)

        job_dict = self.launch_job(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Resetting"

    else:
        job_id = hu.load_json(fname).get("job_id")
        job = hju.get_job(self.api, job_id)

        if job.alive or job.state == 'SUCCEEDED':
            # If the job is alive or succeeded, do nothing
            message = 'IGNORED: Job %s' % job.state

        elif job.state in ["FAILED", "CANCELLED"]:
            message = "SUBMITTED: Retrying %s Job" % job.state
            job_dict = self.launch_job(exp_dict, savedir, command, job=job)
            job_id = job_dict['job_id']

        else:
            # This shouldn't happen
            raise ValueError('unexpected job state: %s' % job.state)

    submit_dict[job_id] = message
def generate_obj_from_file(img_file, points_file, obj_params=dict()):
    obj_params = validate_obj_params(obj_params)
    img = sio.imread(img_file)
    points = hu.load_json(points_file)

    if type(points) is dict:
        dist = np.stack(
            [generate_obj(img, v, obj_params)[0] for _, v in points.items()],
            axis=0)
        return dist
    elif type(points) is list:
        print("It's a List")
    else:
        print("Type Error:")

    return generate_obj(img, points, obj_params=obj_params)
def get_existing_slurm_job_commands(exp_list, savedir_base):
    existing_job_commands = []
    for exp_dict in exp_list:
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(savedir_base, exp_id)
        file_name = os.path.join(savedir, "job_dict.json")
        if not os.path.exists(file_name):
            continue

        job_dict = hu.load_json(file_name)
        job_id = job_dict["job_id"]
        job_status = hu.subprocess_call(
            "scontrol show job %s" % job_id).split("JobState=")[1].split(" ")[0]
        if job_status == "RUNNING" or job_status == "PENDING":
            existing_job_commands += [job_dict["command"]]

    return existing_job_commands
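# --- Hedged usage sketch (not part of the original snippet) -----------------
# Shows one way the command list returned by get_existing_slurm_job_commands()
# could be used to avoid resubmitting experiments whose SLURM jobs are still
# RUNNING or PENDING. `exp_to_command` is an assumed placeholder for whatever
# helper builds the launch command of an exp_dict in the surrounding codebase.
def filter_unsubmitted(exp_list, savedir_base, exp_to_command):
    # Commands of jobs that SLURM still reports as running or pending
    existing = set(get_existing_slurm_job_commands(exp_list, savedir_base))
    # Keep only the experiments whose command is not already queued
    return [e for e in exp_list if exp_to_command(e) not in existing]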
def __init__(self, exp_dict):
    super().__init__()
    self.exp_dict = exp_dict
    self.backbone = get_backbone(exp_dict)
    self.backbone.cuda()

    self.min_lr = exp_dict["lr"] * exp_dict["min_lr_decay"]
    self.optimizer = torch.optim.Adam(self.backbone.parameters(),
                                      lr=exp_dict['lr'],
                                      weight_decay=1e-4)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        mode='min',
        factor=0.1,
        patience=10,
        min_lr=self.min_lr / 100,
        verbose=True)

    if self.exp_dict["amp"] > 0:
        self.scaler = amp.GradScaler()

    pretrained_weights_folder = self.exp_dict.get("pretrained_weights_folder", None)
    if pretrained_weights_folder is not None:
        loaded = False
        all_exps = os.listdir(pretrained_weights_folder)
        for exp in all_exps:
            if exp == "deleted":
                continue
            # Load the candidate experiment's config and check it matches ours
            loaded_exp_dict = hu.load_json(
                join(pretrained_weights_folder, exp, 'exp_dict.json'))
            if loaded_exp_dict["seed"] == self.exp_dict["seed"] and \
                    loaded_exp_dict["dataset"]["augmentation"] == self.exp_dict["dataset"]["augmentation"] and \
                    loaded_exp_dict["dataset"]["task"] == self.exp_dict["dataset"]["task"] and \
                    loaded_exp_dict["backbone"]["name"] == self.exp_dict["backbone"]["name"]:
                try:
                    state = torch.load(join(pretrained_weights_folder, exp, 'model.pth'))
                    self.set_state_dict(state)
                    loaded = True
                    break
                except Exception as e:
                    print(e)
        if not loaded:
            raise RuntimeError("No matching pre-trained weights were found")
def random_read_subregion(self, file_list, random_seed=False):
    if random_seed:
        np.random.seed(random_seed)
    random_state = np.random.random(size=(2, ))

    # Read only the image header to get its size, then extract a random patch
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(file_list[0])
    file_reader.ReadImageInformation()
    image_size = file_reader.GetSize()

    extractindex = [
        int((img_dim - self.patch_size) * random_)
        for img_dim, random_ in zip(image_size, random_state)
    ]
    file_reader.SetExtractIndex(extractindex)
    file_reader.SetExtractSize([self.patch_size, self.patch_size])
    return_item = [sitk.GetArrayFromImage(file_reader.Execute())[..., :3]]

    for file in file_list[1:-1]:
        file_reader.SetFileName(file)
        return_item.append(sitk.GetArrayFromImage(file_reader.Execute()))

    # The last entry of file_list is a JSON file with point annotations;
    # keep only the points that fall inside the extracted patch.
    points_crop = dict()
    for k, v in hu.load_json(file_list[-1]).items():
        if len(v) == 0:
            points_crop[k] = v
        else:
            v = np.array(v)
            ind = np.logical_and(
                np.logical_and((v[:, 0] - extractindex[0]) >= 0,
                               (v[:, 0] < extractindex[0] + self.patch_size)),
                np.logical_and((v[:, 1] - extractindex[1]) >= 0,
                               (v[:, 1] < extractindex[1] + self.patch_size)))
            points_crop[k] = v[ind, :] - np.array(extractindex)[None]
    return_item.append(points_crop)

    return return_item
def launch_or_ignore_exp_dict(self, exp_dict, command, reset, savedir, submit_dict={}):
    """Launch or ignore a job.

    It checks if the experiment exists and manages the special cases, e.g.,
    new experiment, reset, failed, job already running, completed.
    """
    # Define paths
    fname = get_job_fname(savedir)

    if not os.path.exists(fname):
        # The job has never been launched
        job_dict = self.launch_exp_dict(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Launching"

    elif reset:
        # Kill the existing job, back up the experiment, and relaunch
        job_id = hu.load_json(fname).get("job_id")
        self.kill_job(job_id)
        hc.delete_and_backup_experiment(savedir)

        job_dict = self.launch_exp_dict(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Resetting"

    else:
        job_id = hu.load_json(fname).get("job_id")
        job = self.get_job(job_id)

        if job['state'] in ['SUCCEEDED', 'RUNNING', 'PENDING', 'COMPLETED', 'COMPLETING']:
            # If the job is alive or done, do nothing
            message = 'IGNORED: Job %s' % job['state']

        elif job['state'] in ["FAILED", "CANCELLED", "INTERRUPTED", "TIMEOUT"]:
            message = "SUBMITTED: Retrying %s Job" % job['state']
            job_dict = self.launch_exp_dict(exp_dict, savedir, command, job=job)
            job_id = job_dict['job_id']

        else:
            # This shouldn't happen
            raise ValueError('unexpected job state: %s' % job['state'])

    submit_dict[job_id] = {'exp_dict': exp_dict, 'message': message}
parser.add_argument('-e', '--exp_group_list', nargs="+") parser.add_argument('-sb', '--savedir_base', required=True) parser.add_argument('-d', '--datadir', default=None) parser.add_argument("-r", "--reset", default=0, type=int) parser.add_argument("-ei", "--exp_id", default=None) parser.add_argument("-j", "--run_jobs", default=0, type=int) parser.add_argument("-nw", "--num_workers", type=int, default=0) args = parser.parse_args() # Collect experiments # =================== if args.exp_id is not None: # select one experiment savedir = os.path.join(args.savedir_base, args.exp_id) exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json")) exp_list = [exp_dict] else: # select exp group exp_list = [] for exp_group_name in args.exp_group_list: exp_list += exp_configs.EXP_GROUPS[exp_group_name] for exp_dict in exp_list: # do trainval trainval(exp_dict=exp_dict, savedir_base=args.savedir_base, datadir=args.datadir, reset=args.reset,
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-e', '--exp_dict', required=True, type=str) # parser.add_argument('-sb', '--savedir_base', required=True) parser.add_argument('-d', '--datadir', required=True) parser.add_argument("-r", "--reset", default=0, type=int) # parser.add_argument("-ei", "--exp_id", default=None) # parser.add_argument("-j", "--run_jobs", default=0, type=int) parser.add_argument("-nw", "--num_workers", type=int, default=0) args = parser.parse_args() # Collect experiments # =================== exp_dict = hu.load_json(args.exp_dict) # do trainval folddir_10 = glob.glob(os.path.join(args.datadir, '*_fold')) for folddir in folddir_10: savedir_base = os.path.join(folddir, 'Result') os.makedirs(savedir_base, exist_ok=True) # trainval(exp_dict, savedir_base, datadir, folddir, reset=True, num_workers=25) trainval(exp_dict=exp_dict, savedir_base=savedir_base, datadir=args.datadir, reset=args.reset, num_workers=args.num_workers)
def __init__(self, model, n_classes, exp_dict, pretrained_savedir=None, savedir_base=None):
    """ Constructor
    Args:
        model: architecture to train
        exp_dict: reference to dictionary with the global state of the application
    """
    super().__init__()
    self.model = model
    self.exp_dict = exp_dict
    self.ngpu = self.exp_dict["ngpu"]
    self.predict_method = exp_dict['predict_method']

    self.model.add_classifier(n_classes, modalities=0)
    self.nclasses = n_classes

    best_accuracy = -1
    self.label = exp_dict['model']['backbone'] + "_" + \
        exp_dict['dataset_test'].split('_')[1].replace('-imagenet', '')

    print('=============')
    print('dataset:', exp_dict["dataset_train"].split('_')[-1])
    print('backbone:', exp_dict['model']["backbone"])
    print('n_classes:', exp_dict['n_classes'])
    print('support_size_train:', exp_dict['support_size_train'])

    if pretrained_savedir is None:
        # find the best checkpoint
        savedir_base = exp_dict["finetuned_weights_root"]
        if not os.path.exists(savedir_base):
            raise ValueError("Please set the variable named "
                             "'finetuned_weights_root' with the path of the folder "
                             "with the episodic finetuning experiments")
        for exp_hash in os.listdir(savedir_base):
            base_path = os.path.join(savedir_base, exp_hash)
            exp_dict_path = os.path.join(base_path, 'exp_dict.json')
            if not os.path.exists(exp_dict_path):
                continue
            loaded_exp_dict = hu.load_json(exp_dict_path)
            pkl_path = os.path.join(base_path, 'score_list_best.pkl')

            if exp_dict['support_size_train'] in [2, 3, 4]:
                support_size_needed = 1
            else:
                support_size_needed = exp_dict['support_size_train']

            if (loaded_exp_dict["model"]["name"] == 'finetuning'
                    and loaded_exp_dict["dataset_train"].split('_')[-1] == exp_dict["dataset_train"].split('_')[-1]
                    and loaded_exp_dict["model"]["backbone"] == exp_dict['model']["backbone"]
                    and loaded_exp_dict['n_classes'] == exp_dict["n_classes"]
                    and loaded_exp_dict['support_size_train'] == support_size_needed
                    and loaded_exp_dict["embedding_prop"] == exp_dict["embedding_prop"]):
                model_path = os.path.join(base_path, 'checkpoint_best.pth')
                try:
                    print("Attempting to load ", model_path)
                    accuracy = hu.load_pkl(pkl_path)[-1]["val_accuracy"]
                    self.model.load_state_dict(torch.load(model_path)['model'], strict=False)
                    if accuracy > best_accuracy:
                        best_path = os.path.join(base_path, 'checkpoint_best.pth')
                        best_accuracy = accuracy
                except Exception as e:
                    print(e)

    assert best_accuracy > 0.1
    print("Finetuning %s with original accuracy: %f" % (best_path, best_accuracy))
    self.model.load_state_dict(torch.load(best_path)['model'], strict=False)

    self.best_accuracy = best_accuracy
    self.acc_sum = 0.0
    self.n_count = 0
    self.model.cuda()
def __init__(self, model, n_classes, exp_dict):
    """ Constructor
    Args:
        model: architecture to train
        exp_dict: reference to dictionary with the global state of the application
    """
    super().__init__()
    self.model = model
    self.exp_dict = exp_dict
    self.ngpu = self.exp_dict["ngpu"]
    self.predict_method = exp_dict['predict_method']

    self.model.add_classifier(n_classes, modalities=0)
    self.nclasses = n_classes

    if self.exp_dict["rotation_weight"] > 0:
        self.model.add_classifier(4, "classifier_rot")

    best_accuracy = -1
    self.label = exp_dict['model']['backbone'] + "_" + \
        exp_dict['dataset_test'].split('_')[1].replace('-imagenet', '')

    if self.exp_dict["pretrained_weights_root"] == 'tinder':
        best_scores = np.load(
            '/mnt/datasets/public/research/adaptron_laplace/best_scores.npy',
            allow_pickle=True)
        for r in best_scores:
            backbone_best = r[3]
            dataset_best = r[4]
            savedir_best = r[-1]
            best_accuracy = r[0]
            shot_best = r[2]
            if (exp_dict['model']['backbone'] == backbone_best
                    and exp_dict['dataset_test'] == dataset_best
                    and 5 == shot_best):
                self.best_accuracy = best_accuracy
                self.model.load_state_dict(
                    torch.load(os.path.join(savedir_best, 'checkpoint_best.pth'))['model'])
                break

    elif self.exp_dict["pretrained_weights_root"] == 'csv':
        best_scores = np.load(
            '/mnt/datasets/public/research/adaptron_laplace/best_scores.npy',
            allow_pickle=True)
        for r in best_scores:
            backbone_best = r[3]
            dataset_best = r[4]
            savedir_best = r[-1]
            best_accuracy = r[0]
            shot_best = r[2]
            if (exp_dict['model']['backbone'] == backbone_best
                    and exp_dict['dataset_test'] == dataset_best
                    and exp_dict['support_size_test'] == shot_best):
                self.best_accuracy = best_accuracy
                self.model.load_state_dict(
                    torch.load(os.path.join(savedir_best, 'checkpoint_best.pth'))['model'])
                break

    elif self.exp_dict["pretrained_weights_root"] == 'hdf5':
        fdir = '/mnt/datasets/public/research/adaptron_laplace/embeddings/finetuned'
        fpos = "%s_1shot_fine_*/test.h5" % (self.label)
        embeddings_fname = glob.glob(os.path.join(fdir, fpos))[0]
        self.best_accuracy = float(embeddings_fname.split('/')[-2].split('_')[-1]) / 100.
        self.sampler = oracle.Sampler(
            embeddings_fname=embeddings_fname,
            n_classes=exp_dict['classes_test'],
            distract_flag=exp_dict.get('distract_flag', False))

    elif self.exp_dict["pretrained_weights_root"] is not None:
        for exp_hash in os.listdir(self.exp_dict['pretrained_weights_root']):
            base_path = os.path.join(self.exp_dict['pretrained_weights_root'], exp_hash)
            exp_dict_path = os.path.join(base_path, 'exp_dict.json')
            if not os.path.exists(exp_dict_path):
                continue
            loaded_exp_dict = haven.load_json(exp_dict_path)
            pkl_path = os.path.join(base_path, 'score_list_best.pkl')
            if not os.path.exists(pkl_path):
                continue
            if (loaded_exp_dict["model"]["name"] == 'finetuning'
                    and loaded_exp_dict["dataset_train"].split('_')[-1] == exp_dict["dataset_train"].split('_')[-1]
                    and loaded_exp_dict["model"]["backbone"] == exp_dict['model']["backbone"]
                    and loaded_exp_dict["labelprop_alpha"] == exp_dict["labelprop_alpha"]
                    and loaded_exp_dict["labelprop_scale"] == exp_dict["labelprop_scale"]
                    and loaded_exp_dict["support_size_train"] == exp_dict["support_size_train"]):
                accuracy = haven.load_pkl(pkl_path)[-1]["val_accuracy"]
                try:
                    self.model.load_state_dict(
                        torch.load(os.path.join(base_path, 'checkpoint_best.pth'))['model'],
                        strict=False)
                    if accuracy > best_accuracy:
                        best_path = os.path.join(base_path, 'checkpoint_best.pth')
                        best_accuracy = accuracy
                        best_score_list = haven.load_pkl(pkl_path)
                except Exception as e:
                    print(str(e))

        assert best_accuracy > 0.1
        self.best_accuracy = best_score_list[-1]['test_accuracy']
        print("Finetuning %s with original accuracy: %f" % (best_path, best_accuracy))
        self.model.load_state_dict(torch.load(best_path)['model'], strict=False)

    else:
        raise ValueError('weights are not defined')

    self.acc_sum = 0.0
    self.n_count = 0
    self.model.cuda()
def __init__(self, split, transform_lvl, datadir_base, n_samples=None,
             colorjitter=False, val_transform='identity', netA=None):
    path = datadir_base or '/mnt/datasets/public/imagenet/imagenet-data/raw-data/'

    self.name = 'imagenet'
    self.n_classes = 1000
    self.image_size = 224
    self.nc = 3

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    self.mean = normalize.mean
    self.std = normalize.std

    if split == 'train':
        if transform_lvl == 0:
            transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
            ])
            if netA is not None:
                transform.transforms.append(netA)
        elif transform_lvl == 1:
            transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.RandomCrop(self.image_size, padding=4),
            ])
        elif transform_lvl == 1.5:
            transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.RandomHorizontalFlip(),
            ])
        elif transform_lvl == 2:
            transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.RandomHorizontalFlip(),
            ])
        elif transform_lvl == 2.5:
            transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.RandomAffine(10, translate=None, scale=(0.5, 2)),
            ])
        elif transform_lvl == 3:
            transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.RandomAffine(10, translate=None, scale=(0.5, 2)),
                transforms.RandomHorizontalFlip(),
            ])
        else:
            raise ValueError('only lvls 0, 1, 1.5, 2, 2.5 and 3 are supported')

        if colorjitter:
            transform.transforms.append(
                transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5))

        # ToTensor and normalization are appended last for every training level
        transform.transforms.append(transforms.ToTensor())
        transform.transforms.append(normalize)

    elif split in ['validation', 'test']:
        # identity transform
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])

    self.transform = transform

    if split in ['train', 'validation']:
        fname = '/mnt/projects/bilvlda/dataset/imagenet/imagenet_train.json'
        if not os.path.exists(fname):
            dataset = dset.ImageFolder(root=os.path.join(path, 'train'))
            hu.save_json(fname, dataset.imgs)
        self.imgs = np.array(hu.load_json(fname))
        assert len(self.imgs) == 1281167

    elif split == 'test':
        fname = '/mnt/projects/bilvlda/dataset/imagenet/imagenet_validation.json'
        if not os.path.exists(fname):
            dataset = dset.ImageFolder(root=os.path.join(path, 'validation'))
            hu.save_json(fname, dataset.imgs)
        self.imgs = np.array(hu.load_json(fname))
        assert len(self.imgs) == 50000

    if n_samples is not None:
        assert n_samples % self.n_classes == 0, \
            'the number of samples %s must be a multiple of the number of classes %s' % (
                n_samples, self.n_classes)
        with hu.random_seed(10):
            imgs = np.array(self.imgs)
            n = int(n_samples / self.n_classes)  # number of samples per class
            # Extract a balanced subset
            ind = np.hstack([
                np.random.choice(np.where(imgs[:, 1] == l)[0], n, replace=False)
                for l in np.unique(imgs[:, 1])
            ])
            # ind = np.random.choice(imgs.shape[0], n_samples, replace=False)
            self.imgs = imgs[ind]
def get_summary_list(self, failed_only=False, columns=None, max_lines=10,
                     wrap_size=8, add_prefix=False, get_logs=True,
                     exp_list=None, savedir_base=None):
    savedir_base = savedir_base or self.savedir_base
    exp_list = exp_list or self.exp_list

    # get job ids
    job_id_list = []
    for exp_dict in exp_list:
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(savedir_base, exp_id)

        fname = get_job_fname(savedir)
        if os.path.exists(fname):
            job_id_list += [hu.load_json(fname)["job_id"]]

    jobs_dict = self.get_jobs_dict(job_id_list)

    # get summaries
    summary_list = []
    for exp_dict in exp_list:
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(savedir_base, exp_id)
        job_fname = get_job_fname(savedir)

        # General info
        result_dict = {}
        result_dict['exp_dict'] = exp_dict
        result_dict["exp_id"] = exp_id
        result_dict["job_id"] = None
        result_dict["job_state"] = 'NEVER LAUNCHED'

        if os.path.exists(job_fname):
            job_dict = hu.load_json(job_fname)
            job_id = job_dict["job_id"]
            if job_id not in jobs_dict:
                continue

            fname_exp_dict = os.path.join(savedir, "exp_dict.json")
            job = jobs_dict[job_id]
            # job_dict is a plain dict, so look the command up by key
            command = job_dict.get('command')

            # Job info
            result_dict['started_at'] = hu.time_to_montreal(fname_exp_dict)
            result_dict["job_id"] = job_id
            result_dict["job_state"] = job["state"]
            result_dict["restarts"] = len(job["runs"])
            result_dict["command"] = command

            if get_logs:
                # Logs info
                if job["state"] == "FAILED":
                    logs_fname = os.path.join(savedir, "err.txt")
                else:
                    logs_fname = os.path.join(savedir, "logs.txt")

                if os.path.exists(logs_fname):
                    result_dict["logs"] = hu.read_text(logs_fname)[-max_lines:]

        summary_list += [result_dict]

    return summary_list
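# --- Hedged usage sketch (not part of the original snippet) -----------------
# The list of plain dicts returned by get_summary_list() drops directly into
# pandas for a quick status table. `job_manager` is an assumed instance of the
# class that defines get_summary_list(); the column names below match the keys
# set inside that method.
import pandas as pd

def print_status_table(job_manager):
    summary = job_manager.get_summary_list(get_logs=False)
    df = pd.DataFrame(summary)
    print(df[['exp_id', 'job_id', 'job_state']].to_string(index=False))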
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--exp_group_list', nargs='+')
parser.add_argument('-sb', '--savedir_base', required=True)
parser.add_argument('-r', '--reset', default=0, type=int)
parser.add_argument('-ei', '--exp_id', default=None)
parser.add_argument('-j', '--run_jobs', default=None)

args = parser.parse_args()

# Collect experiments
# -------------------
if args.exp_id is not None:
    # select one experiment
    savedir = os.path.join(args.savedir_base, args.exp_id)
    exp_dict = hu.load_json(os.path.join(savedir, 'exp_dict.json'))

    exp_list = [exp_dict]
else:
    # select exp group
    exp_list = []
    for exp_group_name in args.exp_group_list:
        exp_list += exp_configs.EXP_GROUPS[exp_group_name]

# Run experiments
# ---------------
if not args.run_jobs:
    # run experiments sequentially
    for exp_dict in exp_list:
        # do trainval
def newminimum(exp_id, savedir_base, datadir, name, exp_dict, metrics_flag=True):
    # bookkeeping
    # ---------------

    # get experiment directory
    old_modeldir = os.path.join(savedir_base, exp_id)
    savedir = os.path.join(savedir_base, exp_id, name)

    old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json'))

    # TODO: compare exp dicts for possible errors:
    # the optimizer has to be the same
    # same network, dataset

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               drop_last=True,
                                               shuffle=True,
                                               batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    old_model_path = os.path.join(old_modeldir, 'model.pth')
    old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl')
    old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth')

    score_list = hu.load_pkl(old_score_list_path)
    model.load_state_dict(torch.load(old_model_path))
    opt.load_state_dict(torch.load(old_opt_path))
    s_epoch = score_list[-1]['epoch'] + 1

    # save current model state for comparison
    minimum = []
    for param in model.parameters():
        minimum.append(param.clone())

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch']))

    for epoch in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        np.random.seed(exp_dict['runs'] + epoch)
        torch.manual_seed(exp_dict['runs'] + epoch)
        # torch.cuda.manual_seed_all(exp_dict['runs'] + epoch)  # not needed since no cuda available

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name='softmax_loss')
            # metric_name=exp_dict["loss_func"])
            # TODO: which loss should be used? (normal or with regularizer?)

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            # images, labels = images.cuda(), labels.cuda()  # no cuda available
            opt.zero_grad()
            loss = loss_function(model, images, labels, minimum, 0.1)  # only works for the custom loss function
            loss.backward()
            opt.step()
        e_time = time.time()

        # Record metrics
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

        with torch.no_grad():
            print('Current distance: %f' % metrics.computedistance(minimum, model))

    print('Experiment completed')
def __getitem__(self, ind):
    real_ind = ind % self.files_no + 1

    image = sio.imread(
        os.path.join(self.data_dir, self.option, "Norms",
                     self.option.lower() + '_' + str(real_ind) + ".png"))[..., :3]
    if self.obj_option == "Gauss":
        obj = sio.imread(
            os.path.join(self.data_dir, self.option, "GaussObj",
                         self.option.lower() + '_' + str(real_ind) + ".tif"))
    else:
        obj = sio.imread(
            os.path.join(self.data_dir, self.option, "Objs",
                         self.option.lower() + '_' + str(real_ind) + ".tif"))
    bkg = sio.imread(
        os.path.join(self.data_dir, self.option, "Bkgs",
                     self.option.lower() + '_' + str(real_ind) + ".tif"))
    mask = sio.imread(
        os.path.join(self.data_dir, self.option, "GTs",
                     self.option.lower() + '_' + str(real_ind) + ".tif"))
    region = sio.imread(
        os.path.join(self.data_dir, self.option, "Regions",
                     self.option.lower() + '_' + str(real_ind) + ".tif"))
    points = hu.load_json(
        os.path.join(self.data_dir, self.option, "Pts",
                     self.option.lower() + '_' + str(real_ind) + ".json"))

    if self.transform:
        random_seed = self.random_seeds[ind]
        random.seed(random_seed)
        transformed = self.transform(image=image,
                                     keypoints=points["1"],
                                     keypoints0=points["2"],
                                     keypoints1=points["3"],
                                     keypoints2=points["4"],
                                     keypoints3=points["5"],
                                     keypoints4=points["6"],
                                     keypoints5=points["7"],
                                     mask=mask,
                                     mask0=bkg,
                                     mask1=obj,
                                     mask2=region)
        image = transformed["image"]
        points["1"] = np.array(transformed["keypoints"]).astype(int)
        points["2"] = np.array(transformed["keypoints0"]).astype(int)
        points["3"] = np.array(transformed["keypoints1"]).astype(int)
        points["4"] = np.array(transformed["keypoints2"]).astype(int)
        points["5"] = np.array(transformed["keypoints3"]).astype(int)
        points["6"] = np.array(transformed["keypoints4"]).astype(int)
        points["7"] = np.array(transformed["keypoints5"]).astype(int)
        mask = transformed["mask"]
        bkg = transformed["mask0"]
        obj = transformed["mask1"]
        region = transformed["mask2"]

        point_label = np.zeros_like(mask)
        counts = 0
        for k, v in points.items():
            counts += len(v)
            if len(v) > 0:
                point_label[v[:, 1], v[:, 0]] = int(k)

        return {
            'images': torch.FloatTensor(image.transpose(2, 0, 1)) / 255.0,
            'points': torch.FloatTensor(point_label),
            'bkg': torch.FloatTensor(bkg),
            'obj': torch.FloatTensor(obj),
            'gt': torch.FloatTensor(mask),
            'region': torch.FloatTensor(region),
            'counts': counts,
            'meta': {'index': ind}
        }
    else:
        counts = len(points)
        return {
            'images': torch.FloatTensor(image.transpose(2, 0, 1)) / 255.0,
            'counts': counts,
            'meta': {'index': ind},
            'gt': torch.FloatTensor(mask)
        }
def get_summary(self, failed_only=False, columns=None, max_lines=200):
    """Return a dict summarizing the job state, logs, and status counts of every experiment."""
    add_job_utils()
    import haven_jobs_utils as hju

    # get job ids
    job_id_list = []
    for exp_dict in self.exp_list:
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(self.savedir_base, exp_id)

        fname = hju.get_job_fname(savedir)
        if os.path.exists(fname):
            job_id_list += [hu.load_json(fname)["job_id"]]

    jobs_dict = hju.get_jobs_dict(self.api, job_id_list)

    # fill summary
    summary_dict = {'table': [], 'status': [], 'logs_failed': [], 'logs': []}
    for exp_dict in self.exp_list:
        result_dict = copy.deepcopy(exp_dict)

        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(self.savedir_base, exp_id)
        result_dict["exp_id"] = exp_id
        fname = hju.get_job_fname(savedir)

        # Job results
        result_dict["job_id"] = None
        result_dict["job_state"] = 'NEVER LAUNCHED'

        if os.path.exists(fname):
            job_dict = hu.load_json(fname)
            job_id = job_dict["job_id"]
            if job_id not in jobs_dict:
                continue

            job = jobs_dict[job_id]
            result_dict['started at (Montreal)'] = job_dict["started at (Montreal)"]
            result_dict["job_id"] = job_id
            result_dict["job_state"] = job.state
            summary_dict['table'] += [copy.deepcopy(result_dict)]

            result_dict["command"] = job.command[2]
            if job.state == "FAILED":
                fname = os.path.join(savedir, "err.txt")
                if os.path.exists(fname):
                    result_dict["logs"] = hu.read_text(fname)[-max_lines:]
                    summary_dict['logs_failed'] += [result_dict]
                elif self.verbose:
                    print('%s: err.txt does not exist' % exp_id)
            else:
                fname = os.path.join(savedir, "logs.txt")
                if os.path.exists(fname):
                    result_dict["logs"] = hu.read_text(fname)[-max_lines:]
                    summary_dict['logs'] += [result_dict]
                elif self.verbose:
                    print('%s: logs.txt does not exist' % exp_id)
        else:
            result_dict['job_state'] = 'NEVER LAUNCHED'
            summary_dict['table'] += [copy.deepcopy(result_dict)]

    # get info
    df = pd.DataFrame(summary_dict['table'])
    df = df.set_index('exp_id')
    if columns:
        df = df[[c for c in columns if (c in df.columns and c not in ['err'])]]

    if "job_state" in df:
        stats = np.vstack(np.unique(df['job_state'].fillna("NaN"), return_counts=True)).T
        status = [{a: b} for (a, b) in stats]
    else:
        df['job_state'] = None
        status = []  # no job states available yet

    summary_dict['status'] = status
    summary_dict['table'] = df
    summary_dict['queuing'] = df[df['job_state'] == 'QUEUING']
    summary_dict['running'] = df[df['job_state'] == 'RUNNING']
    summary_dict['succeeded'] = df[df['job_state'] == 'SUCCEEDED']
    summary_dict['failed'] = df[df['job_state'] == 'FAILED']

    return summary_dict
def __init__(self, split, transform_lvl, datadir_base, n_samples=None, val_transform='identity'):
    path = datadir_base or '/mnt/projects/bilvlda/dataset/tiny-imagenet-200'

    self.name = 'tinyimagenet'
    self.n_classes = 200
    self.image_size = 64
    self.nc = 3

    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    self.mean = normalize.mean
    self.std = normalize.std

    if split == 'train':
        if transform_lvl == 0:
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.ToTensor(),
                normalize,
            ])
        elif transform_lvl == 1:
            transform = transforms.Compose([
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.ToTensor(),
                normalize,
            ])
        elif transform_lvl == 1.5:
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        elif transform_lvl == 2:
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        elif transform_lvl == 2.5:
            transform = transforms.Compose([
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.RandomAffine(10, translate=None, scale=(0.5, 2)),
                transforms.ToTensor(),
                normalize,
            ])
        elif transform_lvl == 3:
            transform = transforms.Compose([
                transforms.RandomCrop(self.image_size, padding=4),
                transforms.RandomAffine(10, translate=None, scale=(0.5, 2)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        else:
            raise ValueError('only lvls 0, 1, 1.5, 2, 2.5 and 3 are supported')

    elif split in ['validation', 'test']:
        # identity transform
        if val_transform == 'identity':
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.ToTensor(),
                normalize
            ])
        elif val_transform == 'rotation':
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.RandomRotation((45, 45)),
                transforms.ToTensor(),
                normalize
            ])
        elif val_transform == 'translation':
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.Pad((4, 4, 0, 0)),
                transforms.CenterCrop(self.image_size),
                transforms.ToTensor(),
                normalize
            ])
        elif val_transform == 'zoomin':
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.Resize(int(self.image_size * 1.5)),
                transforms.CenterCrop(self.image_size),
                transforms.ToTensor(),
                normalize
            ])
        elif val_transform == 'zoomout':
            transform = transforms.Compose([
                transforms.Lambda(lambda x: x.convert("RGB")),
                transforms.Resize(int(self.image_size * 0.75)),
                transforms.Pad(4),
                transforms.ToTensor(),
                normalize
            ])

    self.transform = transform

    if split in ['train', 'validation']:
        fname = '/mnt/projects/bilvlda/dataset/tiny-imagenet-200/tinyimagenet_train.json'
        if not os.path.exists(fname):
            dataset = dset.ImageFolder(root=os.path.join(path, 'train'))
            hu.save_json(fname, dataset.imgs)
        self.imgs = np.array(hu.load_json(fname))
        assert len(self.imgs) == 100000

    elif split == 'test':
        fname = '/mnt/projects/bilvlda/dataset/tiny-imagenet-200/tinyimagenet_validation.json'
        if not os.path.exists(fname):
            dataset = dset.ImageFolder(root=os.path.join(path, 'val'))
            hu.save_json(fname, dataset.imgs)
        self.imgs = np.array(hu.load_json(fname))
        assert len(self.imgs) == 10000

    if n_samples is not None:
        with hu.random_seed(10):
            imgs = np.array(self.imgs)
            ind = np.random.choice(imgs.shape[0], n_samples, replace=False)
            self.imgs = imgs[ind]
    '017e7441c2f581b6fee9e3ac6f574edc'
]

hash_dct = {
    'b04090f27c7c52bcec65f6ba455ed2d8': 'Fully_Supervised',
    '6d4af38d64b23586e71a198de2608333': 'LCFCN',
    '84ced18cf5c1fb3ad5820cc1b55a38fa': 'LCFCN+Affinity_(ours)',
    '63f29eec3dbe1e03364f198ed7d4b414': 'Point-level_Loss',
    '017e7441c2f581b6fee9e3ac6f574edc': 'Cross_entropy_Loss+pseudo-mask'
}

datadir = '/mnt/public/datasets/DeepFish/'

score_list = []
for hash_id in hash_list:
    fname = os.path.join('/mnt/public/predictions/habitat/%s.pkl' % hash_id)
    exp_dict = hu.load_json(os.path.join(savedir_base, hash_id, 'exp_dict.json'))

    if os.path.exists(fname):
        print('FOUND:', fname)
        val_dict = hu.load_pkl(fname)
    else:
        train_set = datasets.get_dataset(
            dataset_dict=exp_dict["dataset"],
            split='train',
            datadir=datadir,
            exp_dict=exp_dict,
            dataset_size=exp_dict['dataset_size'])

        test_set = datasets.get_dataset(
            dataset_dict=exp_dict["dataset"],
            split='test',
parser.add_argument('-e', '--exp_group_list', nargs="+") parser.add_argument('-sb', '--savedir_base', required=True) parser.add_argument("-r", "--reset", default=0, type=int) parser.add_argument("-ei", "--exp_id", default=None) parser.add_argument("-v", "--view_experiments", default=None) parser.add_argument("-j", "--create_jupyter", default=None) args = parser.parse_args() # ===================== # Collect experiments # ===================== if args.exp_id is not None: # select one experiment savedir = args.savedir_base + "/%s/" % args.exp_id exp_dict = hu.load_json(savedir + "/exp_dict.json") exp_list = [exp_dict] else: # select exp group exp_list = [] for exp_group_name in args.exp_group_list: exp_list += exp_configs.EXP_GROUPS[exp_group_name] # ===================== # Run experiments or View them # ===================== if args.view_experiments: # view experiments hr.view_experiments(exp_list, savedir_base=args.savedir_base)