def get_stats(args, split, fold, subset):
    """
    Get some stats on the image sizes of specific dataset, split, fold.

    The csv file of the requested subset is loaded, the height and width of
    every listed image are collected, their min/max are reported through
    `show_msg`, and a two-panel histogram (heights, widths) is saved in
    `args.fold_folder`.

    :param args: object. `args.fold_folder` and `args.dataset` are read
        here; `args` is also forwarded to `get_rootpath_2_dataset`.
    :param split: split identifier, used to build folder and file names.
    :param fold: fold identifier, used to build folder and file names.
    :param subset: prefix of the csv file name; the file that is read is
        `{subset}_s_{split}_f_{fold}.csv`.
    :return: None.
    :raises ValueError: if the csv file does not list any sample.
    """
    os.makedirs(args.fold_folder, exist_ok=True)

    tag = "ds-{}-s-{}-f-{}-subset-{}".format(args.dataset, split, fold, subset)
    log_path = join(args.fold_folder, "log-stats-ds-{}.txt".format(tag))

    # the context manager guarantees that the log file is closed even when
    # loading the samples or reading an image fails.
    with open(log_path, 'w') as log:
        announce_msg("Going to check {}".format(args.dataset.upper()))

        relative_fold_path = join(args.fold_folder,
                                  "split_{}".format(split),
                                  "fold_{}".format(fold)
                                  )
        subset_csv = join(relative_fold_path,
                          "{}_s_{}_f_{}.csv".format(subset, split, fold)
                          )
        rootpath = get_rootpath_2_dataset(args)
        samples = csv_loader(subset_csv, rootpath)

        lh, lw = [], []
        for el in samples:
            # el[1] is the path to the image. the size is available right
            # after opening the file, so there is no need to decode and
            # convert the whole image. the handle is closed right away.
            with Image.open(el[1], 'r') as img:
                w, h = img.size
            lh.append(h)
            lw.append(w)

        if not lh:
            # min()/max() of an empty list raise an unhelpful ValueError.
            # keep the exception type but say what went wrong.
            raise ValueError(
                "No samples found in {}: can not compute size "
                "stats.".format(subset_csv))

        msg = "min h {}, \t max h {}".format(min(lh), max(lh))
        show_msg(msg, log)
        msg = "min w {}, \t max w {}".format(min(lw), max(lw))
        show_msg(msg, log)

        fig, axes = plt.subplots(nrows=1, ncols=2)
        try:
            axes[0].hist(lh)
            axes[0].set_title('Heights')
            axes[1].hist(lw)
            axes[1].set_title('Widths')
            fig.tight_layout()
            fig.savefig(join(args.fold_folder,
                             "size-stats-{}.png".format(tag)))
        finally:
            # release the figure: pyplot keeps every figure alive until it
            # is explicitly closed.
            plt.close(fig)
def get_leftover(args,
                 train_csv,
                 rootpath,
                 train_samples
                 ):
    """
    Get the leftover samples: the samples of the entire trainset whose id
    is not among the ids of the current trainset.

    :param args: object. `args.dataset` and `args.al_type` are read.
    :param train_csv: path to the csv file of the entire trainset.
    :param rootpath: root path forwarded to `csv_loader`.
    :param train_samples: list of the samples of the current trainset. the
        id of a sample is its first element (index 0).
    :return: tuple (tr_leftovers, ids_org, ids_curt, tr_original):
        tr_leftovers: deep copies of the samples of the entire trainset
        that are not in the current trainset, in the order of the entire
        trainset.
        ids_org: ids of the entire trainset.
        ids_curt: ids of the current trainset.
        tr_original: samples of the entire trainset.
        all four are empty lists when `args.al_type` is
        `constants.AL_FULL_SUP` or `constants.AL_WSL`.
    """
    tr_leftovers = []  # the leftovers...
    ids_org = []  # ids of the entire trainset.
    ids_curt = []  # ids of the current trainset set (full sup only)
    tr_original = []  # samples of entire trainset

    # drop normal samples and keep metastatic if: 1. dataset=CAM16. 2.
    # al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    if args.al_type in [constants.AL_FULL_SUP, constants.AL_WSL]:
        # nothing is left over for these types.
        return tr_leftovers, ids_org, ids_curt, tr_original

    tr_original = csv_loader(train_csv, rootpath, drop_normal=cnd_drop_n)
    ids_org = [z[0] for z in tr_original]
    ids_curt = [z[0] for z in train_samples]

    # the set is used ONLY to test membership (constant time instead of a
    # scan of the whole list for every id). we still iterate over the
    # original list, so the order of the leftovers is the order of
    # `tr_original`, and no randomness is involved.
    try:
        current_ids = set(ids_curt)
    except TypeError:
        # unhashable ids: fall back to the (slow) list lookup.
        current_ids = ids_curt

    t0 = dt.datetime.now()
    tr_leftovers = [
        deepcopy(sample) for idx, sample in zip(ids_org, tr_original)
        if idx not in current_ids
    ]
    print("Searching took {}".format(dt.datetime.now() - t0))

    return tr_leftovers, ids_org, ids_curt, tr_original
# ========================================================================== # Datasets: load csv, datasets: train, valid, test. # ========================================================================== announce_msg("SPLIT: {} \t FOLD: {}".format(args.split, args.fold)) train_csv, valid_csv, test_csv = get_csv_files(args) rootpath = get_rootpath_2_dataset(args) # drop normal samples and keep metastatic if: 1. dataset=CAM16. 2. # al_type != AL_WSL. cnd_drop_n = (args.dataset == constants.CAM16) cnd_drop_n &= (args.al_type != constants.AL_WSL) train_samples = csv_loader(train_csv, rootpath, drop_normal=cnd_drop_n) valid_samples = csv_loader(valid_csv, rootpath, drop_normal=cnd_drop_n) test_samples = csv_loader(test_csv, rootpath, drop_normal=cnd_drop_n) # remove normal from name classes. if cnd_drop_n: args.name_classes.pop("normal") announce_msg("len original trainset: {}".format(len(train_samples))) # ========================================================================== # START: PREPARE DATA FOR THE CURRENT ACTIVE LEARNING ROUND # BASED ON THE PREVIOUS ROUNDS. # ==========================================================================
def al_split_camelyon16(args):
    """
    Use the provided split:
    https://github.com/jeromerony/survey_wsl_histology/blob/master/
    datasets-split/README.md
    for active learning.

    The survey csv files (train, valid, test of split 0, fold 0) are read,
    every sample receives a unique id (a float counter shared by the three
    sets, in the order train, valid, test), the trainset is shuffled, and
    the three sets are dumped under
    `args.fold_folder/split_0/fold_0` together with the class encoding, the
    seed, a readme, and the per-set class counts.

    :param args: object. only `args.fold_folder` is read.
    :return: None.
    :raises KeyError: if the environment variable `MYSEED` is not set.
    """

    def csv_loader(fname):
        """
        Read a *.csv file. Each line contains:
        1. img: str
        2. mask: str or '' or None
        3. label: str

        :param fname: Path to the *.csv file.
        :return: List of elements. Each element is a list:
        image path, mask path (None when the field is empty), class name.
        """
        with open(fname, 'r') as f:
            out = [
                [row[0], row[1] if row[1] else None, row[2]]
                for row in csv.reader(f)
            ]

        return out

    def index_samples(rows, first_id, counts):
        """
        Attach a unique id to each sample and count the classes.

        :param rows: list of [img, mask, label] as returned by csv_loader.
        :param first_id: float. id given to the first sample.
        :param counts: dict with keys 'normal' and 'tumor'. updated in
        place. every label different from 'normal' is counted as 'tumor'.
        :return: (list of tuples (id, img, mask, label), next free id).
        """
        indexed = []
        next_id = first_id
        for img, mask, label in rows:
            indexed.append((next_id, img, mask, label))
            next_id += 1.
            counts['normal' if label == 'normal' else 'tumor'] += 1.

        return indexed, next_id

    csv_df = 'folds/camelyon16-split-0-fold-0-512-512-survey'

    # load survey csv files.
    trainset = csv_loader(join(csv_df, 'train_s_0_f_0.csv'))
    validset = csv_loader(join(csv_df, 'valid_s_0_f_0.csv'))
    testset = csv_loader(join(csv_df, 'test_s_0_f_0.csv'))

    # a distinct counter dict per set.
    stats = {
        name: {'normal': 0., 'tumor': 0.}
        for name in ('train', 'valid', 'test')
    }

    # the id counter runs across the three sets so that ids are unique.
    idcnt = 0.
    tr_set, idcnt = index_samples(trainset, idcnt, stats['train'])
    vl_set, idcnt = index_samples(validset, idcnt, stats['valid'])
    ts_set, idcnt = index_samples(testset, idcnt, stats['test'])

    dict_classes_names = {"normal": 0, "tumor": 1}

    outd = args.fold_folder
    out_fold = join(outd, "split_{}/fold_{}".format(0, 0))
    os.makedirs(out_fold, exist_ok=True)

    readme = "Format: float `id`: 0, str `img`: 1, None `mask`: 2, " \
             "str `label`: 3, int `tag`: 4 \n" \
             "Possible tags: \n" \
             "0: labeled\n" \
             "1: unlabeled\n" \
             "2: labeled but came from unlabeled set. " \
             "[not possible at this level]."

    # shuffle train. the number of shuffles is kept as is: changing it
    # changes how much of the random stream is consumed, hence the order
    # that is written to disc for a given seed.
    for t in range(1000):
        random.shuffle(tr_set)

    # train samples are dumped with the tag constants.U; valid and test
    # samples with the tag constants.L.
    dump_fold_into_csv_CAM16(
        tr_set,
        join(out_fold, "train_s_{}_f_{}.csv".format(0, 0)),
        constants.U
    )
    dump_fold_into_csv_CAM16(
        vl_set,
        join(out_fold, "valid_s_{}_f_{}.csv".format(0, 0)),
        constants.L
    )
    dump_fold_into_csv_CAM16(
        ts_set,
        join(out_fold, "test_s_{}_f_{}.csv".format(0, 0)),
        constants.L
    )

    # current fold.

    # dump the coding.
    with open(join(out_fold, "encoding.yaml"), 'w') as f:
        yaml.dump(dict_classes_names, f)

    # dump the seed
    with open(join(out_fold, "seed.txt"), 'w') as fx:
        fx.write("MYSEED: " + os.environ["MYSEED"])

    with open(join(out_fold, "readme.md"), 'w') as fx:
        fx.write(readme)

    with open(join(out_fold, "stats-sets.yaml"), 'w') as fx:
        # the total is computed before the key 'total_samples' is added.
        total = sum(
            stats[el]['normal'] + stats[el]['tumor'] for el in stats
        )
        stats['total_samples'] = total
        yaml.dump(stats, fx)
        print("Stats:", stats)

    # folder of folds

    # readme
    with open(join(args.fold_folder, "readme.md"), 'w') as fx:
        fx.write(readme)
    # coding.
    with open(join(args.fold_folder, "encoding.yaml"), 'w') as f:
        yaml.dump(dict_classes_names, f)

    print("camelyon16 splitting (`{}`) ended with success .... [OK]".format(0))
def get_init_sup_samples(args,
                         sampler,
                         COMMON,
                         train_samples,
                         OUTD
                         ):
    """
    Get the initial full supervised data.

    Nothing is done when `args.al_type` is `constants.AL_FULL_SUP` or
    `constants.AL_WSL`: `train_samples` is returned as received.
    Otherwise:
    - round 0 (`args.al_it == 0`): `sampler.sample_init_random_samples` is
    called on `train_samples`, and the result is written to
    `COMMON/train_0.csv` (after `clear_rootpath`) and copied into `OUTD`.
    - round > 0: `train_samples` is rebuilt from the files
    `COMMON/train_{t}.csv` for t in 0..args.al_it, every sample is tagged
    with `constants.L`, and the list is shuffled.

    NOTE(review): the nesting of the statements below was reconstructed
    from a source whose indentation was lost. in particular, placing the
    'block to delete' and the final shuffle inside the `round > 0` branch
    is an assumption -- confirm against the original file.

    :param args: object. `args.dataset`, `args.al_type`, `args.al_it` and
        `args.task` are read.
    :param sampler: object that provides `sample_init_random_samples`.
    :param COMMON: folder that holds the files `train_{i}.csv` and
        `train_pairs_{i}.pkl`.
    :param train_samples: list of samples.
    :param OUTD: folder that receives a copy of the csv file of round 0.
    :return: tuple (train_samples, previous_pairs, previous_errors).
        previous_pairs: empty dict, unless it has been loaded from
        `train_pairs_{al_it - 1}.pkl` (al_type is AL_LP and task is SEG).
        previous_errors: bool. True if a path has been rewritten in the
        'block to delete'.
    """
    previous_pairs = dict()
    previous_errors = False

    # drop normal samples and keep metastatic if: 1. dataset=CAM16. 2.
    # al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    # round 0
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it == 0)

    if cnd:
        # deterministic function with respect to the original seed.
        # the seed is reset before AND after the sampling.
        set_default_seed()
        train_samples = sampler.sample_init_random_samples(train_samples)
        set_default_seed()

        # store on disc: remove the rootpath from files to be host-independent.
        # store relative paths not absolute.
        base_f = 'train_{}.csv'.format(args.al_it)
        al_outf = join(COMMON, base_f)
        csv_writer(clear_rootpath(train_samples, args),
                   al_outf
                   )
        shutil.copyfile(al_outf, join(OUTD, base_f))

    # round > 0: combine all the samples of the previous al rounds
    # and the selected samples for this round.
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it > 0)

    if cnd:
        # 'train_{i}.csv' contains the selected samples at round i.
        lfiles = [join(
            COMMON, 'train_{}.csv'.format(t)) for t in range(args.al_it + 1)]

        if (args.al_type == constants.AL_LP) and (args.task == constants.SEG):
            # load previous pairs:
            # previous pairs are pairs that have been pseudo-labeled in the
            # previous al round. they are ready to be used as
            # pseudo-segmented samples. no statistical constraints will be
            # applied on them.
            fz = join(COMMON, 'train_pairs_{}.pkl'.format(args.al_it - 1))
            # NOTE(review): unpickling executes arbitrary code; this is only
            # safe as long as the file is produced by this project.
            with open(fz, 'rb') as fp:
                previous_pairs = pkl.load(fp)

        # the samples received as argument are discarded: the list is
        # rebuilt from the files of all the rounds.
        train_samples = []
        rootpath = get_rootpath_2_dataset(args)
        for fx in lfiles:
            # load using the current host-root-path.
            train_samples.extend(csv_loader(fx,
                                            rootpath,
                                            drop_normal=cnd_drop_n
                                            )
                                 )

        # Force: set all the samples in train_samples to L.
        # index 4 of a sample holds its tag.
        for tt in range(len(train_samples)):
            train_samples[tt][4] = constants.L

        # ============== block to delete =======================================
        # in the case we skipped previous rounds because we restart the
        # code, if we are in cc and use node, the paths will not match
        # since they are built upon the job id. so, we need to change it.
        if "CC_CLUSTER" in os.environ.keys():
            for i in range(len(train_samples)):
                # the first 3 components of the image path (split on
                # os.sep) are compared to SLURM_TMPDIR.
                front = os.sep.join(train_samples[i][1].split(os.sep)[:3])
                cnd = (front != os.environ["SLURM_TMPDIR"])
                if cnd:
                    # update the image input path
                    train_samples[i][1] = train_samples[i][1].replace(
                        front, os.environ["SLURM_TMPDIR"]
                    )

                    if args.task == constants.SEG:
                        # update the mask path
                        train_samples[i][2] = train_samples[i][2].replace(
                            front, os.environ["SLURM_TMPDIR"]
                        )

                    previous_errors = True

            # TODO: remove the above block. no longer necessary.
            # since we use relative paths in the node, we shouldn't have
            # mismatching paths when restarting the code.
            # NOTE(review): this check disappears when python runs with -O.
            assert not previous_errors, "ERROR."
        # ======================================================================

        # the seed is reset before and after the shuffling. the number of
        # shuffles determines the final order for a given seed.
        set_default_seed()
        for i in range(100):
            random.shuffle(train_samples)
        set_default_seed()

    return train_samples, previous_pairs, previous_errors
def compute_similarities(args,
                         tag_sims,
                         train_csv,
                         rootpath,
                         DEVICE,
                         SIMS,
                         training_log,
                         placement_node,
                         parent
                         ):
    """
    Compute similarities.

    Only active when `args.al_type` is `constants.AL_LP`. Two cases:
    - the archive `pairwise_sims/{tag_sims}.tar.gz` exists: it is
    extracted, but only if the destination folder is missing or empty.
    - it does not exist: the pairwise similarities are computed with
    `PairwiseSimilarity` over the entire trainset (per class when the task
    is `constants.SEG`), then archived with `tar`.
    The bash commands depend on whether `CC_CLUSTER` is in the environment.

    :param args: object. `args.dataset`, `args.al_type`, `args.task` and
        `args.name_classes` are read; `args` is forwarded to the
        similarity computation.
    :param tag_sims: str. name of the folder/archive of the similarities.
    :param train_csv: path to the csv file of the trainset.
    :param rootpath: root path forwarded to `csv_loader`.
    :param DEVICE: device forwarded to the similarity computation.
    :param SIMS: output folder forwarded to the similarity computation.
    :param training_log: log target handed to `log`.
    :param placement_node: first part of the path of the folder
        `pairwise_sims` on the node (used when `CC_CLUSTER` is set).
    :param parent: second part of that path.
    :return: int. always 0.
    """
    # drop normal samples and keep metastatic if: 1. dataset=CAM16. 2.
    # al_type != AL_WSL.
    drop_normal = (args.dataset == constants.CAM16) & (
        args.al_type != constants.AL_WSL)

    # guard: nothing to do for the other active learning types.
    if args.al_type != constants.AL_LP:
        return 0

    def run_bash(cmd):
        """
        Show, then run the chained bash commands `cmd`.
        :return: the runtime of the commands (timedelta).
        """
        start = dt.datetime.now()
        print("Running bash-cmds: \n{}".format(cmd.replace("&& ", "\n")))
        subprocess.run(cmd, shell=True, check=True)
        return dt.datetime.now() - start

    def absent_or_empty(path):
        """
        True if the folder `path` does not exist or has no content.
        """
        return (not os.path.exists(path)) or (len(os.listdir(path)) == 0)

    here = dirname(abspath(__file__))
    archive = join("pairwise_sims", '{}.tar.gz'.format(tag_sims))

    # ---------------------------------------------------------------------
    # case 1: the archive is already there. unzip if necessary.
    # ---------------------------------------------------------------------
    if os.path.exists(archive):
        unpack_cmd = None

        if "CC_CLUSTER" in os.environ:  # if CC, copy to node.
            node_dir = join(placement_node, parent, "pairwise_sims")
            if absent_or_empty(join(node_dir, tag_sims)):
                unpack_cmd = "cp {}/{}.tar.gz {} && " \
                             "cd {} && " \
                             "tar -xf {}.tar.gz && " \
                             "cd {} ".format(
                                "./pairwise_sims",
                                tag_sims,
                                node_dir,
                                node_dir,
                                tag_sims,
                                here
                                )
        elif absent_or_empty(join('./pairwise_sims', tag_sims)):
            unpack_cmd = "cd {} && " \
                         "tar -xf {}.tar.gz && " \
                         "cd {} ".format(
                            "./pairwise_sims",
                            tag_sims,
                            here
                            )

        if unpack_cmd is not None:
            elapsed = run_bash(unpack_cmd)
            msg = "runtime of ALL the bash-cmds: {}".format(elapsed)
            print(msg)
            log(training_log, msg)

        return 0

    # ---------------------------------------------------------------------
    # case 2: no archive. compute the similarities, then archive them.
    # ---------------------------------------------------------------------
    announce_msg("Going to project samples, and compute apirwise "
                 "similarities")

    whole_trainset = csv_loader(train_csv,
                                rootpath,
                                drop_normal=drop_normal
                                )
    # tag every sample with L: just for the loader consistency.
    # masks are not used when computing the pairwise similarity.
    for sample in whole_trainset:
        sample[4] = constants.L

    # the seed is reset around every step that may consume randomness.
    set_default_seed()
    compute_sim = PairwiseSimilarity(task=args.task)
    set_default_seed()

    started = dt.datetime.now()
    if args.task == constants.CL:
        set_default_seed()
        compute_sim(data=whole_trainset, args=args, device=DEVICE, outd=SIMS)
        set_default_seed()
    elif args.task == constants.SEG:
        # it has to be done differently. the similarity is measured
        # only between samples within the same class.
        for cl_name in args.name_classes.keys():
            # index 3 of a sample is compared to the class name.
            same_class = [s for s in whole_trainset if s[3] == cl_name]
            print("Computing similarities for class {}:".format(cl_name))
            set_default_seed()
            compute_sim(data=same_class,
                        args=args,
                        device=DEVICE,
                        outd=SIMS,
                        label=cl_name
                        )
            set_default_seed()

    msg = "Time to compute sims {}: {}".format(
        tag_sims, dt.datetime.now() - started
    )
    print(msg)
    log(training_log, msg)

    # compress, move files.
    # NOTE(review): `tar -cf` is called without `-z`; despite its name, the
    # `.tar.gz` file is presumably not gzip-compressed -- confirm intent.
    if "CC_CLUSTER" in os.environ:  # if CC
        pack_cmd = "cd {} && " \
                   "cd .. && " \
                   "tar -cf {}.tar.gz {} && " \
                   "cp {}.tar.gz {} && " \
                   "cd {} ".format(
                       SIMS,
                       tag_sims,
                       tag_sims,
                       tag_sims,
                       join(here, "pairwise_sims"),
                       here
                       )
    else:
        pack_cmd = "cd {} && " \
                   "tar -cf {}.tar.gz {} && " \
                   "cd {} ".format(
                       "./pairwise_sims",
                       tag_sims,
                       tag_sims,
                       here
                       )

    elapsed = run_bash(pack_cmd)
    # the runtime of the commands is appended to the previous message; the
    # whole message is shown and logged again.
    msg += "\n time to run the command {}: {}".format(pack_cmd, elapsed)
    print(msg)
    log(training_log, msg)

    return 0