import csv
import datetime as dt
import os
import pickle as pkl
import random
import shutil
import subprocess
from copy import deepcopy
from os.path import abspath, dirname, join

import matplotlib.pyplot as plt
import yaml
from PIL import Image

import constants  # project-local module of dataset / AL constants.

# Project-local helpers used below (announce_msg, show_msg, log, csv_loader,
# csv_writer, clear_rootpath, get_rootpath_2_dataset, set_default_seed,
# dump_fold_into_csv_CAM16, PairwiseSimilarity) are assumed to be importable
# from elsewhere in this repository.

def get_stats(args, split, fold, subset):
    """
    Get some stats on the image sizes of a specific dataset, split, and fold.
    """
    if not os.path.isdir(args.fold_folder):
        os.makedirs(args.fold_folder)

    tag = "ds-{}-s-{}-f-{}-subset-{}".format(args.dataset, split, fold, subset)
    log = open(join(args.fold_folder, "log-stats-{}.txt".format(tag)), 'w')
    announce_msg("Going to check {}".format(args.dataset.upper()))

    relative_fold_path = join(args.fold_folder, "split_{}".format(split),
                              "fold_{}".format(fold))

    subset_csv = join(relative_fold_path,
                      "{}_s_{}_f_{}.csv".format(subset, split, fold))
    rootpath = get_rootpath_2_dataset(args)
    samples = csv_loader(subset_csv, rootpath)

    lh, lw = [], []
    for el in samples:
        img = Image.open(el[1], 'r').convert('RGB')
        w, h = img.size
        lh.append(h)
        lw.append(w)

    msg = "min h {}, \t max h {}".format(min(lh), max(lh))
    show_msg(msg, log)
    msg = "min w {}, \t max w {}".format(min(lw), max(lw))
    show_msg(msg, log)

    fig, axes = plt.subplots(nrows=1, ncols=2)
    axes[0].hist(lh)
    axes[0].set_title('Heights')
    axes[1].hist(lw)
    axes[1].set_title('Widths')
    fig.tight_layout()
    plt.savefig(join(args.fold_folder, "size-stats-{}.png".format(tag)))
    plt.close(fig)  # free the figure; this function may be called repeatedly.

    log.close()
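

# Note: Pillow reads image headers lazily, so sizes can be collected without
# decoding pixels; the `convert('RGB')` above forces a full decode. A lighter
# sketch of the same measurement:
def collect_sizes(samples):
    lh, lw = [], []
    for el in samples:
        with Image.open(el[1]) as img:  # header only; no decode needed.
            w, h = img.size
        lh.append(h)
        lw.append(w)
    return lh, lw
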
def get_leftover(args,
                 train_csv,
                 rootpath,
                 train_samples
                 ):
    """
    Get the leftover samples.
    :return:
    """
    tr_leftovers = []  # the leftover samples.
    ids_org = []  # ids of the entire trainset.
    ids_curt = []  # ids of the current trainset (fully supervised only).
    tr_original = []  # samples of the entire trainset.

    # drop normal samples and keep metastatic ones if:
    # 1. dataset == CAM16, and 2. al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    if args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL]:
        tr_original = csv_loader(train_csv, rootpath, drop_normal=cnd_drop_n)

        ids_org = [z[0] for z in tr_original]
        ids_curt = [z[0] for z in train_samples]
        t0 = dt.datetime.now()
        for i, z in enumerate(ids_org):
            if z not in ids_curt:
                tr_leftovers.append(deepcopy(tr_original[i]))
        print("Searching took {}".format(dt.datetime.now() - t0))
        # Searching took 0:01:18.894629 for mnist, round 0. This could also be
        # done with a set difference, but sets do not preserve order (and the
        # shuffling downstream depends on order), so this explicit loop is the
        # safe choice. A faster, order-preserving variant is sketched after
        # this function.
        # tr_leftovers = [z for z in tr_original if z not in train_samples]

    return tr_leftovers, ids_org, ids_curt, tr_original
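

# A faster, order-preserving variant of the leftover search above: use a set
# only for the membership test, so each lookup is O(1) while iteration still
# follows `tr_original`, leaving the deterministic order untouched. A sketch
# under the same data layout (the id is stored at index 0 of each sample):
def get_leftover_fast(tr_original, train_samples):
    ids_curt = {z[0] for z in train_samples}  # set: O(1) `in` tests.
    # iterate in the original order; only the lookup structure changed.
    return [deepcopy(z) for z in tr_original if z[0] not in ids_curt]
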
def al_split_camelyon16(args):
    """
    Use the provided split:
    https://github.com/jeromerony/survey_wsl_histology/blob/master/
    datasets-split/README.md
    for active learning.

    :param args:
    :return:
    """
    def csv_loader(fname):
        """
        Read a *.csv file. Each line contains:
         1. img: str
         2. mask: str or '' or None
         3. label: str

        :param fname: Path to the *.csv file.
        :return: List of elements; each element is
        [image path, mask path or None, class name].
        """
        with open(fname, 'r') as f:
            out = [[row[0], row[1] if row[1] else None, row[2]]
                   for row in csv.reader(f)]

        return out

    csv_df = 'folds/camelyon16-split-0-fold-0-512-512-survey'
    # load survey csv files.
    trainset = csv_loader(join(csv_df, 'train_s_0_f_0.csv'))
    validset = csv_loader(join(csv_df, 'valid_s_0_f_0.csv'))
    testset = csv_loader(join(csv_df, 'test_s_0_f_0.csv'))

    baseurl = args.baseurl

    # find all the files
    fdimg = join(baseurl, 'jpg')
    tr_set, vl_set, ts_set = [], [], []
    idcnt = 0.  # running unique id per sample (a float, per the readme below).

    stats = {
        'train': {
            'normal': 0.,
            'tumor': 0.
        },
        'valid': {
            'normal': 0.,
            'tumor': 0.
        },
        'test': {
            'normal': 0.,
            'tumor': 0.
        }
    }

    # identical bookkeeping for the three subsets: assign a unique id to
    # every sample and count the per-class occurrences.
    for name, subset, holder in (('train', trainset, tr_set),
                                 ('valid', validset, vl_set),
                                 ('test', testset, ts_set)):
        for img, mask, label in subset:
            holder.append((idcnt, img, mask, label))
            idcnt += 1.
            key = 'normal' if label == 'normal' else 'tumor'
            stats[name][key] += 1.

    dict_classes_names = {"normal": 0, "tumor": 1}

    outd = args.fold_folder
    out_fold = join(outd, "split_{}/fold_{}".format(0, 0))
    if not os.path.exists(out_fold):
        os.makedirs(out_fold)

    readme = "Format: float `id`: 0, str `img`: 1, None `mask`: 2, " \
             "str `label`: 3, int `tag`: 4 \n" \
             "Possible tags: \n" \
             "0: labeled\n" \
             "1: unlabeled\n" \
             "2: labeled but came from unlabeled set. " \
             "[not possible at this level]."

    # shuffle train
    for t in range(1000):
        random.shuffle(tr_set)

    dump_fold_into_csv_CAM16(
        tr_set, join(out_fold, "train_s_{}_f_{}.csv".format(0, 0)),
        constants.U)
    dump_fold_into_csv_CAM16(
        vl_set, join(out_fold, "valid_s_{}_f_{}.csv".format(0, 0)),
        constants.L)
    dump_fold_into_csv_CAM16(ts_set,
                             join(out_fold, "test_s_{}_f_{}.csv".format(0, 0)),
                             constants.L)

    # current fold: dump the class-name encoding.
    with open(join(out_fold, "encoding.yaml"), 'w') as f:
        yaml.dump(dict_classes_names, f)

    # dump the seed
    with open(join(out_fold, "seed.txt"), 'w') as fx:
        fx.write("MYSEED: " + os.environ["MYSEED"])

    with open(join(out_fold, "readme.md"), 'w') as fx:
        fx.write(readme)

    with open(join(out_fold, "stats-sets.yaml"), 'w') as fx:
        total = sum([
            stats[el]['normal'] + stats[el]['tumor']
            for el in list(stats.keys())
        ])
        stats['total_samples'] = total
        yaml.dump(stats, fx)
        print("Stats:", stats)

    # top-level folder of folds: duplicate the readme and the encoding.
    with open(join(args.fold_folder, "readme.md"), 'w') as fx:
        fx.write(readme)
    # coding.
    with open(join(args.fold_folder, "encoding.yaml"), 'w') as f:
        yaml.dump(dict_classes_names, f)

    print("camelyon16 splitting (`{}`) ended with success .... [OK]".format(0))
def get_init_sup_samples(args,
                         sampler,
                         COMMON,
                         train_samples,
                         OUTD
                         ):
    """
    Get the initial full supervised data.
    :return:
    """
    previous_pairs = dict()
    previous_errors = False

    # drop normal samples and keep metastatic ones if:
    # 1. dataset == CAM16, and 2. al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    # round 0
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it == 0)

    if cnd:
        # deterministic function with respect to the original seed.
        set_default_seed()
        train_samples = sampler.sample_init_random_samples(train_samples)
        set_default_seed()
        # store on disk: strip the rootpath from the file paths so the csv is
        # host-independent (relative paths, not absolute).
        base_f = 'train_{}.csv'.format(args.al_it)
        al_outf = join(COMMON, base_f)
        csv_writer(clear_rootpath(train_samples, args),
                   al_outf
                   )
        shutil.copyfile(al_outf, join(OUTD, base_f))

    # round > 0: combine all the samples of the previous al rounds
    # and the selected samples for this round.
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it > 0)
    if cnd:
        # 'train_{i}.csv' contains the selected samples at round i.
        lfiles = [join(
            COMMON, 'train_{}.csv'.format(t)) for t in range(args.al_it + 1)]

        if (args.al_type == constants.AL_LP) and (args.task == constants.SEG):
            # load previous pairs: pairs that were pseudo-labeled in the
            # previous al round. they are ready to be used as pseudo-segmented
            # samples; no statistical constraints are applied to them.
            fz = join(COMMON, 'train_pairs_{}.pkl'.format(args.al_it - 1))
            with open(fz, 'rb') as fp:
                previous_pairs = pkl.load(fp)

        train_samples = []
        rootpath = get_rootpath_2_dataset(args)
        for fx in lfiles:
            # load using the current host-root-path.
            train_samples.extend(csv_loader(fx,
                                            rootpath,
                                            drop_normal=cnd_drop_n
                                            )
                                 )

        # force every sample in train_samples to the labeled tag (L).
        for s in train_samples:
            s[4] = constants.L

        # ============== block to delete =======================================
        # if previous rounds were skipped because the code was restarted, and
        # we run on Compute Canada (CC) on a compute node, the stored paths
        # will not match since they are built from the job id; rewrite them.
        if "CC_CLUSTER" in os.environ.keys():
            for i in range(len(train_samples)):
                front = os.sep.join(train_samples[i][1].split(os.sep)[:3])
                cnd = (front != os.environ["SLURM_TMPDIR"])
                if cnd:
                    # update the image input path
                    train_samples[i][1] = train_samples[i][1].replace(
                        front, os.environ["SLURM_TMPDIR"]
                    )

                    if args.task == constants.SEG:
                        # update the mask path
                        train_samples[i][2] = train_samples[i][2].replace(
                            front, os.environ["SLURM_TMPDIR"]
                        )

                    previous_errors = True

            # TODO: remove the block above; it is no longer necessary. since
            # relative paths are used on the node, restarting the code should
            # not produce mismatched paths.
            assert not previous_errors, "Unexpected path mismatch."
        # ======================================================================

        set_default_seed()
        for i in range(100):
            random.shuffle(train_samples)
        set_default_seed()

    return train_samples, previous_pairs, previous_errors
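

# The shuffles above are reproducible only because the project-local
# `set_default_seed` reseeds the global `random` state around them. A
# self-contained sketch of the same idea, assuming a fixed seed value and
# using a local RNG so the global state is never disturbed:
def deterministic_shuffle(samples, seed=0, repeats=100):
    rng = random.Random(seed)  # local RNG, independent of random.seed().
    out = list(samples)
    for _ in range(repeats):
        rng.shuffle(out)
    return out
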
def compute_similarities(args,
                         tag_sims,
                         train_csv,
                         rootpath,
                         DEVICE,
                         SIMS,
                         training_log,
                         placement_node,
                         parent
                         ):
    """
    Compute similarities.
    :return:
    """
    # drop normal samples and keep metastatic ones if:
    # 1. dataset == CAM16, and 2. al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    if args.al_type != constants.AL_LP:  # get out.
        return 0

    # 1. compute sims
    current_dir = dirname(abspath(__file__))

    # compute proximity
    if not os.path.exists(
            join("pairwise_sims", '{}.tar.gz'.format(tag_sims))):
        announce_msg("Going to project samples, and compute apirwise "
                     "similarities")

        all_train_samples = csv_loader(train_csv,
                                       rootpath,
                                       drop_normal=cnd_drop_n
                                       )
        # masks are not used when computing the pairwise similarities; the
        # tag is set only for loader consistency.
        for el in all_train_samples:
            el[4] = constants.L

        set_default_seed()
        compute_sim = PairwiseSimilarity(task=args.task)
        set_default_seed()
        t0 = dt.datetime.now()
        if args.task == constants.CL:
            set_default_seed()
            compute_sim(data=all_train_samples, args=args, device=DEVICE,
                        outd=SIMS)
            set_default_seed()
        elif args.task == constants.SEG:
            # segmentation is handled differently: the similarity is measured
            # only between samples within the same class.

            for k in args.name_classes.keys():
                samples_in_same_class = [
                    sx for sx in all_train_samples if sx[3] == k]
                print("Computing similarities for class {}:".format(k))
                set_default_seed()
                compute_sim(data=samples_in_same_class, args=args,
                            device=DEVICE, outd=SIMS, label=k)
                set_default_seed()

        msg = "Time to compute sims {}: {}".format(
            tag_sims, dt.datetime.now() - t0
        )
        print(msg)
        log(training_log, msg)

        # compress and move the files. `-z` is added below so the archive is
        # actually gzip-compressed, matching its `.tar.gz` name.

        if "CC_CLUSTER" in os.environ.keys():  # if CC
            cmdx = "cd {} && " \
                   "cd .. && " \
                   "tar -cf {}.tar.gz {} && " \
                   "cp {}.tar.gz {} && " \
                   "cd {} ".format(
                    SIMS,
                    tag_sims,
                    tag_sims,
                    tag_sims,
                    join(current_dir, "pairwise_sims"),
                    current_dir
                    )
        else:
            cmdx = "cd {} && " \
                   "tar -cf {}.tar.gz {} && " \
                   "cd {} ".format(
                    "./pairwise_sims",
                    tag_sims,
                    tag_sims,
                    current_dir
                    )

        tt = dt.datetime.now()
        print("Running bash-cmds: \n{}".format(cmdx.replace("&& ", "\n")))
        subprocess.run(cmdx, shell=True, check=True)
        msg += "\n time to run the command {}: {}".format(
            cmdx, dt.datetime.now() - tt)
        print(msg)
        log(training_log, msg)

    else:  # unzip if necessary.
        cmdx = None
        if "CC_CLUSTER" in os.environ.keys():  # if CC, copy to node.
            pr = join(placement_node, parent, "pairwise_sims")
            folder = join(pr, tag_sims)
            # uncompress if the target folder is missing or empty.
            uncomp = (not os.path.exists(folder)
                      or len(os.listdir(folder)) == 0)
            if uncomp:
                cmdx = "cp {}/{}.tar.gz {} && " \
                       "cd {} && " \
                       "tar -xf {}.tar.gz && " \
                       "cd {} ".format(
                            "./pairwise_sims",
                            tag_sims,
                            pr,
                            pr,
                            tag_sims,
                            current_dir
                            )

        else:
            folder = join('./pairwise_sims', tag_sims)
            # uncompress if the target folder is missing or empty.
            uncomp = (not os.path.exists(folder)
                      or len(os.listdir(folder)) == 0)

            if uncomp:
                cmdx = "cd {} && " \
                       "tar -xf {}.tar.gz && " \
                       "cd {} ".format(
                            "./pairwise_sims",
                            tag_sims,
                            current_dir
                            )

        if cmdx is not None:
            tt = dt.datetime.now()
            print("Running bash-cmds: \n{}".format(cmdx.replace("&& ", "\n")))
            subprocess.run(cmdx, shell=True, check=True)
            msg = "runtime of ALL the bash-cmds: {}".format(
                dt.datetime.now() - tt)
            print(msg)
            log(training_log, msg)

    return 0
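

# The compress / uncompress steps above shell out to `tar`. A standard-library
# sketch of the same round-trip with `tarfile` (an alternative, not the code
# path used above; `sims_dir` is assumed to be the folder named `tag_sims`):
import tarfile


def pack_sims(sims_dir, tag, dest_dir):
    """Create dest_dir/<tag>.tar.gz containing the folder `sims_dir`."""
    out = join(dest_dir, "{}.tar.gz".format(tag))
    with tarfile.open(out, "w:gz") as tf:
        tf.add(sims_dir, arcname=tag)
    return out


def unpack_sims(archive, dest_dir):
    """Extract a <tag>.tar.gz archive into `dest_dir`."""
    with tarfile.open(archive, "r:gz") as tf:
        tf.extractall(path=dest_dir)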