def generate_single_file(sources, savefile, ifreader, statuses, cell_lines, versions, locations, multi_label, indices, verbose=False):
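    """Writes every sample produced by ifreader.data_generator to a single text file:
    per line, the comma-separated features, a space, then the comma-separated one-hot
    labels. Samples within each generated batch are shuffled before being written."""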
    savefile = open(savefile, 'w')
    for (x,y, plate_names) in ifreader.data_generator(
            sources,
            indices,
            statuses,
            cell_lines,
            versions,
            locations,
            multi_label,
            verbose=verbose):

        (x,y,plate_names) = utils.shuffle_lists([x,y,plate_names]) 
        for lx, ly in zip(x,y):
            for l in lx:
                savefile.write(str(l))
                savefile.write(',')
            
            savefile.write(' ')
            for l in ly:
               savefile.write(str(l))
               savefile.write(',')

            savefile.write('\n')

    savefile.close()
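Each line written by generate_single_file therefore holds the comma-separated features, a single space, and the comma-separated one-hot labels. As a rough sketch only (float features and integer labels are assumptions, not part of the original code), such a line could be read back like this:

def parse_line(line):
    # split the feature block from the label block on the single space separator
    feature_part, label_part = line.rstrip('\n').split(' ')
    features = [float(v) for v in feature_part.split(',') if v != '']
    labels = [int(float(v)) for v in label_part.split(',') if v != '']
    return features, labels

# e.g. parse_line('0.1,0.2, 0,1,\n') -> ([0.1, 0.2], [0, 1])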
Example #2
    def shuffle(self):
        '''shuffle order of dataset'''
        dataset = self._dataset

        names = dataset['name']
        features = dataset['feature']
        labels = dataset['label']
        names, features, labels = utils.shuffle_lists(names, features, labels)

        self._dataset.update({
            'name': names,
            'feature': features,
            'label': labels
        })
        return self
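utils.shuffle_lists itself is not shown on this page; judging from how it is called here (several parallel lists in, the same lists back in one shared random order), a minimal stand-in could look like the following sketch (the varargs signature is an assumption based on this call site):

import random

def shuffle_lists(*lists):
    # zip the parallel lists so corresponding elements stay aligned,
    # shuffle the tuples, then unzip back into separate lists
    combined = list(zip(*lists))
    random.shuffle(combined)
    return [list(items) for items in zip(*combined)]

# names, features, labels = shuffle_lists(names, features, labels)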
Example #3
def write_synthetic_data(args):
    '''
    make data where data points are (y,s,e)
    - s : sequence of integers
    - idx : index which will be s[0]. Depending on the condition, there will exist a map f : s_idx --> e_idx that allows for retrieval-based approaches
    - m,n are ints
    - r,d are ints
    - y : labels given as 1[#m > #n] in the vanilla setup. In one condition, this holds if s[1]==1, else the label is 1[#r > #d]
    - e : an "explanation" of the data point, which gives information about the features that cause the label

    NUMBERS START AT 1. No zeros appear in s.
    '''
    print("Writing data...")
    # determine num_relevant_points here if it was not provided
    if args.num_relevant_points < 0 and args.num_tasks < 0:  # neither num_relevant_points nor num_tasks was provided
        args.num_relevant_points = args.context_size + 1
    if args.num_relevant_points < 0 and args.num_tasks > 0:
        assert args.num_train_synthetic % args.num_tasks == 0, "please make n_train divisible by num_tasks"
        args.num_relevant_points = args.num_train_synthetic // args.num_tasks
    n_train = args.num_train_synthetic
    n_dev = 10000  # NOTE these may be slightly modified to allow for even group sizes wrt k
    n_test = 50000  # NOTE these may be slightly modified to allow for even group sizes wrt k
    if args.small_data:
        n_train, n_dev, n_test = [int(args.small_size) for n in range(3)]
        if n_train % args.num_relevant_points != 0:
            n_train = ((n_train // args.num_relevant_points + 1) *
                       args.num_relevant_points)
    # slightly modify n_train if needed
    if n_train % args.num_relevant_points != 0:
        n_train = ((n_train // args.num_relevant_points + 1) *
                   args.num_relevant_points)
    assert n_train % (
        args.num_relevant_points
    ) == 0, "please make n_train divisible by num_relevant_points"
    if args.num_relevant_points > 1:
        if not args.num_relevant_points % 2 == 0:
            print(
                "\n Note that num_relevant_points is odd! Hence balancing is not precisely 50/50 \n"
            )

    # get train_use_idx.
    num_per_train_idx = args.num_relevant_points
    n_train_idx = n_train // num_per_train_idx
    max_idx = args.max_int**2 if args.max_idx < 0 else args.max_idx
    assert n_train_idx <= max_idx, "need to decrease num_relevant_points to increase the number of tasks, or increase the number of possible tasks by increasing args.max_int"
    train_use_idx = np.random.choice(np.arange(1, max_idx + 1),
                                     size=n_train_idx,
                                     replace=False)
    # test time idx are seen in training by default, or can be flagged to make them new
    if not args.disjoint_test_idx:
        dev_use_idx = train_use_idx
        test_use_idx = train_use_idx
    elif args.disjoint_test_idx:
        eligible_idx = np.setdiff1d(np.arange(1, max_idx + 1), train_use_idx)
        dev_use_idx = np.random.choice(
            eligible_idx, size=n_train_idx, replace=True
        )  # will need to replace=True when n_test > max_idx*num_per_idx
        test_use_idx = dev_use_idx
    num_per_dev_idx = round(n_dev / len(dev_use_idx))
    num_per_test_idx = round(n_test / len(test_use_idx))

    # modify n_dev and n_test if needed
    n_dev += (len(dev_use_idx) * num_per_dev_idx - n_dev)
    n_test += (len(test_use_idx) * num_per_test_idx - n_test)

    # make labels in advance
    labels_list = [
        utils.balanced_array(size=n, prop=.5)
        for n in [n_train, n_dev, n_test]
    ]
    utils.shuffle_lists(labels_list)
    train_labels, dev_labels, test_labels = labels_list

    # make mn and rds. make idx to z dict
    max_mn = int(np.sqrt(max_idx))
    mn_and_rds, collected = [], []

    # here is the normal procedure: (See below for special case)
    order_counter = 123  # unique ints to start. will start at 0123 -> 1234 given +1
    unique_idx = set(np.concatenate(
        [train_use_idx,
         dev_use_idx]))  # by default this just turns into train_use_idx
    if not (args.use_mn_only and args.ordered_mnrd):
        while len(mn_and_rds) < len(unique_idx):
            # in this condition, randomly sample mnrd and simply avoid repeats
            if not args.ordered_mnrd:
                proposal = np.random.choice(np.arange(1, max_mn + 1),
                                            size=4,
                                            replace=False)
                # compare string forms to avoid numpy's ambiguous truth-value error;
                # this just checks whether proposal is already in mn_and_rds
                if str(proposal) not in collected:
                    mn_and_rds.append(proposal)
                    collected.append(str(proposal))

            # in this condition, gradually increment values of m/n/r/d so that the task information is dense in integer space
            if args.ordered_mnrd:
                assert max_idx <= 10000, "right now ordered_mnrd without use_mn_only has a max_idx of 10k"
                str_mnrd = '%04d' % order_counter
                while len(set(str_mnrd)) != len(str_mnrd):  # repeat until all four digits are unique
                    order_counter += 1
                    str_mnrd = '%04d' % order_counter
                mnrd = np.array([int(_int) + 1 for _int in list(str_mnrd)])
                mn_and_rds.append(mnrd)
                order_counter += 1

    # this is a special condition where we need to order based on mn only, so we overwrite the above
    if args.use_mn_only and args.ordered_mnrd:
        mn_and_rds, collected = [], []
        integers = np.array([1, 1])
        while len(mn_and_rds) < len(unique_idx):
            proposal = integers
            # increment if not all integers unique
            while len(set(proposal)) != len(proposal):
                last_idx_where_valid = [
                    idx for idx in range(len(integers)) if integers[idx] < 100
                ][-1]
                integers[last_idx_where_valid] += 1
            mn = np.array([int(_int) for _int in integers])
            distractors = np.random.choice(np.setdiff1d(np.arange(1, 101), mn),
                                           size=2,
                                           replace=False)
            mnrd = np.concatenate([mn, distractors])
            mn_and_rds.append(mnrd)
            # increment
            last_idx_where_valid = [
                idx for idx in range(len(integers)) if integers[idx] < 100
            ][-1]
            if last_idx_where_valid != 1:
                integers[-1] = 1
            integers[last_idx_where_valid] += 1

    # order things if doing smooth_idx_to_z
    if args.smooth_idx_to_z:
        train_use_idx = np.sort(train_use_idx)
        mn_and_rds = sorted(
            mn_and_rds,
            key=lambda x: x[0] + 1e-3 * x[1] + 1e-6 * x[2] + 1e-9 * x[3]
        )  # this takes advantage of known scale of num_tasks to break ties by each next element of the mnrd array

    idx_to_z_dict = {idx: mn_and_rds[i] for i, idx in enumerate(unique_idx)}
    '''
    now want a few other properties, per idx per dataset
    - mn or rd balance: use_mn_or_rd within each idx
    - #counts balance: want mn/rd #-counts to swap half the time, so there is no bias in size
    - distractor feature: want the non-causal feature (mn or rd, depending on above indicator) to correlate with the causal one 50% of the time
    '''
    train_idx_to_info = {
        idx: {
            'use_mn_or_rd':
            utils.balanced_array(
                size=num_per_train_idx,
                prop=.5),  # per-sample: pick whether mn or rd determines the label
            'swap_samples':
            utils.balanced_array(
                size=num_per_train_idx, prop=.5
            ),  # set mnrd = (count1,2,3,4) or mnrd = (count3,4,1,2) based on this (whether to swap counts 1,2 and 3,4)
            'distractor_correlates':
            utils.balanced_array(
                size=num_per_train_idx, prop=args.weak_feature_correlation
            ),  # whether to have the non-causal feature (mn or rd) correlate with the causal one
            'mnrd':
            idx_to_z_dict[idx]
        }
        for idx in train_use_idx
    }
    dev_idx_to_info = {
        idx: {
            'use_mn_or_rd':
            utils.balanced_array(
                size=num_per_dev_idx,
                prop=.5),  # per-sample: pick whether mn or rd determines the label
            'swap_samples':
            utils.balanced_array(
                size=num_per_dev_idx, prop=.5
            ),  # set mnrd = (count1,2,3,4) or mnrd = (count3,4,1,2) based on this (whether to swap counts 1,2 and 3,4)
            'distractor_correlates':
            utils.balanced_array(
                size=num_per_dev_idx, prop=.5
            ),  # whether to have the non-causal feature (mn or rd) correlate with the causal one
            'mnrd':
            idx_to_z_dict[idx]
        }
        for idx in dev_use_idx
    }
    test_idx_to_info = {
        idx: {
            'use_mn_or_rd':
            utils.balanced_array(
                size=num_per_test_idx,
                prop=.5),  # per-sample: pick whether mn or rd determines the label
            'swap_samples':
            utils.balanced_array(
                size=num_per_test_idx, prop=.5
            ),  # set mnrd = (count1,2,3,4) or mnrd = (count3,4,1,2) based on this (whether to swap counts 1,2 and 3,4)
            'distractor_correlates':
            utils.balanced_array(
                size=num_per_test_idx, prop=.5
            ),  # whether to have the non-causal feature (mn or rd) correlate with the causal one
            'mnrd':
            idx_to_z_dict[idx]
        }
        for idx in test_use_idx
    }

    # make splits
    train_s_list, train_e_list = make_split(args,
                                            train_labels,
                                            train_use_idx,
                                            num_per_train_idx,
                                            train_idx_to_info,
                                            ignore_list=None)
    dev_s_list, dev_e_list = make_split(args,
                                        dev_labels,
                                        dev_use_idx,
                                        num_per_dev_idx,
                                        dev_idx_to_info,
                                        ignore_list=train_s_list)
    test_s_list, test_e_list = make_split(args,
                                          test_labels,
                                          test_use_idx,
                                          num_per_test_idx,
                                          test_idx_to_info,
                                          ignore_list=train_s_list)

    assert len(train_s_list) == n_train
    assert len(dev_s_list) == n_dev

    # make dfs and write splits
    train_df = pd.DataFrame({
        'unique_id': i,
        's': train_s_list[i],
        'e': train_e_list[i],
        'label': train_labels[i]
    } for i in range(n_train))
    dev_df = pd.DataFrame({
        'unique_id': i + n_train,
        's': dev_s_list[i],
        'e': dev_e_list[i],
        'label': dev_labels[i]
    } for i in range(n_dev))
    test_df = pd.DataFrame({
        'unique_id': i + n_train + n_dev,
        's': test_s_list[i],
        'e': test_e_list[i],
        'label': test_labels[i]
    } for i in range(n_test))
    folder = args.data_dir + '_' + args.experiment_name
    if not os.path.exists(folder): os.mkdir(folder)
    paths = [
        os.path.join(folder, split_name) + '.csv'
        for split_name in ['train', 'dev', 'test']
    ]
    train_df.to_csv(paths[0], index=False)
    dev_df.to_csv(paths[1], index=False)
    test_df.to_csv(paths[2], index=False)

    print("\nData statistics:")
    print(
        f"\t Num train idx / tasks: {len(train_use_idx)} | Num per train idx: {num_per_train_idx}"
    )
    print(
        f"\t Num dev idx / tasks:   {len(dev_use_idx)} | Num per dev idx:   {num_per_dev_idx}"
    )
    print(
        f"\t Num test idx / tasks:  {len(test_use_idx)} | Num per test idx:  {num_per_test_idx}"
    )

    return train_use_idx
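To make the labeling rule from the docstring concrete, here is a small self-contained illustration (not part of the original script) of the vanilla setup, where the label is 1[#m > #n], i.e. y is 1 exactly when m occurs more often than n in the sequence s:

import numpy as np

# toy values, made up for this illustration
m, n = 3, 7
s = np.array([5, 3, 3, 7, 3, 2, 7, 9])  # in the real data, idx would be s[0]
count_m = int(np.sum(s == m))           # 3 occurrences of m
count_n = int(np.sum(s == n))           # 2 occurrences of n
y = int(count_m > count_n)              # y == 1, since #m > #n
print(count_m, count_n, y)              # -> 3 2 1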
Example #4
import glob
import os

from dataloader import Gleason2019SaveDISK
from model import Unet
from utils import shuffle_lists

# Data preparation
generate_sub_images = True
root_path = './MICCAI_2019_pathology_challenge/'
folder_to_save_train_samples = './train_samples'
folder_to_save_val_samples = './val_samples'
train_imgs = sorted(
    glob.glob(os.path.join(root_path, 'Train Imgs/Train Imgs/*.jpg')))

# You have to generate the labels first by running the script
labels_final = sorted(glob.glob('./labels/*.png'))
assert len(labels_final) == len(train_imgs)
train_imgs, labels_final = shuffle_lists(train_imgs, labels_final)
val_loader = Gleason2019SaveDISK('val',
                                 train_imgs,
                                 labels_final, (0.8, 0.2), (512, 512),
                                 samples=10)
train_loader = Gleason2019SaveDISK('train',
                                   train_imgs,
                                   labels_final, (0.8, 0.2), (512, 512),
                                   samples=40)
if generate_sub_images:
    val_loader.generate_data(folder_to_save_val_samples)
    train_loader.generate_data(folder_to_save_train_samples)
else:
    print('You have to generate the image samples once')
    train_loader.load_paths()
    val_loader.load_paths()
Example #5
def generate_data_files(base, 
                        ifreader, 
                        statuses,
                        source,
                        cell_lines,
                        versions,
                        locations,
                        multi_label,
                        indices,
                        num_files=5, 
                        verbose=False):
    """
    Generates a number of data files in the format
    features[1]\none-hot-classes[1]\n...

    Will generate and save the data files without checking whether they already exist,
    and as such may overwrite existing data files with the same names.

    Parameters:
        base    :   The base name for the data files.
                    The files will be named base-0, base-1, etc.

        ifreader:   The ifreader which has read the IF_images file that should be used for this.
                    See the if_reader.IFReader class for more information.

        statuses:   Which statuses should be included from the original data.
                    Should be a list of integers as strings.
                    If None, all statuses are included.

        source  :   The original data files.
                    Must be a list of filenames that contain the protein data in a
                    comma (,) delimited csv format.

        cell_lines: Which cell lines should be included from the original data.
                    Should be a list of strings.
                    If None, all cell lines are included.

        versions:   Which versions should be included from the original data.
                    Should be a list of integers as strings.
                    If None, all versions are included.

        locations:  Which locations should be included from the original data.
                    Should be a list of strings.
                    If None, all locations are included.

        multi_label: True if the data files should include multi_label instances.

        indices :   A list of integers that correspond to the indices of the features
                    that should be included in the data.

        num_files:  The number of data files to generate.
                    Defaults to 5.

        verbose :   True if verbose output should be printed.
                    Defaults to False.
    """
    if verbose:
        print('Generating data files')
    files = []
    plate_name_files = []
    for i in range(num_files):
        f = open(base + '-' + str(i), 'w')
        pnf = open(base + '-' + str(i) + '-platenames', 'w')
        files.append(f)
        plate_name_files.append(pnf)

    curr_index = 0
    for (x,y, plate_names) in ifreader.data_generator(
            source,
            indices,
            statuses,
            cell_lines,
            versions,
            locations,
            multi_label,
            verbose=verbose):

       (x,y,plate_names) = utils.shuffle_lists([x,y,plate_names]) 

       for (lx, ly, plate_name) in zip(x,y,plate_names):
           plate_name_files[curr_index].write(plate_name)
           plate_name_files[curr_index].write(' %d %d' % (curr_index, files[curr_index].tell()))
           plate_name_files[curr_index].write('\n')

           for l in lx:
               files[curr_index].write(str(l))
               files[curr_index].write(' ')
           files[curr_index].write('\n')

           for l in ly:
               files[curr_index].write(str(l))
               files[curr_index].write(' ')
           files[curr_index].write('\n')


           curr_index += 1
           curr_index %= num_files

    for (f,pnf) in zip(files, plate_name_files):
        f.close()
        pnf.close()
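The '-platenames' files written above record, for each sample, the plate name, the index of the data file it went into, and the byte offset (from tell()) at which that sample's feature line starts. As a rough sketch only (not from the original project; text encoding and error handling are ignored), those index lines could be used to jump straight to a sample:

def load_sample(base, index_line):
    # index lines look like: '<plate_name> <file_index> <byte_offset>'
    plate_name, file_index, offset = index_line.rstrip('\n').rsplit(' ', 2)
    with open(base + '-' + file_index, 'rb') as f:
        f.seek(int(offset))
        features = f.readline().decode().split()  # space-separated feature values
        one_hot = f.readline().decode().split()   # space-separated one-hot label
    return plate_name, features, one_hot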