Exemplo n.º 1
0
def get_cat_dogs_dataset(
    dirs: str = "/app/data/data_cat_dogs/*",
    extension: str = "*.jpg",
    test_size: float = 0.2,
    random_state: int = 42,
    tag_file_path: tp.Optional[str] = None,
) -> tp.Tuple[tp.Dict[str, tp.Any], tp.Dict[str, tp.Any], int]:
    """Build train/valid record lists for the cats-vs-dogs dataset.

    Args:
        dirs: glob pattern of class sub-directories to scan.
        extension: glob pattern of image files inside each class directory.
        test_size: fraction of samples held out for validation.
        random_state: seed for the train/valid split.
        tag_file_path: when given, the tag->label mapping is dumped there
            as JSON so predictions can later be mapped back to tag names.

    Returns:
        Tuple of (train records, valid records, number of classes).
    """
    raw = utils.create_dataset(dirs=dirs, extension=extension)
    frame = utils.create_dataframe(raw, columns=["class", "filepath"])

    tag_to_label = utils.get_dataset_labeling(frame, "class")
    # Persist the labeling alongside the data if a path was supplied.
    if tag_file_path is not None:
        with open(tag_file_path, "w") as fout:
            json.dump(tag_to_label, fout)

    labeled = utils.map_dataframe(
        frame,
        tag_column="class",
        class_column="label",
        tag2class=tag_to_label,
        verbose=False,
    )

    train_df, valid_df = utils.split_dataframe_train_test(
        labeled, test_size=test_size, random_state=random_state
    )
    num_classes = len(tag_to_label)
    return train_df.to_dict("records"), valid_df.to_dict("records"), num_classes
Exemplo n.º 2
0
    def build(self):
        """Build (train, test) ``torchio.ImagesDataset`` objects from the
        ``.nii`` volumes under ``self.dataset_dir``; per-sample targets come
        from the ``label`` column of the CSV at ``self.data``.

        Returns:
            Tuple of (train_data, test_data) torchio.ImagesDataset objects.
        """
        SEED = 42
        data = pd.read_csv(self.data)
        # Per-sample labels, consumed positionally via the running index ``i``.
        ab = data.label

        ############################################
        # NOTE(review): ``transform`` is built here but never applied below —
        # presumably it was meant to be passed to torchio.ImagesDataset(...,
        # transform=transform); confirm intent before relying on augmentation.
        transforms = [
            RescaleIntensity((0, 1)),
            RandomAffine(),
            transformss.ToTensor(),
        ]
        transform = Compose(transforms)
        #############################################

        dataset_dir = self.dataset_dir
        dataset_dir = Path(dataset_dir)

        # Images and labels are globbed from the same directory tree.
        images_dir = dataset_dir
        labels_dir = dataset_dir
        image_paths = sorted(images_dir.glob('**/*.nii'))
        label_paths = sorted(labels_dir.glob('**/*.nii'))
        assert len(image_paths) == len(label_paths)

        # These two names are arbitrary — they are only dict keys.
        MRI = 'features'
        BRAIN = 'targets'

        # Split image paths into training and validation subsets.
        from catalyst.utils import split_dataframe_train_test

        train_image_paths, valid_image_paths = split_dataframe_train_test(
            image_paths, test_size=0.2, random_state=SEED)

        # Training data.
        # NOTE(review): ``label_path`` is never used inside either loop, and
        # targets are read from ``ab`` by running index ``i`` — this assumes
        # the CSV row order matches the (shuffled) split order; verify.
        subjects = []
        i = 0
        for (image_path, label_path) in zip(train_image_paths, label_paths):
            subject_dict = {
                MRI: torchio.Image(image_path, torchio.INTENSITY),
                BRAIN: ab[i],
            }
            i = i + 1
            subject = torchio.Subject(subject_dict)
            subjects.append(subject)
        train_data = torchio.ImagesDataset(subjects)

        # Validation data; ``i`` keeps counting on from the training loop.
        subjects = []
        for (image_path, label_path) in zip(valid_image_paths, label_paths):
            subject_dict = {
                MRI: torchio.Image(image_path, torchio.INTENSITY),
                BRAIN: ab[i],
            }
            i = i + 1
            subject = torchio.Subject(subject_dict)
            subjects.append(subject)
        test_data = torchio.ImagesDataset(subjects)
        return train_data, test_data
Exemplo n.º 3
0
def balance_data(csv_path: str,
                 test_size: float = 0.2,
                 random_state: int = 123):
    """Load the diagnosis CSV and return a class-balanced train/test split.

    Classes 0 and 2 are over-represented in the data, so each is
    down-sampled to 400 rows before splitting; every class is then split
    independently so both halves keep the same class mix.

    Args:
        csv_path: path to a CSV with a ``diagnosis`` column (values 0-4).
        test_size: fraction of each class held out for testing.
        random_state: seed used for both down-sampling and splitting,
            making the result reproducible (the original left the
            ``sample`` calls unseeded).

    Returns:
        Tuple of (df_train, df_test) pandas DataFrames.
    """
    df = pd.read_csv(csv_path)

    # Classes that need sub-sampling to balance the dataset.
    downsample_to = {0: 400, 2: 400}

    train_parts = []
    test_parts = []
    for diagnosis in range(5):
        class_df = df[df['diagnosis'] == diagnosis]
        if diagnosis in downsample_to:
            class_df = class_df.sample(downsample_to[diagnosis],
                                       random_state=random_state)
        class_train, class_test = split_dataframe_train_test(
            class_df, test_size=test_size, random_state=random_state)
        train_parts.append(class_train)
        test_parts.append(class_test)

    # DataFrame.append was removed in pandas 2.0 — concatenate instead.
    df_train = pd.concat(train_parts)
    df_test = pd.concat(test_parts)
    return df_train, df_test
Exemplo n.º 4
0
def get_datasets(config):
    """Assemble train/valid record lists for the plant-pathology images.

    Args:
        config: object with ``root`` (CSV directory prefix),
            ``root_images`` (image directory) and ``seed`` attributes.

    Returns:
        Tuple of (train_data, valid_data), each a list of record dicts
        produced by ``DataFrame.to_dict('records')``.
    """
    train_df = pd.read_csv(config.root + 'train.csv')
    # NOTE(review): test_df is loaded but unused here; kept so the call
    # still fails early if test.csv is missing, as the original did.
    test_df = pd.read_csv(config.root + 'test.csv')
    dataset = create_dataset(root_dir=config.root_images,
                             mask="Train*.jpg",
                             config=config)
    df_path = create_dataframe(dataset, columns=["image_id", "filepath"])
    # Join the one-hot label columns from train.csv onto each file path.
    df_with_labels = pd.merge(df_path,
                              train_df,
                              left_on='image_id',
                              right_on='image_id')
    # Collapse the four one-hot columns into a single integer class id:
    # healthy=0, multiple_diseases=1, rust=2, scab=3.
    df_with_labels["disease_type"] = df_with_labels["healthy"] * 0 + \
        df_with_labels["multiple_diseases"] * 1 + \
        df_with_labels["rust"] * 2 + df_with_labels["scab"] * 3
    # (removed a leftover no-op ``df_with_labels.head(10)`` inspection call)

    train_data, valid_data = split_dataframe_train_test(
        df_with_labels, test_size=0.3, random_state=config.seed)
    return train_data.to_dict('records'), valid_data.to_dict('records')
Exemplo n.º 5
0
# Make cuDNN deterministic so training runs are reproducible.
utils.prepare_cudnn(deterministic=True)

train_df = pd.read_csv(config.root + 'train.csv')
# NOTE(review): test_df is loaded here but not used in the visible code.
test_df = pd.read_csv(config.root + 'test.csv')
dataset = create_dataset(root_dir=config.root_images, mask="Train*.jpg")
df_path = create_dataframe(dataset, columns=["image_id", "filepath"])
# Join the one-hot label columns from train.csv onto each image filepath.
df_with_labels = pd.merge(df_path,
                          train_df,
                          left_on='image_id',
                          right_on='image_id')
# Collapse the four one-hot columns into a single integer class id:
# healthy=0, multiple_diseases=1, rust=2, scab=3.
df_with_labels["disease_type"] = df_with_labels["healthy"] * 0 + df_with_labels["multiple_diseases"] * 1 + \
                                 df_with_labels["rust"] * 2 + df_with_labels["scab"] * 3
# NOTE(review): .head(10) is a no-op outside a notebook — leftover inspection.
df_with_labels.head(10)

# Split labeled records and convert both halves to lists of record dicts.
train_data, valid_data = split_dataframe_train_test(df_with_labels,
                                                    test_size=0.3,
                                                    random_state=config.seed)
train_data, valid_data = train_data.to_dict('records'), valid_data.to_dict(
    'records')

open_fn = ReaderCompose([
    ImageReader(input_key="filepath",
                output_key="features",
                rootpath=config.root_images),
    ScalarReader(input_key="disease_type",
                 output_key="targets",
                 default_value=-1,
                 dtype=np.int64),
    ScalarReader(input_key="disease_type",
                 output_key="targets_one_hot",
                 default_value=-1,