Example #1
def load_dataset_d7(dataset_dir, batch_size, test_batch_size, seq_len, horizon,
                    **kwargs):
    data = {}
    data['train'] = PeMSD7(dataset_dir,
                           'train',
                           seq_len=seq_len,
                           horizon=horizon)
    mean = data['train'].mean
    std = data['train'].std
    data['val'] = PeMSD7(dataset_dir, 'val', mean, std, seq_len, horizon)
    data['test'] = PeMSD7(dataset_dir, 'test', mean, std, seq_len, horizon)

    data['train_loader'] = TorchDataLoader(data['train'],
                                           batch_size=batch_size,
                                           shuffle=True)
    data['train_loader'].num_batch = len(data['train_loader'])
    data['val_loader'] = TorchDataLoader(data['val'],
                                         batch_size=test_batch_size,
                                         shuffle=False)
    data['val_loader'].num_batch = len(data['val_loader'])
    data['test_loader'] = TorchDataLoader(data['test'],
                                          batch_size=test_batch_size,
                                          shuffle=False)
    data['test_loader'].num_batch = len(data['test_loader'])

    data['scaler'] = data['train'].scaler

    return data
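
A minimal usage sketch for the factory above (the directory name, batch sizes, and window lengths below are illustrative placeholders, and the shape of each sample depends on PeMSD7):

# Hypothetical call; adjust paths and hyperparameters to your setup.
data = load_dataset_d7('data/PEMS-D7', batch_size=64, test_batch_size=64,
                       seq_len=12, horizon=12)
print(data['train_loader'].num_batch, 'training batches')
for x, y in data['train_loader']:
    break  # x: input window, y: prediction horizon (as returned by PeMSD7)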
Example #2
def run_experiment(experiment: Experiment, debug_pipeline: bool = False) -> List[Result]:

    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    dataset = TorchConcatDataset(
        [APTOSDataset(df, directory, pipeline) for df, directory in zip(dfs, directories)]
    )
    if DEVELOP_MODE:
        dataset, _ = random_split(dataset, [DEVELOP_MODE_SAMPLES, len(dataset) - DEVELOP_MODE_SAMPLES])

    results = []
    for cv_iteration in range(1, CROSS_VALIDATION_ITERATIONS + 1):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        test_size = experiment.test_size()
        # Compute the test length first so the two split sizes always sum to
        # len(dataset); two independent round() calls can be off by one.
        test_len = round(test_size * len(dataset))
        train_ds, test_ds = random_split(
            dataset, [len(dataset) - test_len, test_len]
        )

        train_loader = TorchDataLoader(
            train_ds,
            batch_size=experiment.batch_size(),
            num_workers=DATA_LOADER_WORKERS,
        )

        test_loader = TorchDataLoader(
            test_ds,
            batch_size=experiment.batch_size(),
            num_workers=DATA_LOADER_WORKERS,
        )

        model = experiment.model(input_shape=train_ds[0][0].shape)

        optimizer_class, optim_kwargs = experiment.optimizer()
        optimizer = optimizer_class(model.parameters(), **optim_kwargs)

        metric_df = pd.DataFrame(columns=["experiment_id", "epoch", "test_loss", "test_accuracy"])
        for epoch in range(1, experiment.max_epochs() + 1):
            LOGGER.info("Epoch: %s", epoch)

            train(1, model, train_loader, optimizer, epoch)
            predictions_proba, predictions, targets = test(model, test_loader)

        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration": [cv_iteration for _ in range(len(targets))],
            "targets": targets,
            "predictions": predictions,
        })

        results.append(Result(experiment, metric_df, results_df))

    return results
Example #3
def setup_data_loaders(processed, log):
    batch = 128

    train_ds = ParquetIterableDataset(
        f'file:{processed}/sales_series_melt.parquet', log, '.*part.(?!1).*')
    valid_ds = ParquetIterableDataset(
        f'file:{processed}/sales_series_melt.parquet', log, '.*part.1.*')
    test_ds = ParquetIterableDataset(
        f'file:{processed}/test_series_melt.parquet', log)

    train_dl = TorchDataLoader(train_ds,
                               batch_size=batch,
                               shuffle=False,
                               num_workers=0,
                               drop_last=False)
    valid_dl = TorchDataLoader(valid_ds,
                               batch_size=batch,
                               shuffle=False,
                               num_workers=0,
                               drop_last=False)
    test_dl = TorchDataLoader(test_ds,
                              batch_size=batch,
                              shuffle=False,
                              num_workers=0,
                              drop_last=False)

    data = OrderedDict()
    data["train"] = train_dl
    data["valid"] = valid_dl
    data["test"] = test_dl

    return data
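
For context on why every loader above passes shuffle=False: PyTorch refuses sampler-based shuffling for an IterableDataset, so a streaming reader such as ParquetIterableDataset must be consumed in stream order. A self-contained sketch with a stand-in dataset (RangeStream is illustrative, not the project's class):

import torch
from torch.utils.data import IterableDataset, DataLoader as TorchDataLoader

class RangeStream(IterableDataset):
    # Stand-in for a streaming dataset like ParquetIterableDataset.
    def __init__(self, n):
        self.n = n

    def __iter__(self):
        return iter(torch.arange(self.n))

dl = TorchDataLoader(RangeStream(10), batch_size=4, shuffle=False)
for batch in dl:
    print(batch)  # consecutive elements, batched in stream order
# shuffle=True here would raise a ValueError for an IterableDataset.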
Example #4
def test_data_loading_augment():
    target = "cone"
    data_dir = f"data/datasets/debug_data/{target}"
    dataset = fgbg.AugmentedTripletDataset(
        hdf5_file=f"{data_dir}/data.hdf5",
        json_file=f"{data_dir}/data.json",
        background_images_directory="data/datasets/dtd",
        blur=True,
        fg_augmentation=True,
    )
    dataloader = TorchDataLoader(dataset, 9, shuffle=True)
    for batch in dataloader:
        print(f'mean {batch["observation"].mean()}, '
              f'std {batch["observation"].std()}, '
              f'min {batch["observation"].min()}, '
              f'max {batch["observation"].max()}')
        grid_observation = torchvision.utils.make_grid(batch["observation"],
                                                       nrow=3)
        plt.imshow(grid_observation.permute(1, 2, 0).numpy())
        plt.title("observation")
        plt.show()
        grid_positive = torchvision.utils.make_grid(batch["positive"], nrow=3)
        plt.imshow(grid_positive.permute(1, 2, 0).numpy())
        plt.title("positive")
        plt.show()
        grid_negative = torchvision.utils.make_grid(batch["negative"], nrow=3)
        plt.imshow(grid_negative.permute(1, 2, 0).numpy())
        plt.title("negative")
        plt.show()
        break
Example #5
 def gen_data_loaders(self, batch_size, train=True):
     return [
         TorchDataLoader(self.train if train else self.test,
                         collate_fn=Batch.collate([]),
                         batch_size=batch_size // 2,
                         shuffle=True) for _ in range(3)
     ]
Example #6
def DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
               batch_sampler=None, num_workers=0, collate_fn=default_collate,
               pin_memory=False, drop_last=False, timeout=0,
               worker_init_fn=None, prefetch_factor=2, persistent_workers=False):
  return TorchDataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                         sampler=sampler, batch_sampler=batch_sampler,
                         num_workers=num_workers, collate_fn=collate_fn,
                         pin_memory=pin_memory, drop_last=drop_last,
                         timeout=timeout, worker_init_fn=worker_init_fn,
                         prefetch_factor=prefetch_factor,
                         persistent_workers=persistent_workers)
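
One caveat for this thin wrapper: since PyTorch 2.0, prefetch_factor defaults to None and may only be set when num_workers > 0, so forwarding prefetch_factor=2 together with num_workers=0 raises a ValueError on newer releases. A version-tolerant sketch (an assumed variant, not the original author's code) forwards it conditionally:

def DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
               batch_sampler=None, num_workers=0, collate_fn=default_collate,
               pin_memory=False, drop_last=False, timeout=0,
               worker_init_fn=None, prefetch_factor=2, persistent_workers=False):
  # prefetch_factor is only legal with worker processes on PyTorch >= 2.0.
  extra = {'prefetch_factor': prefetch_factor} if num_workers > 0 else {}
  return TorchDataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                         sampler=sampler, batch_sampler=batch_sampler,
                         num_workers=num_workers, collate_fn=collate_fn,
                         pin_memory=pin_memory, drop_last=drop_last,
                         timeout=timeout, worker_init_fn=worker_init_fn,
                         persistent_workers=persistent_workers, **extra)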
Example #7
    def build(self, train_mode: bool, max_items: int, validation_ratio=0.0):
        """
        Builds the dataloader based on input params.

        Returns a tuple (train_dl, validation_dl).
        """
        # check validation_ratio
        if validation_ratio < 0 or validation_ratio > 1:
            raise ValueError("validation_ratio must be in [0,1] interval")

        # check type of dataloader required
        if train_mode:
            self.data_folder = os.path.join(self.data_folder, 'train')
        else:
            self.data_folder = os.path.join(self.data_folder, 'test')
            validation_ratio = 0.0

        # build base dataset
        base_set = CustomDataset(self.data_folder, self.transformation)

        # check items count
        if (len(base_set) > max_items) and (max_items > 0):
            base_set.images = base_set.images[:max_items]
            base_set.labels = base_set.labels[:max_items]

        # split base dataset into Subset
        main_set_len = int(len(base_set) - (len(base_set) * validation_ratio))
        val_set_len = len(base_set) - main_set_len
        main_set, val_set = torch.utils.data.random_split(
            base_set, [main_set_len, val_set_len])

        # build dataloaders
        self.main_dl = TorchDataLoader(
            main_set, batch_size=self.batch_size, shuffle=self.shuffle, drop_last=True)
        self.val_dl = TorchDataLoader(
            val_set, batch_size=self.batch_size, shuffle=False, drop_last=True)

        return (self.main_dl, self.val_dl)
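
A hedged usage sketch for build(); the builder construction and attribute names below are assumptions inferred from the method body, since the enclosing class is not shown:

# Hypothetical setup mirroring the attributes build() reads.
builder = DataLoaderBuilder()        # assumed constructor name
builder.data_folder = 'data/custom'  # expects 'train'/'test' subfolders
builder.transformation = None
builder.batch_size = 32
builder.shuffle = True

# max_items=0 disables the item cap (see the len(base_set) check above).
train_dl, val_dl = builder.build(train_mode=True, max_items=0,
                                 validation_ratio=0.2)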
Example #8
def main(
    checkpoint_file_path: str,
    data_directory: str = "../input/aptos2019-blindness-detection/test_images",
    data_frame: str = "../input/aptos2019-blindness-detection/test.csv",
    sample_submission:
    str = "../input/aptos2019-blindness-detection/sample_submission.csv"):
    print("Beginning submission")
    # Always load the first iteration from cross validation? Should really re-train on the whole dataset
    checkpoint = torch.load(checkpoint_file_path)
    print("Loaded checkpoint")

    experiment_state_dict = checkpoint["experiment"]
    experiment_state_dict.update(train_test_directories=[data_directory],
                                 train_test_data_frames=[data_frame])
    experiment = Experiment.from_dict(experiment_state_dict)
    print("Initialised experiment: %s", experiment)

    pipeline = Pipeline(experiment.pipeline_stages())
    print("Initialised pipeline")

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    dataset = TorchConcatDataset([
        APTOSSubmissionDataset(df, directory, pipeline)
        for df, directory in zip(dfs, directories)
    ])
    print("Initialised dataset")

    loader = TorchDataLoader(
        dataset,
        batch_size=experiment.batch_size(),
    )
    print("Initialised loader")

    model = checkpoint["model"]
    model.load_state_dict(checkpoint['state_dict'])
    print("Initialised model")

    print("Beginning inference")
    predictions_proba, predictions, ids = inference(model, loader, "cpu")

    sample = pd.read_csv(sample_submission)
    sample.diagnosis = predictions

    sample.to_csv("submission.csv", index=False)
Example #9
def create_dataloader(cfg,
                      mode=None,
                      domain=None,
                      name=None,
                      authority=None,
                      train_type=None,
                      items=None):
    """
    :param cfg:
    :param items:
    :return:

    create the dataset(search for the dataset class)
    init the sampler
    create the loader
    """
    train_name_factory = {
        'source': cfg.dataset.train.source,
        'target': cfg.dataset.train.target,
    }
    if mode == 'train':
        name = train_name_factory[domain].name
    dataset = create_dataset(cfg,
                             mode=mode,
                             domain=domain,
                             name=name,
                             authority=authority,
                             train_type=train_type,
                             items=items)
    # from DataLoaders.Datasets.market1501 import Market1501
    # dataset = Market1501(cfg, items=items)
    sampler_factory = {
        'train': cfg.dataloader.train,
        'test': cfg.dataloader.test
    }
    sampler = get_sampler(cfg, dataset, sampler_factory[mode])

    data_loader = TorchDataLoader(
        dataset=dataset,
        batch_size=sampler_factory[mode].batch_size,
        sampler=sampler,
        num_workers=cfg.dataloader.num_workers,
        pin_memory=True,
        drop_last=sampler_factory[mode].drop_last,
    )
    return data_loader
Example #10
 def create_a_loader(dataset):
     if cfg.batch_type == 'seq':
         sampler = SequentialSampler(dataset)
     elif cfg.batch_type == 'random':
         sampler = RandomSampler(dataset)
     elif cfg.batch_type == 'pk':
         sampler = RandomIdentitySampler(dataset, cfg.pk.k)
     else:
         raise NotImplementedError
     loader = TorchDataLoader(
         dataset,
         batch_size=cfg.batch_size,
         sampler=sampler,
         num_workers=cfg.num_workers,
         pin_memory=True,
         drop_last=cfg.drop_last,
     )
     return loader
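
Because cfg is captured from an enclosing scope, here is a sketch of the shape it needs; the field values are illustrative and SimpleNamespace merely stands in for the real config object:

from types import SimpleNamespace

cfg = SimpleNamespace(batch_type='random',  # or 'seq' / 'pk'
                      batch_size=64,
                      num_workers=4,
                      drop_last=True,
                      pk=SimpleNamespace(k=4))  # only read when batch_type == 'pk'
loader = create_a_loader(my_dataset)  # my_dataset: any map-style Dataset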
Example #11
def test_data_loading_clean():
    target = "gate"
    data_dir = f"data/datasets/gate_cone_line/{target}"

    dataset = fgbg.CleanDataset(
        hdf5_file=f"{data_dir}/data.hdf5",
        json_file=f"{data_dir}/data.json",
    )
    dataloader = TorchDataLoader(dataset, 9, shuffle=True)
    for batch in dataloader:
        print(f'mean {batch["observation"].mean()}, '
              f'std {batch["observation"].std()}, '
              f'min {batch["observation"].min()}, '
              f'max {batch["observation"].max()}')
        grid = torchvision.utils.make_grid(batch["observation"], nrow=3)
        plt.imshow(grid.permute(1, 2, 0).numpy())
        plt.show()
        break
Example #12
def test_data_loading_real_images():
    target = "gate"
    data_dir = "data/datasets/bebop_real"

    dataset = fgbg.ImagesDataset(
        target=target,
        dir_name=data_dir,
        input_size=(3, 200, 200),
        output_size=(200, 200),
    )
    dataloader = TorchDataLoader(dataset, 9, shuffle=True)
    for batch in dataloader:
        print(f'mean {batch["observation"].mean()}, '
              f'std {batch["observation"].std()}, '
              f'min {batch["observation"].min()}, '
              f'max {batch["observation"].max()}')
        grid = torchvision.utils.make_grid(batch["observation"], nrow=3)
        plt.imshow(grid.permute(1, 2, 0).numpy())
        plt.show()
        break
Example #13
 def gen_data_loaders(self,
                      size,
                      batch_size,
                      train=True,
                      use_distributed_sampling=False):
     loaders = []
     for i in range(2):
         dataset = combined_syn.get_dataset(
             "graph", size // 2,
             np.arange(self.min_size + 1, self.max_size + 1))
         sampler = torch.utils.data.distributed.DistributedSampler(
             dataset, num_replicas=hvd.size(), rank=hvd.rank()) if \
                 use_distributed_sampling else None
         loaders.append(
             TorchDataLoader(dataset,
                             collate_fn=Batch.collate([]),
                             batch_size=batch_size // 2,  # both ternary branches were identical
                             sampler=sampler,
                             shuffle=False))
     loaders.append([None] * (size // batch_size))
     return loaders
Example #14
def create_dataloader(cfg,
                      dataset_cfg,
                      samples=None):  # cfg: dataloader config; dataset_cfg: dataset config
    dataset = create_dataset(dataset_cfg,
                             samples=samples)  # dataset : Market1501
    if cfg.batch_type == 'seq':  # test: draw dataset elements in order
        sampler = SequentialSampler(dataset)
    elif cfg.batch_type == 'random':  # train takes this branch
        sampler = RandomSampler(dataset)
    elif cfg.batch_type == 'pk':
        sampler = RandomIdentitySampler(dataset, cfg.pk.k)
    else:
        raise NotImplementedError
    loader = TorchDataLoader(
        dataset,
        batch_size=cfg.batch_size,
        sampler=sampler,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=cfg.drop_last,
    )
    return loader
Example #15
 def gen_data_loaders(self,
                      size,
                      batch_size,
                      train=True,
                      use_distributed_sampling=False):
     loaders = []
     for i in range(2):
         neighs = []
         for j in range(size // 2):
             graph, neigh = utils.sample_neigh(
                 self.train_set if train else self.test_set,
                 random.randint(self.min_size, self.max_size))
             neighs.append(graph.subgraph(neigh))
         dataset = GraphDataset(GraphDataset.list_to_graphs(neighs))
         loaders.append(
             TorchDataLoader(dataset,
                             collate_fn=Batch.collate([]),
                             batch_size=batch_size // 2,  # both ternary branches were identical
                             sampler=None,
                             shuffle=False))
     loaders.append([None] * (size // batch_size))
     return loaders
Example #16
    def __init__(self, purposes=["train", "val", "test"], **kwargs):
        super().__init__()
        self.config = kwargs

        if "datasets" in self.config:
            self.datasets_config = self.config["datasets"]
        elif "dataset" in self.config:
            self.datasets_config = [self.config["dataset"]]
        else:
            raise ValueError

        subsets = {}
        sampling_weights = {}
        for purpose in purposes:
            subsets[purpose] = []
            sampling_weights[purpose] = []

        for dataset_config in self.datasets_config:
            dataset_type = DatasetEnum(dataset_config["type"])

            dataset_sampling_weights = dataset_config.get(
                "sampling_weights", {
                    "train": 1,
                    "val": 1,
                    "test": 1
                })

            assert all(purpose in dataset_sampling_weights.keys()
                       for purpose in purposes)

            transforms = {}
            transforms_config = dataset_config.get("transforms", {})
            for purpose in purposes:
                purpose_transforms_config = transforms_config.get(purpose, [])
                if len(purpose_transforms_config) > 0:
                    transforms[purpose] = Transformer(
                        purpose, purpose_transforms_config)
                else:
                    transforms[purpose] = None

            if "split" in dataset_config:
                split = dataset_config["split"]

                assert dataset_type != DatasetEnum.HDF5
                assert list(split.keys()) == purposes

                total_split_sum = np.array(list(split.values())).sum().item()
                for purpose in split.keys():
                    split[purpose] /= total_split_sum

                dataset = DATASETS[dataset_type](config=dataset_config,
                                                 dataset_path=pathlib.Path(
                                                     dataset_config["path"]))

                indices = np.arange(len(dataset))

                if self.config.get("shuffle", True):
                    rng = np.random.RandomState(seed=0)
                    rng.shuffle(indices)

                start_idx = 0
                for purpose in purposes:
                    len_subset = int(split[purpose] * len(dataset))
                    subset_indices = indices[start_idx:start_idx + len_subset]

                    if dataset_sampling_weights[purpose] <= 0:
                        # we do not want to add the dataset if it's not going to get sampled
                        start_idx += len_subset
                        continue

                    # we need to separately create a subset dataset because we need to apply purpose-specific transforms
                    if transforms[purpose] is not None:
                        subset_dataset = DATASETS[dataset_type](
                            config=dataset_config,
                            dataset_path=pathlib.Path(dataset_config["path"]),
                            purpose=purpose,
                            transform=transforms[purpose])
                        subset = Subset(subset_dataset, subset_indices)
                    else:
                        subset = Subset(dataset, subset_indices)

                    subsets[purpose].append(subset)
                    sampling_weights[purpose].append(
                        dataset_sampling_weights[purpose])

                    start_idx += len_subset
            else:
                for purpose in purposes:
                    if dataset_sampling_weights[purpose] <= 0:
                        # we do not want to add the dataset if it's not going to get sampled
                        continue

                    subset_dataset = DATASETS[dataset_type](
                        config=dataset_config,
                        dataset_path=pathlib.Path(dataset_config["path"]),
                        purpose=purpose,
                        transform=transforms[purpose])
                    subsets[purpose].append(subset_dataset)
                    sampling_weights[purpose].append(
                        dataset_sampling_weights[purpose])

        self.dataloaders = {}
        for purpose in purposes:
            if len(subsets[purpose]) > 1:
                purpose_dataset = ConcatDataset(datasets=subsets[purpose])
            elif len(subsets[purpose]) == 1:
                purpose_dataset = subsets[purpose][0]
            else:
                raise ValueError

            shuffle = self.config.get("shuffle", True)
            if purpose in ["test"]:
                shuffle = False

            if shuffle:
                weights = np.ones(shape=(len(purpose_dataset), ))
                start_idx = 0
                for subset, subset_sampling_weight in zip(
                        subsets[purpose], sampling_weights[purpose]):
                    end_idx = start_idx + len(subset)
                    weights[start_idx:end_idx] = subset_sampling_weight
                    start_idx = end_idx

                sampler = WeightedRandomSampler(
                    weights=weights,
                    num_samples=len(purpose_dataset),
                    replacement=False)

            else:
                for dataset_sampling_weight in sampling_weights[purpose]:
                    if dataset_sampling_weight != 1:
                        raise ValueError(
                            "Currently we do not support weighted, sequential sampling"
                        )

                sampler = SequentialSampler(purpose_dataset)

            self.dataloaders[purpose] = TorchDataLoader(
                dataset=purpose_dataset,
                batch_size=self.config["batch_size"],
                sampler=sampler,
                num_workers=self.config["num_workers"])
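
To make the weighted-sampling branch above concrete, a self-contained sketch that weights two concatenated datasets per sample (sizes and weights here are illustrative):

import numpy as np
import torch
from torch.utils.data import (ConcatDataset, TensorDataset,
                              WeightedRandomSampler,
                              DataLoader as TorchDataLoader)

ds_a = TensorDataset(torch.zeros(80, 1))  # gets sampling weight 1
ds_b = TensorDataset(torch.ones(20, 1))   # gets sampling weight 4
combined = ConcatDataset([ds_a, ds_b])

weights = np.ones(len(combined))
weights[len(ds_a):] = 4.0  # one weight per sample, set per source dataset
# replacement=False with num_samples == len(dataset) yields a weighted
# permutation of all samples, as in the module above.
sampler = WeightedRandomSampler(weights=weights,
                                num_samples=len(combined),
                                replacement=False)
loader = TorchDataLoader(combined, batch_size=10, sampler=sampler)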
Example #17
    ) / batch_size  # this assertion only holds if len(dataset) is
    # divisible by batch size

    start = time()
    for _ in range(num_epochs):
        batches = 0
        for _ in mt:
            batches += 1
    stop = time()
    print('high performance batchgenerators %03.4f seconds' % (stop - start))

    from torch.utils.data import DataLoader as TorchDataLoader

    trainloader = TorchDataLoader(cifar_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=pin_memory,
                                  collate_fn=default_collate)

    batches = 0
    for batch in trainloader:
        batches += 1
    # each batch is a dict (via default_collate); 'data' should be 4-D (NCHW)
    assert len(batch['data'].shape) == 4

    start = time()
    for _ in range(num_epochs):
        batches = 0
        for _ in trainloader:
            batches += 1
    stop = time()
    print('pytorch took %03.4f seconds' % (stop - start))
Example #18
def visualize_network_inference(args):

    # Input argument handling
    assert os.path.exists(
        args.input_params_path
    ), 'Expected input_params_path "{}" to exist, but it does not.'.format(
        args.input_params_path)

    if args.input_config_path:
        input_config_path = args.input_config_path
    else:
        # Use params filepath to infer the config filepath
        input_config_path = os.path.splitext(
            args.input_params_path)[0] + ".yaml"

    assert os.path.exists(
        input_config_path
    ), 'Expected input_config_path "{}" to exist, but it does not.'.format(
        input_config_path)

    assert os.path.exists(
        args.dataset_path
    ), 'Expected dataset_path "{}" to exist, but it does not.'.format(
        args.dataset_path)

    # Determine what types of visualizations to do
    print("visualization types: {}".format(args.visualization_types))
    do_kp_overlay_raw = KP_OVERLAY_RAW in args.visualization_types
    do_kp_overlay_net_input = KP_OVERLAY_NET_INPUT in args.visualization_types
    do_kp_belief_overlay_raw = KP_BELIEF_OVERLAY_RAW in args.visualization_types
    do_belief_overlay_raw = BELIEF_OVERLAY_RAW in args.visualization_types

    videos_to_make = []
    needs_belief_maps = False
    if do_kp_overlay_raw:
        idx_kp_overlay_raw = len(videos_to_make)
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_kp_overlay_raw"),
            "output_path":
            os.path.join(args.output_dir, "kp_overlay_raw.mp4"),
            "frame": [],
        })
    if do_kp_overlay_net_input:
        idx_kp_overlay_net_input = len(videos_to_make)
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_kp_overlay_net_input"),
            "output_path":
            os.path.join(args.output_dir, "kp_overlay_net_input.mp4"),
            "frame": [],
        })
    if do_kp_belief_overlay_raw:
        idx_kp_belief_overlay_raw = len(videos_to_make)
        needs_belief_maps = True
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_kp_belief_overlay_raw"),
            "output_path":
            os.path.join(args.output_dir, "kp_belief_overlay_raw.mp4"),
            "frame": [],
        })
    if do_belief_overlay_raw:
        idx_belief_overlay_raw = len(videos_to_make)
        needs_belief_maps = True
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_belief_overlay_raw"),
            "output_path":
            os.path.join(args.output_dir, "belief_overlay_raw.mp4"),
            "frame": [],
        })

    if len(videos_to_make) == 0:
        print("No visualizations have been selected.")
        sys.exit(0)

    dream.utilities.makedirs(args.output_dir, exist_ok=args.force_overwrite)
    for video in videos_to_make:
        if os.path.exists(video["frames_dir"]):
            assert args.force_overwrite, 'Frames directory "{}" already exists.'.format(
                video["frames_dir"])
            shutil.rmtree(video["frames_dir"])
        dream.utilities.makedirs(video["frames_dir"],
                                 exist_ok=args.force_overwrite)

    # Create parser
    data_parser = YAML(typ="safe")

    with open(input_config_path, "r") as f:
        network_config = data_parser.load(f)

    # Overwrite GPU
    # If nothing is specified at the command line, None is the default, which uses all GPUs
    # TBD - think about a better way of doing this
    network_config["training"]["platform"]["gpu_ids"] = args.gpu_ids

    # Load network
    dream_network = dream.create_network_from_config_data(network_config)
    dream_network.model.load_state_dict(torch.load(args.input_params_path))
    dream_network.enable_evaluation()

    # Use image preprocessing specified by config by default, unless user specifies otherwise
    if args.image_preproc_override:
        image_preprocessing = args.image_preproc_override
    else:
        image_preprocessing = dream_network.image_preprocessing()

    if args.keypoint_ids is None or len(args.keypoint_ids) == 0:
        idx_keypoints = list(range(dream_network.n_keypoints))
    else:
        idx_keypoints = args.keypoint_ids
    n_idx_keypoints = len(idx_keypoints)

    sample_results = []

    dataset_to_viz = dream.utilities.find_ndds_data_in_dir(args.dataset_path)
    dataset_file_dict_list = dataset_to_viz[
        0]  # list of data file dictionaries; each dictionary indicates the files names for rgb, depth, seg, ...
    dataset_meta_dict = dataset_to_viz[
        1]  # dictionary of camera, object files, etc.

    if dataset_file_dict_list:

        # Downselect based on frame name
        if args.start_frame or args.end_frame:
            sample_names = [x["name"] for x in dataset_file_dict_list]
            start_idx = sample_names.index(
                args.start_frame) if args.start_frame else 0
            end_idx = (sample_names.index(args.end_frame) +
                       1 if args.end_frame else len(dataset_file_dict_list))

            dataset_to_viz = (
                dataset_file_dict_list[start_idx:end_idx],
                dataset_meta_dict,
            )

        image_raw_resolution = dream.utilities.load_image_resolution(
            dataset_meta_dict["camera"])
        (
            network_input_res_inf,
            network_output_res_inf,
        ) = dream_network.net_resolutions_from_image_raw_resolution(
            image_raw_resolution,
            image_preprocessing_override=image_preprocessing)

        manip_dataset_debug_mode = dream.datasets.ManipulatorNDDSDatasetDebugLevels[
            "LIGHT"]
        manip_dataset = dream.datasets.ManipulatorNDDSDataset(
            dataset_to_viz,
            dream_network.manipulator_name,
            dream_network.keypoint_names,
            network_input_res_inf,
            network_output_res_inf,
            dream_network.image_normalization,
            image_preprocessing,
            augment_data=False,
            include_ground_truth=not args.no_ground_truth,
            debug_mode=manip_dataset_debug_mode,
        )

        # TODO: set batch size and num_workers at command line
        batch_size = 8
        num_workers = 4
        training_data = TorchDataLoader(manip_dataset,
                                        batch_size=batch_size,
                                        num_workers=num_workers,
                                        shuffle=False)

        # Network inference on dataset
        with torch.no_grad():

            for batch_idx, sample in enumerate(tqdm(training_data)):

                this_batch_size = len(sample["config"]["name"])

                # Conduct inference
                network_image_input = sample["image_rgb_input"].cuda()
                (
                    belief_maps_batch,
                    detected_kp_projs_netout_batch,
                ) = dream_network.inference(network_image_input)

                for b in range(this_batch_size):

                    input_image_path = sample["config"]["image_paths"]["rgb"][
                        b]

                    if needs_belief_maps:
                        belief_maps = belief_maps_batch[b]
                        selected_belief_maps_copy = (
                            belief_maps[idx_keypoints, :, :].detach().clone())
                    else:
                        selected_belief_maps_copy = []

                    detected_kp_projs_netout = np.array(
                        detected_kp_projs_netout_batch[b], dtype=float)
                    selected_detected_kp_projs_netout = detected_kp_projs_netout[
                        idx_keypoints, :]
                    selected_detected_kp_projs_netin = dream.image_proc.convert_keypoints_to_netin_from_netout(
                        selected_detected_kp_projs_netout,
                        network_output_res_inf,
                        network_input_res_inf,
                    )
                    selected_detected_kp_projs_raw = dream.image_proc.convert_keypoints_to_raw_from_netin(
                        selected_detected_kp_projs_netin,
                        network_input_res_inf,
                        image_raw_resolution,
                        image_preprocessing,
                    )

                    if args.no_ground_truth:
                        selected_gt_kp_projs_raw = []
                        selected_gt_kp_projs_netin = []
                    else:
                        selected_gt_kp_projs_raw = np.array(
                            sample["keypoint_projections_raw"][b][
                                idx_keypoints, :],
                            dtype=float,
                        )
                        selected_gt_kp_projs_netin = np.array(
                            sample["keypoint_projections_input"][b][
                                idx_keypoints, :],
                            dtype=float,
                        )

                    input_image_raw = PILImage.open(input_image_path).convert(
                        "RGB")
                    image_net_input = dream.image_proc.image_from_tensor(
                        sample["image_rgb_input_viz"][b])

                    sample_results.append((
                        input_image_raw,
                        image_net_input,
                        selected_belief_maps_copy,
                        selected_detected_kp_projs_raw,
                        selected_detected_kp_projs_netin,
                        selected_gt_kp_projs_raw,
                        selected_gt_kp_projs_netin,
                    ))

    else:
        # Probably a directory of images - fix this later to avoid code duplication
        dirlist = os.listdir(args.dataset_path)
        dirlist.sort()
        png_image_names = [f for f in dirlist if f.endswith(".png")]
        jpg_image_names = [f for f in dirlist if f.endswith(".jpg")]
        image_names = (png_image_names
                       if len(png_image_names) > len(jpg_image_names) else
                       jpg_image_names)

        if args.start_frame or args.end_frame:
            sample_names = [os.path.splitext(i)[0] for i in image_names]
            start_idx = sample_names.index(
                args.start_frame) if args.start_frame else 0
            end_idx = (sample_names.index(args.end_frame) +
                       1 if args.end_frame else len(sample_names))

            image_names = image_names[start_idx:end_idx]

        # Just use a heuristic to determine the image extension
        image_paths = [os.path.join(args.dataset_path, i) for i in image_names]

        for input_image_path in tqdm(image_paths):

            input_image_raw = PILImage.open(input_image_path).convert("RGB")
            detection_result = dream_network.keypoints_from_image(
                input_image_raw,
                image_preprocessing_override=image_preprocessing,
                debug=True,
            )

            selected_detected_kps_raw = detection_result["detected_keypoints"][
                idx_keypoints, :]
            selected_detected_kps_netin = detection_result[
                "detected_keypoints_net_input"][idx_keypoints, :]
            image_net_input = detection_result["image_rgb_net_input"]
            selected_belief_maps = (
                detection_result["belief_maps"][idx_keypoints, :, :]
                if needs_belief_maps else [])
            selected_gt_kps_raw = []
            selected_gt_kps_netin = []

            sample_results.append((
                input_image_raw,
                image_net_input,
                selected_belief_maps,
                selected_detected_kps_raw,
                selected_detected_kps_netin,
                selected_gt_kps_raw,
                selected_gt_kps_netin,
            ))

    # Iterate through inferred results
    idx_this_frame = 1
    print("Creating visualizations...")
    for (
            image_raw,
            input_image,
            belief_maps,
            detected_kp_projs_raw,
            detected_kp_projs_net_input,
            gt_kp_projs_raw,
            gt_kp_projs_net_input,
    ) in tqdm(sample_results):

        show_gt_keypoints = (
            not args.no_ground_truth) and len(gt_kp_projs_raw) > 0

        image_raw_resolution = image_raw.size
        net_input_resolution = input_image.size

        if do_kp_overlay_net_input:
            videos_to_make[idx_kp_overlay_net_input]["frame"] = input_image

        if do_kp_overlay_raw:
            videos_to_make[idx_kp_overlay_raw]["frame"] = image_raw

        if do_kp_belief_overlay_raw:
            flattened_belief_tensor = belief_maps.sum(dim=0)
            flattened_belief_image = dream.image_proc.image_from_belief_map(
                flattened_belief_tensor,
                colormap="hot",
                normalization_method=6)
            flattened_belief_image_netin = dream.image_proc.convert_image_to_netin_from_netout(
                flattened_belief_image, net_input_resolution)
            flattened_belief_image_raw = dream.image_proc.inverse_preprocess_image(
                flattened_belief_image_netin, image_raw_resolution,
                image_preprocessing)
            videos_to_make[idx_kp_belief_overlay_raw][
                "frame"] = PILImage.blend(image_raw,
                                          flattened_belief_image_raw,
                                          alpha=0.5)

            # Previous code here, but the overlays don't look as nice
            # Note - this seems pretty slow
            # I = np.asarray(flattened_belief_image_raw.convert('L'))
            # I_black = I < 20
            # mask = PILImage.fromarray(np.uint8(255*I_black))
            # temp = PILImage.composite(image_raw, flattened_belief_image_raw, mask)
            # videos_to_make[idx_kp_belief_overlay_raw]['frame'] = PILImage.blend(image_raw, temp, alpha=0.75)
            # #PILImage.alpha_composite(flattened_belief_image_raw.convert('RGBA'), image_raw.convert('RGBA'))

        if do_belief_overlay_raw:
            flattened_belief_tensor = belief_maps.sum(dim=0)
            flattened_belief_image = dream.image_proc.image_from_belief_map(
                flattened_belief_tensor,
                colormap="hot",
                normalization_method=6)
            flattened_belief_image_netin = dream.image_proc.convert_image_to_netin_from_netout(
                flattened_belief_image, net_input_resolution)
            flattened_belief_image_raw = dream.image_proc.inverse_preprocess_image(
                flattened_belief_image_netin, image_raw_resolution,
                image_preprocessing)
            videos_to_make[idx_belief_overlay_raw]["frame"] = PILImage.blend(
                image_raw, flattened_belief_image_raw, alpha=0.5)

        for n in range(n_idx_keypoints):
            detected_kp_proj_raw = detected_kp_projs_raw[n, :]
            detected_kp_proj_net_input = detected_kp_projs_net_input[n, :]

            if show_gt_keypoints:
                gt_kp_proj_raw = gt_kp_projs_raw[n, :]
                gt_kp_proj_net_input = gt_kp_projs_net_input[n, :]

            # Overlay
            if do_kp_overlay_net_input:
                # Heuristic to make point diameter look good for larger raw resolutions
                pt_diameter = (12.0 if image_raw_resolution[0] *
                               image_raw_resolution[1] > 500000 else 6.0)
                if show_gt_keypoints:
                    videos_to_make[idx_kp_overlay_net_input][
                        "frame"] = dream.image_proc.overlay_points_on_image(
                            videos_to_make[idx_kp_overlay_net_input]["frame"],
                            [gt_kp_proj_net_input],
                            annotation_color_dot="green",
                            annotation_color_text="white",
                            point_thickness=2,
                            point_diameter=pt_diameter,
                        )

                videos_to_make[idx_kp_overlay_net_input][
                    "frame"] = dream.image_proc.overlay_points_on_image(
                        videos_to_make[idx_kp_overlay_net_input]["frame"],
                        [detected_kp_proj_net_input],
                        annotation_color_dot="red",
                        annotation_color_text="white",
                        point_diameter=pt_diameter,
                    )

            if do_kp_overlay_raw:
                # Heuristic to make point diameter look good for larger raw resolutions
                pt_diameter = (12.0 if image_raw_resolution[0] *
                               image_raw_resolution[1] > 500000 else 6.0)

                if show_gt_keypoints:
                    videos_to_make[idx_kp_overlay_raw][
                        "frame"] = dream.image_proc.overlay_points_on_image(
                            videos_to_make[idx_kp_overlay_raw]["frame"],
                            [gt_kp_proj_raw],
                            annotation_color_dot="green",
                            annotation_color_text="white",
                            point_thickness=2,
                            point_diameter=pt_diameter + 2,
                        )

                videos_to_make[idx_kp_overlay_raw][
                    "frame"] = dream.image_proc.overlay_points_on_image(
                        videos_to_make[idx_kp_overlay_raw]["frame"],
                        [detected_kp_proj_raw],
                        annotation_color_dot="red",
                        annotation_color_text="white",
                        point_diameter=pt_diameter,
                    )

            if do_kp_belief_overlay_raw:
                # Heuristic to make point diameter look good for larger raw resolutions
                pt_diameter = (12.0 if image_raw_resolution[0] *
                               image_raw_resolution[1] > 500000 else 6.0)

                if show_gt_keypoints:
                    videos_to_make[idx_kp_belief_overlay_raw][
                        "frame"] = dream.image_proc.overlay_points_on_image(
                            videos_to_make[idx_kp_belief_overlay_raw]["frame"],
                            [gt_kp_proj_raw],
                            annotation_color_dot="green",
                            annotation_color_text="white",
                            point_thickness=2,
                            point_diameter=pt_diameter + 2,
                        )

                videos_to_make[idx_kp_belief_overlay_raw][
                    "frame"] = dream.image_proc.overlay_points_on_image(
                        videos_to_make[idx_kp_belief_overlay_raw]["frame"],
                        [detected_kp_proj_raw],
                        annotation_color_dot="red",
                        annotation_color_text="white",
                        point_diameter=pt_diameter,
                    )

        frame_output_filename = str(idx_this_frame).zfill(6) + ".png"
        for video in videos_to_make:
            video["frame"].save(
                os.path.join(video["frames_dir"], frame_output_filename))

        idx_this_frame += 1

    # Call to ffmpeg
    for video in videos_to_make:
        video_from_frames(video["frames_dir"], video["output_path"],
                          args.framerate)
        shutil.rmtree(video["frames_dir"])
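
video_from_frames is project code not shown here; a plausible minimal equivalent, assuming it shells out to ffmpeg over the zero-padded frame names written above (000001.png, 000002.png, ...):

import os
import subprocess

def video_from_frames(frames_dir, output_path, framerate):
    # Assumed implementation: encode the numbered PNG frames with H.264.
    subprocess.run([
        "ffmpeg", "-y",
        "-framerate", str(framerate),
        "-i", os.path.join(frames_dir, "%06d.png"),
        "-c:v", "libx264", "-pix_fmt", "yuv420p",
        output_path,
    ], check=True)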
Example #19
def run_experiment(experiment: Experiment,
                   debug_pipeline: bool = False,
                   develop_mode: bool = False,
                   data_loader_workers: int = 1,
                   cross_validation_iterations: int = 3,
                   device: str = "cpu",
                   develop_mode_samples: int = 10) -> List[Result]:
    LOGGER.info("Beginning experiment: %s, %s", experiment.id(),
                experiment.description())

    # preprocessing
    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)
    # augmentations
    augmentations = AugmentedCollate(experiment.augmentation_stages())
    test_augmentations = AugmentedCollate(
        experiment.test_augmentation_stages())

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    # File system cache with a 1 to 1 mapping to an experiment, used to cache data for multiple workers,
    # can safely be used in each cross validation run
    cache = joblib.Memory(f'./cachedir/{experiment.id()}', verbose=0)
    LOGGER.info("Initialised cache: %s", cache)

    LOGGER.info("Creating APTOSDataset for the following directories: %s",
                directories)

    dataset = TorchConcatDataset([
        APTOSDataset(df, directory, pipeline, cache)
        for df, directory in zip(dfs, directories)
    ])

    # To facilitate software development, this makes running end-to-end tests feasible
    if develop_mode:
        LOGGER.warning(
            "Running in develop mode, using a fraction of the whole dataset")
        dataset, _ = torch_random_split(
            dataset, [develop_mode_samples,
                      len(dataset) - develop_mode_samples])

    results = []

    # Stratified ShuffleSplit cross-validator: provides train/test indices for splitting the data into train/test sets.
    sss = StratifiedShuffleSplit(n_splits=cross_validation_iterations,
                                 test_size=experiment.test_size(),
                                 train_size=1 - experiment.test_size(),
                                 random_state=0)
    # TODO: will probably need debugging when more than one dataset is added
    labels = np.asarray(dfs[0]["diagnosis"])
    split_generator = sss.split(np.zeros(labels.shape), labels)

    for cv_iteration, (train_index, test_index) in zip(
            range(1, cross_validation_iterations + 1), split_generator):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        with APTOSMonitor(experiment, cv_iteration) as monitor:
            LOGGER.info(
                f'tensorboard --logdir "{monitor._summary_writer.log_dir}"')

            test_ds = Subset(dataset, test_index)
            train_ds = Subset(dataset, train_index)

            LOGGER.info("train data size: {}".format(train_ds.__len__()))
            LOGGER.info("Histogram of classses {}".format(
                np.histogram(labels[train_index], 5)))
            class_data = np.histogram(labels[train_index], 5)[0]
            class_weights = class_data.sum() / (class_data.shape[0] *
                                                class_data)

            LOGGER.info("test data size: {}".format(test_ds.__len__()))
            LOGGER.info("Histogram of classses {}".format(
                np.histogram(labels[test_index], 5)))

            sampler, sampler_kwargs = experiment.sampler()
            sampler = sampler(train_ds, **sampler_kwargs)

            train_loader = TorchDataLoader(
                train_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                # Potentially an unconventional use of collate_fn, but it does make the
                # train data loader responsible for augmentations which is nice.
                collate_fn=augmentations,
                sampler=sampler)

            test_loader = TorchDataLoader(test_ds,
                                          batch_size=experiment.batch_size(),
                                          num_workers=data_loader_workers,
                                          collate_fn=test_augmentations)

            model = experiment.model(input_shape=train_ds[0][0].shape)
            print(torch_summary(model.cuda(), train_ds[0][0].shape))

            optimizer_class, optim_kwargs = experiment.optimizer()
            optimizer = optimizer_class(model.parameters(), **optim_kwargs)

            lr_scheduler, scheduler_kwargs = experiment.lr_scheduler()
            lr_scheduler = lr_scheduler(optimizer, **scheduler_kwargs)

            monitor.on_cv_start(train_ds, augmentations)

            # pass the class weights through the alpha parameter
            criterion = FocalLoss(num_class=5, gamma=2, alpha=class_weights)

            for epoch in range(1, experiment.max_epochs() + 1):

                LOGGER.info("Epoch: %s", epoch)

                train(model, train_loader, optimizer, device, criterion,
                      monitor)
                lr_scheduler.step()

                predictions_proba, predictions, targets, ids, losses = test(
                    model, test_loader, device, criterion, monitor)

                if epoch % 2 == 0:
                    checkpoint = {
                        'model': model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'experiment': experiment.state_dict()
                    }

                    checkpoint_directory = f'results/{experiment.id()}'
                    if not os.path.isdir(checkpoint_directory):
                        os.mkdir(checkpoint_directory)

                    torch.save(
                        checkpoint,
                        os.path.join(checkpoint_directory,
                                     f'{cv_iteration}-{epoch}-checkpoint.pth'))

            monitor.on_cv_end()

        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration":
            [cv_iteration for _ in range(len(targets))],
            "targets":
            targets,
            "predictions":
            predictions,
            "id_code":
            ids
        })

        results.append(Result(experiment, results_df))

    # Deletes content on disk... (until experiments have a unique hash this make sense)
    cache.clear()

    return results
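
The collate_fn trick above (making the train loader responsible for augmentation) can be sketched minimally as follows; NoisyCollate is an illustrative stand-in, not the project's AugmentedCollate:

import torch
from torch.utils.data import TensorDataset, DataLoader as TorchDataLoader

class NoisyCollate:
    # Stand-in for AugmentedCollate: stack samples, then augment the batch.
    def __init__(self, noise_std=0.1):
        self.noise_std = noise_std

    def __call__(self, samples):
        xs, ys = zip(*samples)
        x = torch.stack(xs)
        x = x + self.noise_std * torch.randn_like(x)  # the "augmentation"
        return x, torch.stack(ys)

ds = TensorDataset(torch.randn(100, 3, 8, 8), torch.zeros(100))
loader = TorchDataLoader(ds, batch_size=16, collate_fn=NoisyCollate())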
Example #20
def main(checkpoint_file_path: str,
         data_directory:
         str = "../input/aptos2019-blindness-detection/test_images",
         data_frame: str = "../input/aptos2019-blindness-detection/test.csv",
         sample_submission:
         str = "../input/aptos2019-blindness-detection/sample_submission.csv",
         device: str = "cuda:0",
         samples_to_visualise: int = 30):
    print("Beginning submission")
    print(f"Using {device} for submissions")

    # Always load the first iteration from cross validation? Should really re-train on the whole dataset
    checkpoint = torch.load(checkpoint_file_path)
    print("Loaded checkpoint")

    experiment_state_dict = checkpoint["experiment"]
    experiment_state_dict.update(train_test_directories=[data_directory],
                                 train_test_data_frames=[data_frame])
    experiment = Experiment.from_dict(experiment_state_dict)
    print("Initialised experiment: %s", experiment)

    pipeline = Pipeline(experiment.pipeline_stages())
    print("Initialised pipeline")

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()
    test_augmentations = AugmentedCollate(
        experiment.test_augmentation_stages())

    dataset = TorchConcatDataset([
        APTOSSubmissionDataset(df, directory, pipeline)
        for df, directory in zip(dfs, directories)
    ])
    print("Initialised dataset")

    loader = TorchDataLoader(dataset,
                             batch_size=experiment.batch_size(),
                             collate_fn=test_augmentations)
    print("Initialised loader")

    model = checkpoint["model"]
    model.load_state_dict(checkpoint['state_dict'])

    print("Initialised model")

    print("Beginning inference")
    predictions_proba, predictions, ids = inference(model, loader, device)

    sample = pd.read_csv(sample_submission)
    sample.diagnosis = predictions

    sample.to_csv("submission.csv", index=False)

    visualisations_directory = os.path.join("samples", experiment.id())
    if not os.path.isdir(visualisations_directory):
        os.makedirs(visualisations_directory, exist_ok=True)

    sample_indexes = random.sample(population=list(range(len(ids))),
                                   k=samples_to_visualise)
    for sample_index in sample_indexes:
        id_ = ids[sample_index]
        prediction = predictions[sample_index]
        proba = predictions_proba[sample_index].cpu().numpy()

        # Second argument is mask
        image, _ = eight_bit_normalization(
            cv2.cvtColor(
                cv2.imread(os.path.join(data_directory, f"{id_}.png")),
                cv2.COLOR_BGR2RGB),
            # No mask
            None)

        # Create a single figure; a separate plt.figure() call here would be
        # discarded by plt.subplots(), so pass figsize directly.
        fig, axes = plt.subplots(1, 2, figsize=(30, 15))
        fig.set_dpi(150)
        axes[0].imshow(image)
        axes[0].axis("off")
        axes[0].title.set_text("Raw image")

        bar_list = axes[1].bar(x=CLASSES, height=proba - proba.min())
        bar_list[prediction].set_color('r')
        axes[1].title.set_text("Classes")
        plt.setp(axes[1].get_xticklabels(), rotation=45)

        plt.tight_layout()
        plt.savefig(os.path.join(visualisations_directory, f"{id_}.jpeg"))
        plt.close(fig)
Example #21
def train_network(args):

    # Input argument handling
    assert (
        args.epochs > 0
    ), "The number of training epochs must be greater than 0, but it is {}.".format(
        args.epochs)
    assert (
        args.batch_size > 0
    ), "The training batch size must be greater than 0, but it is {}.".format(
        args.batch_size)
    assert (
        args.num_workers >= 0
    ), "The number of subprocesses used for training data loading must be greater than or equal to 0, but it is {}.".format(
        args.num_workers)

    # Parse training fraction
    assert (
        0.0 < args.training_data_fraction and args.training_data_fraction < 1.0
    ), "Expected training_data_fraction to be within 0. and 1., but it is {}.".format(
        args.training_data_fraction)
    validation_data_fraction = 1.0 - args.training_data_fraction

    if args.output_dir:
        save_results = True
        if not args.resume_training:
            dream.utilities.makedirs(args.output_dir,
                                     exist_ok=args.force_overwrite)
    else:
        assert (not args.resume_training
                ), "Cannot resume training; output directory not provided."
        save_results = False

    training_start_time = time.time()

    if args.resume_training:

        # Find the latest network we have
        dirlist = os.listdir(args.output_dir)
        epoch_weight_paths_unsorted = [
            x for x in dirlist if x.startswith("epoch") and x.endswith(".pth")
        ]
        epoch_numbers_unsorted = []
        for net_path in epoch_weight_paths_unsorted:
            epoch_number = int(net_path.split("_")[1].split(".")[0])
            epoch_numbers_unsorted.append(epoch_number)

        temp = sorted(
            zip(epoch_weight_paths_unsorted, epoch_numbers_unsorted),
            key=lambda pair: pair[1],
            reverse=True,
        )
        epoch_weight_paths = [x[0] for x in temp]
        epoch_numbers = [x[1] for x in temp]

        # Most recent network
        most_recent_epoch_weight_path = epoch_weight_paths[0]
        start_epoch = epoch_numbers[0]

        assert (
            start_epoch < args.epochs
        ), "Network is already trained for the number of requested epochs."

        # Find the best network to determine its validation loss
        best_valid_network_config_path = os.path.join(args.output_dir,
                                                      "best_network.yaml")
        assert os.path.exists(
            best_valid_network_config_path
        ), "Could not determine the best validation loss."

        valid_parser = YAML(typ="safe")
        with open(best_valid_network_config_path, "r") as f:
            best_valid_network_config = valid_parser.load(f)
        best_valid_loss = best_valid_network_config["training"]["results"][
            "validation_loss"]["mean"]

        # Load in the old training log
        if os.path.exists(os.path.join(args.output_dir, "training_log.pkl")):
            train_log_path = os.path.join(args.output_dir, "training_log.pkl")
            with open(train_log_path, "rb") as f:
                train_log = pickle.load(f)
            # Move this to make this consistent as if we're in the middle of training
            os.rename(
                train_log_path,
                os.path.join(args.output_dir,
                             "training_log_e{}.pkl".format(start_epoch)),
            )

        elif os.path.exists(
                os.path.join(args.output_dir,
                             "training_log_e{}.pkl".format(start_epoch))):
            train_log_path = os.path.join(
                args.output_dir, "training_log_e{}.pkl".format(start_epoch))
            with open(train_log_path, "rb") as f:
                train_log = pickle.load(f)
        else:
            assert False, "Could not determine training log file to resume."

        # Get the random seed that was used here - we need it to ensure the train/valid splits are reproduced
        random_seed = train_log["random_seed"]

        # Track the start time of each run, including resumed ones
        if not isinstance(train_log["start_time"], list):
            # Convert to a list
            train_log["start_time"] = [train_log["start_time"]]

        train_log["start_time"].append(training_start_time)

        # Also log the fact that we resumed
        if "epochs_resumed" in train_log:
            train_log["epochs_resumed"].append(start_epoch + 1)
        else:
            train_log["epochs_resumed"] = [start_epoch + 1]

    else:
        # Determine the random seed
        random_seed = (args.random_seed if args.random_seed is not None else
                       random.randint(0, 999999))

        train_log = {
            "epochs": [],
            "losses": [],
            "validation_losses": [],
            "batch_training_losses": [],
            "batch_validation_losses": [],
            "batch_training_sample_names": [],
            "batch_validation_sample_names": [],
            "start_time": training_start_time,
            "timestamps": [],
            "random_seed": random_seed,
        }
        best_valid_loss = float("Inf")

    dream.utilities.set_random_seed(random_seed)

    enable_augment_data = not args.not_augment_data

    gpu_ids = args.gpu_ids if args.gpu_ids else []

    try:
        user = os.getlogin()
    except OSError:
        # os.getlogin() can fail without a controlling terminal; fall back
        # to a placeholder name
        user = "unknown"

    # Parse input data
    input_data_path = args.input_data_path
    # Attempt path contraction to make path portable between different platforms
    input_data_abs_path = os.path.abspath(input_data_path)
    input_data_abs_path_split = input_data_abs_path.split("/")
    if (len(input_data_abs_path_split) >= 3
            and input_data_abs_path_split[0] == ""
            and input_data_abs_path_split[1] == "home"
            and input_data_abs_path_split[2] == user):
        # Change the path to use the tilde shortcut
        input_data_path = os.path.join("~", *input_data_abs_path_split[3:])

    # Find data in provided directory
    found_data = dream.utilities.find_ndds_data_in_dir(input_data_path)
    found_data_config = found_data[1]
    image_raw_resolution = dream.utilities.load_image_resolution(
        found_data_config["camera"])

    # Parse manipulation configuration file
    yaml_parser = YAML(typ="safe")
    assert os.path.exists(
        args.manipulator_config_path
    ), 'Expected manipulator_config_path "{}" to exist, but it does not.'.format(
        args.manipulator_config_path)
    with open(args.manipulator_config_path, "r") as f:
        manipulator_config_file = yaml_parser.load(f)
    assert (
        "manipulator" in manipulator_config_file
    ), 'Expected key "manipulator" to exist in the manipulator config file, but it does not.'
    manipulator_config = manipulator_config_file["manipulator"]

    # Parse architecture
    assert os.path.exists(
        args.architecture_config
    ), 'Expected architecture_config file "{}" to exist, but it does not.'.format(
        args.architecture_config)
    with open(args.architecture_config, "r") as f:
        architecture_config_file = yaml_parser.load(f)
    assert (
        "architecture" in architecture_config_file
    ), 'Expected key "architecture" to exist in the architecture config file, but it does not.'
    architecture_config = architecture_config_file["architecture"]

    assert (
        "training" in architecture_config_file
    ), 'Expected key "training" to exist in the architecture config file, but it does not.'
    assert (
        "config" in architecture_config_file["training"]
    ), 'Expected key "config" to exist in training dictionary in the architecture config file, but it does not.'
    training_config = architecture_config_file["training"]["config"]
    assert (
        "image_preprocessing" in training_config
    ), 'Expected key "image_preprocessing" to exist in the training config in the architecture config file, but it does not.'
    training_image_preprocessing = training_config["image_preprocessing"]
    assert (
        "net_input_resolution" in training_config
    ), 'Expected key "net_input_resolution" to exist in the training config in the architecture config file, but it does not.'
    training_net_input_resolution = training_config["net_input_resolution"]
    # TODO: possibly read in other arguments here, such as optimizer, instead of using command line defaults

    if "image_preprocessing" in architecture_config:
        # This could happen if we're trying to resume training.
        assert (
            architecture_config["image_preprocessing"] ==
            training_image_preprocessing
        ), 'If defined, "image_preprocessing" in the architecture and training record must be consistent for this script to work properly.'
    else:
        architecture_config[
            "image_preprocessing"] = training_image_preprocessing

    if enable_augment_data:
        # TODO: specify the types of image augmentation
        data_augment_config = odict([("image_rgb", True)])
    else:
        data_augment_config = False

    network_config = odict([
        ("data_path", input_data_path),
        ("manipulator", manipulator_config),
        ("architecture", architecture_config),
        (
            "training",
            odict([
                (
                    "config",
                    odict([
                        ("epochs", args.epochs),
                        (
                            "training_data_fraction",
                            args.training_data_fraction,
                        ),
                        (
                            "validation_data_fraction",
                            validation_data_fraction,
                        ),
                        ("batch_size", args.batch_size),
                        ("data_augmentation", data_augment_config),
                        ("worker_size", args.num_workers),
                        (
                            "optimizer",
                            odict([
                                ("type", args.optimizer),
                                ("learning_rate", args.learning_rate),
                            ]),
                        ),
                        (
                            "image_preprocessing",
                            training_image_preprocessing,
                        ),
                        (
                            "image_raw_resolution",
                            list(image_raw_resolution),
                        ),
                        (
                            "net_input_resolution",
                            training_net_input_resolution,
                        ),
                    ]),
                ),  # net_output_resolution is set below
                (
                    "platform",
                    odict([
                        ("user", user),
                        ("hostname", socket.gethostname()),
                        ("gpu_ids", gpu_ids),
                    ]),
                ),
                ("results", odict([("epochs_trained", 0)])),
            ]),
        ),
    ])

    # Now check against existing network configuration if we are resuming training
    if args.resume_training:

        # Load corresponding config file to ensure we're consistent
        most_recent_config_path = most_recent_epoch_weight_path.replace(
            "pth", "yaml")
        config_parser = YAML(typ="safe")

        with open(os.path.join(args.output_dir, most_recent_config_path),
                  "r") as f:
            most_recent_network_config_file = config_parser.load(f)

        # Do a bunch of network consistency checks
        assert (most_recent_network_config_file["data_path"] ==
                network_config["data_path"])
        assert (most_recent_network_config_file["manipulator"] ==
                network_config["manipulator"])
        assert (most_recent_network_config_file["architecture"] ==
                network_config["architecture"])
        assert (most_recent_network_config_file["training"]["config"]
                ["training_data_fraction"] == network_config["training"]
                ["config"]["training_data_fraction"])
        assert (most_recent_network_config_file["training"]["config"]
                ["validation_data_fraction"] == network_config["training"]
                ["config"]["validation_data_fraction"])
        assert (
            most_recent_network_config_file["training"]["config"]["batch_size"]
            == network_config["training"]["config"]["batch_size"])
        assert (most_recent_network_config_file["training"]["config"]
                ["data_augmentation"] == network_config["training"]["config"]
                ["data_augmentation"])
        assert (most_recent_network_config_file["training"]["config"]
                ["worker_size"] == network_config["training"]["config"]
                ["worker_size"])
        assert (
            most_recent_network_config_file["training"]["config"]["optimizer"]
            == network_config["training"]["config"]["optimizer"])
        assert (most_recent_network_config_file["training"]["config"]
                ["image_preprocessing"] == network_config["training"]["config"]
                ["image_preprocessing"])
        assert (most_recent_network_config_file["training"]["config"]
                ["image_raw_resolution"] == network_config["training"]
                ["config"]["image_raw_resolution"])
        assert (most_recent_network_config_file["training"]["config"]
                ["net_input_resolution"] == network_config["training"]
                ["config"]["net_input_resolution"])

        # Use this one instead!
        network_config = most_recent_network_config_file

        print("~~ RESUMING TRAINING FROM {} ~~".format(
            most_recent_epoch_weight_path))
        print("")

    else:
        start_epoch = 0

    # Print to screen
    print("Network configuration: {}".format(network_config))
    dream_network = dream.create_network_from_config_data(network_config)
    if args.resume_training:
        dream_network.model.load_state_dict(
            torch.load(
                os.path.join(args.output_dir, most_recent_epoch_weight_path)))
    dream_network.enable_training()

    # The following ensures the config is consistent with the dataloader
    (
        trained_net_input_res,
        trained_net_output_res,
    ) = dream_network.net_resolutions_from_image_raw_resolution(
        image_raw_resolution)
    assert dream_network.trained_net_input_resolution(
    ) == trained_net_input_res
    assert dream_network.trained_net_output_resolution(
    ) == trained_net_output_res
    dream_network.network_config["training"]["config"][
        "net_output_resolution"] = trained_net_output_res

    # Create NDDS dataset and loader
    training_debug_mode = dream.datasets.ManipulatorNDDSDatasetDebugLevels[
        "NONE"]
    network_requires_belief_maps = (
        dream_network.network_config["architecture"]["target"] == "belief_maps"
    )
    found_dataset = dream.datasets.ManipulatorNDDSDataset(
        found_data,
        manipulator_config["name"],
        dream_network.keypoint_names,
        trained_net_input_res,
        trained_net_output_res,
        dream_network.image_normalization,
        dream_network.image_preprocessing(),
        augment_data=enable_augment_data,
        include_ground_truth=True,
        include_belief_maps=network_requires_belief_maps,
        debug_mode=training_debug_mode,
    )

    # Split into train and validation subsets
    n_data = len(found_dataset)
    n_train_data = int(round(n_data * args.training_data_fraction))
    n_valid_data = n_data - n_train_data
    train_dataset, valid_dataset = torch.utils.data.random_split(
        found_dataset, [n_train_data, n_valid_data])
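    # Because the same random seed is restored above when resuming, this
    # random_split reproduces the original train/validation partition.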

    train_data_loader = TorchDataLoader(train_dataset,
                                        batch_size=args.batch_size,
                                        num_workers=args.num_workers)

    valid_data_loader = TorchDataLoader(valid_dataset,
                                        batch_size=args.batch_size,
                                        num_workers=args.num_workers)

    # Train the network
    print("")
    print(
        "TRAINING NETWORK ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    print("")

    last_epoch_timestamp = 0.0

    for e in tqdm(range(start_epoch, args.epochs)):
        this_epoch = e + 1
        print("Epoch {} ------------".format(this_epoch))

        # Training Phase ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        if args.verbose:
            print("")
            print("~~ Training Phase ~~")

        dream_network.enable_training()

        training_batch_losses = []
        training_batch_sample_names = []

        for batch_idx, sample in enumerate(tqdm(train_data_loader)):

            this_batch_sample_names = sample["config"]["name"]
            this_batch_size = sample["image_rgb_input"].shape[0]

            if args.verbose:
                print("Processing batch index {} for training...".format(
                    batch_idx))
                print("Sample names in this training batch: {}".format(
                    this_batch_sample_names))
                print("This training batch size: {}".format(this_batch_size))

            # New unified training
            network_input_heads = []
            network_input_heads.append(sample["image_rgb_input"].cuda())

            if dream_network.network_config["architecture"][
                    "target"] == "belief_maps":
                training_labels = sample["belief_maps"].cuda()
            elif dream_network.network_config["architecture"][
                    "target"] == "keypoints":
                training_labels = sample["keypoint_projections_output"].cuda()
            else:
                assert (
                    False
                ), "Could not determine how to provide training labels to network."

            loss = dream_network.train(network_input_heads, training_labels)

            training_loss_this_batch = loss.item()
            training_batch_losses.append(training_loss_this_batch)
            if args.verbose:
                print("Training loss for this batch: {}".format(
                    training_loss_this_batch))
                print("")
            training_batch_sample_names.append(this_batch_sample_names)

        mean_training_loss_per_batch = np.mean(training_batch_losses)
        std_training_loss_per_batch = np.std(training_batch_losses)

        # Evaluation Phase ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        if args.verbose:
            print("")
            print("~~ Validation Phase ~~")

        dream_network.enable_evaluation()

        with torch.no_grad():

            valid_batch_losses = []
            valid_batch_sample_names = []

            for valid_batch_idx, valid_sample in enumerate(
                    tqdm(valid_data_loader)):

                this_valid_batch_sample_names = valid_sample["config"]["name"]
                this_valid_batch_size = valid_sample["image_rgb_input"].shape[
                    0]

                if args.verbose:
                    print("Processing batch index {} for validation...".format(
                        valid_batch_idx))
                    print("Sample names in this validation batch: {}".format(
                        this_valid_batch_sample_names))
                    print("This validation batch size: {}".format(
                        this_valid_batch_size))

                # New unified validation
                valid_network_input_heads = []
                valid_network_input_heads.append(
                    valid_sample["image_rgb_input"].cuda())

                if (dream_network.network_config["architecture"]["target"] ==
                        "belief_maps"):
                    valid_labels = valid_sample["belief_maps"].cuda()
                elif (dream_network.network_config["architecture"]["target"] ==
                      "keypoints"):
                    valid_labels = valid_sample[
                        "keypoint_projections_output"].cuda()
                else:
                    assert (
                        False
                    ), "Could not determine how to provide validation labels to network."

                valid_loss = dream_network.loss(valid_network_input_heads,
                                                valid_labels)

                valid_loss_this_batch = valid_loss.item()
                valid_batch_losses.append(valid_loss_this_batch)
                if args.verbose:
                    print("Validation loss for this batch: {}".format(
                        valid_loss_this_batch))
                    print("")
                valid_batch_sample_names.append(this_valid_batch_sample_names)

            mean_valid_loss_per_batch = np.mean(valid_batch_losses)
            std_valid_loss_per_batch = np.std(valid_batch_losses)

        # Bookkeeping and print info
        dream_network.network_config["training"]["results"][
            "epochs_trained"] += 1
        dream_network.network_config["training"]["results"][
            "training_loss"] = odict([
                ("mean", float(mean_training_loss_per_batch)),
                ("stdev", float(std_training_loss_per_batch)),
            ])
        dream_network.network_config["training"]["results"][
            "validation_loss"] = odict([
                ("mean", float(mean_valid_loss_per_batch)),
                ("stdev", float(std_valid_loss_per_batch)),
            ])
        print("Training Loss (batch-wise mean +- 1 stdev): {} +- {}".format(
            mean_training_loss_per_batch, std_training_loss_per_batch))
        print("Validation Loss (batch-wise mean +- 1 stdev): {} +- {}".format(
            mean_valid_loss_per_batch, std_valid_loss_per_batch))

        # Save network if it's better than anything trained so far
        if mean_valid_loss_per_batch < best_valid_loss:

            print("Best network result so far.")
            best_valid_loss = mean_valid_loss_per_batch

            if save_results:
                dream_network.save_network(args.output_dir,
                                           "best_network",
                                           overwrite=True)

        this_epoch_timestamp = time.time() - training_start_time
        print("This epoch took {} seconds.".format(this_epoch_timestamp -
                                                   last_epoch_timestamp))
        last_epoch_timestamp = this_epoch_timestamp
        print("")

        # Append to history
        train_log["epochs"].append(this_epoch)
        train_log["losses"].append(mean_training_loss_per_batch)
        train_log["validation_losses"].append(mean_valid_loss_per_batch)
        train_log["batch_training_losses"].append(training_batch_losses)
        train_log["batch_validation_losses"].append(valid_batch_losses)
        train_log["batch_training_sample_names"].append(
            training_batch_sample_names)
        train_log["batch_validation_sample_names"].append(
            valid_batch_sample_names)
        train_log["timestamps"].append(this_epoch_timestamp)

        if save_results:
            # Write training log so far
            epoch_training_log_path = os.path.join(
                args.output_dir, "training_log_e{}.pkl".format(this_epoch))
            with open(epoch_training_log_path, "wb") as f:
                pickle.dump(train_log, f)

            # Remove old training log
            last_epoch_training_log_path = os.path.join(
                args.output_dir, "training_log_e{}.pkl".format(e))
            if os.path.exists(last_epoch_training_log_path):
                os.remove(last_epoch_training_log_path)

            # Save this epoch
            dream_network.save_network(args.output_dir,
                                       "epoch_{}".format(this_epoch),
                                       overwrite=True)

    # Save results
    if save_results:
        # Rename the final training log instead of re-writing it
        training_log_path = os.path.join(args.output_dir, "training_log.pkl")
        os.rename(epoch_training_log_path, training_log_path)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    print("")
    print("Done.")
    print("")
    print("Total training time: {} seconds.".format(time.time() -
                                                    training_start_time))
    print("")
Exemplo n.º 22
0
    def __init__(self,
                 data_loader_type,
                 variant_loader,
                 batch_size=32,
                 shuffle=True,
                 num_workers=4,
                 sample_encoder=PileupEncoder(
                     window_size=100,
                     max_reads=100,
                     layers=[PileupEncoder.Layer.READ]),
                 label_encoder=ZygosityLabelEncoder()):
        """Construct a data loader.

        Args:
            data_loader_type : Type of data loader (ReadPileupDataLoader.Type.TRAIN/EVAL/TEST)
            variant_loader : A loader class for variants
            batch_size : batch size for data loader [32]
            shuffle : shuffle dataset [True]
            num_workers : numbers of parallel data loader threads [4]
            sample_encoder : Custom pileup encoder for variant [READ pileup encoding, window size 100]
            label_encoder : Custom label encoder for variant [ZygosityLabelEncoder] (Only applicable
            when type=TRAIN/EVAL)

        Returns:
            Instance of class.
        """
        super().__init__()
        self.data_loader_type = data_loader_type
        self.variant_loader = variant_loader
        self.sample_encoder = sample_encoder
        self.label_encoder = label_encoder

        class DatasetWrapper(TorchDataset):
            """A wrapper around Torch dataset class to generate individual samples."""
            def __init__(self, data_loader_type, sample_encoder,
                         variant_loader, label_encoder):
                """Construct a dataset wrapper.

                Args:
                    data_loader_type : Type of data loader
                    sample_encoder : Custom pileup encoder for variant
                    variant_loader : A loader class for variants
                    label_encoder : Custom label encoder for variant

                Returns:
                    Instance of class.
                """
                super().__init__()
                self.variant_loader = variant_loader
                self.label_encoder = label_encoder
                self.sample_encoder = sample_encoder
                self.data_loader_type = data_loader_type

            def __len__(self):
                return len(self.variant_loader)

            def __getitem__(self, idx):
                sample = self.variant_loader[idx]

                if self.data_loader_type == ReadPileupDataLoader.Type.TEST:
                    sample = self.sample_encoder(sample)

                    return sample
                else:
                    encoding = self.sample_encoder(sample)
                    label = self.label_encoder(sample)

                    return label, encoding

        dataset = DatasetWrapper(data_loader_type, self.sample_encoder,
                                 self.variant_loader, self.label_encoder)
        self.dataloader = TorchDataLoader(dataset,
                                          batch_size=batch_size,
                                          shuffle=shuffle,
                                          num_workers=num_workers)
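
        # Hedged usage sketch ("SomeVariantLoader" stands in for whatever
        # variant loader class the surrounding library provides):
        #
        #   loader = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
        #                                 SomeVariantLoader("sample.vcf"))
        #   for label, encoding in loader.dataloader:
        #       ...  # train on (label, encoding) batches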
Exemplo n.º 23
0
def run_experiment(
        experiment: Experiment,
        debug_pipeline: bool = False,
        develop_mode: bool = False,
        data_loader_workers: int = 1,
        cross_validation_iterations: int = 3,
        device: str = "cpu",
        develop_mode_samples: int = 10
) -> List[Result]:
    LOGGER.info("Beginning experiment: %s, %s", experiment.id(), experiment.description())

    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)
    augmentations = AugmentedCollate(experiment.augmentation_stages())

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    # File system cache with a 1 to 1 mapping to an experiment, used to cache data for multiple workers,
    # can safely be used in each cross validation run
    cache = joblib.Memory(f'./cachedir/{experiment.id()}', verbose=0)
    LOGGER.info("Initialised cache: %s", cache)

    LOGGER.info("Creating APTOSDataset for the following directories: %s", directories)
    dataset = TorchConcatDataset(
        [APTOSDataset(df, directory, pipeline, cache) for df, directory in zip(dfs, directories)]
    )
    # To facilitate software development this makes running end to end tests feasible
    if develop_mode:
        LOGGER.warning("Running in develop mode, using a fraction of the whole dataset")
        dataset, _ = torch_random_split(dataset, [develop_mode_samples, len(dataset) - develop_mode_samples])

    results = []
    for cv_iteration in range(1,  cross_validation_iterations + 1):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        with APTOSMonitor(experiment, cv_iteration) as monitor:
            LOGGER.info(f'tensorboard --logdir "{monitor._summary_writer.log_dir}"')

            test_size = experiment.test_size()
            train_ds, test_ds = torch_random_split(
                dataset,
                [round((1 - test_size) * len(dataset)), round(test_size * len(dataset))]
            )

            sampler, sampler_kwargs = experiment.sampler()
            sampler = sampler(train_ds, **sampler_kwargs)

            train_loader = TorchDataLoader(
                train_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                # Potentially an unconventional use of collate_fn, but it does make the
                # train data loader responsible for augmentations which is nice.
                collate_fn=augmentations,
                sampler=sampler
            )

            test_loader = TorchDataLoader(
                test_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
            )

            model = experiment.model(input_shape=train_ds[0][0].shape)
            print(torch_summary(model.cuda(), train_ds[0][0].shape))

            optimizer_class, optim_kwargs = experiment.optimizer()
            optimizer = optimizer_class(model.parameters(), **optim_kwargs)

            lr_scheduler, scheduler_kwargs = experiment.lr_scheduler()
            lr_scheduler = lr_scheduler(optimizer, **scheduler_kwargs)

            monitor.on_cv_start(train_ds, augmentations)

            for epoch in range(1, experiment.max_epochs() + 1):

                LOGGER.info("Epoch: %s", epoch)

                train(model, train_loader, optimizer, device, monitor)
                lr_scheduler.step()

                predictions_proba, predictions,  targets, ids, losses = test(model, test_loader, device, monitor)

                if epoch % 2 == 0:
                    checkpoint = {
                        'model': model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'experiment': experiment.state_dict()
                    }

                    checkpoint_directory = f'results/{experiment.id()}'
                    if not os.path.isdir(checkpoint_directory):
                        os.mkdir(checkpoint_directory)

                    torch.save(checkpoint, os.path.join(checkpoint_directory, f'{cv_iteration}-{epoch}-checkpoint.pth'))
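
                    # Hedged restore sketch for these checkpoints
                    # (checkpoint_path is a placeholder):
                    #   ckpt = torch.load(checkpoint_path)
                    #   model = ckpt['model']
                    #   model.load_state_dict(ckpt['state_dict'])
                    #   optimizer.load_state_dict(ckpt['optimizer'])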

            monitor.on_cv_end()

        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration": [cv_iteration for _ in range(len(targets))],
            "targets": targets,
            "predictions": predictions,
            "id_code": ids
        })

        results.append(Result(experiment, results_df))

    # Deletes content on disk... (until experiments have a unique hash this make sense)
    cache.clear()

    return results
Exemplo n.º 24
0
def evaluate(settings: Settings,
             model,
             loss_handlers,
             device,
             epoch,
             global_step,
             eval_results,
             eval_data_set,
             return_detailed=False):

    if settings.optimization_settings.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data_set)
    else:
        eval_sampler = DistributedSampler(eval_data_set)
    eval_data_loader = TorchDataLoader(
        eval_data_set,
        sampler=eval_sampler,
        batch_size=settings.optimization_settings.predict_batch_size,
        collate_fn=collate_fn)
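    # The sampler choice above: local_rank == -1 means single-process
    # evaluation, so a deterministic SequentialSampler suffices; in the
    # distributed case each rank reads only its own shard via
    # DistributedSampler.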

    model.eval()
    all_results = OrderedDict()
    logger.info("Start evaluating")

    if settings.show_step_progress:
        batch_iterator = tqdm(eval_data_loader, desc="Evaluating")
    else:
        batch_iterator = eval_data_loader

    total_loss = 0
    total_count = 0
    losses_to_write = OrderedDict()
    losses_to_write_counts = OrderedDict()
    for batch in batch_iterator:
        # if len(all_results) % 1000 == 0:
        #     logger.info("Processing example: %d" % (len(all_results)))
        for k in batch:
            batch[k] = batch[k].to(device)
        with torch.no_grad():
            predictions = model(batch, eval_data_set)
            loss_result = OrderedDict(
                (h.field, (h.weight,
                           h(True,
                             epoch,
                             global_step,
                             batch,
                             predictions,
                             return_detailed=return_detailed,
                             apply_weight=False,
                             as_numpy=True,
                             reduction='none'))) for h in loss_handlers)
            if return_detailed:
                loss_dict = OrderedDict()
                for k in loss_result:
                    weight, (summary, detailed) = loss_result[k]
                    loss_dict[k] = weight, summary
                    if k not in all_results:
                        all_results[k] = list()
                    all_results[k].extend(detailed)
            else:
                loss_dict = loss_result
            for data_key in loss_dict:
                weight, (data_loss, data_valid_count) = loss_dict[data_key]
                if data_key not in losses_to_write:
                    losses_to_write[data_key] = 0
                    losses_to_write_counts[data_key] = 0
                if data_valid_count == 0:
                    current = np.nan
                else:
                    current = np.sum(data_loss)
                    losses_to_write[data_key] += current
                    losses_to_write_counts[data_key] += data_valid_count

                if data_valid_count > 0:
                    kind = eval_data_set.response_data_kind(data_key)
                    if data_key in settings.loss_tasks or kind in settings.loss_tasks:
                        total_loss += current
                        total_count += data_valid_count

    for h in loss_handlers:
        if hasattr(h, 'after_eval_batches'):
            h.after_eval_batches(epoch, global_step)

    for k in losses_to_write:
        if losses_to_write_counts[k] == 0:
            losses_to_write[k] = np.nan
        else:
            losses_to_write[k] /= losses_to_write_counts[k]
        eval_results.add_result(k, epoch, global_step, losses_to_write[k])

    if total_count > 0:
        if len(losses_to_write) < 4:
            logger.info('eval:  {:<#8.6}, '.format(total_loss / total_count) +
                        ', '.join([
                            '{}: {:<#8.6}'.format(k, losses_to_write[k])
                            for k in losses_to_write
                        ]))
        else:
            logger.info('eval:  {}'.format(total_loss / total_count))
    else:
        # total_count == 0 here, so report NaN rather than dividing by zero
        if len(losses_to_write) < 4:
            logger.info('eval:  {}, '.format(np.nan) +
                        ', '.join([
                            '{}: {:<#8.6}'.format(k, losses_to_write[k])
                            for k in losses_to_write
                        ]))
        else:
            logger.info('eval:  {}'.format(np.nan))

    if return_detailed:
        return all_results
Exemplo n.º 25
0
    def __init__(self,
                 data_loader_type,
                 variant_loaders,
                 batch_size=32,
                 shuffle=True,
                 num_workers=4,
                 sample_encoder=PileupEncoder(
                     window_size=100,
                     max_reads=100,
                     layers=[PileupEncoder.Layer.READ]),
                 label_encoder=ZygosityLabelEncoder()):
        """Construct a data loader.

        Args:
            data_loader_type : Type of data loader (ReadPileupDataLoader.Type.TRAIN/EVAL/TEST)
            variant_loaders : A list of loader classes for variants
            batch_size : batch size for data loader [32]
            shuffle : shuffle dataset [True]
            num_workers : numbers of parallel data loader threads [4]
            sample_encoder : Custom pileup encoder for variant [READ pileup encoding, window size 100]
            label_encoder : Custom label encoder for variant [ZygosityLabelEncoder] (Only applicable
            when type=TRAIN/EVAL)

        Returns:
            Instance of class.
        """
        super().__init__()
        self.data_loader_type = data_loader_type
        self.variant_loaders = variant_loaders
        self.sample_encoder = sample_encoder
        self.label_encoder = label_encoder

        class DatasetWrapper(TorchDataset):
            """A wrapper around Torch dataset class to generate individual samples."""
            def __init__(self, data_loader_type, sample_encoder,
                         variant_loaders, label_encoder):
                """Construct a dataset wrapper.

                Args:
                    data_loader_type : Type of data loader
                    sample_encoder : Custom pileup encoder for variant
                    variant_loaders : A list of loader classes for variants
                    label_encoder : Custom label encoder for variant

                Returns:
                    Instance of class.
                """
                super().__init__()
                self.variant_loaders = variant_loaders
                self.label_encoder = label_encoder
                self.sample_encoder = sample_encoder
                self.data_loader_type = data_loader_type

                self._len = sum(
                    [len(loader) for loader in self.variant_loaders])

            def _map_idx_to_sample(self, sample_idx):
                file_idx = 0
                while (file_idx < len(self.variant_loaders)):
                    if sample_idx < len(self.variant_loaders[file_idx]):
                        return self.variant_loaders[file_idx][sample_idx]
                    else:
                        sample_idx -= len(self.variant_loaders[file_idx])
                        file_idx += 1
                raise RuntimeError(
                    "Could not map sample index to file. This is a bug.")

            def __len__(self):
                return self._len

            def __getitem__(self, idx):
                sample = self._map_idx_to_sample(idx)

                if self.data_loader_type == ReadPileupDataLoader.Type.TEST:
                    sample = self.sample_encoder(sample)

                    return sample
                else:
                    encoding = self.sample_encoder(sample)
                    label = self.label_encoder(sample)

                    return label, encoding

        dataset = DatasetWrapper(data_loader_type, self.sample_encoder,
                                 self.variant_loaders, self.label_encoder)
        self.dataloader = TorchDataLoader(dataset,
                                          batch_size=batch_size,
                                          shuffle=shuffle,
                                          num_workers=num_workers)
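
        # Worked example of the index mapping above: with three variant files
        # of lengths [5, 3, 4], global index 6 maps to file 1 at local index
        # 6 - 5 = 1, and global index 11 maps to file 2 at local index
        # 11 - 5 - 3 = 3.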
Exemplo n.º 26
0
def _cast_data(data,
               cfg: Config,
               mode: str,
               datatype: str = "",
               preproc: bool = True):
    assert mode in ("train", "test")

    kwargs = {"batch_size": cfg.batch_size}  # for torch DataLoader
    if cfg.num_workers:
        kwargs["num_workers"] = cfg.num_workers

    if isinstance(data, CsvDataset):  # TODO scvi.dataset.GeneExpression
        expression = data.X
        if preproc:
            expression = np.log(expression + 1.)
        tensors = (Tensor(expression), Tensor(data.batch_indices))
        if mode == "test":
            tensors += (Tensor(data.labels), )
        ds = TensorDataset(*tensors)
        return TorchDataLoader(ds, **kwargs)
    elif isinstance(data, (tuple, list, Array)):
        items_shape = data[0].shape
        for item in data:
            if item.shape[0] != items_shape[0]:
                raise ValueError(
                    f"Size mismatch {items_shape[0]} and {item.shape[0]}")
        '''
        if (len(data) != 2 and mode == "train") or (len(data) != 3 and mode == "test"):
            raise ValueError(
                f"Expected {datatype} data \
                with dim=2, got dim={len(data)}"
            )
        '''

        n = 3 if mode == "test" else 2
        tensors = [Tensor(data[i]) for i in range(n)]
        if preproc:
            # torch.log keeps the result a Tensor; np.log would return an
            # ndarray, which TensorDataset cannot hold
            tensors[0] = torch.log(tensors[0] + 1.)
        ds = TensorDataset(*tensors)

        return TorchDataLoader(ds, **kwargs)
    elif isinstance(data, dict):
        if "expression" in data.keys():
            expression = data["expression"]
        elif "X" in data.keys():
            expression = data["X"]
        else:
            raise KeyError(
                f"{datatype} data must contains 'X' or 'expression' key")

        if preproc:
            expression = np.log(expression + 1.)

        try:
            batches = data["batch_indices"]
            labels = data["labels"]
        except KeyError as err:
            raise KeyError(f"{datatype} data must contains {str(err)} key")

        tensors = (Tensor(expression), Tensor(batches), Tensor(labels))
        ds = TensorDataset(*tensors)
        return TorchDataLoader(ds, **kwargs)
    elif isinstance(data, AnnData):
        raise NotImplementedError()
    elif isinstance(data, TorchDataLoader):
        return data
    elif isinstance(data, TensorDataset):
        return TorchDataLoader(data, **kwargs)
    else:
        raise NotImplementedError()
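# Hedged usage sketch for _cast_data above (the Config fields are assumptions
# inferred from the attributes it accesses):
#
#   cfg = Config(batch_size=64, num_workers=2)
#   test_loader = _cast_data((X, batch_indices, labels), cfg, mode="test")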
Exemplo n.º 27
0
    def CreateLoader(self):
        return TorchDataLoader(self, **self.loader_args)
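    # Assumed contract (a sketch, not documented behavior): the owning class
    # is itself a Dataset and stores Torch DataLoader keyword arguments, e.g.
    #   self.loader_args = {"batch_size": 32, "shuffle": True}
    # so CreateLoader simply forwards itself plus those arguments.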
Exemplo n.º 28
0
def train(settings: Settings,
          output_validation_path: str,
          output_test_path: str,
          output_model_path: str,
          train_data_set: PreparedDataDataset,
          validation_data_set: PreparedDataDataset,
          test_data_set: Optional[PreparedDataDataset],
          n_gpu: int,
          device,
          load_from_path: str = None):

    output_train_curve_path = os.path.join(
        os.path.split(output_validation_path)[0], 'train_curve.npz')
    output_validation_curve_path = os.path.join(
        os.path.split(output_validation_path)[0], 'validation_curve.npz')

    num_train_steps = int(
        len(train_data_set) / settings.optimization_settings.train_batch_size /
        settings.optimization_settings.gradient_accumulation_steps *
        settings.optimization_settings.num_train_epochs)

    num_epochs_prediction_head_only_train = settings.optimization_settings.num_epochs_train_prediction_heads_only
    if num_epochs_prediction_head_only_train < 0:
        num_epochs_prediction_head_only_train = settings.optimization_settings.num_train_epochs
    start_final_epochs_prediction_head_only_train = int(
        settings.optimization_settings.num_train_epochs - settings.
        optimization_settings.num_final_epochs_train_prediction_heads_only)

    prediction_heads, token_supplemental_key_to_shape, pooled_supplemental_key_to_shape, loss_handlers = \
        setup_prediction_heads_and_losses(settings, train_data_set)

    # Prepare model
    model_loader = BertMultiPredictionHead.from_fine_tuned \
        if load_from_path is not None else BertMultiPredictionHead.from_pretrained
    model = model_loader(
        load_from_path if load_from_path is not None else settings.bert_model,
        map_location=lambda storage, loc: None
        if loc == 'cpu' else storage.cuda(device.index),
        prediction_head_settings=prediction_heads,
        token_supplemental_key_to_shape=token_supplemental_key_to_shape,
        pooled_supplemental_key_to_shape=pooled_supplemental_key_to_shape)

    if settings.optimization_settings.fp16:
        model.half()
    model.to(device)
    if settings.optimization_settings.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[settings.optimization_settings.local_rank],
            output_device=settings.optimization_settings.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if settings.optimization_settings.fp16:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif settings.optimization_settings.optimize_on_cpu:
        param_optimizer = [(n,
                            param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']

    non_prediction_head_parameters = None
    if num_epochs_prediction_head_only_train > 0 or start_final_epochs_prediction_head_only_train:
        non_prediction_head_parameters = [
            p for n, p in param_optimizer
            if not n.startswith('prediction_head.')
        ]
        for p in non_prediction_head_parameters:
            p.requires_grad = False

    # Parameter names contain 'bias'/'gamma'/'beta' only as substrings (e.g.
    # 'encoder.layer.0.output.dense.bias'), so substring matching is needed
    # to exempt them from weight decay.
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=settings.optimization_settings.learning_rate,
        warmup=settings.optimization_settings.warmup_proportion,
        t_total=num_train_steps)
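    # With BertAdam's default warmup_linear schedule, `warmup` is the fraction
    # of the t_total optimization steps spent linearly ramping the learning
    # rate up before it decays linearly to zero.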

    global_step = 0
    train_results = TaskResults()
    validation_results = TaskResults()
    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_data_set))
    # for now we set max_sequence_length so these are never split
    logger.info("  Num split examples = %d", len(train_data_set))
    logger.info("  Batch size = %d",
                settings.optimization_settings.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if settings.optimization_settings.local_rank == -1:
        train_sampler = RandomSampler(train_data_set)
    else:
        train_sampler = DistributedSampler(train_data_set)
    train_data_loader = TorchDataLoader(
        train_data_set,
        sampler=train_sampler,
        batch_size=settings.optimization_settings.train_batch_size,
        collate_fn=collate_fn)

    if settings.show_epoch_progress:
        epoch_range = trange(int(
            settings.optimization_settings.num_train_epochs),
                             desc="Epoch")
    else:
        epoch_range = range(
            int(settings.optimization_settings.num_train_epochs))

    for index_epoch in epoch_range:

        logger.info('Starting epoch {}'.format(index_epoch))

        model.train()

        if index_epoch == start_final_epochs_prediction_head_only_train:
            for p in non_prediction_head_parameters:
                p.requires_grad = False
        elif index_epoch == num_epochs_prediction_head_only_train:
            for p in non_prediction_head_parameters:
                p.requires_grad = True

        if settings.show_step_progress:
            batch_iterator = tqdm(train_data_loader, desc="Iteration")
        else:
            batch_iterator = train_data_loader

        for step, batch in enumerate(batch_iterator):
            if n_gpu == 1:
                for k in batch:
                    batch[k] = batch[k].to(device)
            predictions = model(batch, train_data_set)
            loss_dict = OrderedDict((h.field, (h.weight,
                                               h(False,
                                                 index_epoch,
                                                 global_step,
                                                 batch,
                                                 predictions,
                                                 apply_weight=False)))
                                    for h in loss_handlers)

            # free up memory
            del predictions
            del batch

            loss = None
            losses_to_write = OrderedDict()
            for data_key in loss_dict:
                weight, data_loss = loss_dict[data_key]
                no_valid_inputs = isinstance(
                    data_loss, str) and data_loss == 'no_valid_inputs'
                kind = train_data_set.response_data_kind(data_key)
                if (data_key in settings.loss_tasks or kind
                        in settings.loss_tasks) and not no_valid_inputs:
                    current = weight * data_loss
                    losses_to_write[
                        data_key] = np.nan if no_valid_inputs else data_loss.detach(
                        ).cpu().numpy().item()
                    if loss is None:
                        loss = current
                    else:
                        loss += current
                train_result = np.nan if no_valid_inputs else data_loss.detach(
                ).cpu().numpy().item()
                train_results.add_result(data_key, index_epoch, global_step,
                                         train_result)

            del loss_dict

            if loss is not None:
                if len(losses_to_write) < 4:
                    logger.info(
                        'train: {:<#8.6}, '.format(loss.item()) + ', '.join([
                            '{}: {:<#8.6}'.format(k, losses_to_write[k])
                            for k in losses_to_write
                        ]))
                else:
                    logger.info('train: {}'.format(loss.item()))
                if n_gpu > 1:  # hmm - not sure how this is supposed to work
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if settings.optimization_settings.fp16 and settings.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * settings.loss_scale
                if settings.optimization_settings.gradient_accumulation_steps > 1:
                    loss = loss / settings.optimization_settings.gradient_accumulation_steps
                loss.backward()

            if (
                    step + 1
            ) % settings.optimization_settings.gradient_accumulation_steps == 0:
                if settings.optimization_settings.fp16 or settings.optimization_settings.optimize_on_cpu:
                    if settings.optimization_settings.fp16 and settings.loss_scale != 1.0:
                        # scale down gradients for fp16 training
                        for param in model.parameters():
                            param.grad.data = param.grad.data / settings.loss_scale
                    is_nan = set_optimizer_params_grad(
                        param_optimizer,
                        model.named_parameters(),
                        test_nan=True)
                    if is_nan:
                        logger.info(
                            "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                        )
                        settings.loss_scale = settings.loss_scale / 2
                        model.zero_grad()
                        continue
                    optimizer.step()
                    copy_optimizer_params_to_model(model.named_parameters(),
                                                   param_optimizer)
                else:
                    optimizer.step()
                model.zero_grad()
                global_step += 1

            # we're being super aggressive about releasing memory here because
            # we're right on the edge of fitting in gpu
            del loss
            gc.collect()
            torch.cuda.empty_cache()

        write_loss_curve(output_train_curve_path, train_results)
        if len(validation_data_set) > 0:
            evaluate(settings, model, loss_handlers, device, index_epoch,
                     global_step, validation_results, validation_data_set)
            write_loss_curve(output_validation_curve_path, validation_results)

    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(validation_data_set))
    logger.info("  Num split examples = %d", len(validation_data_set))
    logger.info("  Batch size = %d",
                settings.optimization_settings.predict_batch_size)

    if len(validation_data_set) > 0:
        all_validation = evaluate(
            settings,
            model,
            loss_handlers,
            device,
            settings.optimization_settings.num_train_epochs - 1,
            global_step,
            TaskResults(),
            validation_data_set,
            return_detailed=True)
    else:
        all_validation = {}

    if test_data_set is not None and len(test_data_set) > 0:
        all_test = evaluate(settings,
                            model,
                            loss_handlers,
                            device,
                            settings.optimization_settings.num_train_epochs -
                            1,
                            global_step,
                            TaskResults(),
                            test_data_set,
                            return_detailed=True)
    else:
        all_test = {}

    write_predictions(output_validation_path, all_validation,
                      validation_data_set, settings)
    write_predictions(output_test_path, all_test, test_data_set, settings)

    # Save a trained model and the associated configuration
    if not os.path.exists(output_model_path):
        os.makedirs(output_model_path)
    model.save(output_model_path)

    # clean up after we're done to try to release CUDA resources to other people when there are no more tasks
    gc.collect()
    torch.cuda.empty_cache()
Exemplo n.º 29
0
    def __init__(
        self,
        hdf_file,
        batch_size=32,
        shuffle=True,
        num_workers=4,
        tensor_keys=["encodings", "labels"],
        tensor_dtypes=[torch.float32, torch.int64],
        tensor_dims=[('B', 'C', 'W', 'H'), ('B',)],
        tensor_neural_types=[
            ReadPileupNeuralType(),
            VariantZygosityNeuralType()
        ],
    ):
        """Constructor for data loader.

        Args:
            hdf_file : Path to HDF file with pileup encodings
            batch_size : batch size for data loader [32]
            shuffle : shuffle dataset [True]
            num_workers : numbers of parallel data loader threads [4]
            tensor_keys : List with keys of tensors to load. ["encodings", "labels"]
            tensor_dtypes : torch data types for tensor. [torch.float32, torch.int64]
            tensor_dims : NeuralModule axes for tensors. [('B', 'C', 'W', 'H'), ('B',)]
            tensor_neural_types : NeuralTypes for tensors. [ReadPileupNeuralType(), VariantZygosityNeuralType()]

        Returns:
            Instance of class.
        """
        super().__init__()
        self.hdf_file = hdf_file
        self.tensor_keys = tensor_keys
        self.tensor_dtypes = tensor_dtypes
        self.tensor_dims = tensor_dims
        self.tensor_neural_types = tensor_neural_types

        class DatasetWrapper(TorchDataset):
            """A wrapper around Torch dataset class to generate individual samples."""
            def __init__(self, hdf_file, tensor_dtypes, tensor_keys):
                """Constructor for dataset wrapper.

                Args:
                    hdf_file : Path to HDF5 file.
                    tensor_keys : List with keys of tensors to load.
                    tensor_dtypes : torch data types for tensor.

                Returns:
                    Instance of class.
                """
                super().__init__()
                self.hdf_file = hdf_file
                self.tensor_dtypes = tensor_dtypes
                self.tensor_keys = tensor_keys
                with h5py.File(self.hdf_file, "r") as hdf:
                    self.len = len(hdf.get(self.tensor_keys[0]))
                self._h5_gen = None

            def __len__(self):
                return self.len

            def __getitem__(self, idx):
                # Using generator to keep the file handle to HDF5
                # file open during the life of the process.
                if self._h5_gen is None:
                    self._h5_gen = self._get_generator()
                    next(self._h5_gen)
                return self._h5_gen.send(idx)

            def _get_generator(self):
                hrecs = {}
                hdf = h5py.File(self.hdf_file, "r")
                for key in hdf.keys():
                    hrecs[key] = hdf.get(key)

                idx = yield
                while True:
                    outputs = []
                    for i, key in enumerate(self.tensor_keys):
                        data = hrecs[key]
                        tensor = torch.tensor(data[idx],
                                              dtype=self.tensor_dtypes[i])
                        outputs.append(tensor)
                    idx = yield tuple(outputs)

        dataset = DatasetWrapper(self.hdf_file, self.tensor_dtypes,
                                 self.tensor_keys)

        sampler = None
        if self._placement == DeviceType.AllGpu:
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset)

        self.dataloader = TorchDataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle if sampler is None else False,
            num_workers=num_workers,
            pin_memory=True,
            sampler=sampler)
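
        # The generator above keeps one h5py file handle open per worker
        # process. A standalone sketch of the same pattern (file and key
        # names are placeholders):
        #
        #   def persistent_reader(path, key):
        #       data = h5py.File(path, "r")[key]
        #       idx = yield
        #       while True:
        #           idx = yield data[idx]
        #
        #   gen = persistent_reader("pileups.hdf5", "encodings")
        #   next(gen)             # prime the generator
        #   sample = gen.send(3)  # read item 3 through the open handle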
Exemplo n.º 30
0
    def __init__(self,
                 data_loader_type,
                 hdf_file,
                 batch_size=32,
                 shuffle=True,
                 num_workers=4,
                 encoding_dtype=torch.float32,
                 label_dtype=torch.int64,
                 hdf_encoding_key="encodings",
                 hdf_label_key="labels"):
        """Constructor for data loader.

        Args:
            data_loader_type : Type of data loader (HDFPileupDataLoader.Type.TRAIN/EVAL/TEST)
            hdf_file : Path to HDF file with pileup encodings
            batch_size : batch size for data loader [32]
            shuffle : shuffle dataset [True]
            num_workers : numbers of parallel data loader threads [4]
            encoding_dtype : Torch data type for encoding [torch.float32]
            label_dtype : Torch data type for label [torch.int64]
            hdf_encoding_key : HDF5 key for encodings. [encodings]
            hdf_label_key : HDF5 key for labels. [labels]

        Returns:
            Instance of class.
        """
        super().__init__()
        self.data_loader_type = data_loader_type
        self.hdf_file = hdf_file

        class DatasetWrapper(TorchDataset):
            """A wrapper around Torch dataset class to generate individual samples."""
            def __init__(self, data_loader_type, hdf_file, encoding_dtype,
                         label_dtype, hdf_encoding_key, hdf_label_key):
                """Constructor for dataset wrapper.

                Args:
                    data_loader_type : Type of data loader.
                    hdf_file : Path to HDF5 file.
                    encoding_dtype : Torch type for encoding.
                    label_dtype : Torch type for label.
                    hdf_encoding_key : HDF5 key for encodings.
                    hdf_label_key : HDF5 key for labels.

                Returns:
                    Instance of class.
                """
                super().__init__()
                self.data_loader_type = data_loader_type
                self.hdf_file = hdf_file
                self.encoding_dtype = encoding_dtype
                self.label_dtype = label_dtype
                self.hdf_encoding_key = hdf_encoding_key
                self.hdf_label_key = hdf_label_key

            def __len__(self):
                # open read-only and close promptly via a context manager
                with h5py.File(self.hdf_file, "r") as hdf:
                    return len(hdf.get(self.hdf_encoding_key))

            def __getitem__(self, idx):
                hdf = h5py.File(self.hdf_file, "r")

                if self.data_loader_type == HDFPileupDataLoader.Type.TEST:
                    encoding = hdf.get(self.hdf_encoding_key)[idx]

                    return torch.tensor(encoding, dtype=self.encoding_dtype)
                else:
                    encoding_data = hdf.get(self.hdf_encoding_key)
                    label_data = hdf.get(self.hdf_label_key)

                    encoding = torch.tensor(encoding_data[idx],
                                            dtype=self.encoding_dtype)
                    label = torch.tensor(label_data[idx],
                                         dtype=self.label_dtype)

                    return label, encoding

        dataset = DatasetWrapper(data_loader_type, self.hdf_file,
                                 encoding_dtype, label_dtype, hdf_encoding_key,
                                 hdf_label_key)
        self.dataloader = TorchDataLoader(dataset,
                                          batch_size=batch_size,
                                          shuffle=shuffle,
                                          num_workers=num_workers)
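
        # Hedged usage sketch (the file name is a placeholder). Unlike the
        # generator approach of the previous example, this wrapper reopens
        # the HDF5 file on every access:
        #
        #   loader = HDFPileupDataLoader(HDFPileupDataLoader.Type.TRAIN,
        #                                "train.hdf5")
        #   for label, encoding in loader.dataloader:
        #       ...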