def load_dataset_d7(dataset_dir, batch_size, test_batch_size, seq_len, horizon, **kwargs):
    """Build the PeMSD7 train/val/test datasets together with their loaders.

    The validation and test datasets are constructed with the ``mean`` and
    ``std`` attributes of the training dataset. Extra keyword arguments are
    accepted and ignored.

    Returns:
        dict with the keys ``train``, ``val``, ``test`` (datasets),
        ``train_loader``, ``val_loader``, ``test_loader`` (loaders, each with
        a ``num_batch`` attribute set to ``len(loader)``) and ``scaler``
        (the training dataset's ``scaler`` attribute).
    """
    train_set = PeMSD7(dataset_dir, 'train', seq_len=seq_len, horizon=horizon)
    mean, std = train_set.mean, train_set.std

    data = {
        'train': train_set,
        'val': PeMSD7(dataset_dir, 'val', mean, std, seq_len, horizon),
        'test': PeMSD7(dataset_dir, 'test', mean, std, seq_len, horizon),
    }

    # (split, batch size, shuffle) -- only the training loader is shuffled.
    loader_specs = (
        ('train', batch_size, True),
        ('val', test_batch_size, False),
        ('test', test_batch_size, False),
    )
    for split_name, split_batch_size, shuffle in loader_specs:
        loader = TorchDataLoader(data[split_name],
                                 batch_size=split_batch_size,
                                 shuffle=shuffle)
        loader.num_batch = len(loader)
        data[f'{split_name}_loader'] = loader

    data['scaler'] = train_set.scaler
    return data
def run_experiment(experiment: Experiment, debug_pipeline: bool = False) -> List[Result]:
    """Train and evaluate ``experiment`` over repeated random train/test splits.

    For each of ``CROSS_VALIDATION_ITERATIONS`` iterations the dataset is
    randomly split, a fresh model and optimizer are created, the model is
    trained for ``experiment.max_epochs()`` epochs and the predictions of the
    last epoch are stored in a ``Result``.

    Args:
        experiment: experiment definition providing the pipeline stages, data
            frames, directories, model, optimizer and hyper-parameters.
        debug_pipeline: forwarded to ``Pipeline`` as ``debug``.

    Returns:
        One ``Result`` per cross validation iteration.
    """
    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    dataset = TorchConcatDataset(
        [APTOSDataset(df, directory, pipeline) for df, directory in zip(dfs, directories)]
    )

    if DEVELOP_MODE:
        # Keep only a small random subset so end-to-end runs stay fast.
        dataset, _ = random_split(
            dataset, [DEVELOP_MODE_SAMPLES, len(dataset) - DEVELOP_MODE_SAMPLES])

    results = []
    for cv_iteration in range(1, CROSS_VALIDATION_ITERATIONS + 1):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        test_size = experiment.test_size()
        # Derive the train length from the test length: rounding both sides
        # independently can produce lengths that do not sum to len(dataset)
        # (e.g. 5 samples with test_size=0.5 -> 2 + 2), which random_split
        # rejects.
        n_test = round(test_size * len(dataset))
        n_train = len(dataset) - n_test
        train_ds, test_ds = random_split(dataset, [n_train, n_test])

        train_loader = TorchDataLoader(
            train_ds,
            batch_size=experiment.batch_size(),
            num_workers=DATA_LOADER_WORKERS,
        )
        test_loader = TorchDataLoader(
            test_ds,
            batch_size=experiment.batch_size(),
            num_workers=DATA_LOADER_WORKERS,
        )

        model = experiment.model(input_shape=train_ds[0][0].shape)

        optimizer_class, optim_kwargs = experiment.optimizer()
        optimizer = optimizer_class(model.parameters(), **optim_kwargs)

        # NOTE(review): this frame is handed to Result but never filled in here.
        metric_df = pd.DataFrame(columns=["experiment_id", "epoch", "test_loss", "test_accuracy"])

        for epoch in range(1, experiment.max_epochs() + 1):
            LOGGER.info("Epoch: %s", epoch)

            # NOTE(review): meaning of the leading literal 1 is defined by
            # train() elsewhere -- confirm.
            train(1, model, train_loader, optimizer, epoch)
            predictions_proba, predictions, targets = test(model, test_loader)

        # Only the predictions of the final epoch are recorded.
        predictions = predictions.tolist()
        targets = targets.tolist()
        n_rows = len(targets)

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id()] * n_rows,
            "cross_validation_iteration": [cv_iteration] * n_rows,
            "targets": targets,
            "predictions": predictions,
        })

        results.append(Result(experiment, metric_df, results_df))

    return results
def setup_data_loaders(processed, log):
    """Create the train/valid/test loaders over the melted parquet series.

    Train and validation both read ``sales_series_melt.parquet`` and differ
    only in the pattern passed as third argument to ``ParquetIterableDataset``;
    the test set reads ``test_series_melt.parquet`` without a pattern.

    Returns:
        OrderedDict mapping ``"train"``, ``"valid"`` and ``"test"`` (in that
        order) to loaders with batch size 128, no shuffling, no workers and
        ``drop_last=False``.
    """
    batch_size = 128
    sales_url = f'file:{processed}/sales_series_melt.parquet'
    test_url = f'file:{processed}/test_series_melt.parquet'

    # All datasets are created before any loader, as in the original ordering.
    named_datasets = [
        ("train", ParquetIterableDataset(sales_url, log, '.*part.(?!1).*')),
        ("valid", ParquetIterableDataset(sales_url, log, '.*part.1.*')),
        ("test", ParquetIterableDataset(test_url, log)),
    ]

    loaders = OrderedDict()
    for split_name, split_dataset in named_datasets:
        loaders[split_name] = TorchDataLoader(split_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              drop_last=False)
    return loaders
def test_data_loading_augment():
    """Visual smoke test: show the first batch of the augmented triplet dataset.

    Prints basic statistics of the observation tensor, then displays a 3-column
    image grid for the ``observation``, ``positive`` and ``negative`` entries
    of the first batch (one figure each, in that order).
    """
    target = "cone"
    data_dir = f"data/datasets/debug_data/{target}"
    dataset_kwargs = dict(
        hdf5_file=f"{data_dir}/data.hdf5",
        json_file=f"{data_dir}/data.json",
        background_images_directory="data/datasets/dtd",
        blur=True,
        fg_augmentation=True,
    )
    dataset = fgbg.AugmentedTripletDataset(**dataset_kwargs)
    dataloader = TorchDataLoader(dataset, 9, shuffle=True)

    for batch in dataloader:
        observation = batch["observation"]
        print(f'mean {observation.mean()}, std {observation.std()}, '
              f'min {observation.min()}, max {observation.max()}')

        for key in ("observation", "positive", "negative"):
            grid = torchvision.utils.make_grid(batch[key], nrow=3)
            # make_grid output is permuted with (1, 2, 0) before plotting.
            plt.imshow(grid.permute(1, 2, 0).numpy())
            plt.title(key)
            plt.show()

        # Only the first batch is inspected.
        break
def gen_data_loaders(self, batch_size, train=True):
    """Return three shuffled loaders over the train (or test) split.

    Each loader uses half of ``batch_size`` (integer division) and its own
    ``Batch.collate([])`` collate function.
    """
    source = self.train if train else self.test
    half_batch = batch_size // 2

    loaders = []
    for _ in range(3):
        loaders.append(
            TorchDataLoader(source,
                            collate_fn=Batch.collate([]),
                            batch_size=half_batch,
                            shuffle=True))
    return loaders
def DataLoader(dataset,
               batch_size=1,
               shuffle=False,
               sampler=None,
               batch_sampler=None,
               num_workers=0,
               collate_fn=default_collate,
               pin_memory=False,
               drop_last=False,
               timeout=0,
               worker_init_fn=None,
               prefetch_factor=2,
               persistent_workers=False):
    """Thin keyword wrapper around ``torch.utils.data.DataLoader``.

    All arguments are forwarded unchanged, except that ``prefetch_factor`` is
    only forwarded when it is relevant: newer PyTorch releases raise a
    ``ValueError`` when ``prefetch_factor`` is given while ``num_workers`` is
    0, which made this wrapper fail with its own default arguments.

    Returns:
        The constructed ``torch.utils.data.DataLoader``.
    """
    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        sampler=sampler,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=pin_memory,
        drop_last=drop_last,
        timeout=timeout,
        worker_init_fn=worker_init_fn,
        persistent_workers=persistent_workers,
    )
    # Forward prefetch_factor when worker processes are used, or when the
    # caller explicitly chose a non-default value (so that PyTorch can still
    # report a misconfiguration). With num_workers == 0 and the default value
    # the argument is left to PyTorch's own default.
    if num_workers > 0 or prefetch_factor != 2:
        loader_kwargs["prefetch_factor"] = prefetch_factor

    return TorchDataLoader(dataset, **loader_kwargs)
def build(self, train_mode: bool, max_items: int, validation_ratio=0.0):
    """
    Build the main and validation dataloaders for the requested split.

    ``self.data_folder`` is overwritten with its ``train`` or ``test``
    sub-folder. In test mode ``validation_ratio`` is forced to 0. When
    ``max_items`` is positive and smaller than the dataset, the dataset's
    ``images`` and ``labels`` are truncated to the first ``max_items`` entries.

    Returns a tuple (main_dl, val_dl); both are also stored on ``self``.
    Raises ValueError if ``validation_ratio`` lies outside [0, 1].
    """
    if validation_ratio > 1 or validation_ratio < 0:
        raise ValueError("validation_ratio must be in [0,1] interval")

    # pick the sub-folder; the test split never gets a validation part
    sub_folder = 'train' if train_mode else 'test'
    if not train_mode:
        validation_ratio = 0.0
    self.data_folder = os.path.join(self.data_folder, sub_folder)

    dataset = CustomDataset(self.data_folder, self.transformation)

    # optionally cap the number of items
    if (len(dataset) > max_items) and (max_items > 0):
        dataset.images = dataset.images[:max_items]
        dataset.labels = dataset.labels[:max_items]

    # random split into main / validation subsets
    total = len(dataset)
    main_len = int(total - (total * validation_ratio))
    val_len = total - main_len
    main_subset, val_subset = torch.utils.data.random_split(
        dataset, [main_len, val_len])

    # incomplete trailing batches are dropped for both loaders
    self.main_dl = TorchDataLoader(main_subset,
                                   batch_size=self.batch_size,
                                   shuffle=self.shuffle,
                                   drop_last=True)
    self.val_dl = TorchDataLoader(val_subset,
                                  batch_size=self.batch_size,
                                  shuffle=False,
                                  drop_last=True)
    return (self.main_dl, self.val_dl)
def main(
        checkpoint_file_path: str,
        data_directory: str = "../input/aptos2019-blindness-detection/test_images",
        data_frame: str = "../input/aptos2019-blindness-detection/test.csv",
        sample_submission: str = "../input/aptos2019-blindness-detection/sample_submission.csv"):
    """Run inference with a saved checkpoint and write ``submission.csv``.

    Args:
        checkpoint_file_path: checkpoint holding the keys ``"experiment"``,
            ``"model"`` and ``"state_dict"``.
        data_directory: directory with the images to predict.
        data_frame: CSV describing the images to predict.
        sample_submission: CSV whose ``diagnosis`` column is replaced by the
            predictions. (The default previously pointed at the misspelled
            ``aptos2019-blindess-detection`` directory.)
    """
    print("Beginning submission")

    # Always load the first iteration from cross validation? Should really re-train on the whole dataset
    # Inference below is pinned to the CPU, so tensors are mapped there on
    # load; this also lets GPU-saved checkpoints load on CPU-only machines.
    # NOTE(review): the checkpoint stores a pickled model object -- only load
    # checkpoints from trusted sources.
    checkpoint = torch.load(checkpoint_file_path, map_location="cpu")
    print("Loaded checkpoint")

    experiment_state_dict = checkpoint["experiment"]
    experiment_state_dict.update(train_test_directories=[data_directory],
                                 train_test_data_frames=[data_frame])

    experiment = Experiment.from_dict(experiment_state_dict)
    # print() does not interpolate logging-style "%s" arguments.
    print(f"Initialised experiment: {experiment}")

    pipeline = Pipeline(experiment.pipeline_stages())
    print("Initialised pipeline")

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    dataset = TorchConcatDataset([
        APTOSSubmissionDataset(df, directory, pipeline)
        for df, directory in zip(dfs, directories)
    ])
    print("Initialised dataset")

    loader = TorchDataLoader(
        dataset,
        batch_size=experiment.batch_size(),
    )
    print("Initialised loader")

    model = checkpoint["model"]
    model.load_state_dict(checkpoint['state_dict'])
    print("Initialised model")

    print("Beginning inference")
    # Only the hard predictions are written out; probabilities and ids are
    # returned by inference() but not used here.
    _, predictions, _ = inference(model, loader, "cpu")

    sample = pd.read_csv(sample_submission)
    sample.diagnosis = predictions
    sample.to_csv("submission.csv", index=False)
def create_dataloader(cfg, mode=None, domain=None, name=None, authority=None, train_type=None, items=None):
    """Create the dataset, its sampler and the data loader for ``mode``.

    :param cfg: configuration object; ``cfg.dataset.train.source`` /
        ``.target``, ``cfg.dataloader.train`` / ``.test`` and
        ``cfg.dataloader.num_workers`` are read here.
    :param mode: ``'train'`` or ``'test'``; selects the loader settings. Any
        other value (including the default ``None``) raises ``KeyError``.
    :param domain: ``'source'`` or ``'target'``; in train mode it selects which
        training dataset name overrides ``name``.
    :param name: dataset name (replaced in train mode, see ``domain``).
    :param authority: forwarded to ``create_dataset``.
    :param train_type: forwarded to ``create_dataset``.
    :param items: forwarded to ``create_dataset``.
    :return: the configured data loader.
    """
    train_name_factory = {
        'source': cfg.dataset.train.source,
        'target': cfg.dataset.train.target,
    }
    # Compare by value: identity comparison ("is") against a string literal
    # depends on interning and is flagged with a SyntaxWarning by Python 3.8+.
    if mode == 'train':
        name = train_name_factory[domain].name

    dataset = create_dataset(cfg, mode=mode, domain=domain, name=name,
                             authority=authority, train_type=train_type,
                             items=items)

    sampler_factory = {
        'train': cfg.dataloader.train,
        'test': cfg.dataloader.test,
    }
    loader_cfg = sampler_factory[mode]
    sampler = get_sampler(cfg, dataset, loader_cfg)

    data_loader = TorchDataLoader(
        dataset=dataset,
        batch_size=loader_cfg.batch_size,
        sampler=sampler,
        num_workers=cfg.dataloader.num_workers,
        pin_memory=True,
        drop_last=loader_cfg.drop_last,
    )
    return data_loader
def create_a_loader(dataset):
    """Wrap ``dataset`` in a loader configured by the surrounding ``cfg``.

    ``cfg.batch_type`` selects the sampler: ``'seq'`` (sequential),
    ``'random'`` (random) or ``'pk'`` (``RandomIdentitySampler`` with
    ``cfg.pk.k``). Any other value raises ``NotImplementedError``.
    """
    batch_type = cfg.batch_type
    if batch_type not in ('seq', 'random', 'pk'):
        raise NotImplementedError

    if batch_type == 'pk':
        sampler = RandomIdentitySampler(dataset, cfg.pk.k)
    else:
        sampler_cls = SequentialSampler if batch_type == 'seq' else RandomSampler
        sampler = sampler_cls(dataset)

    return TorchDataLoader(
        dataset,
        batch_size=cfg.batch_size,
        sampler=sampler,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=cfg.drop_last,
    )
def test_data_loading_clean():
    """Visual smoke test: show the first batch of the clean "gate" dataset.

    Prints mean/std/min/max of the observation tensor and displays the
    observations as a 3-column image grid.
    """
    target = "gate"
    data_dir = f"data/datasets/gate_cone_line/{target}"
    dataset = fgbg.CleanDataset(hdf5_file=f"{data_dir}/data.hdf5",
                                json_file=f"{data_dir}/data.json")
    dataloader = TorchDataLoader(dataset, 9, shuffle=True)

    for batch in dataloader:
        observation = batch["observation"]
        stats = ", ".join(
            f"{stat_name} {getattr(observation, stat_name)()}"
            for stat_name in ("mean", "std", "min", "max"))
        print(stats)

        image_grid = torchvision.utils.make_grid(observation, nrow=3)
        plt.imshow(image_grid.permute(1, 2, 0).numpy())
        plt.show()
        # Only the first batch is inspected.
        break
def test_data_loading_real_images():
    """Visual smoke test: show the first batch of the real "gate" images.

    Prints mean/std/min/max of the observation tensor and displays the
    observations as a 3-column image grid.
    """
    dataset_kwargs = dict(
        target="gate",
        dir_name="data/datasets/bebop_real",
        input_size=(3, 200, 200),
        output_size=(200, 200),
    )
    dataset = fgbg.ImagesDataset(**dataset_kwargs)
    dataloader = TorchDataLoader(dataset, 9, shuffle=True)

    for batch in dataloader:
        observation = batch["observation"]
        stats = ", ".join(
            f"{stat_name} {getattr(observation, stat_name)()}"
            for stat_name in ("mean", "std", "min", "max"))
        print(stats)

        image_grid = torchvision.utils.make_grid(observation, nrow=3)
        plt.imshow(image_grid.permute(1, 2, 0).numpy())
        plt.show()
        # Only the first batch is inspected.
        break
def gen_data_loaders(self, size, batch_size, train=True, use_distributed_sampling=False):
    """Create two loaders over freshly generated synthetic graph datasets.

    Each dataset is requested from ``combined_syn.get_dataset`` with
    ``size // 2`` as size argument and the sizes ``min_size + 1 .. max_size``;
    each loader uses ``batch_size // 2``. With ``use_distributed_sampling`` a
    ``DistributedSampler`` configured from Horovod's size and rank is attached.

    Returns:
        ``[loader, loader, placeholder]`` where ``placeholder`` is a list of
        ``size // batch_size`` ``None`` entries. ``train`` is not used here.
    """
    half_size = size // 2
    half_batch = batch_size // 2

    loaders = []
    for _ in range(2):
        dataset = combined_syn.get_dataset(
            "graph", half_size,
            np.arange(self.min_size + 1, self.max_size + 1))

        if use_distributed_sampling:
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset, num_replicas=hvd.size(), rank=hvd.rank())
        else:
            sampler = None

        loader = TorchDataLoader(dataset,
                                 collate_fn=Batch.collate([]),
                                 batch_size=half_batch,
                                 sampler=sampler,
                                 shuffle=False)
        loaders.append(loader)

    loaders.append([None] * (size // batch_size))
    return loaders
def create_dataloader(cfg, dataset_cfg, samples=None):
    """Create a dataset from ``dataset_cfg`` and wrap it in a loader.

    ``cfg`` carries the dataloader settings, ``dataset_cfg`` the dataset
    settings. ``cfg.batch_type`` selects the sampler (``'seq'``, ``'random'``
    or ``'pk'``); any other value raises ``NotImplementedError``.
    """
    dataset = create_dataset(dataset_cfg, samples=samples)  # e.g. Market1501

    batch_type = cfg.batch_type
    if batch_type == 'pk':
        sampler = RandomIdentitySampler(dataset, cfg.pk.k)
    elif batch_type == 'random':
        # training ends up here
        sampler = RandomSampler(dataset)
    elif batch_type == 'seq':
        # test: take the dataset elements in order
        sampler = SequentialSampler(dataset)
    else:
        raise NotImplementedError

    loader_options = dict(
        batch_size=cfg.batch_size,
        sampler=sampler,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=cfg.drop_last,
    )
    return TorchDataLoader(dataset, **loader_options)
def gen_data_loaders(self, size, batch_size, train=True, use_distributed_sampling=False):
    """Create two loaders over randomly sampled neighbourhood subgraphs.

    For each loader ``size // 2`` neighbourhoods are drawn with
    ``utils.sample_neigh`` from the train (or test) set, using a random size
    in ``[min_size, max_size]``, and turned into a ``GraphDataset``. Each
    loader uses ``batch_size // 2``.

    Returns:
        ``[loader, loader, placeholder]`` where ``placeholder`` is a list of
        ``size // batch_size`` ``None`` entries. ``use_distributed_sampling``
        is not used here.
    """
    def sample_subgraph():
        # Draw one neighbourhood and return the induced subgraph.
        graph, neigh = utils.sample_neigh(
            self.train_set if train else self.test_set,
            random.randint(self.min_size, self.max_size))
        return graph.subgraph(neigh)

    half_batch = batch_size // 2
    loaders = []
    for _ in range(2):
        subgraphs = [sample_subgraph() for _ in range(size // 2)]
        dataset = GraphDataset(GraphDataset.list_to_graphs(subgraphs))
        loaders.append(
            TorchDataLoader(dataset,
                            collate_fn=Batch.collate([]),
                            batch_size=half_batch,
                            sampler=None,
                            shuffle=False))

    loaders.append([None] * (size // batch_size))
    return loaders
def __init__(self, purposes=None, **kwargs):
    """Build one data loader per purpose from one or more dataset configs.

    Args:
        purposes: ordered purposes to build loaders for; defaults to
            ``["train", "val", "test"]``. Any iterable is accepted and copied
            into a list.
        **kwargs: configuration. Must contain ``"datasets"`` (list of dataset
            configs) or ``"dataset"`` (a single config), plus ``"batch_size"``
            and ``"num_workers"``. Optional ``"shuffle"`` (default ``True``).

    Each dataset config provides ``"type"`` and ``"path"`` and optionally
    ``"sampling_weights"``, ``"transforms"`` and ``"split"``.

    Raises:
        ValueError: if no dataset config is given, if a purpose ends up
            without any dataset, or if non-unit sampling weights are combined
            with sequential sampling.
    """
    super().__init__()
    # A fresh list per call avoids sharing a mutable default between
    # instances; list() also lets callers pass a tuple, which would otherwise
    # fail the equality check against list(split.keys()) below.
    purposes = ["train", "val", "test"] if purposes is None else list(purposes)

    self.config = kwargs
    if "datasets" in self.config:
        self.datasets_config = self.config["datasets"]
    elif "dataset" in self.config:
        self.datasets_config = [self.config["dataset"]]
    else:
        raise ValueError(
            "config must contain either a 'datasets' or a 'dataset' entry")

    subsets = {purpose: [] for purpose in purposes}
    sampling_weights = {purpose: [] for purpose in purposes}

    for dataset_config in self.datasets_config:
        dataset_type = DatasetEnum(dataset_config["type"])
        dataset_sampling_weights = dataset_config.get(
            "sampling_weights", {
                "train": 1,
                "val": 1,
                "test": 1
            })
        assert all(purpose in dataset_sampling_weights
                   for purpose in purposes)

        # One Transformer per purpose, or None if nothing is configured.
        transforms_config = dataset_config.get("transforms", {})
        transforms = {}
        for purpose in purposes:
            purpose_transforms_config = transforms_config.get(purpose, [])
            if len(purpose_transforms_config) > 0:
                transforms[purpose] = Transformer(
                    purpose, purpose_transforms_config)
            else:
                transforms[purpose] = None

        if "split" in dataset_config:
            split = dataset_config["split"]
            assert dataset_type != DatasetEnum.HDF5
            assert list(split.keys()) == purposes

            # Normalise the split so that it sums to 1. This updates the
            # config dict in place; it is kept that way because the same dict
            # is handed to the dataset constructors below.
            total_split_sum = np.array(list(split.values())).sum().item()
            for purpose in split.keys():
                split[purpose] /= total_split_sum

            dataset = DATASETS[dataset_type](config=dataset_config,
                                             dataset_path=pathlib.Path(
                                                 dataset_config["path"]))
            indices = np.arange(len(dataset))
            if self.config.get("shuffle", True):
                # Fixed seed: the index permutation is the same on every run.
                rng = np.random.RandomState(seed=0)
                rng.shuffle(indices)

            start_idx = 0
            for purpose in purposes:
                len_subset = int(split[purpose] * len(dataset))
                subset_indices = indices[start_idx:start_idx + len_subset]
                if dataset_sampling_weights[purpose] <= 0:
                    # we do not want to add the dataset if its not going to get sampled
                    start_idx += len_subset
                    continue

                # we need to separately create a subset dataset because we need to apply purpose-specific transforms
                if transforms[purpose] is not None:
                    subset_dataset = DATASETS[dataset_type](
                        config=dataset_config,
                        dataset_path=pathlib.Path(dataset_config["path"]),
                        purpose=purpose,
                        transform=transforms[purpose])
                    subset = Subset(subset_dataset, subset_indices)
                else:
                    subset = Subset(dataset, subset_indices)

                subsets[purpose].append(subset)
                sampling_weights[purpose].append(
                    dataset_sampling_weights[purpose])
                start_idx += len_subset
        else:
            for purpose in purposes:
                if dataset_sampling_weights[purpose] <= 0:
                    # we do not want to add the dataset if its not going to get sampled
                    continue

                subset_dataset = DATASETS[dataset_type](
                    config=dataset_config,
                    dataset_path=pathlib.Path(dataset_config["path"]),
                    purpose=purpose,
                    transform=transforms[purpose])

                subsets[purpose].append(subset_dataset)
                sampling_weights[purpose].append(
                    dataset_sampling_weights[purpose])

    self.dataloaders = {}
    for purpose in purposes:
        if len(subsets[purpose]) > 1:
            purpose_dataset = ConcatDataset(datasets=subsets[purpose])
        elif len(subsets[purpose]) == 1:
            purpose_dataset = subsets[purpose][0]
        else:
            raise ValueError(
                "no dataset available for purpose {!r}".format(purpose))

        # The test loader is never shuffled.
        shuffle = self.config.get("shuffle", True)
        if purpose in ["test"]:
            shuffle = False

        if shuffle:
            # Per-sample weights: every sample inherits the sampling weight
            # of the (sub)dataset it comes from, in concatenation order.
            weights = np.ones(shape=(len(purpose_dataset), ))
            start_idx = 0
            for subset, subset_sampling_weight in zip(
                    subsets[purpose], sampling_weights[purpose]):
                end_idx = start_idx + len(subset)
                weights[start_idx:end_idx] = subset_sampling_weight
                start_idx = end_idx

            sampler = WeightedRandomSampler(
                weights=weights,
                num_samples=len(purpose_dataset),
                replacement=False)
        else:
            for dataset_sampling_weight in sampling_weights[purpose]:
                if dataset_sampling_weight != 1:
                    raise ValueError(
                        "Currently we do not support weighted, sequential sampling"
                    )

            sampler = SequentialSampler(purpose_dataset)

        self.dataloaders[purpose] = TorchDataLoader(
            dataset=purpose_dataset,
            batch_size=self.config["batch_size"],
            sampler=sampler,
            num_workers=self.config["num_workers"])
) / batch_size # this assertion only holds if len(datset) is # divisible by batch size start = time() for _ in range(num_epochs): batches = 0 for _ in mt: batches += 1 stop = time() print('high performance batchgenerators %03.4f seconds' % (stop - start)) from torch.utils.data import DataLoader as TorchDataLoader trainloader = TorchDataLoader(cifar_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory, collate_fn=default_collate) batches = 0 for _ in iter(trainloader): batches += 1 assert len(_['data'].shape) == 4 start = time() for _ in range(num_epochs): batches = 0 for _ in trainloader: batches += 1 stop = time() print('pytorch took %03.4f seconds' % (stop - start))
def visualize_network_inference(args):
    """Run keypoint inference over a dataset and render the results as videos.

    Steps, in order:

    1. Check that the parameter file, the config file and the dataset path
       exist.
    2. Work out which visualizations are requested in
       ``args.visualization_types`` and (re)create one frames directory per
       visualization below ``args.output_dir``.
    3. Load the network described by the YAML config and its parameters.
    4. Run inference, either on an NDDS dataset found in
       ``args.dataset_path`` or, if none is found, on the ``.png``/``.jpg``
       images in that directory.
    5. Render one PNG frame per sample and visualization, then call
       ``video_from_frames`` for each visualization and delete its frames
       directory.

    Args:
        args: namespace-like object. Attributes read here:
            ``input_params_path``, ``input_config_path``, ``dataset_path``,
            ``visualization_types``, ``output_dir``, ``force_overwrite``,
            ``gpu_ids``, ``image_preproc_override``, ``keypoint_ids``,
            ``start_frame``, ``end_frame``, ``no_ground_truth`` and
            ``framerate``.

    Notes:
        - Calls ``sys.exit(0)`` when no visualization is selected.
        - All per-sample results (including the loaded images) are collected
          in ``sample_results`` before any frame is rendered.
        - The NDDS branch moves each input batch to the GPU with ``.cuda()``.
    """

    # Input argument handling
    assert os.path.exists(
        args.input_params_path
    ), 'Expected input_params_path "{}" to exist, but it does not.'.format(
        args.input_params_path)

    if args.input_config_path:
        input_config_path = args.input_config_path
    else:
        # Use params filepath to infer the config filepath
        input_config_path = os.path.splitext(
            args.input_params_path)[0] + ".yaml"

    assert os.path.exists(
        input_config_path
    ), 'Expected input_config_path "{}" to exist, but it does not.'.format(
        input_config_path)

    assert os.path.exists(
        args.dataset_path
    ), 'Expected dataset_path "{}" to exist, but it does not.'.format(
        args.dataset_path)

    # Determine what types of visualizations to do
    print("visualization types: {}".format(args.visualization_types))

    do_kp_overlay_raw = True if KP_OVERLAY_RAW in args.visualization_types else False
    do_kp_overlay_net_input = (True if KP_OVERLAY_NET_INPUT
                               in args.visualization_types else False)
    do_kp_belief_overlay_raw = (True if KP_BELIEF_OVERLAY_RAW
                                in args.visualization_types else False)
    do_belief_overlay_raw = (True if BELIEF_OVERLAY_RAW
                             in args.visualization_types else False)

    # One entry per requested visualization. The idx_* variables remember the
    # position of each visualization in this list and are only defined when
    # the corresponding do_* flag is set. "frame" holds the frame currently
    # being composed for that visualization.
    videos_to_make = []
    needs_belief_maps = False
    if do_kp_overlay_raw:
        idx_kp_overlay_raw = len(videos_to_make)
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_kp_overlay_raw"),
            "output_path":
            os.path.join(args.output_dir, "kp_overlay_raw.mp4"),
            "frame": [],
        })
    if do_kp_overlay_net_input:
        idx_kp_overlay_net_input = len(videos_to_make)
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_kp_overlay_net_input"),
            "output_path":
            os.path.join(args.output_dir, "kp_overlay_net_input.mp4"),
            "frame": [],
        })
    if do_kp_belief_overlay_raw:
        idx_kp_belief_overlay_raw = len(videos_to_make)
        needs_belief_maps = True
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_kp_belief_overlay_raw"),
            "output_path":
            os.path.join(args.output_dir, "kp_belief_overlay_raw.mp4"),
            "frame": [],
        })
    if do_belief_overlay_raw:
        idx_belief_overlay_raw = len(videos_to_make)
        needs_belief_maps = True
        videos_to_make.append({
            "frames_dir":
            os.path.join(args.output_dir, "frames_belief_overlay_raw"),
            "output_path":
            os.path.join(args.output_dir, "belief_overlay_raw.mp4"),
            "frame": [],
        })

    if len(videos_to_make) == 0:
        print("No visualizations have been selected.")
        sys.exit(0)

    dream.utilities.makedirs(args.output_dir, exist_ok=args.force_overwrite)

    # Start every visualization from an empty frames directory; an existing
    # one is only removed when overwriting was explicitly allowed.
    for video in videos_to_make:
        if os.path.exists(video["frames_dir"]):
            assert args.force_overwrite, 'Frames directory "{}" already exists.'.format(
                video["frames_dir"])
            shutil.rmtree(video["frames_dir"])
        dream.utilities.makedirs(video["frames_dir"],
                                 exist_ok=args.force_overwrite)

    # Create parser
    data_parser = YAML(typ="safe")

    with open(input_config_path, "r") as f:
        network_config = data_parser.load(f)

    # Overwrite GPU
    # If nothing is specified at the command line, None is the default, which uses all GPUs
    # TBD - think about a better way of doing this
    network_config["training"]["platform"]["gpu_ids"] = args.gpu_ids

    # Load network
    dream_network = dream.create_network_from_config_data(network_config)
    dream_network.model.load_state_dict(torch.load(args.input_params_path))
    dream_network.enable_evaluation()

    # Use image preprocessing specified by config by default, unless user specifies otherwise
    if args.image_preproc_override:
        image_preprocessing = args.image_preproc_override
    else:
        image_preprocessing = dream_network.image_preprocessing()

    # Visualize all keypoints unless a subset was requested.
    if args.keypoint_ids is None or len(args.keypoint_ids) == 0:
        idx_keypoints = list(range(dream_network.n_keypoints))
    else:
        idx_keypoints = args.keypoint_ids

    n_idx_keypoints = len(idx_keypoints)

    # Each entry is a 7-tuple: (raw image, network input image, belief maps,
    # detected keypoints in raw coordinates, detected keypoints in network
    # input coordinates, ground-truth keypoints raw, ground-truth keypoints
    # network input). Unavailable parts are stored as empty lists.
    sample_results = []

    dataset_to_viz = dream.utilities.find_ndds_data_in_dir(args.dataset_path)
    dataset_file_dict_list = dataset_to_viz[
        0]  # list of data file dictionaries; each dictionary indicates the files names for rgb, depth, seg, ...
    dataset_meta_dict = dataset_to_viz[
        1]  # dictionary of camera, object files, etc.

    if dataset_file_dict_list:

        # Downselect based on frame name
        if args.start_frame or args.end_frame:
            sample_names = [x["name"] for x in dataset_file_dict_list]
            start_idx = sample_names.index(
                args.start_frame) if args.start_frame else 0
            # end_frame is inclusive, hence the + 1.
            end_idx = (sample_names.index(args.end_frame) +
                       1 if args.end_frame else len(dataset_file_dict_list))

            dataset_to_viz = (
                dataset_file_dict_list[start_idx:end_idx],
                dataset_meta_dict,
            )

        image_raw_resolution = dream.utilities.load_image_resolution(
            dataset_meta_dict["camera"])
        (
            network_input_res_inf,
            network_output_res_inf,
        ) = dream_network.net_resolutions_from_image_raw_resolution(
            image_raw_resolution,
            image_preprocessing_override=image_preprocessing)

        manip_dataset_debug_mode = dream.datasets.ManipulatorNDDSDatasetDebugLevels[
            "LIGHT"]
        manip_dataset = dream.datasets.ManipulatorNDDSDataset(
            dataset_to_viz,
            dream_network.manipulator_name,
            dream_network.keypoint_names,
            network_input_res_inf,
            network_output_res_inf,
            dream_network.image_normalization,
            image_preprocessing,
            augment_data=False,
            include_ground_truth=not args.no_ground_truth,
            debug_mode=manip_dataset_debug_mode,
        )

        # TODO: set batch size and num_workers at command line
        batch_size = 8
        num_workers = 4
        # Despite the name this loader is only used for inference; shuffling
        # is off so frames keep the dataset order.
        training_data = TorchDataLoader(manip_dataset,
                                        batch_size=batch_size,
                                        num_workers=num_workers,
                                        shuffle=False)

        # Network inference on dataset
        with torch.no_grad():

            for batch_idx, sample in enumerate(tqdm(training_data)):

                # The last batch may hold fewer than batch_size samples.
                this_batch_size = len(sample["config"]["name"])

                # Conduct inference
                network_image_input = sample["image_rgb_input"].cuda()
                (
                    belief_maps_batch,
                    detected_kp_projs_netout_batch,
                ) = dream_network.inference(network_image_input)

                for b in range(this_batch_size):

                    input_image_path = sample["config"]["image_paths"]["rgb"][
                        b]

                    # Belief maps are only kept when a visualization needs
                    # them; a detached copy is stored.
                    if needs_belief_maps:
                        belief_maps = belief_maps_batch[b]
                        selected_belief_maps_copy = (
                            belief_maps[idx_keypoints, :, :].detach().clone())
                    else:
                        selected_belief_maps_copy = []

                    # Convert the detections from network-output coordinates
                    # to network-input and then to raw-image coordinates.
                    detected_kp_projs_netout = np.array(
                        detected_kp_projs_netout_batch[b], dtype=float)
                    selected_detected_kp_projs_netout = detected_kp_projs_netout[
                        idx_keypoints, :]
                    selected_detected_kp_projs_netin = dream.image_proc.convert_keypoints_to_netin_from_netout(
                        selected_detected_kp_projs_netout,
                        network_output_res_inf,
                        network_input_res_inf,
                    )
                    selected_detected_kp_projs_raw = dream.image_proc.convert_keypoints_to_raw_from_netin(
                        selected_detected_kp_projs_netin,
                        network_input_res_inf,
                        image_raw_resolution,
                        image_preprocessing,
                    )

                    if args.no_ground_truth:
                        selected_gt_kp_projs_raw = []
                        selected_gt_kp_projs_netin = []
                    else:
                        selected_gt_kp_projs_raw = np.array(
                            sample["keypoint_projections_raw"][b][
                                idx_keypoints, :],
                            dtype=float,
                        )
                        selected_gt_kp_projs_netin = np.array(
                            sample["keypoint_projections_input"][b][
                                idx_keypoints, :],
                            dtype=float,
                        )

                    input_image_raw = PILImage.open(input_image_path).convert(
                        "RGB")
                    image_net_input = dream.image_proc.image_from_tensor(
                        sample["image_rgb_input_viz"][b])

                    sample_results.append((
                        input_image_raw,
                        image_net_input,
                        selected_belief_maps_copy,
                        selected_detected_kp_projs_raw,
                        selected_detected_kp_projs_netin,
                        selected_gt_kp_projs_raw,
                        selected_gt_kp_projs_netin,
                    ))

    else:
        # Probably a directory of images - fix this later to avoid code duplication
        dirlist = os.listdir(args.dataset_path)
        dirlist.sort()

        # Heuristic for the image extension: use whichever of .png / .jpg has
        # more files (.jpg on a tie).
        png_image_names = [f for f in dirlist if f.endswith(".png")]
        jpg_image_names = [f for f in dirlist if f.endswith(".jpg")]
        image_names = (png_image_names if len(png_image_names) >
                       len(jpg_image_names) else jpg_image_names)

        if args.start_frame or args.end_frame:
            sample_names = [os.path.splitext(i)[0] for i in image_names]
            start_idx = sample_names.index(
                args.start_frame) if args.start_frame else 0
            # end_frame is inclusive, hence the + 1.
            end_idx = (sample_names.index(args.end_frame) +
                       1 if args.end_frame else len(sample_names))

            image_names = image_names[start_idx:end_idx]

        # Just use a heuristic to determine the image extension
        image_paths = [os.path.join(args.dataset_path, i) for i in image_names]

        for input_image_path in tqdm(image_paths):

            input_image_raw = PILImage.open(input_image_path).convert("RGB")
            # NOTE(review): debug=True presumably makes keypoints_from_image
            # return the extra "belief_maps" / "image_rgb_net_input" /
            # "detected_keypoints_net_input" entries read below -- confirm.
            detection_result = dream_network.keypoints_from_image(
                input_image_raw,
                image_preprocessing_override=image_preprocessing,
                debug=True,
            )

            selected_detected_kps_raw = detection_result["detected_keypoints"][
                idx_keypoints, :]
            selected_detected_kps_netin = detection_result[
                "detected_keypoints_net_input"][idx_keypoints, :]
            image_net_input = detection_result["image_rgb_net_input"]
            selected_belief_maps = (
                detection_result["belief_maps"][idx_keypoints, :, :]
                if needs_belief_maps else [])

            # No ground truth is available for a plain directory of images.
            selected_gt_kps_raw = []
            selected_gt_kps_netin = []

            sample_results.append((
                input_image_raw,
                image_net_input,
                selected_belief_maps,
                selected_detected_kps_raw,
                selected_detected_kps_netin,
                selected_gt_kps_raw,
                selected_gt_kps_netin,
            ))

    # Iterate through inferred results
    idx_this_frame = 1  # frame files are numbered starting at 000001
    print("Creating visualizations...")
    for (
            image_raw,
            input_image,
            belief_maps,
            detected_kp_projs_raw,
            detected_kp_projs_net_input,
            gt_kp_projs_raw,
            gt_kp_projs_net_input,
    ) in tqdm(sample_results):

        show_gt_keypoints = (
            not args.no_ground_truth) and len(gt_kp_projs_raw) > 0

        image_raw_resolution = image_raw.size
        net_input_resolution = input_image.size

        # Initialise the base frame of every requested visualization.
        if do_kp_overlay_net_input:
            videos_to_make[idx_kp_overlay_net_input]["frame"] = input_image

        if do_kp_overlay_raw:
            videos_to_make[idx_kp_overlay_raw]["frame"] = image_raw

        if do_kp_belief_overlay_raw:
            # Sum the selected belief maps into a single map, colorize it,
            # map it back to raw-image resolution and blend it 50/50 with the
            # raw image.
            flattened_belief_tensor = belief_maps.sum(dim=0)
            flattened_belief_image = dream.image_proc.image_from_belief_map(
                flattened_belief_tensor,
                colormap="hot",
                normalization_method=6)
            flattened_belief_image_netin = dream.image_proc.convert_image_to_netin_from_netout(
                flattened_belief_image, net_input_resolution)
            flattened_belief_image_raw = dream.image_proc.inverse_preprocess_image(
                flattened_belief_image_netin, image_raw_resolution,
                image_preprocessing)
            videos_to_make[idx_kp_belief_overlay_raw][
                "frame"] = PILImage.blend(image_raw,
                                          flattened_belief_image_raw,
                                          alpha=0.5)

            # Previous code here, but the overlays don't look as nice
            # Note - this seems pretty slow
            # I = np.asarray(flattened_belief_image_raw.convert('L'))
            # I_black = I < 20
            # mask = PILImage.fromarray(np.uint8(255*I_black))
            # temp = PILImage.composite(image_raw, flattened_belief_image_raw, mask)
            # videos_to_make[idx_kp_belief_overlay_raw]['frame'] = PILImage.blend(image_raw, temp, alpha=0.75)
            # #PILImage.alpha_composite(flattened_belief_image_raw.convert('RGBA'), image_raw.convert('RGBA'))

        if do_belief_overlay_raw:
            # Same blending as above, but no keypoints are drawn on this one.
            flattened_belief_tensor = belief_maps.sum(dim=0)
            flattened_belief_image = dream.image_proc.image_from_belief_map(
                flattened_belief_tensor,
                colormap="hot",
                normalization_method=6)
            flattened_belief_image_netin = dream.image_proc.convert_image_to_netin_from_netout(
                flattened_belief_image, net_input_resolution)
            flattened_belief_image_raw = dream.image_proc.inverse_preprocess_image(
                flattened_belief_image_netin, image_raw_resolution,
                image_preprocessing)
            videos_to_make[idx_belief_overlay_raw]["frame"] = PILImage.blend(
                image_raw, flattened_belief_image_raw, alpha=0.5)

        # Draw the keypoints one at a time: ground truth in green (when
        # shown), detections in red on top.
        for n in range(n_idx_keypoints):
            detected_kp_proj_raw = detected_kp_projs_raw[n, :]
            detected_kp_proj_net_input = detected_kp_projs_net_input[n, :]

            if show_gt_keypoints:
                gt_kp_proj_raw = gt_kp_projs_raw[n, :]
                gt_kp_proj_net_input = gt_kp_projs_net_input[n, :]

            # Overlay
            if do_kp_overlay_net_input:
                # Heuristic to make point diameter look good for larger raw resolutions
                pt_diameter = (12.0 if image_raw_resolution[0] *
                               image_raw_resolution[1] > 500000 else 6.0)
                if show_gt_keypoints:
                    videos_to_make[idx_kp_overlay_net_input][
                        "frame"] = dream.image_proc.overlay_points_on_image(
                            videos_to_make[idx_kp_overlay_net_input]["frame"],
                            [gt_kp_proj_net_input],
                            annotation_color_dot="green",
                            annotation_color_text="white",
                            point_thickness=2,
                            point_diameter=pt_diameter,
                        )

                videos_to_make[idx_kp_overlay_net_input][
                    "frame"] = dream.image_proc.overlay_points_on_image(
                        videos_to_make[idx_kp_overlay_net_input]["frame"],
                        [detected_kp_proj_net_input],
                        annotation_color_dot="red",
                        annotation_color_text="white",
                        point_diameter=pt_diameter,
                    )

            if do_kp_overlay_raw:
                # Heuristic to make point diameter look good for larger raw resolutions
                pt_diameter = (12.0 if image_raw_resolution[0] *
                               image_raw_resolution[1] > 500000 else 6.0)
                if show_gt_keypoints:
                    videos_to_make[idx_kp_overlay_raw][
                        "frame"] = dream.image_proc.overlay_points_on_image(
                            videos_to_make[idx_kp_overlay_raw]["frame"],
                            [gt_kp_proj_raw],
                            annotation_color_dot="green",
                            annotation_color_text="white",
                            point_thickness=2,
                            point_diameter=pt_diameter + 2,
                        )

                videos_to_make[idx_kp_overlay_raw][
                    "frame"] = dream.image_proc.overlay_points_on_image(
                        videos_to_make[idx_kp_overlay_raw]["frame"],
                        [detected_kp_proj_raw],
                        annotation_color_dot="red",
                        annotation_color_text="white",
                        point_diameter=pt_diameter,
                    )

            if do_kp_belief_overlay_raw:
                # Heuristic to make point diameter look good for larger raw resolutions
                pt_diameter = (12.0 if image_raw_resolution[0] *
                               image_raw_resolution[1] > 500000 else 6.0)
                if show_gt_keypoints:
                    videos_to_make[idx_kp_belief_overlay_raw][
                        "frame"] = dream.image_proc.overlay_points_on_image(
                            videos_to_make[idx_kp_belief_overlay_raw]["frame"],
                            [gt_kp_proj_raw],
                            annotation_color_dot="green",
                            annotation_color_text="white",
                            point_thickness=2,
                            point_diameter=pt_diameter + 2,
                        )

                videos_to_make[idx_kp_belief_overlay_raw][
                    "frame"] = dream.image_proc.overlay_points_on_image(
                        videos_to_make[idx_kp_belief_overlay_raw]["frame"],
                        [detected_kp_proj_raw],
                        annotation_color_dot="red",
                        annotation_color_text="white",
                        point_diameter=pt_diameter,
                    )

        # Save the finished frame of every visualization under the same
        # zero-padded file name.
        frame_output_filename = str(idx_this_frame).zfill(6) + ".png"
        for video in videos_to_make:
            video["frame"].save(
                os.path.join(video["frames_dir"], frame_output_filename))

        idx_this_frame += 1

    # Call to ffmpeg
    # The frames directory of each video is deleted once it has been encoded.
    for video in videos_to_make:
        video_from_frames(video["frames_dir"], video["output_path"],
                          args.framerate)
        shutil.rmtree(video["frames_dir"])
def run_experiment(experiment: Experiment,
                   debug_pipeline: bool = False,
                   develop_mode: bool = False,
                   data_loader_workers: int = 1,
                   cross_validation_iterations: int = 3,
                   device: str = "cpu",
                   develop_mode_sampls: int = 10) -> List[Result]:
    """Train and test ``experiment`` over stratified shuffle-split folds.

    For every fold a fresh model, optimizer and LR scheduler are built from
    the experiment, trained for ``experiment.max_epochs()`` epochs with a
    class-weighted focal loss, and a checkpoint is written every second epoch
    to ``results/<experiment id>/``.

    Args:
        experiment: Source of pipeline stages, augmentations, data frames,
            directories, sampler, model, optimizer and scheduler.
        debug_pipeline: Forwarded to ``Pipeline`` as ``debug``.
        develop_mode: When true, only ``develop_mode_sampls`` randomly chosen
            samples are used so an end-to-end run is quick.
        data_loader_workers: ``num_workers`` for both data loaders.
        cross_validation_iterations: Number of stratified splits to run.
        device: Passed to ``train`` and ``test``.
        develop_mode_sampls: Size of the develop-mode subset.

    Returns:
        One ``Result`` per cross-validation iteration, holding the targets,
        predictions and ids from the last epoch's test pass.
    """
    LOGGER.info("Beginning experiment: %s, %s", experiment.id(),
                experiment.description())

    # Preprocessing
    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)

    # Augmentations are applied through the loaders' collate functions
    augmentations = AugmentedCollate(experiment.augmentation_stages())
    test_augmentations = AugmentedCollate(
        experiment.test_augmentation_stages())

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    # File system cache with a 1 to 1 mapping to an experiment, used to cache
    # data for multiple workers, can safely be used in each cross validation run
    cache = joblib.Memory(f'./cachedir/{experiment.id()}', verbose=0)
    LOGGER.info("Initialised cache: %s", cache)

    LOGGER.info("Creating APTOSDataset for the following directories: %s",
                directories)
    dataset = TorchConcatDataset([
        APTOSDataset(df, directory, pipeline, cache)
        for df, directory in zip(dfs, directories)
    ])

    # TODO: will probably need debugging when more than one datasets are added
    labels = np.asarray(list(dfs[0]["diagnosis"]))

    # To facilitate software development this makes running end to end tests feasible
    if develop_mode:
        LOGGER.warning(
            "Running in develop mode, using a fraction of the whole dataset")
        dataset, _ = torch_random_split(
            dataset,
            [develop_mode_sampls, len(dataset) - develop_mode_sampls])
        # The split indices below are positions into `dataset`, so the labels
        # must be restricted to (and ordered like) the develop subset;
        # otherwise Subset(dataset, index) would index past its end.
        # NOTE(review): relies on the Subset returned by the split exposing
        # `.indices`, as torch.utils.data.Subset does.
        labels = labels[np.asarray(dataset.indices, dtype=int)]

    results = []

    # Stratified ShuffleSplit cross-validator, provides train/test indices to
    # split data in train/test sets.
    sss = StratifiedShuffleSplit(n_splits=cross_validation_iterations,
                                 test_size=experiment.test_size(),
                                 train_size=1 - experiment.test_size(),
                                 random_state=0)
    split_generator = sss.split(np.zeros(labels.shape), labels)

    for cv_iteration, (train_index, test_index) in enumerate(split_generator,
                                                             start=1):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        with APTOSMonitor(experiment, cv_iteration) as monitor:
            LOGGER.info(
                f'tensorboard --logdir "{monitor._summary_writer.log_dir}"')

            test_ds = Subset(dataset, test_index)
            train_ds = Subset(dataset, train_index)

            train_histogram = np.histogram(labels[train_index], 5)
            LOGGER.info("train data size: {}".format(len(train_ds)))
            LOGGER.info("Histogram of classses {}".format(train_histogram))

            # Inverse-frequency weights: total / (number of bins * bin count).
            # NOTE(review): a bin with no training samples divides by zero
            # here and yields an infinite weight.
            class_data = train_histogram[0]
            class_weights = class_data.sum() / (class_data.shape[0] *
                                                class_data)

            LOGGER.info("test data size: {}".format(len(test_ds)))
            LOGGER.info("Histogram of classses {}".format(
                np.histogram(labels[test_index], 5)))

            sampler, sampler_kwargs = experiment.sampler()
            sampler = sampler(train_ds, **sampler_kwargs)

            train_loader = TorchDataLoader(
                train_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                # Potentially an unconventional use of collate_fn, but it does
                # make the train data loader responsible for augmentations
                # which is nice.
                collate_fn=augmentations,
                sampler=sampler)

            test_loader = TorchDataLoader(test_ds,
                                          batch_size=experiment.batch_size(),
                                          num_workers=data_loader_workers,
                                          collate_fn=test_augmentations)

            model = experiment.model(input_shape=train_ds[0][0].shape)

            # NOTE(review): the summary call moves the model with .cuda()
            # regardless of the `device` argument.
            print(torch_summary(model.cuda(), train_ds[0][0].shape))

            optimizer_class, optim_kwargs = experiment.optimizer()
            optimizer = optimizer_class(model.parameters(), **optim_kwargs)

            lr_scheduler, scheduler_kwargs = experiment.lr_scheduler()
            lr_scheduler = lr_scheduler(optimizer, **scheduler_kwargs)

            monitor.on_cv_start(train_ds, augmentations)

            # alpha carries the per-class weights computed above
            criterion = FocalLoss(num_class=5, gamma=2, alpha=class_weights)

            for epoch in range(1, experiment.max_epochs() + 1):
                LOGGER.info("Epoch: %s", epoch)

                train(model, train_loader, optimizer, device, criterion,
                      monitor)
                lr_scheduler.step()
                predictions_proba, predictions, targets, ids, losses = test(
                    model, test_loader, device, criterion, monitor)

                if epoch % 2 == 0:
                    checkpoint = {
                        'model': model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'experiment': experiment.state_dict()
                    }

                    checkpoint_directory = f'results/{experiment.id()}'
                    # makedirs also creates a missing "results" parent, which
                    # a plain mkdir would fail on.
                    os.makedirs(checkpoint_directory, exist_ok=True)

                    torch.save(
                        checkpoint,
                        os.path.join(
                            checkpoint_directory,
                            f'{cv_iteration}-{epoch}-checkpoint.pth'))

            monitor.on_cv_end()

        # Only the final epoch's test output is kept for this fold.
        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration":
            [cv_iteration for _ in range(len(targets))],
            "targets": targets,
            "predictions": predictions,
            "id_code": ids
        })

        results.append(Result(experiment, results_df))

    # Deletes content on disk... (until experiments have a unique hash this make sense)
    cache.clear()

    return results
def main(checkpoint_file_path: str,
         data_directory:
         str = "../input/aptos2019-blindness-detection/test_images",
         data_frame: str = "../input/aptos2019-blindness-detection/test.csv",
         sample_submission:
         str = "../input/aptos2019-blindness-detection/sample_submission.csv",
         device: str = "cuda:0",
         samples_to_visualise: int = 30):
    """Run inference from a saved checkpoint and write ``submission.csv``.

    The experiment stored in the checkpoint is re-created with its data
    directories and data frames replaced by the submission inputs, the
    checkpointed model is restored, predictions are written into a copy of
    the sample submission, and a random selection of inputs is rendered next
    to their class scores under ``samples/<experiment id>/``.

    Args:
        checkpoint_file_path: Checkpoint holding the "experiment", "model" and
            "state_dict" entries.
        data_directory: Directory with the ``<id>.png`` images to predict.
        data_frame: Data frame describing the images to predict.
        sample_submission: CSV whose ``diagnosis`` column is overwritten.
        device: Device passed to ``inference``.
        samples_to_visualise: Upper bound on the number of visualisations.
    """
    print("Beginning submission")
    print(f"Using {device} for submissions")

    # Always load the first iteration from cross validation? Should really
    # re-train on the whole dataset
    # NOTE: torch.load unpickles the file (the checkpoint stores a whole model
    # object), so only trusted checkpoints must be loaded here.
    checkpoint = torch.load(checkpoint_file_path)
    print("Loaded checkpoint")

    experiment_state_dict = checkpoint["experiment"]
    experiment_state_dict.update(train_test_directories=[data_directory],
                                 train_test_data_frames=[data_frame])
    experiment = Experiment.from_dict(experiment_state_dict)
    # print() does no %-interpolation, so format the message explicitly.
    print(f"Initialised experiment: {experiment}")

    pipeline = Pipeline(experiment.pipeline_stages())
    print("Initialised pipeline")

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    test_augmentations = AugmentedCollate(
        experiment.test_augmentation_stages())

    dataset = TorchConcatDataset([
        APTOSSubmissionDataset(df, directory, pipeline)
        for df, directory in zip(dfs, directories)
    ])
    print("Initialised dataset")

    loader = TorchDataLoader(dataset,
                             batch_size=experiment.batch_size(),
                             collate_fn=test_augmentations)
    print("Initialised loader")

    model = checkpoint["model"]
    model.load_state_dict(checkpoint['state_dict'])
    print("Initialised model")

    print("Beginning inference")
    predictions_proba, predictions, ids = inference(model, loader, device)

    sample = pd.read_csv(sample_submission)
    sample.diagnosis = predictions
    sample.to_csv("submission.csv", index=False)

    visualisations_directory = os.path.join("samples", experiment.id())
    os.makedirs(visualisations_directory, exist_ok=True)

    # random.sample raises when asked for more items than exist, so cap the
    # request at the number of predictions available.
    sample_count = min(samples_to_visualise, len(ids))
    sample_indexes = random.sample(range(len(ids)), k=sample_count)

    for sample_index in sample_indexes:
        id_ = ids[sample_index]
        prediction = predictions[sample_index]
        proba = predictions_proba[sample_index].cpu().numpy()

        # eight_bit_normalization takes (image, mask); no mask is used here
        image, _ = eight_bit_normalization(
            cv2.cvtColor(
                cv2.imread(os.path.join(data_directory, f"{id_}.png")),
                cv2.COLOR_BGR2RGB),
            None)

        fig, axes = plt.subplots(1, 2)
        fig.set_dpi(150)

        axes[0].imshow(image)
        axes[0].axis("off")
        axes[0].title.set_text("Raw image")

        # Scores are shifted so the smallest bar has height zero; the
        # predicted class is highlighted in red.
        bar_list = axes[1].bar(x=CLASSES, height=proba - proba.min())
        bar_list[prediction].set_color('r')
        axes[1].title.set_text("Classes")
        plt.setp(axes[1].get_xticklabels(), rotation=45)

        plt.tight_layout()
        plt.savefig(os.path.join(visualisations_directory, f"{id_}.jpeg"))
        # Close the figure so figures do not pile up across iterations.
        plt.close(fig)
def train_network(args):
    """Train a DREAM network on NDDS data, optionally resuming a previous run.

    The function validates the arguments, builds (or, when resuming, reloads
    and cross-checks) the network configuration, splits the discovered dataset
    into training and validation subsets, and then runs the epoch loop. When
    an output directory is given, the per-epoch weights, the best network by
    mean validation loss, and a pickled training log are saved there.

    Args:
        args: Namespace-like object. The attributes read are ``epochs``,
            ``batch_size``, ``num_workers``, ``training_data_fraction``,
            ``output_dir``, ``resume_training``, ``force_overwrite``,
            ``random_seed``, ``not_augment_data``, ``gpu_ids``,
            ``input_data_path``, ``manipulator_config_path``,
            ``architecture_config``, ``optimizer``, ``learning_rate`` and
            ``verbose``.

    Raises:
        AssertionError: If an argument is out of range, a required
            configuration key or file is missing, or a resumed run is
            inconsistent with the stored configuration.
    """
    # Input argument handling
    assert (
        args.epochs > 0
    ), "The number of training epochs must be greater than 0, but it is {}.".format(
        args.epochs)
    assert (
        args.batch_size > 0
    ), "The training batch size must be greater than 0, but it is {}.".format(
        args.batch_size)
    assert (
        args.num_workers >= 0
    ), "The number of subprocesses used for training data loading must be greater than or equal to 0, but it is {}.".format(
        args.num_workers)

    # Parse training fraction
    assert (
        0.0 < args.training_data_fraction < 1.0
    ), "Expected training_data_fraction to be within 0. and 1., but it is {}.".format(
        args.training_data_fraction)
    validation_data_fraction = 1.0 - args.training_data_fraction

    if args.output_dir:
        save_results = True
        if not args.resume_training:
            dream.utilities.makedirs(args.output_dir,
                                     exist_ok=args.force_overwrite)
    else:
        assert (not args.resume_training
                ), "Cannot resume training; output directory not provided."
        save_results = False

    training_start_time = time.time()

    if args.resume_training:
        # Find the latest network we have
        dirlist = os.listdir(args.output_dir)
        epoch_weight_paths_unsorted = [
            x for x in dirlist
            if x.startswith("epoch") and x.endswith(".pth")
        ]
        assert (
            epoch_weight_paths_unsorted
        ), "Cannot resume training; no epoch weights found in output directory."

        # File names look like "epoch_<N>.pth"; <N> is the epoch number
        epoch_numbers_unsorted = [
            int(net_path.split("_")[1].split(".")[0])
            for net_path in epoch_weight_paths_unsorted
        ]

        # Most recent network
        most_recent_epoch_weight_path, start_epoch = max(
            zip(epoch_weight_paths_unsorted, epoch_numbers_unsorted),
            key=lambda pair: pair[1],
        )

        assert (
            start_epoch < args.epochs
        ), "Network is already trained for the number of requested epochs."

        # Find the best network to determine its validation loss
        best_valid_network_config_path = os.path.join(args.output_dir,
                                                      "best_network.yaml")
        assert os.path.exists(
            best_valid_network_config_path
        ), "Could not determine the best validation loss."
        valid_parser = YAML(typ="safe")
        with open(best_valid_network_config_path, "r") as f:
            best_valid_network_config = valid_parser.load(f)
        best_valid_loss = best_valid_network_config["training"]["results"][
            "validation_loss"]["mean"]

        # Load in the old training log
        final_log_path = os.path.join(args.output_dir, "training_log.pkl")
        epoch_log_path = os.path.join(
            args.output_dir, "training_log_e{}.pkl".format(start_epoch))
        if os.path.exists(final_log_path):
            with open(final_log_path, "rb") as f:
                train_log = pickle.load(f)

            # Move this to make this consistent as if we're in the middle of
            # training
            os.rename(final_log_path, epoch_log_path)
        elif os.path.exists(epoch_log_path):
            with open(epoch_log_path, "rb") as f:
                train_log = pickle.load(f)
        else:
            assert False, "Could not determine training log file to resume."

        # Get the random seed that was used here - we need to to ensure
        # test/valid splits are right
        random_seed = train_log["random_seed"]

        # Record this session's start time next to the earlier ones
        if not isinstance(train_log["start_time"], list):
            # Convert to a list
            train_log["start_time"] = [train_log["start_time"]]
        train_log["start_time"].append(training_start_time)

        # Also log the fact that we resumed
        if "epochs_resumed" in train_log:
            train_log["epochs_resumed"].append(start_epoch + 1)
        else:
            train_log["epochs_resumed"] = [start_epoch + 1]

    else:
        # Determine the random seed
        # NOTE(review): a requested seed of 0 is falsy and is therefore
        # replaced by a random one - confirm whether that is intended.
        random_seed = (args.random_seed
                       if args.random_seed else random.randint(0, 999999))

        train_log = {
            "epochs": [],
            "losses": [],
            "validation_losses": [],
            "batch_training_losses": [],
            "batch_validation_losses": [],
            "batch_training_sample_names": [],
            "batch_validation_sample_names": [],
            "start_time": training_start_time,
            "timestamps": [],
            "random_seed": random_seed,
        }

        best_valid_loss = float("Inf")

    dream.utilities.set_random_seed(random_seed)

    enable_augment_data = not args.not_augment_data

    gpu_ids = args.gpu_ids if args.gpu_ids else []

    # Best effort only: fall back to a placeholder when the login name cannot
    # be determined, but let KeyboardInterrupt/SystemExit propagate.
    try:
        user = os.getlogin()
    except Exception:
        user = "******"

    # Parse input data
    input_data_path = args.input_data_path

    # Attempt path contraction to make path portable between different platforms
    input_data_abs_path = os.path.abspath(input_data_path)
    input_data_abs_path_split = input_data_abs_path.split("/")
    if (len(input_data_abs_path_split) >= 3
            and input_data_abs_path_split[0] == ""
            and input_data_abs_path_split[1] == "home"
            and input_data_abs_path_split[2] == user):
        # Change the path to use the tilde shortcut
        input_data_path = os.path.join("~", *input_data_abs_path_split[3:])

    # Find data in provided directory
    found_data = dream.utilities.find_ndds_data_in_dir(input_data_path)
    found_data_config = found_data[1]
    image_raw_resolution = dream.utilities.load_image_resolution(
        found_data_config["camera"])

    # Parse manipulation configuration file
    yaml_parser = YAML(typ="safe")

    assert os.path.exists(
        args.manipulator_config_path
    ), 'Expected manipulator_config_path "{}" to exist, but it does not.'.format(
        args.manipulator_config_path)
    with open(args.manipulator_config_path, "r") as f:
        manipulator_config_file = yaml_parser.load(f)
    assert (
        "manipulator" in manipulator_config_file
    ), 'Expected key "manipulator" to exist in the manipulator config file, but it does not.'
    manipulator_config = manipulator_config_file["manipulator"]

    # Parse architecture
    assert os.path.exists(
        args.architecture_config
    ), 'Expected architecture_config file "{}" to exist, but it does not.'.format(
        args.architecture_config)
    with open(args.architecture_config, "r") as f:
        architecture_config_file = yaml_parser.load(f)
    assert (
        "architecture" in architecture_config_file
    ), 'Expected key "architecture" to exist in the architecture config file, but it does not.'
    architecture_config = architecture_config_file["architecture"]

    assert (
        "training" in architecture_config_file
    ), 'Expected key "training" to exist in the architecture config file, but it does not.'
    assert (
        "config" in architecture_config_file["training"]
    ), 'Expected key "config" to exist in training dictionary in the architecture config file, but it does not.'
    training_config = architecture_config_file["training"]["config"]
    assert (
        "image_preprocessing" in training_config
    ), 'Expected key "image_preprocessing" to exist in the training config in the architecture config file, but it does not.'
    training_image_preprocessing = training_config["image_preprocessing"]
    assert (
        "net_input_resolution" in training_config
    ), 'Expected key "net_input_resolution" to exist in the training config in the architecture config file, but it does not.'
    training_net_input_resolution = training_config["net_input_resolution"]
    # TODO: possibly read in other arguments here, such as optimizer, instead
    # of using command line defaults

    if "image_preprocessing" in architecture_config:
        # This could happen if we're trying to resume training.
        assert (
            architecture_config["image_preprocessing"] ==
            training_image_preprocessing
        ), 'If defined, "image_preprocessing" in the architecture and training record must be consistent for this script to work properly.'
    else:
        architecture_config[
            "image_preprocessing"] = training_image_preprocessing

    if enable_augment_data:
        # TODO: specify the types of image augmentation
        data_augment_config = odict([("image_rgb", True)])
    else:
        data_augment_config = False

    network_config = odict([
        ("data_path", input_data_path),
        ("manipulator", manipulator_config),
        ("architecture", architecture_config),
        (
            "training",
            odict([
                (
                    "config",
                    odict([
                        ("epochs", args.epochs),
                        ("training_data_fraction",
                         args.training_data_fraction),
                        ("validation_data_fraction",
                         validation_data_fraction),
                        ("batch_size", args.batch_size),
                        ("data_augmentation", data_augment_config),
                        ("worker_size", args.num_workers),
                        (
                            "optimizer",
                            odict([
                                ("type", args.optimizer),
                                ("learning_rate", args.learning_rate),
                            ]),
                        ),
                        ("image_preprocessing",
                         training_image_preprocessing),
                        ("image_raw_resolution", list(image_raw_resolution)),
                        ("net_input_resolution",
                         training_net_input_resolution),
                    ]),
                ),  # net_output_resolution is set below
                (
                    "platform",
                    odict([
                        ("user", user),
                        ("hostname", socket.gethostname()),
                        ("gpu_ids", gpu_ids),
                    ]),
                ),
                ("results", odict([("epochs_trained", 0)])),
            ]),
        ),
    ])

    # Now check against existing network configuration if we are resuming training
    if args.resume_training:
        # Load corresponding config file to ensure we're consistent
        most_recent_config_path = most_recent_epoch_weight_path.replace(
            "pth", "yaml")
        config_parser = YAML(typ="safe")
        with open(os.path.join(args.output_dir, most_recent_config_path),
                  "r") as f:
            most_recent_network_config_file = config_parser.load(f)

        # Do a bunch of network consistency checks. "epochs" is deliberately
        # not compared, so a resumed run may request more epochs.
        for key in ("data_path", "manipulator", "architecture"):
            assert (
                most_recent_network_config_file[key] == network_config[key]
            ), 'Cannot resume training; "{}" differs from the saved network configuration.'.format(
                key)
        for key in (
                "training_data_fraction",
                "validation_data_fraction",
                "batch_size",
                "data_augmentation",
                "worker_size",
                "optimizer",
                "image_preprocessing",
                "image_raw_resolution",
                "net_input_resolution",
        ):
            assert (
                most_recent_network_config_file["training"]["config"][key] ==
                network_config["training"]["config"][key]
            ), 'Cannot resume training; training config "{}" differs from the saved network configuration.'.format(
                key)

        # Use this one instead!
        network_config = most_recent_network_config_file

        print("~~ RESUMING TRAINING FROM {} ~~".format(
            most_recent_epoch_weight_path))
        print("")

    else:
        start_epoch = 0

    # Print to screen
    print("Network configuration: {}".format(network_config))

    dream_network = dream.create_network_from_config_data(network_config)
    if args.resume_training:
        dream_network.model.load_state_dict(
            torch.load(
                os.path.join(args.output_dir, most_recent_epoch_weight_path)))

    dream_network.enable_training()

    # The following ensures the config is consistent with the dataloader
    (
        trained_net_input_res,
        trained_net_output_res,
    ) = dream_network.net_resolutions_from_image_raw_resolution(
        image_raw_resolution)
    assert dream_network.trained_net_input_resolution(
    ) == trained_net_input_res
    assert dream_network.trained_net_output_resolution(
    ) == trained_net_output_res
    dream_network.network_config["training"]["config"][
        "net_output_resolution"] = trained_net_output_res

    # Create NDDS dataset and loader
    training_debug_mode = dream.datasets.ManipulatorNDDSDatasetDebugLevels[
        "NONE"]

    network_requires_belief_maps = (
        dream_network.network_config["architecture"]["target"] ==
        "belief_maps")

    found_dataset = dream.datasets.ManipulatorNDDSDataset(
        found_data,
        manipulator_config["name"],
        dream_network.keypoint_names,
        trained_net_input_res,
        trained_net_output_res,
        dream_network.image_normalization,
        dream_network.image_preprocessing(),
        augment_data=enable_augment_data,
        include_ground_truth=True,
        include_belief_maps=network_requires_belief_maps,
        debug_mode=training_debug_mode,
    )

    # Split into train and validation subsets
    n_data = len(found_dataset)
    n_train_data = int(round(n_data * args.training_data_fraction))
    n_valid_data = n_data - n_train_data
    train_dataset, valid_dataset = torch.utils.data.random_split(
        found_dataset, [n_train_data, n_valid_data])

    train_data_loader = TorchDataLoader(train_dataset,
                                        batch_size=args.batch_size,
                                        num_workers=args.num_workers)
    valid_data_loader = TorchDataLoader(valid_dataset,
                                        batch_size=args.batch_size,
                                        num_workers=args.num_workers)

    # Train the network
    print("")
    print(
        "TRAINING NETWORK ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    print("")

    last_epoch_timestamp = 0.0

    for e in tqdm(range(start_epoch, args.epochs)):
        this_epoch = e + 1
        print("Epoch {} ------------".format(this_epoch))

        # Training Phase
        if args.verbose:
            print("")
            print("~~ Training Phase ~~")

        dream_network.enable_training()

        training_batch_losses = []
        training_batch_sample_names = []
        for batch_idx, sample in enumerate(tqdm(train_data_loader)):
            this_batch_sample_names = sample["config"]["name"]
            this_batch_size = sample["image_rgb_input"].shape[0]

            if args.verbose:
                print("Processing batch index {} for training...".format(
                    batch_idx))
                print("Sample names in this training batch: {}".format(
                    this_batch_sample_names))
                print("This training batch size: {}".format(this_batch_size))

            # New unified training
            network_input_heads = []
            network_input_heads.append(sample["image_rgb_input"].cuda())

            # The label tensor depends on what the architecture predicts
            if dream_network.network_config["architecture"][
                    "target"] == "belief_maps":
                training_labels = sample["belief_maps"].cuda()
            elif dream_network.network_config["architecture"][
                    "target"] == "keypoints":
                training_labels = sample["keypoint_projections_output"].cuda()
            else:
                assert (
                    False
                ), "Could not determine how to provide training labels to network."

            loss = dream_network.train(network_input_heads, training_labels)

            training_loss_this_batch = loss.item()
            training_batch_losses.append(training_loss_this_batch)

            if args.verbose:
                print("Training loss for this batch: {}".format(
                    training_loss_this_batch))
                print("")

            training_batch_sample_names.append(this_batch_sample_names)

        mean_training_loss_per_batch = np.mean(training_batch_losses)
        std_training_loss_per_batch = np.std(training_batch_losses)

        # Evaluation Phase
        if args.verbose:
            print("")
            print("~~ Validation Phase ~~")

        dream_network.enable_evaluation()

        with torch.no_grad():
            valid_batch_losses = []
            valid_batch_sample_names = []

            for valid_batch_idx, valid_sample in enumerate(
                    tqdm(valid_data_loader)):
                this_valid_batch_sample_names = valid_sample["config"]["name"]
                this_valid_batch_size = valid_sample["image_rgb_input"].shape[
                    0]

                if args.verbose:
                    print("Processing batch index {} for validation...".format(
                        valid_batch_idx))
                    print("Sample names in this validation batch: {}".format(
                        this_valid_batch_sample_names))
                    print("This validation batch size: {}".format(
                        this_valid_batch_size))

                # New unified validation
                valid_network_input_heads = []
                valid_network_input_heads.append(
                    valid_sample["image_rgb_input"].cuda())

                if (dream_network.network_config["architecture"]["target"] ==
                        "belief_maps"):
                    valid_labels = valid_sample["belief_maps"].cuda()
                elif (dream_network.network_config["architecture"]["target"]
                      == "keypoints"):
                    valid_labels = valid_sample[
                        "keypoint_projections_output"].cuda()
                else:
                    assert (
                        False
                    ), "Could not determine how to provide validation labels to network."

                valid_loss = dream_network.loss(valid_network_input_heads,
                                                valid_labels)

                valid_loss_this_batch = valid_loss.item()
                valid_batch_losses.append(valid_loss_this_batch)

                if args.verbose:
                    print("Validation loss for this batch: {}".format(
                        valid_loss_this_batch))
                    print("")

                valid_batch_sample_names.append(this_valid_batch_sample_names)

            mean_valid_loss_per_batch = np.mean(valid_batch_losses)
            std_valid_loss_per_batch = np.std(valid_batch_losses)

        # Bookkeeping and print info
        dream_network.network_config["training"]["results"][
            "epochs_trained"] += 1
        dream_network.network_config["training"]["results"][
            "training_loss"] = odict([
                ("mean", float(mean_training_loss_per_batch)),
                ("stdev", float(std_training_loss_per_batch)),
            ])
        dream_network.network_config["training"]["results"][
            "validation_loss"] = odict([
                ("mean", float(mean_valid_loss_per_batch)),
                ("stdev", float(std_valid_loss_per_batch)),
            ])

        print("Training Loss (batch-wise mean +- 1 stdev):   {} +- {}".format(
            mean_training_loss_per_batch, std_training_loss_per_batch))
        print("Validation Loss (batch-wise mean +- 1 stdev): {} +- {}".format(
            mean_valid_loss_per_batch, std_valid_loss_per_batch))

        # Save network if it's better than anything trained so far
        if mean_valid_loss_per_batch < best_valid_loss:
            print("Best network result so far.")
            best_valid_loss = mean_valid_loss_per_batch
            if save_results:
                dream_network.save_network(args.output_dir,
                                           "best_network",
                                           overwrite=True)

        # Timestamps are seconds since this session started
        this_epoch_timestamp = time.time() - training_start_time
        print("This epoch took {} seconds.".format(this_epoch_timestamp -
                                                   last_epoch_timestamp))
        last_epoch_timestamp = this_epoch_timestamp
        print("")

        # Append to history
        train_log["epochs"].append(this_epoch)
        train_log["losses"].append(mean_training_loss_per_batch)
        train_log["validation_losses"].append(mean_valid_loss_per_batch)
        train_log["batch_training_losses"].append(training_batch_losses)
        train_log["batch_validation_losses"].append(valid_batch_losses)
        train_log["batch_training_sample_names"].append(
            training_batch_sample_names)
        train_log["batch_validation_sample_names"].append(
            valid_batch_sample_names)
        train_log["timestamps"].append(this_epoch_timestamp)

        if save_results:
            # Write training log so far
            epoch_training_log_path = os.path.join(
                args.output_dir, "training_log_e{}.pkl".format(this_epoch))
            with open(epoch_training_log_path, "wb") as f:
                pickle.dump(train_log, f)

            # Remove old training log (written after the previous epoch)
            last_epoch_training_log_path = os.path.join(
                args.output_dir, "training_log_e{}.pkl".format(e))
            if os.path.exists(last_epoch_training_log_path):
                os.remove(last_epoch_training_log_path)

            # Save this epoch
            dream_network.save_network(args.output_dir,
                                       "epoch_{}".format(this_epoch),
                                       overwrite=True)

    # Save results
    if save_results:
        # Rename the final training log instead of re-writing it
        training_log_path = os.path.join(args.output_dir, "training_log.pkl")
        os.rename(epoch_training_log_path, training_log_path)

    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    print("")
    print("Done.")
    print("")
    print("Total training time: {} seconds.".format(time.time() -
                                                    training_start_time))
    print("")
def __init__(self,
             data_loader_type,
             variant_loader,
             batch_size=32,
             shuffle=True,
             num_workers=4,
             sample_encoder=PileupEncoder(
                 window_size=100,
                 max_reads=100,
                 layers=[PileupEncoder.Layer.READ]),
             label_encoder=ZygosityLabelEncoder()):
    """Build the loader and the Torch ``DataLoader`` it wraps.

    Args:
        data_loader_type : Loader mode (ReadPileupDataLoader.Type.TRAIN/EVAL/TEST)
        variant_loader : Indexable, sized collection of variants
        batch_size : Batch size handed to the Torch data loader [32]
        shuffle : Whether the Torch data loader shuffles [True]
        num_workers : Worker count handed to the Torch data loader [4]
        sample_encoder : Callable turning a variant into its encoding
                         [READ-layer PileupEncoder, window size 100, max reads 100]
        label_encoder : Callable turning a variant into its label
                        [ZygosityLabelEncoder] (not used when type=TEST)

    Returns:
        Instance of class.
    """
    super().__init__()
    self.data_loader_type = data_loader_type
    self.variant_loader = variant_loader
    self.sample_encoder = sample_encoder
    self.label_encoder = label_encoder

    class DatasetWrapper(TorchDataset):
        """Torch dataset that encodes one variant per index on demand."""

        def __init__(self, data_loader_type, sample_encoder, variant_loader,
                     label_encoder):
            """Store the loader mode, the encoders and the variant source.

            Args:
                data_loader_type : Loader mode
                sample_encoder : Callable producing the sample encoding
                variant_loader : Indexable, sized collection of variants
                label_encoder : Callable producing the label

            Returns:
                Instance of class.
            """
            super().__init__()
            self.variant_loader = variant_loader
            self.label_encoder = label_encoder
            self.sample_encoder = sample_encoder
            self.data_loader_type = data_loader_type

        def __len__(self):
            # One item per variant.
            return len(self.variant_loader)

        def __getitem__(self, idx):
            variant = self.variant_loader[idx]
            encoding = self.sample_encoder(variant)
            # TEST mode has no labels: hand back the encoding alone.
            if self.data_loader_type == ReadPileupDataLoader.Type.TEST:
                return encoding
            # Otherwise the item is a (label, encoding) pair.
            return self.label_encoder(variant), encoding

    wrapped_dataset = DatasetWrapper(
        data_loader_type=data_loader_type,
        sample_encoder=self.sample_encoder,
        variant_loader=self.variant_loader,
        label_encoder=self.label_encoder,
    )
    self.dataloader = TorchDataLoader(
        wrapped_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
def run_experiment(
        experiment: Experiment,
        debug_pipeline: bool = False,
        develop_mode: bool = False,
        data_loader_workers: int = 1,
        cross_validation_iterations: int = 3,
        device: str = "cpu",
        develop_mode_sampls: int = 10
) -> List[Result]:
    """Train and test ``experiment`` over repeated random train/test splits.

    Each iteration draws a new random split of the dataset, builds a fresh
    model, optimizer and LR scheduler from the experiment, trains for
    ``experiment.max_epochs()`` epochs, and writes a checkpoint every second
    epoch to ``results/<experiment id>/``.

    Args:
        experiment: Source of pipeline stages, augmentations, data frames,
            directories, sampler, model, optimizer and scheduler.
        debug_pipeline: Forwarded to ``Pipeline`` as ``debug``.
        develop_mode: When true, only ``develop_mode_sampls`` randomly chosen
            samples are used so an end-to-end run is quick.
        data_loader_workers: ``num_workers`` for both data loaders.
        cross_validation_iterations: Number of random splits to run.
        device: Passed to ``train`` and ``test``.
        develop_mode_sampls: Size of the develop-mode subset.

    Returns:
        One ``Result`` per cross-validation iteration, holding the targets,
        predictions and ids from the last epoch's test pass.
    """
    LOGGER.info("Beginning experiment: %s, %s", experiment.id(),
                experiment.description())

    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)

    augmentations = AugmentedCollate(experiment.augmentation_stages())

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    # File system cache with a 1 to 1 mapping to an experiment, used to cache
    # data for multiple workers, can safely be used in each cross validation run
    cache = joblib.Memory(f'./cachedir/{experiment.id()}', verbose=0)
    LOGGER.info("Initialised cache: %s", cache)

    LOGGER.info("Creating APTOSDataset for the following directories: %s",
                directories)
    dataset = TorchConcatDataset(
        [APTOSDataset(df, directory, pipeline, cache)
         for df, directory in zip(dfs, directories)]
    )

    # To facilitate software development this makes running end to end tests feasible
    if develop_mode:
        LOGGER.warning(
            "Running in develop mode, using a fraction of the whole dataset")
        dataset, _ = torch_random_split(
            dataset,
            [develop_mode_sampls, len(dataset) - develop_mode_sampls])

    results = []
    for cv_iteration in range(1, cross_validation_iterations + 1):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        with APTOSMonitor(experiment, cv_iteration) as monitor:
            LOGGER.info(
                f'tensorboard --logdir "{monitor._summary_writer.log_dir}"')

            # Round only the test length and give the remainder to training.
            # Rounding both halves independently can miss the dataset length
            # (e.g. 5 samples at 0.5 gives 2 + 2), which the split rejects.
            test_size = experiment.test_size()
            test_length = round(test_size * len(dataset))
            train_length = len(dataset) - test_length
            train_ds, test_ds = torch_random_split(
                dataset, [train_length, test_length])

            sampler, sampler_kwargs = experiment.sampler()
            sampler = sampler(train_ds, **sampler_kwargs)

            train_loader = TorchDataLoader(
                train_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                # Potentially an unconventional use of collate_fn, but it does
                # make the train data loader responsible for augmentations
                # which is nice.
                collate_fn=augmentations,
                sampler=sampler
            )

            test_loader = TorchDataLoader(
                test_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
            )

            model = experiment.model(input_shape=train_ds[0][0].shape)

            # NOTE(review): the summary call moves the model with .cuda()
            # regardless of the `device` argument.
            print(torch_summary(model.cuda(), train_ds[0][0].shape))

            optimizer_class, optim_kwargs = experiment.optimizer()
            optimizer = optimizer_class(model.parameters(), **optim_kwargs)

            lr_scheduler, scheduler_kwargs = experiment.lr_scheduler()
            lr_scheduler = lr_scheduler(optimizer, **scheduler_kwargs)

            monitor.on_cv_start(train_ds, augmentations)

            for epoch in range(1, experiment.max_epochs() + 1):
                LOGGER.info("Epoch: %s", epoch)

                train(model, train_loader, optimizer, device, monitor)
                lr_scheduler.step()
                predictions_proba, predictions, targets, ids, losses = test(
                    model, test_loader, device, monitor)

                if epoch % 2 == 0:
                    checkpoint = {
                        'model': model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'experiment': experiment.state_dict()
                    }

                    checkpoint_directory = f'results/{experiment.id()}'
                    # makedirs also creates a missing "results" parent, which
                    # a plain mkdir would fail on.
                    os.makedirs(checkpoint_directory, exist_ok=True)

                    torch.save(
                        checkpoint,
                        os.path.join(
                            checkpoint_directory,
                            f'{cv_iteration}-{epoch}-checkpoint.pth'))

            monitor.on_cv_end()

        # Only the final epoch's test output is kept for this iteration.
        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration":
            [cv_iteration for _ in range(len(targets))],
            "targets": targets,
            "predictions": predictions,
            "id_code": ids
        })

        results.append(Result(experiment, results_df))

    # Deletes content on disk... (until experiments have a unique hash this make sense)
    cache.clear()

    return results
def evaluate(settings: Settings, model, loss_handlers, device, epoch, global_step, eval_results, eval_data_set,
             return_detailed=False):
    """Run the model over ``eval_data_set`` and record the mean loss per field.

    Args:
        settings: Provides ``optimization_settings`` (``local_rank``,
            ``predict_batch_size``), ``show_step_progress`` and ``loss_tasks``.
        model: Called as ``model(batch, eval_data_set)`` after ``model.eval()``.
        loss_handlers: Iterable of callables exposing ``field`` and ``weight``;
            each is invoked once per batch with ``reduction='none'``.
        device: Target device every tensor of a batch is moved to.
        epoch: Epoch index forwarded to the handlers and to ``eval_results``.
        global_step: Step index forwarded to the handlers and to ``eval_results``.
        eval_results: Receives ``add_result(field, epoch, global_step, mean_loss)``
            for every field seen.
        eval_data_set: Dataset to evaluate.
        return_detailed: When true the handlers also return per-example
            details, which are collected and returned.

    Returns:
        An ``OrderedDict`` mapping field to the list of detailed results when
        ``return_detailed`` is true, otherwise ``None``.
    """
    if settings.optimization_settings.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data_set)
    else:
        eval_sampler = DistributedSampler(eval_data_set)
    eval_data_loader = TorchDataLoader(
        eval_data_set,
        sampler=eval_sampler,
        batch_size=settings.optimization_settings.predict_batch_size,
        collate_fn=collate_fn)

    model.eval()
    all_results = OrderedDict()
    logger.info("Start evaluating")

    if settings.show_step_progress:
        batch_iterator = tqdm(eval_data_loader, desc="Evaluating")
    else:
        batch_iterator = eval_data_loader

    total_loss = 0
    total_count = 0
    losses_to_write = OrderedDict()
    losses_to_write_counts = OrderedDict()
    for batch in batch_iterator:
        for k in batch:
            batch[k] = batch[k].to(device)
        with torch.no_grad():
            predictions = model(batch, eval_data_set)
            loss_result = OrderedDict(
                (h.field,
                 (h.weight,
                  h(True, epoch, global_step, batch, predictions,
                    return_detailed=return_detailed, apply_weight=False, as_numpy=True, reduction='none')))
                for h in loss_handlers)

        if return_detailed:
            # Each handler returned (summary, detailed); keep the summary for
            # the loss bookkeeping and accumulate the details per field.
            loss_dict = OrderedDict()
            for k in loss_result:
                weight, (summary, detailed) = loss_result[k]
                loss_dict[k] = weight, summary
                if k not in all_results:
                    all_results[k] = list()
                all_results[k].extend(detailed)
        else:
            loss_dict = loss_result

        for data_key in loss_dict:
            weight, (data_loss, data_valid_count) = loss_dict[data_key]
            if data_key not in losses_to_write:
                losses_to_write[data_key] = 0
                losses_to_write_counts[data_key] = 0
            if data_valid_count == 0:
                # Nothing valid for this field in this batch. Skip it instead
                # of adding NaN, which would turn the running sum (and so the
                # final mean) into NaN for the whole evaluation.
                continue
            current = np.sum(data_loss)
            losses_to_write[data_key] += current
            losses_to_write_counts[data_key] += data_valid_count
            kind = eval_data_set.response_data_kind(data_key)
            if data_key in settings.loss_tasks or kind in settings.loss_tasks:
                total_loss += current
                total_count += data_valid_count

    for h in loss_handlers:
        if hasattr(h, 'after_eval_batches'):
            h.after_eval_batches(epoch, global_step)

    for k in losses_to_write:
        if losses_to_write_counts[k] == 0:
            losses_to_write[k] = np.nan
        else:
            losses_to_write[k] /= losses_to_write_counts[k]
        eval_results.add_result(k, epoch, global_step, losses_to_write[k])

    # Guard the division: with no valid examples the overall mean is NaN
    # rather than a ZeroDivisionError.
    mean_loss = total_loss / total_count if total_count > 0 else np.nan
    if len(losses_to_write) < 4:
        logger.info('eval: {:<#8.6}, '.format(mean_loss) + ', '.join([
            '{}: {:<#8.6}'.format(k, losses_to_write[k]) for k in losses_to_write
        ]))
    else:
        logger.info('eval: {}'.format(mean_loss))

    if return_detailed:
        return all_results
    return None
def __init__(self, data_loader_type, variant_loaders, batch_size=32, shuffle=True, num_workers=4,
             sample_encoder=PileupEncoder(
                 window_size=100, max_reads=100, layers=[PileupEncoder.Layer.READ]),
             label_encoder=ZygosityLabelEncoder()):
    """Build a torch data loader that encodes variants on the fly.

    Args:
        data_loader_type : Type of data loader
                           (ReadPileupDataLoader.Type.TRAIN/EVAL/TEST)
        variant_loaders : A list of loader classes for variants
        batch_size : batch size for data loader [32]
        shuffle : shuffle dataset [True]
        num_workers : numbers of parallel data loader threads [4]
        sample_encoder : Custom pileup encoder for variant
                         [READ pileup encoding, window size 100]
        label_encoder : Custom label encoder for variant
                        [ZygosityLabelEncoder] (Only applicable
                        when type=TRAIN/EVAL)

    Returns:
        Instance of class.
    """
    super().__init__()
    self.data_loader_type = data_loader_type
    self.variant_loaders = variant_loaders
    self.sample_encoder = sample_encoder
    self.label_encoder = label_encoder

    class DatasetWrapper(TorchDataset):
        """Torch dataset that presents all variant loaders as one flat sequence."""

        def __init__(self, data_loader_type, sample_encoder, variant_loaders, label_encoder):
            """Store the encoders and loaders and pre-compute the total length.

            Args:
                data_loader_type : Type of data loader
                sample_encoder : Custom pileup encoder for variant
                variant_loaders : A list of loader classes for variants
                label_encoder : Custom label encoder for variant

            Returns:
                Instance of class.
            """
            super().__init__()
            self.variant_loaders = variant_loaders
            self.label_encoder = label_encoder
            self.sample_encoder = sample_encoder
            self.data_loader_type = data_loader_type

            # Total number of samples across every loader.
            self._len = sum(len(source) for source in self.variant_loaders)

        def _map_idx_to_sample(self, sample_idx):
            # Walk the loaders in order, consuming the index until it falls
            # inside one of them.
            offset = sample_idx
            for source in self.variant_loaders:
                source_len = len(source)
                if offset < source_len:
                    return source[offset]
                offset -= source_len
            raise RuntimeError(
                "Could not map sample index to file. This is a bug.")

        def __len__(self):
            return self._len

        def __getitem__(self, idx):
            variant = self._map_idx_to_sample(idx)
            encoded = self.sample_encoder(variant)
            # Test loaders carry no labels, so only the encoding is returned.
            if self.data_loader_type == ReadPileupDataLoader.Type.TEST:
                return encoded
            target = self.label_encoder(variant)
            return target, encoded

    wrapped = DatasetWrapper(
        data_loader_type, self.sample_encoder, self.variant_loaders, self.label_encoder)
    self.dataloader = TorchDataLoader(
        wrapped,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers)
def _cast_data(data, cfg: Config, mode: str, datatype: str = "", preproc: bool = True):
    """Convert the supported input containers into a torch data loader.

    Args:
        data: A ``CsvDataset``; a tuple/list/``Array`` of arrays ordered as
            (expression, batch indices[, labels]); a dict with an
            ``"expression"`` or ``"X"`` key plus ``"batch_indices"`` and
            ``"labels"``; a ``TensorDataset``; or an already built loader.
        cfg: Supplies ``batch_size`` and ``num_workers``.
        mode: ``"train"`` or ``"test"``; in test mode labels are included for
            the ``CsvDataset`` and sequence inputs.
        datatype: Label used in error messages only.
        preproc: When true, ``log(x + 1)`` is applied to the expression data.

    Returns:
        A ``TorchDataLoader``; an input that is already a loader is returned
        unchanged.

    Raises:
        ValueError: If the items of a sequence input differ in first dimension.
        KeyError: If a dict input lacks a required key.
        NotImplementedError: For ``AnnData`` and any unsupported type.
    """
    assert mode in ("train", "test")

    # Keyword arguments for the torch DataLoader; num_workers is only passed
    # when it is set so the loader's own default applies otherwise.
    kwargs = {"batch_size": cfg.batch_size}
    if cfg.num_workers:
        kwargs["num_workers"] = cfg.num_workers

    if isinstance(data, CsvDataset):
        # TODO scvi.dataset.GeneExpression
        expression = data.X
        if preproc:
            expression = np.log(expression + 1.)
        tensors = (Tensor(expression), Tensor(data.batch_indices))
        if mode == "test":
            tensors += (Tensor(data.labels), )
        ds = TensorDataset(*tensors)
        return TorchDataLoader(ds, **kwargs)

    elif isinstance(data, (tuple, list, Array)):
        items_shape = data[0].shape
        for item in data:
            if item.shape[0] != items_shape[0]:
                raise ValueError(
                    f"Size mismatch {items_shape[0]} and {item.shape[0]}")
        # The number of items is not validated here; only the first two
        # (train) or three (test) entries are used.
        n = 3 if mode == "test" else 2
        tensors = [Tensor(data[i]) for i in range(n)]
        if preproc:
            tensors[0] = np.log(tensors[0] + 1.)
        ds = TensorDataset(*tensors)
        return TorchDataLoader(ds, **kwargs)

    elif isinstance(data, dict):
        if "expression" in data:
            expression = data["expression"]
        elif "X" in data:
            expression = data["X"]
        else:
            raise KeyError(
                f"{datatype} data must contains 'X' or 'expression' key")
        if preproc:
            expression = np.log(expression + 1.)
        try:
            batches = data["batch_indices"]
            labels = data["labels"]
        except KeyError as err:
            raise KeyError(f"{datatype} data must contains {str(err)} key") from err
        # NOTE(review): unlike the other branches the values are not wrapped
        # in Tensor here - presumably the dict already holds tensors; confirm.
        tensors = (expression, batches, labels)
        ds = TensorDataset(*tensors)
        # Reuse the prepared kwargs so an unset num_workers is not forwarded.
        return TorchDataLoader(ds, **kwargs)

    elif isinstance(data, AnnData):
        raise NotImplementedError()

    elif isinstance(data, TorchDataLoader):
        return data

    elif isinstance(data, TensorDataset):
        # Wrap the dataset that was passed in (the previous code referenced an
        # undefined local here).
        return TorchDataLoader(data, **kwargs)

    else:
        raise NotImplementedError()
def CreateLoader(self):
    """Return a ``TorchDataLoader`` over this object configured by ``self.loader_args``."""
    loader_options = self.loader_args
    return TorchDataLoader(self, **loader_options)
def train(
        settings: Settings,
        output_validation_path: str,
        output_test_path: str,
        output_model_path: str,
        train_data_set: PreparedDataDataset,
        validation_data_set: PreparedDataDataset,
        test_data_set: Optional[PreparedDataDataset],
        n_gpu: int,
        device,
        load_from_path: Optional[str] = None):
    """Fine-tune a multi-head BERT model, then write predictions and the model.

    The loop runs ``num_train_epochs`` epochs. Parameters whose name does not
    start with ``prediction_head.`` are frozen during the first
    ``num_epochs_train_prediction_heads_only`` epochs and again during the
    last ``num_final_epochs_train_prediction_heads_only`` epochs. After each
    epoch the training curve is written and, when the validation set is
    non-empty, it is evaluated and its curve written.

    Args:
        settings: Model, loss and optimization settings.
        output_validation_path: File for validation predictions; its directory
            also receives ``train_curve.npz`` and ``validation_curve.npz``.
        output_test_path: File for test predictions.
        output_model_path: Directory the trained model is saved into (created
            if missing).
        train_data_set: Training data.
        validation_data_set: Validation data; may be empty.
        test_data_set: Test data; may be ``None`` or empty, in which case no
            test evaluation is run.
        n_gpu: Number of GPUs; with exactly one, batches are moved to
            ``device`` here, with more than one the model is wrapped in
            ``DataParallel``.
        device: Device the model is moved to.
        load_from_path: When given, the model is loaded with
            ``from_fine_tuned`` from this path instead of ``from_pretrained``.
    """
    output_train_curve_path = os.path.join(
        os.path.split(output_validation_path)[0], 'train_curve.npz')
    output_validation_curve_path = os.path.join(
        os.path.split(output_validation_path)[0], 'validation_curve.npz')

    num_train_steps = int(
        len(train_data_set)
        / settings.optimization_settings.train_batch_size
        / settings.optimization_settings.gradient_accumulation_steps
        * settings.optimization_settings.num_train_epochs)

    # A negative value means "train only the prediction heads for all epochs".
    num_epochs_prediction_head_only_train = settings.optimization_settings.num_epochs_train_prediction_heads_only
    if num_epochs_prediction_head_only_train < 0:
        num_epochs_prediction_head_only_train = settings.optimization_settings.num_train_epochs
    start_final_epochs_prediction_head_only_train = int(
        settings.optimization_settings.num_train_epochs
        - settings.optimization_settings.num_final_epochs_train_prediction_heads_only)

    prediction_heads, token_supplemental_key_to_shape, pooled_supplemental_key_to_shape, loss_handlers = \
        setup_prediction_heads_and_losses(settings, train_data_set)

    # Prepare model
    model_loader = BertMultiPredictionHead.from_fine_tuned \
        if load_from_path is not None else BertMultiPredictionHead.from_pretrained
    model = model_loader(
        load_from_path if load_from_path is not None else settings.bert_model,
        map_location=lambda storage, loc: None if loc == 'cpu' else storage.cuda(device.index),
        prediction_head_settings=prediction_heads,
        token_supplemental_key_to_shape=token_supplemental_key_to_shape,
        pooled_supplemental_key_to_shape=pooled_supplemental_key_to_shape)

    if settings.optimization_settings.fp16:
        model.half()
    model.to(device)
    if settings.optimization_settings.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[settings.optimization_settings.local_rank],
            output_device=settings.optimization_settings.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer. For fp16 / optimize-on-cpu the optimizer works on
    # CPU copies of the parameters that are synced with the model each step.
    if settings.optimization_settings.fp16:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif settings.optimization_settings.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'gamma', 'beta']

    # Always build the list so the per-epoch freeze/unfreeze below never
    # iterates over None; the initial freeze keeps its original condition.
    non_prediction_head_parameters = [
        p for n, p in param_optimizer if not n.startswith('prediction_head.')
    ]
    if num_epochs_prediction_head_only_train > 0 or start_final_epochs_prediction_head_only_train:
        for p in non_prediction_head_parameters:
            p.requires_grad = False

    def _skips_weight_decay(name):
        # Parameter names are dotted paths (e.g. "...dense.bias"), so the
        # no-decay markers must be matched as substrings; an exact membership
        # test never matches and applies weight decay to every parameter.
        return any(marker in name for marker in no_decay)

    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not _skips_weight_decay(n)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if _skips_weight_decay(n)],
        'weight_decay_rate': 0.0
    }]

    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=settings.optimization_settings.learning_rate,
        warmup=settings.optimization_settings.warmup_proportion,
        t_total=num_train_steps)

    global_step = 0
    train_results = TaskResults()
    validation_results = TaskResults()

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_data_set))
    # for now we set max_sequence_length so these are never split
    logger.info("  Num split examples = %d", len(train_data_set))
    logger.info("  Batch size = %d", settings.optimization_settings.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if settings.optimization_settings.local_rank == -1:
        train_sampler = RandomSampler(train_data_set)
    else:
        train_sampler = DistributedSampler(train_data_set)
    train_data_loader = TorchDataLoader(
        train_data_set,
        sampler=train_sampler,
        batch_size=settings.optimization_settings.train_batch_size,
        collate_fn=collate_fn)

    if settings.show_epoch_progress:
        epoch_range = trange(int(settings.optimization_settings.num_train_epochs), desc="Epoch")
    else:
        epoch_range = range(int(settings.optimization_settings.num_train_epochs))

    for index_epoch in epoch_range:

        logger.info('Starting epoch {}'.format(index_epoch))

        model.train()

        if index_epoch == start_final_epochs_prediction_head_only_train:
            for p in non_prediction_head_parameters:
                p.requires_grad = False
        elif index_epoch == num_epochs_prediction_head_only_train:
            for p in non_prediction_head_parameters:
                p.requires_grad = True

        if settings.show_step_progress:
            batch_iterator = tqdm(train_data_loader, desc="Iteration")
        else:
            batch_iterator = train_data_loader

        for step, batch in enumerate(batch_iterator):
            if n_gpu == 1:
                for k in batch:
                    batch[k] = batch[k].to(device)
            predictions = model(batch, train_data_set)
            loss_dict = OrderedDict(
                (h.field,
                 (h.weight,
                  h(False, index_epoch, global_step, batch, predictions, apply_weight=False)))
                for h in loss_handlers)

            # free up memory
            del predictions
            del batch

            loss = None
            losses_to_write = OrderedDict()
            for data_key in loss_dict:
                weight, data_loss = loss_dict[data_key]
                # Handlers signal "nothing to train on" with a sentinel string.
                no_valid_inputs = isinstance(data_loss, str) and data_loss == 'no_valid_inputs'
                kind = train_data_set.response_data_kind(data_key)
                if (data_key in settings.loss_tasks or kind in settings.loss_tasks) and not no_valid_inputs:
                    current = weight * data_loss
                    losses_to_write[data_key] = \
                        np.nan if no_valid_inputs else data_loss.detach().cpu().numpy().item()
                    if loss is None:
                        loss = current
                    else:
                        loss += current
                train_result = np.nan if no_valid_inputs else data_loss.detach().cpu().numpy().item()
                train_results.add_result(data_key, index_epoch, global_step, train_result)

            del loss_dict

            if loss is not None:
                if len(losses_to_write) < 4:
                    logger.info(
                        'train: {:<#8.6}, '.format(loss.item()) + ', '.join([
                            '{}: {:<#8.6}'.format(k, losses_to_write[k]) for k in losses_to_write
                        ]))
                else:
                    logger.info('train: {}'.format(loss.item()))
                if n_gpu > 1:  # hmm - not sure how this is supposed to work
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if settings.optimization_settings.fp16 and settings.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * settings.loss_scale
                if settings.optimization_settings.gradient_accumulation_steps > 1:
                    loss = loss / settings.optimization_settings.gradient_accumulation_steps
                loss.backward()

            if (step + 1) % settings.optimization_settings.gradient_accumulation_steps == 0:
                if settings.optimization_settings.fp16 or settings.optimization_settings.optimize_on_cpu:
                    if settings.optimization_settings.fp16 and settings.loss_scale != 1.0:
                        # scale down gradients for fp16 training; frozen
                        # parameters have no gradient and are skipped
                        for param in model.parameters():
                            if param.grad is not None:
                                param.grad.data = param.grad.data / settings.loss_scale
                    is_nan = set_optimizer_params_grad(
                        param_optimizer, model.named_parameters(), test_nan=True)
                    if is_nan:
                        logger.info(
                            "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                        )
                        settings.loss_scale = settings.loss_scale / 2
                        model.zero_grad()
                        continue
                    optimizer.step()
                    copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                else:
                    optimizer.step()
                model.zero_grad()
                global_step += 1

            # we're being super aggressive about releasing memory here because
            # we're right on the edge of fitting in gpu
            del loss
            gc.collect()
            torch.cuda.empty_cache()

        write_loss_curve(output_train_curve_path, train_results)
        if len(validation_data_set) > 0:
            evaluate(settings, model, loss_handlers, device, index_epoch, global_step,
                     validation_results, validation_data_set)
            write_loss_curve(output_validation_curve_path, validation_results)

    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(validation_data_set))
    logger.info("  Num split examples = %d", len(validation_data_set))
    logger.info("  Batch size = %d", settings.optimization_settings.predict_batch_size)

    if len(validation_data_set) > 0:
        all_validation = evaluate(
            settings, model, loss_handlers, device,
            settings.optimization_settings.num_train_epochs - 1,
            global_step, TaskResults(), validation_data_set, return_detailed=True)
    else:
        all_validation = {}

    # test_data_set is optional: treat None like an empty test set.
    if test_data_set is not None and len(test_data_set) > 0:
        all_test = evaluate(
            settings, model, loss_handlers, device,
            settings.optimization_settings.num_train_epochs - 1,
            global_step, TaskResults(), test_data_set, return_detailed=True)
    else:
        all_test = {}

    write_predictions(output_validation_path, all_validation, validation_data_set, settings)
    if test_data_set is not None:
        write_predictions(output_test_path, all_test, test_data_set, settings)

    # Save a trained model and the associated configuration
    if not os.path.exists(output_model_path):
        os.makedirs(output_model_path)
    model.save(output_model_path)

    # clean up after we're done to try to release CUDA resources to other people when there are no more tasks
    gc.collect()
    torch.cuda.empty_cache()
def __init__(
    self,
    hdf_file,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    tensor_keys=["encodings", "labels"],
    tensor_dtypes=[torch.float32, torch.int64],
    tensor_dims=[('B', 'C', 'W', 'H'), tuple('B')],
    tensor_neural_types=[
        ReadPileupNeuralType(), VariantZygosityNeuralType()
    ],
):
    """Constructor for data loader.

    Args:
        hdf_file : Path to HDF file with pileup encodings
        batch_size : batch size for data loader [32]
        shuffle : shuffle dataset [True]
        num_workers : numbers of parallel data loader threads [4]
        tensor_keys : List with keys of tensors to load. ["encodings", "labels"]
        tensor_dtypes : torch data types for tensor. [torch.float32, torch.int64]
        tensor_dims : NeuralModule axes for tensors. [('B', 'C', 'W', 'H'), ('B')]
        tensor_neural_types : NeuralTypes for tensors.
                              [ReadPileupNeuralType(), VariantZygosityNeuralType()]

    Returns:
        Instance of class.
    """
    super().__init__()
    # NOTE: the list defaults above are shared between instances; they are
    # only stored here, never mutated.
    self.hdf_file = hdf_file
    self.tensor_keys = tensor_keys
    self.tensor_dtypes = tensor_dtypes
    self.tensor_dims = tensor_dims
    self.tensor_neural_types = tensor_neural_types

    class DatasetWrapper(TorchDataset):
        """A wrapper around Torch dataset class to generate individual samples."""

        def __init__(self, hdf_file, tensor_dtypes, tensor_keys):
            """Constructor for dataset wrapper.

            Args:
                hdf_file : Path to HDF5 file.
                tensor_keys : List with keys of tensors to load.
                tensor_dtypes : torch data types for tensor.

            Returns:
                Instance of class.
            """
            super().__init__()
            self.hdf_file = hdf_file
            self.tensor_dtypes = tensor_dtypes
            self.tensor_keys = tensor_keys
            # The dataset length is the length of the first requested tensor.
            with h5py.File(self.hdf_file, "r") as hdf:
                self.len = len(hdf.get(self.tensor_keys[0]))
            # Created lazily on first item access.
            self._h5_gen = None

        def __len__(self):
            return self.len

        def __getitem__(self, idx):
            # Using generator to keep the file handle to HDF5
            # file open during the life of the process.
            if self._h5_gen is None:
                self._h5_gen = self._get_generator()
                # Prime the generator so it is ready to receive an index.
                next(self._h5_gen)
            return self._h5_gen.send(idx)

        def _get_generator(self):
            # Coroutine-style generator: receives an index through send() and
            # yields the tuple of tensors stored at that index.
            hrecs = {}
            hdf = h5py.File(self.hdf_file, "r")
            for key in hdf.keys():
                hrecs[key] = hdf.get(key)
            idx = yield
            while True:
                outputs = []
                for i, key in enumerate(self.tensor_keys):
                    data = hrecs[key]
                    tensor = torch.tensor(data[idx], dtype=self.tensor_dtypes[i])
                    outputs.append(tensor)
                idx = yield tuple(outputs)

    dataset = DatasetWrapper(self.hdf_file, self.tensor_dtypes, self.tensor_keys)

    sampler = None
    if self._placement == DeviceType.AllGpu:
        # Shard the very dataset the loader below iterates over; this
        # constructor never assigns self._dataset.
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)

    # A sampler and shuffle=True are mutually exclusive in the torch loader,
    # so shuffling is only requested when no sampler is used.
    self.dataloader = TorchDataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle if sampler is None else False,
        num_workers=num_workers,
        pin_memory=True,
        sampler=sampler)
def __init__(self, data_loader_type, hdf_file, batch_size=32, shuffle=True, num_workers=4,
             encoding_dtype=torch.float32, label_dtype=torch.int64,
             hdf_encoding_key="encodings", hdf_label_key="labels"):
    """Constructor for data loader.

    Args:
        data_loader_type : Type of data loader
                           (HDFPileupDataLoader.Type.TRAIN/EVAL/TEST)
        hdf_file : Path to HDF file with pileup encodings
        batch_size : batch size for data loader [32]
        shuffle : shuffle dataset [True]
        num_workers : numbers of parallel data loader threads [4]
        encoding_dtype : Torch data type for encoding [torch.float32]
        label_dtype : Torch data type for label [torch.int64]
        hdf_encoding_key : HDF5 key for encodings. [encodings]
        hdf_label_key : HDF5 key for labels. [labels]

    Returns:
        Instance of class.
    """
    super().__init__()
    self.data_loader_type = data_loader_type
    self.hdf_file = hdf_file

    class DatasetWrapper(TorchDataset):
        """A wrapper around Torch dataset class to generate individual samples."""

        def __init__(self, data_loader_type, hdf_file, encoding_dtype, label_dtype,
                     hdf_encoding_key, hdf_label_key):
            """Constructor for dataset wrapper.

            Args:
                data_loader_type : Type of data loader.
                hdf_file : Path to HDF5 file.
                encoding_dtype : Torch type for encoding.
                label_dtype : Torch type for label.
                hdf_encoding_key : HDF5 key for encodings.
                hdf_label_key : HDF5 key for labels.

            Returns:
                Instance of class.
            """
            super().__init__()
            self.data_loader_type = data_loader_type
            self.hdf_file = hdf_file
            self.encoding_dtype = encoding_dtype
            self.label_dtype = label_dtype
            self.hdf_encoding_key = hdf_encoding_key
            self.hdf_label_key = hdf_label_key

        def __len__(self):
            # Close the file once the length has been read so repeated calls
            # do not accumulate open handles.
            with h5py.File(self.hdf_file, "r") as hdf:
                return len(hdf.get(self.hdf_encoding_key))

        def __getitem__(self, idx):
            # The file is opened per item and closed on exit; the tensors are
            # built inside the block, while the HDF5 datasets are still valid.
            with h5py.File(self.hdf_file, "r") as hdf:
                if self.data_loader_type == HDFPileupDataLoader.Type.TEST:
                    # Test loaders carry no labels.
                    encoding = hdf.get(self.hdf_encoding_key)[idx]
                    return torch.tensor(encoding, dtype=self.encoding_dtype)
                encoding_data = hdf.get(self.hdf_encoding_key)
                label_data = hdf.get(self.hdf_label_key)
                encoding = torch.tensor(encoding_data[idx], dtype=self.encoding_dtype)
                label = torch.tensor(label_data[idx], dtype=self.label_dtype)
                return label, encoding

    dataset = DatasetWrapper(data_loader_type, self.hdf_file, encoding_dtype, label_dtype,
                             hdf_encoding_key, hdf_label_key)
    self.dataloader = TorchDataLoader(dataset,
                                      batch_size=batch_size,
                                      shuffle=shuffle,
                                      num_workers=num_workers)