def zscore_video(videofile: Union[str, os.PathLike], project_config: dict, stride: int = 10):
    """Calculates channel-wise mean and standard deviation for the input video.

    Calculates mean and std deviation independently for each input video channel. Grayscale videos are
    converted to RGB. Saves statistics to the augs/normalization dictionary in project_config. Only
    takes every STRIDE-th frame for speed. Calculates mean and std deviation incrementally so that
    thousands of frames are never loaded into memory at once:
    https://notmatthancock.github.io/2017/03/23/simple-batch-stat-updates.html

    Args:
        videofile: path to video file. Must be one of the inputs accepted by file_io.VideoReader:
            avi, mp4, jpg directory, or hdf5
        project_config: dictionary for your deepethogram project. Contains augs/normalization field
        stride: only every STRIDE-th frame will be used. Use stride=1 for the full video
    """
    assert os.path.exists(videofile)
    assert projects.is_deg_file(videofile)

    log.info('zscoring file: {}'.format(videofile))
    imdata = get_video_statistics(videofile, stride)

    fname = os.path.join(os.path.dirname(videofile), 'stats.yaml')
    dictionary = {}
    if os.path.isfile(fname):
        dictionary = utils.load_yaml(fname)
    dictionary['normalization'] = imdata
    utils.save_dict_to_yaml(dictionary, fname)
    update_project_with_normalization(imdata, project_config)
def update_project_with_normalization(norm_dict: dict, project_config: dict):
    """Adds statistics from this video to the overall mean / std deviation for the project."""
    if 'normalization' not in project_config['augs'].keys():
        raise ValueError('Must have project_config/augs/normalization field: {}'.format(project_config))

    old_rgb = project_config['augs']['normalization']
    if old_rgb is not None and old_rgb['N'] is not None and old_rgb['mean'] is not None:
        old_mean_total = old_rgb['N'] * np.array(old_rgb['mean'])
        old_std_total = old_rgb['N'] * np.array(old_rgb['std'])
        old_N = old_rgb['N']
    else:
        old_mean_total = 0
        old_std_total = 0
        old_N = 0

    new_n = old_N + norm_dict['N']
    # running averages of the per-video statistics, weighted by the number of observations in each video.
    # note that averaging standard deviations this way is an approximation of the exact pooled std
    new_mean = (old_mean_total + norm_dict['N'] * np.array(norm_dict['mean'])) / new_n
    new_std = (old_std_total + norm_dict['N'] * np.array(norm_dict['std'])) / new_n

    project_config['augs']['normalization'] = {'N': new_n, 'mean': new_mean.tolist(), 'std': new_std.tolist()}
    utils.save_dict_to_yaml(project_config,
                            os.path.join(project_config['project']['path'], 'project_config.yaml'))
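# Hedged worked example (not part of the library): shows how the weighted update above combines
# per-video statistics. With two videos contributing 100 and 300 sampled pixels, the project mean
# becomes the observation-weighted average of the per-video means; the std is combined the same
# way, which approximates (rather than exactly equals) the pooled standard deviation.
def _example_combine_normalization_stats():
    import numpy as np

    video_a = {'N': 100, 'mean': [0.45, 0.44, 0.40], 'std': [0.20, 0.21, 0.19]}
    video_b = {'N': 300, 'mean': [0.55, 0.52, 0.50], 'std': [0.24, 0.25, 0.23]}

    new_n = video_a['N'] + video_b['N']
    new_mean = (video_a['N'] * np.array(video_a['mean']) + video_b['N'] * np.array(video_b['mean'])) / new_n
    new_std = (video_a['N'] * np.array(video_a['std']) + video_b['N'] * np.array(video_b['std'])) / new_n
    # new_mean -> [0.525, 0.5, 0.475]: each channel is weighted 1:3 toward video_b
    return {'N': new_n, 'mean': new_mean.tolist(), 'std': new_std.tolist()}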
def train_from_cfg(cfg: DictConfig) -> Type[nn.Module]:
    rundir = os.getcwd()  # set by hydra
    device = torch.device("cuda:" + str(cfg.compute.gpu_id) if torch.cuda.is_available() else "cpu")
    if device.type != 'cpu':
        torch.cuda.set_device(device)
    log.info('Training sequence model...')

    dataloaders = get_dataloaders_from_cfg(cfg, model_type='sequence')
    utils.save_dict_to_yaml(dataloaders['split'], os.path.join(rundir, 'split.yaml'))
    log.debug('Num training batches {}, num val: {}'.format(len(dataloaders['train']), len(dataloaders['val'])))

    model = build_model_from_cfg(cfg,
                                 dataloaders['num_features'],
                                 dataloaders['num_classes'],
                                 pos=dataloaders['pos'],
                                 neg=dataloaders['neg'])
    weights = projects.get_weightfile_from_cfg(cfg, model_type='sequence')
    if weights is not None:
        model = utils.load_weights(model, weights)
    model = model.to(device)
    log.info('Total trainable params: {:,}'.format(utils.get_num_parameters(model)))

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.train.lr)
    torch.save(model, os.path.join(rundir, cfg.sequence.arch + '_definition.pt'))
    stopper = get_stopper(cfg)
    scheduler = initialize_scheduler(optimizer, cfg, mode='max', reduction_factor=cfg.train.reduction_factor)
    metrics = get_metrics(rundir,
                          num_classes=len(cfg.project.class_names),
                          num_parameters=utils.get_num_parameters(model),
                          key_metric='f1')
    criterion = get_criterion(cfg.feature_extractor.final_activation, dataloaders, device)
    steps_per_epoch = dict(cfg.train.steps_per_epoch)

    model = train(model,
                  dataloaders,
                  criterion,
                  optimizer,
                  metrics,
                  scheduler,
                  rundir,
                  stopper,
                  device,
                  steps_per_epoch,
                  final_activation=cfg.feature_extractor.final_activation,
                  sequence=True,
                  normalizer=None)
    return model
def get_split_from_records(records: dict,
                           datadir: Union[str, bytes, os.PathLike],
                           splitfile: Union[str, bytes, os.PathLike] = None,
                           supervised: bool = True,
                           reload_split: bool = True,
                           valid_splits_only: bool = True,
                           train_val_test: list = [0.7, 0.15, 0.15]):
    """Splits the records into train, validation, and test splits.

    Parameters
    ----------
    records: dict of dicts
        E.g. {'animal': {'rgb': path/to/video.mp4, 'label': path/to/label.csv}, 'animal2': ...}
    datadir: str, os.PathLike
        absolute path to the base directory containing data. Only used to save the split
    splitfile: str, os.PathLike
        absolute path to a file containing a pre-made split to load. If None, make a new one from scratch
    supervised: bool
        if True, enables the option to use the valid split function
    reload_split: bool
        if True, tries to load the file in splitfile
    valid_splits_only: bool
        if True and supervised is True, make sure each split has at least 1 instance of each class
    train_val_test: list
        fractions / Ns in each split. See train_val_test_split

    Returns
    -------
    split_dictionary: dict
        see train_val_test_split
    """
    if splitfile is None:
        splitfile = os.path.join(datadir, 'split.yaml')
    else:
        assert os.path.isfile(splitfile), 'split file does not exist! {}'.format(splitfile)

    if supervised and valid_splits_only:
        # this function makes sure that each split has all classes in the dataset
        split_func = get_valid_split
    else:
        split_func = train_val_test_split

    if reload_split and os.path.isfile(splitfile):
        split_dictionary = utils.load_yaml(splitfile)
        if split_dictionary is None:
            # some malformatting
            split_dictionary = split_func(records, train_val_test)
        # if there are new records, e.g. new records were added to an old splitfile,
        # assign them to train, val, or test
        split_dictionary = update_split(records, split_dictionary)
    else:
        split_dictionary = split_func(records, train_val_test)

    utils.save_dict_to_yaml(split_dictionary, splitfile)
    return split_dictionary
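# Hedged usage sketch (the video names and paths are made up): records maps each video key to its
# component files; get_split_from_records either reloads datadir/split.yaml or builds a new split
# and saves it there. The returned dictionary holds one list of video keys per split plus metadata,
# as used by prepare_thumos14_for_testing below.
def _example_get_split_from_records():
    records = {
        'mouse000': {'rgb': '/data/project/DATA/mouse000/mouse000.mp4',
                     'label': '/data/project/DATA/mouse000/mouse000_labels.csv'},
        'mouse001': {'rgb': '/data/project/DATA/mouse001/mouse001.mp4',
                     'label': '/data/project/DATA/mouse001/mouse001_labels.csv'},
    }
    # supervised=False skips the class-balance check, so no label files need to be read
    split = get_split_from_records(records,
                                   datadir='/data/project/DATA',
                                   supervised=False,
                                   train_val_test=[0.8, 0.2, 0.0])
    # e.g. split['train'] -> ['mouse000'], split['val'] -> ['mouse001'], split['test'] -> []
    return split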
def flow_generator_train(cfg: DictConfig) -> nn.Module:
    """Trains flow generator models from a configuration.

    Parameters
    ----------
    cfg : DictConfig
        Configuration, e.g. that returned by deepethogram.configuration.make_flow_generator_train_cfg

    Returns
    -------
    nn.Module
        Trained flow generator
    """
    cfg = projects.setup_run(cfg)
    log.info('args: {}'.format(' '.join(sys.argv)))

    # allow for editing
    OmegaConf.set_struct(cfg, False)
    # SHOULD NEVER MODIFY / MAKE ASSIGNMENTS TO THE CFG OBJECT AFTER RIGHT HERE!
    log.info('configuration used ~~~~~')
    log.info(OmegaConf.to_yaml(cfg))

    datasets, data_info = get_datasets_from_cfg(cfg, 'flow_generator', input_images=cfg.flow_generator.n_rgb)
    flow_generator = build_model_from_cfg(cfg)
    log.info('Total trainable params: {:,}'.format(utils.get_num_parameters(flow_generator)))
    utils.save_dict_to_yaml(data_info['split'], os.path.join(os.getcwd(), 'split.yaml'))

    flow_weights = deepethogram.projects.get_weightfile_from_cfg(cfg, 'flow_generator')
    if flow_weights is not None:
        print('reloading weights...')
        flow_generator = utils.load_weights(flow_generator, flow_weights, device='cpu')

    stopper = get_stopper(cfg)
    metrics = get_metrics(cfg, os.getcwd(), utils.get_num_parameters(flow_generator))
    lightning_module = OpticalFlowLightning(flow_generator, cfg, datasets, metrics, viz.visualize_logger_optical_flow)
    trainer = get_trainer_from_cfg(cfg, lightning_module, stopper)
    trainer.fit(lightning_module)
    return flow_generator
def sequence_train(cfg: DictConfig) -> nn.Module:
    """Trains sequence models from a configuration.

    Parameters
    ----------
    cfg : DictConfig
        Configuration, e.g. that returned by deepethogram.configuration.make_sequence_train_cfg

    Returns
    -------
    nn.Module
        Trained sequence model
    """
    cfg = projects.setup_run(cfg)
    log.info('args: {}'.format(' '.join(sys.argv)))

    if cfg.sequence.latent_name is None:
        cfg.sequence.latent_name = cfg.feature_extractor.arch
    # allow for editing
    OmegaConf.set_struct(cfg, False)
    log.info('Configuration used: ')
    log.info(OmegaConf.to_yaml(cfg))

    datasets, data_info = get_datasets_from_cfg(cfg, 'sequence')
    utils.save_dict_to_yaml(data_info['split'], os.path.join(os.getcwd(), 'split.yaml'))
    model = build_model_from_cfg(cfg,
                                 data_info['num_features'],
                                 data_info['num_classes'],
                                 pos=data_info['pos'],
                                 neg=data_info['neg'])
    weights = projects.get_weightfile_from_cfg(cfg, model_type='sequence')
    if weights is not None:
        model = utils.load_weights(model, weights)
    log.debug('model arch: {}'.format(model))
    log.info('Total trainable params: {:,}'.format(utils.get_num_parameters(model)))

    stopper = get_stopper(cfg)
    metrics = get_metrics(os.getcwd(),
                          data_info['num_classes'],
                          num_parameters=utils.get_num_parameters(model),
                          key_metric='f1_class_mean',
                          num_workers=cfg.compute.metrics_workers)
    criterion = get_criterion(cfg, model, data_info)
    lightning_module = SequenceLightning(model, cfg, datasets, metrics, criterion)
    # change auto batch size parameters because large sequences can overflow RAM
    trainer = get_trainer_from_cfg(cfg, lightning_module, stopper)
    trainer.fit(lightning_module)
    return model
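# Hedged usage sketch: sequence_train consumes an OmegaConf DictConfig. The helper named in the
# docstring (deepethogram.configuration.make_sequence_train_cfg) is assumed here to accept the
# project path and return a full configuration; the import location, its signature, and the lr
# override below are illustrative assumptions, not confirmed API.
def _example_sequence_train(project_path: str) -> nn.Module:
    from deepethogram import configuration  # assumed import path, per the docstring above

    cfg = configuration.make_sequence_train_cfg(project_path=project_path)  # assumed signature
    cfg.train.lr = 1e-4  # hypothetical override; cfg.train.lr is read by the training code in this file
    return sequence_train(cfg)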
def train_from_cfg(cfg: DictConfig) -> Type[nn.Module]:
    device = torch.device("cuda:" + str(cfg.compute.gpu_id) if torch.cuda.is_available() else "cpu")
    if device.type != 'cpu':
        torch.cuda.set_device(device)
    log.info('Training flow generator....')

    dataloaders = get_dataloaders_from_cfg(cfg, model_type='flow_generator', input_images=cfg.flow_generator.n_rgb)
    log.info('Num training batches {}, num val: {}'.format(len(dataloaders['train']), len(dataloaders['val'])))

    flow_generator = build_model_from_cfg(cfg)
    flow_generator = flow_generator.to(device)
    log.info('Total trainable params: {:,}'.format(utils.get_num_parameters(flow_generator)))

    rundir = os.getcwd()  # this is configured by hydra
    # save model definition
    torch.save(flow_generator, os.path.join(rundir, cfg.flow_generator.arch + '_definition.pt'))
    utils.save_dict_to_yaml(dataloaders['split'], os.path.join(rundir, 'split.yaml'))

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, flow_generator.parameters()), lr=cfg.train.lr)

    flow_weights = deepethogram.projects.get_weightfile_from_cfg(cfg, 'flow_generator')
    if flow_weights is not None:
        print('reloading weights...')
        flow_generator = utils.load_weights(flow_generator, flow_weights, device=device)

    stopper = get_stopper(cfg)
    scheduler = initialize_scheduler(optimizer, cfg, mode='min')

    if cfg.flow_generator.loss == 'MotionNet':
        criterion = MotionNetLoss(flow_sparsity=cfg.flow_generator.flow_sparsity,
                                  sparsity_weight=cfg.flow_generator.sparsity_weight,
                                  smooth_weight_multiplier=cfg.flow_generator.smooth_weight_multiplier)
    else:
        raise NotImplementedError

    metrics = get_metrics(cfg, rundir, utils.get_num_parameters(flow_generator))
    reconstructor = Reconstructor(cfg)
    steps_per_epoch = cfg.train.steps_per_epoch
    if cfg.compute.fp16:
        assert torch_amp, 'must install torch 1.6 or greater to use FP16 training'
    flow_generator = train(flow_generator,
                           dataloaders,
                           criterion,
                           optimizer,
                           metrics,
                           scheduler,
                           reconstructor,
                           rundir,
                           stopper,
                           device,
                           num_epochs=cfg.train.num_epochs,
                           steps_per_epoch=steps_per_epoch['train'],
                           steps_per_validation_epoch=steps_per_epoch['val'],
                           steps_per_test_epoch=steps_per_epoch['test'],
                           max_flow=cfg.flow_generator.max,
                           dali=cfg.compute.dali,
                           fp16=cfg.compute.fp16)
    return flow_generator
def prepare_thumos14_for_testing(datadir):
    testing_directory = setup_testing_directory(datadir)

    annotation_paths = {
        'val': os.path.join(testing_directory, 'annotation'),
        'test': os.path.join(testing_directory, 'TH14_Temporal_Annotations_Test', 'annotations', 'annotation')
    }
    video_paths = {
        'val': os.path.join(testing_directory, 'validation'),
        'test': os.path.join(testing_directory, 'TH14_test_set_mp4')
    }
    n_videos = {'val': 200, 'test': 212}

    # get rid of any files that shouldn't be there
    clean_deg_directory(testing_directory)

    # check if the dataset is already prepared
    if check_if_videos_exist(video_paths, n_videos) and check_if_annotations_exist(annotation_paths):
        pass
    else:
        # download if the zip files don't exist
        download_thumos14(testing_directory)
        # unzip
        zipfiles = get_zipfiles(testing_directory)
        for filetype in ['labels', 'videos']:
            for split in ['val', 'test']:
                # you might need to enter a password here
                unzip_file(zipfiles[filetype][split], testing_directory)

    for split in ['val', 'test']:
        assert os.path.isdir(annotation_paths[split])
        assert os.path.isdir(video_paths[split])

    videos = {}
    for split in ['val', 'test']:
        videos[split] = glob.glob(video_paths[split] + '/*.mp4')
        videos[split].sort()

    # wrangle label files: convert temporal annotations into per-frame DeepEthogram label CSVs
    labelfiles = {}
    for split in ['val', 'test']:
        videofiles = {os.path.basename(i)[:-4]: i for i in videos[split]}
        label_dir = os.path.join(testing_directory, split + '_deg_labels')
        if not os.path.isdir(label_dir):
            os.makedirs(label_dir)
        annotations = read_annotations(annotation_paths[split])
        per_video = convert_per_behavior_to_per_video(annotations)

        labelfiles[split] = {}
        for video_name, label_dict in per_video.items():
            label = convert_label_to_deg(videofiles, video_name, label_dict)
            df = pd.DataFrame(label, columns=behaviors)
            outfile = os.path.join(label_dir, video_name + '_labels.csv')
            df.to_csv(outfile)
            labelfiles[split][video_name] = outfile

    # make the split; ensure we have at least one instance of every behavior in the validation set
    split = [0.8, 0.2, 0.0]
    np.random.seed(42)
    while True:
        split_dict = train_val_test_split(labelfiles['val'], split)
        n_labels = get_val_n_labels(split_dict, labelfiles)
        if not np.any(n_labels == 0):
            break
        else:
            print('invalid split, retrying...')
    # set to None as a reminder that the test split is provided by the dataset itself
    split_dict['metadata']['split'][2] = None
    split_dict['test'] = list(labelfiles['test'].keys())
    splitfile = os.path.join(testing_directory, 'DATA', 'split.yaml')
    utils.save_dict_to_yaml(split_dict, splitfile)

    # add videos and labels to the DeepEthogram project
    project_config = projects.initialize_project(testing_directory, 'thumos14', behaviors, make_subdirectory=False)
    for split in ['val', 'test']:
        videofiles = {os.path.basename(i)[:-4]: i for i in videos[split]}
        for video_name, labelfile in labelfiles[split].items():
            videofile = videofiles[video_name]
            new_path = projects.add_video_to_project(project_config, videofile, mode='symlink')
            projects.add_label_to_project(labelfile, new_path)
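# Hedged sketch (the real convert_label_to_deg is defined elsewhere; this stand-in only illustrates
# the idea): THUMOS'14 temporal annotations give (start_sec, end_sec) intervals per behavior, while
# DeepEthogram label files have one binary column per behavior for every frame. Converting at a
# known frame rate might look like this; names, parameters, and the fps handling are assumptions.
def _example_temporal_to_per_frame(label_dict: dict, n_frames: int, fps: float, behaviors: list):
    """label_dict maps behavior name -> list of (start_sec, end_sec) tuples."""
    import numpy as np

    label = np.zeros((n_frames, len(behaviors)), dtype=np.int64)
    for column, behavior in enumerate(behaviors):
        for start_sec, end_sec in label_dict.get(behavior, []):
            start_frame = int(round(start_sec * fps))
            end_frame = min(int(round(end_sec * fps)), n_frames)
            label[start_frame:end_frame, column] = 1
    # note: DeepEthogram label files typically reserve column 0 for 'background'; a real conversion
    # would also mark frames with no annotated behavior as background
    return label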
def train_from_cfg(cfg: DictConfig) -> Type[nn.Module]:
    """Trains DeepEthogram feature extractors from a configuration object.

    Args:
        cfg (DictConfig): configuration object generated by Hydra

    Returns:
        trained feature extractor
    """
    rundir = os.getcwd()  # set by hydra

    device = torch.device("cuda:" + str(cfg.compute.gpu_id) if torch.cuda.is_available() else "cpu")
    if device.type != 'cpu':
        torch.cuda.set_device(device)

    flow_generator = build_flow_generator(cfg)
    flow_weights = get_weightfile_from_cfg(cfg, 'flow_generator')
    assert flow_weights is not None, ('Must have a valid weightfile for flow generator. Use '
                                      'deepethogram.flow_generator.train or cfg.reload.latest')
    log.info('loading flow generator from file {}'.format(flow_weights))
    flow_generator = utils.load_weights(flow_generator, flow_weights, device=device)
    flow_generator = flow_generator.to(device)

    dataloaders = get_dataloaders_from_cfg(cfg,
                                           model_type='feature_extractor',
                                           input_images=cfg.feature_extractor.n_flows + 1)
    spatial_classifier, flow_classifier = build_model_from_cfg(cfg,
                                                               return_components=True,
                                                               pos=dataloaders['pos'],
                                                               neg=dataloaders['neg'])
    spatial_classifier = spatial_classifier.to(device)
    flow_classifier = flow_classifier.to(device)
    num_classes = len(cfg.project.class_names)

    utils.save_dict_to_yaml(dataloaders['split'], os.path.join(rundir, 'split.yaml'))

    criterion = get_criterion(cfg.feature_extractor.final_activation, dataloaders, device)
    steps_per_epoch = dict(cfg.train.steps_per_epoch)
    metrics = get_metrics(rundir,
                          num_classes=num_classes,
                          num_parameters=utils.get_num_parameters(spatial_classifier))
    dali = cfg.compute.dali

    # Training in a curriculum goes as follows:
    # first, we train the spatial classifier, which takes still images as input.
    # Second, we train the flow classifier, which generates optic flow with the flow_generator model
    # and then classifies it. Third, we train the whole thing end to end.
    # Without the curriculum, we just train end to end from the start.
    if cfg.feature_extractor.curriculum:
        # train spatial model, then flow model, then both end-to-end
        del dataloaders
        dataloaders = get_dataloaders_from_cfg(cfg,
                                               model_type='feature_extractor',
                                               input_images=cfg.feature_extractor.n_rgb)
        log.info('Num training batches {}, num val: {}'.format(len(dataloaders['train']), len(dataloaders['val'])))

        # we'll use this to visualize our data, because it is loaded z-scored. we want it in the range
        # [0, 1] or [0, 255] for visualization, and for that we need to know the mean and std
        normalizer = get_normalizer(cfg, input_images=cfg.feature_extractor.n_rgb)

        optimizer = optim.Adam(filter(lambda p: p.requires_grad, spatial_classifier.parameters()),
                               lr=cfg.train.lr,
                               weight_decay=cfg.feature_extractor.weight_decay)
        spatialdir = os.path.join(rundir, 'spatial')
        if not os.path.isdir(spatialdir):
            os.makedirs(spatialdir)
        stopper = get_stopper(cfg)
        # we're using validation loss as our key metric
        scheduler = initialize_scheduler(optimizer, cfg, mode='min', reduction_factor=cfg.train.reduction_factor)
        log.info('key metric: {}'.format(metrics.key_metric))
        spatial_classifier = train(spatial_classifier,
                                   dataloaders,
                                   criterion,
                                   optimizer,
                                   metrics,
                                   scheduler,
                                   spatialdir,
                                   stopper,
                                   device,
                                   steps_per_epoch,
                                   final_activation=cfg.feature_extractor.final_activation,
                                   sequence=False,
                                   normalizer=normalizer,
                                   dali=dali)

        log.info('Training flow stream....')
        input_images = cfg.feature_extractor.n_flows + 1
        del dataloaders
        dataloaders = get_dataloaders_from_cfg(cfg, model_type='feature_extractor', input_images=input_images)
        normalizer = get_normalizer(cfg, input_images=input_images)
        log.info('Num training batches {}, num val: {}'.format(len(dataloaders['train']), len(dataloaders['val'])))

        flowdir = os.path.join(rundir, 'flow')
        if not os.path.isdir(flowdir):
            os.makedirs(flowdir)

        flow_generator_and_classifier = FlowOnlyClassifier(flow_generator, flow_classifier).to(device)
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, flow_classifier.parameters()),
                               lr=cfg.train.lr,
                               weight_decay=cfg.feature_extractor.weight_decay)
        stopper = get_stopper(cfg)
        # we're using validation loss as our key metric
        scheduler = initialize_scheduler(optimizer, cfg, mode='min', reduction_factor=cfg.train.reduction_factor)
        flow_generator_and_classifier = train(flow_generator_and_classifier,
                                              dataloaders,
                                              criterion,
                                              optimizer,
                                              metrics,
                                              scheduler,
                                              flowdir,
                                              stopper,
                                              device,
                                              steps_per_epoch,
                                              final_activation=cfg.feature_extractor.final_activation,
                                              sequence=False,
                                              normalizer=normalizer,
                                              dali=dali)
        flow_classifier = flow_generator_and_classifier.flow_classifier
        # overwrite checkpoint
        utils.checkpoint(flow_classifier, flowdir, stopper.epoch_counter)

    model = HiddenTwoStream(flow_generator,
                            spatial_classifier,
                            flow_classifier,
                            cfg.feature_extractor.arch,
                            fusion_style=cfg.feature_extractor.fusion,
                            num_classes=num_classes).to(device)
    # setting the mode to end-to-end would allow backprop of gradients into the flow generator itself.
    # the paper does this, but I don't expect that users would have enough data for this to make sense
    model.set_mode('classifier')
    log.info('Training end to end...')
    input_images = cfg.feature_extractor.n_flows + 1
    dataloaders = get_dataloaders_from_cfg(cfg, model_type='feature_extractor', input_images=input_images)
    normalizer = get_normalizer(cfg, input_images=input_images)

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=cfg.train.lr,
                           weight_decay=cfg.feature_extractor.weight_decay)
    stopper = get_stopper(cfg)
    # we're using validation loss as our key metric
    scheduler = initialize_scheduler(optimizer, cfg, mode='min', reduction_factor=cfg.train.reduction_factor)
    log.info('Total trainable params: {:,}'.format(utils.get_num_parameters(model)))

    model = train(model,
                  dataloaders,
                  criterion,
                  optimizer,
                  metrics,
                  scheduler,
                  rundir,
                  stopper,
                  device,
                  steps_per_epoch,
                  final_activation=cfg.feature_extractor.final_activation,
                  sequence=False,
                  normalizer=normalizer,
                  dali=dali)

    utils.save_hidden_two_stream(model, rundir, dict(cfg), stopper.epoch_counter)
    return model
def zscore_video(videofile: Union[str, os.PathLike], project_config: dict, stride: int = 10):
    """Calculates channel-wise mean and standard deviation for the input video.

    Calculates mean and std deviation independently for each input video channel. Grayscale videos are
    converted to RGB. Saves statistics to the augs/normalization dictionary in project_config. Only
    takes every STRIDE-th frame for speed. Calculates mean and std deviation incrementally so that
    thousands of frames are never loaded into memory at once:
    https://notmatthancock.github.io/2017/03/23/simple-batch-stat-updates.html

    Args:
        videofile: path to video file. Must be one of the inputs accepted by file_io.VideoReader:
            avi, mp4, jpg directory, or hdf5
        project_config: dictionary for your deepethogram project. Contains augs/normalization field
        stride: only every STRIDE-th frame will be used. Use stride=1 for the full video
    """
    assert os.path.isfile(videofile)
    assert projects.is_deg_file(videofile)

    image_stats = StatsRecorder()

    log.info('zscoring file: {}'.format(videofile))
    with deepethogram.file_io.VideoReader(videofile) as reader:
        log.debug('N frames: {}'.format(len(reader)))
        for i in tqdm(range(0, len(reader), stride)):
            image = reader[i]
            # scale to [0, 1] and reshape to (n_pixels, n_channels) for channel-wise statistics
            image = image.astype(float) / 255
            image = image.transpose(2, 1, 0)
            image = image.reshape(3, -1).transpose(1, 0)
            image_stats.update(image)

    log.info('final stats: {}'.format(image_stats))
    imdata = {'mean': image_stats.mean, 'std': image_stats.std, 'N': image_stats.nobservations}
    # convert to plain Python types so the statistics can be written to yaml
    for k, v in imdata.items():
        if isinstance(v, torch.Tensor):
            v = v.detach().cpu().numpy()
        if isinstance(v, np.ndarray):
            v = v.tolist()
        imdata[k] = v

    fname = os.path.join(os.path.dirname(videofile), 'stats.yaml')
    dictionary = {}
    if os.path.isfile(fname):
        dictionary = utils.load_yaml(fname)
    dictionary['normalization'] = imdata
    utils.save_dict_to_yaml(dictionary, fname)
    update_project_with_normalization(imdata, project_config)
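# Hedged sketch (the StatsRecorder used above is imported from elsewhere; this is a minimal stand-in
# following the batch-update formulas from the blog post linked in the docstring). It accumulates
# channel-wise mean and std without holding all frames in memory: each batch's statistics are folded
# into the running ones, with a correction term for the shift between the old and new means.
class _ExampleStatsRecorder:
    def __init__(self):
        self.nobservations = 0
        self.mean = None
        self.std = None

    def update(self, data):
        """data: numpy array of shape (n_observations, n_channels)"""
        import numpy as np

        data = np.atleast_2d(data)
        batch_mean = data.mean(axis=0)
        batch_std = data.std(axis=0)
        m = data.shape[0]
        if self.nobservations == 0:
            self.mean, self.std, self.nobservations = batch_mean, batch_std, m
            return
        n = self.nobservations
        total = n + m
        combined_mean = (n * self.mean + m * batch_mean) / total
        # exact pooled variance: weighted average of variances plus a mean-shift correction
        combined_var = (n * self.std ** 2 + m * batch_std ** 2) / total \
            + n * m * (self.mean - batch_mean) ** 2 / total ** 2
        self.mean, self.std, self.nobservations = combined_mean, np.sqrt(combined_var), total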
def feature_extractor_train(cfg: DictConfig) -> nn.Module:
    """Trains feature extractor models from a configuration.

    Parameters
    ----------
    cfg : DictConfig
        Configuration, e.g. that returned by deepethogram.configuration.make_feature_extractor_train_cfg

    Returns
    -------
    nn.Module
        Trained feature extractor
    """
    cfg = projects.setup_run(cfg)
    log.info('args: {}'.format(' '.join(sys.argv)))

    # allow for editing
    OmegaConf.set_struct(cfg, False)
    # SHOULD NEVER MODIFY / MAKE ASSIGNMENTS TO THE CFG OBJECT AFTER RIGHT HERE!
    log.info('configuration used ~~~~~')
    log.info(OmegaConf.to_yaml(cfg))

    # we build the flow generator independently because you might want to load it from a different location
    flow_generator = build_flow_generator(cfg)
    flow_weights = projects.get_weightfile_from_cfg(cfg, 'flow_generator')
    assert flow_weights is not None, ('Must have a valid weightfile for flow generator. Use '
                                      'deepethogram.flow_generator.train or cfg.reload.latest')
    log.info('loading flow generator from file {}'.format(flow_weights))
    flow_generator = utils.load_weights(flow_generator, flow_weights)

    _, data_info = get_datasets_from_cfg(cfg,
                                         model_type='feature_extractor',
                                         input_images=cfg.feature_extractor.n_flows + 1)
    model_parts = build_model_from_cfg(cfg, pos=data_info['pos'], neg=data_info['neg'])
    _, spatial_classifier, flow_classifier, fusion, model = model_parts

    num_classes = len(cfg.project.class_names)
    utils.save_dict_to_yaml(data_info['split'], os.path.join(cfg.run.dir, 'split.yaml'))

    metrics = get_metrics(cfg.run.dir,
                          num_classes=num_classes,
                          num_parameters=utils.get_num_parameters(spatial_classifier),
                          key_metric='f1_class_mean_nobg',
                          num_workers=cfg.compute.metrics_workers)

    # cfg.compute.batch_size and cfg.train.lr may be changed by the automatic tuners when set to 'auto'.
    # store the originals here so each step of the curriculum can be auto-tuned from scratch.
    # https://pytorch-lightning.readthedocs.io/en/latest/lr_finder.html
    original_batch_size = cfg.compute.batch_size
    original_lr = cfg.train.lr

    # Training in a curriculum goes as follows:
    # first, we train the spatial classifier, which takes still images as input.
    # Second, we train the flow classifier, which generates optic flow with the flow_generator model
    # and then classifies it. Third, we train the whole thing end to end.
    # Without the curriculum, we just train end to end from the start.
    if cfg.feature_extractor.curriculum:
        # train spatial model, then flow model, then both end-to-end
        datasets, data_info = get_datasets_from_cfg(cfg,
                                                    model_type='feature_extractor',
                                                    input_images=cfg.feature_extractor.n_rgb)
        stopper = get_stopper(cfg)
        criterion = get_criterion(cfg, spatial_classifier, data_info)
        lightning_module = HiddenTwoStreamLightning(spatial_classifier, cfg, datasets, metrics, criterion)
        trainer = get_trainer_from_cfg(cfg, lightning_module, stopper)
        trainer.fit(lightning_module)

        # free RAM. note: this doesn't do much
        log.info('free ram')
        del datasets, lightning_module, trainer, stopper, data_info
        torch.cuda.empty_cache()
        gc.collect()

        datasets, data_info = get_datasets_from_cfg(cfg,
                                                    model_type='feature_extractor',
                                                    input_images=cfg.feature_extractor.n_flows + 1)
        # re-initialize the stopper so that it doesn't think we need to stop due to the previous model
        stopper = get_stopper(cfg)
        cfg.compute.batch_size = original_batch_size
        cfg.train.lr = original_lr

        # this class will freeze the flow generator
        flow_generator_and_classifier = FlowOnlyClassifier(flow_generator, flow_classifier)
        criterion = get_criterion(cfg, flow_generator_and_classifier, data_info)
        lightning_module = HiddenTwoStreamLightning(flow_generator_and_classifier, cfg, datasets, metrics, criterion)
        trainer = get_trainer_from_cfg(cfg, lightning_module, stopper)
        trainer.fit(lightning_module)

        del datasets, lightning_module, trainer, stopper, data_info
        torch.cuda.empty_cache()
        gc.collect()

    model = HiddenTwoStream(flow_generator, spatial_classifier, flow_classifier, fusion, cfg.feature_extractor.arch)
    model.set_mode('classifier')

    datasets, data_info = get_datasets_from_cfg(cfg,
                                                model_type='feature_extractor',
                                                input_images=cfg.feature_extractor.n_flows + 1)
    criterion = get_criterion(cfg, model, data_info)
    stopper = get_stopper(cfg)
    cfg.compute.batch_size = original_batch_size
    cfg.train.lr = original_lr

    lightning_module = HiddenTwoStreamLightning(model, cfg, datasets, metrics, criterion)
    trainer = get_trainer_from_cfg(cfg, lightning_module, stopper)
    trainer.fit(lightning_module)
    return model