def build_model_from_cfg(
        cfg: DictConfig,
        return_components: bool = False,
        pos: np.ndarray = None,
        neg: np.ndarray = None) -> Union[Type[nn.Module], tuple]:
    """
    Assembles the hidden two-stream feature extractor described by a configuration object.

    The spatial (RGB) and flow classifiers are created with `get_cnn`. When a feature extractor
    weight file is found for `cfg`, the matching component is restored into each of them.

    Parameters
    ----------
    cfg: DictConfig
        configuration, e.g. from Hydra command line
    return_components: bool
        if True, returns spatial classifier and flow classifier individually, before any
        flow generator is built or loaded
    pos: np.ndarray
        Number of positive examples in dataset. Used for initializing biases in final layer
    neg: np.ndarray
        Number of negative examples in dataset. Used for initializing biases in final layer

    Returns
    -------
    if `return_components`:
        spatial_classifier, flow_classifier: nn.Module, nn.Module
            cnns for classifying rgb images and optic flows
    else:
        hidden two stream model: nn.Module
            hidden two stream CNN, with its mode set to 'classifier'

    Raises
    ------
    AssertionError
        if the architecture is 'resnet3d_34' and no feature extractor weights were found, or if
        the full model is requested and no flow generator weights were found
    """
    if torch.cuda.is_available():
        device = torch.device("cuda:" + str(cfg.compute.gpu_id))
    else:
        device = torch.device("cpu")

    weight_file = get_weightfile_from_cfg(cfg, 'feature_extractor')
    n_classes = len(cfg.project.class_names)
    # `pos` / `neg` are supplied by the caller; no dataloaders are built in this function

    # 3d architectures get a fixed channel count; 2d ones stack the frames along channels
    if '3d' not in cfg.feature_extractor.arch:
        rgb_channels = cfg.feature_extractor.n_rgb * 3
    else:
        rgb_channels = 3

    # `reload_imagenet` is only requested when there is no checkpoint to restore from
    use_imagenet = weight_file is None
    if cfg.feature_extractor.arch == 'resnet3d_34':
        assert weight_file is not None, 'Must specify path to resnet3d weights!'

    def _build_stream(channels, component):
        # one classifier stream: build the CNN, then restore `component` from the weight file
        net = get_cnn(cfg.feature_extractor.arch,
                      in_channels=channels,
                      dropout_p=cfg.feature_extractor.dropout_p,
                      num_classes=n_classes,
                      reload_imagenet=use_imagenet,
                      pos=pos,
                      neg=neg)
        if weight_file is None:
            return net
        return utils.load_feature_extractor_components(net,
                                                       weight_file,
                                                       component,
                                                       device=device)

    spatial_classifier = _build_stream(rgb_channels, 'spatial')

    if '3d' not in cfg.feature_extractor.arch:
        flow_channels = cfg.feature_extractor.n_flows * 2
    else:
        flow_channels = 2
    flow_classifier = _build_stream(flow_channels, 'flow')

    if return_components:
        return spatial_classifier, flow_classifier

    # the complete model additionally needs a flow generator with trained weights
    flow_generator = build_flow_generator(cfg)
    flow_weights = get_weightfile_from_cfg(cfg, 'flow_generator')
    assert flow_weights is not None, (
        'Must have a valid weightfile for flow generator. Use '
        'deepethogram.flow_generator.train or cfg.reload.latest')
    flow_generator = utils.load_weights(flow_generator, flow_weights, device=device)

    two_stream = HiddenTwoStream(flow_generator,
                                 spatial_classifier,
                                 flow_classifier,
                                 cfg.feature_extractor.arch,
                                 fusion_style=cfg.feature_extractor.fusion,
                                 num_classes=n_classes)
    two_stream.set_mode('classifier')
    return two_stream
def train_from_cfg(cfg: DictConfig) -> Type[nn.Module]:
    """ train DeepEthogram feature extractors from a configuration object.

    A flow generator is built and its weights are loaded from file. With
    `cfg.feature_extractor.curriculum` set, the spatial classifier is trained first, then the
    flow classifier (wrapped together with the flow generator in a `FlowOnlyClassifier`), and
    finally the combined `HiddenTwoStream` model. Without it, only the combined model is trained.

    Args:
        cfg (DictConfig): configuration object generated by Hydra

    Returns:
        trained feature extractor

    Raises:
        AssertionError: if no weight file for the flow generator can be found
    """
    rundir = os.getcwd()  # done by hydra
    device = torch.device(
        "cuda:" + str(cfg.compute.gpu_id) if torch.cuda.is_available() else "cpu")
    # torch.cuda.set_device only accepts CUDA devices; skip it when we fell back to the CPU
    if device.type == 'cuda':
        torch.cuda.set_device(device)

    flow_generator = build_flow_generator(cfg)
    flow_weights = get_weightfile_from_cfg(cfg, 'flow_generator')
    assert flow_weights is not None, (
        'Must have a valid weightfile for flow generator. Use '
        'deepethogram.flow_generator.train or cfg.reload.latest')
    log.info('loading flow generator from file {}'.format(flow_weights))

    flow_generator = utils.load_weights(flow_generator, flow_weights, device=device)
    flow_generator = flow_generator.to(device)

    # these dataloaders provide the pos / neg counts for building the classifiers, and the split
    dataloaders = get_dataloaders_from_cfg(
        cfg,
        model_type='feature_extractor',
        input_images=cfg.feature_extractor.n_flows + 1)

    spatial_classifier, flow_classifier = build_model_from_cfg(
        cfg,
        return_components=True,
        pos=dataloaders['pos'],
        neg=dataloaders['neg'])
    spatial_classifier = spatial_classifier.to(device)
    flow_classifier = flow_classifier.to(device)
    num_classes = len(cfg.project.class_names)

    utils.save_dict_to_yaml(dataloaders['split'], os.path.join(rundir, 'split.yaml'))

    criterion = get_criterion(cfg.feature_extractor.final_activation, dataloaders, device)
    steps_per_epoch = dict(cfg.train.steps_per_epoch)
    metrics = get_metrics(
        rundir,
        num_classes=num_classes,
        num_parameters=utils.get_num_parameters(spatial_classifier))

    dali = cfg.compute.dali

    # training in a curriculum goes as follows:
    # first, we train the spatial classifier, which takes still images as input
    # second, we train the flow classifier, which generates optic flow with the flow_generator model and then classifies
    # it. Thirdly, we will train the whole thing end to end
    # Without the curriculum we just train end to end from the start
    if cfg.feature_extractor.curriculum:
        del dataloaders
        # train spatial model, then flow model, then both end-to-end
        dataloaders = get_dataloaders_from_cfg(
            cfg,
            model_type='feature_extractor',
            input_images=cfg.feature_extractor.n_rgb)
        log.info('Num training batches {}, num val: {}'.format(
            len(dataloaders['train']), len(dataloaders['val'])))
        # we'll use this to visualize our data, because it is loaded z-scored. we want it to be in the range [0-1] or
        # [0-255] for visualization, and for that we need to know mean and std
        normalizer = get_normalizer(cfg, input_images=cfg.feature_extractor.n_rgb)

        optimizer = optim.Adam(filter(lambda p: p.requires_grad, spatial_classifier.parameters()),
                               lr=cfg.train.lr,
                               weight_decay=cfg.feature_extractor.weight_decay)

        spatialdir = os.path.join(rundir, 'spatial')
        os.makedirs(spatialdir, exist_ok=True)
        stopper = get_stopper(cfg)

        # we're using validation loss as our key metric
        scheduler = initialize_scheduler(
            optimizer,
            cfg,
            mode='min',
            reduction_factor=cfg.train.reduction_factor)

        log.info('key metric: {}'.format(metrics.key_metric))
        spatial_classifier = train(
            spatial_classifier,
            dataloaders,
            criterion,
            optimizer,
            metrics,
            scheduler,
            spatialdir,
            stopper,
            device,
            steps_per_epoch,
            final_activation=cfg.feature_extractor.final_activation,
            sequence=False,
            normalizer=normalizer,
            dali=dali)

        log.info('Training flow stream....')
        input_images = cfg.feature_extractor.n_flows + 1
        del dataloaders
        dataloaders = get_dataloaders_from_cfg(cfg,
                                               model_type='feature_extractor',
                                               input_images=input_images)
        normalizer = get_normalizer(cfg, input_images=input_images)
        log.info('Num training batches {}, num val: {}'.format(
            len(dataloaders['train']), len(dataloaders['val'])))
        flowdir = os.path.join(rundir, 'flow')
        os.makedirs(flowdir, exist_ok=True)

        flow_generator_and_classifier = FlowOnlyClassifier(
            flow_generator, flow_classifier).to(device)
        # only the flow classifier's parameters are handed to the optimizer in this stage
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, flow_classifier.parameters()),
                               lr=cfg.train.lr,
                               weight_decay=cfg.feature_extractor.weight_decay)

        stopper = get_stopper(cfg)
        # we're using validation loss as our key metric
        scheduler = initialize_scheduler(
            optimizer,
            cfg,
            mode='min',
            reduction_factor=cfg.train.reduction_factor)
        flow_generator_and_classifier = train(
            flow_generator_and_classifier,
            dataloaders,
            criterion,
            optimizer,
            metrics,
            scheduler,
            flowdir,
            stopper,
            device,
            steps_per_epoch,
            final_activation=cfg.feature_extractor.final_activation,
            sequence=False,
            normalizer=normalizer,
            dali=dali)
        flow_classifier = flow_generator_and_classifier.flow_classifier
        # overwrite checkpoint
        utils.checkpoint(flow_classifier, flowdir, stopper.epoch_counter)

    model = HiddenTwoStream(flow_generator,
                            spatial_classifier,
                            flow_classifier,
                            cfg.feature_extractor.arch,
                            fusion_style=cfg.feature_extractor.fusion,
                            num_classes=num_classes).to(device)
    # setting the mode to end-to-end would allow to backprop gradients into the flow generator itself
    # the paper does this, but I don't expect that users would have enough data for this to make sense
    model.set_mode('classifier')
    log.info('Training end to end...')
    input_images = cfg.feature_extractor.n_flows + 1
    dataloaders = get_dataloaders_from_cfg(cfg,
                                           model_type='feature_extractor',
                                           input_images=input_images)
    normalizer = get_normalizer(cfg, input_images=input_images)

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=cfg.train.lr,
                           weight_decay=cfg.feature_extractor.weight_decay)
    stopper = get_stopper(cfg)
    # we're using validation loss as our key metric
    scheduler = initialize_scheduler(
        optimizer,
        cfg,
        mode='min',
        reduction_factor=cfg.train.reduction_factor)
    log.info('Total trainable params: {:,}'.format(
        utils.get_num_parameters(model)))
    model = train(model,
                  dataloaders,
                  criterion,
                  optimizer,
                  metrics,
                  scheduler,
                  rundir,
                  stopper,
                  device,
                  steps_per_epoch,
                  final_activation=cfg.feature_extractor.final_activation,
                  sequence=False,
                  normalizer=normalizer,
                  dali=dali)
    utils.save_hidden_two_stream(model, rundir, dict(cfg), stopper.epoch_counter)
    return model
def sequence_inference(cfg: DictConfig):
    """Runs a trained sequence model over a list of directories.

    The weight file is resolved from `cfg`, the latent and output names are determined, the
    per-class thresholds are read from the run's metrics file, and everything is handed to
    `extract`.

    Parameters
    ----------
    cfg : DictConfig
        Configuration. `cfg.inference.directory_list` must be a non-empty list of directories,
        or the string 'all' to use every sub-directory of `cfg.project.data_path`

    Raises
    ------
    ValueError
        if no directories were passed, or if the number of thresholds stored with the trained
        model differs from the number of classes in the project
    AssertionError
        if no weight file is found, an entry of the directory list is not a directory, a
        record has no output file, or the metrics file does not exist
    """
    cfg = projects.setup_run(cfg)
    log.info('args: {}'.format(' '.join(sys.argv)))
    # turn "models" in your project configuration to "full/path/to/models"
    log.info('configuration used: ')
    log.info(OmegaConf.to_yaml(cfg))

    weights = projects.get_weightfile_from_cfg(cfg, model_type='sequence')
    assert weights is not None, 'Must either specify a weightfile or use reload.latest=True'

    run_files = utils.get_run_files_from_weights(weights)
    if cfg.sequence.latent_name is None:
        # find the latent name used in the weight file you loaded
        loaded_cfg = utils.load_yaml(run_files['config_file'])
        latent_name = loaded_cfg['sequence']['latent_name']
        # if this latent name is also None, use the arch of the feature extractor
        # this should never happen
        if latent_name is None:
            latent_name = loaded_cfg['feature_extractor']['arch']
    else:
        latent_name = cfg.sequence.latent_name

    if cfg.inference.use_loaded_model_cfg:
        # remember the requested output name: the merge below may overwrite it
        output_name = cfg.sequence.output_name
        loaded_config_file = run_files['config_file']
        loaded_model_cfg = OmegaConf.load(loaded_config_file).sequence
        current_model_cfg = cfg.sequence
        model_cfg = OmegaConf.merge(current_model_cfg, loaded_model_cfg)
        cfg.sequence = model_cfg
        # we don't want to use the weights that the trained model was initialized with, but the weights after training
        # therefore, overwrite the loaded configuration with the current weights
        cfg.sequence.weights = weights
        cfg.sequence.latent_name = latent_name
        cfg.sequence.output_name = output_name
    log.info('latent name used for running sequence inference: {}'.format(latent_name))

    # the output name will be a group in the output hdf5 dataset containing probabilities, etc
    if cfg.sequence.output_name is None:
        output_name = cfg.sequence.arch
    else:
        output_name = cfg.sequence.output_name

    directory_list = cfg.inference.directory_list
    if directory_list is None or len(directory_list) == 0:
        raise ValueError('must pass list of directories from commmand line. '
                         'Ex: directory_list=[path_to_dir1,path_to_dir2] or directory_list=all')
    elif isinstance(directory_list, str) and directory_list == 'all':
        basedir = cfg.project.data_path
        directory_list = utils.get_subfiles(basedir, 'directory')

    outputfiles = []
    for directory in directory_list:
        assert os.path.isdir(directory), 'Not a directory: {}'.format(directory)
        record = projects.get_record_from_subdir(directory)
        assert record['output'] is not None
        outputfiles.append(record['output'])

    # NOTE(review): this positional call does not match the `build_model_from_cfg` signatures
    # visible alongside this function; presumably the sequence-model builder is meant - confirm
    model = build_model_from_cfg(cfg, 1024, len(cfg.project.class_names))
    log.info('model: {}'.format(model))

    model = utils.load_weights(model, weights)

    metrics_file = run_files['metrics_file']
    assert os.path.isfile(metrics_file)
    best_epoch = utils.get_best_epoch_from_weightfile(weights)
    log.info('best epoch from loaded file: {}'.format(best_epoch))
    with h5py.File(metrics_file, 'r') as f:
        try:
            thresholds = f['val']['metrics_by_threshold']['optimum'][best_epoch, :]
        except KeyError:
            # backwards compatibility: older metrics files use a different layout
            thresholds = f['threshold_curves']['val']['optimum'][:]
            if thresholds.ndim > 1:
                thresholds = thresholds[best_epoch, :]
    log.info('thresholds: {}'.format(thresholds))

    class_names = list(cfg.project.class_names)
    if len(thresholds) != len(class_names):
        error_message = '''Number of classes in trained model: {}
            Number of classes in project: {}
            Did you add or remove behaviors after training this model? If so, please retrain!
        '''.format(len(thresholds), len(class_names))
        raise ValueError(error_message)

    # NOTE(review): the device string is always a CUDA device here; there is no CPU fallback
    device = 'cuda:{}'.format(cfg.compute.gpu_id)
    class_names = np.array(cfg.project.class_names)
    extract(model,
            outputfiles,
            thresholds,
            cfg.feature_extractor.final_activation,
            latent_name,
            output_name,
            cfg.sequence.sequence_length,
            True,
            device,
            cfg.inference.ignore_error,
            cfg.inference.overwrite,
            class_names=class_names)
def train_from_cfg(cfg: DictConfig) -> Type[nn.Module]:
    """Trains a flow generator from a configuration object.

    Args:
        cfg (DictConfig): configuration object. Reads, among others, `cfg.compute`,
            `cfg.flow_generator` and `cfg.train`

    Returns:
        the flow generator returned by `train`

    Raises:
        NotImplementedError: if `cfg.flow_generator.loss` is anything other than 'MotionNet'
        AssertionError: if `cfg.compute.fp16` is set but `torch_amp` is falsy
    """
    device = torch.device(
        "cuda:" + str(cfg.compute.gpu_id) if torch.cuda.is_available() else "cpu")
    # compare the device *type*: a torch.device never compares equal to the string 'cpu', so a
    # `device != 'cpu'` check would call set_device even on the CPU fallback
    if device.type != 'cpu':
        torch.cuda.set_device(device)
    log.info('Training flow generator....')

    dataloaders = get_dataloaders_from_cfg(
        cfg,
        model_type='flow_generator',
        input_images=cfg.flow_generator.n_rgb)

    log.info('Num training batches {}, num val: {}'.format(
        len(dataloaders['train']), len(dataloaders['val'])))

    flow_generator = build_model_from_cfg(cfg)
    flow_generator = flow_generator.to(device)

    log.info('Total trainable params: {:,}'.format(
        utils.get_num_parameters(flow_generator)))

    rundir = os.getcwd()  # this is configured by hydra
    # save model definition
    torch.save(
        flow_generator,
        os.path.join(rundir, cfg.flow_generator.arch + '_definition.pt'))

    utils.save_dict_to_yaml(dataloaders['split'], os.path.join(rundir, 'split.yaml'))

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, flow_generator.parameters()),
                           lr=cfg.train.lr)
    flow_weights = deepethogram.projects.get_weightfile_from_cfg(
        cfg, 'flow_generator')
    if flow_weights is not None:
        print('reloading weights...')
        flow_generator = utils.load_weights(flow_generator, flow_weights, device=device)

    stopper = get_stopper(cfg)
    scheduler = initialize_scheduler(optimizer, cfg, mode='min')

    if cfg.flow_generator.loss == 'MotionNet':
        criterion = MotionNetLoss(
            flow_sparsity=cfg.flow_generator.flow_sparsity,
            sparsity_weight=cfg.flow_generator.sparsity_weight,
            smooth_weight_multiplier=cfg.flow_generator.smooth_weight_multiplier)
    else:
        raise NotImplementedError

    metrics = get_metrics(cfg, rundir, utils.get_num_parameters(flow_generator))
    reconstructor = Reconstructor(cfg)
    steps_per_epoch = cfg.train.steps_per_epoch
    if cfg.compute.fp16:
        assert torch_amp, 'must install torch 1.6 or greater to use FP16 training'
    flow_generator = train(flow_generator,
                           dataloaders,
                           criterion,
                           optimizer,
                           metrics,
                           scheduler,
                           reconstructor,
                           rundir,
                           stopper,
                           device,
                           num_epochs=cfg.train.num_epochs,
                           steps_per_epoch=steps_per_epoch['train'],
                           steps_per_validation_epoch=steps_per_epoch['val'],
                           steps_per_test_epoch=steps_per_epoch['test'],
                           max_flow=cfg.flow_generator.max,
                           dali=cfg.compute.dali,
                           fp16=cfg.compute.fp16)
    return flow_generator
def feature_extractor_train(cfg: DictConfig) -> nn.Module:
    """Trains feature extractor models from a configuration.

    With `cfg.feature_extractor.curriculum` set, the spatial classifier is fit first, then the
    flow classifier (wrapped with the flow generator in a `FlowOnlyClassifier`), and finally the
    combined `HiddenTwoStream` model. Without it only the combined model is fit.

    Parameters
    ----------
    cfg : DictConfig
        Configuration, e.g. that returned by deepethogram.configration.make_feature_extractor_train_cfg

    Returns
    -------
    nn.Module
        Trained feature extractor

    Raises
    ------
    AssertionError
        if no weight file for the flow generator can be found
    """

    def _release_memory():
        # free RAM. note: this doesn't do much
        torch.cuda.empty_cache()
        gc.collect()

    cfg = projects.setup_run(cfg)

    log.info('args: {}'.format(' '.join(sys.argv)))
    # change the project paths from relative to absolute
    # allow for editing
    OmegaConf.set_struct(cfg, False)
    # SHOULD NEVER MODIFY / MAKE ASSIGNMENTS TO THE CFG OBJECT AFTER RIGHT HERE!
    log.info('configuration used ~~~~~')
    log.info(OmegaConf.to_yaml(cfg))

    # we build flow generator independently because you might want to load it from a different location
    flow_generator = build_flow_generator(cfg)
    flow_weights = projects.get_weightfile_from_cfg(cfg, 'flow_generator')
    assert flow_weights is not None, (
        'Must have a valid weightfile for flow generator. Use '
        'deepethogram.flow_generator.train or cfg.reload.latest')
    log.info('loading flow generator from file {}'.format(flow_weights))
    flow_generator = utils.load_weights(flow_generator, flow_weights)

    two_stream_inputs = cfg.feature_extractor.n_flows + 1
    _, data_info = get_datasets_from_cfg(cfg,
                                         model_type='feature_extractor',
                                         input_images=two_stream_inputs)
    # the flow generator returned by the builder is discarded in favor of the one loaded above
    _, spatial_classifier, flow_classifier, fusion, model = build_model_from_cfg(
        cfg, pos=data_info['pos'], neg=data_info['neg'])

    num_classes = len(cfg.project.class_names)

    utils.save_dict_to_yaml(data_info['split'], os.path.join(cfg.run.dir, 'split.yaml'))

    metrics = get_metrics(cfg.run.dir,
                          num_classes=num_classes,
                          num_parameters=utils.get_num_parameters(spatial_classifier),
                          key_metric='f1_class_mean_nobg',
                          num_workers=cfg.compute.metrics_workers)

    # cfg.compute.batch_size will be changed by the automatic batch size finder, possibly. store here so that
    # with each step of the curriculum, we can auto-tune it
    initial_batch_size = cfg.compute.batch_size
    initial_lr = cfg.train.lr

    # training in a curriculum goes as follows:
    # first, we train the spatial classifier, which takes still images as input
    # second, we train the flow classifier, which generates optic flow with the flow_generator model and then classifies
    # it. Thirdly, we will train the whole thing end to end
    # Without the curriculum we just train end to end from the start
    if cfg.feature_extractor.curriculum:
        # stage 1: spatial classifier
        datasets, data_info = get_datasets_from_cfg(cfg,
                                                    model_type='feature_extractor',
                                                    input_images=cfg.feature_extractor.n_rgb)
        stopper = get_stopper(cfg)
        criterion = get_criterion(cfg, spatial_classifier, data_info)

        module = HiddenTwoStreamLightning(spatial_classifier, cfg, datasets, metrics, criterion)
        trainer = get_trainer_from_cfg(cfg, module, stopper)
        trainer.fit(module)

        log.info('free ram')
        del datasets, module, trainer, stopper, data_info
        _release_memory()

        # stage 2: flow classifier
        datasets, data_info = get_datasets_from_cfg(cfg,
                                                    model_type='feature_extractor',
                                                    input_images=cfg.feature_extractor.n_flows + 1)
        # re-initialize stopper so that it doesn't think we need to stop due to the previous model
        stopper = get_stopper(cfg)
        cfg.compute.batch_size = initial_batch_size
        cfg.train.lr = initial_lr

        # this class will freeze the flow generator
        flow_generator_and_classifier = FlowOnlyClassifier(flow_generator, flow_classifier)
        criterion = get_criterion(cfg, flow_generator_and_classifier, data_info)
        module = HiddenTwoStreamLightning(flow_generator_and_classifier, cfg, datasets, metrics,
                                          criterion)
        trainer = get_trainer_from_cfg(cfg, module, stopper)
        trainer.fit(module)

        del datasets, module, trainer, stopper, data_info
        _release_memory()

    _release_memory()

    # final stage: the combined model, rebuilt from the current components
    model = HiddenTwoStream(flow_generator, spatial_classifier, flow_classifier, fusion,
                            cfg.feature_extractor.arch)
    model.set_mode('classifier')
    datasets, data_info = get_datasets_from_cfg(cfg,
                                                model_type='feature_extractor',
                                                input_images=cfg.feature_extractor.n_flows + 1)
    criterion = get_criterion(cfg, model, data_info)
    stopper = get_stopper(cfg)

    cfg.compute.batch_size = initial_batch_size
    cfg.train.lr = initial_lr

    module = HiddenTwoStreamLightning(model, cfg, datasets, metrics, criterion)
    trainer = get_trainer_from_cfg(cfg, module, stopper)
    trainer.fit(module)
    return model
def build_model_from_cfg(cfg: DictConfig,
                         pos: np.ndarray = None,
                         neg: np.ndarray = None,
                         num_classes: int = None) -> tuple:
    """ Builds the feature extractor and its components from a configuration object.

    All weights are loaded with device 'cpu'. When a feature extractor weight file is found for
    `cfg`, the 'spatial', 'flow' and 'fusion' components are restored from it.

    Parameters
    ----------
    cfg: DictConfig
        configuration, e.g. from Hydra command line
    pos: np.ndarray
        Number of positive examples in dataset. Used for initializing biases in final layer
    neg: np.ndarray
        Number of negative examples in dataset. Used for initializing biases in final layer
    num_classes: int
        Number of output classes. If None, the length of `cfg.project.class_names` is used

    Returns
    -------
    flow_generator, spatial_classifier, flow_classifier, fusion, model: tuple
        the individual components followed by the `HiddenTwoStream` model assembled from them,
        with its mode set to 'classifier'

    Raises
    ------
    AssertionError
        if the architecture is 'resnet3d_34' and no feature extractor weights were found, or if
        no flow generator weights were found
    """
    device = 'cpu'
    weight_file = projects.get_weightfile_from_cfg(cfg, 'feature_extractor')
    if num_classes is None:
        num_classes = len(cfg.project.class_names)

    log.info(
        'feature extractor weightfile: {}'.format(weight_file))

    # 3d architectures get a fixed channel count; 2d ones stack the frames along channels
    if '3d' not in cfg.feature_extractor.arch:
        rgb_channels = cfg.feature_extractor.n_rgb * 3
    else:
        rgb_channels = 3

    # `reload_imagenet` is only requested when there is no checkpoint to restore from
    use_imagenet = weight_file is None
    if cfg.feature_extractor.arch == 'resnet3d_34':
        assert weight_file is not None, 'Must specify path to resnet3d weights!'

    def _restore(module, component):
        # load this specific component from the weight file, if there is one
        if weight_file is None:
            return module
        return utils.load_feature_extractor_components(module,
                                                       weight_file,
                                                       component,
                                                       device=device)

    def _build_stream(channels, component):
        net = get_cnn(cfg.feature_extractor.arch,
                      in_channels=channels,
                      dropout_p=cfg.feature_extractor.dropout_p,
                      num_classes=num_classes,
                      reload_imagenet=use_imagenet,
                      pos=pos,
                      neg=neg,
                      final_bn=cfg.feature_extractor.final_bn)
        return _restore(net, component)

    spatial_classifier = _build_stream(rgb_channels, 'spatial')

    if '3d' not in cfg.feature_extractor.arch:
        flow_channels = cfg.feature_extractor.n_flows * 2
    else:
        flow_channels = 2
    flow_classifier = _build_stream(flow_channels, 'flow')

    flow_generator = build_flow_generator(cfg)
    flow_weights = projects.get_weightfile_from_cfg(cfg, 'flow_generator')
    assert flow_weights is not None, (
        'Must have a valid weightfile for flow generator. Use '
        'deepethogram.flow_generator.train or cfg.reload.latest')
    flow_generator = utils.load_weights(flow_generator, flow_weights, device=device)

    # build_fusion_layer hands back both classifiers along with the fusion module
    spatial_classifier, flow_classifier, fusion = build_fusion_layer(
        spatial_classifier, flow_classifier, cfg.feature_extractor.fusion, num_classes)
    fusion = _restore(fusion, 'fusion')

    model = HiddenTwoStream(flow_generator, spatial_classifier, flow_classifier, fusion,
                            cfg.feature_extractor.arch)
    model.set_mode('classifier')

    return flow_generator, spatial_classifier, flow_classifier, fusion, model