def test_aggregator_string(self):
    """ Tests whether the collector is aggregating and producing the right string. """
    stat_col = StatisticsCollector()
    stat_agg = StatisticsAggregator()

    # Add default statistics with formatting.
    stat_col.add_statistics('loss', '{:12.10f}')
    stat_col.add_statistics('episode', '{:06d}')
    stat_col.add_statistics('batch_size', None)

    # Create some random values.
    loss_values = random.sample(range(100), 100)
    # "Collect" basic statistics.
    for episode, loss in enumerate(loss_values):
        stat_col['episode'] = episode
        stat_col['loss'] = loss
        stat_col['batch_size'] = 1
        # print(stat_col.export_statistics_to_string())

    # Empty before aggregation.
    self.assertEqual(stat_agg.export_to_string(), " ")

    # Number of aggregated episodes.
    stat_agg.add_aggregator('acc_mean', '{:2.5f}')
    collected_loss_values = stat_col['loss']
    batch_sizes = stat_col['batch_size']
    stat_agg['acc_mean'] = np.mean(collected_loss_values) / np.sum(batch_sizes)

    # Aggregated result.
    self.assertEqual(stat_agg.export_to_string('[Epoch 1]'),
                     "acc_mean 0.49500 [Epoch 1]")


#if __name__ == "__main__":
#    unittest.main()
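# The expected strings in the tests above follow directly from Python's format
# specifications. The sanity check below is plain Python, independent of
# StatisticsCollector/StatisticsAggregator, and reproduces the same renderings.
import random

values = random.sample(range(100), 100)        # a permutation of 0..99
acc_mean = sum(values) / len(values) / 100     # 49.5 / 100 = 0.495
assert '{:2.5f}'.format(acc_mean) == '0.49500'      # aggregator formatting
assert '{:06d}'.format(1) == '000001'               # episode formatting
assert '{:12.10f}'.format(0.7) == '0.7000000000'    # loss formatting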
class Processor(Worker):
    """
    Defines the basic ``Processor``.

    If defining another type of Processor, it should subclass it.
    """

    def __init__(self):
        """
        Calls the ``Worker`` constructor, adds some additional arguments to parser.
        """
        # Call base constructor to set up app state, registry and add default params.
        super(Processor, self).__init__("Processor", Processor)

        self.parser.add_argument(
            '--section',
            dest='section_name',
            type=str,
            default="test",
            help='Name of the section defining the specific set to be processed (DEFAULT: test)')

    def setup_global_experiment(self):
        """
        Sets up the global test experiment for the ``Processor``:

            - Checks that the model to use exists,
            - Checks that the configuration file exists,
            - Creates the configuration.

        The rest of the experiment setup is done in :py:func:`setup_individual_experiment()` \
        to allow for multiple-test support.
        """
        # Call base method to parse all command line arguments and add default sections.
        super(Processor, self).setup_experiment()

        # "Pass" configuration parameters from the default_test section to the section indicated by section_name.
        self.config.add_default_params({
            self.app_state.args.section_name: self.config['default_test'].to_dict()})
        self.config.del_default_params('default_test')

        # Retrieve checkpoint file.
        chkpt_file = self.app_state.args.load_checkpoint

        # Check the presence of CUDA-compatible devices.
        if self.app_state.args.use_gpu and (torch.cuda.device_count() == 0):
            self.logger.error("Cannot use GPU as there are no CUDA-compatible devices present in the system!")
            exit(-1)

        # Config that will be used.
        abs_root_configs = None

        # Check if checkpoint file was indicated.
        if chkpt_file != "":
            #print('Please pass path to and name of the file containing pipeline to be loaded as --load parameter')
            #exit(-2)

            # Check if file with model exists.
            if not path.isfile(chkpt_file):
                print('Checkpoint file {} does not exist'.format(chkpt_file))
                exit(-3)

            # Extract path.
            self.abs_path, _ = path.split(path.dirname(path.expanduser(chkpt_file)))

            # Use the "default" config.
            abs_root_configs = [path.join(self.abs_path, 'training_configuration.yml')]

        # Check if config file was indicated by the user.
        if self.app_state.args.config != '':
            # Split and make them absolute.
            root_configs = self.app_state.args.config.replace(" ", "").split(',')
            # If there are - expand them to absolute paths.
            abs_root_configs = [path.expanduser(config) for config in root_configs]

            # Using name of the first configuration file from command line.
            basename = path.basename(root_configs[0])
            # Take config filename without extension.
            pipeline_name = path.splitext(basename)[0]

            # Use path to experiments + pipeline.
            self.abs_path = path.join(path.expanduser(self.app_state.args.expdir), pipeline_name)

        if abs_root_configs is None:
            print('Please indicate configuration file to be used (--config) '
                  'and/or pass path to and name of the file containing pipeline to be loaded (--load)')
            exit(-2)

        # Get the list of configurations which need to be loaded.
        configs_to_load = config_parsing.recurrent_config_parse(
            abs_root_configs, [], self.app_state.absolute_config_path)

        # Read the YAML files one by one - but in reverse order -> overwrite the first indicated config(s).
        config_parsing.reverse_order_config_load(self.config, configs_to_load)

        # -> At this point, the Config Registry contains the configuration loaded (and overwritten) from several files.
    def setup_individual_experiment(self):
        """
        Sets up an individual test experiment in the case of multiple tests, or the main experiment in the case of \
        one test experiment.

            - Sets up the log directory path,
            - Sets random seeds,
            - Creates the pipeline consisting of many components,
            - Creates the testing task manager,
            - Performs testing of compatibility of the testing pipeline.
        """
        # Get test section.
        try:
            self.tsn = self.app_state.args.section_name
            self.config_test = self.config[self.tsn]
            if self.config_test is None:
                raise KeyError()
        except KeyError:
            print("Error: Couldn't retrieve the section '{}' from the loaded configuration".format(self.tsn))
            exit(-1)

        # Get testing task type.
        try:
            _ = self.config_test['task']['type']
        except KeyError:
            print("Error: Couldn't retrieve the task 'type' from the '{}' section in the loaded configuration".format(self.tsn))
            exit(-5)

        # Get pipeline section.
        try:
            psn = self.app_state.args.pipeline_section_name
            self.config_pipeline = self.config[psn]
            if self.config_pipeline is None:
                raise KeyError()
        except KeyError:
            print("Error: Couldn't retrieve the pipeline section '{}' from the loaded configuration".format(psn))
            exit(-1)

        # Get pipeline name.
        try:
            pipeline_name = self.config_pipeline['name']
        except KeyError:
            print("Error: Couldn't retrieve the pipeline 'name' from the loaded configuration")
            exit(-6)

        # Prepare output paths for logging.
        while True:
            # Dirty fix: if log_dir already exists, wait for 1 second and try again.
            try:
                time_str = self.tsn + '_{0:%Y%m%d_%H%M%S}'.format(datetime.now())
                if self.app_state.args.exptag != '':
                    time_str = time_str + "_" + self.app_state.args.exptag
                self.app_state.log_dir = self.abs_path + '/' + time_str + '/'
                # Lowercase dir.
                self.app_state.log_dir = self.app_state.log_dir.lower()
                makedirs(self.app_state.log_dir, exist_ok=False)
            except FileExistsError:
                sleep(1)
            else:
                break

        # Set log dir.
        self.app_state.log_file = self.app_state.log_dir + 'processor.log'
        # Initialize logger in app state.
        self.app_state.logger = logging.initialize_logger("AppState")
        # Add handlers for the logfile to worker logger.
        logging.add_file_handler_to_logger(self.logger)
        self.logger.info("Logger directory set to: {}".format(self.app_state.log_dir))

        # Set cpu/gpu types.
        self.app_state.set_types()

        # Set random seeds in the testing section.
        self.set_random_seeds(self.tsn, self.config_test)

        # Total number of detected errors.
        errors = 0

        ################# TESTING PROBLEM #################

        # Build the used task manager.
        self.pm = TaskManager(self.tsn, self.config_test)
        errors += self.pm.build()

        # Check if the maximum number of episodes is specified; if not, set a
        # default equal to the size of the dataset (divided by the batch size),
        # so that by default we loop over the test set once.
        task_size_in_episodes = len(self.pm)

        if self.config_test["terminal_conditions"]["episode_limit"] == -1:
            # Overwrite the config value!
            self.config_test['terminal_conditions'].add_config_params(
                {'episode_limit': task_size_in_episodes})

        # Warn if the indicated number of episodes is larger than an epoch size:
        if self.config_test["terminal_conditions"]["episode_limit"] > task_size_in_episodes:
            self.logger.warning('Indicated limit of the number of episodes is larger than one epoch, reducing it.')
            # Overwrite the config value!
            self.config_test['terminal_conditions'].add_config_params(
                {'episode_limit': task_size_in_episodes})

        self.logger.info("Limiting the number of episodes to: {}".format(
            self.config_test["terminal_conditions"]["episode_limit"]))

        ###################### PIPELINE ######################

        # Build the pipeline using the loaded configuration and global variables.
        self.pipeline = PipelineManager(pipeline_name, self.config_pipeline)
        errors += self.pipeline.build()

        # Show pipeline.
        summary_str = self.pipeline.summarize_all_components_header()
        summary_str += self.pm.task.summarize_io(self.tsn)
        summary_str += self.pipeline.summarize_all_components()
        self.logger.info(summary_str)

        # Check errors.
        if errors > 0:
            self.logger.error('Found {} errors, terminating execution'.format(errors))
            exit(-7)

        # Handshake definitions.
        self.logger.info("Handshaking testing pipeline")
        defs_testing = self.pm.task.output_data_definitions()
        errors += self.pipeline.handshake(defs_testing)

        # Check errors.
        if errors > 0:
            self.logger.error('Found {} errors, terminating execution'.format(errors))
            exit(-2)

        # Check if there are any models in the pipeline.
        if len(self.pipeline.models) == 0:
            self.logger.error('Cannot proceed with processing, as there are no trainable models in the pipeline')
            exit(-3)

        # Load the pretrained models params from checkpoint.
        try:
            # Check command line arguments, then check the load option in config.
            if self.app_state.args.load_checkpoint != "":
                pipeline_name = self.app_state.args.load_checkpoint
                msg = "command line (--load)"
            elif "load" in self.config_pipeline:
                pipeline_name = self.config_pipeline['load']
                msg = "'pipeline' section of the configuration file"
            else:
                pipeline_name = ""

            # Try to load the whole pipeline.
            if pipeline_name != "":
                if path.isfile(pipeline_name):
                    # Load parameters from checkpoint.
                    self.pipeline.load(pipeline_name)
                else:
                    raise Exception("Couldn't load the checkpoint {} indicated in the {}: file does not exist".format(pipeline_name, msg))
                # If we succeeded, we do not want to load the models from the file anymore!
            else:
                # Try to load the models parameters - one by one, if set so in the configuration file.
                self.pipeline.load_models()

        except KeyError:
            self.logger.error("File {} indicated in the {} seems not to be a valid model checkpoint".format(pipeline_name, msg))
            exit(-5)

        except Exception as e:
            self.logger.error(e)
            # Exit by following the logic: if the user wanted to load the model but failed, then continuing the experiment makes no sense.
            exit(-6)

        # Log the model summaries.
        summary_str = self.pipeline.summarize_models_header()
        summary_str += self.pipeline.summarize_models()
        self.logger.info(summary_str)

        # Move the models in the pipeline to GPU.
        if self.app_state.args.use_gpu:
            self.pipeline.cuda()

        # Turn on evaluation mode.
        self.pipeline.eval()

        # Export and log configuration, optionally asking the user for confirmation.
        config_parsing.display_parsing_results(self.logger, self.app_state.args, self.unparsed)
        config_parsing.display_globals(self.logger, self.app_state.globalitems())
        config_parsing.export_experiment_configuration_to_yml(
            self.logger, self.app_state.log_dir, "training_configuration.yml",
            self.config, self.app_state.args.confirm)

    def initialize_statistics_collection(self):
        """
        Function initializes all statistics collectors and aggregators used by a given worker,
        creates output files etc.
        """
        # Create statistics collector.
        self.stat_col = StatisticsCollector()
        self.add_statistics(self.stat_col)
        self.pm.task.add_statistics(self.stat_col)
        self.pipeline.add_statistics(self.stat_col)
        # Create the csv file to store the statistics.
        self.pm_batch_stats_file = self.stat_col.initialize_csv_file(
            self.app_state.log_dir, self.tsn + '_statistics.csv')

        # Create statistics aggregator.
        self.stat_agg = StatisticsAggregator()
        self.add_aggregators(self.stat_agg)
        self.pm.task.add_aggregators(self.stat_agg)
        self.pipeline.add_aggregators(self.stat_agg)
        # Create the csv file to store the statistic aggregations.
        # Will contain a single row with aggregated statistics.
        self.pm_set_stats_file = self.stat_agg.initialize_csv_file(
            self.app_state.log_dir, self.tsn + '_set_agg_statistics.csv')

    def finalize_statistics_collection(self):
        """
        Finalizes statistics collection, closes all files etc.
        """
        # Close all files.
        self.pm_batch_stats_file.close()
        self.pm_set_stats_file.close()

    def run_experiment(self):
        """
        Main function of the ``Processor``: tests the loaded model over the set.

        Iterates over the ``DataLoader`` for a maximum number of episodes equal to the set size.

        The function does the following for each episode:

            - Forward pass of the model,
            - Logs statistics & accumulates loss,
            - Activates visualization if set.
        """
        # Initialize tensorboard and statistics collection.
        self.initialize_statistics_collection()

        num_samples = len(self.pm)
        self.logger.info('Processing the entire set ({} samples in {} episodes)'.format(
            num_samples, len(self.pm.dataloader)))

        try:
            # Run in no_grad mode.
            with torch.no_grad():
                # Reset the counter.
                self.app_state.episode = -1

                # Inform the task manager that the epoch has started.
                self.pm.initialize_epoch()

                for batch in self.pm.dataloader:
                    # Increment counter (once per batch).
                    self.app_state.episode += 1

                    # Terminal condition 0: max test episodes reached.
                    if self.app_state.episode == self.config_test["terminal_conditions"]["episode_limit"]:
                        break

                    # Forward pass.
                    self.pipeline.forward(batch)
                    # Collect the statistics.
                    self.collect_all_statistics(self.pm, self.pipeline, batch, self.stat_col)

                    # Export to csv - at every step.
                    self.stat_col.export_to_csv()

                    # Log to logger - at logging frequency.
                    if self.app_state.episode % self.app_state.args.logging_interval == 0:
                        self.logger.info(self.stat_col.export_to_string('[Partial]'))
                # End for.

                # Inform the task manager that the epoch has ended.
                self.pm.finalize_epoch()

                self.logger.info('\n' + '=' * 80)
                self.logger.info('Processing finished')

                # Aggregate statistics for the whole set.
                self.aggregate_all_statistics(self.pm, self.pipeline, self.stat_col, self.stat_agg)

                # Export aggregated statistics.
                self.export_all_statistics(self.stat_agg, '[Full Set]')

        except SystemExit as e:
            # The experiment did not end properly.
            self.logger.error('Experiment interrupted because {}'.format(e))
        except KeyboardInterrupt:
            # The experiment did not end properly.
            self.logger.error('Experiment interrupted!')
        finally:
            # Finalize statistics collection.
            self.finalize_statistics_collection()
            self.logger.info("Experiment logged to: {}".format(self.app_state.log_dir))
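# For orientation: the per-episode loop in run_experiment() above reduces to a
# fixed collect -> export -> aggregate rhythm. The sketch below replays that
# rhythm standalone, using only methods exercised in this codebase; the import
# paths are an assumption and may differ between releases.
from ptp.utils.statistics_collector import StatisticsCollector
from ptp.utils.statistics_aggregator import StatisticsAggregator

stat_col = StatisticsCollector()
stat_col.add_statistics('episode', '{:06d}')
stat_col.add_statistics('loss', '{:12.10f}')

stat_agg = StatisticsAggregator()
stat_agg.add_aggregator('loss_mean', '{:2.5f}')

for episode, loss in enumerate([0.9, 0.7, 0.5]):  # stand-in for the dataloader loop
    stat_col['episode'] = episode
    stat_col['loss'] = loss

# Aggregate once for the whole set, as run_experiment() does after its loop.
stat_agg['loss_mean'] = sum(stat_col['loss']) / len(stat_col['loss'])
print(stat_agg.export_to_string('[Full Set]'))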
class Trainer(Worker):
    """
    Base class for the trainers.

    Iterates over epochs on the dataset.

    All other types of trainers (e.g. ``OnlineTrainer`` & ``OfflineTrainer``) should subclass it.
    """

    def __init__(self, name, class_type):
        """
        Base constructor for all trainers:

            - Adds default trainer command line arguments.

        :param name: Name of the worker
        :type name: str

        :param class_type: Class type of the component.
        """
        # Call base constructor to set up app state, registry and add default arguments.
        super(Trainer, self).__init__(name, class_type)

        # Add arguments to the specific parser.
        # These arguments will be shared by all basic trainers.
        self.parser.add_argument(
            '--tensorboard',
            action='store',
            dest='tensorboard',
            choices=[0, 1, 2],
            type=int,
            help="If present, enable logging to TensorBoard. Available log levels:\n"
                 "0: Log the collected statistics.\n"
                 "1: Add the histograms of the model's biases & weights (Warning: Slow).\n"
                 "2: Add the histograms of the model's biases & weights gradients (Warning: Even slower).")

        self.parser.add_argument(
            '--saveall',
            dest='save_intermediate',
            action='store_true',
            help='Setting to true results in saving intermediate models during training (DEFAULT: False)')

        self.parser.add_argument(
            '--training',
            dest='training_section_name',
            type=str,
            default="training",
            help='Name of the section defining the training procedure (DEFAULT: training)')

        self.parser.add_argument(
            '--validation',
            dest='validation_section_name',
            type=str,
            default="validation",
            help='Name of the section defining the validation procedure (DEFAULT: validation)')

    def setup_experiment(self):
        """
        Sets up the experiment of all trainers:

            - Calls base class setup_experiment to parse the command line arguments,
            - Loads the config file(s),
            - Sets up the log directory path,
            - Adds a ``FileHandler`` to the logger,
            - Sets random seeds,
            - Creates the pipeline consisting of many components,
            - Creates the training task manager,
            - Handles curriculum learning if indicated,
            - Creates the validation task manager,
            - Sets the optimizer,
            - Performs testing of compatibility of both training and validation tasks and the created pipeline.
        """
        # Call base method to parse all command line arguments and add default sections.
        super(Trainer, self).setup_experiment()

        # "Pass" configuration parameters from the "default_training" section to the training section indicated by the section name.
        self.config.add_default_params({
            self.app_state.args.training_section_name: self.config['default_training'].to_dict()})
        self.config.del_default_params('default_training')

        # "Pass" configuration parameters from the "default_validation" section to the validation section indicated by the section name.
        self.config.add_default_params({
            self.app_state.args.validation_section_name: self.config['default_validation'].to_dict()})
        self.config.del_default_params('default_validation')

        # Check the presence of CUDA-compatible devices.
        if self.app_state.args.use_gpu and (torch.cuda.device_count() == 0):
            self.logger.error("Cannot use GPU as there are no CUDA-compatible devices present in the system!")
            exit(-1)

        # Check if a config file was selected.
        if self.app_state.args.config == '':
            print('Please pass configuration file(s) as --c parameter')
            exit(-2)

        # Split and make them absolute.
        root_configs = self.app_state.args.config.replace(" ", "").split(',')
        # If there are - expand them to absolute paths.
        abs_root_configs = [path.expanduser(config) for config in root_configs]

        # Get the list of configurations which need to be loaded.
        configs_to_load = config_parse.recurrent_config_parse(
            abs_root_configs, [], self.app_state.absolute_config_path)

        # Read the YAML files one by one - but in reverse order -> overwrite the first indicated config(s).
        config_parse.reverse_order_config_load(self.config, configs_to_load)

        # -> At this point, the Config Registry contains the configuration loaded (and overwritten) from several files.

        # Log the resulting training configuration.
        conf_str = 'Loaded (initial) configuration:\n'
        conf_str += '=' * 80 + '\n'
        conf_str += yaml.safe_dump(self.config.to_dict(), default_flow_style=False)
        conf_str += '=' * 80 + '\n'
        print(conf_str)

        # Get training section.
        try:
            tsn = self.app_state.args.training_section_name
            self.config_training = self.config[tsn]
            # We must additionally check if it is None - weird behaviour when using the default value.
            if self.config_training is None:
                raise KeyError()
        except KeyError:
            print("Error: Couldn't retrieve the training section '{}' from the loaded configuration".format(tsn))
            exit(-1)

        # Get training task type.
        try:
            training_task_type = self.config_training['task']['type']
        except KeyError:
            print("Error: Couldn't retrieve the task 'type' from the training section '{}' in the loaded configuration".format(tsn))
            exit(-1)

        # Get validation section.
        try:
            vsn = self.app_state.args.validation_section_name
            self.config_validation = self.config[vsn]
            if self.config_validation is None:
                raise KeyError()
        except KeyError:
            print("Error: Couldn't retrieve the validation section '{}' from the loaded configuration".format(vsn))
            exit(-1)

        # Get validation task type.
        try:
            _ = self.config_validation['task']['type']
        except KeyError:
            print("Error: Couldn't retrieve the task 'type' from the validation section '{}' in the loaded configuration".format(vsn))
            exit(-1)

        # Get pipeline section.
        try:
            psn = self.app_state.args.pipeline_section_name
            self.config_pipeline = self.config[psn]
            if self.config_pipeline is None:
                raise KeyError()
        except KeyError:
            print("Error: Couldn't retrieve the pipeline section '{}' from the loaded configuration".format(psn))
            exit(-1)

        # Get pipeline name.
        try:
            pipeline_name = self.config_pipeline['name']
        except KeyError:
            # Using name of the first configuration file from command line.
            basename = path.basename(root_configs[0])
            # Take config filename without extension.
            pipeline_name = path.splitext(basename)[0]
            # Set pipeline name, so the processor can use it afterwards.
            self.config_pipeline.add_config_params({'name': pipeline_name})

        # Prepare the output path for logging.
        while True:
            # Dirty fix: if log_dir already exists, wait for 1 second and try again.
            try:
                time_str = '{0:%Y%m%d_%H%M%S}'.format(datetime.now())
                if self.app_state.args.exptag != '':
                    time_str = time_str + "_" + self.app_state.args.exptag
                self.app_state.log_dir = path.expanduser(self.app_state.args.expdir) + '/' + \
                    training_task_type + '/' + pipeline_name + '/' + time_str + '/'
                # Lowercase dir.
                self.app_state.log_dir = self.app_state.log_dir.lower()
                makedirs(self.app_state.log_dir, exist_ok=False)
            except FileExistsError:
                sleep(1)
            else:
                break

        # Set log dir.
        self.app_state.log_file = self.app_state.log_dir + 'trainer.log'
        # Initialize logger in app state.
        self.app_state.logger = logging.initialize_logger("AppState")
        # Add handlers for the logfile to worker logger.
        logging.add_file_handler_to_logger(self.logger)
        self.logger.info("Logger directory set to: {}".format(self.app_state.log_dir))

        # Set cpu/gpu types.
        self.app_state.set_types()

        # Models dir.
        self.checkpoint_dir = self.app_state.log_dir + 'checkpoints/'
        makedirs(self.checkpoint_dir, exist_ok=False)

        # Set random seeds in the training section.
        self.set_random_seeds('training', self.config_training)

        # Total number of detected errors.
        errors = 0

        ################# TRAINING PROBLEM #################

        # Build the training task manager.
        self.training = TaskManager('training', self.config_training)
        errors += self.training.build()

        # Parse the curriculum learning section in the loaded configuration.
        if 'curriculum_learning' in self.config_training:
            # Initialize curriculum learning - with values from the loaded configuration.
            self.training.task.curriculum_learning_initialize(
                self.config_training['curriculum_learning'])

            # If the 'must_finish' key is not present in config then it will be finished by default.
            self.config_training['curriculum_learning'].add_default_params({'must_finish': True})

            self.must_finish_curriculum = self.config_training['curriculum_learning']['must_finish']
            self.logger.info("Curriculum Learning activated")
        else:
            # If not using curriculum learning then it does not have to be finished.
            self.must_finish_curriculum = False
            self.curric_done = True

        ################# VALIDATION PROBLEM #################

        # Build the validation task manager.
        self.validation = TaskManager('validation', self.config_validation)
        errors += self.validation.build()

        ###################### PIPELINE ######################

        # Build the pipeline using the loaded configuration.
        self.pipeline = PipelineManager(pipeline_name, self.config_pipeline)
        errors += self.pipeline.build()

        # Check errors.
        if errors > 0:
            self.logger.error('Found {} errors, terminating execution'.format(errors))
            exit(-2)

        # Show pipeline.
        summary_str = self.pipeline.summarize_all_components_header()
        summary_str += self.training.task.summarize_io("training")
        summary_str += self.validation.task.summarize_io("validation")
        summary_str += self.pipeline.summarize_all_components()
        self.logger.info(summary_str)

        # Handshake definitions.
        self.logger.info("Handshaking training pipeline")
        defs_training = self.training.task.output_data_definitions()
        errors += self.pipeline.handshake(defs_training)

        self.logger.info("Handshaking validation pipeline")
        defs_valid = self.validation.task.output_data_definitions()
        errors += self.pipeline.handshake(defs_valid)

        # Check errors.
        if errors > 0:
            self.logger.error('Found {} errors, terminating execution'.format(errors))
            exit(-2)

        ################## MODEL LOAD/FREEZE #################

        # Load the pretrained models params from checkpoint.
        try:
            # Check command line arguments, then check the load option in config.
            if self.app_state.args.load_checkpoint != "":
                pipeline_name = self.app_state.args.load_checkpoint
                msg = "command line (--load)"
            elif "load" in self.config_pipeline:
                pipeline_name = self.config_pipeline['load']
                msg = "'pipeline' section of the configuration file"
            else:
                pipeline_name = ""

            # Try to load the model.
            if pipeline_name != "":
                if path.isfile(pipeline_name):
                    # Load parameters from checkpoint.
                    self.pipeline.load(pipeline_name)
                else:
                    raise Exception("Couldn't load the checkpoint {} indicated in the {}: file does not exist".format(pipeline_name, msg))
                # If we succeeded, we do not want to load the models from the file anymore!
            else:
                # Try to load the models parameters - one by one, if set so in the configuration file.
                self.pipeline.load_models()

        except KeyError:
            self.logger.error("File {} indicated in the {} seems not to be a valid model checkpoint".format(pipeline_name, msg))
            exit(-5)

        except Exception as e:
            self.logger.error(e)
            # Exit by following the logic: if the user wanted to load the model but failed, then continuing the experiment makes no sense.
            exit(-6)

        # Finally, freeze the models (that the user wants to freeze).
        self.pipeline.freeze_models()

        # Log the model summaries.
        summary_str = self.pipeline.summarize_models_header()
        summary_str += self.pipeline.summarize_models()
        self.logger.info(summary_str)

        # Move the models in the pipeline to GPU.
        if self.app_state.args.use_gpu:
            self.pipeline.cuda()

        ################# OPTIMIZER #################

        # Set the optimizer.
        optimizer_conf = dict(self.config_training['optimizer'])
        optimizer_type = optimizer_conf['type']
        del optimizer_conf['type']

        # Check if there are any models in the pipeline.
        if len(list(filter(lambda p: p.requires_grad, self.pipeline.parameters()))) == 0:
            self.logger.error('Cannot proceed with training, as there are no trainable models in the pipeline (or all models are frozen)')
            exit(-7)

        # Instantiate the optimizer and filter the model parameters based on if they require gradients.
        # (A standalone sketch of this by-name construction appears after this class.)
        self.optimizer = getattr(torch.optim, optimizer_type)(
            filter(lambda p: p.requires_grad, self.pipeline.parameters()),
            **optimizer_conf)

        log_str = 'Optimizer:\n' + '=' * 80 + "\n"
        log_str += "  Type: " + optimizer_type + "\n"
        log_str += "  Params: {}".format(optimizer_conf)

        self.logger.info(log_str)

    def add_statistics(self, stat_col):
        """
        Calls base method and adds epoch statistics to ``StatisticsCollector``.

        :param stat_col: ``StatisticsCollector``.
        """
        # Add loss and episode.
        super(Trainer, self).add_statistics(stat_col)

        # Add default statistics with formatting.
        stat_col.add_statistics('epoch', '{:02d}')

    def add_aggregators(self, stat_agg):
        """
        Adds basic aggregators to ``StatisticsAggregator`` and extends them with: epoch.

        :param stat_agg: ``StatisticsAggregator``.
        """
        # Add basic aggregators.
        super(Trainer, self).add_aggregators(stat_agg)

        # Add 'aggregators' for the epoch.
        stat_agg.add_aggregator('epoch', '{:02d}')

    def initialize_statistics_collection(self):
        """
        Initializes all ``StatisticsCollectors`` and ``StatisticsAggregators`` used by a given worker:

            - For training statistics (adds the statistics of the model & task),
            - For validation statistics (adds the statistics of the model & task).

        Creates the output (csv) files.
        """
        # TRAINING.
        # Create statistics collector for training.
        self.training_stat_col = StatisticsCollector()
        self.add_statistics(self.training_stat_col)
        self.training.task.add_statistics(self.training_stat_col)
        self.pipeline.add_statistics(self.training_stat_col)
        # Create the csv file to store the training statistics.
        self.training_batch_stats_file = self.training_stat_col.initialize_csv_file(
            self.app_state.log_dir, 'training_statistics.csv')

        # Create statistics aggregator for training.
        self.training_stat_agg = StatisticsAggregator()
        self.add_aggregators(self.training_stat_agg)
        self.training.task.add_aggregators(self.training_stat_agg)
        self.pipeline.add_aggregators(self.training_stat_agg)
        # Create the csv file to store the training statistic aggregations.
        self.training_set_stats_file = self.training_stat_agg.initialize_csv_file(
            self.app_state.log_dir, 'training_set_agg_statistics.csv')

        # VALIDATION.
        # Create statistics collector for validation.
        self.validation_stat_col = StatisticsCollector()
        self.add_statistics(self.validation_stat_col)
        self.validation.task.add_statistics(self.validation_stat_col)
        self.pipeline.add_statistics(self.validation_stat_col)
        # Create the csv file to store the validation statistics.
        self.validation_batch_stats_file = self.validation_stat_col.initialize_csv_file(
            self.app_state.log_dir, 'validation_statistics.csv')

        # Create statistics aggregator for validation.
        self.validation_stat_agg = StatisticsAggregator()
        self.add_aggregators(self.validation_stat_agg)
        self.validation.task.add_aggregators(self.validation_stat_agg)
        self.pipeline.add_aggregators(self.validation_stat_agg)
        # Create the csv file to store the validation statistic aggregations.
        self.validation_set_stats_file = self.validation_stat_agg.initialize_csv_file(
            self.app_state.log_dir, 'validation_set_agg_statistics.csv')

    def finalize_statistics_collection(self):
        """
        Finalizes the statistics collection by closing the csv files.
        """
        # Close all files.
        self.training_batch_stats_file.close()
        self.training_set_stats_file.close()
        self.validation_batch_stats_file.close()
        self.validation_set_stats_file.close()

    def initialize_tensorboard(self):
        """
        Initializes the TensorBoard writers and log directories.
        """
        # Create TensorBoard outputs - if TensorBoard is supposed to be used.
        if self.app_state.args.tensorboard is not None:
            from tensorboardX import SummaryWriter

            self.training_batch_writer = SummaryWriter(self.app_state.log_dir + '/training')
            self.training_stat_col.initialize_tensorboard(self.training_batch_writer)

            self.training_set_writer = SummaryWriter(self.app_state.log_dir + '/training_set_agg')
            self.training_stat_agg.initialize_tensorboard(self.training_set_writer)

            self.validation_batch_writer = SummaryWriter(self.app_state.log_dir + '/validation')
            self.validation_stat_col.initialize_tensorboard(self.validation_batch_writer)

            self.validation_set_writer = SummaryWriter(self.app_state.log_dir + '/validation_set_agg')
            self.validation_stat_agg.initialize_tensorboard(self.validation_set_writer)
        else:
            self.training_batch_writer = None
            self.training_set_writer = None
            self.validation_batch_writer = None
            self.validation_set_writer = None

    def finalize_tensorboard(self):
        """
        Finalizes the operation of the TensorBoard writers by closing them.
        """
        # Close the TensorBoard writers.
        if self.training_batch_writer is not None:
            self.training_batch_writer.close()
        if self.training_set_writer is not None:
            self.training_set_writer.close()
        if self.validation_batch_writer is not None:
            self.validation_batch_writer.close()
        if self.validation_set_writer is not None:
            self.validation_set_writer.close()

    def validate_on_batch(self, valid_batch):
        """
        Performs a validation of the model using the provided batch.

        Additionally logs results (to files, TensorBoard) and handles visualization.

        :param valid_batch: data batch generated by the task and used as input to the model.
        :type valid_batch: ``DataStreams``

        :return: Validation loss.
        """
        # Turn on evaluation mode.
        self.pipeline.eval()
        # Empty the statistics collector.
        self.validation_stat_col.empty()

        # Compute the validation loss using the provided data batch.
        with torch.no_grad():
            # Forward pass.
            self.pipeline.forward(valid_batch)
            # Collect the statistics.
            self.collect_all_statistics(self.validation, self.pipeline,
                                        valid_batch, self.validation_stat_col)

        # Export collected statistics.
        self.export_all_statistics(self.validation_stat_col, '[Partial Validation]')

    def validate_on_set(self):
        """
        Performs a validation of the model on the whole validation set, using the validation ``DataLoader``.

        Iterates over the entire validation set (through the ``DataLoader``), aggregates the collected statistics \
        and logs that to the console, csv and TensorBoard (if set).
        """
        # Get number of samples.
        num_samples = len(self.validation)
        self.logger.info('Validating over the entire validation set ({} samples in {} episodes)'.format(
            num_samples, len(self.validation.dataloader)))

        # Turn on evaluation mode.
        self.pipeline.eval()
        # Reset the statistics.
        self.validation_stat_col.empty()

        # Remember the global episode number.
        old_episode = self.app_state.episode

        with torch.no_grad():
            for ep, valid_batch in enumerate(self.validation.dataloader):
                self.app_state.episode = ep
                # Forward pass.
                self.pipeline.forward(valid_batch)
                # Collect the statistics.
                self.collect_all_statistics(self.validation, self.pipeline,
                                            valid_batch, self.validation_stat_col)

        # Revert to the global episode number.
        self.app_state.episode = old_episode

        # Aggregate statistics for the whole set.
        self.aggregate_all_statistics(self.validation, self.pipeline,
                                      self.validation_stat_col, self.validation_stat_agg)

        # Export aggregated statistics.
        self.export_all_statistics(self.validation_stat_agg, '[Full Validation]')
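# The optimizer in Trainer.setup_experiment() above is resolved from torch.optim
# purely by name, so any optimizer in that namespace can be selected from the
# config without code changes. A standalone illustration of the same pattern;
# the config dict here is hypothetical.
import torch

optimizer_conf = {'type': 'Adam', 'lr': 1e-4}   # hypothetical 'optimizer' section
optimizer_type = optimizer_conf.pop('type')

model = torch.nn.Linear(10, 2)  # stand-in for the pipeline's trainable models
optimizer = getattr(torch.optim, optimizer_type)(
    filter(lambda p: p.requires_grad, model.parameters()),
    **optimizer_conf)
print(type(optimizer).__name__)  # -> Adam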
        tb_writer = self.tb_writer
        # If it is still None - well, we cannot do anything more.
        if tb_writer is None:
            return

        # Iterate through keys and values and concatenate them.
        for key, value in self.aggregators.items():
            # Skip episode.
            if key == 'episode':
                continue
            tb_writer.add_scalar(key, value, episode)


if __name__ == "__main__":
    import random

    stat_col = StatisticsCollector()
    stat_agg = StatisticsAggregator()

    # Add default statistics with formatting.
    stat_col.add_statistics('loss', '{:12.10f}')
    stat_col.add_statistics('episode', '{:06d}')

    # Create some random values.
    loss_values = random.sample(range(100), 100)
    # "Collect" basic statistics.
    for episode, loss in enumerate(loss_values):
        stat_col['episode'] = episode
        stat_col['loss'] = loss
        # print(stat_col.export_statistics_to_string())
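# The writers used throughout the trainer come from tensorboardX; every
# aggregated value ends up in a SummaryWriter.add_scalar(tag, value, step) call
# like the one in the export method above. A minimal standalone sketch
# (the log directory name is arbitrary).
from tensorboardX import SummaryWriter

writer = SummaryWriter('/tmp/example_run/validation_set_agg')
aggregators = {'episode': 10, 'loss': 0.37, 'acc_mean': 0.91}  # example values
for key, value in aggregators.items():
    if key == 'episode':   # used as the step index, not plotted as a scalar
        continue
    writer.add_scalar(key, value, aggregators['episode'])
writer.close()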
class Tester(Worker):
    """
    Defines the basic ``Tester``.

    If defining another type of tester, it should subclass it.
    """

    def __init__(self, name="Tester"):
        """
        Calls the ``Worker`` constructor, adds some additional arguments to parser.

        :param name: Name of the worker (DEFAULT: "Tester").
        :type name: str
        """
        # Call base constructor to set up app state, registry and add default params.
        super(Tester, self).__init__(name)

    def setup_global_experiment(self):
        """
        Sets up the global test experiment for the ``Tester``:

            - Checks that the model to use exists on file:

                >>> if not os.path.isfile(flags.model)

            - Checks that the configuration file exists:

                >>> if not os.path.isfile(config_file)

            - Creates the configuration:

                >>> self.config.add_config_params_from_yaml(config)

        The rest of the experiment setup is done in :py:func:`setup_individual_experiment()` \
        to allow for multiple-test support.
        """
        # Call base method to parse all command line arguments and add default sections.
        super(Tester, self).setup_experiment()

        chkpt_file = self.app_state.args.load_checkpoint

        # Check if checkpoint file was indicated.
        if chkpt_file == "":
            print('Please pass path to and name of the file containing pipeline to be loaded as --load parameter')
            exit(-1)

        # Check if file with model exists.
        if not os.path.isfile(chkpt_file):
            print('Checkpoint file {} does not exist'.format(chkpt_file))
            exit(-2)

        # Extract path.
        abs_config_path, _ = os.path.split(os.path.dirname(os.path.expanduser(chkpt_file)))

        # Check if config file was indicated by the user.
        if self.app_state.args.config != '':
            root_config = self.app_state.args.config
        else:
            # Use the "default" one.
            root_config = os.path.join(abs_config_path, 'training_configuration.yaml')

        # Check if configuration file exists.
        if not os.path.isfile(root_config):
            print('Config file {} does not exist'.format(root_config))
            exit(-3)

        # Check the presence of CUDA-compatible devices.
        if self.app_state.args.use_gpu and (torch.cuda.device_count() == 0):
            self.logger.error("Cannot use GPU as there are no CUDA-compatible devices present in the system!")
            exit(-4)

        # Extract absolute path to main ptp 'config' directory.
        # Save it in app_state!
        self.app_state.absolute_config_path = abs_config_path[:abs_config_path.find("configs") + 8]
        # Get relative path.
        rel_config_path = abs_config_path[abs_config_path.find("configs") + 8:]

        print("TODO: different root config extraction path!!")
        print(self.app_state.absolute_config_path)
        exit(1)

        # Get the list of configurations which need to be loaded.
        configs_to_load = config_parse.recurrent_config_parse(
            rel_config_path, [], self.app_state.absolute_config_path)

        # Read the YAML files one by one - but in reverse order -> overwrite the first indicated config(s).
        config_parse.reverse_order_config_load(
            self.config, configs_to_load, self.app_state.absolute_config_path)

        # -> At this point, the Config Registry contains the configuration loaded (and overwritten) from several files.

    def setup_individual_experiment(self):
        """
        Sets up an individual test experiment in the case of multiple tests, or the main experiment in the case of \
        one test experiment.

            - Sets up the log directory path:

                >>> os.makedirs(self.log_dir, exist_ok=False)

            - Adds a ``FileHandler`` to the logger (defined in ``BaseWorker``):

                >>> self.logger.addHandler(fh)

            - Sets random seeds:

                >>> self.set_random_seeds('testing', self.config['testing'])

            - Creates the pipeline consisting of many components,
            - Creates the testing problem manager,
            - Performs testing of compatibility of the testing pipeline.
        """
        # Get testing problem type.
        try:
            _ = self.config['testing']['problem']['type']
        except KeyError:
            print("Error: Couldn't retrieve the problem 'type' from the 'testing' section in the loaded configuration")
            exit(-5)

        # Get pipeline name.
        try:
            pipeline_name = self.config['pipeline']['name']
        except KeyError:
            print("Error: Couldn't retrieve the pipeline 'name' from the loaded configuration")
            exit(-6)

        # Prepare output paths for logging.
        while True:
            # Dirty fix: if log_dir already exists, wait for 1 second and try again.
            try:
                time_str = 'test_{0:%Y%m%d_%H%M%S}'.format(datetime.now())
                if self.app_state.args.savetag != '':
                    time_str = time_str + "_" + self.app_state.args.savetag
                self.log_dir = self.abs_path + '/' + time_str + '/'
                # Lowercase dir.
                self.log_dir = self.log_dir.lower()
                os.makedirs(self.log_dir, exist_ok=False)
            except FileExistsError:
                sleep(1)
            else:
                break

        # Set log dir.
        self.app_state.log_file = self.log_dir + 'tester.log'
        # Initialize logger in app state.
        self.app_state.logger = logging.initialize_logger("AppState")
        # Add handlers for the logfile to worker logger.
        logging.add_file_handler_to_logger(self.logger)
        self.logger.info("Logger directory set to: {}".format(self.log_dir))

        # Set cpu/gpu types.
        self.app_state.set_types()

        # Set random seeds in the testing section.
        self.set_random_seeds('testing', self.config['testing'])

        # Total number of detected errors.
        errors = 0

        ################# TESTING PROBLEM #################

        # Build the testing problem manager.
        self.testing = ProblemManager('testing', self.config['testing'])
        errors += self.testing.build()

        # Check if the maximum number of episodes is specified; if not, set a
        # default equal to the size of the dataset (divided by the batch size),
        # so that by default we loop over the test set once.
        max_test_episodes = len(self.testing)

        self.config['testing']['problem'].add_default_params(
            {'max_test_episodes': max_test_episodes})
        if self.config["testing"]["problem"]["max_test_episodes"] == -1:
            # Overwrite the config value!
            self.config['testing']['problem'].add_config_params(
                {'max_test_episodes': max_test_episodes})

        # Warn if the indicated number of episodes is larger than an epoch size:
        if self.config["testing"]["problem"]["max_test_episodes"] > max_test_episodes:
            self.logger.warning('Indicated maximum number of episodes is larger than one epoch, reducing it.')
            self.config['testing']['problem'].add_config_params(
                {'max_test_episodes': max_test_episodes})

        self.logger.info("Setting the max number of episodes to: {}".format(
            self.config["testing"]["problem"]["max_test_episodes"]))

        ###################### PIPELINE ######################

        # Build the pipeline using the loaded configuration and global variables.
        self.pipeline = PipelineManager(pipeline_name, self.config['pipeline'])
        errors += self.pipeline.build()

        # Show pipeline.
        summary_str = self.pipeline.summarize_all_components_header()
        summary_str += self.testing.problem.summarize_io("testing")
        summary_str += self.pipeline.summarize_all_components()
        self.logger.info(summary_str)

        # Check errors.
        if errors > 0:
            self.logger.error('Found {} errors, terminating execution'.format(errors))
            exit(-7)

        # Handshake definitions.
        self.logger.info("Handshaking testing pipeline")
        defs_testing = self.testing.problem.output_data_definitions()
        errors += self.pipeline.handshake(defs_testing)

        # Check errors.
        if errors > 0:
            self.logger.error('Found {} errors, terminating execution'.format(errors))
            exit(-2)

        # Check if there are any models in the pipeline.
        if len(self.pipeline.models) == 0:
            self.logger.error('Cannot proceed with testing, as there are no trainable models in the pipeline')
            exit(-3)

        # Load the pretrained models params from checkpoint.
        try:
            # Check command line arguments, then check the load option in config.
            if self.app_state.args.load_checkpoint != "":
                pipeline_name = self.app_state.args.load_checkpoint
                msg = "command line (--load)"
            elif "load" in self.config['pipeline']:
                pipeline_name = self.config['pipeline']['load']
                msg = "'pipeline' section of the configuration file"
            else:
                pipeline_name = ""

            # Try to load the model.
            if pipeline_name != "":
                if os.path.isfile(pipeline_name):
                    # Load parameters from checkpoint.
                    self.pipeline.load(pipeline_name)
                else:
                    raise Exception("Couldn't load the checkpoint {} indicated in the {}: file does not exist".format(pipeline_name, msg))

            # Try to load the models parameters - one by one, if set so in the configuration file.
            self.pipeline.load_models()

        except KeyError:
            self.logger.error("File {} indicated in the {} seems not to be a valid model checkpoint".format(pipeline_name, msg))
            exit(-5)

        except Exception as e:
            self.logger.error(e)
            # Exit by following the logic: if the user wanted to load the model but failed, then continuing the experiment makes no sense.
            exit(-6)

        # Log the model summaries.
        summary_str = self.pipeline.summarize_models_header()
        summary_str += self.pipeline.summarize_models()
        self.logger.info(summary_str)

        # Move the models in the pipeline to GPU.
        if self.app_state.args.use_gpu:
            self.pipeline.cuda()

        # Turn on evaluation mode.
        self.pipeline.eval()

        # Export and log configuration, optionally asking the user for confirmation.
        self.export_experiment_configuration(self.log_dir, "testing_configuration.yaml",
                                             self.app_state.args.confirm)

    def initialize_statistics_collection(self):
        """
        Function initializes all statistics collectors and aggregators used by a given worker,
        creates output files etc.
        """
        # Create statistics collector for testing.
        self.testing_stat_col = StatisticsCollector()
        self.add_statistics(self.testing_stat_col)
        self.testing.problem.add_statistics(self.testing_stat_col)
        self.pipeline.add_statistics(self.testing_stat_col)
        # Create the csv file to store the testing statistics.
        self.testing_batch_stats_file = self.testing_stat_col.initialize_csv_file(
            self.log_dir, 'testing_statistics.csv')

        # Create statistics aggregator for testing.
        self.testing_stat_agg = StatisticsAggregator()
        self.add_aggregators(self.testing_stat_agg)
        self.testing.problem.add_aggregators(self.testing_stat_agg)
        self.pipeline.add_aggregators(self.testing_stat_agg)
        # Create the csv file to store the testing statistic aggregations.
        # Will contain a single row with aggregated statistics.
        self.testing_set_stats_file = self.testing_stat_agg.initialize_csv_file(
            self.log_dir, 'testing_set_agg_statistics.csv')

    def finalize_statistics_collection(self):
        """
        Finalizes statistics collection, closes all files etc.
        """
        # Close all files.
        self.testing_batch_stats_file.close()
        self.testing_set_stats_file.close()

    def run_experiment(self):
        """
        Main function of the ``Tester``: tests the loaded model over the test set.

        Iterates over the ``DataLoader`` for a maximum number of episodes equal to the test set size.

        The function does the following for each episode:

            - Forward pass of the model,
            - Logs statistics & accumulates loss,
            - Activates visualization if set.
        """
        # Initialize tensorboard and statistics collection.
        self.initialize_statistics_collection()

        num_samples = len(self.testing)
        self.logger.info('Testing over the entire test set ({} samples in {} episodes)'.format(
            num_samples, len(self.testing.dataloader)))

        try:
            # Run test.
            with torch.no_grad():
                episode = 0
                for test_dict in self.testing.dataloader:

                    # Terminal condition 0: max test episodes reached.
                    if episode == self.config["testing"]["problem"]["max_test_episodes"]:
                        break

                    # Forward pass.
                    self.pipeline.forward(test_dict)
                    # Collect the statistics.
                    self.collect_all_statistics(self.testing, self.pipeline,
                                                test_dict, self.testing_stat_col, episode)

                    # Export to csv - at every step.
                    self.testing_stat_col.export_to_csv()

                    # Log to logger - at logging frequency.
                    if episode % self.app_state.args.logging_interval == 0:
                        self.logger.info(self.testing_stat_col.export_to_string('[Partial Test]'))

                    # Move to the next episode.
                    episode += 1
                # End for.

                self.logger.info('\n' + '=' * 80)
                self.logger.info('Test finished')

                # Aggregate statistics for the whole set.
                self.aggregate_all_statistics(self.testing, self.pipeline,
                                              self.testing_stat_col, self.testing_stat_agg, episode)

                # Export aggregated statistics.
                self.export_all_statistics(self.testing_stat_agg, '[Full Test]')

        except SystemExit as e:
            # The experiment did not end properly.
            self.logger.error('Experiment interrupted because {}'.format(e))
        except KeyboardInterrupt:
            # The experiment did not end properly.
            self.logger.error('Experiment interrupted!')
        finally:
            # Finalize statistics collection.
            self.finalize_statistics_collection()
def test_collector_string(self):
    """ Tests whether the collector is collecting and producing the right string. """
    stat_col = StatisticsCollector()

    stat_col.add_statistics('loss', '{:12.10f}')
    stat_col.add_statistics('episode', '{:06d}')
    stat_col.add_statistics('acc', '{:2.3f}')
    stat_col.add_statistics('acc_help', None)

    # Episode 0.
    stat_col['episode'] = 0
    stat_col['loss'] = 0.7
    stat_col['acc'] = 100
    stat_col['acc_help'] = 121

    # Export.
    #csv_file = stat_col.initialize_csv_file('./', 'collector_test.csv')
    #stat_col.export_to_csv(csv_file)
    self.assertEqual(stat_col.export_to_string(),
                     "loss 0.7000000000; episode 000000; acc 100.000 ")

    # Episode 1.
    stat_col['episode'] = 1
    stat_col['loss'] = 0.7
    stat_col['acc'] = 99.3

    stat_col.add_statistics('seq_length', '{:2.0f}')
    stat_col['seq_length'] = 5

    # Export.
    #stat_col.export_to_csv(csv_file)
    self.assertEqual(stat_col.export_to_string('[Validation]'),
                     "loss 0.7000000000; episode 000001; acc 99.300; seq_length 5 [Validation]")

    # Empty.
    stat_col.empty()
    self.assertEqual(stat_col.export_to_string(),
                     "loss ; episode ; acc ; seq_length ")