def __init__(self, name, config):
    """
    Initializes the pipeline manager.

    :param name: Name of the pipeline.

    :param config: Parameters used to instantiate all required components.
    :type config: :py:class:`ptp.configuration.ConfigInterface`
    """
    self.name = name
    self.config = config
    self.app_state = AppState()

    # Initialize the logger.
    self.logger = logging.initialize_logger(name)

    # Set initial values of all pipeline elements.
    # Empty dictionary of all components, keyed by their priorities.
    self.__components = {}
    # Empty list of all models - it will contain only "references" to objects stored in the components dictionary.
    self.models = []
    # Empty list of all losses - it will contain only "references" to objects stored in the components dictionary.
    self.losses = []
    # Initialization of the best loss - as INF.
    self.best_loss = inf
    self.best_status = "Unknown"
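
# Usage sketch (illustrative only): constructing a PipelineManager from a
# minimal configuration. The import path and the "pipeline" section contents
# below are assumptions, not taken from a real experiment configuration;
# add_config_params and build() are used as shown elsewhere in this file.
#
#     from ptp.configuration import ConfigInterface
#
#     config = ConfigInterface()
#     config.add_config_params({"pipeline": {"name": "dummy_pipeline"}})
#     manager = PipelineManager("dummy_pipeline", config["pipeline"])
#     errors = manager.build()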
def setup_experiment(self):
    """
    Sets up a specific experiment.

    Base method:

        - Parses command line arguments.
        - Initializes the logger with the worker name.
        - Adds the three default config sections (training / validation / testing) and sets their dataloader params.

    .. note::

        Child classes should override this method, but still call its parent to draw the basic functionality \
        implemented here.
    """
    # Parse arguments.
    self.app_state.args, self.unparsed = self.parser.parse_known_args()

    # Initialize the logger using the configuration.
    # For now do not add a file handler, as the path to the logfile is not known yet.
    self.logger = logging.initialize_logger(self.name, False)

    # Add empty sections.
    self.config.add_default_params({"training": {'terminal_conditions': {}}})
    self.config.add_default_params({"validation": {}})
    self.config.add_default_params({"testing": {}})
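
# A minimal sketch (not the actual ConfigInterface implementation) of the
# semantics assumed for add_default_params above: defaults fill in missing
# keys recursively, without overwriting values already loaded from YAML.
# The helper name is hypothetical.
def _merge_defaults_sketch(defaults, loaded):
    """Recursively overlay `loaded` values on top of `defaults`."""
    result = dict(defaults)
    for key, value in loaded.items():
        if isinstance(value, dict) and isinstance(result.get(key), dict):
            result[key] = _merge_defaults_sketch(result[key], value)
        else:
            result[key] = value
    return result

# e.g. _merge_defaults_sketch({"training": {"terminal_conditions": {}}},
#                             {"training": {"task": {"type": "MNIST"}}})
# -> {"training": {"terminal_conditions": {}, "task": {"type": "MNIST"}}}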
def __init__(self, name, class_type, config):
    """
    Initializes the component. This constructor:

        - sets the access to ``AppState`` (for dtypes, settings, globals etc.)
        - stores the component name and type
        - stores reference to the passed configuration registry section
        - loads default component parameters
        - initializes the logger
        - initializes mapping facilities and facades

    :param name: Name of the component.

    :param class_type: Class type of the component.

    :param config: Dictionary of parameters (read from configuration ``.yaml`` file).
    :type config: :py:class:`ptp.configuration.ConfigInterface`
    """
    self.name = name
    self.config = config

    # Get access to AppState: for command line args, globals etc.
    self.app_state = AppState()

    # Initialize logger.
    self.logger = logging.initialize_logger(self.name)

    # Load default configuration.
    if class_type is not None:
        self.config.add_default_params(load_class_default_config_file(class_type))

    # Initialize the "streams mapping facility".
    if "streams" not in config or config["streams"] is None:
        self.__stream_keys = {}
    else:
        self.__stream_keys = config["streams"]
    self.stream_keys = KeyMappingsFacade(self.__stream_keys)

    # Initialize the "globals mapping facility".
    if "globals" not in config or config["globals"] is None:
        self.__global_keys = {}
    else:
        self.__global_keys = config["globals"]
    self.global_keys = KeyMappingsFacade(self.__global_keys)

    # Initialize the "statistics mapping facility".
    if "statistics" not in config or config["statistics"] is None:
        self.__statistics_keys = {}
    else:
        self.__statistics_keys = config["statistics"]
    self.statistics_keys = KeyMappingsFacade(self.__statistics_keys)

    # Facade for accessing global parameters (still stored in AppState).
    self.globals = GlobalsFacade(self.__global_keys)
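
# Illustration of the mapping facilities above (assumed semantics; the real
# KeyMappingsFacade may differ): the facade resolves a component's canonical
# key to the name used in a concrete pipeline, falling back to the key
# itself. A matching, purely illustrative component section of a .yaml file:
#
#     my_component:
#       streams:
#         inputs: images          # read the 'inputs' stream from 'images'
#       globals:
#         input_size: image_size  # take 'input_size' from global 'image_size'
class KeyMappingsFacadeSketch:
    """Minimal stand-in mirroring the assumed lookup behaviour."""
    def __init__(self, mappings):
        self.mappings = mappings or {}

    def __getitem__(self, key):
        # Return the remapped name if present, otherwise the key itself.
        return self.mappings.get(key, key)

# KeyMappingsFacadeSketch({"inputs": "images"})["inputs"]  -> "images"
# KeyMappingsFacadeSketch({"inputs": "images"})["targets"] -> "targets"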
def setup_individual_experiment(self):
    """
    Sets up an individual test experiment in the case of multiple tests, or the main experiment in the case of \
    one test experiment.

        - Sets up the log directory path
        - Sets random seeds
        - Creates the pipeline consisting of many components
        - Creates the testing task manager
        - Performs testing of compatibility of the testing pipeline
    """
    # Get test section.
    try:
        self.tsn = self.app_state.args.section_name
        self.config_test = self.config[self.tsn]
        if self.config_test is None:
            raise KeyError()
    except KeyError:
        print("Error: Couldn't retrieve the section '{}' from the loaded configuration".format(self.tsn))
        exit(-1)

    # Get testing task type.
    try:
        _ = self.config_test['task']['type']
    except KeyError:
        print("Error: Couldn't retrieve the task 'type' from the '{}' section in the loaded configuration".format(self.tsn))
        exit(-5)

    # Get pipeline section.
    try:
        psn = self.app_state.args.pipeline_section_name
        self.config_pipeline = self.config[psn]
        if self.config_pipeline is None:
            raise KeyError()
    except KeyError:
        print("Error: Couldn't retrieve the pipeline section '{}' from the loaded configuration".format(psn))
        exit(-1)

    # Get pipeline name.
    try:
        pipeline_name = self.config_pipeline['name']
    except KeyError:
        print("Error: Couldn't retrieve the pipeline 'name' from the loaded configuration")
        exit(-6)

    # Prepare output paths for logging.
    while True:
        # Dirty fix: if log_dir already exists, wait for 1 second and try again.
        try:
            time_str = self.tsn + '_{0:%Y%m%d_%H%M%S}'.format(datetime.now())
            if self.app_state.args.exptag != '':
                time_str = time_str + "_" + self.app_state.args.exptag
            self.app_state.log_dir = self.abs_path + '/' + time_str + '/'
            # Lowercase dir.
            self.app_state.log_dir = self.app_state.log_dir.lower()
            makedirs(self.app_state.log_dir, exist_ok=False)
        except FileExistsError:
            sleep(1)
        else:
            break

    # Set log file.
    self.app_state.log_file = self.app_state.log_dir + 'processor.log'
    # Initialize the logger in app state.
    self.app_state.logger = logging.initialize_logger("AppState")
    # Add handlers for the logfile to the worker logger.
    logging.add_file_handler_to_logger(self.logger)
    self.logger.info("Logger directory set to: {}".format(self.app_state.log_dir))

    # Set cpu/gpu types.
    self.app_state.set_types()

    # Set random seeds in the testing section.
    self.set_random_seeds(self.tsn, self.config_test)

    # Total number of detected errors.
    errors = 0

    ################# TESTING PROBLEM #################

    # Build the task manager.
    self.pm = TaskManager(self.tsn, self.config_test)
    errors += self.pm.build()

    # Check if the maximum number of episodes is specified; if not, set a
    # default equal to the size of the dataset (divided by the batch size),
    # so that by default we loop over the test set once.
    task_size_in_episodes = len(self.pm)

    if self.config_test["terminal_conditions"]["episode_limit"] == -1:
        # Overwrite the config value!
        self.config_test['terminal_conditions'].add_config_params({'episode_limit': task_size_in_episodes})

    # Warn if the indicated number of episodes is larger than an epoch size:
    if self.config_test["terminal_conditions"]["episode_limit"] > task_size_in_episodes:
        self.logger.warning('Indicated limit of the number of episodes is larger than one epoch, reducing it.')
        # Overwrite the config value!
        self.config_test['terminal_conditions'].add_config_params({'episode_limit': task_size_in_episodes})

    self.logger.info("Limiting the number of episodes to: {}".format(
        self.config_test["terminal_conditions"]["episode_limit"]))

    ###################### PIPELINE ######################

    # Build the pipeline using the loaded configuration and global variables.
    self.pipeline = PipelineManager(pipeline_name, self.config_pipeline)
    errors += self.pipeline.build()

    # Show pipeline.
    summary_str = self.pipeline.summarize_all_components_header()
    summary_str += self.pm.task.summarize_io(self.tsn)
    summary_str += self.pipeline.summarize_all_components()
    self.logger.info(summary_str)

    # Check errors.
    if errors > 0:
        self.logger.error('Found {} errors, terminating execution'.format(errors))
        exit(-7)

    # Handshake definitions.
    self.logger.info("Handshaking testing pipeline")
    defs_testing = self.pm.task.output_data_definitions()
    errors += self.pipeline.handshake(defs_testing)

    # Check errors.
    if errors > 0:
        self.logger.error('Found {} errors, terminating execution'.format(errors))
        exit(-2)

    # Check if there are any models in the pipeline.
    if len(self.pipeline.models) == 0:
        self.logger.error('Cannot proceed with testing, as there are no models in the pipeline')
        exit(-3)

    # Load the pretrained model parameters from a checkpoint.
    try:
        # Check command line arguments, then check the load option in the config.
        if self.app_state.args.load_checkpoint != "":
            pipeline_name = self.app_state.args.load_checkpoint
            msg = "command line (--load)"
        elif "load" in self.config_pipeline:
            pipeline_name = self.config_pipeline['load']
            msg = "'pipeline' section of the configuration file"
        else:
            pipeline_name = ""

        # Try to load the whole pipeline.
        if pipeline_name != "":
            if path.isfile(pipeline_name):
                # Load parameters from the checkpoint.
                self.pipeline.load(pipeline_name)
            else:
                raise Exception("Couldn't load the checkpoint {} indicated in the {}: file does not exist".format(pipeline_name, msg))
            # If we succeeded, we do not want to load the models from file anymore!
        else:
            # Try to load the model parameters - one by one, if set so in the configuration file.
            self.pipeline.load_models()

    except KeyError:
        self.logger.error("File {} indicated in the {} seems not to be a valid model checkpoint".format(pipeline_name, msg))
        exit(-5)

    except Exception as e:
        self.logger.error(e)
        # Exit by following the logic: if the user wanted to load the model but failed, then continuing the experiment makes no sense.
        exit(-6)

    # Log the model summaries.
    summary_str = self.pipeline.summarize_models_header()
    summary_str += self.pipeline.summarize_models()
    self.logger.info(summary_str)

    # Move the models in the pipeline to GPU.
    if self.app_state.args.use_gpu:
        self.pipeline.cuda()

    # Turn on evaluation mode.
    self.pipeline.eval()

    # Export and log configuration, optionally asking the user for confirmation.
    config_parsing.display_parsing_results(self.logger, self.app_state.args, self.unparsed)
    config_parsing.display_globals(self.logger, self.app_state.globalitems())
    config_parsing.export_experiment_configuration_to_yml(
        self.logger, self.app_state.log_dir,
        "testing_configuration.yml", self.config,
        self.app_state.args.confirm)
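
# The episode-limit logic above, factored into a standalone helper for
# clarity (a sketch mirroring the code, not part of the original class):
# -1 means "one pass over the test set", and any limit larger than the set
# size is clamped down to it.
def resolve_episode_limit(requested_limit, task_size_in_episodes):
    if requested_limit == -1 or requested_limit > task_size_in_episodes:
        return task_size_in_episodes
    return requested_limit

assert resolve_episode_limit(-1, 100) == 100
assert resolve_episode_limit(250, 100) == 100
assert resolve_episode_limit(50, 100) == 50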
def setup_experiment(self):
    """
    Sets up the experiment of all trainers:

        - Calls the base class setup_experiment to parse the command line arguments,
        - Loads the config file(s),
        - Sets up the log directory path,
        - Adds a ``FileHandler`` to the logger,
        - Sets random seeds,
        - Creates the pipeline consisting of many components,
        - Creates the training task manager,
        - Handles curriculum learning if indicated,
        - Creates the validation task manager,
        - Sets the optimizer,
        - Performs testing of compatibility of both the training and validation tasks and the created pipeline.
    """
    # Call base method to parse all command line arguments and add default sections.
    super(Trainer, self).setup_experiment()

    # "Pass" configuration parameters from the "default_training" section to the training section indicated by the section_name.
    self.config.add_default_params({
        self.app_state.args.training_section_name:
        self.config['default_training'].to_dict()
    })
    self.config.del_default_params('default_training')

    # "Pass" configuration parameters from the "default_validation" section to the validation section indicated by the section_name.
    self.config.add_default_params({
        self.app_state.args.validation_section_name:
        self.config['default_validation'].to_dict()
    })
    self.config.del_default_params('default_validation')

    # Check the presence of CUDA-compatible devices.
    if self.app_state.args.use_gpu and (torch.cuda.device_count() == 0):
        self.logger.error("Cannot use GPU as there are no CUDA-compatible devices present in the system!")
        exit(-1)

    # Check if a config file was selected.
    if self.app_state.args.config == '':
        print('Please pass configuration file(s) as --c parameter')
        exit(-2)

    # Split the config paths.
    root_configs = self.app_state.args.config.replace(" ", "").split(',')
    # Expand them to absolute paths.
    abs_root_configs = [path.expanduser(config) for config in root_configs]

    # Get the list of configurations which need to be loaded.
    configs_to_load = config_parse.recurrent_config_parse(abs_root_configs, [], self.app_state.absolute_config_path)

    # Read the YAML files one by one - but in reverse order -> overwrite the first indicated config(s).
    config_parse.reverse_order_config_load(self.config, configs_to_load)

    # -> At this point, the Param Registry contains the configuration loaded (and overwritten) from several files.

    # Log the resulting training configuration.
    conf_str = 'Loaded (initial) configuration:\n'
    conf_str += '=' * 80 + '\n'
    conf_str += yaml.safe_dump(self.config.to_dict(), default_flow_style=False)
    conf_str += '=' * 80 + '\n'
    print(conf_str)

    # Get the training section.
    try:
        tsn = self.app_state.args.training_section_name
        self.config_training = self.config[tsn]
        # We must additionally check if it is None - weird behaviour when using the default value.
        if self.config_training is None:
            raise KeyError()
    except KeyError:
        print("Error: Couldn't retrieve the training section '{}' from the loaded configuration".format(tsn))
        exit(-1)

    # Get the training task type.
    try:
        training_task_type = self.config_training['task']['type']
    except KeyError:
        print("Error: Couldn't retrieve the task 'type' from the training section '{}' in the loaded configuration".format(tsn))
        exit(-1)

    # Get the validation section.
    try:
        vsn = self.app_state.args.validation_section_name
        self.config_validation = self.config[vsn]
        if self.config_validation is None:
            raise KeyError()
    except KeyError:
        print("Error: Couldn't retrieve the validation section '{}' from the loaded configuration".format(vsn))
        exit(-1)

    # Get the validation task type.
    try:
        _ = self.config_validation['task']['type']
    except KeyError:
        print("Error: Couldn't retrieve the task 'type' from the validation section '{}' in the loaded configuration".format(vsn))
        exit(-1)

    # Get the pipeline section.
    try:
        psn = self.app_state.args.pipeline_section_name
        self.config_pipeline = self.config[psn]
        if self.config_pipeline is None:
            raise KeyError()
    except KeyError:
        print("Error: Couldn't retrieve the pipeline section '{}' from the loaded configuration".format(psn))
        exit(-1)

    # Get the pipeline name.
    try:
        pipeline_name = self.config_pipeline['name']
    except KeyError:
        # Use the name of the first configuration file from the command line.
        basename = path.basename(root_configs[0])
        # Take the config filename without extension.
        pipeline_name = path.splitext(basename)[0]
        # Set the pipeline name, so the processor can use it afterwards.
        self.config_pipeline.add_config_params({'name': pipeline_name})

    # Prepare the output path for logging.
    while True:
        # Dirty fix: if log_dir already exists, wait for 1 second and try again.
        try:
            time_str = '{0:%Y%m%d_%H%M%S}'.format(datetime.now())
            if self.app_state.args.exptag != '':
                time_str = time_str + "_" + self.app_state.args.exptag
            self.app_state.log_dir = path.expanduser(self.app_state.args.expdir) + '/' + training_task_type + '/' + pipeline_name + '/' + time_str + '/'
            # Lowercase dir.
            self.app_state.log_dir = self.app_state.log_dir.lower()
            makedirs(self.app_state.log_dir, exist_ok=False)
        except FileExistsError:
            sleep(1)
        else:
            break

    # Set the log file.
    self.app_state.log_file = self.app_state.log_dir + 'trainer.log'
    # Initialize the logger in app state.
    self.app_state.logger = logging.initialize_logger("AppState")
    # Add handlers for the logfile to the worker logger.
    logging.add_file_handler_to_logger(self.logger)
    self.logger.info("Logger directory set to: {}".format(self.app_state.log_dir))

    # Set cpu/gpu types.
    self.app_state.set_types()

    # Models dir.
    self.checkpoint_dir = self.app_state.log_dir + 'checkpoints/'
    makedirs(self.checkpoint_dir, exist_ok=False)

    # Set random seeds in the training section.
    self.set_random_seeds('training', self.config_training)

    # Total number of detected errors.
    errors = 0

    ################# TRAINING PROBLEM #################

    # Build the training task manager.
    self.training = TaskManager('training', self.config_training)
    errors += self.training.build()

    # Parse the curriculum learning section in the loaded configuration.
    if 'curriculum_learning' in self.config_training:

        # Initialize curriculum learning - with values from the loaded configuration.
        self.training.task.curriculum_learning_initialize(self.config_training['curriculum_learning'])

        # If the 'must_finish' key is not present in the config, then it will be finished by default.
        self.config_training['curriculum_learning'].add_default_params({'must_finish': True})
        self.must_finish_curriculum = self.config_training['curriculum_learning']['must_finish']
        self.logger.info("Curriculum Learning activated")

    else:
        # If not using curriculum learning, then it does not have to be finished.
        self.must_finish_curriculum = False
        self.curric_done = True

    ################# VALIDATION PROBLEM #################

    # Build the validation task manager.
    self.validation = TaskManager('validation', self.config_validation)
    errors += self.validation.build()

    ###################### PIPELINE ######################

    # Build the pipeline using the loaded configuration.
    self.pipeline = PipelineManager(pipeline_name, self.config_pipeline)
    errors += self.pipeline.build()

    # Check errors.
    if errors > 0:
        self.logger.error('Found {} errors, terminating execution'.format(errors))
        exit(-2)

    # Show pipeline.
    summary_str = self.pipeline.summarize_all_components_header()
    summary_str += self.training.task.summarize_io("training")
    summary_str += self.validation.task.summarize_io("validation")
    summary_str += self.pipeline.summarize_all_components()
    self.logger.info(summary_str)

    # Handshake definitions.
    self.logger.info("Handshaking training pipeline")
    defs_training = self.training.task.output_data_definitions()
    errors += self.pipeline.handshake(defs_training)

    self.logger.info("Handshaking validation pipeline")
    defs_valid = self.validation.task.output_data_definitions()
    errors += self.pipeline.handshake(defs_valid)

    # Check errors.
    if errors > 0:
        self.logger.error('Found {} errors, terminating execution'.format(errors))
        exit(-2)

    ################## MODEL LOAD/FREEZE #################

    # Load the pretrained model parameters from a checkpoint.
    try:
        # Check command line arguments, then check the load option in the config.
        if self.app_state.args.load_checkpoint != "":
            pipeline_name = self.app_state.args.load_checkpoint
            msg = "command line (--load)"
        elif "load" in self.config_pipeline:
            pipeline_name = self.config_pipeline['load']
            msg = "'pipeline' section of the configuration file"
        else:
            pipeline_name = ""

        # Try to load the model.
        if pipeline_name != "":
            if path.isfile(pipeline_name):
                # Load parameters from the checkpoint.
                self.pipeline.load(pipeline_name)
            else:
                raise Exception("Couldn't load the checkpoint {} indicated in the {}: file does not exist".format(pipeline_name, msg))
            # If we succeeded, we do not want to load the models from file anymore!
        else:
            # Try to load the model parameters - one by one, if set so in the configuration file.
            self.pipeline.load_models()

    except KeyError:
        self.logger.error("File {} indicated in the {} seems not to be a valid model checkpoint".format(pipeline_name, msg))
        exit(-5)

    except Exception as e:
        self.logger.error(e)
        # Exit by following the logic: if the user wanted to load the model but failed, then continuing the experiment makes no sense.
        exit(-6)

    # Finally, freeze the models (that the user wants to freeze).
    self.pipeline.freeze_models()

    # Log the model summaries.
    summary_str = self.pipeline.summarize_models_header()
    summary_str += self.pipeline.summarize_models()
    self.logger.info(summary_str)

    # Move the models in the pipeline to GPU.
    if self.app_state.args.use_gpu:
        self.pipeline.cuda()

    ################# OPTIMIZER #################

    # Set the optimizer.
    optimizer_conf = dict(self.config_training['optimizer'])
    optimizer_type = optimizer_conf['type']
    del optimizer_conf['type']

    # Check if there are any trainable models in the pipeline.
    if len(list(filter(lambda p: p.requires_grad, self.pipeline.parameters()))) == 0:
        self.logger.error('Cannot proceed with training, as there are no trainable models in the pipeline (or all models are frozen)')
        exit(-7)

    # Instantiate the optimizer, filtering the model parameters to those that require gradients.
    self.optimizer = getattr(torch.optim, optimizer_type)(
        filter(lambda p: p.requires_grad, self.pipeline.parameters()),
        **optimizer_conf)

    log_str = 'Optimizer:\n' + '=' * 80 + "\n"
    log_str += "  Type: " + optimizer_type + "\n"
    log_str += "  Params: {}".format(optimizer_conf)
    self.logger.info(log_str)
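
# Standalone illustration of the optimizer construction above: the config
# 'type' must name a class in torch.optim, and the remaining keys are passed
# through as keyword arguments. The Linear module is a stand-in for the
# pipeline's trainable parameters; the config values are made up.
import torch

model = torch.nn.Linear(10, 2)
optimizer_conf = {'type': 'Adam', 'lr': 1.0e-3}
optimizer_type = optimizer_conf.pop('type')
optimizer = getattr(torch.optim, optimizer_type)(
    filter(lambda p: p.requires_grad, model.parameters()),
    **optimizer_conf)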
def build(task, config, task_subset_name):
    """
    Static method returning a particular sampler, depending on the name \
    provided in the list of parameters & the specified task class.

    :param task: Instance of an object derived from the Task class.
    :type task: ``tasks.Task``

    :param config: Parameters used to instantiate the sampler.
    :type config: :py:class:`ptp.configuration.ConfigInterface`

    :param task_subset_name: Name of the task subset (and the associated TaskManager object).

    .. note::

        ``config`` should contain the exact (case-sensitive) class name of the sampler to instantiate.

    .. warning::

        ``torch.utils.data.sampler.BatchSampler``, \
        ``torch.utils.data.sampler.DistributedSampler`` are not supported yet.

    .. note::

        ``torch.utils.data.sampler.SubsetRandomSampler`` expects 'indices' to index a subset of the dataset. \
        Currently, the user can specify these indices using one of the following options:

        - Option 1: range.

            >>> indices = range(20)

        - Option 2: range as str.

            >>> range_str = '0, 20'

        - Option 3: list of indices.

            >>> yaml_list = yaml.safe_load('[0, 2, 5, 10]')

        - Option 4: name of the file containing indices.

            >>> filename = "~/data/mnist/training_indices.txt"

    .. note::

        ``torch.utils.data.sampler.WeightedRandomSampler`` expects the additional parameter 'weights'.

    :return: Instance of a given sampler or ``None`` if the section is not present or the sampler couldn't be built.
    """
    # Initialize logger.
    logger = logging.initialize_logger('SamplerFactory')

    try:
        # Check the presence of the 'type' attribute.
        if 'type' not in config:
            raise ConfigurationError("The sampler configuration section does not contain the key 'type'")

        # Get the class typename.
        typename = config['type']
        logger.info('Trying to instantiate the {} sampler object'.format(typename))

        ###########################################################################
        # Handle the first special case: SubsetRandomSampler.
        if typename == 'SubsetRandomSampler':

            # Check the presence of the 'indices' attribute.
            if 'indices' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'indices' "
                                         "required by SubsetRandomSampler")

            # Get and process the indices.
            indices = config['indices']

            # Analyze the type.
            if type(indices) == str:
                # Try to open the file.
                try:
                    # From expanduser()'s doc: if the expansion fails or if the path does not begin
                    # with a tilde, the path is returned unchanged. -> So the operation below should be safe.
                    file = open(os.path.expanduser(indices), "r")
                    # Read the file.
                    indices = file.readline()
                    file.close()
                except Exception:
                    # Ok, this is not a file.
                    pass
                finally:
                    # Try to process it as a string.
                    # Get the digits.
                    digits = indices.split(',')
                    indices = [int(x) for x in digits]
            else:
                # Assume that type(indices) is a list of ints.
                digits = indices

            # Finally, we got the list of digits.
            if len(digits) == 2:
                # Create a range.
                indices = range(int(digits[0]), int(digits[1]))
            # Else: use them as they are, including a single index.

            # Check if the indices are within range.
            if max(indices) >= len(task):
                raise ConfigurationError("SubsetRandomSampler cannot work properly when indices are out of range ({}) "
                                         "considering that there are {} samples in the task".format(max(indices), len(task)))

            # Create the sampler object.
            sampler = pt_samplers.SubsetRandomSampler(indices)

        ###########################################################################
        # Handle the second special case: WeightedRandomSampler.
        elif typename == 'WeightedRandomSampler':

            # Check the presence of the attribute.
            if 'weights' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'weights' "
                                         "required by WeightedRandomSampler")

            # Load the weights from file.
            weights = np.fromfile(os.path.expanduser(config['weights']), dtype=float, count=-1, sep=',')

            # Create the sampler object.
            sampler = pt_samplers.WeightedRandomSampler(weights, len(task), replacement=True)

        ###########################################################################
        # Handle the third special case: kFoldRandomSampler.
        elif typename == 'kFoldRandomSampler':

            # Check the presence of the attribute.
            if 'folds' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'folds' "
                                         "required by kFoldRandomSampler")

            # Create indices, depending on the fold.
            folds = config["folds"]
            if folds < 2:
                raise ConfigurationError("kFoldRandomSampler requires at least two 'folds'")

            # Get the number of epochs per fold (default: 1).
            epochs_per_fold = config.get("epochs_per_fold", 1)

            # Create the sampler object.
            sampler = ptp_samplers.kFoldRandomSampler(len(task), folds, epochs_per_fold, task_subset_name == 'training')

        ###########################################################################
        # Handle the fourth special case: kFoldWeightedRandomSampler.
        elif typename == 'kFoldWeightedRandomSampler':

            # Check the presence of the attribute.
            if 'weights' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'weights' "
                                         "required by kFoldWeightedRandomSampler")

            # Load the weights from file.
            weights = np.fromfile(os.path.expanduser(config['weights']), dtype=float, count=-1, sep=',')

            # Check the presence of the attribute.
            if 'folds' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'folds' "
                                         "required by kFoldWeightedRandomSampler")

            # Create indices, depending on the fold.
            folds = config["folds"]
            if folds < 2:
                raise ConfigurationError("kFoldWeightedRandomSampler requires at least two 'folds'")

            # Get the number of epochs per fold (default: 1).
            epochs_per_fold = config.get("epochs_per_fold", 1)

            # Create the sampler object.
            sampler = ptp_samplers.kFoldWeightedRandomSampler(weights, len(task), folds, epochs_per_fold, task_subset_name == 'training')

        elif typename in ['BatchSampler', 'DistributedSampler']:
            # Sorry, those are not supported. Yet ;)
            raise ConfigurationError("Sampler Factory currently does not support the '{}' sampler. Please pick one of the others "
                                     "or use default random sampling".format(typename))

        else:
            # Verify that the specified class is in the samplers package.
            if typename not in dir(pt_samplers):
                raise ConfigurationError("Could not find the specified class '{}' in the samplers package".format(typename))

            # Get the sampler class.
            sampler_class = getattr(pt_samplers, typename)
            # Create a "regular" sampler.
            sampler = sampler_class(task)

        # Return the sampler.
        return sampler

    except ConfigurationError as e:
        logger.error(e)
        # Do not continue with an invalid sampler.
        exit(-1)
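
# The 'indices' parsing above, distilled into a standalone helper (a sketch
# mirroring the factory logic, not the original function): a string is first
# tried as a filename, then parsed as a comma-separated list; exactly two
# values are interpreted as a range.
import os

def parse_indices_sketch(indices):
    if isinstance(indices, str):
        try:
            with open(os.path.expanduser(indices), "r") as file:
                indices = file.readline()
        except OSError:
            pass  # Not a file - treat it as a plain string.
        digits = [int(x) for x in indices.split(',')]
    else:
        # Assume a list of ints.
        digits = indices
    if len(digits) == 2:
        return range(digits[0], digits[1])
    return digits

assert list(parse_indices_sketch('0, 4')) == [0, 1, 2, 3]
assert parse_indices_sketch([0, 2, 5, 10]) == [0, 2, 5, 10]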
def build(problem, config):
    """
    Static method returning a particular sampler, depending on the name \
    provided in the list of parameters & the specified problem class.

    :param problem: Instance of an object derived from the Problem class.
    :type problem: ``problems.Problem``

    :param config: Parameters used to instantiate the sampler.
    :type config: :py:class:`ptp.configuration.ConfigInterface`

    .. note::

        ``config`` should contain the exact (case-sensitive) class name of the sampler to instantiate.

    .. warning::

        ``torch.utils.data.sampler.BatchSampler``, \
        ``torch.utils.data.sampler.DistributedSampler`` are not supported yet.

    .. note::

        ``torch.utils.data.sampler.SubsetRandomSampler`` expects 'indices' to index a subset of the dataset. \
        Currently, the user can specify these indices using one of the following options:

        - Option 1: range.

            >>> indices = range(20)

        - Option 2: range as str.

            >>> range_str = '0, 20'

        - Option 3: list of indices.

            >>> yaml_list = yaml.safe_load('[0, 2, 5, 10]')

        - Option 4: name of the file containing indices.

            >>> filename = "~/data/mnist/training_indices.txt"

    .. note::

        ``torch.utils.data.sampler.WeightedRandomSampler`` expects the additional parameter 'weights'.

    :return: Instance of a given sampler or ``None`` if the section is not present or the sampler couldn't be built.
    """
    # Initialize logger.
    logger = logging.initialize_logger('SamplerFactory')

    # Check if a sampler is required, i.e. whether the 'sampler' section is empty.
    if not config:
        logger.info("The sampler configuration section is not present, using default 'random' sampling")
        return None

    try:
        # Check the presence of the 'name' attribute.
        if 'name' not in config:
            raise ConfigurationError("The sampler configuration section does not contain the key 'name'")

        # Get the class name.
        name = config['name']

        # Verify that the specified class is in the samplers package.
        if name not in dir(torch.utils.data.sampler):
            raise ConfigurationError("Could not find the specified class '{}' in the samplers package".format(name))

        # Get the actual class.
        sampler_class = getattr(torch.utils.data.sampler, name)
        # Ok, proceed.
        logger.info('Loading the {} sampler from {}'.format(name, sampler_class.__module__))

        # Handle the "special" case.
        if sampler_class.__name__ == 'SubsetRandomSampler':

            # Check the presence of the 'indices' attribute.
            if 'indices' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'indices' "
                                         "required by SubsetRandomSampler.")

            indices = config['indices']

            # Analyze the type.
            if type(indices) == str:
                # Try to open the file.
                try:
                    # From expanduser()'s doc: if the expansion fails or if the path does not begin
                    # with a tilde, the path is returned unchanged. -> So the operation below should be safe.
                    file = open(os.path.expanduser(indices), "r")
                    # Read the file.
                    indices = file.readline()
                    file.close()
                except Exception:
                    # Ok, this is not a file.
                    pass
                finally:
                    # Try to process it as a string.
                    # Get the digits.
                    digits = indices.split(',')
                    indices = [int(x) for x in digits]
            else:
                # Assume that type(indices) is a list of ints.
                digits = indices

            # Finally, we got the list of digits.
            if len(digits) == 2:
                # Create a range.
                indices = range(int(digits[0]), int(digits[1]))
            # Else: use them as they are.

            # Check if the indices are within range.
            if max(indices) >= len(problem):
                logger.error("SubsetRandomSampler cannot work properly when indices are out of range ({}) "
                             "considering that there are {} samples in the problem!".format(max(indices), len(problem)))
                exit(-1)

            # Create the sampler object.
            sampler = sampler_class(indices)

        elif sampler_class.__name__ == 'WeightedRandomSampler':

            # Check the presence of the 'weights' attribute.
            if 'weights' not in config:
                raise ConfigurationError("The sampler configuration section does not contain the key 'weights' "
                                         "required by WeightedRandomSampler.")

            # Load the weights from file.
            weights = np.fromfile(os.path.expanduser(config['weights']), dtype=float, count=-1, sep=',')

            # Create the sampler object.
            sampler = sampler_class(weights, len(problem), replacement=True)

        elif sampler_class.__name__ in ['BatchSampler', 'DistributedSampler']:
            # Sorry, those are not supported. Yet ;)
            logger.error("Sampler Factory currently does not support the {} sampler. Please pick one of the others "
                         "or use default random sampling.".format(sampler_class.__name__))
            exit(-2)

        else:
            # Create a "regular" sampler.
            sampler = sampler_class(problem)

        # Return the sampler.
        return sampler

    except ConfigurationError as e:
        logger.error(e)
        logger.warning("Using default sampling without a sampler.")
        return None
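
# Usage sketch for the 'weights' file consumed above: a flat, comma-separated
# list of per-sample weights, loaded via np.fromfile(..., sep=','). The path
# and values below are illustrative.
import numpy as np

np.array([0.1, 0.9, 0.5]).tofile("/tmp/sample_weights.csv", sep=',')
weights = np.fromfile("/tmp/sample_weights.csv", dtype=float, count=-1, sep=',')
assert len(weights) == 3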
def setup_experiment(self):
    """
    Sets up the experiment of all trainers:

        - Calls the base class setup_experiment to parse the command line arguments,
        - Loads the config file(s):

            >>> configs_to_load = self.recurrent_config_parse(flags.config, [])

        - Sets up the log directory path:

            >>> os.makedirs(self.log_dir, exist_ok=False)

        - Adds a ``FileHandler`` to the logger:

            >>> self.add_file_handler_to_logger(self.log_file)

        - Sets random seeds:

            >>> self.set_random_seeds(self.config['training'], 'training')

        - Creates the pipeline consisting of many components,
        - Creates the training problem manager,
        - Handles curriculum learning if indicated:

            >>> if 'curriculum_learning' in self.config['training']:
            >>>     ...

        - Creates the validation problem manager,
        - Sets the optimizer:

            >>> self.optimizer = getattr(torch.optim, optimizer_name)

        - Performs testing of compatibility of both the training and validation pipelines.
    """
    # Call base method to parse all command line arguments and add default sections.
    super(Trainer, self).setup_experiment()

    # Check if a config file was selected.
    if self.app_state.args.config == '':
        print('Please pass configuration file(s) as --c parameter')
        exit(-1)

    # Check the presence of CUDA-compatible devices.
    if self.app_state.args.use_gpu and (torch.cuda.device_count() == 0):
        self.logger.error("Cannot use GPU as there are no CUDA-compatible devices present in the system!")
        exit(-2)

    # Check if the config file exists.
    root_config = self.app_state.args.config
    if not os.path.isfile(root_config):
        print('Error: Configuration file {} does not exist'.format(root_config))
        exit(-3)

    # Extract the absolute path to the main ptp 'configs' directory.
    abs_config_path = os.path.abspath(root_config)
    # Save it in app_state!
    self.app_state.absolute_config_path = abs_config_path[:abs_config_path.find("configs")+8]
    # Get the relative path.
    rel_config_path = abs_config_path[abs_config_path.find("configs")+8:]

    # Get the list of configurations which need to be loaded.
    configs_to_load = config_parse.recurrent_config_parse(rel_config_path, [], self.app_state.absolute_config_path)

    # Read the YAML files one by one - but in reverse order -> overwrite the first indicated config(s).
    config_parse.reverse_order_config_load(self.config, configs_to_load, self.app_state.absolute_config_path)

    # -> At this point, the Param Registry contains the configuration loaded (and overwritten) from several files.

    # Log the resulting training configuration.
    conf_str = 'Loaded (initial) configuration:\n'
    conf_str += '=' * 80 + '\n'
    conf_str += yaml.safe_dump(self.config.to_dict(), default_flow_style=False)
    conf_str += '=' * 80 + '\n'
    print(conf_str)

    # Get the training problem type.
    try:
        training_problem_type = self.config['training']['problem']['type']
    except KeyError:
        print("Error: Couldn't retrieve the problem 'type' from the 'training' section in the loaded configuration")
        exit(-1)

    # Get the validation problem type.
    try:
        _ = self.config['validation']['problem']['type']
    except KeyError:
        print("Error: Couldn't retrieve the problem 'type' from the 'validation' section in the loaded configuration")
        exit(-1)

    # Get the pipeline name.
    try:
        pipeline_name = self.config['pipeline']['name']
    except KeyError:
        print("Error: Couldn't retrieve the pipeline 'name' from the loaded configuration")
        exit(-1)

    # Prepare the output path for logging.
    while True:
        # Dirty fix: if log_dir already exists, wait for 1 second and try again.
        try:
            time_str = '{0:%Y%m%d_%H%M%S}'.format(datetime.now())
            if self.app_state.args.savetag != '':
                time_str = time_str + "_" + self.app_state.args.savetag
            self.log_dir = os.path.expanduser(self.app_state.args.expdir) + '/' + training_problem_type + '/' + pipeline_name + '/' + time_str + '/'
            # Lowercase dir.
            self.log_dir = self.log_dir.lower()
            os.makedirs(self.log_dir, exist_ok=False)
        except FileExistsError:
            sleep(1)
        else:
            break

    # Set the log file.
    self.app_state.log_file = self.log_dir + 'trainer.log'
    # Initialize the logger in app state.
    self.app_state.logger = logging.initialize_logger("AppState")
    # Add handlers for the logfile to the worker logger.
    logging.add_file_handler_to_logger(self.logger)
    self.logger.info("Logger directory set to: {}".format(self.log_dir))

    # Set cpu/gpu types.
    self.app_state.set_types()

    # Models dir.
    self.checkpoint_dir = self.log_dir + 'checkpoints/'
    os.makedirs(self.checkpoint_dir, exist_ok=False)

    # Set random seeds in the training section.
    self.set_random_seeds('training', self.config['training'])

    # Total number of detected errors.
    errors = 0

    ################# TRAINING PROBLEM #################

    # Build the training problem manager.
    self.training = ProblemManager('training', self.config['training'])
    errors += self.training.build()

    # Parse the curriculum learning section in the loaded configuration.
    if 'curriculum_learning' in self.config['training']:

        # Initialize curriculum learning - with values from the loaded configuration.
        self.training.problem.curriculum_learning_initialize(self.config['training']['curriculum_learning'])

        # Set the initial values of curriculum learning.
        self.curric_done = self.training.problem.curriculum_learning_update_params(0)

        # If the 'must_finish' key is not present in the config, then it will be finished by default.
        self.config['training']['curriculum_learning'].add_default_params({'must_finish': True})
        self.must_finish_curriculum = self.config['training']['curriculum_learning']['must_finish']
        self.logger.info("Curriculum Learning activated")

    else:
        # If not using curriculum learning, then it does not have to be finished.
        self.must_finish_curriculum = False
        self.curric_done = True

    ################# VALIDATION PROBLEM #################

    # Build the validation problem manager.
    self.validation = ProblemManager('validation', self.config['validation'])
    errors += self.validation.build()

    # Generate a single batch used for partial validation.
    if errors == 0:
        self.validation_dict = next(iter(self.validation.dataloader))

    ###################### PIPELINE ######################

    # Build the pipeline using the loaded configuration.
    self.pipeline = PipelineManager(pipeline_name, self.config['pipeline'])
    errors += self.pipeline.build()

    # Check errors.
    if errors > 0:
        self.logger.error('Found {} errors, terminating execution'.format(errors))
        exit(-2)

    # Show pipeline.
    summary_str = self.pipeline.summarize_all_components_header()
    summary_str += self.training.problem.summarize_io("training")
    summary_str += self.validation.problem.summarize_io("validation")
    summary_str += self.pipeline.summarize_all_components()
    self.logger.info(summary_str)

    # Handshake definitions.
self.logger.info("Handshaking training pipeline") defs_training = self.training.problem.output_data_definitions() errors += self.pipeline.handshake(defs_training) self.logger.info("Handshaking validation pipeline") defs_valid = self.validation.problem.output_data_definitions() errors += self.pipeline.handshake(defs_valid) # Check errors. if errors > 0: self.logger.error('Found {} errors, terminating execution'.format(errors)) exit(-2) ################## MODEL LOAD/FREEZE ################# # Load the pretrained models params from checkpoint. try: # Check command line arguments, then check load option in config. if self.app_state.args.load_checkpoint != "": pipeline_name = self.app_state.args.load_checkpoint msg = "command line (--load)" elif "load" in self.config['pipeline']: pipeline_name = self.config['pipeline']['load'] msg = "'pipeline' section of the configuration file" else: pipeline_name = "" # Try to load the model. if pipeline_name != "": if os.path.isfile(pipeline_name): # Load parameters from checkpoint. self.pipeline.load(pipeline_name) else: raise Exception("Couldn't load the checkpoint {} indicated in the {}: file does not exist".format(pipeline_name, msg)) # Try to load the models parameters - one by one, if set so in the configuration file. self.pipeline.load_models() except KeyError: self.logger.error("File {} indicated in the {} seems not to be a valid model checkpoint".format(pipeline_name, msg)) exit(-5) except Exception as e: self.logger.error(e) # Exit by following the logic: if user wanted to load the model but failed, then continuing the experiment makes no sense. exit(-6) # Finally, freeze the models (that the user wants to freeze). self.pipeline.freeze_models() # Log the model summaries. summary_str = self.pipeline.summarize_models_header() summary_str += self.pipeline.summarize_models() self.logger.info(summary_str) # Move the models in the pipeline to GPU. if self.app_state.args.use_gpu: self.pipeline.cuda() ################# OPTIMIZER ################# # Set the optimizer. optimizer_conf = dict(self.config['training']['optimizer']) optimizer_name = optimizer_conf['name'] del optimizer_conf['name'] # Check if there are any models in the pipeline. if len(list(filter(lambda p: p.requires_grad, self.pipeline.parameters()))) == 0: self.logger.error('Cannot proceed with training, as there are no trainable models in the pipeline (or all models are frozen)') exit(-7) # Instantiate the optimizer and filter the model parameters based on if they require gradients. self.optimizer = getattr(torch.optim, optimizer_name)( filter(lambda p: p.requires_grad, self.pipeline.parameters()), **optimizer_conf) log_str = 'Optimizer:\n' + '='*80 + "\n" log_str += " Name: " + optimizer_name + "\n" log_str += " Params: {}".format(optimizer_conf) self.logger.info(log_str)