def get_preprocessing_params(model_definition):
    model_definition = merge_with_defaults(model_definition)

    global_preprocessing_parameters = model_definition['preprocessing']
    features = (
        model_definition['input_features'] +
        model_definition['output_features']
    )

    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters
    )

    merged_preprocessing_params = []
    for feature in features:
        if 'preprocessing' in feature:
            local_preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            local_preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]
        merged_preprocessing_params.append(
            (feature['name'], feature[TYPE], local_preprocessing_parameters)
        )

    return merged_preprocessing_params

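# Everything in this file leans on merge_dict(defaults, overrides), which
# returns a new dict in which values from the second argument take precedence.
# Below is a minimal sketch of those assumed semantics (Ludwig's own
# implementation may differ in detail); merge_dict_sketch is a hypothetical
# stand-in for illustration, not the library function.
import copy as _copy


def merge_dict_sketch(dct, override):
    # Deep-copy the base dict, then recursively fold in the overrides:
    # nested dicts are merged key by key, everything else is replaced.
    merged = _copy.deepcopy(dct)
    for key, value in override.items():
        if (key in merged and isinstance(merged[key], dict)
                and isinstance(value, dict)):
            merged[key] = merge_dict_sketch(merged[key], value)
        else:
            merged[key] = _copy.deepcopy(value)
    return merged


# Example: a feature-level 'preprocessing' dict overriding a global default.
assert merge_dict_sketch(
    {'missing_value_strategy': 'fill_with_const', 'fill_value': 0},
    {'missing_value_strategy': 'fill_with_mode'},
) == {'missing_value_strategy': 'fill_with_mode', 'fill_value': 0}
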
def build_data(
        dataset_df,
        features,
        train_set_metadata,
        global_preprocessing_parameters
):
    data_dict = {}
    for feature in features:
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]

        # deal with encoders that have fixed preprocessing
        if 'encoder' in feature:
            encoders_registry = get_from_registry(
                feature[TYPE],
                input_type_registry
            ).encoder_registry
            encoder_class = encoders_registry[feature['encoder']]
            if hasattr(encoder_class, 'fixed_preprocessing_parameters'):
                encoder_fpp = encoder_class.fixed_preprocessing_parameters
                preprocessing_parameters = merge_dict(
                    preprocessing_parameters,
                    resolve_pointers(encoder_fpp, feature, 'feature.')
                )

        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )

        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']
        ]['preprocessing'] = preprocessing_parameters

        add_feature_data = get_from_registry(
            feature[TYPE],
            base_type_registry
        ).add_feature_data
        add_feature_data(
            feature,
            dataset_df,
            data_dict,
            train_set_metadata,
            preprocessing_parameters
        )
    return data_dict

def _model_select(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
    default_configs,
    user_config,
):
    """Performs model selection based on dataset or user-specified model.

    Note: the current implementation returns tabnet by default.
    """
    dataset_info = get_dataset_info(dataset) if not isinstance(
        dataset, DatasetInfo) else dataset
    fields = dataset_info.fields

    base_config = default_configs["base_config"]

    # tabular dataset heuristics
    if len(fields) > 3:
        base_config = merge_dict(
            base_config, default_configs["combiner"]["tabnet"])

        # override combiner heuristic if explicitly provided by user
        if user_config is not None:
            if "combiner" in user_config.keys():
                model_type = user_config["combiner"]["type"]
                base_config = merge_dict(
                    base_config, default_configs["combiner"][model_type])
    else:
        # text heuristics
        for input_feature in base_config["input_features"]:
            # default text encoder is bert
            # TODO (ASN): add more robust heuristics
            if input_feature["type"] == "text":
                input_feature["encoder"] = "bert"
                base_config = merge_dict(
                    base_config, default_configs["text"]["bert"])

        # TODO (ASN): add image heuristics

    # override and constrain automl config based on user-specified values
    if user_config is not None:
        base_config = merge_dict(base_config, user_config)

        # remove from the hyperparameter search all parameters that the user
        # has provided explicit values for (the loop variable is named
        # hyperopt_param to avoid shadowing the dict it iterates over)
        hyperopt_params = copy.deepcopy(base_config["hyperopt"]["parameters"])
        for hyperopt_param in hyperopt_params.keys():
            config_section, param = (hyperopt_param.split(".")[0],
                                     hyperopt_param.split(".")[1])
            if config_section in user_config.keys():
                if param in user_config[config_section]:
                    del base_config["hyperopt"]["parameters"][hyperopt_param]

    return base_config

def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        get_feature_meta = get_from_registry(
            feature[TYPE],
            base_type_registry
        ).get_feature_meta
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]
        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters
        )
    return train_set_metadata

def test_missing_outputs_drop_rows():
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        DEFAULTS: {
            CATEGORY: {
                PREPROCESSING: {
                    MISSING_VALUE_STRATEGY: FILL_WITH_MODE
                }
            }
        },
    }

    merged_config = merge_with_defaults(config)

    global_preprocessing = merged_config[DEFAULTS]
    input_feature_config = merged_config[INPUT_FEATURES][0]
    output_feature_config = merged_config[OUTPUT_FEATURES][0]

    assert output_feature_config[PREPROCESSING][
        MISSING_VALUE_STRATEGY] == DROP_ROW

    feature_preprocessing = merge_dict(
        global_preprocessing[output_feature_config[TYPE]][PREPROCESSING],
        output_feature_config[PREPROCESSING])
    assert feature_preprocessing[MISSING_VALUE_STRATEGY] == DROP_ROW

    feature_preprocessing = global_preprocessing[
        input_feature_config[TYPE]][PREPROCESSING]
    assert feature_preprocessing[MISSING_VALUE_STRATEGY] == FILL_WITH_MODE

def build_dataset_df(dataset_df,
                     features,
                     global_preprocessing_parameters,
                     train_set_metadata=None,
                     random_seed=default_random_seed,
                     **kwargs):
    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters)

    if train_set_metadata is None:
        train_set_metadata = build_metadata(dataset_df,
                                            features,
                                            global_preprocessing_parameters)

    data_val = build_data(dataset_df,
                          features,
                          train_set_metadata,
                          global_preprocessing_parameters)

    data_val[SPLIT] = get_split(
        dataset_df,
        force_split=global_preprocessing_parameters['force_split'],
        split_probabilities=global_preprocessing_parameters[
            'split_probabilities'],
        stratify=global_preprocessing_parameters['stratify'],
        random_seed=random_seed)

    return data_val, train_set_metadata

def create_metrics_report(experiment_name: str) -> Tuple[Dict[str, Any], str]:
    """Compiles performance and non-performance (resource usage) metrics.

    :param experiment_name: name referring to the experiment.

    Returns the full report and the path where it is saved.
    """
    full_report = dict()
    os.makedirs(
        os.path.join(os.getcwd(), experiment_name, "metrics_report"),
        exist_ok=True)
    for tag in [TRAIN_TAG, EVAL_TAG]:
        if tag == TRAIN_TAG:
            resource_usage_path = os.path.join(
                os.getcwd(), experiment_name, CACHE,
                "train_resource_usage_metrics.json")
            performance_path = os.path.join(
                os.getcwd(), experiment_name, EXPERIMENT_RUN,
                "training_statistics.json")
        elif tag == EVAL_TAG:
            resource_usage_path = os.path.join(
                os.getcwd(), experiment_name, CACHE,
                "evaluate_resource_usage_metrics.json")
            performance_path = os.path.join(
                os.getcwd(), experiment_name, EXPERIMENT_RUN,
                "test_statistics.json")
        else:
            raise ValueError(
                "Tag unrecognized. Please choose 'train' or 'evaluate'.")

        resource_usage_metrics = load_json(resource_usage_path)
        performance_metrics = load_json(performance_path)
        full_report[tag] = merge_dict(performance_metrics,
                                      resource_usage_metrics)

    merged_file_path = os.path.join(
        os.getcwd(), experiment_name, "metrics_report", "full_report.json")
    save_json(merged_file_path, full_report)
    return full_report, merged_file_path

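# A hedged usage sketch for create_metrics_report: it assumes prior train and
# evaluate runs have already written the resource-usage and statistics JSON
# files under <cwd>/<experiment_name>/; the experiment name is hypothetical.
#
#     full_report, report_path = create_metrics_report("demo_experiment")
#     print(list(full_report))  # [TRAIN_TAG, EVAL_TAG]
#     print(report_path)        # <cwd>/demo_experiment/metrics_report/full_report.json
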
def merge_with_defaults(config):
    config = copy.deepcopy(config)
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_training(config)

    # ===== Preprocessing =====
    config["preprocessing"] = merge_dict(
        default_preprocessing_parameters, config.get("preprocessing", {}))

    stratify = config["preprocessing"]["stratify"]
    if stratify is not None:
        features = config["input_features"] + config["output_features"]
        feature_names = {f[COLUMN] for f in features}
        if stratify not in feature_names:
            logger.warning("Stratify is not among the features. "
                           "Cannot establish if it is a binary or category")
        elif [f for f in features
              if f[COLUMN] == stratify][0][TYPE] not in {BINARY, CATEGORY}:
            raise ValueError("Stratify feature must be binary or category")

    # ===== Training =====
    set_default_value(config, TRAINING, default_training_params)
    for param, value in default_training_params.items():
        set_default_value(config[TRAINING], param, value)

    set_default_value(
        config[TRAINING],
        "validation_metric",
        output_type_registry[
            config["output_features"][0][TYPE]].default_validation_metric,
    )

    # ===== Training Optimizer =====
    optimizer = config[TRAINING]["optimizer"]
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in config["input_features"]:
        get_from_registry(
            input_feature[TYPE],
            input_type_registry).populate_defaults(input_feature)

    # ===== Combiner =====
    set_default_value(config, "combiner", {TYPE: default_combiner_type})

    # ===== Output features =====
    for output_feature in config["output_features"]:
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

    return config

def update_feature_from_defaults(config: Dict[str, Any],
                                 feature_dict: Dict[str, Any],
                                 config_feature_group: str):
    """Updates feature_dict belonging to an input or output feature using
    global encoder, decoder and loss related default parameters specified in
    the Ludwig config.

    :param config: Ludwig configuration containing parameters for different
           sections, including global default parameters for preprocessing,
           encoder, decoder and loss.
    :type config: dict[str, any]
    :param feature_dict: Underlying config for the specific input/output
           feature. This may be updated with values from the global defaults
           specified in config.
    :type config_feature_group: Indicates whether the feature is an input
           feature or output feature (can be either `input_features` or
           `output_features`).
    :type config_feature_group: str
    """
    parameter = ENCODER if config_feature_group == INPUT_FEATURES else DECODER
    registry_type = (input_type_registry
                     if config_feature_group == INPUT_FEATURES
                     else output_type_registry)

    default_params_for_feature_type = get_defaults_section_for_feature_type(
        feature_dict[TYPE], config[DEFAULTS], parameter
    )

    # Update input feature encoder or output feature decoder if it is
    # specified in global defaults
    # TODO(#2125): This code block needs some refactoring.
    if TYPE in default_params_for_feature_type:
        # Only update encoder or decoder if the feature isn't already using
        # a default encoder or decoder
        default_encoder_or_decoder = get_default_encoder_or_decoder(
            feature_dict, config_feature_group)
        if default_params_for_feature_type[TYPE] != default_encoder_or_decoder:
            # Update type and populate defaults for the encoder or decoder type
            feature_dict[parameter] = default_params_for_feature_type[TYPE]
            get_from_registry(
                feature_dict[TYPE], registry_type
            ).populate_defaults(feature_dict)
        # Make a copy of default encoder or decoder parameters without the
        # type key.
        default_params_for_feature_type = copy.deepcopy(
            default_params_for_feature_type)
        default_params_for_feature_type.pop(TYPE, None)

    # Update encoder or decoder with other encoder/decoder related parameters
    feature_dict.update(
        merge_dict(feature_dict, default_params_for_feature_type))

    # Update loss parameters for output feature from global defaults
    if parameter == DECODER:
        default_loss_params_for_feature_type = \
            get_defaults_section_for_feature_type(
                feature_dict[TYPE], config[DEFAULTS], LOSS
            )
        feature_dict[LOSS].update(
            merge_dict(feature_dict[LOSS],
                       default_loss_params_for_feature_type))

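# For illustration, a hypothetical config fragment that
# update_feature_from_defaults acts on: the `defaults` section pins an encoder
# type and a parameter for every text input feature, and the call merges them
# into the feature dict (feature-level values still win, since merge_dict
# gives its second argument precedence).
#
#     example_config = {
#         'input_features': [{'name': 'review', 'type': 'text'}],
#         'output_features': [{'name': 'label', 'type': 'category'}],
#         'defaults': {'text': {'encoder': {'type': 'bert',
#                                           'trainable': False}}},
#     }
#     update_feature_from_defaults(example_config,
#                                  example_config['input_features'][0],
#                                  'input_features')
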
def overwrite_defaults(self, feature):
    attributes = set(self.__dict__.keys())
    attributes.update(self.__class__.__dict__.keys())

    for k in feature.keys():
        if k in attributes:
            if (isinstance(feature[k], dict) and hasattr(self, k)
                    and isinstance(getattr(self, k), dict)):
                setattr(self, k, merge_dict(getattr(self, k), feature[k]))
            else:
                setattr(self, k, feature[k])

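# A minimal, self-contained sketch of how overwrite_defaults behaves, using a
# hypothetical feature class (attaching the method by assignment is purely for
# demonstration): dict-valued attributes are merged, with the incoming values
# winning, while everything else is replaced outright.
class _ExampleFeature:
    embedding_size = 64
    preprocessing = {'missing_value_strategy': 'fill_with_const',
                     'fill_value': 0}


_ExampleFeature.overwrite_defaults = overwrite_defaults

f = _ExampleFeature()
f.overwrite_defaults({
    'embedding_size': 128,                                   # replaced
    'preprocessing': {'missing_value_strategy': 'drop_row'}  # merged
})
assert f.embedding_size == 128
assert f.preprocessing == {'missing_value_strategy': 'drop_row',
                           'fill_value': 0}
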
def merge_with_defaults(model_definition):
    _perform_sanity_checks(model_definition)

    # ===== Preprocessing =====
    model_definition['preprocessing'] = merge_dict(
        default_preprocessing_parameters,
        model_definition.get('preprocessing', {}))

    stratify = model_definition['preprocessing']['stratify']
    if stratify is not None:
        if stratify not in [
            x['name'] for x in model_definition['output_features']
        ]:
            raise ValueError('Stratify must be in output features')
        if ([
            x for x in model_definition['output_features']
            if x['name'] == stratify
        ][0][TYPE] not in [BINARY, CATEGORY]):
            raise ValueError('Stratify feature must be binary or category')

    # ===== Model =====
    set_default_value(model_definition, 'combiner',
                      {'type': default_combiner_type})

    # ===== Training =====
    set_default_value(model_definition, TRAINING, default_training_params)
    for param, value in default_training_params.items():
        set_default_value(model_definition[TRAINING], param, value)

    set_default_value(
        model_definition[TRAINING],
        'validation_metric',
        output_type_registry[
            model_definition['output_features'][0][TYPE]
        ].default_validation_metric)

    # ===== Training Optimizer =====
    optimizer = model_definition[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in model_definition['input_features']:
        get_from_registry(
            input_feature[TYPE],
            input_type_registry).populate_defaults(input_feature)

    # ===== Output features =====
    for output_feature in model_definition['output_features']:
        get_from_registry(
            output_feature['type'],
            output_type_registry).populate_defaults(output_feature)

    return model_definition

def merge_with_defaults(config):
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_training(config)

    # ===== Preprocessing =====
    config['preprocessing'] = merge_dict(
        default_preprocessing_parameters, config.get('preprocessing', {}))

    stratify = config['preprocessing']['stratify']
    if stratify is not None:
        features = config['input_features'] + config['output_features']
        feature_names = set(f[COLUMN] for f in features)
        if stratify not in feature_names:
            logger.warning('Stratify is not among the features. '
                           'Cannot establish if it is a binary or category')
        elif ([f for f in features
               if f[COLUMN] == stratify][0][TYPE] not in {BINARY, CATEGORY}):
            raise ValueError('Stratify feature must be binary or category')

    # ===== Training =====
    set_default_value(config, TRAINING, default_training_params)
    for param, value in default_training_params.items():
        set_default_value(config[TRAINING], param, value)

    set_default_value(
        config[TRAINING],
        'validation_metric',
        output_type_registry[
            config['output_features'][0][TYPE]].default_validation_metric)

    # ===== Training Optimizer =====
    optimizer = config[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in config['input_features']:
        get_from_registry(
            input_feature[TYPE],
            input_type_registry).populate_defaults(input_feature)

    # ===== Combiner =====
    set_default_value(config, 'combiner', {TYPE: default_combiner_type})

    # ===== Output features =====
    for output_feature in config['output_features']:
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

    return config

def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]]

        # deal with encoders that have fixed preprocessing
        if 'encoder' in feature:
            encoders_registry = get_from_registry(
                feature[TYPE], input_type_registry).encoder_registry
            encoder_class = encoders_registry[feature['encoder']]
            if hasattr(encoder_class, 'fixed_preprocessing_parameters'):
                encoder_fpp = encoder_class.fixed_preprocessing_parameters

                if 'preprocessing' in feature:
                    all_feature_params = merge_dict(
                        feature, feature['preprocessing'])
                else:
                    all_feature_params = feature

                preprocessing_parameters = merge_dict(
                    preprocessing_parameters,
                    resolve_pointers(encoder_fpp, all_feature_params,
                                     'feature.'))

        handle_missing_values(dataset_df, feature, preprocessing_parameters)

        get_feature_meta = get_from_registry(
            feature[TYPE], base_type_registry).get_feature_meta
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters)

    return train_set_metadata

def _upgrade_preprocessing_defaults(config: Dict[str, Any]):
    """Move feature-specific preprocessing parameters into the defaults
    section of the config (in-place)."""
    type_specific_preprocessing_params = dict()

    # If a preprocessing section is specified and it contains feature-specific
    # preprocessing parameters, make a copy and delete them from the
    # preprocessing section
    for parameter in list(config.get(PREPROCESSING, {})):
        if parameter in base_type_registry:
            warnings.warn(
                f"Moving preprocessing configuration for `{parameter}` feature"
                " type from `preprocessing` section to `defaults` section in"
                " Ludwig config. This will be unsupported in v0.8.",
                DeprecationWarning,
            )
            type_specific_preprocessing_params[parameter] = config[
                PREPROCESSING].pop(parameter)

    # Delete the preprocessing section if no other preprocessing parameters
    # are left in it
    if PREPROCESSING in config and not config[PREPROCESSING]:
        del config[PREPROCESSING]

    if DEFAULTS not in config:
        config[DEFAULTS] = dict()

    # Update defaults with the feature-specific preprocessing parameters
    for feature_type, preprocessing_param in \
            type_specific_preprocessing_params.items():
        # If defaults was empty, create a new key for the feature type
        if feature_type not in config[DEFAULTS]:
            if PREPROCESSING in preprocessing_param:
                config[DEFAULTS][feature_type] = preprocessing_param
            else:
                config[DEFAULTS][feature_type] = {
                    PREPROCESSING: preprocessing_param
                }
        # Feature type exists but preprocessing hasn't been specified
        elif PREPROCESSING not in config[DEFAULTS][feature_type]:
            config[DEFAULTS][feature_type][
                PREPROCESSING] = preprocessing_param[PREPROCESSING]
        # Update default feature-specific preprocessing with parameters
        # from the config
        else:
            config[DEFAULTS][feature_type][PREPROCESSING].update(
                merge_dict(config[DEFAULTS][feature_type][PREPROCESSING],
                           preprocessing_param[PREPROCESSING]))

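# For illustration, the in-place transformation _upgrade_preprocessing_defaults
# performs on a hypothetical legacy config: feature-type keys under
# `preprocessing` are relocated to `defaults.<type>.preprocessing`, while
# genuinely global parameters stay where they are.
#
#     legacy_config = {
#         'preprocessing': {
#             'force_split': False,  # global param: stays put
#             'category': {'missing_value_strategy': 'fill_with_mode'},
#         },
#     }
#     _upgrade_preprocessing_defaults(legacy_config)
#     # legacy_config is now:
#     # {
#     #     'preprocessing': {'force_split': False},
#     #     'defaults': {
#     #         'category': {
#     #             'preprocessing': {
#     #                 'missing_value_strategy': 'fill_with_mode'}},
#     #     },
#     # }
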
def build_data(dataset_df, features, train_set_metadata,
               global_preprocessing_parameters):
    data_dict = {}
    for feature in features:
        add_feature_data = get_from_registry(
            feature[TYPE], base_type_registry).add_feature_data
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]]
        handle_missing_values(dataset_df, feature, preprocessing_parameters)
        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']]['preprocessing'] = preprocessing_parameters
        add_feature_data(feature, dataset_df, data_dict, train_set_metadata,
                         preprocessing_parameters)
    return data_dict

def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by
    the Ludwig core.

    :param model_path: Path of the model directory containing the
           hyperparameter file used to build the model definition
    :type model_path: Str
    :param split: Which split of the data to return (train, validation,
           test, or full)
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file, used if there is no csv data file
    :param train_set_metadata: Train set metadata for the input features
    :param evaluate_performance: If False, does not load output features
    :returns: Dataset, train set metadata
    """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict the in_memory flag '
                        'should be true. Overriding and setting it to true '
                        'for feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if the hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename as the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST
                dataset = test_set
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv,
            # so if we want to check whether the csv contained a split column
            # we have to check the csv, not the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata

def _model_select(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
    default_configs,
    features_metadata,
    user_config,
    use_reference_config: bool,
):
    """Performs model selection based on dataset or user-specified model.

    Note: the current implementation returns tabnet by default for tabular
    datasets.
    """
    dataset_info = get_dataset_info(dataset) if not isinstance(
        dataset, DatasetInfo) else dataset
    fields = dataset_info.fields

    base_config = default_configs["base_config"]
    model_category = None

    # tabular dataset heuristics
    if len(fields) > 3:
        model_category = TABULAR
        base_config = merge_dict(
            base_config,
            default_configs["combiner"][AUTOML_DEFAULT_TABULAR_MODEL])

        # override combiner heuristic if explicitly provided by user
        if user_config is not None:
            if "combiner" in user_config.keys():
                model_type = user_config["combiner"]["type"]
                base_config = merge_dict(
                    base_config, default_configs["combiner"][model_type])
    else:
        # text heuristics
        for input_feature in base_config["input_features"]:
            # default text encoder is bert
            if input_feature["type"] == TEXT:
                model_category = TEXT
                input_feature["encoder"] = AUTOML_DEFAULT_TEXT_ENCODER
                base_config = merge_dict(
                    base_config,
                    default_configs[TEXT][AUTOML_DEFAULT_TEXT_ENCODER])
                # set for small hyperparameter search space
                base_config[HYPEROPT]["executor"]["num_samples"] = 5

            # TODO (ASN): add image heuristics
            if input_feature["type"] == IMAGE:
                model_category = IMAGE
                input_feature["encoder"] = AUTOML_DEFAULT_IMAGE_ENCODER
                base_config = merge_dict(
                    base_config, default_configs["combiner"]["concat"])

    # override and constrain automl config based on user-specified values
    if user_config is not None:
        base_config = merge_dict(base_config, user_config)

        # remove from the hyperparameter search all parameters that the user
        # has provided explicit values for (the loop variable is named
        # hyperopt_param to avoid shadowing the dict it iterates over)
        hyperopt_params = copy.deepcopy(base_config["hyperopt"]["parameters"])
        for hyperopt_param in hyperopt_params.keys():
            config_section, param = (hyperopt_param.split(".")[0],
                                     hyperopt_param.split(".")[1])
            if config_section in user_config.keys():
                if param in user_config[config_section]:
                    del base_config["hyperopt"]["parameters"][hyperopt_param]

    # check if any binary or category output feature has highly imbalanced
    # minority vs majority values
    # note: the check is done after any relevant user_config has been applied
    has_imbalanced_output(base_config, features_metadata)

    # if there is a single output feature, set the relevant metric and goal
    # if not already set
    base_config = set_output_feature_metric(base_config)

    # add as initial trial in the automl search the hyperparameter settings
    # from the best model for a similar dataset and matching model type, if any
    if use_reference_config:
        ref_configs = _get_reference_configs()
        base_config = _add_transfer_config(base_config, ref_configs)

    return base_config, model_category, dataset_info.row_count

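# A small illustration (all names hypothetical) of the hyperopt-pruning step
# in _model_select: search parameters are keyed "<section>.<param>", and any
# key whose value the user pinned in user_config is dropped from the search
# space before the search runs.
base_hyperopt = {
    'trainer.learning_rate': {'space': 'loguniform',
                              'lower': 1e-5, 'upper': 1e-1},
    'combiner.size': {'space': 'randint', 'lower': 8, 'upper': 128},
}
user_cfg = {'trainer': {'learning_rate': 0.001}}
pruned = {
    key: space for key, space in base_hyperopt.items()
    if not (key.split('.')[0] in user_cfg
            and key.split('.')[1] in user_cfg[key.split('.')[0]])
}
assert list(pruned) == ['combiner.size']  # learning_rate was pinned by user
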
def merge_with_defaults(config: dict) -> dict:  # noqa: F821
    config = copy.deepcopy(config)
    upgrade_deprecated_fields(config)
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_trainer(config)

    # ===== Defaults =====
    if DEFAULTS not in config:
        config[DEFAULTS] = dict()

    # Update defaults with the default feature-specific preprocessing
    # parameters
    for feature_type, preprocessing_defaults in \
            default_feature_specific_preprocessing_parameters.items():
        # Create a new key with the feature type if defaults is empty
        if feature_type not in config[DEFAULTS]:
            if PREPROCESSING in preprocessing_defaults:
                config[DEFAULTS][feature_type] = preprocessing_defaults
            else:
                config[DEFAULTS][feature_type] = {
                    PREPROCESSING: preprocessing_defaults}
        # Feature type exists but preprocessing hasn't been specified
        elif PREPROCESSING not in config[DEFAULTS][feature_type]:
            config[DEFAULTS][feature_type][PREPROCESSING] = \
                preprocessing_defaults
        # Preprocessing parameters exist for the feature type; update
        # defaults with parameters from the config
        else:
            config[DEFAULTS][feature_type][PREPROCESSING].update(
                merge_dict(preprocessing_defaults,
                           config[DEFAULTS][feature_type][PREPROCESSING])
            )

    # ===== Preprocessing =====
    config[PREPROCESSING] = merge_dict(
        base_preprocessing_parameters, config.get(PREPROCESSING, {}))
    splitter = get_splitter(**config[PREPROCESSING].get(SPLIT, {}))
    splitter.validate(config)

    # ===== Model Type =====
    set_default_value(config, MODEL_TYPE, default_model_type)

    # ===== Training =====
    # Convert the config dictionary into an instance of BaseTrainerConfig
    full_trainer_config, _ = load_trainer_with_kwargs(
        config[MODEL_TYPE], config[TRAINER] if TRAINER in config else {})
    config[TRAINER] = asdict(full_trainer_config)

    set_default_value(
        config[TRAINER],
        "validation_metric",
        output_type_registry[
            config[OUTPUT_FEATURES][0][TYPE]].default_validation_metric,
    )

    # ===== Input Features =====
    for input_feature in config[INPUT_FEATURES]:
        if config[MODEL_TYPE] == MODEL_GBM:
            input_feature[ENCODER] = "passthrough"
            remove_ecd_params(input_feature)
        get_from_registry(
            input_feature[TYPE],
            input_type_registry).populate_defaults(input_feature)

        # Update encoder parameters for the input feature from global defaults
        update_feature_from_defaults(config, input_feature, INPUT_FEATURES)

    # ===== Combiner =====
    set_default_value(config, COMBINER, {TYPE: default_combiner_type})
    full_combiner_config, _ = load_config_with_kwargs(
        combiner_registry[config[COMBINER][TYPE]].get_schema_cls(),
        config[COMBINER]
    )
    config[COMBINER].update(asdict(full_combiner_config))

    # ===== Output features =====
    for output_feature in config[OUTPUT_FEATURES]:
        if config[MODEL_TYPE] == MODEL_GBM:
            output_feature[DECODER] = "passthrough"
            remove_ecd_params(output_feature)
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

        # By default, drop rows with missing output features
        set_default_value(output_feature, PREPROCESSING, {})
        set_default_value(
            output_feature[PREPROCESSING], "missing_value_strategy", DROP_ROW)

        # Update decoder and loss related parameters for the output feature
        # from global defaults
        update_feature_from_defaults(config, output_feature, OUTPUT_FEATURES)

    # ===== Hyperopt =====
    if HYPEROPT in config:
        set_default_value(config[HYPEROPT][EXECUTOR], TYPE, RAY)

    return config

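# For illustration (feature names hypothetical), this merge_with_defaults
# takes a minimal user config and returns a fully populated one: model type,
# trainer, combiner, preprocessing and per-feature defaults are all filled in,
# and output features default to dropping rows with missing values.
#
#     minimal_config = {
#         'input_features': [{'name': 'review', 'type': 'text'}],
#         'output_features': [{'name': 'label', 'type': 'category'}],
#     }
#     full_config = merge_with_defaults(minimal_config)
#     assert full_config['output_features'][0]['preprocessing'][
#         'missing_value_strategy'] == 'drop_row'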