Example #1
def get_preprocessing_params(model_definition):
    model_definition = merge_with_defaults(model_definition)

    global_preprocessing_parameters = model_definition['preprocessing']
    features = (
            model_definition['input_features'] +
            model_definition['output_features']
    )

    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters
    )

    merged_preprocessing_params = []
    for feature in features:
        if 'preprocessing' in feature:
            local_preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            local_preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]
        merged_preprocessing_params.append(
            (feature['name'], feature[TYPE], local_preprocessing_parameters)
        )

    return merged_preprocessing_params
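All of the examples on this page lean on the same recursive merge. Below is a minimal sketch of the semantics they assume (the second argument's values win, nested dicts are merged key by key); the real ludwig.utils.misc_utils.merge_dict may differ in detail, so treat this as an assumption rather than the library's implementation.

import copy

def merge_dict_sketch(dct, merge_dct):
    """Return a copy of dct with merge_dct recursively merged on top of it."""
    dct = copy.deepcopy(dct)
    for key, value in merge_dct.items():
        if key in dct and isinstance(dct[key], dict) and isinstance(value, dict):
            dct[key] = merge_dict_sketch(dct[key], value)
        else:
            dct[key] = value
    return dct

# Feature-level preprocessing overrides the global defaults for its type.
global_params = {'missing_value_strategy': 'fill_with_const', 'fill_value': 0}
local_params = merge_dict_sketch(global_params, {'missing_value_strategy': 'drop_row'})
assert local_params == {'missing_value_strategy': 'drop_row', 'fill_value': 0}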
Example #2
def build_data(
        dataset_df,
        features,
        train_set_metadata,
        global_preprocessing_parameters
):
    data_dict = {}
    for feature in features:
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]

        # deal with encoders that have fixed preprocessing
        if 'encoder' in feature:
            encoders_registry = get_from_registry(
                feature[TYPE],
                input_type_registry
            ).encoder_registry

            encoder_class = encoders_registry[feature['encoder']]
            if hasattr(encoder_class, 'fixed_preprocessing_parameters'):
                encoder_fpp = encoder_class.fixed_preprocessing_parameters

                preprocessing_parameters = merge_dict(
                    preprocessing_parameters,
                    resolve_pointers(encoder_fpp, feature, 'feature.')
                )
                
        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )

        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}

        train_set_metadata[
            feature['name']
        ]['preprocessing'] = preprocessing_parameters

        add_feature_data = get_from_registry(
            feature[TYPE],
            base_type_registry
        ).add_feature_data
        add_feature_data(
            feature,
            dataset_df,
            data_dict,
            train_set_metadata,
            preprocessing_parameters
        )

    return data_dict
Example #3
def _model_select(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
    default_configs,
    user_config,
):
    """Performs model selection based on dataset or user specified model.

    Note: Current implementation returns tabnet by default.
    """

    dataset_info = get_dataset_info(dataset) if not isinstance(
        dataset, DatasetInfo) else dataset
    fields = dataset_info.fields

    base_config = default_configs["base_config"]

    # tabular dataset heuristics
    if len(fields) > 3:
        base_config = merge_dict(base_config,
                                 default_configs["combiner"]["tabnet"])

        # override combiner heuristic if explicitly provided by user
        if user_config is not None:
            if "combiner" in user_config.keys():
                model_type = user_config["combiner"]["type"]
                base_config = merge_dict(
                    base_config, default_configs["combiner"][model_type])
    else:
        # text heuristics
        for input_feature in base_config["input_features"]:
            # default text encoder is bert
            # TODO (ASN): add more robust heuristics
            if input_feature["type"] == "text":
                input_feature["encoder"] = "bert"
                base_config = merge_dict(base_config,
                                         default_configs["text"]["bert"])

            # TODO (ASN): add image heuristics

    # override and constrain automl config based on user specified values
    if user_config is not None:
        base_config = merge_dict(base_config, user_config)

        # remove all parameters from the hyperparameter search that the user
        # has provided explicit values for
        hyperopt_params = copy.deepcopy(base_config["hyperopt"]["parameters"])
        for hyperopt_param in hyperopt_params.keys():
            config_section, param = hyperopt_param.split(
                ".")[0], hyperopt_param.split(".")[1]
            if config_section in user_config.keys():
                if param in user_config[config_section]:
                    del base_config["hyperopt"]["parameters"][hyperopt_param]

    return base_config
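The last block in this example prunes the hyperopt search space. A stand-alone illustration of that step, with made-up parameter names rather than any real config:

base_config = {
    "hyperopt": {"parameters": {
        "training.learning_rate": {"space": "loguniform"},
        "combiner.num_fc_layers": {"space": "randint"},
    }}
}
user_config = {"training": {"learning_rate": 0.001}}

# Any "section.param" the user pinned explicitly is removed from the search space.
for name in list(base_config["hyperopt"]["parameters"]):
    config_section, param = name.split(".", 1)
    if param in user_config.get(config_section, {}):
        del base_config["hyperopt"]["parameters"][name]

assert list(base_config["hyperopt"]["parameters"]) == ["combiner.num_fc_layers"]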
Example #4
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        get_feature_meta = get_from_registry(
            feature[TYPE],
            base_type_registry
        ).get_feature_meta
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]
        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters
        )
    return train_set_metadata
Example #5
def test_missing_outputs_drop_rows():
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        DEFAULTS: {
            CATEGORY: {
                PREPROCESSING: {
                    MISSING_VALUE_STRATEGY: FILL_WITH_MODE
                }
            }
        },
    }

    merged_config = merge_with_defaults(config)

    global_preprocessing = merged_config[DEFAULTS]
    input_feature_config = merged_config[INPUT_FEATURES][0]
    output_feature_config = merged_config[OUTPUT_FEATURES][0]

    assert output_feature_config[PREPROCESSING][
        MISSING_VALUE_STRATEGY] == DROP_ROW

    feature_preprocessing = merge_dict(
        global_preprocessing[output_feature_config[TYPE]][PREPROCESSING],
        output_feature_config[PREPROCESSING])
    assert feature_preprocessing[MISSING_VALUE_STRATEGY] == DROP_ROW

    feature_preprocessing = global_preprocessing[
        input_feature_config[TYPE]][PREPROCESSING]
    assert feature_preprocessing[MISSING_VALUE_STRATEGY] == FILL_WITH_MODE
Example #6
def build_dataset_df(dataset_df,
                     features,
                     global_preprocessing_parameters,
                     train_set_metadata=None,
                     random_seed=default_random_seed,
                     **kwargs):
    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters, global_preprocessing_parameters)

    if train_set_metadata is None:
        train_set_metadata = build_metadata(dataset_df, features,
                                            global_preprocessing_parameters)

    data_val = build_data(dataset_df, features, train_set_metadata,
                          global_preprocessing_parameters)

    data_val[SPLIT] = get_split(
        dataset_df,
        force_split=global_preprocessing_parameters['force_split'],
        split_probabilities=global_preprocessing_parameters[
            'split_probabilities'],
        stratify=global_preprocessing_parameters['stratify'],
        random_seed=random_seed)

    return data_val, train_set_metadata
Example #7
def create_metrics_report(experiment_name: str) -> Tuple[Dict[str, Any], str]:
    """Compiles performance and non-performance metrics.

    `experiment_name`: name referring to the experiment.
    Returns a full report and the path where it's saved.
    """
    full_report = dict()
    os.makedirs(os.path.join(os.getcwd(), experiment_name, "metrics_report"), exist_ok=True)
    for tag in [TRAIN_TAG, EVAL_TAG]:
        if tag == TRAIN_TAG:
            resource_usage_path = os.path.join(os.getcwd(), experiment_name, CACHE, "train_resource_usage_metrics.json")
            performance_path = os.path.join(os.getcwd(), experiment_name, EXPERIMENT_RUN, "training_statistics.json")
        elif tag == EVAL_TAG:
            resource_usage_path = os.path.join(
                os.getcwd(), experiment_name, CACHE, "evaluate_resource_usage_metrics.json"
            )
            performance_path = os.path.join(os.getcwd(), experiment_name, EXPERIMENT_RUN, "test_statistics.json")
        else:
            raise ValueError("Tag unrecognized. Please choose 'train' or 'evaluate'.")

        resource_usage_metrics = load_json(resource_usage_path)
        performance_metrics = load_json(performance_path)
        full_report[tag] = merge_dict(performance_metrics, resource_usage_metrics)

    merged_file_path = os.path.join(os.getcwd(), experiment_name, "metrics_report", "{}.json".format("full_report"))
    save_json(merged_file_path, full_report)
    return full_report, merged_file_path
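For each tag the report is simply the two metric dictionaries merged. A hedged illustration with invented values; the performance and resource-usage dicts are assumed not to share keys here, so the merge reduces to a plain top-level union:

performance_metrics = {"training": {"combined": {"loss": [0.41]}}}  # invented
resource_usage_metrics = {"total_execution_time": 182.4}            # invented
full_report = {"train": {**performance_metrics, **resource_usage_metrics}}
# full_report["train"] now holds both the model statistics and the resource usage numbers.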
Example #8
def merge_with_defaults(config):
    config = copy.deepcopy(config)
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_training(config)

    # ===== Preprocessing =====
    config["preprocessing"] = merge_dict(default_preprocessing_parameters,
                                         config.get("preprocessing", {}))

    stratify = config["preprocessing"]["stratify"]
    if stratify is not None:
        features = config["input_features"] + config["output_features"]
        feature_names = {f[COLUMN] for f in features}
        if stratify not in feature_names:
            logger.warning("Stratify is not among the features. "
                           "Cannot establish if it is a binary or category")
        elif [f for f in features
              if f[COLUMN] == stratify][0][TYPE] not in {BINARY, CATEGORY}:
            raise ValueError("Stratify feature must be binary or category")

    # ===== Training =====
    set_default_value(config, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(config[TRAINING], param, value)

    set_default_value(
        config[TRAINING],
        "validation_metric",
        output_type_registry[config["output_features"][0]
                             [TYPE]].default_validation_metric,
    )

    # ===== Training Optimizer =====
    optimizer = config[TRAINING]["optimizer"]
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in config["input_features"]:
        get_from_registry(input_feature[TYPE],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Combiner =====
    set_default_value(config, "combiner", {TYPE: default_combiner_type})

    # ===== Output features =====
    for output_feature in config["output_features"]:
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

    return config
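merge_with_defaults also relies on set_default_value throughout. A minimal sketch of the behavior these calls assume (fill a key only when it is missing, never overwrite a user-provided value); the real Ludwig helper may differ in detail:

def set_default_value_sketch(dictionary, key, value):
    # Only fill in the default when the user did not specify the key.
    if key not in dictionary:
        dictionary[key] = value

training = {"learning_rate": 0.01}
set_default_value_sketch(training, "learning_rate", 0.001)  # kept: user already set it
set_default_value_sketch(training, "epochs", 100)            # filled in: key was missing
assert training == {"learning_rate": 0.01, "epochs": 100}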
Example #9
def update_feature_from_defaults(config: Dict[str, Any], feature_dict: Dict[str, Any], config_feature_group: str):
    """Updates feature_dict belonging to an input or output feature using global encoder, decoder and loss related
    default parameters specified in the Ludwig config.

    :param config: Ludwig configuration containing parameters for different sections, including global default
        parameters for preprocessing, encoder, decoder and loss.
    :type config: dict[str, any]
    :param feature_dict: Underlying config for the specific input/output feature. This may be updated with values
        from the global defaults specified in config.
    :type feature_dict: dict[str, any]
    :param config_feature_group: Indicates whether the feature is an input or output feature (either
        `input_features` or `output_features`).
    :type config_feature_group: str
    """
    parameter = ENCODER if config_feature_group == INPUT_FEATURES else DECODER
    registry_type = input_type_registry if config_feature_group == INPUT_FEATURES else output_type_registry

    default_params_for_feature_type = get_defaults_section_for_feature_type(
        feature_dict[TYPE], config[DEFAULTS], parameter
    )

    # Update input feature encoder or output feature decoder if it is specified in global defaults
    # TODO(#2125): This code block needs some refactoring.
    if TYPE in default_params_for_feature_type:
        # Only update encoder or decoder if the feature isn't already using a default encoder or decoder
        default_encoder_or_decoder = get_default_encoder_or_decoder(feature_dict, config_feature_group)
        if default_params_for_feature_type[TYPE] != default_encoder_or_decoder:
            # Update type and populate defaults for the encoder or decoder type
            feature_dict[parameter] = default_params_for_feature_type[TYPE]
            get_from_registry(feature_dict[TYPE], registry_type).populate_defaults(feature_dict)
        # Make a copy of default encoder or decoder parameters without the type key.
        default_params_for_feature_type = copy.deepcopy(default_params_for_feature_type)
        default_params_for_feature_type.pop(TYPE, None)

    # Update encoder or decoder with other encoder/decoder related parameters
    feature_dict.update(merge_dict(feature_dict, default_params_for_feature_type))

    # Update loss parameters for output feature from global defaults
    if parameter == DECODER:
        default_loss_params_for_feature_type = get_defaults_section_for_feature_type(
            feature_dict[TYPE], config[DEFAULTS], LOSS
        )
        feature_dict[LOSS].update(merge_dict(feature_dict[LOSS], default_loss_params_for_feature_type))
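A hedged illustration of the final loss update above, with invented values; under merge semantics where the second argument wins, the global default parameters take precedence on any shared keys:

feature_loss = {"type": "softmax_cross_entropy", "weight": 1.0}  # invented feature config
default_loss = {"confidence_penalty": 0.1}                        # invented global default
merged_loss = {**feature_loss, **default_loss}  # defaults win on any shared keys
assert merged_loss == {"type": "softmax_cross_entropy", "weight": 1.0,
                       "confidence_penalty": 0.1}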
Example #10
    def overwrite_defaults(self, feature):
        attributes = set(self.__dict__.keys())
        attributes.update(self.__class__.__dict__.keys())

        for k in feature.keys():
            if k in attributes:
                if (isinstance(feature[k], dict) and hasattr(self, k)
                        and isinstance(getattr(self, k), dict)):
                    setattr(self, k, merge_dict(getattr(self, k), feature[k]))
                else:
                    setattr(self, k, feature[k])
Example #11
def merge_with_defaults(model_definition):
    _perform_sanity_checks(model_definition)

    # ===== Preprocessing =====
    model_definition['preprocessing'] = merge_dict(
        default_preprocessing_parameters,
        model_definition.get('preprocessing', {}))

    stratify = model_definition['preprocessing']['stratify']

    if stratify is not None:
        if stratify not in [
                x['name'] for x in model_definition['output_features']
        ]:
            raise ValueError('Stratify must be in output features')
        if ([
                x for x in model_definition['output_features']
                if x['name'] == stratify
        ][0][TYPE] not in [BINARY, CATEGORY]):
            raise ValueError('Stratify feature must be binary or category')
    # ===== Model =====
    set_default_value(model_definition, 'combiner',
                      {'type': default_combiner_type})

    # ===== Training =====
    set_default_value(model_definition, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(model_definition[TRAINING], param, value)

    set_default_value(
        model_definition[TRAINING], 'validation_metric',
        output_type_registry[model_definition['output_features'][0]
                             [TYPE]].default_validation_metric)

    # ===== Training Optimizer =====
    optimizer = model_definition[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in model_definition['input_features']:
        get_from_registry(input_feature[TYPE],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Output features =====
    for output_feature in model_definition['output_features']:
        get_from_registry(
            output_feature['type'],
            output_type_registry).populate_defaults(output_feature)

    return model_definition
Example #12
def merge_with_defaults(config):
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_training(config)

    # ===== Preprocessing =====
    config['preprocessing'] = merge_dict(default_preprocessing_parameters,
                                         config.get('preprocessing', {}))

    stratify = config['preprocessing']['stratify']
    if stratify is not None:
        features = (config['input_features'] + config['output_features'])
        feature_names = set(f[COLUMN] for f in features)
        if stratify not in feature_names:
            logger.warning('Stratify is not among the features. '
                           'Cannot establish if it is a binary or category feature')
        elif ([f for f in features if f[COLUMN] == stratify][0][TYPE]
              not in {BINARY, CATEGORY}):
            raise ValueError('Stratify feature must be binary or category')

    # ===== Training =====
    set_default_value(config, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(config[TRAINING], param, value)

    set_default_value(
        config[TRAINING], 'validation_metric', output_type_registry[
            config['output_features'][0][TYPE]].default_validation_metric)

    # ===== Training Optimizer =====
    optimizer = config[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in config['input_features']:
        get_from_registry(input_feature[TYPE],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Combiner =====
    set_default_value(config, 'combiner', {TYPE: default_combiner_type})

    # ===== Output features =====
    for output_feature in config['output_features']:
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

    return config
Example #13
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]]

        # deal with encoders that have fixed preprocessing
        if 'encoder' in feature:
            encoders_registry = get_from_registry(
                feature[TYPE], input_type_registry).encoder_registry
            encoder_class = encoders_registry[feature['encoder']]
            if hasattr(encoder_class, 'fixed_preprocessing_parameters'):
                encoder_fpp = encoder_class.fixed_preprocessing_parameters

                if 'preprocessing' in feature:
                    all_feature_params = merge_dict(feature,
                                                    feature['preprocessing'])
                else:
                    all_feature_params = feature

                preprocessing_parameters = merge_dict(
                    preprocessing_parameters,
                    resolve_pointers(encoder_fpp, all_feature_params,
                                     'feature.'))

        handle_missing_values(dataset_df, feature, preprocessing_parameters)

        get_feature_meta = get_from_registry(
            feature[TYPE], base_type_registry).get_feature_meta
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str), preprocessing_parameters)

    return train_set_metadata
Example #14
def _upgrade_preprocessing_defaults(config: Dict[str, Any]):
    """Move feature-specific preprocessing parameters into defaults in config (in-place)"""
    type_specific_preprocessing_params = dict()

    # If the preprocessing section is specified and contains feature-specific preprocessing
    # parameters, copy them out and delete them from the preprocessing section
    for parameter in list(config.get(PREPROCESSING, {})):
        if parameter in base_type_registry:
            warnings.warn(
                f"Moving preprocessing configuration for `{parameter}` feature type from `preprocessing` section"
                " to `defaults` section in Ludwig config. This will be unsupported in v0.8.",
                DeprecationWarning,
            )
            type_specific_preprocessing_params[parameter] = config[
                PREPROCESSING].pop(parameter)

    # Delete empty preprocessing section if no other preprocessing parameters specified
    if PREPROCESSING in config and not config[PREPROCESSING]:
        del config[PREPROCESSING]

    if DEFAULTS not in config:
        config[DEFAULTS] = dict()

    # Update defaults with the default feature specific preprocessing parameters
    for feature_type, preprocessing_param in type_specific_preprocessing_params.items():
        # If defaults was empty, then create a new key with feature type
        if feature_type not in config.get(DEFAULTS):
            if PREPROCESSING in preprocessing_param:
                config[DEFAULTS][feature_type] = preprocessing_param
            else:
                config[DEFAULTS][feature_type] = {
                    PREPROCESSING: preprocessing_param
                }
        # Feature type exists but preprocessing hasn't been specified
        elif PREPROCESSING not in config[DEFAULTS][feature_type]:
            config[DEFAULTS][feature_type][
                PREPROCESSING] = preprocessing_param[PREPROCESSING]
        # Update default feature specific preprocessing with parameters from config
        else:
            config[DEFAULTS][feature_type][PREPROCESSING].update(
                merge_dict(config[DEFAULTS][feature_type][PREPROCESSING],
                           preprocessing_param[PREPROCESSING]))
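A hedged before/after picture of the upgrade this function performs, using a made-up minimal config (feature-type keys such as category are assumed to be the ones registered in base_type_registry):

before = {
    "preprocessing": {
        "force_split": True,  # generic parameter: stays under `preprocessing`
        "category": {"missing_value_strategy": "fill_with_mode"},  # feature-type section: moves
    }
}
# After _upgrade_preprocessing_defaults(before), the expected shape would be:
after = {
    "preprocessing": {"force_split": True},
    "defaults": {"category": {"preprocessing": {"missing_value_strategy": "fill_with_mode"}}},
}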
Example #15
def build_data(dataset_df, features, train_set_metadata,
               global_preprocessing_parameters):
    data_dict = {}
    for feature in features:
        add_feature_data = get_from_registry(
            feature[TYPE], base_type_registry).add_feature_data
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]]
        handle_missing_values(dataset_df, feature, preprocessing_parameters)
        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']]['preprocessing'] = preprocessing_parameters
        add_feature_data(feature, dataset_df, data_dict, train_set_metadata,
                         preprocessing_parameters)
    return data_dict
Example #16
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict, the in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename as the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set

        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
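The hdf5 shortcut near the top of this example only swaps the file extension and checks the disk. A small sketch under the assumption that replace_file_extension behaves roughly like this (the helper name and path below are illustrative, not taken from the library):

import os

def replace_file_extension_sketch(file_path, extension):
    # Swap whatever extension the path has for the requested one.
    return os.path.splitext(file_path)[0] + '.' + extension

data_csv = 'dataset/train.csv'  # hypothetical path
data_hdf5_fp = replace_file_extension_sketch(data_csv, 'hdf5')  # 'dataset/train.hdf5'
use_cached_hdf5 = os.path.isfile(data_hdf5_fp)  # prefer the preprocessed file if present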
Example #17
def _model_select(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
    default_configs,
    features_metadata,
    user_config,
    use_reference_config: bool,
):
    """Performs model selection based on dataset or user specified model.

    Note: Current implementation returns tabnet by default for tabular datasets.
    """

    dataset_info = get_dataset_info(dataset) if not isinstance(dataset, DatasetInfo) else dataset
    fields = dataset_info.fields

    base_config = default_configs["base_config"]
    model_category = None

    # tabular dataset heuristics
    if len(fields) > 3:
        model_category = TABULAR
        base_config = merge_dict(base_config, default_configs["combiner"][AUTOML_DEFAULT_TABULAR_MODEL])

        # override combiner heuristic if explicitly provided by user
        if user_config is not None:
            if "combiner" in user_config.keys():
                model_type = user_config["combiner"]["type"]
                base_config = merge_dict(base_config, default_configs["combiner"][model_type])
    else:
        # text heuristics
        for input_feature in base_config["input_features"]:
            # default text encoder is bert
            if input_feature["type"] == TEXT:
                model_category = TEXT
                input_feature["encoder"] = AUTOML_DEFAULT_TEXT_ENCODER
                base_config = merge_dict(base_config, default_configs[TEXT][AUTOML_DEFAULT_TEXT_ENCODER])
                base_config[HYPEROPT]["executor"]["num_samples"] = 5  # set for small hyperparameter search space

            # TODO (ASN): add image heuristics
            if input_feature["type"] == IMAGE:
                model_category = IMAGE
                input_feature["encoder"] = AUTOML_DEFAULT_IMAGE_ENCODER
                base_config = merge_dict(base_config, default_configs["combiner"]["concat"])

    # override and constrain automl config based on user specified values
    if user_config is not None:
        base_config = merge_dict(base_config, user_config)

        # remove all parameters from the hyperparameter search that the user
        # has provided explicit values for
        hyperopt_params = copy.deepcopy(base_config["hyperopt"]["parameters"])
        for hyperopt_param in hyperopt_params.keys():
            config_section, param = hyperopt_param.split(".")[0], hyperopt_param.split(".")[1]
            if config_section in user_config.keys():
                if param in user_config[config_section]:
                    del base_config["hyperopt"]["parameters"][hyperopt_param]

    # check if any binary or category output feature has highly imbalanced minority vs majority values
    # note: check is done after any relevant user_config has been applied
    has_imbalanced_output(base_config, features_metadata)

    # if single output feature, set relevant metric and goal if not already set
    base_config = set_output_feature_metric(base_config)

    # add as initial trial in the automl search the hyperparameter settings from
    # the best model for a similar dataset and matching model type, if any.
    if use_reference_config:
        ref_configs = _get_reference_configs()
        base_config = _add_transfer_config(base_config, ref_configs)

    return base_config, model_category, dataset_info.row_count
Example #18
def merge_with_defaults(config: dict) -> dict:  # noqa: F821
    config = copy.deepcopy(config)
    upgrade_deprecated_fields(config)
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_trainer(config)

    # ===== Defaults =====
    if DEFAULTS not in config:
        config[DEFAULTS] = dict()

    # Update defaults with the default feature specific preprocessing parameters
    for feature_type, preprocessing_defaults in default_feature_specific_preprocessing_parameters.items():
        # Create a new key with feature type if defaults is empty
        if feature_type not in config.get(DEFAULTS):
            if PREPROCESSING in preprocessing_defaults:
                config[DEFAULTS][feature_type] = preprocessing_defaults
            else:
                config[DEFAULTS][feature_type] = {PREPROCESSING: preprocessing_defaults}
        # Feature type exists but preprocessing hasn't been specified
        elif PREPROCESSING not in config[DEFAULTS][feature_type]:
            config[DEFAULTS][feature_type][PREPROCESSING] = preprocessing_defaults
        # Preprocessing parameters exist for feature type, update defaults with parameters from config
        else:
            config[DEFAULTS][feature_type][PREPROCESSING].update(
                merge_dict(preprocessing_defaults, config[DEFAULTS][feature_type][PREPROCESSING])
            )

    # ===== Preprocessing =====
    config[PREPROCESSING] = merge_dict(base_preprocessing_parameters, config.get(PREPROCESSING, {}))
    splitter = get_splitter(**config[PREPROCESSING].get(SPLIT, {}))
    splitter.validate(config)

    # ===== Model Type =====
    set_default_value(config, MODEL_TYPE, default_model_type)

    # ===== Training =====
    # Convert config dictionary into an instance of BaseTrainerConfig.
    full_trainer_config, _ = load_trainer_with_kwargs(config[MODEL_TYPE], config[TRAINER] if TRAINER in config else {})
    config[TRAINER] = asdict(full_trainer_config)

    set_default_value(
        config[TRAINER],
        "validation_metric",
        output_type_registry[config[OUTPUT_FEATURES][0][TYPE]].default_validation_metric,
    )

    # ===== Input Features =====
    for input_feature in config[INPUT_FEATURES]:
        if config[MODEL_TYPE] == MODEL_GBM:
            input_feature[ENCODER] = "passthrough"
            remove_ecd_params(input_feature)
        get_from_registry(input_feature[TYPE], input_type_registry).populate_defaults(input_feature)

        # Update encoder parameters for output feature from global defaults
        update_feature_from_defaults(config, input_feature, INPUT_FEATURES)

    # ===== Combiner =====
    set_default_value(config, COMBINER, {TYPE: default_combiner_type})
    full_combiner_config, _ = load_config_with_kwargs(
        combiner_registry[config[COMBINER][TYPE]].get_schema_cls(), config[COMBINER]
    )
    config[COMBINER].update(asdict(full_combiner_config))

    # ===== Output features =====
    for output_feature in config[OUTPUT_FEATURES]:
        if config[MODEL_TYPE] == MODEL_GBM:
            output_feature[DECODER] = "passthrough"
            remove_ecd_params(output_feature)
        get_from_registry(output_feature[TYPE], output_type_registry).populate_defaults(output_feature)

        # By default, drop rows with missing output features
        set_default_value(output_feature, PREPROCESSING, {})
        set_default_value(output_feature[PREPROCESSING], "missing_value_strategy", DROP_ROW)

        # Update decoder and loss related parameters for output feature from global defaults
        update_feature_from_defaults(config, output_feature, OUTPUT_FEATURES)

    # ===== Hyperopt =====
    if HYPEROPT in config:
        set_default_value(config[HYPEROPT][EXECUTOR], TYPE, RAY)

    return config