Example #1
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]

        # deal with encoders that have fixed preprocessing
        if 'encoder' in feature:
            encoders_registry = get_from_registry(
                feature[TYPE],
                input_type_registry
            ).encoder_registry
            encoder_class = encoders_registry[feature['encoder']]
            if hasattr(encoder_class, 'fixed_preprocessing_parameters'):
                encoder_fpp = encoder_class.fixed_preprocessing_parameters

                preprocessing_parameters = merge_dict(
                    preprocessing_parameters,
                    resolve_pointers(encoder_fpp, feature, 'feature.')
                )

        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )

        get_feature_meta = get_from_registry(
            feature[TYPE],
            base_type_registry
        ).get_feature_meta
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters
        )

    return train_set_metadata
Example #2
def update_feature_from_defaults(config: Dict[str, Any], feature_dict: Dict[str, Any], config_feature_group: str):
    """Updates feature_dict belonging to an input or output feature using global encoder, decoder and loss related
    default parameters specified in the Ludwig config.

    :param config: Ludwig configuration containing parameters for different sections, including global default
        parameters for preprocessing, encoder, decoder and loss.
    :type config: dict[str, any]
    :param feature_dict: Underlying config for the specific input/output feature. This may be updated with values
        from the global defaults specified in config.
    :type feature_dict: dict[str, any]
    :param config_feature_group: Indicates whether the feature is an input feature or output feature (can be either of
        `input_features` or `output_features`).
    :type config_feature_group: str
    """
    parameter = ENCODER if config_feature_group == INPUT_FEATURES else DECODER
    registry_type = input_type_registry if config_feature_group == INPUT_FEATURES else output_type_registry

    default_params_for_feature_type = get_defaults_section_for_feature_type(
        feature_dict[TYPE], config[DEFAULTS], parameter
    )

    # Update input feature encoder or output feature decoder if it is specified in global defaults
    # TODO(#2125): This code block needs some refactoring.
    if TYPE in default_params_for_feature_type:
        # Only update encoder or decoder if the feature isn't already using a default encoder or decoder
        default_encoder_or_decoder = get_default_encoder_or_decoder(feature_dict, config_feature_group)
        if default_params_for_feature_type[TYPE] != default_encoder_or_decoder:
            # Update type and populate defaults for the encoder or decoder type
            feature_dict[parameter] = default_params_for_feature_type[TYPE]
            get_from_registry(feature_dict[TYPE], registry_type).populate_defaults(feature_dict)
        # Make a copy of default encoder or decoder parameters without the type key.
        default_params_for_feature_type = copy.deepcopy(default_params_for_feature_type)
        default_params_for_feature_type.pop(TYPE, None)

    # Update encoder or decoder with other encoder/decoder related parameters
    feature_dict.update(merge_dict(feature_dict, default_params_for_feature_type))

    # Update loss parameters for output feature from global defaults
    if parameter == DECODER:
        default_loss_params_for_feature_type = get_defaults_section_for_feature_type(
            feature_dict[TYPE], config[DEFAULTS], LOSS
        )
        feature_dict[LOSS].update(merge_dict(feature_dict[LOSS], default_loss_params_for_feature_type))
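
A minimal usage sketch for the function above (not taken from this page; the config values, feature names, and encoder parameters are hypothetical, and it assumes the Ludwig constants INPUT_FEATURES and DEFAULTS resolve to the "input_features" and "defaults" keys):

# Hypothetical sketch: push a global text-encoder default into each input feature.
config = {
    "input_features": [{"name": "title", "type": "text"}],
    "output_features": [{"name": "label", "type": "category"}],
    "defaults": {"text": {"encoder": {"type": "parallel_cnn", "dropout": 0.1}}},
}

for feature in config["input_features"]:
    # update_feature_from_defaults mutates the feature dict in place
    update_feature_from_defaults(config, feature, "input_features")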
Example #3
def calculate_overall_stats(test_stats, output_features, dataset,
                            train_set_metadata):
    for output_feature in output_features:
        feature = get_from_registry(
            output_feature[TYPE],
            output_type_registry
        )
        feature.calculate_overall_stats(
            test_stats, output_feature, dataset, train_set_metadata
        )
Example #4
def convert_predictions(predictions,
                        output_features,
                        training_set_metadata,
                        return_type='dict'):
    convert_fn = get_from_registry(return_type, conversion_registry)
    return convert_fn(
        predictions,
        output_features,
        training_set_metadata,
    )
Example #5
def get_sequence_vector(sequence, tokenizer_type, unit_to_id, lowercase=True):
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)()

    format_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(sequence,
                                tokenizer,
                                tokenizer_type,
                                format_dtype,
                                unit_to_id,
                                lowercase=lowercase)
Example #6
    def __init__(self, reduce_mode=None):
        super().__init__()
        # save as private variable for debugging
        self._reduce_mode = reduce_mode

        # use registry to find required reduction function
        self._reduce_obj = get_from_registry(
            reduce_mode,
            reduce_mode_registry
        )()
Example #7
def build_feature_parameters(features):
    feature_parameters = {}
    for feature in features:
        feature_builder_function = get_from_registry(
            feature[TYPE],
            parameters_builders_registry
        )

        feature_parameters[feature[NAME]] = feature_builder_function(feature)
    return feature_parameters
Example #8
def get_initializer(parameters):
    if parameters is None:
        return lambda *args, **kwargs: _create_and_init(
            initializer_registry[parameters], {}, *args, **kwargs)
    elif isinstance(parameters, str):
        initializer_fun = get_from_registry(parameters, initializer_registry)
        return lambda *args, **kwargs: _create_and_init(
            initializer_fun, {}, *args, **kwargs)
    elif isinstance(parameters, dict):
        initializer_fun = get_from_registry(parameters[TYPE],
                                            initializer_registry)
        init_kwargs = parameters.copy()
        del init_kwargs[TYPE]
        return lambda *args, **kwargs: _create_and_init(
            initializer_fun, init_kwargs, *args, **kwargs)
    else:
        raise ValueError(
            f"Initializer parameters should be either strings or dictionaries, "
            f"but the provided parameters are of type {type(parameters)}. "
            f"Parameter values: {parameters}")
Example #9
    def get_feature_meta(column, preprocessing_parameters):
        tokenizer = get_from_registry(preprocessing_parameters['tokenizer'],
                                      tokenizer_registry)()
        max_length = 0
        for timeseries in column:
            processed_line = tokenizer(timeseries)
            max_length = max(max_length, len(processed_line))
        max_length = min(preprocessing_parameters['timeseries_length_limit'],
                         max_length)

        return {'max_timeseries_length': max_length}
Example #10
def build_single_output(output_feature_def, feature_hidden,
                        other_output_features, **kwargs):
    logger.debug(
        f"Output {output_feature_def[TYPE]} feature {output_feature_def[NAME]}"
    )

    output_feature_class = get_from_registry(output_feature_def[TYPE],
                                             output_type_registry)
    output_feature_obj = output_feature_class(output_feature_def)

    return output_feature_obj
Example #11
def ClippedOptimizer(type='sgd',
                     clipglobalnorm=5.0,
                     clipnorm=None,
                     clipvalue=None,
                     horovod=None,
                     **kwargs):
    optimizer = get_from_registry(type.lower(), optimizers_registry)(**kwargs)
    return clip_optimizer(optimizer,
                          clipglobalnorm,
                          clipnorm,
                          clipvalue,
                          horovod=horovod)
Example #12
    def populate_defaults(input_feature):
        set_default_values(input_feature, {
            TIED: None,
            'encoder': 'parallel_cnn',
            'level': 'word'
        })

        encoder_class = get_from_registry(input_feature['encoder'],
                                          TextInputFeature.encoder_registry)

        if hasattr(encoder_class, 'default_params'):
            set_default_values(input_feature, encoder_class.default_params)
Example #13
    def get_feature_meta(column, preprocessing_parameters, backend):
        column = column.astype(str)
        tokenizer = get_from_registry(preprocessing_parameters["tokenizer"],
                                      tokenizer_registry)()
        max_length = 0
        for timeseries in column:
            processed_line = tokenizer(timeseries)
            max_length = max(max_length, len(processed_line))
        max_length = min(preprocessing_parameters["timeseries_length_limit"],
                         max_length)

        return {"max_timeseries_length": max_length}
Example #14
def test_numeric_transformer(transformer_key, tmpdir):
    Transformer = get_from_registry(transformer_key, numeric_transformation_registry)
    transformer_name = Transformer().__class__.__name__
    if transformer_name == "Log1pTransformer":
        raw_values = np.random.lognormal(5, 2, size=100)
    else:
        raw_values = np.random.normal(5, 2, size=100)

    backend = LOCAL_BACKEND
    parameters = Transformer.fit_transform_params(raw_values, backend)
    if transformer_name in {"Log1pTransformer", "IdentityTransformer"}:
        # should be empty
        assert not bool(parameters)
    else:
        # should not be empty
        assert bool(parameters)

    # instantiate numeric transformer
    numeric_transformer = Transformer(**parameters)

    # transform values
    transformed_values = numeric_transformer.transform(raw_values)

    # inverse transform the prior transformed values
    reconstructed_values = numeric_transformer.inverse_transform(transformed_values)

    # should now match
    assert np.allclose(raw_values, reconstructed_values)

    # now test numeric transformer with output feature
    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=["x", "y"])
    config = {
        "input_features": [{"name": "x", "type": "number"}],
        "output_features": [{"name": "y", "type": "number", "preprocessing": {"normalization": transformer_key}}],
        "combiner": {
            "type": "concat",
        },
        TRAINER: {
            "epochs": 2,
            "batch_size": 16,
        },
    }

    args = {
        "config": config,
        "skip_save_processed_input": True,
        "output_directory": os.path.join(tmpdir, "results"),
        "logging_level": logging.WARN,
    }

    # ensure no exceptions are raised
    experiment_cli(dataset=df, **args)
Example #15
def update_config_with_metadata(config, training_set_metadata):
    # populate input features fields depending on data
    # config = merge_with_defaults(config)
    for input_feature in config['input_features']:
        feature = get_from_registry(input_feature[TYPE], input_type_registry)
        feature.populate_defaults(input_feature)
        feature.update_config_with_metadata(
            input_feature,
            training_set_metadata[input_feature[NAME]],
            config=config)

    # populate output features fields depending on data
    for output_feature in config['output_features']:
        feature = get_from_registry(output_feature[TYPE], output_type_registry)
        feature.populate_defaults(output_feature)
        feature.update_config_with_metadata(
            output_feature, training_set_metadata[output_feature[NAME]])

    for feature in (config['input_features'] + config['output_features']):
        if 'preprocessing' in feature:
            feature['preprocessing'] = training_set_metadata[
                feature[NAME]]['preprocessing']
Example #16
def reduce_sequence_list(sequence_list, mode):
    reduce_mode = get_from_registry(mode, reduce_mode_registry)
    reduced_list = []
    for sequence in sequence_list:
        reduced_list.append(reduce_mode(sequence))
    if len(reduced_list) > 1:
        if reduce_mode == dont_reduce:
            reduced_output = tf.concat(reduced_list, 2)
        else:
            reduced_output = tf.concat(reduced_list, 1)
    else:
        reduced_output = reduced_list[0]
    return reduced_output
Example #17
def update_config_with_metadata(config, training_set_metadata):
    # populate input features fields depending on data
    # config = merge_with_defaults(config)
    for input_feature in config[INPUT_FEATURES]:
        feature = get_from_registry(input_feature[TYPE], input_type_registry)
        feature.populate_defaults(input_feature)
        feature.update_config_with_metadata(
            input_feature,
            training_set_metadata[input_feature[NAME]],
            config=config)

    # populate output features fields depending on data
    for output_feature in config[OUTPUT_FEATURES]:
        feature = get_from_registry(output_feature[TYPE], output_type_registry)
        feature.populate_defaults(output_feature)
        feature.update_config_with_metadata(
            output_feature, training_set_metadata[output_feature[NAME]])

    for feature in config[INPUT_FEATURES] + config[OUTPUT_FEATURES]:
        if PREPROCESSING in feature:
            feature[PREPROCESSING] = training_set_metadata[
                feature[NAME]][PREPROCESSING]
Example #18
def create_optimizer_with_clipper(model,
                                  type="sgd",
                                  clipglobalnorm=5.0,
                                  clipnorm=None,
                                  clipvalue=None,
                                  horovod=None,
                                  **kwargs):
    optimizer_cls = get_from_registry(type.lower(), optimizers_registry)
    optimizer = create_optimizer(optimizer_cls, model, horovod, **kwargs)
    clipper = Clipper(clipglobalnorm=clipglobalnorm,
                      clipnorm=clipnorm,
                      clipvalue=clipvalue)
    return optimizer, clipper
Example #19
def generate_datapoint(features):
    datapoint = []
    for feature in features:
        if ('cycle' in feature and feature['cycle'] is True
                and feature[TYPE] in cyclers_registry):
            cycler_function = cyclers_registry[feature[TYPE]]
            feature_value = cycler_function(feature)
        else:
            generator_function = get_from_registry(feature[TYPE],
                                                   generators_registry)
            feature_value = generator_function(feature)
        datapoint.append(feature_value)
    return datapoint
Example #20
def build_sequence_matrix(
        sequences,
        inverse_vocabulary,
        tokenizer_type,
        length_limit,
        padding_symbol,
        padding='right',
        unknown_symbol=UNKNOWN_SYMBOL,
        lowercase=True,
        tokenizer_vocab_file=None,
        pretrained_model_name_or_path=None,
        processor=PANDAS,
):
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)(
        vocab_file=tokenizer_vocab_file,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
    )

    format_dtype = int_type(len(inverse_vocabulary) - 1)

    unit_vectors = sequences.map(lambda sequence: _get_sequence_vector(
        sequence,
        tokenizer,
        tokenizer_type,
        format_dtype,
        inverse_vocabulary,
        lowercase=lowercase,
        unknown_symbol=unknown_symbol
    ))

    max_length = processor.compute(unit_vectors.map(len).max())
    if max_length < length_limit:
        logging.debug('max length: {0} < limit: {1}'.format(
            max_length, length_limit
        ))
    max_length = length_limit

    def pad(vector):
        sequence = np.full((max_length,),
                           inverse_vocabulary[padding_symbol],
                           dtype=format_dtype)
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence[:limit] = vector[:limit]
        else:  # padding == 'left'
            sequence[max_length - limit:] = vector[:limit]
        return sequence

    padded = processor.map_objects(unit_vectors, pad)
    return padded
Example #21
    def build_single_output(
            output_feature_def: Dict[str, Any],
            output_features: Dict[str, OutputFeature]) -> OutputFeature:
        """Builds a single output feature from the output feature definition."""
        logging.debug(
            f"Output {output_feature_def[TYPE]} feature {output_feature_def[NAME]}"
        )

        output_feature_class = get_from_registry(output_feature_def[TYPE],
                                                 output_type_registry)
        output_feature_obj = output_feature_class(output_feature_def,
                                                  output_features)

        return output_feature_obj
Example #22
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend):
        proc_df[feature[PROC_COLUMN]] = input_df[feature[COLUMN]].astype(
            np.float32).values

        # normalize data as required
        numeric_transformer = get_from_registry(
            preprocessing_parameters.get('normalization', None),
            numeric_transformation_registry)(**metadata[feature[NAME]])

        proc_df[feature[PROC_COLUMN]] = \
            numeric_transformer.transform(proc_df[feature[PROC_COLUMN]])

        return proc_df
Example #23
    def __init__(self, metadata: Dict[str, Any]):
        super().__init__()
        if metadata["preprocessing"]["tokenizer"] not in TORCHSCRIPT_COMPATIBLE_TOKENIZERS:
            raise ValueError(
                f"{metadata['preprocessing']['tokenizer']} is not supported by torchscript. Please use "
                f"one of {TORCHSCRIPT_COMPATIBLE_TOKENIZERS}.")
        self.tokenizer = get_from_registry(
            metadata["preprocessing"]["tokenizer"], tokenizer_registry)()
        self.padding = metadata["preprocessing"]["padding"]
        self.padding_value = float(metadata["preprocessing"]["padding_value"])
        self.max_timeseries_length = int(metadata["max_timeseries_length"])
        self.computed_fill_value = metadata["preprocessing"]["computed_fill_value"]
Example #24
    def __init__(self, metadata: Dict[str, Any], is_bag: bool = False):
        super().__init__()
        if metadata["preprocessing"]["tokenizer"] not in TORCHSCRIPT_COMPATIBLE_TOKENIZERS:
            raise ValueError(
                f"{metadata['preprocessing']['tokenizer']} is not supported by torchscript. Please use "
                f"one of {TORCHSCRIPT_COMPATIBLE_TOKENIZERS}."
            )

        self.lowercase = metadata["preprocessing"]["lowercase"]
        self.tokenizer = get_from_registry(metadata["preprocessing"]["tokenizer"], tokenizer_registry)()
        self.vocab_size = metadata["vocab_size"]
        self.unknown_symbol = UNKNOWN_SYMBOL
        self.unit_to_id = metadata["str2idx"]
        self.is_bag = is_bag
Example #25
def create_vocabulary(data,
                      tokenizer_type='space',
                      add_unknown=True,
                      add_padding=True,
                      lowercase=True,
                      num_most_frequent=None,
                      vocab_file=None,
                      unknown_symbol=UNKNOWN_SYMBOL,
                      padding_symbol=PADDING_SYMBOL):
    vocab = None
    max_line_length = 0
    unit_counts = Counter()

    if tokenizer_type == 'bert':
        vocab = load_vocabulary(vocab_file)
        add_unknown = False
        add_padding = False
    elif vocab_file is not None:
        vocab = load_vocabulary(vocab_file)

    tokenizer = get_from_registry(tokenizer_type,
                                  tokenizer_registry)(vocab_file=vocab_file)
    for line in data:
        processed_line = tokenizer(line.lower() if lowercase else line)
        unit_counts.update(processed_line)
        max_line_length = max(max_line_length, len(processed_line))

    if vocab is None:
        vocab = [
            unit for unit, count in unit_counts.most_common(num_most_frequent)
        ]

    vocab_set = set(vocab)
    if add_unknown:
        if unknown_symbol in vocab_set:
            vocab.remove(unknown_symbol)
        vocab = [unknown_symbol] + vocab
    if add_padding:
        if padding_symbol in vocab_set:
            vocab.remove(padding_symbol)
        vocab = [padding_symbol] + vocab

    str2idx = {unit: i for i, unit in enumerate(vocab)}
    str2freq = {
        unit: unit_counts.get(unit) if unit in unit_counts else 0
        for unit in vocab
    }

    return vocab, str2idx, str2freq, max_line_length
Example #26
def load_dataset(dataset, data_format=None, df_lib=PANDAS_DF):
    if not data_format or data_format == "auto":
        data_format = figure_data_format(dataset)

    # use appropriate reader to create dataframe
    if data_format in DATAFRAME_FORMATS:
        return dataset
    elif data_format in DICT_FORMATS:
        return pd.DataFrame(dataset)
    elif data_format in CACHEABLE_FORMATS:
        data_reader = get_from_registry(data_format,
                                        external_data_reader_registry)
        return data_reader(dataset, df_lib)
    else:
        raise ValueError(f"{data_format} format is not supported")
Example #27
def build_single_output(output_feature_def, feature_hidden,
                        other_output_features, **kwargs):
    logger.debug('Output {} feature {}'.format(output_feature_def[TYPE],
                                               output_feature_def['name']))

    output_feature_class = get_from_registry(output_feature_def[TYPE],
                                             output_type_registry)
    output_feature_obj = output_feature_class(output_feature_def)
    # weighted_train_mean_loss, weighted_eval_loss, output_tensors = output_feature_obj.concat_dependencies_and_build_output(
    #    feature_hidden,
    #    other_output_features,
    #    **kwargs
    # )

    return output_feature_obj
Example #28
def update_model_definition_with_metadata(model_definition,
                                          training_set_metadata):
    # populate input features fields depending on data
    # model_definition = merge_with_defaults(model_definition)
    for input_feature in model_definition['input_features']:
        feature = get_from_registry(input_feature[TYPE], input_type_registry)
        feature.populate_defaults(input_feature)
        feature.update_model_definition_with_metadata(
            input_feature,
            training_set_metadata[input_feature[NAME]],
            model_definition=model_definition)

    # populate output features fields depending on data
    for output_feature in model_definition['output_features']:
        feature = get_from_registry(output_feature[TYPE], output_type_registry)
        feature.populate_defaults(output_feature)
        feature.update_model_definition_with_metadata(
            output_feature, training_set_metadata[output_feature[NAME]])

    for feature in (model_definition['input_features'] +
                    model_definition['output_features']):
        if 'preprocessing' in feature:
            feature['preprocessing'] = training_set_metadata[
                feature[NAME]]['preprocessing']
Example #29
def build_single_input(
    input_feature_def: Dict[str, Any], other_input_features: Dict[str, InputFeature], **kwargs
) -> InputFeature:
    logger.debug(f"Input {input_feature_def[TYPE]} feature {input_feature_def[NAME]}")

    encoder_obj = None
    if input_feature_def.get(constants.TIED, None) is not None:
        tied_input_feature_name = input_feature_def[constants.TIED]
        if tied_input_feature_name in other_input_features:
            encoder_obj = other_input_features[tied_input_feature_name].encoder_obj

    input_feature_class = get_from_registry(input_feature_def[TYPE], input_type_registry)
    input_feature_obj = input_feature_class(input_feature_def, encoder_obj)

    return input_feature_obj
Example #30
def postprocess_results(
    result,
    output_feature,
    metadata,
    experiment_dir_name='',
    skip_save_unprocessed_output=False,
):
    feature = get_from_registry(output_feature[TYPE], output_type_registry)
    return feature.postprocess_results(
        output_feature,
        result,
        metadata,
        experiment_dir_name,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
    )