def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature[TYPE]],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature[TYPE]
            ]

        # deal with encoders that have fixed preprocessing
        if 'encoder' in feature:
            encoders_registry = get_from_registry(
                feature[TYPE],
                input_type_registry
            ).encoder_registry
            encoder_class = encoders_registry[feature['encoder']]
            if hasattr(encoder_class, 'fixed_preprocessing_parameters'):
                encoder_fpp = encoder_class.fixed_preprocessing_parameters
                preprocessing_parameters = merge_dict(
                    preprocessing_parameters,
                    resolve_pointers(encoder_fpp, feature, 'feature.')
                )

        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )

        get_feature_meta = get_from_registry(
            feature[TYPE],
            base_type_registry
        ).get_feature_meta
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters
        )
    return train_set_metadata
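# Illustrative, self-contained sketch (not Ludwig's actual helper): the snippets in
# this collection all rely on a registry-dispatch idiom in which a feature's `type`
# string selects a handler class or function. A minimal stand-in for
# `get_from_registry` looks like this; the real helper also handles defaults and
# raises a more descriptive error.
def get_from_registry_sketch(key, registry):
    # a registry is just a dict mapping names to callables or classes
    if key in registry:
        return registry[key]
    raise ValueError(f"'{key}' is not in the registry; valid keys: {list(registry)}")


# hypothetical usage with a toy registry
_toy_registry = {"upper": str.upper, "lower": str.lower}
assert get_from_registry_sketch("upper", _toy_registry)("abc") == "ABC"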
def update_feature_from_defaults(
    config: Dict[str, Any], feature_dict: Dict[str, Any], config_feature_group: str
):
    """Updates feature_dict belonging to an input or output feature using global encoder, decoder and loss related
    default parameters specified in the Ludwig config.

    :param config: Ludwig configuration containing parameters for different sections, including global default
        parameters for preprocessing, encoder, decoder and loss.
    :type config: dict[str, any]
    :param feature_dict: Underlying config for the specific input/output feature. This may be updated with values
        from the global defaults specified in config.
    :type feature_dict: dict[str, any]
    :param config_feature_group: Indicates whether the feature is an input feature or output feature (can be either
        of `input_features` or `output_features`).
    :type config_feature_group: str
    """
    parameter = ENCODER if config_feature_group == INPUT_FEATURES else DECODER
    registry_type = input_type_registry if config_feature_group == INPUT_FEATURES else output_type_registry

    default_params_for_feature_type = get_defaults_section_for_feature_type(
        feature_dict[TYPE], config[DEFAULTS], parameter
    )

    # Update input feature encoder or output feature decoder if it is specified in global defaults
    # TODO(#2125): This code block needs some refactoring.
    if TYPE in default_params_for_feature_type:
        # Only update encoder or decoder if the feature isn't already using a default encoder or decoder
        default_encoder_or_decoder = get_default_encoder_or_decoder(feature_dict, config_feature_group)
        if default_params_for_feature_type[TYPE] != default_encoder_or_decoder:
            # Update type and populate defaults for the encoder or decoder type
            feature_dict[parameter] = default_params_for_feature_type[TYPE]
            get_from_registry(feature_dict[TYPE], registry_type).populate_defaults(feature_dict)

        # Make a copy of default encoder or decoder parameters without the type key.
        default_params_for_feature_type = copy.deepcopy(default_params_for_feature_type)
        default_params_for_feature_type.pop(TYPE, None)

    # Update encoder or decoder with other encoder/decoder related parameters
    feature_dict.update(merge_dict(feature_dict, default_params_for_feature_type))

    # Update loss parameters for output feature from global defaults
    if parameter == DECODER:
        default_loss_params_for_feature_type = get_defaults_section_for_feature_type(
            feature_dict[TYPE], config[DEFAULTS], LOSS
        )
        feature_dict[LOSS].update(merge_dict(feature_dict[LOSS], default_loss_params_for_feature_type))
def calculate_overall_stats(test_stats, output_features, dataset, train_set_metadata):
    for output_feature in output_features:
        feature = get_from_registry(
            output_feature[TYPE],
            output_type_registry
        )
        feature.calculate_overall_stats(
            test_stats, output_feature, dataset, train_set_metadata
        )
def convert_predictions(predictions, output_features, training_set_metadata, return_type='dict'):
    convert_fn = get_from_registry(return_type, conversion_registry)
    return convert_fn(
        predictions,
        output_features,
        training_set_metadata,
    )
def get_sequence_vector(sequence, tokenizer_type, unit_to_id, lowercase=True):
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)()
    format_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(
        sequence,
        tokenizer,
        tokenizer_type,
        format_dtype,
        unit_to_id,
        lowercase=lowercase
    )
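# Hypothetical usage of get_sequence_vector above. The 'space' tokenizer key and the
# unit_to_id mapping are illustrative assumptions, not values from a real run.
# unit_to_id = {'<PAD>': 0, '<UNK>': 1, 'hello': 2, 'world': 3}
# vector = get_sequence_vector('Hello world', 'space', unit_to_id, lowercase=True)
# # expected: an integer array along the lines of [2, 3], with an integer dtype
# # just wide enough for the vocabulary size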
def __init__(self, reduce_mode=None):
    super().__init__()
    # save as private variable for debugging
    self._reduce_mode = reduce_mode
    # use registry to find required reduction function
    self._reduce_obj = get_from_registry(
        reduce_mode,
        reduce_mode_registry
    )()
def build_feature_parameters(features):
    feature_parameters = {}
    for feature in features:
        feature_builder_function = get_from_registry(
            feature[TYPE],
            parameters_builders_registry
        )
        feature_parameters[feature[NAME]] = feature_builder_function(feature)
    return feature_parameters
def get_initializer(parameters):
    if parameters is None:
        return lambda *args, **kwargs: _create_and_init(
            initializer_registry[parameters], {}, *args, **kwargs
        )
    elif isinstance(parameters, str):
        initializer_fun = get_from_registry(parameters, initializer_registry)
        return lambda *args, **kwargs: _create_and_init(
            initializer_fun, {}, *args, **kwargs
        )
    elif isinstance(parameters, dict):
        initializer_fun = get_from_registry(parameters[TYPE], initializer_registry)
        init_kwargs = parameters.copy()
        del init_kwargs[TYPE]
        return lambda *args, **kwargs: _create_and_init(
            initializer_fun, init_kwargs, *args, **kwargs
        )
    else:
        raise ValueError(
            f"Initializers parameters should be either strings or dictionaries, "
            f"but the provided parameters are a {type(parameters)}. "
            f"Parameters values: {parameters}"
        )
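# Hypothetical usage of get_initializer above; the registry key 'normal' and the
# keyword names are assumptions about what initializer_registry accepts.
# init_fn = get_initializer('normal')   # string -> registry lookup, no extra kwargs
# init_fn = get_initializer({'type': 'normal', 'mean': 0.0, 'stddev': 0.05})
#                                       # dict -> remaining keys passed as kwargs
# weights = init_fn((10, 4))            # materialize and initialize a 10x4 tensor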
def get_feature_meta(column, preprocessing_parameters):
    tokenizer = get_from_registry(
        preprocessing_parameters['tokenizer'],
        tokenizer_registry
    )()
    max_length = 0
    for timeseries in column:
        processed_line = tokenizer(timeseries)
        max_length = max(max_length, len(processed_line))
    max_length = min(
        preprocessing_parameters['timeseries_length_limit'],
        max_length
    )

    return {'max_timeseries_length': max_length}
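# Hypothetical usage of get_feature_meta above, assuming a 'space' tokenizer and a
# column of space-delimited value strings:
# column = pd.Series(['1.0 2.0 3.0', '4.0 5.0'])
# meta = get_feature_meta(column, {'tokenizer': 'space', 'timeseries_length_limit': 256})
# # meta == {'max_timeseries_length': 3}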
def build_single_output(output_feature_def, feature_hidden, other_output_features, **kwargs):
    logger.debug(
        f"Output {output_feature_def[TYPE]} feature {output_feature_def[NAME]}"
    )
    output_feature_class = get_from_registry(
        output_feature_def[TYPE],
        output_type_registry
    )
    output_feature_obj = output_feature_class(output_feature_def)
    return output_feature_obj
def ClippedOptimizer(type='sgd', clipglobalnorm=5.0, clipnorm=None, clipvalue=None, horovod=None, **kwargs):
    optimizer = get_from_registry(type.lower(), optimizers_registry)(**kwargs)
    return clip_optimizer(optimizer, clipglobalnorm, clipnorm, clipvalue, horovod=horovod)
def populate_defaults(input_feature):
    set_default_values(input_feature, {
        TIED: None,
        'encoder': 'parallel_cnn',
        'level': 'word'
    })

    encoder_class = get_from_registry(
        input_feature['encoder'],
        TextInputFeature.encoder_registry
    )
    if hasattr(encoder_class, 'default_params'):
        set_default_values(input_feature, encoder_class.default_params)
def get_feature_meta(column, preprocessing_parameters, backend):
    column = column.astype(str)
    tokenizer = get_from_registry(preprocessing_parameters["tokenizer"], tokenizer_registry)()
    max_length = 0
    for timeseries in column:
        processed_line = tokenizer(timeseries)
        max_length = max(max_length, len(processed_line))
    max_length = min(preprocessing_parameters["timeseries_length_limit"], max_length)
    return {"max_timeseries_length": max_length}
def test_numeric_transformer(transformer_key, tmpdir):
    Transformer = get_from_registry(transformer_key, numeric_transformation_registry)
    transformer_name = Transformer().__class__.__name__
    if transformer_name == "Log1pTransformer":
        raw_values = np.random.lognormal(5, 2, size=100)
    else:
        raw_values = np.random.normal(5, 2, size=100)

    backend = LOCAL_BACKEND
    parameters = Transformer.fit_transform_params(raw_values, backend)
    if transformer_name in {"Log1pTransformer", "IdentityTransformer"}:
        # should be empty
        assert not bool(parameters)
    else:
        # should not be empty
        assert bool(parameters)

    # instantiate numeric transformer
    numeric_transformer = Transformer(**parameters)

    # transform values
    transformed_values = numeric_transformer.transform(raw_values)

    # inverse transform the prior transformed values
    reconstructed_values = numeric_transformer.inverse_transform(transformed_values)

    # should now match
    assert np.allclose(raw_values, reconstructed_values)

    # now test numeric transformer with output feature
    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=["x", "y"])
    config = {
        "input_features": [{"name": "x", "type": "number"}],
        "output_features": [{"name": "y", "type": "number", "preprocessing": {"normalization": transformer_key}}],
        "combiner": {
            "type": "concat",
        },
        TRAINER: {
            "epochs": 2,
            "batch_size": 16,
        },
    }
    args = {
        "config": config,
        "skip_save_processed_input": True,
        "output_directory": os.path.join(tmpdir, "results"),
        "logging_level": logging.WARN,
    }

    # ensure no exceptions are raised
    experiment_cli(dataset=df, **args)
def update_config_with_metadata(config, training_set_metadata):
    # populate input features fields depending on data
    # config = merge_with_defaults(config)
    for input_feature in config['input_features']:
        feature = get_from_registry(input_feature[TYPE], input_type_registry)
        feature.populate_defaults(input_feature)
        feature.update_config_with_metadata(
            input_feature,
            training_set_metadata[input_feature[NAME]],
            config=config
        )

    # populate output features fields depending on data
    for output_feature in config['output_features']:
        feature = get_from_registry(output_feature[TYPE], output_type_registry)
        feature.populate_defaults(output_feature)
        feature.update_config_with_metadata(
            output_feature,
            training_set_metadata[output_feature[NAME]]
        )

    for feature in (config['input_features'] + config['output_features']):
        if 'preprocessing' in feature:
            feature['preprocessing'] = training_set_metadata[
                feature[NAME]
            ]['preprocessing']
def reduce_sequence_list(sequence_list, mode):
    reduce_mode = get_from_registry(mode, reduce_mode_registry)
    reduced_list = []
    for sequence in sequence_list:
        reduced_list.append(reduce_mode(sequence))
    if len(reduced_list) > 1:
        if reduce_mode == dont_reduce:
            reduced_output = tf.concat(reduced_list, 2)
        else:
            reduced_output = tf.concat(reduced_list, 1)
    else:
        reduced_output = reduced_list[0]
    return reduced_output
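# Hedged shape sketch for reduce_sequence_list above: with two encoder outputs of
# shape [batch, seq, hidden], a reducing mode such as 'sum' (an assumed registry key)
# yields [batch, hidden] per sequence and concatenation on axis 1 gives
# [batch, 2 * hidden]; the non-reducing mode concatenates the full tensors on axis 2
# into [batch, seq, 2 * hidden] instead.
# h1 = tf.zeros([32, 10, 64])
# h2 = tf.zeros([32, 10, 64])
# reduced = reduce_sequence_list([h1, h2], 'sum')   # -> shape [32, 128]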
def update_config_with_metadata(config, training_set_metadata):
    # populate input features fields depending on data
    # config = merge_with_defaults(config)
    for input_feature in config[INPUT_FEATURES]:
        feature = get_from_registry(input_feature[TYPE], input_type_registry)
        feature.populate_defaults(input_feature)
        feature.update_config_with_metadata(
            input_feature,
            training_set_metadata[input_feature[NAME]],
            config=config
        )

    # populate output features fields depending on data
    for output_feature in config[OUTPUT_FEATURES]:
        feature = get_from_registry(output_feature[TYPE], output_type_registry)
        feature.populate_defaults(output_feature)
        feature.update_config_with_metadata(
            output_feature,
            training_set_metadata[output_feature[NAME]]
        )

    for feature in config[INPUT_FEATURES] + config[OUTPUT_FEATURES]:
        if PREPROCESSING in feature:
            feature[PREPROCESSING] = training_set_metadata[
                feature[NAME]
            ][PREPROCESSING]
def create_optimizer_with_clipper(
    model, type="sgd", clipglobalnorm=5.0, clipnorm=None, clipvalue=None, horovod=None, **kwargs
):
    optimizer_cls = get_from_registry(type.lower(), optimizers_registry)
    optimizer = create_optimizer(optimizer_cls, model, horovod, **kwargs)
    clipper = Clipper(clipglobalnorm=clipglobalnorm, clipnorm=clipnorm, clipvalue=clipvalue)
    return optimizer, clipper
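# Hypothetical training-step usage of create_optimizer_with_clipper above; the
# 'adam' registry key, the lr kwarg, and the clip_grads method name are assumptions
# for illustration only.
# optimizer, clipper = create_optimizer_with_clipper(model, type='adam', lr=1e-3)
# loss.backward()
# clipper.clip_grads(model.parameters())   # hypothetical method name
# optimizer.step()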
def generate_datapoint(features):
    datapoint = []
    for feature in features:
        if ('cycle' in feature and feature['cycle'] is True
                and feature[TYPE] in cyclers_registry):
            cycler_function = cyclers_registry[feature[TYPE]]
            feature_value = cycler_function(feature)
        else:
            generator_function = get_from_registry(
                feature[TYPE],
                generators_registry
            )
            feature_value = generator_function(feature)
        datapoint.append(feature_value)
    return datapoint
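# Hypothetical feature list for generate_datapoint above; the 'number' and
# 'category' type keys and the semantics of the 'cycle' flag are assumptions about
# the generators/cyclers registries.
# features = [
#     {'name': 'num1', 'type': 'number'},
#     {'name': 'cat1', 'type': 'category', 'cycle': True},  # cycled deterministically
# ]
# row = generate_datapoint(features)   # -> one value per feature, in order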
def build_sequence_matrix(
        sequences,
        inverse_vocabulary,
        tokenizer_type,
        length_limit,
        padding_symbol,
        padding='right',
        unknown_symbol=UNKNOWN_SYMBOL,
        lowercase=True,
        tokenizer_vocab_file=None,
        pretrained_model_name_or_path=None,
        processor=PANDAS,
):
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)(
        vocab_file=tokenizer_vocab_file,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
    )

    format_dtype = int_type(len(inverse_vocabulary) - 1)

    unit_vectors = sequences.map(lambda sequence: _get_sequence_vector(
        sequence,
        tokenizer,
        tokenizer_type,
        format_dtype,
        inverse_vocabulary,
        lowercase=lowercase,
        unknown_symbol=unknown_symbol
    ))

    max_length = processor.compute(unit_vectors.map(len).max())
    if max_length < length_limit:
        logging.debug('max length: {0} < limit: {1}'.format(
            max_length, length_limit
        ))
    max_length = length_limit

    def pad(vector):
        sequence = np.full((max_length,),
                           inverse_vocabulary[padding_symbol],
                           dtype=format_dtype)
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence[:limit] = vector[:limit]
        else:  # if padding == 'left'
            sequence[max_length - limit:] = vector[:limit]
        return sequence

    padded = processor.map_objects(unit_vectors, pad)

    return padded
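# Simplified, self-contained sketch of the pad() helper inside build_sequence_matrix
# above, showing right- versus left-padding to a fixed max_length of 5 with padding
# id 0 (both values chosen only for illustration).
import numpy as np

def pad_sketch(vector, max_length=5, padding_id=0, padding='right'):
    out = np.full((max_length,), padding_id, dtype=vector.dtype)
    limit = min(vector.shape[0], max_length)
    if padding == 'right':
        out[:limit] = vector[:limit]
    else:  # left padding
        out[max_length - limit:] = vector[:limit]
    return out

v = np.array([7, 8, 9])
assert pad_sketch(v, padding='right').tolist() == [7, 8, 9, 0, 0]
assert pad_sketch(v, padding='left').tolist() == [0, 0, 7, 8, 9]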
def build_single_output(
    output_feature_def: Dict[str, Any], output_features: Dict[str, OutputFeature]
) -> OutputFeature:
    """Builds a single output feature from the output feature definition."""
    logging.debug(
        f"Output {output_feature_def[TYPE]} feature {output_feature_def[NAME]}"
    )
    output_feature_class = get_from_registry(output_feature_def[TYPE], output_type_registry)
    output_feature_obj = output_feature_class(output_feature_def, output_features)
    return output_feature_obj
def add_feature_data(feature, input_df, proc_df, metadata, preprocessing_parameters, backend):
    proc_df[feature[PROC_COLUMN]] = input_df[feature[COLUMN]].astype(
        np.float32).values

    # normalize data as required
    numeric_transformer = get_from_registry(
        preprocessing_parameters.get('normalization', None),
        numeric_transformation_registry
    )(**metadata[feature[NAME]])

    proc_df[feature[PROC_COLUMN]] = \
        numeric_transformer.transform(proc_df[feature[PROC_COLUMN]])

    return proc_df
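# Simplified, standalone sketch of what a z-score entry in
# numeric_transformation_registry is expected to do to the processed column; the
# constructor argument names (mean, std) are assumptions for illustration.
import numpy as np

class ZScoreSketch:
    def __init__(self, mean, std, **kwargs):
        self.mean, self.std = mean, std

    def transform(self, values):
        # center and scale using statistics computed during preprocessing
        return (values - self.mean) / self.std

values = np.array([1.0, 2.0, 3.0], dtype=np.float32)
assert np.allclose(ZScoreSketch(mean=2.0, std=1.0).transform(values), [-1.0, 0.0, 1.0])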
def __init__(self, metadata: Dict[str, Any]): super().__init__() if metadata["preprocessing"][ "tokenizer"] not in TORCHSCRIPT_COMPATIBLE_TOKENIZERS: raise ValueError( f"{metadata['preprocessing']['tokenizer']} is not supported by torchscript. Please use " f"one of {TORCHSCRIPT_COMPATIBLE_TOKENIZERS}.") self.tokenizer = get_from_registry( metadata["preprocessing"]["tokenizer"], tokenizer_registry)() self.padding = metadata["preprocessing"]["padding"] self.padding_value = float(metadata["preprocessing"]["padding_value"]) self.max_timeseries_length = int(metadata["max_timeseries_length"]) self.computed_fill_value = metadata["preprocessing"][ "computed_fill_value"]
def __init__(self, metadata: Dict[str, Any], is_bag: bool = False):
    super().__init__()
    if metadata["preprocessing"]["tokenizer"] not in TORCHSCRIPT_COMPATIBLE_TOKENIZERS:
        raise ValueError(
            f"{metadata['preprocessing']['tokenizer']} is not supported by torchscript. Please use "
            f"one of {TORCHSCRIPT_COMPATIBLE_TOKENIZERS}."
        )

    self.lowercase = metadata["preprocessing"]["lowercase"]
    self.tokenizer = get_from_registry(metadata["preprocessing"]["tokenizer"], tokenizer_registry)()
    self.vocab_size = metadata["vocab_size"]
    self.unknown_symbol = UNKNOWN_SYMBOL
    self.unit_to_id = metadata["str2idx"]
    self.is_bag = is_bag
def create_vocabulary(
        data,
        tokenizer_type='space',
        add_unknown=True,
        add_padding=True,
        lowercase=True,
        num_most_frequent=None,
        vocab_file=None,
        unknown_symbol=UNKNOWN_SYMBOL,
        padding_symbol=PADDING_SYMBOL
):
    vocab = None
    max_line_length = 0
    unit_counts = Counter()

    if tokenizer_type == 'bert':
        vocab = load_vocabulary(vocab_file)
        add_unknown = False
        add_padding = False
    elif vocab_file is not None:
        vocab = load_vocabulary(vocab_file)

    tokenizer = get_from_registry(
        tokenizer_type,
        tokenizer_registry
    )(vocab_file=vocab_file)
    for line in data:
        processed_line = tokenizer(line.lower() if lowercase else line)
        unit_counts.update(processed_line)
        max_line_length = max(max_line_length, len(processed_line))

    if vocab is None:
        vocab = [
            unit for unit, count in
            unit_counts.most_common(num_most_frequent)
        ]

    vocab_set = set(vocab)
    if add_unknown:
        if unknown_symbol in vocab_set:
            vocab.remove(unknown_symbol)
        vocab = [unknown_symbol] + vocab
    if add_padding:
        if padding_symbol in vocab_set:
            vocab.remove(padding_symbol)
        vocab = [padding_symbol] + vocab

    str2idx = {unit: i for i, unit in enumerate(vocab)}
    str2freq = {
        unit: unit_counts.get(unit) if unit in unit_counts else 0
        for unit in vocab
    }

    return vocab, str2idx, str2freq, max_line_length
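# Hypothetical usage of create_vocabulary above with the default 'space' tokenizer;
# the exact ordering assumes add_padding/add_unknown prepend their symbols.
# vocab, str2idx, str2freq, max_len = create_vocabulary(
#     ['hello world', 'hello again'],
#     tokenizer_type='space',
#     num_most_frequent=10000,
# )
# # vocab starts with the padding and unknown symbols, followed by units by frequency,
# # e.g. ['<PAD>', '<UNK>', 'hello', 'world', 'again']; max_len == 2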
def load_dataset(dataset, data_format=None, df_lib=PANDAS_DF):
    if not data_format or data_format == "auto":
        data_format = figure_data_format(dataset)

    # use appropriate reader to create dataframe
    if data_format in DATAFRAME_FORMATS:
        return dataset
    elif data_format in DICT_FORMATS:
        return pd.DataFrame(dataset)
    elif data_format in CACHEABLE_FORMATS:
        data_reader = get_from_registry(data_format, external_data_reader_registry)
        return data_reader(dataset, df_lib)
    else:
        raise ValueError(f"{data_format} format is not supported")
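# Hypothetical usage of load_dataset above; 'csv' membership in CACHEABLE_FORMATS
# and the reader's behaviour are assumptions.
# df = load_dataset({'x': [1, 2], 'y': [3, 4]})      # dict -> pandas DataFrame
# df = load_dataset('data.csv', data_format='csv')   # delegated to a registered reader
# df = load_dataset(existing_df)                     # DataFrame passed through unchanged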
def build_single_output(output_feature_def, feature_hidden, other_output_features, **kwargs):
    logger.debug('Output {} feature {}'.format(
        output_feature_def[TYPE],
        output_feature_def['name']
    ))

    output_feature_class = get_from_registry(
        output_feature_def[TYPE],
        output_type_registry
    )
    output_feature_obj = output_feature_class(output_feature_def)
    # weighted_train_mean_loss, weighted_eval_loss, output_tensors = output_feature_obj.concat_dependencies_and_build_output(
    #     feature_hidden,
    #     other_output_features,
    #     **kwargs
    # )

    return output_feature_obj
def update_model_definition_with_metadata(model_definition, training_set_metadata):
    # populate input features fields depending on data
    # model_definition = merge_with_defaults(model_definition)
    for input_feature in model_definition['input_features']:
        feature = get_from_registry(input_feature[TYPE], input_type_registry)
        feature.populate_defaults(input_feature)
        feature.update_model_definition_with_metadata(
            input_feature,
            training_set_metadata[input_feature[NAME]],
            model_definition=model_definition
        )

    # populate output features fields depending on data
    for output_feature in model_definition['output_features']:
        feature = get_from_registry(output_feature[TYPE], output_type_registry)
        feature.populate_defaults(output_feature)
        feature.update_model_definition_with_metadata(
            output_feature,
            training_set_metadata[output_feature[NAME]]
        )

    for feature in (model_definition['input_features'] +
                    model_definition['output_features']):
        if 'preprocessing' in feature:
            feature['preprocessing'] = training_set_metadata[
                feature[NAME]
            ]['preprocessing']
def build_single_input(
    input_feature_def: Dict[str, Any], other_input_features: Dict[str, InputFeature], **kwargs
) -> InputFeature:
    logger.debug(f"Input {input_feature_def[TYPE]} feature {input_feature_def[NAME]}")

    encoder_obj = None
    if input_feature_def.get(constants.TIED, None) is not None:
        tied_input_feature_name = input_feature_def[constants.TIED]
        if tied_input_feature_name in other_input_features:
            encoder_obj = other_input_features[tied_input_feature_name].encoder_obj

    input_feature_class = get_from_registry(input_feature_def[TYPE], input_type_registry)
    input_feature_obj = input_feature_class(input_feature_def, encoder_obj)

    return input_feature_obj
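# Hypothetical config fragment illustrating the tied-encoder path in
# build_single_input above: 'text_b' reuses the encoder object already built for
# 'text_a' instead of constructing a new one (feature names and encoder choice are
# illustrative).
# input_features = [
#     {'name': 'text_a', 'type': 'text', 'encoder': 'parallel_cnn'},
#     {'name': 'text_b', 'type': 'text', 'tied': 'text_a'},
# ]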
def postprocess_results(
        result,
        output_feature,
        metadata,
        experiment_dir_name='',
        skip_save_unprocessed_output=False,
):
    feature = get_from_registry(output_feature[TYPE], output_type_registry)
    return feature.postprocess_results(
        output_feature,
        result,
        metadata,
        experiment_dir_name,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
    )