Example #1
def get_preprocessing_params(model_definition):
    model_definition = merge_with_defaults(model_definition)

    global_preprocessing_parameters = model_definition['preprocessing']
    features = (
            model_definition['input_features'] +
            model_definition['output_features']
    )

    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters
    )

    merged_preprocessing_params = []
    for feature in features:
        if 'preprocessing' in feature:
            local_preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing']
            )
        else:
            local_preprocessing_parameters = global_preprocessing_parameters[
                feature['type']
            ]
        merged_preprocessing_params.append(
            (feature['name'], feature['type'], local_preprocessing_parameters)
        )

    return merged_preprocessing_params
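
To make the merge order in Example #1 concrete (global type-level defaults first, then the feature's own 'preprocessing' dict on top), here is a minimal self-contained sketch. The merge_dict below only assumes the behavior these examples rely on, a recursive merge in which the second argument wins, and the default and feature values are invented for illustration.

def merge_dict(base, override):
    # Assumed behavior: values in `override` take precedence over `base`,
    # with nested dicts merged recursively.
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_dict(merged[key], value)
        else:
            merged[key] = value
    return merged


# Hypothetical type-level defaults and feature definition.
default_text_preprocessing = {'lowercase': True, 'most_common': 20000}
feature = {
    'name': 'title',
    'type': 'text',
    'preprocessing': {'most_common': 5000},
}

local_params = merge_dict(default_text_preprocessing, feature['preprocessing'])
print(local_params)  # {'lowercase': True, 'most_common': 5000}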
Example #2
def build_dataset_df(dataset_df,
                     features,
                     global_preprocessing_parameters,
                     train_set_metadata=None,
                     random_seed=default_random_seed,
                     **kwargs):
    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters, global_preprocessing_parameters)

    if train_set_metadata is None:
        train_set_metadata = build_metadata(dataset_df, features,
                                            global_preprocessing_parameters)

    data_val = build_data(dataset_df, features, train_set_metadata,
                          global_preprocessing_parameters)

    data_val['split'] = get_split(
        dataset_df,
        force_split=global_preprocessing_parameters['force_split'],
        split_probabilities=global_preprocessing_parameters[
            'split_probabilities'],
        stratify=global_preprocessing_parameters['stratify'],
        random_seed=random_seed)

    return data_val, train_set_metadata
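
Example #2 delegates the train/validation/test assignment to get_split, driven by the 'split_probabilities', 'force_split' and 'stratify' preprocessing parameters. As a rough illustration of the idea only (not Ludwig's get_split implementation), a split column can be thought of as one label per row drawn according to those probabilities; the 0/1/2 encoding for training/validation/test is an assumption here.

import numpy as np

# Hypothetical probabilities: 70% training (0), 10% validation (1), 20% test (2).
split_probabilities = (0.7, 0.1, 0.2)
n_rows = 10

rng = np.random.RandomState(42)
split_column = rng.choice(3, size=n_rows, p=split_probabilities)
print(split_column)  # an array of 0/1/2 labels, one per row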
Example #3
def merge_with_defaults(model_definition):
    _perform_sanity_checks(model_definition)

    # ===== Preprocessing =====
    model_definition['preprocessing'] = merge_dict(
        default_preprocessing_parameters,
        model_definition.get('preprocessing', {})
    )

    stratify = model_definition['preprocessing']['stratify']

    if stratify is not None:
        if stratify not in [x['name'] for x in
                            model_definition['output_features']]:
            raise ValueError('Stratify must be in output features')
        if ([x for x in model_definition['output_features'] if
             x['name'] == stratify][0]['type']
                not in [BINARY, CATEGORY]):
            raise ValueError('Stratify feature must be binary or category')
    # ===== Model =====
    set_default_value(model_definition, 'combiner',
                      {'type': default_combiner_type})

    # ===== Training =====
    set_default_value(model_definition, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(model_definition[TRAINING], param,
                          value)

    set_default_value(
        model_definition[TRAINING],
        'validation_measure',
        output_type_registry[
            model_definition['output_features'][0]['type']
        ].default_validation_measure
    )

    # ===== Training Optimizer =====
    optimizer = model_definition[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer['type'])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in model_definition['input_features']:
        get_from_registry(input_feature['type'],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Output features =====
    for output_feature in model_definition['output_features']:
        get_from_registry(output_feature['type'],
                          output_type_registry).populate_defaults(
            output_feature)

    return model_definition
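
The stratify validation in Example #3 can be exercised in isolation. The sketch below copies the two checks into a small helper; the BINARY and CATEGORY constants and the toy output feature list are assumptions made only for this illustration.

BINARY = 'binary'      # assumed to match the constants imported by Ludwig
CATEGORY = 'category'

# Toy output feature list used only to exercise the checks.
output_features = [
    {'name': 'label', 'type': CATEGORY},
    {'name': 'score', 'type': 'numerical'},
]


def check_stratify(stratify, output_features):
    # The same two validations performed in merge_with_defaults.
    if stratify is None:
        return
    if stratify not in [f['name'] for f in output_features]:
        raise ValueError('Stratify must be in output features')
    stratify_type = next(
        f['type'] for f in output_features if f['name'] == stratify
    )
    if stratify_type not in [BINARY, CATEGORY]:
        raise ValueError('Stratify feature must be binary or category')


check_stratify('label', output_features)    # passes
# check_stratify('score', output_features)  # would raise: not binary/category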
Example #4
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        get_feature_meta = get_from_registry(
            feature['type'], base_type_registry).get_feature_meta
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']]
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str), preprocessing_parameters)
    return train_set_metadata
Example #5
    def overwrite_defaults(self, feature):
        attributes = self.__dict__.keys()

        remaining_dict = dict(feature)

        for k in feature.keys():
            if k in attributes:
                if (isinstance(feature[k], dict) and hasattr(self, k)
                        and isinstance(getattr(self, k), dict)):
                    setattr(self, k, merge_dict(getattr(self, k), feature[k]))
                else:
                    setattr(self, k, feature[k])
                del remaining_dict[k]

        return remaining_dict
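
A usage sketch for the overwrite_defaults method in Example #5: the ToyFeature class and the feature dict are invented, merge_dict is again assumed to be a recursive merge in which the second argument wins, and the method body is copied from the example so its behavior can be observed end to end.

def merge_dict(base, override):
    # Assumed recursive merge where `override` wins.
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_dict(merged[key], value)
        else:
            merged[key] = value
    return merged


class ToyFeature:
    def __init__(self):
        self.name = None
        self.preprocessing = {'lowercase': True}

    # Same logic as Example #5.
    def overwrite_defaults(self, feature):
        attributes = self.__dict__.keys()
        remaining_dict = dict(feature)
        for k in feature.keys():
            if k in attributes:
                if (isinstance(feature[k], dict) and hasattr(self, k)
                        and isinstance(getattr(self, k), dict)):
                    setattr(self, k, merge_dict(getattr(self, k), feature[k]))
                else:
                    setattr(self, k, feature[k])
                del remaining_dict[k]
        return remaining_dict


f = ToyFeature()
leftover = f.overwrite_defaults(
    {'name': 'title', 'preprocessing': {'most_common': 5000}, 'encoder': 'rnn'}
)
print(f.name)           # 'title'
print(f.preprocessing)  # {'lowercase': True, 'most_common': 5000}
print(leftover)         # {'encoder': 'rnn'}, keys with no matching attribute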
Example #6
def build_data(dataset_df, features, train_set_metadata,
               global_preprocessing_parameters):
    data = {}
    for feature in features:
        add_feature_data = get_from_registry(
            feature['type'], base_type_registry).add_feature_data
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']]
        handle_missing_values(dataset_df, feature, preprocessing_parameters)
        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']]['preprocessing'] = preprocessing_parameters
        add_feature_data(feature, dataset_df, data, train_set_metadata,
                         preprocessing_parameters)
    return data
Example #7
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        data_hdf5_fp,
    )

    return dataset, train_set_metadata
Example #8
def preprocess_for_prediction(model_path,
                              split,
                              dataset_type='generic',
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              only_predictions=False):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param dataset_type: Generic
        :type: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param only_predictions: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])

    # Check if hdf5 and json already exist
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        if os.path.isfile(data_hdf5_fp):
            logging.info(
                'Found hdf5 with the same filename of the csv, using it instead'
            )
            data_csv = None
            data_hdf5 = data_hdf5_fp

    # Load data
    _, _, build_dataset, _ = get_dataset_fun(dataset_type)
    train_set_metadata = load_metadata(train_set_metadata)
    features = (
        model_definition['input_features'] +
        ([] if only_predictions else model_definition['output_features']))
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(data_hdf5,
                                model_definition['input_features'],
                                [] if only_predictions else
                                model_definition['output_features'],
                                split_data=False,
                                shuffle_training=False)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                [] if only_predictions else model_definition['output_features'],
                shuffle_training=False)

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)

    replace_text_feature_level(model_definition, [dataset])

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        [] if only_predictions else model_definition['output_features'],
        data_hdf5,
    )

    return dataset, train_set_metadata
Example #9
def preprocess_for_prediction(model_path,
                              split,
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              evaluate_performance=True):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name']))
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(data_hdf5,
                                model_definition['input_features'],
                                output_features,
                                split_data=False,
                                shuffle_training=False)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False)

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set

        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset, dataset[SPLIT])
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning('You requested the {} split, but the data CSV '
                               'does not contain a "split" column, so the '
                               'full data will be used instead'.format(split))

    replace_text_feature_level(features, [dataset])

    dataset = Dataset(dataset, model_definition['input_features'],
                      output_features,
                      train_set_metadata.get(DATA_TRAIN_HDF5_FP))

    return dataset, train_set_metadata
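
All three versions of preprocess_for_prediction (Examples #7 through #9) begin by checking whether an hdf5 file with the same base name as the csv already exists and, if so, switching to it. The sketch below isolates that check; replace_file_extension is assumed to behave like the explicit os.path.splitext expression used in Example #8, and the paths are hypothetical.

import os


def replace_file_extension(path, extension):
    # Assumed behavior, matching the explicit expression in Example #8:
    # swap the file extension while keeping the rest of the path.
    return os.path.splitext(path)[0] + '.' + extension


def resolve_cached_hdf5(data_csv):
    """Mirror the cache check in preprocess_for_prediction: if an hdf5
    file with the csv's base name exists, use it instead of the csv."""
    data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
    if os.path.isfile(data_hdf5_fp):
        return None, data_hdf5_fp  # drop the csv, read the cached hdf5
    return data_csv, None


print(replace_file_extension('/tmp/reviews.csv', 'hdf5'))  # /tmp/reviews.hdf5
print(resolve_cached_hdf5('/tmp/reviews.csv'))  # (csv path, None) unless the hdf5 exists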