Example #1
def preprocess_for_prediction(model_path,
                              split,
                              dataset_type='generic',
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              only_predictions=False):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param dataset_type: Generic
        :type: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param only_predictions: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])

    # Check if hdf5 and json already exist
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        if os.path.isfile(data_hdf5_fp):
            logging.info(
                'Found hdf5 with the same filename as the csv, using it instead'
            )
            data_csv = None
            data_hdf5 = data_hdf5_fp

    # Load data
    _, _, build_dataset, _ = get_dataset_fun(dataset_type)
    train_set_metadata = load_metadata(train_set_metadata)
    features = (
        model_definition['input_features'] +
        ([] if only_predictions else model_definition['output_features']))
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(data_hdf5,
                                model_definition['input_features'],
                                [] if only_predictions else
                                model_definition['output_features'],
                                split_data=False,
                                shuffle_training=False)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                [] if only_predictions else model_definition['output_features'],
                shuffle_training=False)

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)

    replace_text_feature_level(model_definition, [dataset])

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        [] if only_predictions else model_definition['output_features'],
        data_hdf5,
    )

    return dataset, train_set_metadata
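
A minimal usage sketch, assuming a model directory produced by Ludwig's
train or experiment command and a new CSV file (all paths here are
hypothetical):

# Hypothetical paths; 'results/experiment_run/model' is where the train
# and experiment commands save a trained model.
dataset, train_set_metadata = preprocess_for_prediction(
    model_path='results/experiment_run/model',
    split='full',
    data_csv='new_data.csv',
    train_set_metadata='results/experiment_run/model/train_set_metadata.json',
    only_predictions=True)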
Example #2
def export_neuropod(ludwig_model_path,
                    neuropod_path,
                    neuropod_model_name="ludwig_model"):
    try:
        from neuropod.backends.python.packager import create_python_neuropod
    except ImportError:
        logger.error(
            'The "neuropod" package is not installed in your environment.')
        sys.exit(-1)

    data_paths = [
        {
            "path": os.path.join(ludwig_model_path,
                                 MODEL_HYPERPARAMETERS_FILE_NAME),
            "packaged_name": MODEL_HYPERPARAMETERS_FILE_NAME
        },
        {
            "path": os.path.join(ludwig_model_path,
                                 TRAIN_SET_METADATA_FILE_NAME),
            "packaged_name": TRAIN_SET_METADATA_FILE_NAME
        },
        {
            "path": os.path.join(ludwig_model_path, 'checkpoint'),
            "packaged_name": 'checkpoint'
        },
    ]
    for filename in os.listdir(ludwig_model_path):
        if (MODEL_WEIGHTS_FILE_NAME in filename
                and MODEL_WEIGHTS_PROGRESS_FILE_NAME not in filename):
            data_paths.append({
                "path": os.path.join(ludwig_model_path, filename),
                "packaged_name": filename
            })

    logger.debug('data_paths: {}'.format(data_paths))

    ludwig_model_definition = load_json(
        os.path.join(ludwig_model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    training_set_metadata = load_json(
        os.path.join(ludwig_model_path, TRAIN_SET_METADATA_FILE_NAME))

    input_spec = []
    for feature in ludwig_model_definition['input_features']:
        input_spec.append({
            "name": feature['name'],
            "dtype": "str",
            "shape": (None, 1)
        })
    logger.debug('input_spec: {}'.format(input_spec))

    output_spec = []
    for feature in ludwig_model_definition['output_features']:
        feature_type = feature['type']
        feature_name = feature['name']
        if feature_type == BINARY:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "str",
                "shape": (None, 1)
            })
            output_spec.append({
                "name": feature['name'] + '_probabilities',
                "dtype": "float32",
                "shape": (None, 1)
            })
        elif feature_type == NUMERICAL:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "float32",
                "shape": (None, 1)
            })
        elif feature_type == CATEGORY:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "str",
                "shape": (None, 1)
            })
            output_spec.append({
                "name": feature['name'] + '_probability',
                "dtype": "float32",
                "shape": (None, 1)
            })
            output_spec.append({
                "name": feature['name'] + '_probabilities',
                "dtype": "float32",
                "shape": (None,
                          training_set_metadata[feature_name]['vocab_size'])
            })
        elif feature_type == SEQUENCE:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "str",
                "shape": (None, 1)
            })
        elif feature_type == TEXT:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "str",
                "shape": (None, 1)
            })
        elif feature_type == SET:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "str",
                "shape": (None, 1)
            })
            output_spec.append({
                "name": feature['name'] + '_probability',
                "dtype": "str",
                "shape": (None, 1)
            })
            output_spec.append({
                "name": feature['name'] + '_probabilities',
                "dtype": "float32",
                "shape": (None,
                          training_set_metadata[feature_name]['vocab_size'])
            })
        elif feature_type == VECTOR:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "float32",
                "shape": (None,
                          training_set_metadata[feature_name]['vector_size'])
            })
        else:
            output_spec.append({
                "name": feature['name'] + '_predictions',
                "dtype": "str",
                "shape": (None, 1)
            })
    logger.debug('output_spec: {}'.format(output_spec))

    if os.path.exists(neuropod_path):
        if os.path.isfile(neuropod_path):
            logger.warning('Removing file: {}'.format(neuropod_path))
            os.remove(neuropod_path)
        else:
            logger.warning('Removing directory: {}'.format(neuropod_path))
            shutil.rmtree(neuropod_path, ignore_errors=True)

    from pathlib import Path
    # NOTE: 'ludwig_path' is assumed to be defined elsewhere in the module as
    # the location of the installed ludwig package, e.g.
    # os.path.dirname(ludwig.__file__).
    path = Path(ludwig_path)
    logger.debug('python_root: {}'.format(path.parent.parent))

    create_python_neuropod(
        neuropod_path=neuropod_path,
        model_name=neuropod_model_name,
        data_paths=data_paths,
        code_path_spec=[{
            "python_root": path.parent.parent,
            "dirs_to_package": [
                "ludwig"  # Package everything in the python_root
            ],
        }],
        entrypoint_package="ludwig.neuropod_export",
        entrypoint="get_model",
        # test_deps=['torch', 'numpy'],
        skip_virtualenv=True,
        input_spec=input_spec,
        output_spec=output_spec)
    logger.info('Neuropod saved to: {}'.format(neuropod_path))
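
A hedged usage sketch for the export, assuming a trained model directory
(both paths below are hypothetical):

# Package the trained model as a neuropod at the given target path.
export_neuropod(
    ludwig_model_path='results/experiment_run/model',
    neuropod_path='/tmp/ludwig_neuropod',
    neuropod_model_name='my_model')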
Example #3
def load_metadata(metadata_file_path):
    logging.info('Loading metadata from: {0}'.format(metadata_file_path))
    return data_utils.load_json(metadata_file_path)
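
For illustration, a call with a hypothetical metadata path:

train_set_metadata = load_metadata(
    'results/experiment_run/model/train_set_metadata.json')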
Example #4
def preprocess_for_prediction(model_path,
                              split,
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              evaluate_performance=True):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict, the in_memory flag '
                        'should be true. Overriding and setting it to true '
                        'for feature <{}>'.format(input_feature['name']))
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])
    output_features = (model_definition['output_features']
                       if evaluate_performance else [])
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename as the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(data_hdf5,
                                model_definition['input_features'],
                                output_features,
                                split_data=False,
                                shuffle_training=False)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False)

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)

    replace_text_feature_level(features, [dataset])

    dataset = Dataset(dataset, model_definition['input_features'],
                      output_features,
                      train_set_metadata.get(DATA_TRAIN_HDF5_FP))

    return dataset, train_set_metadata
Example #5
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')
        results_dir = os.path.join(tmpdir, 'results')
        statistics_fp = os.path.join(results_dir,
                                     'kfold_training_statistics.json')
        indices_fp = os.path.join(results_dir, 'kfold_split_indices.json')

        # generate synthetic data for the test
        input_features = features_to_use.input_features

        output_features = features_to_use.output_features

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
            'training': {
                'epochs': 2
            }
        }

        with open(config_fp, 'w') as f:
            yaml.dump(config, f)

        # run k-fold cv
        kfold_cross_validate_cli(k_fold=num_folds,
                                 config_file=config_fp,
                                 dataset=training_data_fp,
                                 output_directory=results_dir,
                                 logging_level='warn')

        # check for expected results
        # check for existence and structure of statistics file
        assert os.path.isfile(statistics_fp)

        # check for required keys
        cv_statistics = load_json(statistics_fp)
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in cv_statistics

        # check for existence and structure of split indices file
        assert os.path.isfile(indices_fp)

        # check for required keys
        cv_indices = load_json(indices_fp)
        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in cv_indices
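
The test above is parametrized over a FeaturesToUse container. A minimal
sketch of what such a container might look like (the actual fixture in
Ludwig's test suite may differ):

from dataclasses import dataclass
from typing import List


@dataclass
class FeaturesToUse:
    # Ludwig feature definitions, e.g. as produced by the test suite's
    # numerical_feature() and category_feature() helpers.
    input_features: List[dict]
    output_features: List[dict]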
Example #6
import argparse
import os
import pprint
import sys

from ludwig.utils.data_utils import load_json

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Display K-fold cross validation results",
        prog="display_kfold_cv_results",
        usage="%(prog)s [options]",
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument("--results_directory",
                        type=str,
                        default="results",
                        help="directory that contains the K-fold cv results")

    args = parser.parse_args(sys.argv[1:])
    results_directory = args.results_directory

    print("Retrieving results from ", results_directory)

    kfold_cv_stats = load_json(
        os.path.join(results_directory, "kfold_training_statistics.json"))

    print("#\n# K-fold Cross Validation Results\n#")
    pprint.pprint(kfold_cv_stats["overall"])
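
Assuming the script above is saved as display_kfold_cv_results.py, it could
be invoked as:

python display_kfold_cv_results.py --results_directory results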
Example #7
def preprocess_for_prediction(model_path,
                              split,
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              evaluate_performance=True):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict, the in_memory flag '
                        'should be true. Overriding and setting it to true '
                        'for feature <{}>'.format(input_feature['name']))
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])
    output_features = (model_definition['output_features']
                       if evaluate_performance else [])
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename as the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(data_hdf5,
                                model_definition['input_features'],
                                output_features,
                                split_data=False,
                                shuffle_training=False)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False)

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set

        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset, dataset[SPLIT])
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning('You requested the {} split, but the data CSV '
                               'does not contain a "split" column, so the '
                               'full data will be used instead'.format(split))

    replace_text_feature_level(features, [dataset])

    dataset = Dataset(dataset, model_definition['input_features'],
                      output_features,
                      train_set_metadata.get(DATA_TRAIN_HDF5_FP))

    return dataset, train_set_metadata
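
This version relies on a csv_contains_column helper. A minimal pandas-based
sketch of what such a helper might do (the actual Ludwig implementation may
differ):

import pandas as pd


def csv_contains_column(csv_path, column_name):
    # Read only the header row so the whole file is not loaded.
    return column_name in pd.read_csv(csv_path, nrows=0).columns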
Example #8
def test_kfold_cv_cli():
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        training_data_fp = os.path.join(tmpdir, 'train.csv')
        model_definition_fp = os.path.join(tmpdir, 'model_definition.yaml')
        results_dir = os.path.join(tmpdir, 'results')
        statistics_fp = os.path.join(results_dir,
                                     'kfold_training_statistics.json')
        indices_fp = os.path.join(results_dir, 'kfold_split_indices.json')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        generate_data(input_features, output_features, training_data_fp)

        # generate model definition file
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
            'training': {
                'epochs': 2
            }
        }

        with open(model_definition_fp, 'w') as f:
            yaml.dump(model_definition, f)

        # run k-fold cv
        full_kfold_cross_validate(k_fold=num_folds,
                                  model_definition_file=model_definition_fp,
                                  data_csv=training_data_fp,
                                  output_directory=results_dir,
                                  logging_level='warn')

        # check for expected results
        # check for existence and structure of statistics file
        assert os.path.isfile(statistics_fp)

        # check for required keys
        cv_statistics = load_json(statistics_fp)
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in cv_statistics

        # check for existence and structure of split indices file
        assert os.path.isfile(indices_fp)

        # check for required keys
        cv_indices = load_json(indices_fp)
        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in cv_indices
Example #9
    def load(model_dir,
             logging_level=logging.ERROR,
             use_horovod=None,
             gpus=None,
             gpu_memory_limit=None,
             allow_parallel_threads=True):
        """This function allows for loading pretrained models

        # Inputs

        :param model_dir: (string) path to the directory containing the model.
               If the model was trained by the `train` or `experiment` command,
               the model is in `results_dir/experiment_dir/model`.
        :param logging_level: Log level that will be sent to stderr.
        :param use_horovod: (bool) use Horovod for distributed training. Will be set
               automatically if `horovodrun` is used to launch the training script.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax as CUDA_VISIBLE_DEVICES)
        :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to allocate
               per GPU device.
        :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
               multithreading parallelism to improve performance at the cost of
               determinism.

        # Return

        :return: (LudwigModel) a LudwigModel object


        # Example usage

        ```python
        ludwig_model = LudwigModel.load(model_dir)
        ```

        """
        horovod = configure_horovod(use_horovod)
        model_definition = broadcast_return(lambda: load_json(os.path.join(
            model_dir,
            MODEL_HYPERPARAMETERS_FILE_NAME
        )), horovod)

        # initialize model
        ludwig_model = LudwigModel(
            model_definition,
            logging_level=logging_level,
            use_horovod=use_horovod,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
        )

        # generate model from definition
        ludwig_model.model = LudwigModel.create_model(model_definition)

        # load model weights
        ludwig_model.load_weights(model_dir)

        # load train set metadata
        ludwig_model.training_set_metadata = broadcast_return(
            lambda: load_metadata(
                os.path.join(
                    model_dir,
                    TRAIN_SET_METADATA_FILE_NAME
                )
            ), horovod
        )

        return ludwig_model
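
After loading, the model can be used for inference. A hedged sketch,
assuming the predict API of the same Ludwig version (paths are
hypothetical):

ludwig_model = LudwigModel.load('results/experiment_run/model')
# Depending on the Ludwig version, predict may return the predictions
# alone or a (predictions, output_directory) tuple.
results = ludwig_model.predict(dataset='new_data.csv')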
Example #10
    def _init_vocab(self, vocab_file: str) -> Tuple[Dict[str, int], Tuple[str, ...]]:
        """Loads the vocab from the vocab file and builds the inverse
        index-to-string mapping."""
        str2idx = load_json(
            torchtext.utils.get_asset_local_path(vocab_file))
        _, idx2str = zip(*sorted((v, k) for k, v in str2idx.items()))
        return str2idx, idx2str
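
The zip(*sorted(...)) line inverts the string-to-index mapping into an
index-ordered tuple of strings; a standalone illustration:

str2idx = {'<pad>': 0, 'hello': 1, 'world': 2}
_, idx2str = zip(*sorted((v, k) for k, v in str2idx.items()))
assert idx2str == ('<pad>', 'hello', 'world')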
Example #11
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")
        results_dir = os.path.join(tmpdir, "results")
        statistics_fp = os.path.join(results_dir,
                                     "kfold_training_statistics.json")
        indices_fp = os.path.join(results_dir, "kfold_split_indices.json")

        # generate synthetic data for the test
        input_features = features_to_use.input_features

        output_features = features_to_use.output_features

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # run k-fold cv
        kfold_cross_validate_cli(
            k_fold=num_folds,
            config=config_fp,
            dataset=training_data_fp,
            output_directory=results_dir,
            logging_level="warn",
        )

        # check for expected results
        # check for existence and structure of statistics file
        assert os.path.isfile(statistics_fp)

        # check for required keys
        cv_statistics = load_json(statistics_fp)
        for key in ["fold_" + str(i + 1)
                    for i in range(num_folds)] + ["overall"]:
            assert key in cv_statistics

        # check for existence and structure of split indices file
        assert os.path.isfile(indices_fp)

        # check for required keys
        cv_indices = load_json(indices_fp)
        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in cv_indices
Example #12
    def config(self):
        return load_json(
            os.path.join(self.lpath, MODEL_HYPERPARAMETERS_FILE_NAME))