Example #1
def test_torchscript_preproc_with_nans(tmpdir, csv_filename, feature):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    input_features = [
        feature,
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features,
                                           output_features,
                                           data_csv_path,
                                           nan_percent=0.2)

    # Initialize Ludwig model
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path)

    # Obtain preprocessed inputs from Python model
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df,
                                                      config,
                                                      load_paths=True)
    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Check that preproc_inputs is the same as preproc_inputs_expected.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        feature_name = feature_name_expected[:feature_name_expected.rfind("_")]  # remove proc suffix
        if feature_name not in preproc_inputs.keys():
            continue

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(
            feature_values,
            feature_values_expected), f"feature: {feature_name}"
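
A note on the comparison loop above: preprocessed columns are stored under a proc-column name built from the raw feature name plus a trailing suffix (see feature.proc_column in Example #2), and the loop recovers the raw name by cutting at the last underscore. A minimal standalone sketch of that string manipulation (the suffix value is made up for illustration):

def strip_proc_suffix(proc_name: str) -> str:
    # Drop everything from the last underscore on, e.g.
    # "binary_F3A9_mZFLky" -> "binary_F3A9".
    return proc_name[:proc_name.rfind("_")]

assert strip_proc_suffix("binary_F3A9_mZFLky") == "binary_F3A9"
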
Example #2
def get_input_tensors(model: LudwigModel,
                      input_set: pd.DataFrame) -> List[Variable]:
    # Convert raw input data into preprocessed tensor data
    dataset, _ = preprocess_for_prediction(
        model.config,
        dataset=input_set,
        training_set_metadata=model.training_set_metadata,
        data_format="auto",
        split="full",
        include_outputs=False,
        backend=model.backend,
        callbacks=model.callbacks,
    )

    # Convert dataset into a dict of tensors, and split each tensor into batches to control GPU memory usage
    inputs = {
        name: torch.from_numpy(dataset.dataset[feature.proc_column]).split(
            model.config["trainer"]["batch_size"])
        for name, feature in model.model.input_features.items()
    }

    # Dict of lists to list of dicts
    input_batches = [dict(zip(inputs, t)) for t in zip(*inputs.values())]

    # Encode the inputs into embedding space. This is necessary to ensure differentiability. Otherwise, category
    # and other features that pass through an embedding will not be explainable via gradient based methods.
    output_batches = []
    for batch in input_batches:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        output = model.model.encode(batch)

        # Extract the output tensor, discarding additional state used for sequence decoding.
        output = {
            k: v["encoder_output"].detach().cpu()
            for k, v in output.items()
        }
        output_batches.append(output)

    # List of dicts to dict of lists
    encoded_inputs = {
        k: torch.cat([d[k] for d in output_batches])
        for k in output_batches[0]
    }

    # Wrap the output into a variable so torch will track the gradient.
    # TODO(travis): this won't work for text decoders, but we don't support explanations for those yet
    data_to_predict = [v for _, v in encoded_inputs.items()]
    data_to_predict = [
        Variable(t, requires_grad=True) for t in data_to_predict
    ]

    return data_to_predict
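
The "dict of lists to list of dicts" step in get_input_tensors is a compact zip-based transposition; running both directions on toy data makes the trick explicit. A minimal sketch:

inputs = {"a": [1, 2, 3], "b": [4, 5, 6]}

# Dict of lists -> list of dicts (one dict per batch).
batches = [dict(zip(inputs, t)) for t in zip(*inputs.values())]
assert batches == [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]

# List of dicts -> dict of lists (regroup per key), mirroring the
# encoded_inputs construction above.
regrouped = {k: [d[k] for d in batches] for k in batches[0]}
assert regrouped == inputs
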
Example #3
    def collect_activations(
            self,
            layer_names,
            dataset,
            data_format=None,
            batch_size=128,
            # output_directory='results',
            debug=False,
            **kwargs
    ):
        self._check_initialization()
        logger.debug('Preprocessing')
        # Copy the list with [:] instead of assigning it directly;
        # a plain assignment would alias the config's list, so adding
        # output features later would mutate the input feature list.
        features_to_load = self.model_definition['input_features'][:]

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=False,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        activations = predictor.batch_collect_activations(
            self.model,
            layer_names,
            dataset,
        )

        return activations
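
The comment about copying with [:] describes a classic aliasing pitfall: assigning a list binds a second name to the same object, while slicing creates a new one. A minimal sketch of the difference:

original = [{"name": "f1"}]
alias = original        # both names point at the same list object
copy = original[:]      # a shallow copy: a new list

alias.append({"name": "f2"})
assert len(original) == 2  # mutated through the alias
assert len(copy) == 1      # the slice copy is unaffected
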
Example #4
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Mixed input feature types, multiple output features
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(
            output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 2
            }
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##########################################
        # collect original predictions and weights
        ##########################################
        original_predictions_df, _ = ludwig_model.predict(
            dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        ####################################################
        # restore savedmodel, obtain predictions and weights
        ####################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(logits[of_name]['logits'],
                                         -1,
                                         name='predictions_{}'.format(of_name))
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string)

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check that weight values match the original model
        loaded_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        #  Are predictions identical to original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            loaded_prediction_df[predictions_column_name])

        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            restored_predictions.numpy().astype('str'))

        assert loaded_predictions_match and restored_predictions_match
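
The round trip in this test, save_savedmodel followed by tf.saved_model.load, rests on the standard TF2 SavedModel mechanism. A minimal standalone sketch with a plain Keras model (not Ludwig's wrapper) showing the save/load symmetry the weight assertions rely on:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
tf.saved_model.save(model, "/tmp/savedmodel_demo")

restored = tf.saved_model.load("/tmp/savedmodel_demo")
# The restored object exposes trainable_variables just like the original,
# which is what the weight comparisons above iterate over.
assert len(restored.trainable_variables) == len(model.trainable_variables)
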
Example #5
def collect_activations(model_path,
                        tensors,
                        data_csv=None,
                        data_hdf5=None,
                        split='test',
                        batch_size=128,
                        output_directory='results',
                        gpus=None,
                        gpu_fraction=1.0,
                        debug=False,
                        **kwargs):
    """Uses the pretrained model to collect the tensors corresponding to a
    datapoint in the dataset. Saves the tensors to the experiment directory

    :param model_path: Is the model from which the tensors will be collected
    :param tensors: List contaning the names of the tensors to collect
    :param data_csv: The CSV filepath which contains the datapoints from which
           the tensors are collected
    :param data_hdf5: The HDF5 file path if the CSV file path does not exist,
           an alternative source of providing the data to the model
    :param split: Split type
    :param batch_size: Batch size
    :param output_directory: Output directory
    :param gpus: The total number of GPUs that the model intends to use
    :param gpu_fraction: The fraction of each GPU that the model intends on
           using
    :param debug: To step through the stack traces and find possible errors
    :returns: None

    """
    # setup directories and file names
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    logger.info('Dataset path: {}'.format(
        data_csv if data_csv is not None else data_hdf5))
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(experiment_dir_name))
    logger.info('\n')

    train_set_metadata_fp = os.path.join(model_path,
                                         TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_fp)

    model, model_definition = load_model_and_definition(model_path)

    # collect activations
    print_boxed('COLLECT ACTIVATIONS')
    collected_tensors = model.collect_activations(dataset,
                                                  tensors,
                                                  batch_size,
                                                  gpus=gpus,
                                                  gpu_fraction=gpu_fraction)

    model.close_session()

    # saving
    os.mkdir(experiment_dir_name)
    save_tensors(collected_tensors, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
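
The while loop at the top of this function implements the suffix-based directory naming that later versions factor out into find_non_existing_dir_by_adding_suffix (used in Example #8 and Example #15). A minimal standalone sketch of the same logic:

import os

def next_free_dir(base: str) -> str:
    # Append _0, _1, ... until the candidate path does not exist yet.
    candidate, suffix = base, 0
    while os.path.exists(candidate):
        candidate = base + "_" + str(suffix)
        suffix += 1
    return candidate
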
Example #6
def test_savedmodel(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]

    output_features = [category_feature(vocab_size=2)]

    predictions_column_name = '{}_predictions'.format(
        output_features[0]['name'])

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # save savedmodel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ##########################################
    # collect original predictions and weights
    ##########################################
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)
    original_weights = deepcopy(ludwig_model.model.model.trainable_variables)
    ludwig_model.close()

    ###################################################
    # load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    loaded_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    loaded_weights = deepcopy(ludwig_model.model.model.trainable_variables)

    ####################################################
    # restore savedmodel, obtain predictions and weights
    ####################################################
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    restored_model = tf.saved_model.load(savedmodel_path)

    if_name = list(ludwig_model.model.model.input_features.keys())[0]
    of_name = list(ludwig_model.model.model.output_features.keys())[0]

    data_to_predict = {
        if_name: tf.convert_to_tensor(dataset.dataset[if_name], dtype=tf.int32)
    }

    logits = restored_model(data_to_predict, False, None)

    restored_predictions = tf.argmax(logits[of_name]['logits'],
                                     -1,
                                     name='predictions_{}'.format(of_name))
    restored_predictions = tf.map_fn(
        lambda idx: train_set_metadata[of_name]['idx2str'][idx],
        restored_predictions,
        dtype=tf.string)

    restored_weights = deepcopy(restored_model.trainable_variables)

    #########
    # Cleanup
    #########
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    # check for same number of weights as original model
    assert len(original_weights) == len(loaded_weights)
    assert len(original_weights) == len(restored_weights)

    # check that weight values match the original model
    loaded_weights_match = np.all([
        np.all(
            np.isclose(original_weights[i].numpy(), loaded_weights[i].numpy()))
        for i in range(len(original_weights))
    ])
    restored_weights_match = np.all([
        np.all(
            np.isclose(original_weights[i].numpy(),
                       restored_weights[i].numpy()))
        for i in range(len(original_weights))
    ])

    assert loaded_weights_match and restored_weights_match

    #  Are predictions identical to original ones?
    loaded_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        loaded_prediction_df[predictions_column_name])

    restored_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        restored_predictions.numpy().astype('str'))

    assert loaded_predictions_match and restored_predictions_match
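
Both SavedModel tests decode category predictions the same way: argmax over the logits, then a lookup through the idx2str vocabulary stored in the training set metadata. A minimal numpy sketch with a made-up vocabulary:

import numpy as np

idx2str = ["negative", "neutral", "positive"]          # hypothetical vocabulary
logits = np.array([[0.1, 0.3, 2.0],
                   [1.5, 0.2, 0.1]])                   # batch of two rows
indices = logits.argmax(-1)                            # -> [2, 0]
labels = [idx2str[i] for i in indices]
assert labels == ["positive", "negative"]
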
Example #7
def test_savedmodel(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]
    input_feature_name = input_features[0]['name']
    input_feature_tensor_name = '{}/{}_placeholder:0'.format(
        input_feature_name, input_feature_name)
    output_features = [category_feature(vocab_size=2)]
    output_feature_name = output_features[0]['name']
    output_feature_tensor_name = '{}/predictions_{}/predictions_{}:0'.format(
        output_feature_name, output_feature_name, output_feature_name)
    predictions_column_name = '{}_predictions'.format(output_feature_name)
    weight_tensor_name = '{}/fc_0/weights:0'.format(input_feature_name)

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # save savedmodel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ##############################
    # collect weight tensors names
    ##############################
    with ludwig_model.model.session as sess:
        all_variables = tf.compat.v1.trainable_variables()
        all_variables_names = [v.name for v in all_variables]
    ludwig_model.close()

    ###################################################
    # load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    ludwig_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    ludwig_weights = ludwig_model.model.collect_weights(all_variables_names)
    ludwig_model.close()

    #################################################
    # load savedmodel, obtain predictions and weights
    #################################################
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    with tf.compat.v1.Session() as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.SERVING],
                                   savedmodel_path)

        predictions = sess.run(output_feature_tensor_name,
                               feed_dict={
                                   input_feature_tensor_name:
                                   dataset.get(input_feature_name),
                               })

        savedmodel_prediction_df = pd.DataFrame(
            data=[
                train_set_metadata[output_feature_name]["idx2str"][p]
                for p in predictions
            ],
            columns=[predictions_column_name])

        savedmodel_weights = sess.run({n: n for n in all_variables_names})

    #########
    # Cleanup
    #########
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    for var in all_variables_names:
        print("Are the weights in {} identical?".format(var),
              np.all(ludwig_weights[var] == savedmodel_weights[var]))
    print(
        "Are loaded model predictions identical to original ones?",
        np.all(
            original_predictions_df[predictions_column_name] == \
            ludwig_prediction_df[predictions_column_name]
        )
    )
    print(
        "Are savedmodel predictions identical to loaded model?",
        np.all(
            ludwig_prediction_df[predictions_column_name] == \
            savedmodel_prediction_df[predictions_column_name]
        )
    )

    for var in all_variables_names:
        assert np.all(ludwig_weights[var] == savedmodel_weights[var])
    assert np.all(
        original_predictions_df[predictions_column_name] == \
        ludwig_prediction_df[predictions_column_name]
    )
    assert np.all(
        ludwig_prediction_df[predictions_column_name] == \
        savedmodel_prediction_df[predictions_column_name]
    )
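
One detail worth calling out in this TF1-style variant: sess.run accepts a dict of fetches and returns a dict with the same keys, so sess.run({n: n for n in all_variables_names}) pulls every variable by its graph name in a single call. A minimal graph-mode sketch:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

a = tf.constant(1.0, name="a")
b = tf.constant(2.0, name="b")

with tf.compat.v1.Session() as sess:
    # String fetches are resolved to tensors by name, and the dict
    # structure of the fetches is mirrored in the result.
    values = sess.run({n: n for n in ["a:0", "b:0"]})
    assert values == {"a:0": 1.0, "b:0": 2.0}
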
Example #8
def full_predict(model_path,
                 data_csv=None,
                 data_hdf5=None,
                 split=TEST,
                 batch_size=128,
                 skip_save_unprocessed_output=False,
                 skip_save_test_predictions=False,
                 skip_save_test_statistics=False,
                 output_directory='results',
                 evaluate_performance=True,
                 gpus=None,
                 gpu_fraction=1.0,
                 use_horovod=False,
                 debug=False,
                 **kwargs):
    if is_on_master():
        logger.info('Dataset path: {}'.format(
            data_csv if data_csv is not None else data_hdf5))
        logger.info('Model path: {}'.format(model_path))
        logger.info('')

    train_set_metadata_json_fp = os.path.join(model_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_json_fp,
        evaluate_performance)

    # run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod)

    prediction_results = predict(dataset, train_set_metadata, model,
                                 model_definition, batch_size,
                                 evaluate_performance, gpus, gpu_fraction,
                                 debug)
    model.close_session()

    if is_on_master():
        # setup directories and file names
        experiment_dir_name = find_non_existing_dir_by_adding_suffix(
            output_directory)

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_exp_dir = not (skip_save_unprocessed_output
                                     and skip_save_test_predictions
                                     and skip_save_test_statistics)
        if should_create_exp_dir:
            os.makedirs(experiment_dir_name)

        # postprocess
        postprocessed_output = postprocess(
            prediction_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        if not skip_save_test_predictions:
            save_prediction_outputs(postprocessed_output, experiment_dir_name)

        if evaluate_performance:
            print_test_results(prediction_results)
            if not skip_save_test_statistics:
                save_test_statistics(prediction_results, experiment_dir_name)

        logger.info('Saved to: {0}'.format(experiment_dir_name))
Example #9
def full_predict(model_path,
                 data_csv=None,
                 data_hdf5=None,
                 split='test',
                 batch_size=128,
                 skip_save_unprocessed_output=False,
                 output_directory='results',
                 evaluate_performance=True,
                 gpus=None,
                 gpu_fraction=1.0,
                 use_horovod=False,
                 debug=False,
                 **kwargs):
    # setup directories and file names
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    if is_on_master():
        logging.info('Dataset path: {}'.format(
            data_csv if data_csv is not None else data_hdf5))
        logging.info('Model path: {}'.format(model_path))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('')

    train_set_metadata_json_fp = os.path.join(model_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, train_set_metadata_json_fp,
        evaluate_performance)

    # run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod)

    prediction_results = predict(dataset, train_set_metadata, model,
                                 model_definition, batch_size,
                                 evaluate_performance, gpus, gpu_fraction,
                                 debug)
    model.close_session()

    if is_on_master():
        os.mkdir(experiment_dir_name)

        # postprocess
        postprocessed_output = postprocess(
            prediction_results, model_definition['output_features'],
            train_set_metadata, experiment_dir_name,
            skip_save_unprocessed_output or not is_on_master())

        save_prediction_outputs(postprocessed_output, experiment_dir_name)

        if evaluate_performance:
            print_prediction_results(prediction_results)
            save_prediction_statistics(prediction_results, experiment_dir_name)

        logging.info('Saved to: {0}'.format(experiment_dir_name))
Example #10
def test_torchscript(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # Mixed input feature types, multiple output features
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##########################################
        # collect original predictions and weights
        ##########################################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )

        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check that weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
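
The save/load pair here, save_torchscript followed by torch.jit.load, leans on PyTorch's TorchScript serialization. A minimal standalone sketch with a toy module (not Ludwig's wrapper):

import torch

class Doubler(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2

scripted = torch.jit.script(Doubler())
torch.jit.save(scripted, "/tmp/doubler.pt")

restored = torch.jit.load("/tmp/doubler.pt")
# The restored module is callable and keeps its parameters, which is what
# the weight and prediction comparisons above depend on.
assert torch.equal(restored(torch.ones(3)), torch.full((3,), 2.0))
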
Example #11
    def evaluate(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            skip_save_eval_stats=True,
            collect_predictions=False,
            collect_overall_stats=False,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(
                self.model.output_features,
                predictions,
                dataset,
                training_set_metadata
            )
            stats = {
                of_name: {**stats[of_name], **overall_stats[of_name]}
                # account for presence of 'combined' key
                if of_name in overall_stats else {**stats[of_name]}
                for of_name in stats
            }

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and
                    skip_save_predictions and
                    skip_save_eval_stats
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory
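
The dict comprehension that folds overall stats into the batch metrics is easiest to read on toy data; the 'combined' key has no per-feature overall stats, so it passes through unchanged. A minimal sketch with made-up numbers:

stats = {"category_out": {"loss": 0.3}, "combined": {"loss": 0.4}}
overall_stats = {"category_out": {"accuracy": 0.9}}

merged = {
    of_name: {**stats[of_name], **overall_stats[of_name]}
    if of_name in overall_stats else {**stats[of_name]}
    for of_name in stats
}
assert merged == {
    "category_out": {"loss": 0.3, "accuracy": 0.9},
    "combined": {"loss": 0.4},
}
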
Example #12
    def predict(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Copy the list with [:] instead of assigning it directly;
        # a plain assignment would alias the config's list, so adding
        # output features later would mutate the input feature list.
        features_to_load = self.model_definition['input_features'][:]

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=False,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        predictions = predictor.batch_predict(
            self.model,
            dataset,
        )

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and skip_save_predictions
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        logger.debug('Postprocessing')
        postproc_predictions = convert_predictions(
            postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            ),
            self.model.output_features,
            self.training_set_metadata,
            return_type=return_type
        )

        if is_on_master():
            if not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

                logger.info('Saved to: {0}'.format(output_directory))

        return postproc_predictions, output_directory
Example #13
def test_torchscript_preproc_timeseries_alternative_type(
        tmpdir, csv_filename, padding, fill_value):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    feature = timeseries_feature(
        preprocessing={
            "padding": padding,
            "timeseries_length_limit": 4,
            "fill_value": "1.0",
        },
        max_len=7,
    )
    input_features = [
        feature,
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features,
                                           output_features,
                                           data_csv_path,
                                           nan_percent=0.2)

    # Initialize Ludwig model
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path)

    # Obtain preprocessed inputs from Python model
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df,
                                                      config,
                                                      load_paths=True)

    def transform_timeseries_from_str_list_to_tensor_list(timeseries_list):
        timeseries = []
        for timeseries_str in timeseries_list:
            timeseries.append(
                torch.tensor([float(x) for x in timeseries_str.split()]))
        return timeseries

    inputs[feature[NAME]] = transform_timeseries_from_str_list_to_tensor_list(
        inputs[feature[NAME]])

    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Check that preproc_inputs is the same as preproc_inputs_expected.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        feature_name = feature_name_expected[:feature_name_expected.rfind("_")]  # remove proc suffix
        assert feature_name in preproc_inputs.keys(), f'feature "{feature_name}" not found.'

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(
            feature_values, feature_values_expected
        ), f'feature "{feature_name}" value mismatch.'
Example #14
def test_torchscript_preproc_vector_alternative_type(tmpdir, csv_filename,
                                                     vector_type):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    feature = vector_feature()
    input_features = [
        feature,
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    # Initialize Ludwig model
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path)

    # Obtain preprocessed inputs from Python model
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df,
                                                      config,
                                                      load_paths=True)

    def transform_vector_list(vector_list, vector_type):
        vectors = []
        for vector_str in vector_list:
            vectors.append(torch.tensor([float(x)
                                         for x in vector_str.split()]))

        if vector_type == torch.Tensor:
            vectors = torch.stack(vectors)
        return vectors

    inputs[feature[NAME]] = transform_vector_list(inputs[feature[NAME]],
                                                  vector_type)

    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Check that preproc_inputs is the same as preproc_inputs_expected.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        feature_name = feature_name_expected[:feature_name_expected.rfind("_")]  # remove proc suffix
        if feature_name not in preproc_inputs.keys():
            continue

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(
            feature_values,
            feature_values_expected), f"feature: {feature_name}"
Example #15
def collect_activations(
        model_path,
        tensors,
        data_csv=None,
        data_hdf5=None,
        split=TEST,
        batch_size=128,
        output_directory='results',
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        debug=False,
        **kwargs
):
    """Uses the pretrained model to collect the tensors corresponding to a
    datapoint in the dataset. Saves the tensors to the experiment directory

    :param model_path: Is the model from which the tensors will be collected
    :param tensors: List contaning the names of the tensors to collect
    :param data_csv: The CSV filepath which contains the datapoints from which
           the tensors are collected
    :param data_hdf5: The HDF5 file path if the CSV file path does not exist,
           an alternative source of providing the data to the model
    :param split: Split type
    :param batch_size: Batch size
    :param output_directory: Output directory
    :param gpus: The total number of GPUs that the model intends to use
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
           per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
           multithreading parallelism to improve performance at the cost of
           determinism.
    :param debug: To step through the stack traces and find possible errors
    :returns: None

    """
    # setup directories and file names
    experiment_dir_name = find_non_existing_dir_by_adding_suffix(output_directory)

    logger.info('Dataset path: {}'.format(
        data_csv if data_csv is not None else data_hdf5)
    )
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(experiment_dir_name))
    logger.info('\n')

    train_set_metadata_fp = os.path.join(
        model_path,
        TRAIN_SET_METADATA_FILE_NAME
    )

    # preprocessing
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path,
        split,
        data_csv,
        data_hdf5,
        train_set_metadata_fp
    )

    model, model_definition = load_model_and_definition(model_path,
                                                        gpus=gpus,
                                                        gpu_memory_limit=gpu_memory_limit,
                                                        allow_parallel_threads=allow_parallel_threads)

    # collect activations
    print_boxed('COLLECT ACTIVATIONS')
    collected_tensors = model.collect_activations(
        dataset,
        tensors,
        batch_size
    )

    # saving
    os.makedirs(experiment_dir_name)
    save_tensors(collected_tensors, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
Example #16
    def evaluate(self,
                 dataset=None,
                 data_format=None,
                 batch_size=128,
                 skip_save_unprocessed_output=True,
                 skip_save_predictions=True,
                 skip_save_eval_stats=True,
                 collect_predictions=False,
                 collect_overall_stats=False,
                 output_directory='results',
                 return_type=pd.DataFrame,
                 debug=False,
                 **kwargs):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Concatenating with + builds a new list, so the config's own
        # input and output feature lists are not mutated here.
        features_to_load = self.model_definition['input_features'] + \
                           self.model_definition['output_features']

        # preprocessing
        # todo refactoring: maybe replace the self.model_definition parameter
        #  here with features_to_load
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(batch_size=batch_size,
                              horovod=self._horovod,
                              debug=debug)
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(self.model.output_features,
                                                    predictions, dataset,
                                                    training_set_metadata)
            stats = {
                of_name: {
                    **stats[of_name],
                    **overall_stats[of_name]
                }
                # account for presence of 'combined' key
                if of_name in overall_stats else {
                    **stats[of_name]
                }
                for of_name in stats
            }

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (skip_save_unprocessed_output
                                         and skip_save_predictions
                                         and skip_save_eval_stats)
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions, output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory