def test_torchscript_preproc_with_nans(tmpdir, csv_filename, feature):
    """Check that Torchscript preprocessing matches the Python pipeline on data containing NaNs.

    Trains a tiny model on data with 20% missing values, then compares the output
    of the scripted preprocessor against ``preprocess_for_prediction``.
    """
    data_csv_path = os.path.join(tmpdir, csv_filename)
    input_features = [feature]
    output_features = [binary_feature()]

    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path, nan_percent=0.2)

    # Train a Ludwig model and export its Torchscript inference module.
    ludwig_model, script_module = initialize_torchscript_module(tmpdir, config, backend, training_data_csv_path)

    # Reference: preprocessed inputs produced by the Python pipeline.
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df, config, load_paths=True)
    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Compare scripted preprocessing against the Python reference, feature by feature.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        # Processed column names carry a trailing "_<suffix>"; strip it to recover the feature name.
        feature_name = feature_name_expected[: feature_name_expected.rfind("_")]
        # Not every processed column corresponds to a Torchscript preprocessor output; skip those.
        if feature_name not in preproc_inputs.keys():
            continue
        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(feature_values, feature_values_expected), f"feature: {feature_name}"
def validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path, tolerance=1e-8):
    """Train a Ludwig model, script it, and assert the scripted outputs match Python predictions.

    NOTE(review): ``tolerance`` is accepted but never used — comparison relies on
    ``utils.is_all_close``'s own thresholds; confirm whether it should be threaded through.
    """
    # Train Ludwig (Pythonic) model:
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir,
        config,
        backend,
        training_data_csv_path,
    )

    # Predictions from the eager Python model serve as the reference.
    preds_dict, _ = ludwig_model.predict(dataset=training_data_csv_path, return_type=dict)

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df, config, load_paths=True)
    outputs = script_module(inputs)

    # TODO: these are the only outputs we provide from Torchscript for now
    ts_outputs = {PREDICTIONS, PROBABILITIES, LOGITS}

    # Compare results from Python trained model against Torchscript
    for feature_name, feature_outputs_expected in preds_dict.items():
        assert feature_name in outputs
        feature_outputs = outputs[feature_name]

        for output_name, output_values_expected in feature_outputs_expected.items():
            if output_name not in ts_outputs:
                continue
            assert output_name in feature_outputs
            output_values = feature_outputs[output_name]
            # Scripted inference tensors must be detached from autograd.
            assert utils.has_no_grad(output_values), f'"{feature_name}.{output_name}" tensors have gradients'
            assert utils.is_all_close(
                output_values, output_values_expected
            ), f'"{feature_name}.{output_name}" tensors are not close to ludwig model'
def test_torchscript(csv_filename, should_load_model):
    """End-to-end check that a saved Torchscript model reproduces the weights and
    predictions of the original (and optionally re-loaded) Ludwig model.

    Fix: the trainer section of the config used the raw key ``"training"`` while
    every other test in this file uses the shared ``TRAINER`` constant; made
    consistent.
    """
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # One input feature of every supported type.
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            # NOTE(review): date_feature() appears twice — presumably intentional
            # to cover multiple date columns; confirm it is not an accidental dup.
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        # Predictions are checked on the first output feature (categorical).
        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        # Use the shared TRAINER constant, consistent with the other tests in this file.
        config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ###############################################
        # collect reference predictions and weights
        ###############################################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)
        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )
        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        # Build the Torchscript input dict from the already-preprocessed columns.
        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )
        # Map predicted indices back to label strings for comparison with the dataframe.
        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################
        # Check to weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])
        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
def test_torchscript_preproc_timeseries_alternative_type(tmpdir, csv_filename, padding, fill_value):
    """Verify Torchscript preprocessing of timeseries supplied as lists of tensors
    (rather than whitespace-delimited strings) matches the Python pipeline.

    Fix: the parametrized ``fill_value`` argument was previously ignored — the
    preprocessing config hard-coded ``"fill_value": "1.0"`` — so the fill-value
    parametrization never actually varied. It is now threaded through.
    """
    data_csv_path = os.path.join(tmpdir, csv_filename)
    feature = timeseries_feature(
        preprocessing={
            "padding": padding,
            "timeseries_length_limit": 4,
            # Was hard-coded to "1.0", silently discarding the test parameter.
            "fill_value": fill_value,
        },
        max_len=7,
    )
    input_features = [feature]
    output_features = [binary_feature()]

    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    # 20% NaNs so that padding/fill behavior is actually exercised.
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path, nan_percent=0.2)

    # Initialize Ludwig model
    ludwig_model, script_module = initialize_torchscript_module(tmpdir, config, backend, training_data_csv_path)

    # Obtain preprocessed inputs from Python model
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df, config, load_paths=True)

    def transform_timeseries_from_str_list_to_tensor_list(timeseries_list):
        """Convert "1.0 2.0 ..." strings to a list of 1-D float tensors."""
        timeseries = []
        for timeseries_str in timeseries_list:
            timeseries.append(torch.tensor([float(x) for x in timeseries_str.split()]))
        return timeseries

    # Feed the timeseries as tensors instead of strings — the alternative input type under test.
    inputs[feature[NAME]] = transform_timeseries_from_str_list_to_tensor_list(inputs[feature[NAME]])

    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Check that preproc_inputs is the same as preproc_inputs_expected.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        # Processed column names carry a trailing "_<suffix>"; strip it.
        feature_name = feature_name_expected[: feature_name_expected.rfind("_")]
        assert feature_name in preproc_inputs.keys(), f'feature "{feature_name}" not found.'

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(feature_values, feature_values_expected), f'feature "{feature_name}" value mismatch.'
def test_torchscript_preproc_vector_alternative_type(tmpdir, csv_filename, vector_type):
    """Verify Torchscript preprocessing of vector inputs supplied either as a list of
    tensors or as a single stacked tensor (``vector_type``) matches the Python pipeline.
    """
    data_csv_path = os.path.join(tmpdir, csv_filename)
    feature = vector_feature()
    input_features = [feature]
    output_features = [binary_feature()]

    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    training_data_csv_path = generate_data(input_features, output_features, data_csv_path)

    # Train a Ludwig model and export its Torchscript inference module.
    ludwig_model, script_module = initialize_torchscript_module(tmpdir, config, backend, training_data_csv_path)

    # Reference: preprocessed inputs produced by the Python pipeline.
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df, config, load_paths=True)

    def transform_vector_list(vector_list, vector_type):
        """Parse "1.0 2.0 ..." strings into tensors; stack into one 2-D tensor if requested."""
        vectors = [torch.tensor([float(x) for x in vector_str.split()]) for vector_str in vector_list]
        if vector_type == torch.Tensor:
            vectors = torch.stack(vectors)
        return vectors

    # Replace the string column with the alternative input representation under test.
    inputs[feature[NAME]] = transform_vector_list(inputs[feature[NAME]], vector_type)

    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Compare scripted preprocessing against the Python reference, feature by feature.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        # Processed column names carry a trailing "_<suffix>"; strip it.
        feature_name = feature_name_expected[: feature_name_expected.rfind("_")]
        # Not every processed column corresponds to a Torchscript preprocessor output; skip those.
        if feature_name not in preproc_inputs.keys():
            continue
        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(feature_values, feature_values_expected), f"feature: {feature_name}"