def test_torchscript_preproc_with_nans(tmpdir, csv_filename, feature):
    """Compare TorchScript preprocessing with Python preprocessing on data containing NaNs.

    A dataset is generated for the single input ``feature`` plus one binary
    output with ``nan_percent=0.2``. The same CSV is then preprocessed twice:
    once through ``preprocess_for_prediction`` and once through the scripted
    module's ``preprocessor_forward``. Every feature present in both results
    must be "all close".
    """
    input_features = [feature]
    output_features = [binary_feature()]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }

    training_data_csv_path = generate_data(
        input_features,
        output_features,
        os.path.join(tmpdir, csv_filename),
        nan_percent=0.2,
    )

    # Build the Ludwig model together with its scripted counterpart.
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path
    )

    # Reference result: preprocessing performed by the Python model.
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    # Result under test: preprocessing performed by the scripted module.
    raw_df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(raw_df, config, load_paths=True)
    preproc_inputs = script_module.preprocessor_forward(inputs)

    for proc_column, expected_values in preproc_inputs_expected.dataset.items():
        # Drop everything from the last underscore on (the proc suffix).
        feature_name = proc_column[: proc_column.rfind("_")]
        # Columns the scripted preprocessor did not emit are skipped, not failed.
        if feature_name in preproc_inputs:
            actual_values = preproc_inputs[feature_name]
            assert utils.is_all_close(actual_values, expected_values), f"feature: {feature_name}"
def get_input_tensors(model: LudwigModel, input_set: pd.DataFrame) -> List[Variable]:
    """Preprocess and encode ``input_set``, returning gradient-tracking encoder outputs.

    :param model: trained model whose config, metadata, backend, callbacks and
        encoders are used.
    :param input_set: raw input rows to preprocess.
    :return: one ``Variable`` (``requires_grad=True``) per encoded input
        feature, in the order produced by ``model.model.encode``.
    """
    # Raw input data -> preprocessed dataset.
    dataset, _ = preprocess_for_prediction(
        model.config,
        dataset=input_set,
        training_set_metadata=model.training_set_metadata,
        data_format="auto",
        split="full",
        include_outputs=False,
        backend=model.backend,
        callbacks=model.callbacks,
    )

    # One tuple of batch-sized chunks per input feature; chunking keeps GPU
    # memory usage under control.
    inputs = {}
    for name, feature in model.model.input_features.items():
        column = torch.from_numpy(dataset.dataset[feature.proc_column])
        inputs[name] = column.split(model.config["trainer"]["batch_size"])

    # Dict of chunk tuples -> list of per-batch dicts.
    input_batches = [dict(zip(inputs, chunks)) for chunks in zip(*inputs.values())]

    # Encode into embedding space so the result is differentiable; otherwise
    # category and other embedded features could not be explained with
    # gradient based methods.
    output_batches = []
    for batch in input_batches:
        on_device = {key: tensor.to(DEVICE) for key, tensor in batch.items()}
        encoded = model.model.encode(on_device)
        # Keep only the encoder output; extra sequence-decoding state is dropped.
        output_batches.append(
            {key: value["encoder_output"].detach().cpu() for key, value in encoded.items()}
        )

    # List of per-batch dicts -> one concatenated tensor per feature.
    encoded_inputs = {
        key: torch.cat([batch_out[key] for batch_out in output_batches])
        for key in output_batches[0]
    }

    # Wrap in Variables so torch tracks the gradient.
    # TODO(travis): this won't work for text decoders, but we don't support explanations for those yet
    return [Variable(tensor, requires_grad=True) for tensor in encoded_inputs.values()]
def collect_activations(
        self,
        layer_names,
        dataset,
        data_format=None,
        batch_size=128,
        debug=False,
        **kwargs
):
    """Preprocess ``dataset`` and collect activations for ``layer_names``.

    :param layer_names: forwarded unchanged to
        ``Predictor.batch_collect_activations``.
    :param dataset: data source handed to ``preprocess_for_prediction``.
    :param data_format: format hint handed to ``preprocess_for_prediction``.
    :param batch_size: batch size given to the ``Predictor``.
    :param debug: debug flag given to the ``Predictor``.
    :param kwargs: accepted for call compatibility; not used here.
    :return: whatever ``Predictor.batch_collect_activations`` returns.
    """
    self._check_initialization()

    logger.debug('Preprocessing')
    # Output columns are not loaded (include_outputs=False). The metadata
    # returned by preprocessing is not needed in this method.
    dataset, _ = preprocess_for_prediction(
        self.model_definition,
        dataset=dataset,
        data_format=data_format,
        training_set_metadata=self.training_set_metadata,
        include_outputs=False,
    )

    logger.debug('Predicting')
    predictor = Predictor(
        batch_size=batch_size, horovod=self._horovod, debug=debug
    )
    return predictor.batch_collect_activations(
        self.model,
        layer_names,
        dataset,
    )
def test_savedmodel(csv_filename, should_load_model):
    """Check that Ludwig save/load and TF SavedModel export preserve a trained model.

    A model with many input and output feature types is trained for two
    epochs, saved in Ludwig format and exported as a SavedModel. The weights
    of the reloaded Ludwig model and of the restored SavedModel must match
    the original ones, and so must the predictions for the first
    (categorical) output feature.

    :param csv_filename: file name used for the generated dataset.
    :param should_load_model: when true, the saved Ludwig model is reloaded
        before the reference predictions and weights are taken.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        #######
        # Setup
        #######
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature(),
        ]
        first_output_name = output_features[0]['name']
        predictions_column_name = f'{first_output_name}_predictions'

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {'epochs': 2},
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # Save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(tmpdir, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ####################################
        # Optionally reload the Ludwig model
        ####################################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ###################################
        # Reference predictions and weights
        ###################################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # Save SavedModel
        #################
        savedmodel_path = os.path.join(tmpdir, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # Load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        ####################################################
        # Restore SavedModel, obtain predictions and weights
        ####################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME
        )
        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Only the first output feature (categorical) is checked for correctness.
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {}
        for name, feature in ludwig_model.model.input_features.items():
            data_to_predict[name] = tf.convert_to_tensor(
                dataset.dataset[feature.proc_column],
                dtype=feature.get_input_dtype(),
            )

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(
            logits[of_name]['logits'], -1, name=f'predictions_{of_name}'
        )
        # Map class indices back to their string labels.
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string,
        )

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        for saved_path in (ludwigmodel_path, savedmodel_path):
            shutil.rmtree(saved_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Same number of weights as the original model.
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # Weight values of the reloaded Ludwig model, compared in given order.
        loaded_weights_match = np.all([
            np.all(np.isclose(original.numpy(), loaded.numpy()))
            for original, loaded in zip(original_weights, loaded_weights)
        ])

        # The SavedModel weights are compared after sorting both lists by name.
        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(np.isclose(original.numpy(), restored.numpy()))
            for original, restored in zip(original_weights, restored_weights)
        ])

        assert loaded_weights_match and restored_weights_match

        # Are predictions identical to the original ones?
        original_column = original_predictions_df[predictions_column_name]
        loaded_predictions_match = np.all(
            original_column == loaded_prediction_df[predictions_column_name]
        )
        restored_predictions_match = np.all(
            original_column == restored_predictions.numpy().astype('str')
        )

        assert loaded_predictions_match and restored_predictions_match
def collect_activations(model_path, tensors, data_csv=None, data_hdf5=None,
                        split='test', batch_size=128,
                        output_directory='results', gpus=None,
                        gpu_fraction=1.0, debug=False, **kwargs):
    """Collect the named tensors from a pretrained model and save them to disk.

    The dataset is preprocessed, the model at ``model_path`` is loaded, the
    requested tensors are collected and written into a freshly created
    output directory.

    :param model_path: Path of the model from which the tensors are collected
    :param tensors: List containing the names of the tensors to collect
    :param data_csv: The CSV file path containing the datapoints from which
           the tensors are collected
    :param data_hdf5: The HDF5 file path, an alternative source of data when
           no CSV file path is given
    :param split: Split type
    :param batch_size: Batch size
    :param output_directory: Output directory; if it already exists a numeric
           suffix (``_0``, ``_1``, ...) is appended until the name is unused
    :param gpus: GPU setting forwarded to the model
    :param gpu_fraction: GPU fraction setting forwarded to the model
    :param debug: Accepted for interface compatibility; not used in this
           function
    :returns: None
    """
    # Pick the first directory name that does not exist yet.
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    dataset_path = data_hdf5 if data_csv is None else data_csv
    logger.info('Dataset path: {}'.format(dataset_path))
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(experiment_dir_name))
    logger.info('\n')

    # Preprocessing
    train_set_metadata_fp = os.path.join(model_path, TRAIN_SET_METADATA_FILE_NAME)
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path,
        split,
        data_csv,
        data_hdf5,
        train_set_metadata_fp
    )

    model, _ = load_model_and_definition(model_path)

    # Collect activations
    print_boxed('COLLECT ACTIVATIONS')
    collected_tensors = model.collect_activations(
        dataset,
        tensors,
        batch_size,
        gpus=gpus,
        gpu_fraction=gpu_fraction
    )
    model.close_session()

    # Saving
    os.mkdir(experiment_dir_name)
    save_tensors(collected_tensors, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
def test_savedmodel(csv_filename):
    """Check that Ludwig save/load and TF SavedModel export preserve a trained model.

    Uses one sequence input (``parallel_cnn`` encoder) and one category
    output. After two training epochs the model is saved in both formats;
    weights and predictions of the reloaded Ludwig model and of the restored
    SavedModel must match the original ones.

    :param csv_filename: path of the CSV file the generated data is written
        to; the saved models are placed in its directory.
    """
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sequence_input = sequence_feature()
    sequence_input['encoder'] = 'parallel_cnn'
    input_features = [sequence_input]
    output_features = [category_feature(vocab_size=2)]

    predictions_column_name = '{}_predictions'.format(output_features[0]['name'])

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2},
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )

    ###################
    # Save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # Save SavedModel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ###################################
    # Reference predictions and weights
    ###################################
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)
    original_weights = deepcopy(ludwig_model.model.model.trainable_variables)
    ludwig_model.close()

    ###################################################
    # Load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    loaded_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    loaded_weights = deepcopy(ludwig_model.model.model.trainable_variables)

    ####################################################
    # Restore SavedModel, obtain predictions and weights
    ####################################################
    train_set_metadata_json_fp = os.path.join(
        ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME
    )

    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False
    )

    restored_model = tf.saved_model.load(savedmodel_path)

    inner_model = ludwig_model.model.model
    if_name = list(inner_model.input_features.keys())[0]
    of_name = list(inner_model.output_features.keys())[0]

    data_to_predict = {
        if_name: tf.convert_to_tensor(dataset.dataset[if_name], dtype=tf.int32)
    }

    logits = restored_model(data_to_predict, False, None)

    restored_predictions = tf.argmax(
        logits[of_name]['logits'],
        -1,
        name='predictions_{}'.format(of_name)
    )
    # Map class indices back to their string labels.
    restored_predictions = tf.map_fn(
        lambda idx: train_set_metadata[of_name]['idx2str'][idx],
        restored_predictions,
        dtype=tf.string
    )

    restored_weights = deepcopy(restored_model.trainable_variables)

    #########
    # Cleanup
    #########
    for saved_path in (ludwigmodel_path, savedmodel_path):
        shutil.rmtree(saved_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    # Same number of weights as the original model.
    assert len(original_weights) == len(loaded_weights)
    assert len(original_weights) == len(restored_weights)

    # Weight values must match the original model, position by position.
    loaded_weights_match = np.all([
        np.all(np.isclose(original.numpy(), loaded.numpy()))
        for original, loaded in zip(original_weights, loaded_weights)
    ])
    restored_weights_match = np.all([
        np.all(np.isclose(original.numpy(), restored.numpy()))
        for original, restored in zip(original_weights, restored_weights)
    ])

    assert loaded_weights_match and restored_weights_match

    # Are predictions identical to the original ones?
    original_column = original_predictions_df[predictions_column_name]
    loaded_predictions_match = np.all(
        original_column == loaded_prediction_df[predictions_column_name]
    )
    restored_predictions_match = np.all(
        original_column == restored_predictions.numpy().astype('str')
    )

    assert loaded_predictions_match and restored_predictions_match
def test_savedmodel(csv_filename):
    """Check that a TF1-session SavedModel export matches the reloaded Ludwig model.

    Uses one sequence input (``parallel_cnn`` encoder) and one category
    output. After training, the model is saved in Ludwig format and as a
    SavedModel. The reloaded Ludwig model and the SavedModel loaded into a
    fresh session must agree on all trainable variables and on predictions.
    The reloaded model's predictions must also equal the original ones.

    :param csv_filename: path of the CSV file the generated data is written
        to; the saved models are placed in its directory.
    """
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sequence_input = sequence_feature()
    sequence_input['encoder'] = 'parallel_cnn'
    input_features = [sequence_input]
    input_feature_name = input_features[0]['name']
    input_feature_tensor_name = '{}/{}_placeholder:0'.format(
        input_feature_name,
        input_feature_name
    )

    output_features = [category_feature(vocab_size=2)]
    output_feature_name = output_features[0]['name']
    output_feature_tensor_name = '{}/predictions_{}/predictions_{}:0'.format(
        output_feature_name,
        output_feature_name,
        output_feature_name
    )
    predictions_column_name = '{}_predictions'.format(output_feature_name)
    # Not referenced again in this test; kept as in the original.
    weight_tensor_name = '{}/fc_0/weights:0'.format(input_feature_name)

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2},
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # Save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # Save SavedModel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    #####################################
    # Collect trainable variable names
    #####################################
    with ludwig_model.model.session:
        all_variables_names = [
            variable.name for variable in tf.compat.v1.trainable_variables()
        ]
    ludwig_model.close()

    ###################################################
    # Load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    ludwig_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    ludwig_weights = ludwig_model.model.collect_weights(all_variables_names)
    ludwig_model.close()

    #################################################
    # Load SavedModel, obtain predictions and weights
    #################################################
    train_set_metadata_json_fp = os.path.join(
        ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME
    )

    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False
    )

    with tf.compat.v1.Session() as sess:
        tf.saved_model.loader.load(
            sess, [tf.saved_model.SERVING], savedmodel_path
        )

        predictions = sess.run(
            output_feature_tensor_name,
            feed_dict={
                input_feature_tensor_name: dataset.get(input_feature_name),
            }
        )

        # Map predicted class indices back to their string labels.
        idx2str = train_set_metadata[output_feature_name]["idx2str"]
        savedmodel_prediction_df = pd.DataFrame(
            data=[idx2str[p] for p in predictions],
            columns=[predictions_column_name]
        )

        savedmodel_weights = sess.run({n: n for n in all_variables_names})

    #########
    # Cleanup
    #########
    for saved_path in (ludwigmodel_path, savedmodel_path):
        shutil.rmtree(saved_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################
    weights_identical = {
        var: np.all(ludwig_weights[var] == savedmodel_weights[var])
        for var in all_variables_names
    }
    loaded_matches_original = np.all(
        original_predictions_df[predictions_column_name]
        == ludwig_prediction_df[predictions_column_name]
    )
    savedmodel_matches_loaded = np.all(
        ludwig_prediction_df[predictions_column_name]
        == savedmodel_prediction_df[predictions_column_name]
    )

    # Report everything first, then assert, so a failure still shows the full picture.
    for var in all_variables_names:
        print("Are the weights in {} identical?".format(var),
              weights_identical[var])
    print(
        "Are loaded model predictions identical to original ones?",
        loaded_matches_original
    )
    print(
        "Are savedmodel predictions identical to loaded model?",
        savedmodel_matches_loaded
    )

    for var in all_variables_names:
        assert weights_identical[var]
    assert loaded_matches_original
    assert savedmodel_matches_loaded
def full_predict(model_path, data_csv=None, data_hdf5=None, split=TEST,
                 batch_size=128, skip_save_unprocessed_output=False,
                 skip_save_test_predictions=False,
                 skip_save_test_statistics=False,
                 output_directory='results', evaluate_performance=True,
                 gpus=None, gpu_fraction=1.0, use_horovod=False, debug=False,
                 **kwargs):
    """Preprocess data, run a saved model on it and save the results on the master.

    :param model_path: directory of the saved model; the training set
        metadata file is looked up inside it.
    :param data_csv: CSV data source (logged in preference to ``data_hdf5``).
    :param data_hdf5: HDF5 data source.
    :param split: split forwarded to ``preprocess_for_prediction``.
    :param batch_size: batch size forwarded to ``predict``.
    :param skip_save_unprocessed_output: forwarded to ``postprocess``.
    :param skip_save_test_predictions: when true, predictions are not saved.
    :param skip_save_test_statistics: when true, statistics are not saved.
    :param output_directory: base name handed to
        ``find_non_existing_dir_by_adding_suffix``.
    :param evaluate_performance: forwarded to preprocessing and prediction;
        also enables printing (and optionally saving) of test statistics.
    :param gpus: forwarded to ``predict``.
    :param gpu_fraction: forwarded to ``predict``.
    :param use_horovod: forwarded to ``load_model_and_definition``.
    :param debug: forwarded to ``predict``.
    :param kwargs: accepted for call compatibility; not used here.
    """
    on_master = is_on_master()
    if on_master:
        dataset_path = data_hdf5 if data_csv is None else data_csv
        logger.info('Dataset path: {}'.format(dataset_path))
        logger.info('Model path: {}'.format(model_path))
        logger.info('')

    # Preprocessing
    train_set_metadata_json_fp = os.path.join(
        model_path, TRAIN_SET_METADATA_FILE_NAME
    )
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path,
        split,
        data_csv,
        data_hdf5,
        train_set_metadata_json_fp,
        evaluate_performance
    )

    # Run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod
    )

    prediction_results = predict(
        dataset,
        train_set_metadata,
        model,
        model_definition,
        batch_size,
        evaluate_performance,
        gpus,
        gpu_fraction,
        debug
    )
    model.close_session()

    # Everything below touches the output directory and runs on the master only.
    if not is_on_master():
        return

    experiment_dir_name = find_non_existing_dir_by_adding_suffix(
        output_directory
    )

    # When every kind of saving is skipped the directory would stay empty,
    # so it is not created.
    skip_flags = (
        skip_save_unprocessed_output,
        skip_save_test_predictions,
        skip_save_test_statistics,
    )
    if not all(skip_flags):
        os.makedirs(experiment_dir_name)

    # Postprocess
    postprocessed_output = postprocess(
        prediction_results,
        model_definition['output_features'],
        train_set_metadata,
        experiment_dir_name,
        skip_save_unprocessed_output or not is_on_master()
    )

    if not skip_save_test_predictions:
        save_prediction_outputs(postprocessed_output, experiment_dir_name)

    if evaluate_performance:
        print_test_results(prediction_results)
        if not skip_save_test_statistics:
            save_test_statistics(prediction_results, experiment_dir_name)

    logger.info('Saved to: {0}'.format(experiment_dir_name))
def full_predict(model_path, data_csv=None, data_hdf5=None, split='test',
                 batch_size=128, skip_save_unprocessed_output=False,
                 output_directory='results', evaluate_performance=True,
                 gpus=None, gpu_fraction=1.0, use_horovod=False, debug=False,
                 **kwargs):
    """Preprocess data, run a saved model on it and save the results on the master.

    :param model_path: directory of the saved model; the training set
        metadata file is looked up inside it.
    :param data_csv: CSV data source (logged in preference to ``data_hdf5``).
    :param data_hdf5: HDF5 data source.
    :param split: split forwarded to ``preprocess_for_prediction``.
    :param batch_size: batch size forwarded to ``predict``.
    :param skip_save_unprocessed_output: forwarded to ``postprocess``.
    :param output_directory: output directory; if it already exists a numeric
        suffix (``_0``, ``_1``, ...) is appended until the name is unused.
    :param evaluate_performance: forwarded to preprocessing and prediction;
        also enables printing and saving of prediction statistics.
    :param gpus: forwarded to ``predict``.
    :param gpu_fraction: forwarded to ``predict``.
    :param use_horovod: forwarded to ``load_model_and_definition``.
    :param debug: forwarded to ``predict``.
    :param kwargs: accepted for call compatibility; not used here.
    """
    # Pick the first directory name that does not exist yet.
    experiment_dir_name = output_directory
    suffix = 0
    while os.path.exists(experiment_dir_name):
        experiment_dir_name = output_directory + '_' + str(suffix)
        suffix += 1

    if is_on_master():
        dataset_path = data_hdf5 if data_csv is None else data_csv
        logging.info('Dataset path: {}'.format(dataset_path))
        logging.info('Model path: {}'.format(model_path))
        logging.info('Output path: {}'.format(experiment_dir_name))
        logging.info('')

    # Preprocessing
    train_set_metadata_json_fp = os.path.join(
        model_path, TRAIN_SET_METADATA_FILE_NAME
    )
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path,
        split,
        data_csv,
        data_hdf5,
        train_set_metadata_json_fp,
        evaluate_performance
    )

    # Run the prediction
    if is_on_master():
        print_boxed('LOADING MODEL')
    model, model_definition = load_model_and_definition(
        model_path, use_horovod=use_horovod
    )

    prediction_results = predict(
        dataset,
        train_set_metadata,
        model,
        model_definition,
        batch_size,
        evaluate_performance,
        gpus,
        gpu_fraction,
        debug
    )
    model.close_session()

    # Everything below writes to the output directory and runs on the master only.
    if not is_on_master():
        return

    os.mkdir(experiment_dir_name)

    # Postprocess
    postprocessed_output = postprocess(
        prediction_results,
        model_definition['output_features'],
        train_set_metadata,
        experiment_dir_name,
        skip_save_unprocessed_output or not is_on_master()
    )

    save_prediction_outputs(postprocessed_output, experiment_dir_name)

    if evaluate_performance:
        print_prediction_results(prediction_results)
        save_prediction_statistics(prediction_results, experiment_dir_name)

    logging.info('Saved to: {0}'.format(experiment_dir_name))
def test_torchscript(csv_filename, should_load_model):
    """Check that Ludwig save/load and TorchScript export preserve a trained model.

    A model with many input and output feature types is trained for two
    epochs, saved in Ludwig format and exported as TorchScript. The
    parameters of the reloaded Ludwig model and of the restored TorchScript
    module must be close to the original ones, and the predictions for the
    first (categorical) output feature must be identical.

    :param csv_filename: file name used for the generated dataset.
    :param should_load_model: when true, the saved Ludwig model is reloaded
        before the reference predictions and weights are taken.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        #######
        # Setup
        #######
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]
        first_output_name = output_features[0]["name"]
        predictions_column_name = f"{first_output_name}_predictions"

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "training": {"epochs": 2},
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # Save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(tmpdir, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ####################################
        # Optionally reload the Ludwig model
        ####################################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ###################################
        # Reference predictions and weights
        ###################################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        ##################
        # Save TorchScript
        ##################
        torchscript_path = os.path.join(tmpdir, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # Load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # Restore TorchScript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME
        )
        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Only the first output feature (categorical) is checked for correctness.
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {}
        for name, feature in ludwig_model.model.input_features.items():
            data_to_predict[name] = torch.from_numpy(dataset.dataset[feature.proc_column])

        # Get predictions from the restored TorchScript module.
        logits = restored_model(data_to_predict)
        of_logits = output_feature_utils.get_output_feature_tensor(logits, of_name, "logits")
        restored_predictions = torch.argmax(of_logits, -1)
        # Map class indices back to their string labels.
        idx2str = training_set_metadata[of_name]["idx2str"]
        restored_predictions = [idx2str[idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        for saved_path in (ludwigmodel_path, torchscript_path):
            shutil.rmtree(saved_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Weight values must match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Predictions must be identical to those of the original model.
        original_column = original_predictions_df[predictions_column_name]
        assert np.all(original_column == loaded_prediction_df[predictions_column_name])
        assert np.all(original_column == restored_predictions)
def evaluate(
        self,
        dataset=None,
        data_format=None,
        batch_size=128,
        skip_save_unprocessed_output=True,
        skip_save_predictions=True,
        skip_save_eval_stats=True,
        collect_predictions=False,
        collect_overall_stats=False,
        output_directory='results',
        return_type=pd.DataFrame,
        debug=False,
        **kwargs
):
    """Evaluate the model on ``dataset`` and optionally save stats and predictions.

    :param dataset: data source handed to ``preprocess_for_prediction``.
    :param data_format: format hint handed to ``preprocess_for_prediction``.
    :param batch_size: batch size given to the ``Predictor``.
    :param skip_save_unprocessed_output: forwarded to ``postprocess``.
    :param skip_save_predictions: when true, predictions are not saved.
    :param skip_save_eval_stats: when true, evaluation stats are not saved.
    :param collect_predictions: when true, predictions are postprocessed and
        converted to ``return_type``.
    :param collect_overall_stats: when true, overall stats are computed and
        merged into the per-feature stats; this also makes the predictor
        collect predictions.
    :param output_directory: directory used for everything that is saved.
    :param return_type: target type handed to ``convert_predictions``.
    :param debug: debug flag given to the ``Predictor``.
    :param kwargs: accepted for call compatibility; not used here.
    :return: tuple ``(stats, postproc_predictions, output_directory)``.
    """
    self._check_initialization()

    logger.debug('Preprocessing')
    dataset, training_set_metadata = preprocess_for_prediction(
        self.model_definition,
        dataset=dataset,
        data_format=data_format,
        training_set_metadata=self.training_set_metadata,
        include_outputs=True,
    )

    logger.debug('Predicting')
    predictor = Predictor(
        batch_size=batch_size, horovod=self._horovod, debug=debug
    )
    stats, predictions = predictor.batch_evaluation(
        self.model,
        dataset,
        collect_predictions=collect_predictions or collect_overall_stats,
    )

    # Merge the overall metrics into the per-feature stats.
    if collect_overall_stats:
        overall_stats = calculate_overall_stats(
            self.model.output_features,
            predictions,
            dataset,
            training_set_metadata
        )
        merged_stats = {}
        for of_name in stats:
            feature_stats = {**stats[of_name]}
            # Keys without overall stats (e.g. 'combined') are copied as is.
            if of_name in overall_stats:
                feature_stats = {**feature_stats, **overall_stats[of_name]}
            merged_stats[of_name] = feature_stats
        stats = merged_stats

    if is_on_master():
        # When every kind of saving is skipped the directory would stay
        # empty, so it is not created.
        skip_flags = (
            skip_save_unprocessed_output,
            skip_save_predictions,
            skip_save_eval_stats,
        )
        if not all(skip_flags):
            os.makedirs(output_directory, exist_ok=True)

    if not collect_predictions:
        postproc_predictions = predictions  # = {}
    else:
        logger.debug('Postprocessing')
        postproc_predictions = postprocess(
            predictions,
            self.model.output_features,
            self.training_set_metadata,
            output_directory=output_directory,
            skip_save_unprocessed_output=skip_save_unprocessed_output
                                         or not is_on_master(),
        )

    if is_on_master():
        if postproc_predictions is not None and not skip_save_predictions:
            save_prediction_outputs(postproc_predictions, output_directory)

        print_evaluation_stats(stats)
        if not skip_save_eval_stats:
            save_evaluation_stats(stats, output_directory)

        something_saved = not (skip_save_predictions and skip_save_eval_stats)
        if something_saved:
            logger.info('Saved to: {0}'.format(output_directory))

    if collect_predictions:
        postproc_predictions = convert_predictions(
            postproc_predictions,
            self.model.output_features,
            self.training_set_metadata,
            return_type=return_type)

    return stats, postproc_predictions, output_directory
def predict(
        self,
        dataset=None,
        data_format=None,
        batch_size=128,
        skip_save_unprocessed_output=True,
        skip_save_predictions=True,
        output_directory='results',
        return_type=pd.DataFrame,
        debug=False,
        **kwargs
):
    """Run the model on ``dataset`` and return postprocessed predictions.

    :param dataset: data source handed to ``preprocess_for_prediction``.
    :param data_format: format hint handed to ``preprocess_for_prediction``.
    :param batch_size: batch size given to the ``Predictor``.
    :param skip_save_unprocessed_output: forwarded to ``postprocess``.
    :param skip_save_predictions: when true, predictions are not saved.
    :param output_directory: directory used for everything that is saved.
    :param return_type: target type handed to ``convert_predictions``.
    :param debug: debug flag given to the ``Predictor``.
    :param kwargs: accepted for call compatibility; not used here.
    :return: tuple ``(postproc_predictions, output_directory)``.
    """
    self._check_initialization()

    logger.debug('Preprocessing')
    # Output columns are not loaded (include_outputs=False). The metadata
    # returned by preprocessing is not needed; the instance's own
    # training_set_metadata is used for postprocessing below.
    dataset, _ = preprocess_for_prediction(
        self.model_definition,
        dataset=dataset,
        data_format=data_format,
        training_set_metadata=self.training_set_metadata,
        include_outputs=False,
    )

    logger.debug('Predicting')
    predictor = Predictor(
        batch_size=batch_size, horovod=self._horovod, debug=debug
    )
    predictions = predictor.batch_predict(
        self.model,
        dataset,
    )

    if is_on_master():
        # When every kind of saving is skipped the directory would stay
        # empty, so it is not created.
        should_create_exp_dir = not (
            skip_save_unprocessed_output and skip_save_predictions
        )
        if should_create_exp_dir:
            os.makedirs(output_directory, exist_ok=True)

    logger.debug('Postprocessing')
    postproc_predictions = convert_predictions(
        postprocess(
            predictions,
            self.model.output_features,
            self.training_set_metadata,
            output_directory=output_directory,
            # Non-master workers never write the unprocessed output.
            skip_save_unprocessed_output=skip_save_unprocessed_output
                                         or not is_on_master(),
        ),
        self.model.output_features,
        self.training_set_metadata,
        return_type=return_type
    )

    if is_on_master():
        if not skip_save_predictions:
            save_prediction_outputs(postproc_predictions, output_directory)

            logger.info('Saved to: {0}'.format(output_directory))

    return postproc_predictions, output_directory
def test_torchscript_preproc_timeseries_alternative_type(
        tmpdir, csv_filename, padding, fill_value):
    """Compare TorchScript preprocessing with the Python preprocessing when
    the timeseries input is given as a list of tensors instead of strings.

    NOTE(review): `fill_value` is accepted but never read in this test; the
    preprocessing config below hard-codes "1.0". Confirm whether the
    parameter was meant to be used.
    """
    csv_path = os.path.join(tmpdir, csv_filename)
    ts_feature = timeseries_feature(
        preprocessing={
            "padding": padding,
            "timeseries_length_limit": 4,
            "fill_value": "1.0",
        },
        max_len=7,
    )
    input_features = [ts_feature]
    output_features = [binary_feature()]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    train_csv = generate_data(
        input_features, output_features, csv_path, nan_percent=0.2)

    # Train the Ludwig model and get its TorchScript counterpart.
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, train_csv)

    # Reference result: preprocessing done by the Python model.
    expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        train_csv,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    raw_df = pd.read_csv(train_csv)
    inputs = to_inference_module_input_from_dataframe(
        raw_df, config, load_paths=True)

    # Swap the whitespace-separated strings for one float tensor per row.
    ts_name = ts_feature[NAME]
    inputs[ts_name] = [
        torch.tensor([float(token) for token in series.split()])
        for series in inputs[ts_name]
    ]

    actual = script_module.preprocessor_forward(inputs)

    # Every preprocessed column of the reference must be present and equal.
    for proc_name, expected_values in expected.dataset.items():
        # remove proc suffix
        suffix_start = proc_name.rfind("_")
        feature_name = proc_name[:suffix_start]
        assert feature_name in actual.keys(), \
            f'feature "{feature_name}" not found.'
        assert utils.is_all_close(actual[feature_name], expected_values), \
            f'feature "{feature_name}" value mismatch.'
def test_torchscript_preproc_vector_alternative_type(
        tmpdir, csv_filename, vector_type):
    """Compare TorchScript preprocessing with the Python preprocessing when
    the vector input is given as tensors instead of strings.

    When `vector_type` equals `torch.Tensor` the per-row tensors are stacked
    into a single tensor; otherwise they are passed as a list of tensors.
    """
    csv_path = os.path.join(tmpdir, csv_filename)
    vec_feature = vector_feature()
    input_features = [vec_feature]
    output_features = [binary_feature()]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    train_csv = generate_data(input_features, output_features, csv_path)

    # Train the Ludwig model and get its TorchScript counterpart.
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, train_csv)

    # Reference result: preprocessing done by the Python model.
    expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        train_csv,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    raw_df = pd.read_csv(train_csv)
    inputs = to_inference_module_input_from_dataframe(
        raw_df, config, load_paths=True)

    # Parse each whitespace-separated string into a float tensor.
    vec_name = vec_feature[NAME]
    vectors = [
        torch.tensor([float(token) for token in row.split()])
        for row in inputs[vec_name]
    ]
    if vector_type == torch.Tensor:
        vectors = torch.stack(vectors)
    inputs[vec_name] = vectors

    actual = script_module.preprocessor_forward(inputs)

    # Compare only the columns the TorchScript preprocessor returned.
    for proc_name, expected_values in expected.dataset.items():
        # remove proc suffix
        suffix_start = proc_name.rfind("_")
        feature_name = proc_name[:suffix_start]
        if feature_name in actual.keys():
            assert utils.is_all_close(
                actual[feature_name], expected_values
            ), f"feature: {feature_name}"
def collect_activations(
        model_path,
        tensors,
        data_csv=None,
        data_hdf5=None,
        split=TEST,
        batch_size=128,
        output_directory='results',
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        debug=False,
        **kwargs
):
    """Collect the requested tensors from a pretrained model for every
    datapoint of a dataset and save them to a new experiment directory.

    :param model_path: Path of the pretrained model the tensors are
        collected from
    :param tensors: List containing the names of the tensors to collect
    :param data_csv: CSV file path with the datapoints to run through the
        model
    :param data_hdf5: HDF5 file path, an alternative data source when no CSV
        file path is given
    :param split: Split type
    :param batch_size: Batch size
    :param output_directory: Output directory; a suffix is added when needed
        so that an existing directory is not reused
    :param gpus: The total number of GPUs that the model intends to use
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at the cost
        of determinism.
    :param debug: To step through the stack traces and find possible errors
        (not read inside this function body)
    :returns: None
    """
    # setup directories and file names
    results_dir = find_non_existing_dir_by_adding_suffix(output_directory)

    dataset_path = data_csv if data_csv is not None else data_hdf5
    logger.info('Dataset path: {}'.format(dataset_path))
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(results_dir))
    logger.info('\n')

    metadata_path = os.path.join(model_path, TRAIN_SET_METADATA_FILE_NAME)

    # preprocessing (the returned metadata is not needed afterwards)
    dataset, _ = preprocess_for_prediction(
        model_path, split, data_csv, data_hdf5, metadata_path
    )

    # only the model itself is used; its definition is discarded
    model, _ = load_model_and_definition(
        model_path,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
    )

    # collect activations
    print_boxed('COLLECT ACTIVATIONS')
    activations = model.collect_activations(dataset, tensors, batch_size)

    # saving
    os.makedirs(results_dir)
    save_tensors(activations, results_dir)

    logger.info('Saved to: {0}'.format(results_dir))
def evaluate(
        self,
        dataset=None,
        data_format=None,
        batch_size=128,
        skip_save_unprocessed_output=True,
        skip_save_predictions=True,
        skip_save_eval_stats=True,
        collect_predictions=False,
        collect_overall_stats=False,
        output_directory='results',
        return_type=pd.DataFrame,
        debug=False,
        **kwargs
):
    """Preprocess a dataset, evaluate the model on it and return the stats.

    :param dataset: Source data, forwarded to `preprocess_for_prediction`.
    :param data_format: Format of `dataset`, forwarded to
        `preprocess_for_prediction`.
    :param batch_size: Batch size handed to the `Predictor`.
    :param skip_save_unprocessed_output: Forwarded to `postprocess`; it is
        always passed as true when `is_on_master()` is false.
    :param skip_save_predictions: When false (and predictions are not
        `None`), predictions are written to `output_directory`.
    :param skip_save_eval_stats: When false, the evaluation statistics are
        written to `output_directory`.
    :param collect_predictions: When true, predictions are postprocessed and
        converted to `return_type`. When false, whatever
        `batch_evaluation` returned as predictions is passed through as is.
    :param collect_overall_stats: When true, the result of
        `calculate_overall_stats` is merged into the per-feature stats.
        This also makes `batch_evaluation` collect predictions.
    :param output_directory: Directory for saved artifacts. It is created
        only when `is_on_master()` is true and at least one of the three
        `skip_save_*` flags is false.
    :param return_type: Forwarded to `convert_predictions`.
    :param debug: Forwarded to the `Predictor`.
    :param kwargs: Accepted for call compatibility; not read by this method.
    :returns: Tuple `(stats, postproc_predictions, output_directory)`.
    """
    self._check_initialization()

    logger.debug('Preprocessing')
    # TODO(refactoring): consider passing only the features to load
    # instead of the whole self.model_definition.
    dataset, training_set_metadata = preprocess_for_prediction(
        self.model_definition,
        dataset=dataset,
        data_format=data_format,
        training_set_metadata=self.training_set_metadata,
        include_outputs=True,
    )

    logger.debug('Predicting')
    predictor = Predictor(
        batch_size=batch_size, horovod=self._horovod, debug=debug
    )
    stats, predictions = predictor.batch_evaluation(
        self.model,
        dataset,
        collect_predictions=collect_predictions or collect_overall_stats,
    )

    # calculate the overall metrics
    if collect_overall_stats:
        overall_stats = calculate_overall_stats(
            self.model.output_features,
            predictions,
            dataset,
            training_set_metadata
        )
        stats = {
            of_name: {**stats[of_name], **overall_stats[of_name]}
            # account for presence of 'combined' key
            if of_name in overall_stats
            else {**stats[of_name]}
            for of_name in stats
        }

    if is_on_master():
        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_exp_dir = not (
            skip_save_unprocessed_output
            and skip_save_predictions
            and skip_save_eval_stats
        )
        if should_create_exp_dir:
            os.makedirs(output_directory, exist_ok=True)

    if collect_predictions:
        logger.debug('Postprocessing')
        postproc_predictions = postprocess(
            predictions,
            self.model.output_features,
            self.training_set_metadata,
            output_directory=output_directory,
            # non-master processes never save unprocessed output
            skip_save_unprocessed_output=(
                skip_save_unprocessed_output or not is_on_master()
            ),
        )
    else:
        # predictions are passed through without postprocessing
        postproc_predictions = predictions

    if is_on_master():
        if postproc_predictions is not None and not skip_save_predictions:
            save_prediction_outputs(postproc_predictions, output_directory)

        print_evaluation_stats(stats)
        if not skip_save_eval_stats:
            save_evaluation_stats(stats, output_directory)

        if not skip_save_predictions or not skip_save_eval_stats:
            logger.info('Saved to: {0}'.format(output_directory))

    # Conversion happens after saving, so the saved predictions are the
    # postprocessed ones rather than the `return_type` form.
    if collect_predictions:
        postproc_predictions = convert_predictions(
            postproc_predictions,
            self.model.output_features,
            self.training_set_metadata,
            return_type=return_type
        )

    return stats, postproc_predictions, output_directory