def test_sklearn_mnist_classification(self):
    """Run the sklearn MNIST example and compare it with stored results.

    The pipeline output must exist, contain as many lines as the expected
    output file, and agree with it line by line, where lines are matched on
    their first comma-separated field.
    """
    test_pipeline = TestPipeline(is_integration_test=False)
    output_file = '/'.join((
        'gs://temp-storage-for-end-to-end-tests',
        str(uuid.uuid4()),
        'result.txt',
    ))
    pipeline_args = test_pipeline.get_full_options_as_args(
        input='gs://apache-beam-ml/testing/inputs/it_mnist_data.csv',
        output=output_file,
        model_path='gs://apache-beam-ml/models/mnist_model_svm.pickle',
    )
    sklearn_mnist_classification.run(pipeline_args, save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_output_filepath = 'gs://apache-beam-ml/testing/expected_outputs/test_sklearn_mnist_classification_actuals.txt'  # pylint: disable=line-too-long
    expected_lines = process_outputs(expected_output_filepath)
    predicted_lines = process_outputs(output_file)
    self.assertEqual(len(expected_lines), len(predicted_lines))

    # Index predictions by their first field so the comparison does not
    # depend on the order in which the pipeline wrote its lines.
    # NOTE(review): a repeated first field overwrites the earlier entry.
    predicted_by_label = {
        label: predicted
        for label, predicted in (line.split(',') for line in predicted_lines)
    }
    for line in expected_lines:
        label, expected = line.split(',')
        self.assertEqual(predicted_by_label[label], expected)
def read_image(image_file_name: str,
               path_to_dir: Optional[str] = None) -> Tuple[str, Image.Image]:
    """Load an image through ``FileSystems`` and convert it to RGB.

    Args:
        image_file_name: Path of the image, or a name relative to
            ``path_to_dir`` when that is given.
        path_to_dir: Optional directory joined in front of
            ``image_file_name``.

    Returns:
        A tuple of the (possibly joined) image path and the RGB image.
    """
    if path_to_dir is not None:
        image_file_name = os.path.join(path_to_dir, image_file_name)
    # FileSystems.open() always opens for reading; its second positional
    # parameter is the MIME type, not a file mode, so no 'r' is passed.
    with FileSystems().open(image_file_name) as file:
        # The bytes are copied into memory first, so the decoded image does
        # not depend on the remote handle staying open.
        data = Image.open(io.BytesIO(file.read())).convert('RGB')
    return image_file_name, data
def test_torch_run_inference_bert_for_masked_lm(self):
    """Run the PyTorch language-modeling example and check its predictions.

    The pipeline output must exist, and every line of the stored expected
    output must match the prediction written for the same text, where each
    line has the form ``text;predicted_text``.
    """
    test_pipeline = TestPipeline(is_integration_test=True)
    output_file = '/'.join((
        'gs://apache-beam-ml/testing/predictions',
        str(uuid.uuid4()),
        'result.txt',
    ))
    pipeline_args = test_pipeline.get_full_options_as_args(
        # Text file containing some sentences.
        input='gs://apache-beam-ml/datasets/custom/sentences.txt',
        output=output_file,
        model_state_dict_path='gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth',  # pylint: disable=line-too-long
    )
    pytorch_language_modeling.run(pipeline_args, save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    predicted_lines = process_outputs(filepath=output_file)
    actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt'  # pylint: disable=line-too-long
    expected_lines = process_outputs(filepath=actuals_file)

    # Index predictions by input text so line order does not matter.
    predicted_by_text = {
        text: predicted
        for text, predicted in (line.split(';') for line in predicted_lines)
    }
    for line in expected_lines:
        text, expected = line.split(';')
        self.assertEqual(expected, predicted_by_text[text])
def test_torch_run_inference_coco_maskrcnn_resnet50_fpn(self):
    """Run the PyTorch image-segmentation example and check its labels.

    The pipeline output must exist, and every line of the stored expected
    output must match the labels written for the same file name, where each
    line has the form ``filename;labels``.
    """
    test_pipeline = TestPipeline(is_integration_test=True)
    output_file = '/'.join((
        'gs://apache-beam-ml/testing/predictions',
        str(uuid.uuid4()),
        'result.txt',
    ))
    pipeline_args = test_pipeline.get_full_options_as_args(
        # Text file containing absolute paths to the coco validation data
        # on GCS.
        input='gs://apache-beam-ml/testing/inputs/it_coco_validation_inputs.txt',  # pylint: disable=line-too-long
        output=output_file,
        model_state_dict_path='gs://apache-beam-ml/models/torchvision.models.detection.maskrcnn_resnet50_fpn.pth',  # pylint: disable=line-too-long
        images_dir='gs://apache-beam-ml/datasets/coco/raw-data/val2017',
    )
    pytorch_image_segmentation.run(pipeline_args, save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    predicted_lines = process_outputs(filepath=output_file)
    actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_coco_maskrcnn_resnet50_fpn_actuals.txt'  # pylint: disable=line-too-long
    expected_lines = process_outputs(filepath=actuals_file)

    # Index predictions by file name so line order does not matter.
    labels_by_filename = {
        filename: labels
        for filename, labels in (line.split(';') for line in predicted_lines)
    }
    for line in expected_lines:
        filename, expected_labels = line.split(';')
        self.assertEqual(expected_labels, labels_by_filename[filename])
def _open_dataset(self, path: str) -> xarray.Dataset:
    """Yield the dataset at ``path``, opened with xarray.

    When ``self.local_copy`` is truthy the file is first cached into a
    temporary directory via fsspec's ``simplecache`` and opened from there;
    otherwise it is opened directly from a ``FileSystems`` handle. In both
    cases ``self.xarray_open_kwargs`` is forwarded to
    ``xarray.open_dataset``.

    NOTE(review): this is a generator, presumably wrapped by a
    context-manager decorator outside this view -- confirm.
    """
    if not self.local_copy:
        with FileSystems().open(path) as remote_handle:
            yield xarray.open_dataset(remote_handle, **self.xarray_open_kwargs)
        return

    # The temporary directory (and the cached copy in it) lives until the
    # generator is resumed after the yield.
    with tempfile.TemporaryDirectory() as cache_dir:
        cached_path = fsspec.open_local(
            f"simplecache::{path}",
            simplecache={'cache_storage': cache_dir},
        )
        yield xarray.open_dataset(cached_path, **self.xarray_open_kwargs)
def test_torch_run_inference_imagenet_mobilenetv2(self):
    """Run the PyTorch image-classification example and check its output.

    The pipeline output must exist, and every ``filename,prediction`` line
    in it must match the entry for that file name in ``_EXPECTED_OUTPUTS``.
    """
    test_pipeline = TestPipeline(is_integration_test=True)
    output_file = '/'.join((
        'gs://apache-beam-ml/testing/predictions',
        str(uuid.uuid4()),
        'result.txt',
    ))
    pipeline_args = test_pipeline.get_full_options_as_args(
        # Text file containing absolute paths to the imagenet validation
        # data on GCS.
        input='gs://apache-beam-ml/testing/inputs/it_mobilenetv2_imagenet_validation_inputs.txt',  # pylint: disable=line-too-long
        output=output_file,
        model_state_dict_path='gs://apache-beam-ml/models/imagenet_classification_mobilenet_v2.pt',  # pylint: disable=line-too-long
    )
    pytorch_image_classification.run(pipeline_args, save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    for line in process_outputs(filepath=output_file):
        filename, predicted = line.split(',')
        self.assertEqual(_EXPECTED_OUTPUTS[filename], predicted)
def process_outputs(filepath):
    """Read ``filepath`` through ``FileSystems`` and return its lines.

    Each line is decoded as UTF-8 and has newline characters stripped from
    both ends.
    """
    with FileSystems().open(filepath) as handle:
        raw_lines = handle.readlines()
    return [raw.decode('utf-8').strip('\n') for raw in raw_lines]