Example #1
    def test_sklearn_mnist_classification(self):
        test_pipeline = TestPipeline(is_integration_test=False)
        input_file = 'gs://apache-beam-ml/testing/inputs/it_mnist_data.csv'
        output_file_dir = 'gs://temp-storage-for-end-to-end-tests'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])
        model_path = 'gs://apache-beam-ml/models/mnist_model_svm.pickle'
        extra_opts = {
            'input': input_file,
            'output': output_file,
            'model_path': model_path,
        }
        sklearn_mnist_classification.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)
        self.assertEqual(FileSystems().exists(output_file), True)

        expected_output_filepath = 'gs://apache-beam-ml/testing/expected_outputs/test_sklearn_mnist_classification_actuals.txt'  # pylint: disable=line-too-long
        expected_outputs = process_outputs(expected_output_filepath)

        predicted_outputs = process_outputs(output_file)
        self.assertEqual(len(expected_outputs), len(predicted_outputs))

        # Index the predicted outputs by their true label, then check every
        # expected prediction against the prediction stored for that label.
        predictions_dict = {}
        for i in range(len(predicted_outputs)):
            true_label, prediction = predicted_outputs[i].split(',')
            predictions_dict[true_label] = prediction

        for i in range(len(expected_outputs)):
            true_label, expected_prediction = expected_outputs[i].split(',')
            self.assertEqual(predictions_dict[true_label], expected_prediction)
Example #2
def read_image(image_file_name: str,
               path_to_dir: Optional[str] = None) -> Tuple[str, Image.Image]:
    if path_to_dir is not None:
        image_file_name = os.path.join(path_to_dir, image_file_name)
    with FileSystems().open(image_file_name, 'r') as file:
        data = Image.open(io.BytesIO(file.read())).convert('RGB')
        return image_file_name, data
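`read_image` resolves an optional directory prefix, reads the raw bytes through Beam's `FileSystems` abstraction, and returns a `(file name, PIL image)` pair, so it drops straight into a `beam.Map` step. A minimal usage sketch, assuming the `read_image` helper above (and its imports) plus hypothetical bucket paths that are not part of the original example:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Hypothetical pipeline: read image file names from a text file, then load
# each image with read_image so a downstream step receives (name, PIL image).
with beam.Pipeline(options=PipelineOptions()) as p:
    _ = (
        p
        | 'ReadNames' >> beam.io.ReadFromText('gs://my-bucket/image_names.txt')
        | 'LoadImages' >> beam.Map(
            read_image, path_to_dir='gs://my-bucket/images')
        | 'LogSizes' >> beam.Map(
            lambda name_and_image: (name_and_image[0], name_and_image[1].size)))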
Example #3
    def test_torch_run_inference_bert_for_masked_lm(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        # Path to text file containing some sentences
        file_of_sentences = 'gs://apache-beam-ml/datasets/custom/sentences.txt'  # pylint: disable=line-too-long
        output_file_dir = 'gs://apache-beam-ml/testing/predictions'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])

        model_state_dict_path = 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth'
        extra_opts = {
            'input': file_of_sentences,
            'output': output_file,
            'model_state_dict_path': model_state_dict_path,
        }
        pytorch_language_modeling.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)

        self.assertEqual(FileSystems().exists(output_file), True)
        predictions = process_outputs(filepath=output_file)
        actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt'
        actuals = process_outputs(filepath=actuals_file)

        predictions_dict = {}
        for prediction in predictions:
            text, predicted_text = prediction.split(';')
            predictions_dict[text] = predicted_text

        for actual in actuals:
            text, actual_predicted_text = actual.split(';')
            predicted_predicted_text = predictions_dict[text]
            self.assertEqual(actual_predicted_text, predicted_predicted_text)
Example #4
    def test_torch_run_inference_coco_maskrcnn_resnet50_fpn(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        # Text file containing absolute paths to the COCO validation data on GCS.
        file_of_image_names = 'gs://apache-beam-ml/testing/inputs/it_coco_validation_inputs.txt'  # pylint: disable=line-too-long
        output_file_dir = 'gs://apache-beam-ml/testing/predictions'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])

        model_state_dict_path = 'gs://apache-beam-ml/models/torchvision.models.detection.maskrcnn_resnet50_fpn.pth'
        images_dir = 'gs://apache-beam-ml/datasets/coco/raw-data/val2017'
        extra_opts = {
            'input': file_of_image_names,
            'output': output_file,
            'model_state_dict_path': model_state_dict_path,
            'images_dir': images_dir,
        }
        pytorch_image_segmentation.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)

        self.assertEqual(FileSystems().exists(output_file), True)
        predictions = process_outputs(filepath=output_file)
        actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_coco_maskrcnn_resnet50_fpn_actuals.txt'
        actuals = process_outputs(filepath=actuals_file)

        predictions_dict = {}
        for prediction in predictions:
            filename, prediction_labels = prediction.split(';')
            predictions_dict[filename] = prediction_labels

        for actual in actuals:
            filename, actual_labels = actual.split(';')
            prediction_labels = predictions_dict[filename]
            self.assertEqual(actual_labels, prediction_labels)
Example #5
 def _open_dataset(self, path: str) -> xarray.Dataset:
   """Open as an XArray Dataset, sometimes with local caching."""
   if self.local_copy:
     with tempfile.TemporaryDirectory() as tmpdir:
       local_file = fsspec.open_local(
         f"simplecache::{path}",
         simplecache={'cache_storage': tmpdir}
       )
       yield xarray.open_dataset(local_file, **self.xarray_open_kwargs)
   else:
     with FileSystems().open(path) as file:
       yield xarray.open_dataset(file, **self.xarray_open_kwargs)
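Because this method yields instead of returning, it is presumably decorated with `contextlib.contextmanager` in the class it was taken from; that keeps the temporary cache directory alive while the caller works with the dataset. A minimal sketch of that pattern with a hypothetical `NetCDFSource` class (the class name, constructor, and example path are assumptions, not part of the original):

import contextlib
import tempfile

import fsspec
import xarray
from apache_beam.io.filesystems import FileSystems


class NetCDFSource:  # hypothetical container for the method above
    def __init__(self, local_copy=False, xarray_open_kwargs=None):
        self.local_copy = local_copy
        self.xarray_open_kwargs = xarray_open_kwargs or {}

    @contextlib.contextmanager
    def _open_dataset(self, path: str):
        """Open as an xarray Dataset, optionally via a local cached copy."""
        if self.local_copy:
            with tempfile.TemporaryDirectory() as tmpdir:
                # fsspec's simplecache protocol downloads the file into tmpdir
                # and hands back a local path that xarray can open directly.
                local_file = fsspec.open_local(
                    f"simplecache::{path}",
                    simplecache={'cache_storage': tmpdir})
                yield xarray.open_dataset(local_file, **self.xarray_open_kwargs)
        else:
            with FileSystems().open(path) as file:
                yield xarray.open_dataset(file, **self.xarray_open_kwargs)


# The temporary directory (and the open file handle) stay valid only inside
# the with-block, so callers should finish reading before it exits:
#
#   source = NetCDFSource(local_copy=True)
#   with source._open_dataset('gs://my-bucket/era5/2020-01-01.nc') as ds:
#       print(ds.dims)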
Example #6
    def test_torch_run_inference_imagenet_mobilenetv2(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        # Text file containing absolute paths to the ImageNet validation data on GCS.
        file_of_image_names = 'gs://apache-beam-ml/testing/inputs/it_mobilenetv2_imagenet_validation_inputs.txt'  # pylint: disable=line-too-long
        output_file_dir = 'gs://apache-beam-ml/testing/predictions'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])

        model_state_dict_path = 'gs://apache-beam-ml/models/imagenet_classification_mobilenet_v2.pt'
        extra_opts = {
            'input': file_of_image_names,
            'output': output_file,
            'model_state_dict_path': model_state_dict_path,
        }
        pytorch_image_classification.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)

        self.assertEqual(FileSystems().exists(output_file), True)
        predictions = process_outputs(filepath=output_file)

        for prediction in predictions:
            filename, prediction = prediction.split(',')
            self.assertEqual(_EXPECTED_OUTPUTS[filename], prediction)
Example #7
def process_outputs(filepath):
    with FileSystems().open(filepath) as f:
        lines = f.readlines()
    lines = [l.decode('utf-8').strip('\n') for l in lines]
    return lines
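All of these examples rely on the same small `FileSystems` surface: `exists()` to confirm the pipeline wrote its output and `open()` to read it back (`process_outputs` above wraps the latter). A minimal sketch of that check-then-read pattern, assuming a hypothetical output path and that the matching filesystem (here GCS) is available:

from apache_beam.io.filesystems import FileSystems

output_file = 'gs://my-bucket/predictions/result.txt'  # hypothetical path

# Same pattern the tests above use: each decoded line is a delimited pair
# such as 'true_label,prediction'.
if FileSystems.exists(output_file):
    for line in process_outputs(output_file):
        true_label, prediction = line.split(',')
        print(true_label, prediction)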