def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    project = test_pipeline.get_option('project')

    dataset = 'BigQueryTornadoesIT'
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM `%s`' % output_table

    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=project,
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
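
The addCleanup call above assumes a utils.delete_bq_table helper. A minimal sketch of what such a helper could do, using the google-cloud-bigquery client (the suite's actual utility may differ):

from google.cloud import bigquery

def delete_bq_table(project, dataset_id, table_id):
    # Delete the output table; tolerate the case where the test failed
    # before the table was ever created.
    client = bigquery.Client(project=project)
    client.delete_table(
        '%s.%s.%s' % (project, dataset_id, table_id), not_found_ok=True)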
Example #2
    def test_torch_run_inference_coco_maskrcnn_resnet50_fpn(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        # Text file listing GCS paths to the COCO validation inputs.
        file_of_image_names = 'gs://apache-beam-ml/testing/inputs/it_coco_validation_inputs.txt'  # pylint: disable=line-too-long
        output_file_dir = 'gs://apache-beam-ml/testing/predictions'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])

        model_state_dict_path = 'gs://apache-beam-ml/models/torchvision.models.detection.maskrcnn_resnet50_fpn.pth'
        images_dir = 'gs://apache-beam-ml/datasets/coco/raw-data/val2017'
        extra_opts = {
            'input': file_of_image_names,
            'output': output_file,
            'model_state_dict_path': model_state_dict_path,
            'images_dir': images_dir,
        }
        pytorch_image_segmentation.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)

        self.assertTrue(FileSystems().exists(output_file))
        predictions = process_outputs(filepath=output_file)
        actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_coco_maskrcnn_resnet50_fpn_actuals.txt'
        actuals = process_outputs(filepath=actuals_file)

        predictions_dict = {}
        for prediction in predictions:
            filename, prediction_labels = prediction.split(';')
            predictions_dict[filename] = prediction_labels

        for actual in actuals:
            filename, actual_labels = actual.split(';')
            prediction_labels = predictions_dict[filename]
            self.assertEqual(actual_labels, prediction_labels)
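
The assertions above rely on a process_outputs helper that reads a results file and returns its lines. A plausible sketch, assuming plain-text output read through Beam's FileSystems API (the helper actually used by the suite may differ):

from apache_beam.io.filesystems import FileSystems

def process_outputs(filepath):
    # Read the whole file and return one stripped string per line.
    with FileSystems().open(filepath) as f:
        contents = f.read().decode('utf-8')
    return [line.strip() for line in contents.splitlines()]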
Example #3
  def setUpClass(cls):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--spanner_instance_id',
        default='beam-test',
        help='Spanner instance id',
    )
    parser.add_argument(
        '--spanner_project_id',
        default='beam-testing',
        help='GCP project with spanner instance',
    )
    parser.add_argument(
        '--use_real_spanner',
        action='store_true',
        default=False,
        help='Whether to use emulator or real spanner instance',
    )

    pipeline = TestPipeline(is_integration_test=True)
    argv = pipeline.get_full_options_as_args()

    known_args, _ = parser.parse_known_args(argv)
    cls.project_id = known_args.spanner_project_id
    cls.instance_id = known_args.spanner_instance_id
    use_spanner_emulator = not known_args.use_real_spanner
    cls.table = 'xlang_beam_spanner'
    cls.spanner_helper = SpannerHelper(
        cls.project_id, cls.instance_id, cls.table, use_spanner_emulator)

    coders.registry.register_coder(SpannerTestRow, coders.RowCoder)
    coders.registry.register_coder(SpannerPartTestRow, coders.RowCoder)
    coders.registry.register_coder(SpannerTestKey, coders.RowCoder)
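
RowCoder requires schema-aware types, so the row classes registered above are presumably typing.NamedTuple definitions. A hypothetical sketch; the field names here are illustrative, not the suite's actual schema:

import typing

class SpannerTestRow(typing.NamedTuple):
    # Illustrative fields only; the real schema may differ.
    f_int64: int
    f_string: str

class SpannerTestKey(typing.NamedTuple):
    f_int64: int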
Example #4
    def test_datastore_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        dataset = test_pipeline.get_option("project")
        kind = self.DATASTORE_WORDCOUNT_KIND
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'datastore_wordcount_results'
        ])

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'dataset': dataset,
            'kind': kind,
            'output': output,
            'read_only': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        datastore_wordcount.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Example #5
    def test_datastore_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        kind = self.DATASTORE_WORDCOUNT_KIND
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'datastore_wordcount_results'
        ])

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'kind': kind,
            'output': output,
            # Comment this out to regenerate input data on Datastore (delete
            # existing data first using the bulk delete Dataflow template).
            'read_only': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        datastore_wordcount.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Example #6
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    project = test_pipeline.get_option('project')

    dataset = 'BigQueryTornadoesIT'
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM [%s]' % output_table

    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=project,
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Register cleanup before pipeline execution.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Example #7
class BigQuerySideInputIT(unittest.TestCase):
    DEFAULT_OUTPUT_FILE = \
        'gs://temp-storage-for-end-to-end-tests/py-it-cloud/output'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.uuid = str(uuid.uuid4())

        self.output = '/'.join(
            [self.DEFAULT_OUTPUT_FILE, self.uuid, 'results'])

    @pytest.mark.no_xdist
    @pytest.mark.examples_postcommit
    def test_bigquery_side_input_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        NUM_GROUPS = 3

        extra_opts = {
            'output': self.output,
            'num_groups': str(NUM_GROUPS),
            'on_success_matcher': all_of(state_verifier)
        }

        # Register cleanup before pipeline execution.
        self.addCleanup(delete_files, [self.output + '*'])

        # Get pipeline options from the --test-pipeline-options command-line
        # argument and start the pipeline job by calling its main function.
        bigquery_side_input.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #8
    def test_torch_run_inference_bert_for_masked_lm(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        # Path to a text file containing sample sentences.
        file_of_sentences = 'gs://apache-beam-ml/datasets/custom/sentences.txt'  # pylint: disable=line-too-long
        output_file_dir = 'gs://apache-beam-ml/testing/predictions'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])

        model_state_dict_path = 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth'
        extra_opts = {
            'input': file_of_sentences,
            'output': output_file,
            'model_state_dict_path': model_state_dict_path,
        }
        pytorch_language_modeling.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)

        self.assertTrue(FileSystems().exists(output_file))
        predictions = process_outputs(filepath=output_file)
        actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt'
        actuals = process_outputs(filepath=actuals_file)

        predictions_dict = {}
        for prediction in predictions:
            text, predicted_text = prediction.split(';')
            predictions_dict[text] = predicted_text

        for actual in actuals:
            text, actual_predicted_text = actual.split(';')
            predicted_predicted_text = predictions_dict[text]
            self.assertEqual(actual_predicted_text, predicted_predicted_text)
Example #9
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())

    self.output = '/'.join(
        [self.test_pipeline.get_option('output'), self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):

    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    arg_sleep_secs = self.test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    file_verifier = FileChecksumMatcher(
        self.output + '/*-of-*', self.DEFAULT_EXPECTED_CHECKSUM, sleep_secs)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': all_of(state_verifier, file_verifier)
    }

    # Register cleanup before pipeline execution.
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
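
The delete_files cleanup registered above can be sketched with Beam's FileSystems API. Beam ships a similar utility in apache_beam.testing.test_utils; the exact behavior (e.g., retries) may differ:

from apache_beam.io.filesystems import FileSystems

def delete_files(file_paths):
    # Expand each glob pattern and delete every matching file.
    for match_result in FileSystems.match(file_paths):
        FileSystems.delete(
            [metadata.path for metadata in match_result.metadata_list])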
Example #10
class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('IT')
  def test_streaming_wordcount_it(self):
    # Build expected dataset.
    expected_msg = [('%d: 1' % num) for num in range(DEFAULT_INPUT_NUMBERS)]

    # Set extra options on the pipeline for test purposes
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(self.project,
                                               OUTPUT_SUB + self.uuid,
                                               expected_msg,
                                               timeout=400)
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #11
    def _run_wordcount_it(self, run_wordcount, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)
        extra_opts = {}

        # Set extra options on the pipeline for test purposes
        test_output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'results'
        ])
        extra_opts['output'] = test_output

        test_input = test_pipeline.get_option('input')
        if test_input:
            extra_opts['input'] = test_input

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        expect_checksum = (test_pipeline.get_option('expect_checksum')
                           or self.DEFAULT_CHECKSUM)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(test_output + '*-of-*', expect_checksum,
                                sleep_secs)
        ]
        extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
        extra_opts.update(opts)

        # Register cleanup before pipeline execution.
        self.addCleanup(delete_files, [test_output + '*'])

        # Get pipeline options from the --test-pipeline-options command-line
        # argument and start the pipeline job by calling its main function.
        run_wordcount(test_pipeline.get_full_options_as_args(**extra_opts),
                      save_main_session=False)
Example #12
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())

    self.output = '/'.join([self.test_pipeline.get_option('output'),
                            self.uuid,
                            'results'])

  @attr('IT')
  def test_user_score_it(self):

    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    file_verifier = FileChecksumMatcher(self.output + '*-of-*',
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'input': self.DEFAULT_INPUT_FILE,
                  'output': self.output + '/user-score',
                  'on_success_matcher': all_of(state_verifier,
                                               file_verifier)}

    # Register cleanup before pipeline execution.
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #13
    def test_sklearn_mnist_classification(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        input_file = 'gs://apache-beam-ml/testing/inputs/it_mnist_data.csv'
        output_file_dir = 'gs://temp-storage-for-end-to-end-tests'
        output_file = '/'.join(
            [output_file_dir, str(uuid.uuid4()), 'result.txt'])
        model_path = 'gs://apache-beam-ml/models/mnist_model_svm.pickle'
        extra_opts = {
            'input': input_file,
            'output': output_file,
            'model_path': model_path,
        }
        sklearn_mnist_classification.run(
            test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)
        self.assertTrue(FileSystems().exists(output_file))

        expected_output_filepath = 'gs://apache-beam-ml/testing/expected_outputs/test_sklearn_mnist_classification_actuals.txt'  # pylint: disable=line-too-long
        expected_outputs = process_outputs(expected_output_filepath)

        predicted_outputs = process_outputs(output_file)
        self.assertEqual(len(expected_outputs), len(predicted_outputs))

        predictions_dict = {}
        for i in range(len(predicted_outputs)):
            true_label, prediction = predicted_outputs[i].split(',')
            predictions_dict[true_label] = prediction

        for i in range(len(expected_outputs)):
            true_label, expected_prediction = expected_outputs[i].split(',')
            self.assertEqual(predictions_dict[true_label], expected_prediction)
Example #14
  def test_filters_output_bigquery_matcher(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    project = test_pipeline.get_option('project')

    dataset = 'FiltersTestIT'
    table = 'cold_days_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    filters.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #15
    def _run_wordcount_it(self, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)

        # Set extra options on the pipeline for test purposes
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'results'
        ])
        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'output': output,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        extra_opts.update(opts)

        # Register cleanup before pipeline execution.
        self.addCleanup(delete_files, [output + '*'])

        # Get pipeline options from the --test-pipeline-options command-line
        # argument and start the pipeline job by calling its main function.
        wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #16
    def run_pipeline(self, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)
        argv = test_pipeline.get_full_options_as_args(**opts)
        parser = argparse.ArgumentParser()
        unused_known_args, pipeline_args = parser.parse_known_args(argv)

        pipeline_options = PipelineOptions(pipeline_args)
        p = beam.Pipeline(options=pipeline_options)
        return dataflow_exercise_metrics_pipeline.apply_and_run(p)
Example #17
  def run_bigquery_io_read_pipeline(self, input_size):
    test_pipeline = TestPipeline(is_integration_test=True)
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {'input_table': self.DEFAULT_DATASET + "." +
                                 self.DEFAULT_TABLE_PREFIX + input_size,
                  'num_records': self.NUM_RECORDS[input_size],
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    bigquery_io_read_pipeline.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Example #18
  def test_wordcount_fnapi_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    wordcount_fnapi.run(
        test_pipeline.get_full_options_as_args(
            experiment='beam_fn_api',
            on_success_matcher=PipelineStateMatcher()))
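
Every example funnels its options through TestPipeline.get_full_options_as_args, which renders keyword options as command-line arguments. A rough sketch of that conversion; the real method also merges in the options supplied via --test-pipeline-options and pickles matcher objects:

import json

def options_to_args(**options):
    # Hypothetical simplification: render each option as --key=value,
    # JSON-encoding non-string values.
    argv = []
    for key, value in options.items():
        if isinstance(value, bool) and value:
            argv.append('--%s' % key)
        elif isinstance(value, str):
            argv.append('--%s=%s' % (key, value))
        else:
            argv.append('--%s=%s' % (key, json.dumps(value)))
    return argv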
Example #19
    def test_train_mode(self):
        """Runs pipeline in train mode outputting train, test and eval filesets."""
        test_pipeline = TestPipeline()
        # Set extra options on the pipeline for test purposes
        test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
        self.addCleanup(shutil.rmtree, test_dir)

        # Checks that pipeline reaches state "Done"
        pipeline_verifiers = [PipelineStateMatcher()]
        extra_opts = {
            'project': PROJECT,
            'output_path': test_dir,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'runner': 'DirectRunner',
        }

        res = preprocess.main(
            test_pipeline.get_full_options_as_args(**extra_opts),
            query=self.TEST_QUERY,
            await_completion=True)

        # Check counts coming out of GetFirstClaim step.
        parse_first_claim_cnt = get_pipeline_metric(
            res, 'parse_firstclaim_success')
        self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

        # Check counts coming out of AddFeatures step.
        add_features_cnt = get_pipeline_metric(res, 'create_features_success')
        self.assertEqual(self.TOTAL_RECORDS, add_features_cnt)

        # Check counts coming out of AddLabel step.
        broad_cnt = get_pipeline_metric(res, 'add_label_broad')
        narrow_cnt = get_pipeline_metric(res, 'add_label_narrow')
        self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt)

        # Check the number of records coming out of the train/test split step.
        splits = ['train_cnt', 'eval_cnt', 'test_cnt']
        train_test_split_cnt = sum(
            [get_pipeline_metric(res, m) for m in splits])
        self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt)

        # Check that the number of protos created matches the train/test split output.
        create_proto_success = sum([
            get_pipeline_metric(res, 'create_proto_success', index=i)
            for i in range(3)
        ])
        self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

        # Open a tf.Example and check its fields.
        example = read_example_proto(test_dir)
        for feature_name in preprocess.FEATURE_NAMES:
            self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)
        # Make sure label feature is present.
        labels = ['broad', 'narrow']
        self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels)
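
The metric checks above assume a get_pipeline_metric helper that reads committed counter values off the pipeline result. A sketch using Beam's metrics API; the real helper may differ, e.g. in how it treats missing counters:

from apache_beam.metrics.metric import MetricsFilter

def get_pipeline_metric(pipeline_result, metric_name, index=0):
    # Query counters by name; `index` selects among multiple counters
    # that share the same name.
    metrics = pipeline_result.metrics().query(
        MetricsFilter().with_name(metric_name))
    counters = metrics['counters']
    return counters[index].committed if counters else 0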
Example #20
  def run_pipeline(self, **opts):
    test_pipeline = TestPipeline(is_integration_test=True)
    argv = test_pipeline.get_full_options_as_args(**opts)
    parser = argparse.ArgumentParser()
    unused_known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    return dataflow_exercise_metrics_pipeline.apply_and_run(p)
Example #21
    def run_pipeline(self, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)
        argv = test_pipeline.get_full_options_as_args(**opts)
        parser = argparse.ArgumentParser()
        unused_known_args, pipeline_args = parser.parse_known_args(argv)

        # We use the save_main_session option because one or more DoFns in this
        # workflow rely on global context (e.g., a module imported at module level).
        pipeline_options = PipelineOptions(pipeline_args)
        pipeline_options.view_as(SetupOptions).save_main_session = True
        p = beam.Pipeline(options=pipeline_options)
        return dataflow_exercise_metrics_pipeline.apply_and_run(p)
Example #22
  def test_estimate_pi_output_file(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    temp_folder = tempfile.mkdtemp()
    extra_opts = {'output': os.path.join(temp_folder, 'result')}
    estimate_pi.run(test_pipeline.get_full_options_as_args(**extra_opts))
    # Load the result file and compare.
    with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file:
      [_, _, estimated_pi] = json.loads(result_file.read().strip())
    # Note: this test can fail, but only with a vanishingly small
    # probability, since we run at least 100,000 trials.
    self.assertTrue(3.125 <= estimated_pi <= 3.155)
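
open_shards concatenates the matching output shards into a single readable handle. A minimal local-filesystem sketch; Beam provides a comparable utility in apache_beam.testing.util:

import glob
import tempfile

def open_shards(glob_pattern):
    # Merge all matching shard files into one temporary file and return
    # it rewound, ready for reading.
    merged = tempfile.NamedTemporaryFile(mode='w+', delete=False)
    for path in sorted(glob.glob(glob_pattern)):
        with open(path) as shard:
            merged.write(shard.read())
    merged.seek(0)
    return merged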
Example #23
class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(
        project=self.test_pipeline.get_option('project'))
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC)
    self.input_sub = self.input_topic.subscription(INPUT_SUB)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB)

    self._cleanup_pubsub()

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('developing_test')
  def test_streaming_wordcount_it(self):
    # Set extra options on the pipeline for test purposes
    pipeline_verifiers = [PipelineStateMatcher(PipelineState.RUNNING)]
    extra_opts = {'input_sub': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #24
class MatchIntegrationTest(unittest.TestCase):

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  KINGLEAR_CHECKSUM = 'f418b25f1507f5a901257026b035ac2857a7ab87'
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-00000000000*.json')

  WIKI_FILES = [
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000001.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000002.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000003.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000004.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000005.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000006.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000007.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000008.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000009.json',
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

  @attr('IT')
  def test_transform_on_gcs(self):
    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      matches_pc = (
          p
          | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
          | fileio.MatchAll()
          | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

      assert_that(
          matches_pc,
          equal_to([self.INPUT_FILE] + self.WIKI_FILES),
          label='Matched Files')

      checksum_pc = (
          p
          | 'SingleFile' >> beam.Create([self.INPUT_FILE])
          | 'MatchOneAll' >> fileio.MatchAll()
          | fileio.ReadMatches()
          | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
          | 'Checksums' >> beam.Map(compute_hash))

      assert_that(
          checksum_pc,
          equal_to([self.KINGLEAR_CHECKSUM]),
          label='Assert Checksums')
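
compute_hash reduces a file's lines to a single checksum for comparison against KINGLEAR_CHECKSUM. A plausible sketch assuming a SHA-1 over the sorted lines; the exact hashing scheme used by the suite is an assumption:

import hashlib

def compute_hash(content):
    # Sort first so the checksum is independent of read order.
    content.sort()
    m = hashlib.sha1()
    for line in content:
        m.update(line.encode('utf-8'))
    return m.hexdigest()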
Example #25
  def run_datastore_write(self, limit=None):
    test_pipeline = TestPipeline(is_integration_test=True)
    current_time = datetime.now().strftime("%m%d%H%M%S")
    seed = random.randint(0, 100000)
    kind = 'testkind%s%d' % (current_time, seed)
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {'kind': kind,
                  'num_entities': self.NUM_ENTITIES,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    if limit is not None:
      extra_opts['limit'] = limit

    datastore_write_it_pipeline.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Example #26
  def run_datastore_write(self, limit=None):
    test_pipeline = TestPipeline(is_integration_test=True)
    current_time = datetime.now().strftime("%m%d%H%M%S")
    seed = random.randint(0, 100000)
    kind = 'testkind%s%d' % (current_time, seed)
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {'kind': kind,
                  'num_entities': self.NUM_ENTITIES,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    if limit is not None:
      extra_opts['limit'] = limit

    datastore_write_it_pipeline.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Example #27
class HourlyTeamScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  # SHA-1 hash generated from sorted rows read from the BigQuery table
  DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
  OUTPUT_DATASET = 'hourly_team_score_it_dataset'
  OUTPUT_TABLE = 'leader_board'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')

    # Set up BigQuery environment
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_hourly_team_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    query = ('SELECT COUNT(*) FROM [%s:%s.%s]' % (self.project,
                                                  self.dataset.name,
                                                  self.OUTPUT_TABLE))

    bigquery_verifier = BigqueryMatcher(self.project,
                                        query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'input': self.DEFAULT_INPUT_FILE,
                  'dataset': self.dataset.name,
                  'window_duration': 1,
                  'on_success_matcher': all_of(state_verifier,
                                               bigquery_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE)

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #28
class HourlyTeamScoreIT(unittest.TestCase):

    DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
    # SHA-1 hash generated from sorted rows read from the BigQuery table
    DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
    OUTPUT_DATASET = 'hourly_team_score_it_dataset'
    OUTPUT_TABLE = 'leader_board'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')

        # Set up BigQuery environment
        from google.cloud import bigquery
        client = bigquery.Client()
        unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
        self.dataset = client.dataset(unique_dataset_name,
                                      project=self.project)
        self.dataset.create()

    def _cleanup_dataset(self):
        self.dataset.delete()

    @attr('IT')
    def test_hourly_team_score_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        query = ('SELECT COUNT(*) FROM [%s:%s.%s]' %
                 (self.project, self.dataset.name, self.OUTPUT_TABLE))

        bigquery_verifier = BigqueryMatcher(self.project, query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'input': self.DEFAULT_INPUT_FILE,
            'dataset': self.dataset.name,
            'window_duration': 1,
            'on_success_matcher': all_of(state_verifier, bigquery_verifier)
        }

        # Register cleanup before pipeline execution.
        # Note that actual execution happens in reverse order.
        self.addCleanup(self._cleanup_dataset)
        self.addCleanup(utils.delete_bq_table, self.project, self.dataset.name,
                        self.OUTPUT_TABLE)

        # Get pipeline options from the --test-pipeline-options command-line
        # argument and start the pipeline job by calling its main function.
        hourly_team_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #29
  def test_run_example_with_setup_file(self):
    pipeline = TestPipeline(is_integration_test=True)
    coordinate_output = FileSystems.join(
        pipeline.get_option('output'),
        'juliaset-{}'.format(str(uuid.uuid4())),
        'coordinates.txt')
    extra_args = {
        'coordinate_output': coordinate_output,
        'grid_size': self.GRID_SIZE,
        'setup_file': os.path.normpath(
            os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
        'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
    }
    args = pipeline.get_full_options_as_args(**extra_args)

    juliaset.run(args)
Example #30
class MatchIntegrationTest(unittest.TestCase):

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  KINGLEAR_CHECKSUM = 'f418b25f1507f5a901257026b035ac2857a7ab87'
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-00000000000*.json')

  WIKI_FILES = [
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000001.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000002.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000003.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000004.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000005.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000006.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000007.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000008.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000009.json',
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

  @attr('IT')
  def test_transform_on_gcs(self):
    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      matches_pc = (p
                    | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
                    | fileio.MatchAll()
                    | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

      assert_that(matches_pc,
                  equal_to([self.INPUT_FILE] + self.WIKI_FILES),
                  label='Matched Files')

      checksum_pc = (p
                     | 'SingleFile' >> beam.Create([self.INPUT_FILE])
                     | 'MatchOneAll' >> fileio.MatchAll()
                     | fileio.ReadMatches()
                     | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
                     | 'Checksums' >> beam.Map(compute_hash))

      assert_that(checksum_pc,
                  equal_to([self.KINGLEAR_CHECKSUM]),
                  label='Assert Checksums')
Example #31
  def test_top_wikipedia_sessions_output_files_on_small_input(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    # Set up the files with the expected content.
    temp_folder = tempfile.mkdtemp()
    self.create_content_input_file(
        os.path.join(temp_folder, 'input.txt'), '\n'.join(self.EDITS))
    extra_opts = {
        'input': '%s/input.txt' % temp_folder,
        'output': os.path.join(temp_folder, 'result'),
        'sampling_threshold': '1.0'
    }
    top_wikipedia_sessions.run(
        test_pipeline.get_full_options_as_args(**extra_opts))

    # Load result file and compare.
    with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file:
      result = result_file.read().strip().splitlines()

    self.assertEqual(self.EXPECTED, sorted(result, key=lambda x: x.split()[0]))
Example #32
  def test_autocomplete_output_files_on_small_input(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    # Set up the files with the expected content.
    OUTPUT_FILE_DIR = \
        'gs://temp-storage-for-end-to-end-tests/py-it-cloud/output'
    output = '/'.join([OUTPUT_FILE_DIR, str(uuid.uuid4()), 'result'])
    INPUT_FILE_DIR = \
        'gs://temp-storage-for-end-to-end-tests/py-it-cloud/input'
    input = '/'.join([INPUT_FILE_DIR, str(uuid.uuid4()), 'input.txt'])
    create_content_input_file(input, ' '.join(self.WORDS))
    extra_opts = {'input': input, 'output': output}

    autocomplete.run(test_pipeline.get_full_options_as_args(**extra_opts))

    # Load result file and compare.
    result = read_gcs_output_file(output).strip()

    self.assertEqual(
        sorted(self.EXPECTED_PREFIXES), sorted(format_output_file(result)))
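
read_gcs_output_file and format_output_file are helpers assumed by this example. A sketch of the reader using FileSystems.match and FileSystems.open; the suite's actual helper may differ:

from apache_beam.io.filesystems import FileSystems

def read_gcs_output_file(file_pattern):
    # Concatenate the contents of every shard matching the pattern.
    match_result = FileSystems.match([file_pattern + '*'])[0]
    output = []
    for metadata in match_result.metadata_list:
        with FileSystems.open(metadata.path) as f:
            output.append(f.read().decode('utf-8'))
    return ''.join(output)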
Example #33
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    output_table = ('BigQueryTornadoesIT'
                    '.monthly_tornadoes_%s' % int(round(time.time() * 1000)))
    query = 'SELECT month, tornado_count FROM [%s]' % output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=test_pipeline.get_option('project'),
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Example #34
class HourlyTeamScoreIT(unittest.TestCase):

    DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
    # SHA-1 hash generated from sorted rows read from the BigQuery table
    DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
    OUTPUT_DATASET = 'hourly_team_score_it_dataset'
    OUTPUT_TABLE = 'leader_board'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')

        # Set up BigQuery environment
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   self.OUTPUT_DATASET)

    @pytest.mark.it_postcommit
    def test_hourly_team_score_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        query = (
            'SELECT COUNT(*) FROM `%s.%s.%s`' %
            (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))

        bigquery_verifier = BigqueryMatcher(self.project, query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'input': self.DEFAULT_INPUT_FILE,
            'dataset': self.dataset_ref.dataset_id,
            'window_duration': 1,
            'on_success_matcher': all_of(state_verifier, bigquery_verifier)
        }

        # Register cleanup before pipeline execution.
        # Note that actual execution happens in reverse order.
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Get pipeline options from the --test-pipeline-options command-line
        # argument and start the pipeline job by calling its main function.
        hourly_team_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)
Example #35
  def test_wordcount_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    output = '/'.join([test_pipeline.get_option('output'),
                       str(int(time.time())),
                       'results'])
    arg_sleep_secs = test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    pipeline_verifiers = [PipelineStateMatcher(),
                          FileChecksumMatcher(output + '*-of-*',
                                              self.DEFAULT_CHECKSUM,
                                              sleep_secs)]
    extra_opts = {'output': output,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Get pipeline options from the --test-pipeline-options command-line
    # argument and start the pipeline job by calling its main function.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #36
    def test_coders_output_files_on_small_input(self):
        test_pipeline = TestPipeline(is_integration_test=True)

        # Set up the files with the expected content.
        temp_folder = tempfile.mkdtemp()
        self.create_content_input_file(
            os.path.join(temp_folder, 'input.txt'),
            '\n'.join(map(json.dumps, self.SAMPLE_RECORDS)))
        extra_opts = {
            'input': '%s/input.txt' % temp_folder,
            'output': os.path.join(temp_folder, 'result')
        }
        coders.run(test_pipeline.get_full_options_as_args(**extra_opts))

        # Load result file and compare.
        with open_shards(os.path.join(temp_folder,
                                      'result-*-of-*')) as result_file:
            result = result_file.read().strip()

        self.assertEqual(sorted(self.EXPECTED_RESULT),
                         sorted(self.format_result(result)))
Example #37
    def test_autocomplete_output_files_on_small_input(self):
        logging.error('SAVE_MAIN_SESSION')
        test_pipeline = TestPipeline(is_integration_test=True)
        # Set up the files with the expected content.
        temp_folder = tempfile.mkdtemp()
        create_content_input_file(os.path.join(temp_folder, 'input.txt'),
                                  ' '.join(self.WORDS))
        extra_opts = {
            'input': '%s/input.txt' % temp_folder,
            'output': os.path.join(temp_folder, 'result')
        }

        autocomplete.run(test_pipeline.get_full_options_as_args(**extra_opts))

        # Load result file and compare.
        with open_shards(os.path.join(temp_folder,
                                      'result-*-of-*')) as result_file:
            result = result_file.read().strip()

        self.assertEqual(sorted(self.EXPECTED_PREFIXES),
                         sorted(format_output_file(result)))
Example #38
    def test_inference_mode(self):
        """Runs a pipeline in inference mode which should output one fileset."""
        test_pipeline = TestPipeline()
        # Set extra options on the pipeline for test purposes
        test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
        self.addCleanup(shutil.rmtree, test_dir)

        # Checks that pipeline reaches state "Done"
        pipeline_verifiers = [PipelineStateMatcher()]
        extra_opts = {
            'project': PROJECT,
            'output_path': test_dir,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'runner': 'DirectRunner',
            'pipeline_mode': 'inference',
        }

        res = preprocess.main(
            test_pipeline.get_full_options_as_args(**extra_opts),
            query=self.TEST_QUERY,
            await_completion=True)

        # Check counts coming out of GetFirstClaim step.
        parse_first_claim_cnt = get_pipeline_metric(
            res, 'parse_firstclaim_success')
        self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

        # Ensure a proto is created for all input records
        create_proto_success = get_pipeline_metric(res, 'create_proto_success')
        self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

        # Open a tf.Example and check its fields.
        example = read_example_proto(test_dir)
        for feature_name in preprocess.FEATURE_NAMES:
            self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)

        # Make sure label feature is not present since we are in inference.
        with self.assertRaises(IndexError):
            get_tf_feature(example, 'label', 'bytes_list')
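
get_tf_feature pulls a single value out of a tf.train.Example; an absent feature yields an empty value list, so indexing raises the IndexError the assertion above expects. A hedged sketch; the suite's real helper may differ:

def get_tf_feature(example, feature_name, feature_type='int64_list'):
    # Protobuf map access returns an empty Feature for missing names,
    # so values[0] raises IndexError when the feature is absent.
    values = getattr(
        example.features.feature[feature_name], feature_type).value
    value = values[0]
    return value.decode('utf-8') if isinstance(value, bytes) else value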
Example #39
  def test_datastore_wordcount_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    dataset = test_pipeline.get_option("project")
    kind = self.DATASTORE_WORDCOUNT_KIND
    output = '/'.join([test_pipeline.get_option('output'),
                       str(int(time.time() * 1000)),
                       'datastore_wordcount_results'])

    arg_sleep_secs = test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    pipeline_verifiers = [PipelineStateMatcher(),
                          FileChecksumMatcher(output + '*-of-*',
                                              self.EXPECTED_CHECKSUM,
                                              sleep_secs)]
    extra_opts = {'dataset': dataset,
                  'kind': kind,
                  'output': output,
                  'read_only': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    datastore_wordcount.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Example #40
class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_bq_streaming_inserts_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_value_provider_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    schema = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

      _ = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_1)),
               schema=value_provider.StaticValueProvider(dict, schema),
               method='STREAMING_INSERTS'))
      _ = (input
           | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_2)),
               method='FILE_LOADS'))

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    full_output_table_1 = '%s:%s' % (self.project, output_table_1)
    full_output_table_2 = '%s:%s' % (self.project, output_table_2)

    schema1 = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}
    schema2 = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    bad_record = {'language': 1, 'manguage': 2}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      input2 = p | "Broken record" >> beam.Create([bad_record])

      input = (input, input2) | beam.Flatten()

      r = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=lambda x: (full_output_table_1
                                if 'language' in x
                                else full_output_table_2),
               schema=lambda dest: (schema1
                                    if dest == full_output_table_1
                                    else schema2),
               method='STREAMING_INSERTS'))

      assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                  equal_to([(full_output_table_1, bad_record)]))

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)
Example #41
class BigQueryFileLoadsIT(unittest.TestCase):

  BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
  BIG_QUERY_SCHEMA = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "language","type": "STRING"}]}'
  )

  BIG_QUERY_SCHEMA_2 = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "foundation","type": "STRING"}]}'
  )

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      # Gather all the input on the same machine.
      input = (input
               | beam.Map(lambda x: (None, x))
               | beam.GroupByKey()
               | beam.FlatMap(lambda elm: elm[1]))

      _ = (input |
           "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_1
                                if 'language' in x
                                else output_table_2),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

      _ = (input |
           "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_3
                                if 'language' in x
                                else output_table_4),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
               max_file_size=20,
               max_files_per_bundle=-1))

  @attr('IT')
  def test_one_job_fails_all_jobs_fail(self):
    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[])]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input |
             "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x
                                  else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))

  def tearDown(self):
    request = bigquery_api.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)
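
Each setUp in these classes derives a unique dataset id from a fixed prefix, the current epoch time, and a random suffix, so concurrent test runs cannot collide. A small helper capturing that scheme could look like this sketch (the helper name is ours, not Beam's):

import random
import time


def unique_dataset_id(prefix):
  # Same naming scheme as the setUp methods above:
  # prefix + epoch seconds + random suffix.
  return '%s%d%d' % (prefix, int(time.time()), random.randint(0, 10000))
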
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
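
The _setup_new_types_env method above assembles its TableSchema one field at a time. When several tests need ad-hoc schemas, a helper can cut that repetition; this sketch uses the same generated BigQuery client classes, and the function name is our own:

from apache_beam.io.gcp.internal.clients import bigquery


def make_table_schema(fields):
  # fields: iterable of (name, type) pairs, e.g.
  # [('bytes', 'BYTES'), ('date', 'DATE'), ('time', 'TIME')].
  schema = bigquery.TableSchema()
  for name, field_type in fields:
    field = bigquery.TableFieldSchema()
    field.name = name
    field.type = field_type
    schema.fields.append(field)
  return schema
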
class LeaderBoardIT(unittest.TestCase):

  # Input event containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'leader_board_it_input_topic'
  INPUT_SUB = 'leader_board_it_input_subscription'

  # SHA-1 hash generated from the sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = 'de00231fe6730b972c0ff60a99988438911cda53'
  OUTPUT_DATASET = 'leader_board_it_dataset'
  OUTPUT_TABLE_USERS = 'leader_board_users'
  OUTPUT_TABLE_TEAMS = 'leader_board_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 10 * 60 * 1000   # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    unique_topic_name = self.INPUT_TOPIC + _unique_id
    unique_subscription_name = self.INPUT_SUB + _unique_id
    self.input_topic = self.pubsub_client.topic(unique_topic_name)
    self.input_sub = self.input_topic.subscription(unique_subscription_name)

    self.input_topic.create()
    test_utils.wait_for_topics_created([self.input_topic])
    self.input_sub.create()

    # Set up BigQuery environment
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""

    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.full_name)

    for _ in range(message_count):
      topic.publish(self.INPUT_EVENT % self._test_timestamp)

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub])
    test_utils.cleanup_topics([self.input_topic])

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_leader_board_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    success_condition = 'total_score=5000 LIMIT 1'
    users_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_USERS,
                                 success_condition))
    bq_users_verifier = BigqueryMatcher(self.project,
                                        users_query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    teams_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_TEAMS,
                                 success_condition))
    bq_teams_verifier = BigqueryMatcher(self.project,
                                        teams_query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'subscription': self.input_sub.full_name,
                  'dataset': self.dataset.name,
                  'topic': self.input_topic.full_name,
                  'team_window_duration': 1,
                  'wait_until_finish_duration':
                      self.WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               bq_users_verifier,
                                               bq_teams_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_USERS)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_TEAMS)

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_topic,
                                               self.input_sub])
    self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    leader_board.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
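
The cleanup registration above depends on unittest running addCleanup callbacks in reverse (LIFO) order, so the BigQuery tables are dropped before the dataset that holds them and PubSub is torn down last. A toy illustration:

import unittest


class CleanupOrderDemo(unittest.TestCase):
  def test_cleanup_order(self):
    order = []
    self.addCleanup(order.append, 'registered first, runs last')
    self.addCleanup(order.append, 'registered last, runs first')
    self.doCleanups()  # normally invoked by the test framework
    self.assertEqual(order, ['registered last, runs first',
                             'registered first, runs last'])
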
Example #44
  def test_option_args_parsing(self):
    test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
    self.assertListEqual(
        sorted(test_pipeline.get_full_options_as_args()),
        sorted(self.TEST_CASE['expected_list']))
Example #45
  def test_empty_option_args_parsing(self):
    test_pipeline = TestPipeline()
    self.assertListEqual([],
                         test_pipeline.get_full_options_as_args())
Example #46
  def test_create_test_pipeline_options(self):
    test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
    test_options = PipelineOptions(test_pipeline.get_full_options_as_args())
    self.assertDictContainsSubset(self.TEST_CASE['expected_dict'],
                                  test_options.get_all_options())
Example #47
  def test_append_extra_options(self):
    test_pipeline = TestPipeline()
    for case in self.EXTRA_OPT_CASES:
      opt_list = test_pipeline.get_full_options_as_args(**case['options'])
      self.assertListEqual(sorted(opt_list), sorted(case['expected']))
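
Examples #44-#47 reference TEST_CASE and EXTRA_OPT_CASES fixtures defined elsewhere in their test module. A plausible shape for those fixtures, reconstructed from how the assertions use them (the concrete values are illustrative only):

TEST_CASE = {
    'options': ['--project=test-project', '--job_name=test-job'],
    'expected_list': ['--project=test-project', '--job_name=test-job'],
    'expected_dict': {'project': 'test-project', 'job_name': 'test-job'},
}

EXTRA_OPT_CASES = [
    # Non-boolean values are rendered as --key=value; True becomes a bare flag.
    {'options': {'num_workers': 1}, 'expected': ['--num_workers=1']},
    {'options': {'streaming': True}, 'expected': ['--streaming']},
]
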
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = {
      # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
      # label_ids, nor writing timestamp attributes. Once these features exist,
      # TestDirectRunner and TestDataflowRunner should behave identically.
      'TestDirectRunner': [
          PubsubMessage('data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
      'TestDataflowRunner': [
          # Use ID_LABEL attribute to deduplicate messages with the same ID.
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          })
      ],
  }
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
      'TestDataflowRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
  }

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
    self.output_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, INPUT_SUB + self.uuid),
        self.input_topic.name)
    self.output_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, OUTPUT_SUB + self.uuid),
        self.output_topic.name)

  def tearDown(self):
    test_utils.cleanup_subscriptions(self.sub_client,
                                     [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(self.pub_client,
                              [self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output. These
    # verifications run on a (remote) worker.

    # Expect the state to be RUNNING since a streaming pipeline is usually
    # never DONE. The test runner will cancel the pipeline after verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {'input_subscription': self.input_sub.name,
                  'output_topic': self.output_topic.name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    for msg in self.INPUT_MESSAGES[self.runner_name]:
      self.pub_client.publish(self.input_topic.name, msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
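
The strip_attributes argument above exists because some runners echo the id_label and timestamp attributes back on output messages, so the matcher removes them before comparing. Conceptually that reduces to a filter like this sketch (names are placeholders):

def strip_attributes(attributes, keys_to_strip):
  # Drop runner-added attributes so only the pipeline's own output
  # attributes are compared against expectations.
  return {k: v for k, v in attributes.items() if k not in keys_to_strip}


assert strip_attributes(
    {'id': 'foo', 'timestamp': 't0', 'processed': 'IT'},
    ['id', 'timestamp']) == {'processed': 'IT'}
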
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for ' +
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY
                 }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(sys.version_info[0] == 3 and
                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                   'This test still needs to be fixed on Python 3. '
                   'TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
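
The BigqueryMatcher used throughout these tests compares a checksum of the sorted query results against an expected value, so row ordering never affects verification. A rough sketch of the idea (not the exact apache_beam.testing.test_utils implementation):

import hashlib


def compute_hash(rows):
  # Sort the stringified rows first so the digest is order-independent.
  digest = hashlib.sha1()
  for row in sorted(str(row) for row in rows):
    digest.update(row.encode('utf-8'))
  return digest.hexdigest()
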
Example #50
class GameStatsIT(unittest.TestCase):

  # Input events containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'game_stats_it_input_topic'
  INPUT_SUB = 'game_stats_it_input_subscription'

  # SHA-1 hash generated from the sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = '5288ccaab77d347c8460d77c15a0db234ef5eb4f'
  OUTPUT_DATASET = 'game_stats_it_dataset'
  OUTPUT_TABLE_SESSIONS = 'game_stats_sessions'
  OUTPUT_TABLE_TEAMS = 'game_stats_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 12 * 60 * 1000   # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, self.INPUT_TOPIC + _unique_id))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          self.INPUT_SUB + _unique_id),
        self.input_topic.name)

    # Set up BigQuery environment
    self.dataset_ref = utils.create_bq_dataset(self.project,
                                               self.OUTPUT_DATASET)

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""

    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.name)

    for _ in range(message_count):
      self.pub_client.publish(topic.name,
                              (self.INPUT_EVENT % self._test_timestamp
                              ).encode('utf-8'))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
    test_utils.cleanup_topics(self.pub_client, [self.input_topic])

  @attr('IT')
  def test_game_stats_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    success_condition = 'mean_duration=300 LIMIT 1'
    sessions_query = ('SELECT mean_duration FROM `%s.%s.%s` '
                      'WHERE %s' % (self.project,
                                    self.dataset_ref.dataset_id,
                                    self.OUTPUT_TABLE_SESSIONS,
                                    success_condition))
    bq_sessions_verifier = BigqueryMatcher(self.project,
                                           sessions_query,
                                           self.DEFAULT_EXPECTED_CHECKSUM)

    # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

    extra_opts = {'subscription': self.input_sub.name,
                  'dataset': self.dataset_ref.dataset_id,
                  'topic': self.input_topic.name,
                  'fixed_window_duration': 1,
                  'user_activity_window_duration': 1,
                  'wait_until_finish_duration':
                      self.WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               bq_sessions_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Generate input data and inject to PubSub.
    self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    game_stats.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = [
      # Use ID_LABEL attribute to deduplicate messages with the same ID.
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the IT
      # pipeline writes back the timestamp of each element (as reported by
      # Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
      PubsubMessage('data002', {
          TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
      }),
  ]
  EXPECTED_OUTPUT_MESSAGES = [
      PubsubMessage('data001-seen', {'processed': 'IT'}),
      PubsubMessage('data002-seen', {
          TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
          'processed': 'IT',
      }),
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def tearDown(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Build the expected dataset.
    # Set extra options on the pipeline for test purposes.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES
    if not with_attributes:
      expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        OUTPUT_SUB + self.uuid,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    for msg in self.INPUT_MESSAGES:
      self.input_topic.publish(msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
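
When with_attributes is False, _test_streaming compares payloads only, which is why the expected list collapses to msg.data. A tiny illustration using the same PubsubMessage class (payloads are placeholders):

from apache_beam.io.gcp.pubsub import PubsubMessage

expected = [PubsubMessage(b'data001-seen', {'processed': 'IT'}),
            PubsubMessage(b'data002-seen', {'processed': 'IT'})]
data_only = [msg.data for msg in expected]
assert data_only == [b'data001-seen', b'data002-seen']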