示例#1
0
    def setUp(self):
        """Create the test pipeline and, if configured, a metrics monitor.

        Parses the ``input_options`` pipeline option as JSON and reads the
        ``project``, ``metrics_table`` and ``metrics_dataset`` options.
        ``self.metrics_monitor`` is a ``MetricsMonitor`` only when all three
        metrics options are non-empty; otherwise it stays ``None`` and an
        error is logged.
        """
        self.pipeline = TestPipeline(is_integration_test=True)
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))

        metrics_project_id = self.pipeline.get_option('project')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')
        metrics_dataset = self.pipeline.get_option('metrics_dataset')
        self.metrics_monitor = None

        # Every option has to be non-empty.  In `a and b and c is not None`
        # only the last operand is compared with None, which would let an
        # empty dataset name through.
        metrics_configured = all(
            (metrics_project_id, self.metrics_namespace, metrics_dataset))
        if not metrics_configured:
            logging.error('One or more of parameters for collecting metrics '
                          'are empty. Metrics will not be collected')
            return

        schema = [{
            'name': RUNTIME_LABEL,
            'type': 'FLOAT',
            'mode': 'REQUIRED'
        }]
        self.metrics_monitor = MetricsMonitor(
            project_name=metrics_project_id,
            table=self.metrics_namespace,
            dataset=metrics_dataset,
            schema_map=schema)
示例#2
0
    def setUp(self):
        """Create the test pipeline and decide whether to publish metrics.

        Parses the ``input_options`` and ``co_input_options`` pipeline
        options as JSON.  When ``publish_to_big_query`` is set,
        ``self.metrics_monitor`` becomes a ``MetricsMonitor`` built from the
        ``project``, ``metrics_table`` and ``metrics_dataset`` options;
        otherwise it is ``None``.

        Raises:
            ValueError: if publishing is requested but any of the three
                metrics options is empty.
        """
        self.pipeline = TestPipeline()
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))
        self.co_input_options = json.loads(
            self.pipeline.get_option('co_input_options'))

        publish_metrics = self.pipeline.get_option('publish_to_big_query')
        metrics_project_id = self.pipeline.get_option('project')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')
        metrics_dataset = self.pipeline.get_option('metrics_dataset')

        # Hold either None or a real monitor, never the raw option value, so
        # that `is not None` checks on this attribute are reliable.
        self.metrics_monitor = None
        if not publish_metrics:
            logging.info('Metrics will not be collected')
            return

        # Every option must be non-empty; in `a and b and c is not None` only
        # the last operand is compared with None.
        if not all(
                (metrics_project_id, self.metrics_namespace, metrics_dataset)):
            raise ValueError(
                'One or more of parameters for collecting metrics '
                'are empty.')

        self.metrics_monitor = MetricsMonitor(
            project_name=metrics_project_id,
            table=self.metrics_namespace,
            dataset=metrics_dataset,
        )
示例#3
0
  def setUp(self):
    """Create the test pipeline and, if configured, a metrics monitor.

    Reads the ``output`` and ``number_of_counter_operations`` options, parses
    ``input_options`` as JSON, and reads the ``project``, ``metrics_table``
    and ``metrics_dataset`` options.  ``self.metrics_monitor`` is a
    ``MetricsMonitor`` (runtime and counter columns) only when all three
    metrics options are non-empty; otherwise it stays ``None`` and an error
    is logged.
    """
    self.pipeline = TestPipeline(is_integration_test=True)

    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None

    # All three values are handed to MetricsMonitor, so all three must be
    # non-empty.  `a and b is not None` would compare only `b` with None and
    # would never look at the dataset.
    metrics_configured = all(
        (metrics_project_id, self.metrics_namespace, metrics_dataset))
    if not metrics_configured:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    measured_values = [
        {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
        {'name': COUNTER_LABEL, 'type': 'INTEGER', 'mode': 'REQUIRED'}
    ]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=measured_values
    )
示例#4
0
class CombineTest(unittest.TestCase):
  """Load test that runs ``CombineGlobally`` over synthetic records."""

  def parseTestPipelineOptions(self):
    """Map ``self.input_options`` onto the keys given to ``SyntheticSource``.

    Returns:
      A dict with camelCase keys.  The bundle size distribution defaults to
      type ``'const'`` with parameter 0 and ``forceNumInitialBundles``
      defaults to 0; the other values are ``None`` when the corresponding
      input option is missing.
    """
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    """Create the test pipeline and decide whether to publish metrics.

    Parses the ``input_options`` pipeline option as JSON.  When
    ``publish_to_big_query`` is set, ``self.metrics_monitor`` becomes a
    ``MetricsMonitor`` built from the ``project``, ``metrics_table`` and
    ``metrics_dataset`` options; otherwise it is ``None``.

    Raises:
      ValueError: if publishing is requested but any of the three metrics
        options is empty.
    """
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish_metrics = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Hold either None or a real monitor, never the raw option value:
    # testCombineGlobally decides with `is not None` whether to send metrics.
    self.metrics_monitor = None
    if not publish_metrics:
      logging.info('Metrics will not be collected')
      return

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  class _GetElement(beam.DoFn):
    """Pass-through ``DoFn`` applied to the combined output."""

    def process(self, element):
      yield element

  def testCombineGlobally(self):
    """Read synthetic data, combine it globally and report metrics.

    The pipeline wraps the ``CombineGlobally(TopCombineFn(1000))`` step
    between two ``MeasureTime`` steps, runs to completion, and sends the
    result to the metrics monitor when one was configured in ``setUp``.
    """
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
示例#5
0
    def setUp(self):
        """Create the test pipeline and, if configured, a metrics monitor.

        Parses ``input_options`` as JSON and stores
        ``number_of_counter_operations`` as an int (1 when the option is
        absent).  The metrics namespace falls back to the test class name
        when ``metrics_table`` is empty.  ``self.metrics_monitor`` is a
        ``MetricsMonitor`` only when the project, namespace and dataset are
        all non-empty; otherwise it stays ``None`` and an error is logged.
        """
        self.pipeline = TestPipeline()
        self.inputOptions = json.loads(
            self.pipeline.get_option('input_options'))

        iterations = self.pipeline.get_option('number_of_counter_operations')
        self.iterations = 1 if iterations is None else int(iterations)

        metrics_project_id = self.pipeline.get_option('project')
        # Fall back to the test class name when no table name is configured.
        self.metrics_namespace = (
            self.pipeline.get_option('metrics_table')
            or self.__class__.__name__)
        metrics_dataset = self.pipeline.get_option('metrics_dataset')
        self.metrics_monitor = None

        # Every value must be non-empty; in `a and b and c is not None` only
        # the last operand is compared with None, which would let an empty
        # dataset name through.
        metrics_configured = all(
            (metrics_project_id, self.metrics_namespace, metrics_dataset))
        if not metrics_configured:
            logging.error('One or more of parameters for collecting metrics '
                          'are empty. Metrics will not be collected')
            return

        measured_values = [
            {
                'name': RUNTIME_LABEL,
                'type': 'FLOAT',
                'mode': 'REQUIRED'
            },
        ]
        self.metrics_monitor = MetricsMonitor(
            project_name=metrics_project_id,
            table=self.metrics_namespace,
            dataset=metrics_dataset,
            schema_map=measured_values)
示例#6
0
class GroupByKeyTest(unittest.TestCase):
  """Load test that runs ``GroupByKey`` over synthetic records."""

  def parseTestPipelineOptions(self):
    """Map ``self.input_options`` onto the keys given to ``SyntheticSource``.

    Returns:
      A dict with camelCase keys.  The bundle size distribution defaults to
      type ``'const'`` with parameter 0 and ``forceNumInitialBundles``
      defaults to 0; the other values are ``None`` when the corresponding
      input option is missing.
    """
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    """Create the test pipeline and, if configured, a metrics monitor.

    Parses the ``input_options`` pipeline option as JSON and reads the
    ``project``, ``metrics_table`` and ``metrics_dataset`` options.
    ``self.metrics_monitor`` is a ``MetricsMonitor`` only when all three
    metrics options are non-empty; otherwise it stays ``None`` and an error
    is logged.
    """
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None, which would let an empty dataset
    # name through.
    metrics_configured = all(
        (metrics_project_id, self.metrics_namespace, metrics_dataset))
    if not metrics_configured:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    schema = [{'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=schema
    )

  def testGroupByKey(self):
    """Read synthetic data, group it by key, ungroup it and report metrics.

    The ``GroupByKey`` and ungrouping steps sit between two ``MeasureTime``
    steps.  After the run finishes the result is sent to the metrics monitor
    when one was configured in ``setUp``.
    """
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Ungroup' >> beam.FlatMap(
         lambda elm: [(elm[0], v) for v in elm[1]])
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    # Run explicitly rather than inside `with self.pipeline`: leaving a
    # pipeline's `with` block runs the pipeline as well, so combining both
    # would execute the whole load twice.
    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
  def setUp(self):
    """Create the test pipeline and, if configured, a metrics monitor.

    Parses the ``input_options`` and ``co_input_options`` pipeline options as
    JSON and reads the ``project``, ``metrics_table`` and ``metrics_dataset``
    options.  ``self.metrics_monitor`` is a ``MetricsMonitor`` only when all
    three metrics options are non-empty; otherwise it stays ``None`` and an
    error is logged.
    """
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.co_input_options = json.loads(
        self.pipeline.get_option('co_input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None, which would let an empty dataset
    # name through.
    metrics_configured = all(
        (metrics_project_id, self.metrics_namespace, metrics_dataset))
    if not metrics_configured:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    measured_values = [{'name': RUNTIME_LABEL,
                        'type': 'FLOAT',
                        'mode': 'REQUIRED'}]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=measured_values
    )
示例#8
0
  def setUp(self):
    """Create the test pipeline and decide whether to publish metrics.

    Reads the ``output`` and ``number_of_counter_operations`` options and
    parses ``input_options`` as JSON.  When ``publish_to_big_query`` is set,
    ``self.metrics_monitor`` becomes a ``MetricsMonitor`` built from the
    ``project``, ``metrics_table`` and ``metrics_dataset`` options; otherwise
    it is ``None``.

    Raises:
      ValueError: if publishing is requested but any of the three metrics
        options is empty.
    """
    self.pipeline = TestPipeline()

    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish_metrics = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Hold either None or a real monitor, never the raw option value, so that
    # `is not None` checks on this attribute are reliable.
    self.metrics_monitor = None
    if not publish_metrics:
      logging.info('Metrics will not be collected')
      return

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )
示例#9
0
class GroupByKeyTest(unittest.TestCase):
    """Load test that runs ``GroupByKey`` over synthetic records."""

    def parseTestPipelineOptions(self):
        """Map ``self.input_options`` onto ``SyntheticSource`` option keys.

        Returns:
            A dict with camelCase keys.  The bundle size distribution
            defaults to type ``'const'`` with parameter 0 and
            ``forceNumInitialBundles`` defaults to 0; the other values are
            ``None`` when the corresponding input option is missing.
        """
        options = self.input_options
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles':
                options.get('force_initial_num_bundles', 0)
        }

    def setUp(self):
        """Create the test pipeline and decide whether to publish metrics.

        Parses the ``input_options`` pipeline option as JSON.  When
        ``publish_to_big_query`` is set, ``self.metrics_monitor`` becomes a
        ``MetricsMonitor`` built from the ``project``, ``metrics_table`` and
        ``metrics_dataset`` options; otherwise it is ``None``.

        Raises:
            ValueError: if publishing is requested but any of the three
                metrics options is empty.
        """
        self.pipeline = TestPipeline()
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))

        publish_metrics = self.pipeline.get_option('publish_to_big_query')
        metrics_project_id = self.pipeline.get_option('project')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')
        metrics_dataset = self.pipeline.get_option('metrics_dataset')

        # Hold either None or a real monitor, never the raw option value:
        # testGroupByKey decides with `is not None` whether to send metrics.
        self.metrics_monitor = None
        if not publish_metrics:
            logging.info('Metrics will not be collected')
            return

        # Every option must be non-empty; in `a and b and c is not None` only
        # the last operand is compared with None.
        if not all(
                (metrics_project_id, self.metrics_namespace, metrics_dataset)):
            raise ValueError(
                'One or more of parameters for collecting metrics '
                'are empty.')

        self.metrics_monitor = MetricsMonitor(
            project_name=metrics_project_id,
            table=self.metrics_namespace,
            dataset=metrics_dataset,
        )

    def testGroupByKey(self):
        """Read synthetic data, group it by key, ungroup it, report metrics.

        The ``GroupByKey`` and ungrouping steps sit between two
        ``MeasureTime`` steps.  After the run finishes the result is sent to
        the metrics monitor when one was configured in ``setUp``.
        """
        # pylint: disable=expression-not-assigned
        (self.pipeline
         | beam.io.Read(
             synthetic_pipeline.SyntheticSource(
                 self.parseTestPipelineOptions()))
         | 'Measure time: Start' >> beam.ParDo(
             MeasureTime(self.metrics_namespace))
         | 'GroupByKey' >> beam.GroupByKey()
         | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v)
                                                  for v in elm[1]])
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))

        # Run explicitly rather than inside `with self.pipeline`: leaving a
        # pipeline's `with` block runs the pipeline as well, so combining
        # both would execute the whole load twice.
        result = self.pipeline.run()
        result.wait_until_finish()
        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
示例#10
0
class CoGroupByKeyTest(unittest.TestCase):
    """Load test that joins two synthetic inputs with ``CoGroupByKey``."""

    def parseTestPipelineOptions(self, options):
        """Map an input-options dict onto ``SyntheticSource`` option keys.

        Args:
            options: dict of snake_case input options.

        Returns:
            A dict with camelCase keys.  The bundle size distribution
            defaults to type ``'const'`` with parameter 0 and
            ``forceNumInitialBundles`` defaults to 0; the other values are
            ``None`` when the corresponding input option is missing.
        """
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles':
                options.get('force_initial_num_bundles', 0)
        }

    def setUp(self):
        """Create the test pipeline and decide whether to publish metrics.

        Parses the ``input_options`` and ``co_input_options`` pipeline
        options as JSON.  When ``publish_to_big_query`` is set,
        ``self.metrics_monitor`` becomes a ``MetricsMonitor`` built from the
        ``project``, ``metrics_table`` and ``metrics_dataset`` options;
        otherwise it is ``None``.

        Raises:
            ValueError: if publishing is requested but any of the three
                metrics options is empty.
        """
        self.pipeline = TestPipeline()
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))
        self.co_input_options = json.loads(
            self.pipeline.get_option('co_input_options'))

        publish_metrics = self.pipeline.get_option('publish_to_big_query')
        metrics_project_id = self.pipeline.get_option('project')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')
        metrics_dataset = self.pipeline.get_option('metrics_dataset')

        # Hold either None or a real monitor, never the raw option value:
        # testCoGroupByKey decides with `is not None` whether to send metrics.
        self.metrics_monitor = None
        if not publish_metrics:
            logging.info('Metrics will not be collected')
            return

        # Every option must be non-empty; in `a and b and c is not None` only
        # the last operand is compared with None.
        if not all(
                (metrics_project_id, self.metrics_namespace, metrics_dataset)):
            raise ValueError(
                'One or more of parameters for collecting metrics '
                'are empty.')

        self.metrics_monitor = MetricsMonitor(
            project_name=metrics_project_id,
            table=self.metrics_namespace,
            dataset=metrics_dataset,
        )

    class _Ungroup(beam.DoFn):
        """Emit the values stored under both tags of a joined element."""

        def process(self, element):
            # element[1] is looked up by tag; values for INPUT_TAG are
            # emitted before values for CO_INPUT_TAG.
            values = element[1]
            inputs = values.get(INPUT_TAG)
            co_inputs = values.get(CO_INPUT_TAG)
            for i in inputs:
                yield i
            for i in co_inputs:
                yield i

    def testCoGroupByKey(self):
        """Join two synthetic inputs, consume the result, report metrics.

        Each input is read, mapped to ``(x, x)`` pairs and timed, then both
        are joined with ``CoGroupByKey`` and consumed by ``_Ungroup``.  After
        the run finishes the result is sent to the metrics monitor when one
        was configured in ``setUp``.
        """
        pc1 = (self.pipeline
               | 'Read ' + INPUT_TAG >> beam.io.Read(
                   synthetic_pipeline.SyntheticSource(
                       self.parseTestPipelineOptions(self.input_options)))
               | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(
                   lambda x: (x, x))
               | 'Measure time: Start pc1' >> beam.ParDo(
                   MeasureTime(self.metrics_namespace)))

        pc2 = (self.pipeline
               | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
                   synthetic_pipeline.SyntheticSource(
                       self.parseTestPipelineOptions(self.co_input_options)))
               | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
                   lambda x: (x, x))
               | 'Measure time: Start pc2' >> beam.ParDo(
                   MeasureTime(self.metrics_namespace)))

        # pylint: disable=expression-not-assigned
        ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
         | 'CoGroupByKey: ' >> beam.CoGroupByKey()
         | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))

        result = self.pipeline.run()
        result.wait_until_finish()
        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
示例#11
0
class CoGroupByKeyTest(unittest.TestCase):
  """Load test that joins two synthetic inputs with ``CoGroupByKey``."""

  def parseTestPipelineOptions(self, options):
    """Map an input-options dict onto the keys given to ``SyntheticSource``.

    Args:
      options: dict of snake_case input options.

    Returns:
      A dict with camelCase keys.  The bundle size distribution defaults to
      type ``'const'`` with parameter 0 and ``forceNumInitialBundles``
      defaults to 0; the other values are ``None`` when the corresponding
      input option is missing.
    """
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    """Create the test pipeline and, if configured, a metrics monitor.

    Parses the ``input_options`` and ``co_input_options`` pipeline options as
    JSON and reads the ``project``, ``metrics_table`` and ``metrics_dataset``
    options.  ``self.metrics_monitor`` is a ``MetricsMonitor`` only when all
    three metrics options are non-empty; otherwise it stays ``None`` and an
    error is logged.
    """
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.co_input_options = json.loads(
        self.pipeline.get_option('co_input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None, which would let an empty dataset
    # name through.
    metrics_configured = all(
        (metrics_project_id, self.metrics_namespace, metrics_dataset))
    if not metrics_configured:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    measured_values = [{'name': RUNTIME_LABEL,
                        'type': 'FLOAT',
                        'mode': 'REQUIRED'}]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=measured_values
    )

  class _Ungroup(beam.DoFn):
    """Emit the values stored under both tags of a joined element."""

    def process(self, element):
      # element[1] is looked up by tag; values for INPUT_TAG are emitted
      # before values for CO_INPUT_TAG.
      values = element[1]
      inputs = values.get(INPUT_TAG)
      co_inputs = values.get(CO_INPUT_TAG)
      for i in inputs:
        yield i
      for i in co_inputs:
        yield i

  def testCoGroupByKey(self):
    """Join two synthetic inputs, consume the result and report metrics.

    Each input is read, mapped to ``(x, x)`` pairs and timed, then both are
    joined with ``CoGroupByKey`` and consumed by ``_Ungroup``.  After the run
    finishes the result is sent to the metrics monitor when one was
    configured in ``setUp``.
    """
    pc1 = (self.pipeline
           | 'Read ' + INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.input_options)))
           | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
           | 'Measure time: Start pc1' >> beam.ParDo(
               MeasureTime(self.metrics_namespace))
          )

    pc2 = (self.pipeline
           | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.co_input_options)))
           | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
               lambda x: (x, x))
           | 'Measure time: Start pc2' >> beam.ParDo(
               MeasureTime(self.metrics_namespace))
          )
    # pylint: disable=expression-not-assigned
    ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
     | 'CoGroupByKey: ' >> beam.CoGroupByKey()
     | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    # Run explicitly rather than inside `with self.pipeline`: leaving a
    # pipeline's `with` block runs the pipeline as well, so combining both
    # would execute the whole load twice.
    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
示例#12
0
class SideInputTest(unittest.TestCase):
  """Load test joining a synthetic main input with an iterable side input."""

  def _parseTestPipelineOptions(self):
    """Map ``self.inputOptions`` onto the keys given to ``SyntheticSource``.

    Returns:
      A dict with camelCase keys.  The bundle size distribution defaults to
      type ``'const'`` with parameter 0 and ``forceNumInitialBundles``
      defaults to 0; the other values are ``None`` when the corresponding
      input option is missing.
    """
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0
        )
    }

  def _getSideInput(self):
    """Return the ``SyntheticSource`` options for the side input.

    The side input uses exactly the same options as the main input; a fresh
    dict is returned on every call.
    """
    return self._parseTestPipelineOptions()

  # NOTE(review): the three getters below read ``self.syntheticStepOptions``,
  # which ``setUp`` in this class never assigns, and nothing in this class
  # calls them -- confirm whether they are still needed.
  def _getPerElementDelaySec(self):
    """Return ``per_element_delay_sec`` from the step options (default 0)."""
    return self.syntheticStepOptions.get('per_element_delay_sec', 0)

  def _getPerBundleDelaySec(self):
    """Return ``per_bundle_delay_sec`` from the step options (default 0)."""
    return self.syntheticStepOptions.get('per_bundle_delay_sec', 0)

  def _getOutputRecordsPerInputRecords(self):
    """Return ``output_records_per_input_records`` (default 0)."""
    return self.syntheticStepOptions.get('output_records_per_input_records', 0)

  def setUp(self):
    """Create the test pipeline and decide whether to publish metrics.

    Parses ``input_options`` as JSON and stores
    ``number_of_counter_operations`` as an int (1 when the option is absent).
    When ``publish_to_big_query`` is set, ``self.metrics_monitor`` becomes a
    ``MetricsMonitor`` built from the ``project``, ``metrics_table`` and
    ``metrics_dataset`` options; otherwise it is ``None``.

    Raises:
      ValueError: if publishing is requested but any of the three metrics
        options is empty.
    """
    self.pipeline = TestPipeline()
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))
    iterations = self.pipeline.get_option('number_of_counter_operations')
    self.iterations = 1 if iterations is None else int(iterations)

    publish_metrics = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Hold either None or a real monitor, never the raw option value:
    # testSideInput decides with `is not None` whether to send metrics.
    self.metrics_monitor = None
    if not publish_metrics:
      logging.info('Metrics will not be collected')
      return

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  def testSideInput(self):
    """Join the main input with the side input and report metrics.

    Both inputs are read from ``SyntheticSource`` and timed; the main input
    is then merged with the side input (passed as ``AsIter``) by ``join_fn``.
    After the run finishes the result is sent to the metrics monitor when one
    was configured in ``setUp``.
    """
    def join_fn(element, side_input, iterations):
      # The whole side input is walked on every pass; only the final pass
      # records the joined values.
      joined = []
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            joined.append({key: element[1]+value})
      yield joined

    main_input = (self.pipeline
                  | "Read pcoll 1" >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self._parseTestPipelineOptions()))
                  | 'Measure time: Start pcoll 1' >> beam.ParDo(
                      MeasureTime(self.metrics_namespace))
                 )

    side_input = (self.pipeline
                  | "Read pcoll 2" >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self._getSideInput()))
                  | 'Measure time: Start pcoll 2' >> beam.ParDo(
                      MeasureTime(self.metrics_namespace))
                 )
    # pylint: disable=expression-not-assigned
    (main_input
     | "Merge" >> beam.ParDo(
         join_fn,
         AsIter(side_input),
         self.iterations)
     | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)


# Kept at module level: inside the class body this guard would call
# unittest.main() before the class object itself exists.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()
示例#13
0
class CombineTest(unittest.TestCase):
  """Load test that runs ``CombineGlobally`` over synthetic records."""

  def parseTestPipelineOptions(self):
    """Map ``self.input_options`` onto the keys given to ``SyntheticSource``.

    Returns:
      A dict with camelCase keys.  The bundle size distribution defaults to
      type ``'const'`` with parameter 0 and ``forceNumInitialBundles``
      defaults to 0; the other values are ``None`` when the corresponding
      input option is missing.
    """
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    """Create the test pipeline and decide whether to publish metrics.

    Parses the ``input_options`` pipeline option as JSON.  When
    ``publish_to_big_query`` is set, ``self.metrics_monitor`` becomes a
    ``MetricsMonitor`` built from the ``project``, ``metrics_table`` and
    ``metrics_dataset`` options; otherwise it is ``None``.

    Raises:
      ValueError: if publishing is requested but any of the three metrics
        options is empty.
    """
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish_metrics = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Hold either None or a real monitor, never the raw option value:
    # testCombineGlobally decides with `is not None` whether to send metrics.
    self.metrics_monitor = None
    if not publish_metrics:
      logging.info('Metrics will not be collected')
      return

    # Every option must be non-empty; in `a and b and c is not None` only the
    # last operand is compared with None.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  class _GetElement(beam.DoFn):
    """Pass-through ``DoFn`` applied to the combined output."""

    def process(self, element):
      yield element

  def testCombineGlobally(self):
    """Read synthetic data, combine it globally and report metrics.

    The pipeline wraps the ``CombineGlobally(TopCombineFn(1000))`` step
    between two ``MeasureTime`` steps, runs to completion, and sends the
    result to the metrics monitor when one was configured in ``setUp``.
    """
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    # Run explicitly rather than inside `with self.pipeline`: leaving a
    # pipeline's `with` block runs the pipeline as well, so combining both
    # would execute the whole load twice.
    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
示例#14
0
class ParDoTest(unittest.TestCase):
  """Load test that chains a configurable number of ``ParDo`` steps."""

  def parseTestPipelineOptions(self):
    """Map ``self.input_options`` onto the keys given to ``SyntheticSource``.

    Returns:
      A dict with camelCase keys.  The bundle size distribution defaults to
      type ``'const'`` with parameter 0 and ``forceNumInitialBundles``
      defaults to 0; the other values are ``None`` when the corresponding
      input option is missing.
    """
    return {'numRecords': self.input_options.get('num_records'),
            'keySizeBytes': self.input_options.get('key_size'),
            'valueSizeBytes': self.input_options.get('value_size'),
            'bundleSizeDistribution': {
                'type': self.input_options.get(
                    'bundle_size_distribution_type', 'const'
                ),
                'param': self.input_options.get(
                    'bundle_size_distribution_param', 0
                )
            },
            'forceNumInitialBundles': self.input_options.get(
                'force_initial_num_bundles', 0
            )
           }

  def setUp(self):
    """Create the test pipeline and, if configured, a metrics monitor.

    Reads the ``output`` and ``number_of_counter_operations`` options, parses
    ``input_options`` as JSON, and reads the ``project``, ``metrics_table``
    and ``metrics_dataset`` options.  ``self.metrics_monitor`` is a
    ``MetricsMonitor`` (runtime and counter columns) only when all three
    metrics options are non-empty; otherwise it stays ``None`` and an error
    is logged.
    """
    self.pipeline = TestPipeline(is_integration_test=True)

    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None

    # All three values are handed to MetricsMonitor, so all three must be
    # non-empty.  `a and b is not None` would compare only `b` with None and
    # would never look at the dataset.
    metrics_configured = all(
        (metrics_project_id, self.metrics_namespace, metrics_dataset))
    if not metrics_configured:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    measured_values = [
        {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
        {'name': COUNTER_LABEL, 'type': 'INTEGER', 'mode': 'REQUIRED'}
    ]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=measured_values
    )

  def testParDo(self):
    """Run the synthetic input through the ``ParDo`` chain, report metrics.

    The number of ``ParDo`` steps is ``number_of_counter_operations`` (1 when
    the option is absent); only the last step emits elements.  The output is
    written to text when the ``output`` option is set.  After the run
    finishes the result is sent to the metrics monitor when one was
    configured in ``setUp``.
    """

    class _GetElement(beam.DoFn):
      from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

      # `namespace` is not read in the body -- presumably it is consumed by
      # the count_bytes decorator; confirm against load_test_metrics_utils.
      @count_bytes(COUNTER_LABEL)
      def process(self, element, namespace, is_returning):
        if is_returning:
          yield element

    if self.iterations is None:
      num_runs = 1
    else:
      num_runs = int(self.iterations)

    pc = (self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              synthetic_pipeline.SyntheticSource(
                  self.parseTestPipelineOptions()
              ))
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
         )

    for i in range(num_runs):
      is_returning = (i == (num_runs-1))
      pc = (pc
            | 'Step: %d' % i >> beam.ParDo(
                _GetElement(), self.metrics_namespace, is_returning)
           )

    if self.output is not None:
      pc = (pc
            | "Write" >> beam.io.WriteToText(self.output)
           )

    # pylint: disable=expression-not-assigned
    (pc
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    # Run explicitly rather than inside `with self.pipeline`: leaving a
    # pipeline's `with` block runs the pipeline as well, so combining both
    # would execute the whole load twice.
    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
示例#15
0
class ParDoTest(unittest.TestCase):
    """Load test that chains a configurable number of ``ParDo`` steps."""

    def parseTestPipelineOptions(self):
        """Map ``self.input_options`` onto ``SyntheticSource`` option keys.

        Returns:
            A dict with camelCase keys.  The bundle size distribution
            defaults to type ``'const'`` with parameter 0 and
            ``forceNumInitialBundles`` defaults to 0; the other values are
            ``None`` when the corresponding input option is missing.
        """
        options = self.input_options
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles':
                options.get('force_initial_num_bundles', 0)
        }

    def setUp(self):
        """Create the test pipeline and decide whether to publish metrics.

        Reads the ``output`` and ``number_of_counter_operations`` options and
        parses ``input_options`` as JSON.  When ``publish_to_big_query`` is
        set, ``self.metrics_monitor`` becomes a ``MetricsMonitor`` built from
        the ``project``, ``metrics_table`` and ``metrics_dataset`` options;
        otherwise it is ``None``.

        Raises:
            ValueError: if publishing is requested but any of the three
                metrics options is empty.
        """
        self.pipeline = TestPipeline()

        self.output = self.pipeline.get_option('output')
        self.iterations = self.pipeline.get_option(
            'number_of_counter_operations')
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))

        publish_metrics = self.pipeline.get_option('publish_to_big_query')
        metrics_project_id = self.pipeline.get_option('project')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')
        metrics_dataset = self.pipeline.get_option('metrics_dataset')

        # Hold either None or a real monitor, never the raw option value:
        # testParDo decides with `is not None` whether to send metrics.
        self.metrics_monitor = None
        if not publish_metrics:
            logging.info('Metrics will not be collected')
            return

        # Every option must be non-empty; in `a and b and c is not None` only
        # the last operand is compared with None.
        if not all(
                (metrics_project_id, self.metrics_namespace, metrics_dataset)):
            raise ValueError(
                'One or more of parameters for collecting metrics '
                'are empty.')

        self.metrics_monitor = MetricsMonitor(
            project_name=metrics_project_id,
            table=self.metrics_namespace,
            dataset=metrics_dataset,
        )

    def testParDo(self):
        """Run the synthetic input through the ``ParDo`` chain.

        The number of ``ParDo`` steps is ``number_of_counter_operations``
        (1 when the option is absent); only the last step emits elements.
        The output is written to text when the ``output`` option is set.
        After the run finishes the result is sent to the metrics monitor
        when one was configured in ``setUp``.
        """
        class _GetElement(beam.DoFn):
            from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

            # `namespace` is not read in the body -- presumably it is
            # consumed by the count_bytes decorator; confirm against
            # load_test_metrics_utils.
            @count_bytes
            def process(self, element, namespace, is_returning):
                if is_returning:
                    yield element

        if self.iterations is None:
            num_runs = 1
        else:
            num_runs = int(self.iterations)

        pc = (self.pipeline
              | 'Read synthetic' >> beam.io.Read(
                  synthetic_pipeline.SyntheticSource(
                      self.parseTestPipelineOptions()))
              | 'Measure time: Start' >> beam.ParDo(
                  MeasureTime(self.metrics_namespace)))

        for i in range(num_runs):
            is_returning = (i == (num_runs - 1))
            pc = (pc
                  | 'Step: %d' % i >> beam.ParDo(
                      _GetElement(), self.metrics_namespace, is_returning))

        if self.output is not None:
            pc = (pc | "Write" >> beam.io.WriteToText(self.output))

        # pylint: disable=expression-not-assigned
        (pc
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))

        result = self.pipeline.run()
        result.wait_until_finish()

        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
示例#16
0
class SideInputTest(unittest.TestCase):
  def _parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0
        )
    }

  def _getSideInput(self):
    side_input = self._parseTestPipelineOptions()
    side_input['numRecords'] = side_input['numRecords']
    side_input['keySizeBytes'] = side_input['keySizeBytes']
    side_input['valueSizeBytes'] = side_input['valueSizeBytes']
    return side_input

  def _getPerElementDelaySec(self):
    return self.syntheticStepOptions.get('per_element_delay_sec', 0)

  def _getPerBundleDelaySec(self):
    return self.syntheticStepOptions.get('per_bundle_delay_sec', 0)

  def _getOutputRecordsPerInputRecords(self):
    return self.syntheticStepOptions.get('output_records_per_input_records', 0)

  def setUp(self):
    self.pipeline = TestPipeline()
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    if self.iterations is None:
      self.iterations = 1
    self.iterations = int(self.iterations)

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    check = metrics_project_id and self.metrics_namespace and metrics_dataset \
            is not None
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  def testSideInput(self):
    def join_fn(element, side_input, iterations):
      list = []
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            list.append({key: element[1]+value})
      yield list

    with self.pipeline as p:
      main_input = (p
                    | "Read pcoll 1" >> beam.io.Read(
                        synthetic_pipeline.SyntheticSource(
                            self._parseTestPipelineOptions()))
                    | 'Measure time: Start pcoll 1' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace))
                   )

      side_input = (p
                    | "Read pcoll 2" >> beam.io.Read(
                        synthetic_pipeline.SyntheticSource(
                            self._getSideInput()))
                    | 'Measure time: Start pcoll 2' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace))
                   )
      # pylint: disable=expression-not-assigned
      (main_input
       | "Merge" >> beam.ParDo(
           join_fn,
           AsIter(side_input),
           self.iterations)
       | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()

      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)

  if __name__ == '__main__':
    logging.getLogger().setLevel(logging.DEBUG)
    unittest.main()