def setUp(self):
  """Create the test pipeline and, if fully configured, a metrics monitor.

  Reads the JSON-encoded ``input_options`` pipeline option into
  ``self.input_options`` and the ``project``, ``metrics_table`` and
  ``metrics_dataset`` options for metrics publishing.
  ``self.metrics_monitor`` stays ``None`` unless all three are non-empty.
  """
  self.pipeline = TestPipeline(is_integration_test=True)
  self.input_options = json.loads(self.pipeline.get_option('input_options'))

  metrics_project_id = self.pipeline.get_option('project')
  self.metrics_namespace = self.pipeline.get_option('metrics_table')
  metrics_dataset = self.pipeline.get_option('metrics_dataset')

  self.metrics_monitor = None
  # Every option has to be non-empty. The earlier expression
  # ``a and b and c is not None`` applied ``is not None`` to the dataset
  # only, so an empty dataset name was accepted.
  if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
    logging.error('One or more of parameters for collecting metrics '
                  'are empty. Metrics will not be collected')
    return

  schema = [{'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}]
  self.metrics_monitor = MetricsMonitor(
      project_name=metrics_project_id,
      table=self.metrics_namespace,
      dataset=metrics_dataset,
      schema_map=schema)
def setUp(self):
  """Create the test pipeline, parse both inputs and set up metrics.

  ``self.input_options`` and ``self.co_input_options`` are decoded from the
  JSON pipeline options of the same names. Metrics are published only when
  the ``publish_to_big_query`` option is truthy.

  Raises:
    ValueError: if publishing is requested but ``project``,
      ``metrics_table`` or ``metrics_dataset`` is empty.
  """
  self.pipeline = TestPipeline()
  self.input_options = json.loads(self.pipeline.get_option('input_options'))
  self.co_input_options = json.loads(
      self.pipeline.get_option('co_input_options'))

  publish = self.pipeline.get_option('publish_to_big_query')
  metrics_project_id = self.pipeline.get_option('project')
  self.metrics_namespace = self.pipeline.get_option('metrics_table')
  metrics_dataset = self.pipeline.get_option('metrics_dataset')

  # Always either None or a MetricsMonitor, never the raw option value, so
  # an ``is not None`` guard before ``send_metrics`` is reliable.
  self.metrics_monitor = None
  if not publish:
    logging.info('Metrics will not be collected')
    return

  # ``a and b and c is not None`` only compared the dataset to None and let
  # an empty dataset name through; require all three to be non-empty.
  if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
    raise ValueError(
        'One or more of parameters for collecting metrics '
        'are empty.')

  self.metrics_monitor = MetricsMonitor(
      project_name=metrics_project_id,
      table=self.metrics_namespace,
      dataset=metrics_dataset,
  )
def setUp(self):
  """Create the test pipeline and, if fully configured, a metrics monitor.

  Stores the ``output`` and ``number_of_counter_operations`` options as-is,
  decodes ``input_options`` from JSON and reads the ``project``,
  ``metrics_table`` and ``metrics_dataset`` options.
  ``self.metrics_monitor`` stays ``None`` unless all three are non-empty.
  """
  self.pipeline = TestPipeline(is_integration_test=True)
  self.output = self.pipeline.get_option('output')
  self.iterations = self.pipeline.get_option('number_of_counter_operations')
  self.input_options = json.loads(self.pipeline.get_option('input_options'))

  metrics_project_id = self.pipeline.get_option('project')
  self.metrics_namespace = self.pipeline.get_option('metrics_table')
  metrics_dataset = self.pipeline.get_option('metrics_dataset')

  self.metrics_monitor = None
  # The dataset used to be passed to MetricsMonitor without any validation
  # and an empty table name passed the ``is not None`` test; check all three.
  if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
    logging.error('One or more of parameters for collecting metrics '
                  'are empty. Metrics will not be collected')
    return

  measured_values = [
      {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
      {'name': COUNTER_LABEL, 'type': 'INTEGER', 'mode': 'REQUIRED'}
  ]
  self.metrics_monitor = MetricsMonitor(
      project_name=metrics_project_id,
      table=self.metrics_namespace,
      dataset=metrics_dataset,
      schema_map=measured_values
  )
class CombineTest(unittest.TestCase):
  """Load test running ``CombineGlobally`` over a synthetic source."""

  def parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.input_options``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline, parse ``input_options`` and set up metrics.

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  class _GetElement(beam.DoFn):
    """DoFn that re-emits every element unchanged."""

    def process(self, element):
      yield element

  def testCombineGlobally(self):
    """Build and run the combine pipeline, then publish metrics if enabled."""
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
def setUp(self):
  """Create the test pipeline and, if fully configured, a metrics monitor.

  ``self.inputOptions`` is decoded from the JSON ``input_options`` option.
  ``self.iterations`` is ``number_of_counter_operations`` as an int
  (1 when the option is unset). ``self.metrics_namespace`` falls back to the
  test class name when ``metrics_table`` is empty.
  ``self.metrics_monitor`` stays ``None`` unless project, namespace and
  dataset are all non-empty.
  """
  self.pipeline = TestPipeline()
  self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

  iterations = self.pipeline.get_option('number_of_counter_operations')
  self.iterations = 1 if iterations is None else int(iterations)

  metrics_project_id = self.pipeline.get_option('project')
  self.metrics_namespace = (
      self.pipeline.get_option('metrics_table') or self.__class__.__name__)
  metrics_dataset = self.pipeline.get_option('metrics_dataset')

  self.metrics_monitor = None
  # ``a and b and c is not None`` tested only the dataset against None, so
  # an empty dataset name was accepted; require non-empty values throughout.
  if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
    logging.error('One or more of parameters for collecting metrics '
                  'are empty. Metrics will not be collected')
    return

  measured_values = [
      {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
  ]
  self.metrics_monitor = MetricsMonitor(
      project_name=metrics_project_id,
      table=self.metrics_namespace,
      dataset=metrics_dataset,
      schema_map=measured_values)
class GroupByKeyTest(unittest.TestCase):
  """Load test running ``GroupByKey`` over a synthetic source."""

  def parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.input_options``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline and, if fully configured, a metrics monitor.

    ``self.metrics_monitor`` stays ``None`` unless the ``project``,
    ``metrics_table`` and ``metrics_dataset`` options are all non-empty.
    """
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    self.metrics_monitor = None
    # ``a and b and c is not None`` tested only the dataset against None, so
    # an empty dataset name was accepted; require non-empty values throughout.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    schema = [{'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=schema
    )

  def testGroupByKey(self):
    """Build and run the GroupByKey pipeline, then publish metrics if set."""
    # The pipeline is deliberately not used as a context manager: leaving a
    # ``with`` block runs the pipeline, so combining it with an explicit
    # ``run()`` executed the whole load test twice.
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Ungroup' >> beam.FlatMap(
         lambda elm: [(elm[0], v) for v in elm[1]])
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
def setUp(self):
  """Create the test pipeline, parse both inputs and set up metrics.

  ``self.input_options`` and ``self.co_input_options`` are decoded from the
  JSON pipeline options of the same names. ``self.metrics_monitor`` stays
  ``None`` unless the ``project``, ``metrics_table`` and ``metrics_dataset``
  options are all non-empty.
  """
  self.pipeline = TestPipeline(is_integration_test=True)
  self.input_options = json.loads(self.pipeline.get_option('input_options'))
  self.co_input_options = json.loads(
      self.pipeline.get_option('co_input_options'))

  metrics_project_id = self.pipeline.get_option('project')
  self.metrics_namespace = self.pipeline.get_option('metrics_table')
  metrics_dataset = self.pipeline.get_option('metrics_dataset')

  self.metrics_monitor = None
  # ``a and b and c is not None`` tested only the dataset against None, so
  # an empty dataset name was accepted; require non-empty values throughout.
  if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
    logging.error('One or more of parameters for collecting metrics '
                  'are empty. Metrics will not be collected')
    return

  measured_values = [
      {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}
  ]
  self.metrics_monitor = MetricsMonitor(
      project_name=metrics_project_id,
      table=self.metrics_namespace,
      dataset=metrics_dataset,
      schema_map=measured_values
  )
def setUp(self):
  """Create the test pipeline, read its options and set up metrics.

  Stores the ``output`` and ``number_of_counter_operations`` options as-is
  and decodes ``input_options`` from JSON. Metrics are published only when
  the ``publish_to_big_query`` option is truthy.

  Raises:
    ValueError: if publishing is requested but ``project``,
      ``metrics_table`` or ``metrics_dataset`` is empty.
  """
  self.pipeline = TestPipeline()
  self.output = self.pipeline.get_option('output')
  self.iterations = self.pipeline.get_option('number_of_counter_operations')
  self.input_options = json.loads(self.pipeline.get_option('input_options'))

  publish = self.pipeline.get_option('publish_to_big_query')
  metrics_project_id = self.pipeline.get_option('project')
  self.metrics_namespace = self.pipeline.get_option('metrics_table')
  metrics_dataset = self.pipeline.get_option('metrics_dataset')

  # Always either None or a MetricsMonitor, never the raw option value, so
  # an ``is not None`` guard before ``send_metrics`` is reliable.
  self.metrics_monitor = None
  if not publish:
    logging.info('Metrics will not be collected')
    return

  # ``a and b and c is not None`` only compared the dataset to None and let
  # an empty dataset name through; require all three to be non-empty.
  if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
    raise ValueError('One or more of parameters for collecting metrics '
                     'are empty.')

  self.metrics_monitor = MetricsMonitor(
      project_name=metrics_project_id,
      table=self.metrics_namespace,
      dataset=metrics_dataset,
  )
class GroupByKeyTest(unittest.TestCase):
  """Load test running ``GroupByKey`` over a synthetic source."""

  def parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.input_options``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline, parse ``input_options`` and set up metrics.

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError(
          'One or more of parameters for collecting metrics '
          'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  def testGroupByKey(self):
    """Build and run the GroupByKey pipeline, then publish metrics if set."""
    # The pipeline is deliberately not used as a context manager: leaving a
    # ``with`` block runs the pipeline, so combining it with an explicit
    # ``run()`` executed the whole load test twice.
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(
         synthetic_pipeline.SyntheticSource(self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]])
     | 'Measure time: End' >> beam.ParDo(
         MeasureTime(self.metrics_namespace)))

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
class CoGroupByKeyTest(unittest.TestCase):
  """Load test running ``CoGroupByKey`` over two synthetic sources."""

  def parseTestPipelineOptions(self, options):
    """Return the ``SyntheticSource`` options dict built from ``options``.

    Args:
      options: dict decoded from one of the JSON input options.

    Returns:
      Dict with the record count, key/value sizes, bundle size distribution
      (defaults ``'const'`` / ``0``) and forced initial bundle count
      (default ``0``).
    """
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline, parse both inputs and set up metrics.

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.co_input_options = json.loads(
        self.pipeline.get_option('co_input_options'))

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError(
          'One or more of parameters for collecting metrics '
          'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  class _Ungroup(beam.DoFn):
    """DoFn emitting every joined value, main input first, then co-input."""

    def process(self, element):
      values = element[1]
      inputs = values.get(INPUT_TAG)
      co_inputs = values.get(CO_INPUT_TAG)
      for i in inputs:
        yield i
      for i in co_inputs:
        yield i

  def testCoGroupByKey(self):
    """Build and run the CoGroupByKey pipeline, then publish metrics if set."""
    pc1 = (self.pipeline
           | 'Read ' + INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.input_options)))
           | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
           | 'Measure time: Start pc1' >> beam.ParDo(
               MeasureTime(self.metrics_namespace)))

    pc2 = (self.pipeline
           | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.co_input_options)))
           | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
               lambda x: (x, x))
           | 'Measure time: Start pc2' >> beam.ParDo(
               MeasureTime(self.metrics_namespace)))

    # pylint: disable=expression-not-assigned
    ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
     | 'CoGroupByKey: ' >> beam.CoGroupByKey()
     | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
     | 'Measure time: End' >> beam.ParDo(
         MeasureTime(self.metrics_namespace)))

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
class CoGroupByKeyTest(unittest.TestCase):
  """Load test running ``CoGroupByKey`` over two synthetic sources."""

  def parseTestPipelineOptions(self, options):
    """Return the ``SyntheticSource`` options dict built from ``options``.

    Args:
      options: dict decoded from one of the JSON input options.

    Returns:
      Dict with the record count, key/value sizes, bundle size distribution
      (defaults ``'const'`` / ``0``) and forced initial bundle count
      (default ``0``).
    """
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline, parse both inputs and set up metrics.

    ``self.metrics_monitor`` stays ``None`` unless the ``project``,
    ``metrics_table`` and ``metrics_dataset`` options are all non-empty.
    """
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.co_input_options = json.loads(
        self.pipeline.get_option('co_input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    self.metrics_monitor = None
    # ``a and b and c is not None`` tested only the dataset against None, so
    # an empty dataset name was accepted; require non-empty values throughout.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    measured_values = [
        {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}
    ]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=measured_values
    )

  class _Ungroup(beam.DoFn):
    """DoFn emitting every joined value, main input first, then co-input."""

    def process(self, element):
      values = element[1]
      inputs = values.get(INPUT_TAG)
      co_inputs = values.get(CO_INPUT_TAG)
      for i in inputs:
        yield i
      for i in co_inputs:
        yield i

  def testCoGroupByKey(self):
    """Build and run the CoGroupByKey pipeline, then publish metrics if set."""
    # The pipeline is deliberately not used as a context manager: leaving a
    # ``with`` block runs the pipeline, so combining it with an explicit
    # ``run()`` executed the whole load test twice.
    pc1 = (self.pipeline
           | 'Read ' + INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.input_options)))
           | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
           | 'Measure time: Start pc1' >> beam.ParDo(
               MeasureTime(self.metrics_namespace))
          )

    pc2 = (self.pipeline
           | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.co_input_options)))
           | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
               lambda x: (x, x))
           | 'Measure time: Start pc2' >> beam.ParDo(
               MeasureTime(self.metrics_namespace))
          )

    # pylint: disable=expression-not-assigned
    ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
     | 'CoGroupByKey: ' >> beam.CoGroupByKey()
     | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
class SideInputTest(unittest.TestCase):
  """Load test joining a synthetic main input with an iterable side input."""

  def _parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.inputOptions``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.inputOptions
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def _getSideInput(self):
    """Return the source options for the side input.

    The side input uses exactly the same options as the main input; the
    former self-assignments of the size keys were no-ops and are gone.
    """
    return self._parseTestPipelineOptions()

  # NOTE(review): the three getters below read ``self.syntheticStepOptions``,
  # which ``setUp`` in this class never assigns -- confirm whether they are
  # still used anywhere before relying on them.
  def _getPerElementDelaySec(self):
    """Return ``per_element_delay_sec`` from the step options (default 0)."""
    return self.syntheticStepOptions.get('per_element_delay_sec', 0)

  def _getPerBundleDelaySec(self):
    """Return ``per_bundle_delay_sec`` from the step options (default 0)."""
    return self.syntheticStepOptions.get('per_bundle_delay_sec', 0)

  def _getOutputRecordsPerInputRecords(self):
    """Return ``output_records_per_input_records`` (default 0)."""
    return self.syntheticStepOptions.get('output_records_per_input_records', 0)

  def setUp(self):
    """Create the pipeline, read its options and set up metrics.

    ``self.iterations`` is ``number_of_counter_operations`` as an int
    (1 when the option is unset).

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

    iterations = self.pipeline.get_option('number_of_counter_operations')
    self.iterations = 1 if iterations is None else int(iterations)

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  def testSideInput(self):
    """Build and run the side-input join, then publish metrics if enabled."""

    def join_fn(element, side_input, iterations):
      # The side input is walked ``iterations`` times, but results are only
      # collected on the final pass.
      joined = []  # was named ``list``, which shadowed the builtin
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            joined.append({key: element[1] + value})
      yield joined

    main_input = (self.pipeline
                  | "Read pcoll 1" >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self._parseTestPipelineOptions()))
                  | 'Measure time: Start pcoll 1' >> beam.ParDo(
                      MeasureTime(self.metrics_namespace))
                 )

    side_input = (self.pipeline
                  | "Read pcoll 2" >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self._getSideInput()))
                  | 'Measure time: Start pcoll 2' >> beam.ParDo(
                      MeasureTime(self.metrics_namespace))
                 )

    # pylint: disable=expression-not-assigned
    (main_input
     | "Merge" >> beam.ParDo(
         join_fn,
         AsIter(side_input),
         self.iterations)
     | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()
class CombineTest(unittest.TestCase):
  """Load test running ``CombineGlobally`` over a synthetic source."""

  def parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.input_options``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline, parse ``input_options`` and set up metrics.

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  class _GetElement(beam.DoFn):
    """DoFn that re-emits every element unchanged."""

    def process(self, element):
      yield element

  def testCombineGlobally(self):
    """Build and run the combine pipeline, then publish metrics if enabled."""
    # The pipeline is deliberately not used as a context manager: leaving a
    # ``with`` block runs the pipeline, so combining it with an explicit
    # ``run()`` executed the whole load test twice.
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
class ParDoTest(unittest.TestCase):
  """Load test chaining a configurable number of ``ParDo`` steps."""

  def parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.input_options``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline and, if fully configured, a metrics monitor.

    Stores the ``output`` and ``number_of_counter_operations`` options as-is
    and decodes ``input_options`` from JSON. ``self.metrics_monitor`` stays
    ``None`` unless the ``project``, ``metrics_table`` and
    ``metrics_dataset`` options are all non-empty.
    """
    self.pipeline = TestPipeline(is_integration_test=True)
    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    self.metrics_monitor = None
    # The dataset used to be passed to MetricsMonitor without any validation
    # and an empty table name passed the ``is not None`` test; check all
    # three values.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')
      return

    measured_values = [
        {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
        {'name': COUNTER_LABEL, 'type': 'INTEGER', 'mode': 'REQUIRED'}
    ]
    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
        schema_map=measured_values
    )

  def testParDo(self):
    """Build and run the ParDo chain, then publish metrics if enabled."""

    class _GetElement(beam.DoFn):
      """DoFn that re-emits its element only when ``is_returning`` is set."""
      from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

      @count_bytes(COUNTER_LABEL)
      def process(self, element, namespace, is_returning):
        if is_returning:
          yield element

    num_runs = 1 if self.iterations is None else int(self.iterations)

    # The pipeline is deliberately not used as a context manager: leaving a
    # ``with`` block runs the pipeline, so combining it with an explicit
    # ``run()`` executed the whole load test twice.
    pc = (self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              synthetic_pipeline.SyntheticSource(
                  self.parseTestPipelineOptions()
              ))
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
         )

    for i in range(num_runs):
      # Only the last step in the chain forwards its elements.
      is_returning = (i == (num_runs - 1))
      pc = (pc
            | 'Step: %d' % i >> beam.ParDo(
                _GetElement(), self.metrics_namespace, is_returning)
           )

    if self.output is not None:
      pc = (pc
            | "Write" >> beam.io.WriteToText(self.output)
           )

    # pylint: disable=expression-not-assigned
    (pc
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
class ParDoTest(unittest.TestCase):
  """Load test chaining a configurable number of ``ParDo`` steps."""

  def parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.input_options``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    """Create the pipeline, read its options and set up metrics.

    Stores the ``output`` and ``number_of_counter_operations`` options as-is
    and decodes ``input_options`` from JSON.

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError(
          'One or more of parameters for collecting metrics '
          'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  def testParDo(self):
    """Build and run the ParDo chain, then publish metrics if enabled."""

    class _GetElement(beam.DoFn):
      """DoFn that re-emits its element only when ``is_returning`` is set."""
      from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

      @count_bytes
      def process(self, element, namespace, is_returning):
        if is_returning:
          yield element

    num_runs = 1 if self.iterations is None else int(self.iterations)

    pc = (self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              synthetic_pipeline.SyntheticSource(
                  self.parseTestPipelineOptions()))
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace)))

    for i in range(num_runs):
      # Only the last step in the chain forwards its elements.
      is_returning = (i == (num_runs - 1))
      pc = (pc
            | 'Step: %d' % i >> beam.ParDo(
                _GetElement(), self.metrics_namespace, is_returning))

    if self.output is not None:
      pc = (pc
            | "Write" >> beam.io.WriteToText(self.output))

    # pylint: disable=expression-not-assigned
    (pc
     | 'Measure time: End' >> beam.ParDo(
         MeasureTime(self.metrics_namespace)))

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
class SideInputTest(unittest.TestCase):
  """Load test joining a synthetic main input with an iterable side input."""

  def _parseTestPipelineOptions(self):
    """Return the options dict handed to ``SyntheticSource``.

    Values come from ``self.inputOptions``; distribution type, distribution
    param and initial bundle count default to ``'const'``, ``0`` and ``0``.
    """
    options = self.inputOptions
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def _getSideInput(self):
    """Return the source options for the side input.

    The side input uses exactly the same options as the main input; the
    former self-assignments of the size keys were no-ops and are gone.
    """
    return self._parseTestPipelineOptions()

  # NOTE(review): the three getters below read ``self.syntheticStepOptions``,
  # which ``setUp`` in this class never assigns -- confirm whether they are
  # still used anywhere before relying on them.
  def _getPerElementDelaySec(self):
    """Return ``per_element_delay_sec`` from the step options (default 0)."""
    return self.syntheticStepOptions.get('per_element_delay_sec', 0)

  def _getPerBundleDelaySec(self):
    """Return ``per_bundle_delay_sec`` from the step options (default 0)."""
    return self.syntheticStepOptions.get('per_bundle_delay_sec', 0)

  def _getOutputRecordsPerInputRecords(self):
    """Return ``output_records_per_input_records`` (default 0)."""
    return self.syntheticStepOptions.get('output_records_per_input_records', 0)

  def setUp(self):
    """Create the pipeline, read its options and set up metrics.

    ``self.iterations`` is ``number_of_counter_operations`` as an int
    (1 when the option is unset).

    Raises:
      ValueError: if ``publish_to_big_query`` is truthy but ``project``,
        ``metrics_table`` or ``metrics_dataset`` is empty.
    """
    self.pipeline = TestPipeline()
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

    iterations = self.pipeline.get_option('number_of_counter_operations')
    self.iterations = 1 if iterations is None else int(iterations)

    publish = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    # Keep this attribute None-or-monitor so the ``is not None`` guard in the
    # test never calls ``send_metrics`` on a raw option value.
    self.metrics_monitor = None
    if not publish:
      logging.info('Metrics will not be collected')
      return

    # ``a and b and c is not None`` checked only the dataset against None;
    # an empty dataset name must be rejected as well.
    if not all((metrics_project_id, self.metrics_namespace, metrics_dataset)):
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

    self.metrics_monitor = MetricsMonitor(
        project_name=metrics_project_id,
        table=self.metrics_namespace,
        dataset=metrics_dataset,
    )

  def testSideInput(self):
    """Build and run the side-input join, then publish metrics if enabled."""

    def join_fn(element, side_input, iterations):
      # The side input is walked ``iterations`` times, but results are only
      # collected on the final pass.
      joined = []  # was named ``list``, which shadowed the builtin
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            joined.append({key: element[1] + value})
      yield joined

    # The pipeline is deliberately not used as a context manager: leaving a
    # ``with`` block runs the pipeline, so combining it with an explicit
    # ``run()`` executed the whole load test twice.
    main_input = (self.pipeline
                  | "Read pcoll 1" >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self._parseTestPipelineOptions()))
                  | 'Measure time: Start pcoll 1' >> beam.ParDo(
                      MeasureTime(self.metrics_namespace))
                 )

    side_input = (self.pipeline
                  | "Read pcoll 2" >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self._getSideInput()))
                  | 'Measure time: Start pcoll 2' >> beam.ParDo(
                      MeasureTime(self.metrics_namespace))
                 )

    # pylint: disable=expression-not-assigned
    (main_input
     | "Merge" >> beam.ParDo(
         join_fn,
         AsIter(side_input),
         self.iterations)
     | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()