def test_read_messages_timestamp_attribute_rfc3339_success(
    self, mock_pubsub):
  payload = 'payload'
  message_id = 'message_id'
  attributes = {'time': '2018-03-12T13:37:01.234567Z'}
  publish_time = '2018-03-12T13:37:01.234567Z'
  data = [
      create_client_message(payload, message_id, attributes, publish_time)
  ]
  expected_data = [
      TestWindowedValue(
          PubsubMessage(payload, attributes),
          timestamp.Timestamp.from_rfc3339(attributes['time']),
          [window.GlobalWindow()]),
  ]

  mock_pubsub.Client = functools.partial(FakePubsubClient, data)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, 'a_label',
               with_attributes=True, timestamp_attribute='time'))
  assert_that(pcoll, equal_to(expected_data), reify_windows=True)
  p.run()
def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
  l = [i for i in range(self._SIZE)]

  matchers = [
      PipelineStateMatcher(PipelineState.RUNNING),
      BigqueryFullResultStreamingMatcher(
          project=self.project,
          query="SELECT number FROM %s" % self.output_table,
          data=[(i, ) for i in l])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*matchers),
      wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
      experiments='use_beam_bq_sink',
      streaming=True)

  def add_schema_info(element):
    yield {'number': element}

  messages = [str(i).encode('utf-8') for i in l]
  for message in messages:
    self.pub_client.publish(self.input_topic.name, message)

  with beam.Pipeline(argv=args) as p:
    messages = (p
                | ReadFromPubSub(subscription=self.input_sub.name)
                | beam.ParDo(add_schema_info))
    _ = messages | WriteToBigQuery(
        self.output_table,
        schema=self.SCHEMA,
        method=method,
        triggering_frequency=triggering_frequency)
def test_expand_with_subscription(self):
  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub(
               None, 'projects/fakeprj/subscriptions/a_subscription',
               'a_label', with_attributes=False, timestamp_attribute=None)
           | beam.Map(lambda x: x))
  self.assertEqual(bytes, pcoll.element_type)

  # Apply the necessary PTransformOverrides.
  overrides = _get_transform_overrides(p.options)
  p.replace_all(overrides)

  # Note that the direct output of ReadFromPubSub will be replaced
  # by a PTransformOverride, so we use a no-op Map.
  read_transform = pcoll.producer.inputs[0].producer.transform

  # Ensure that the properties passed through correctly
  source = read_transform._source
  self.assertEqual('a_subscription', source.subscription_name)
  self.assertEqual('a_label', source.id_label)
def expand(self, input):
  # [START EXERCISE 3]:
  # Docs: https://beam.apache.org/documentation/sdks/pydoc/2.5.0/apache_beam.io.gcp.pubsub.html
  # Determine whether to use files or topic based on options.
  if self.args.input is not None and self.args.input != "":
    return (
        input
        # Read game events from files. See exercise2.
        # Don't forget to parse events or to include the TimestampedValue
        # transform to assign timestamps to events.
        | beam.io.ReadFromText(self.args.input)
        | ParDo(ParseEventFn())
        | beam.Map(lambda element: TimestampedValue(
            element, element[self.TIMESTAMP_ATTRIBUTE])))
  else:
    return (
        input
        # Read game events from the Pub/Sub topic self.args.topic using
        # custom timestamps, which are extracted from the Pub/Sub attribute
        # TIMESTAMP_ATTRIBUTE.
        # Use ReadFromPubSub() with the topic and timestamp_attribute
        # parameters.
        # https://beam.apache.org/documentation/sdks/python-streaming/
        | ReadFromPubSub(self.args.topic,
                         timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)
        # Parse the messages the same way as when they come from the text
        # file. Note that we no longer have to run the WithTimestamps
        # transform, as the timestamps are already set by ReadFromPubSub.
        | ParDo(ParseEventFn()))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output', dest='output_file',
                        help='Output file where to write')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=<your-project>'])

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
def test_read_messages_timestamp_attribute_fail_parse(self, mock_pubsub):
  data = b'data'
  attributes = {'time': '1337 unparseable'}
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  mock_pubsub.return_value.pull.return_value = pull_response

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  _ = (p
       | ReadFromPubSub(
           'projects/fakeprj/topics/a_topic', None, None,
           with_attributes=True, timestamp_attribute='time'))
  with self.assertRaisesRegex(ValueError, r'parse'):
    p.run()
  mock_pubsub.return_value.acknowledge.assert_not_called()

  mock_pubsub.return_value.api.transport.channel.close.assert_has_calls(
      [mock.call()])
def test_read_messages_timestamp_attribute_milli_success(self, mock_pubsub):
  data = b'data'
  attributes = {'time': '1337'}
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp(micros=int(attributes['time']) * 1000),
          [window.GlobalWindow()]),
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    pcoll = (p
             | ReadFromPubSub(
                 'projects/fakeprj/topics/a_topic', None, None,
                 with_attributes=True, timestamp_attribute='time'))
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)

  mock_pubsub.return_value.acknowledge.assert_has_calls(
      [mock.call(subscription=mock.ANY, ack_ids=[ack_id])])

  mock_pubsub.return_value.close.assert_has_calls([mock.call()])
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the streaming wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input Pub/Sub subscription to read from.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output BigQuery table to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read messages from the Pub/Sub subscription into a PCollection.
    (p
     | 'read' >> ReadFromPubSub(subscription=known_args.input)
     | 'extract words' >> beam.FlatMap(extract_words)
     | 'transform to kv' >> beam.Map(lambda x: (x, 1))
     | 'window per minute' >> beam.WindowInto(
         window.FixedWindows(5),
         trigger=trigger.AfterProcessingTime(delay=10),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group by words' >> beam.GroupByKey()
     | 'count ones' >> beam.Map(count_ones)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

    result = p.run()
    result.wait_until_finish()
def test_expand(self):
  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  pcoll = (p
           | ReadFromPubSub('projects/fakeprj/topics/baz')
           | WriteToPubSub(
               'projects/fakeprj/topics/a_topic', with_attributes=True)
           | beam.Map(lambda x: x))

  # Apply the necessary PTransformOverrides.
  overrides = _get_transform_overrides(options)
  p.replace_all(overrides)

  # Note that the direct output of ReadFromPubSub will be replaced
  # by a PTransformOverride, so we use a no-op Map.
  write_transform = pcoll.producer.inputs[0].producer.transform

  # Ensure that the properties passed through correctly
  self.assertEqual('a_topic', write_transform.dofn.short_topic_name)
  self.assertEqual(True, write_transform.dofn.with_attributes)
  # TODO(BEAM-4275): These properties aren't supported yet in direct runner.
  self.assertEqual(None, write_transform.dofn.id_label)
  self.assertEqual(None, write_transform.dofn.timestamp_attribute)
def run(run_local):
    JOB_NAME = 'firestore-stream-{}'.format(
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    pipeline_options = {
        'project': PROJECT,
        'staging_location': 'gs://' + BUCKET + '/staging',
        'runner': 'DataflowRunner',
        'job_name': JOB_NAME,
        'disk_size_gb': 100,
        'temp_location': 'gs://' + BUCKET + '/temp',
        'save_main_session': True,
        'requirements_file': 'requirements.txt',
        'streaming': True
    }

    if run_local:
        pipeline_options['runner'] = 'DirectRunner'

    options = PipelineOptions.from_dictionary(pipeline_options)
    p = beam.Pipeline(options=options)

    crawled_features = (p
                        | 'ReadPubsub' >> ReadFromPubSub(
                            topic=PUBSUB_TOPIC).with_output_types(bytes)
                        | 'JSONParse' >> beam.Map(lambda x: json.loads(x)))

    firebase_stream = (crawled_features
                       | 'WriteFirebase' >> beam.ParDo(FirestoreWriteDoFn()))

    p.run()
def test_expand_with_other_options(self):
  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, 'a_label',
               with_attributes=True, timestamp_attribute='time')
           | beam.Map(lambda x: x))
  self.assertEqual(PubsubMessage, pcoll.element_type)

  # Apply the necessary PTransformOverrides.
  overrides = _get_transform_overrides(options)
  p.replace_all(overrides)

  # Note that the direct output of ReadFromPubSub will be replaced
  # by a PTransformOverride, so we use a no-op Map.
  read_transform = pcoll.producer.inputs[0].producer.transform

  # Ensure that the properties passed through correctly
  source = read_transform._source
  self.assertTrue(source.with_attributes)
  self.assertEqual('time', source.timestamp_attribute)
def main():
    # bq_source = BigQuerySource(query="""
    #     SELECT created_at, text
    #     FROM got_sentiment.got_tweets
    #     """,
    #     validate=False, coder=None,
    #     use_standard_sql=True, flatten_results=True,
    #     kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #   with_attributes=False,
    #   timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p
            | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery(
                'streaming_scores',
                dataset=BQ_DATASET,
                project=PROJECT_ID,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_APPEND',
                batch_size=20))
def test_read_messages_success(self, mock_pubsub):
  data = b'data'
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  attributes = {'key': 'value'}
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp(1520861821.234567), [window.GlobalWindow()])
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, None,
               with_attributes=True))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls(
      [mock.call(mock.ANY, [ack_id])])
def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
  data = b'data'
  attributes = {}
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  publish_time = '2018-03-12T13:37:01.234567Z'
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp.from_rfc3339(publish_time),
          [window.GlobalWindow()]),
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    pcoll = (p
             | ReadFromPubSub(
                 'projects/fakeprj/topics/a_topic', None, None,
                 with_attributes=True, timestamp_attribute='nonexistent'))
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)

  mock_pubsub.return_value.acknowledge.assert_has_calls(
      [mock.call(mock.ANY, [ack_id])])

  mock_pubsub.return_value.api.transport.channel.close.assert_has_calls(
      [mock.call()])
def test_read_messages_timestamp_attribute_rfc3339_success(
    self, mock_pubsub):
  data = b'data'
  attributes = {'time': '2018-03-12T13:37:01.234567Z'}
  publish_time_secs = 1337000000
  publish_time_nanos = 133700000
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp.from_rfc3339(attributes['time']),
          [window.GlobalWindow()]),
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, None,
               with_attributes=True, timestamp_attribute='time'))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls(
      [mock.call(mock.ANY, [ack_id])])
def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
  data = 'data'
  message_id = 'message_id'
  attributes = {}
  publish_time = '2018-03-12T13:37:01.234567Z'
  payloads = [
      create_client_message(data, message_id, attributes, publish_time)
  ]
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp.from_rfc3339(publish_time),
          [window.GlobalWindow()]),
  ]

  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, None,
               with_attributes=True, timestamp_attribute='nonexistent'))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--subscription",
        dest="subscription",
        required=True,
        help='Input PubSub subscription of the form '
        '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".',
    )
    parser.add_argument(
        "--bigquery_table",
        dest="bigquery_table",
        required=True,
        help="The fully-qualified BigQuery table to which to write.",
    )
    parser.add_argument(
        "--bigquery_table_for_failed_rows",
        dest="bigquery_table_for_failed_rows",
        required=True,
        help="The fully-qualified BigQuery table to which to write failed inserts.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # yapf: disable
    messages = (
        p
        | "ReadFromPubSub" >> ReadFromPubSub(
            subscription=known_args.subscription).with_output_types(bytes)
        | "ParseAndValidateMessages" >> beam.ParDo(ValidateMessages())
        .with_outputs(ValidateMessages.OUTPUT_TAG, main="valid_messages"))

    valid_messages = messages["valid_messages"]
    invalid_messages = messages[ValidateMessages.OUTPUT_TAG]

    (invalid_messages
     | "InvalidMessages:TupleToDict" >> beam.Map(tuple_to_dict)
     | "InvalidMessages:WriteToBigQuery" >> WriteRowsToBigQuery(
         table_name=known_args.bigquery_table_for_failed_rows))

    failed_rows = (
        valid_messages
        | "ValidMessages:WriteToBigQuery" >> WriteRowsToBigQuery(
            table_name=known_args.bigquery_table))

    failed_rows_pcoll = failed_rows["FailedRows"]

    (failed_rows_pcoll
     | "FailedInserts:TupleToDict" >> beam.Map(tuple_to_dict)
     | "FailedInserts:WriteToBigQuery" >> WriteRowsToBigQuery(
         table_name=known_args.bigquery_table_for_failed_rows))
    # yapf: enable

    result = p.run()
    result.wait_until_finish()
def test_expand_with_both_topic_and_subscription(self):
  with self.assertRaisesRegexp(
      ValueError, "Only one of topic or subscription should be provided."):
    ReadFromPubSub('a_topic', 'a_subscription', 'a_label',
                   timestamp_attribute=None)
def test_expand_with_no_topic_or_subscription(self):
  with self.assertRaisesRegex(
      ValueError, "Either a topic or subscription must be provided."):
    ReadFromPubSub(None, None, 'a_label', with_attributes=False,
                   timestamp_attribute=None)
def test_read_message_id_label_unsupported(self, unused_mock_pubsub):
  # id_label is unsupported in DirectRunner.
  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  with self.assertRaisesRegex(NotImplementedError,
                              r'id_label is not supported'):
    with TestPipeline(options=options) as p:
      _ = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, 'a_label'))
def run(argv=None):
  class MessageParser(beam.DoFn):
    # It is required to parse messages for the GBK operation.
    # Otherwise there are encoding problems.
    def process(self, item):
      if item.attributes:
        k, v = item.attributes.popitem()
        yield (str(k), str(v))

  class ParserToBytes(beam.DoFn):
    # Parsing to bytes is required for saving in PubSub.
    def process(self, item):
      _, v = item
      yield bytes(v, encoding='utf8')

  parser = argparse.ArgumentParser()
  parser.add_argument('--output_topic',
                      required=True,
                      help=('Output PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  parser.add_argument('--metrics_namespace',
                      help=('Namespace of metrics '
                            '"string".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True

  p = beam.Pipeline(options=pipeline_options)

  # pylint: disable=expression-not-assigned
  (p
   | ReadFromPubSub(subscription=known_args.input_subscription,
                    with_attributes=True)
   | 'Window' >> beam.WindowInto(window.FixedWindows(1000, 0))
   | 'Measure time: Start' >> beam.ParDo(
       MeasureTime(known_args.metrics_namespace))
   | 'Count messages' >> beam.ParDo(
       CountMessages(known_args.metrics_namespace))
   | 'Parse' >> beam.ParDo(MessageParser())
   | 'GroupByKey' >> beam.GroupByKey()
   | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]])
   | 'Measure time: End' >> beam.ParDo(
       MeasureTime(known_args.metrics_namespace))
   | 'Parse to bytes' >> beam.ParDo(ParserToBytes())
   | 'Write' >> beam.io.WriteToPubSub(topic=known_args.output_topic))

  result = p.run()
  result.wait_until_finish()
  logging.error(result)
  return result
def run():
    with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:
        pc = (p
              | ReadFromPubSub(topic=get_topic_path())
              | beam.WindowInto(
                  window.FixedWindows(WINDOW_SIZE),
                  accumulation_mode=trigger.AccumulationMode.DISCARDING)
              | 'AddWindowInfo' >> beam.ParDo(add_window_info)
              | beam.CombinePerKey(sum)
              | beam.ParDo(prepare_element)
              | 'Print' >> beam.ParDo(print_fn)
              | WriteToBigQuery(BIGQUERY_TABLE_ID))
def test_read_data_success(self, mock_pubsub):
  data_encoded = u'🤷 ¯\\_(ツ)_/¯'.encode('utf-8')
  publish_time = '2018-03-12T13:37:01.234567Z'
  payloads = [create_client_message(data_encoded, None, None, publish_time)]
  expected_elements = [data_encoded]

  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub('projects/fakeprj/topics/a_topic', None, None))
  assert_that(pcoll, equal_to(expected_elements))
  p.run()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input topic to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output log name to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    isScreen = lambda event, name: event['jsonPayload']['name'] == name
    isEvent = lambda event, eventName: event['jsonPayload']['event'] == eventName

    syncStart = lambda event: isScreen(event, 'Sync')
    syncFinish = lambda event: (
        event['jsonPayload']['type'] == 'screen' and
        type(event['jsonPayload']['properties']) == dict and
        event['jsonPayload']['properties']['previousScreen'] == 'Sync' and
        event['jsonPayload']['properties']['currentScreen'] != 'Sync')

    loadStart = lambda event: isEvent(event, 'APP_LOADED')
    loadFinish = lambda event: isEvent(event, 'COMPONENT_MOUNT')

    transactionStart = lambda event: isScreen(event, 'Send')
    transactionFinish = lambda event: (isEvent(event, 'send_invite') or
                                       isEvent(event, 'send_dollar_confirm'))

    with beam.Pipeline(options=pipeline_options) as p:
        events = (p
                  | ReadFromPubSub(known_args.input, with_attributes=True)
                  | ParsePubSubJson())

        time_to_sync_measurements = events | TimeBetween(
            'time_to_sync', 2 * 60, syncStart, syncFinish)
        time_to_load = events | TimeBetween('time_to_load', 60, loadStart,
                                            loadFinish)
        time_to_send_transaction = events | TimeBetween(
            'time_to_send_transaction', 5 * 60, transactionStart,
            transactionFinish)

        ((time_to_sync_measurements, time_to_load, time_to_send_transaction)
         | beam.Flatten()
         | WriteToStackdriverLogging(known_args.output))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input topic to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output log name to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | ReadFromPubSub(known_args.input, with_attributes=True)
         | ParsePubSubJson()
         | WriteToStackdriverLogging(known_args.output))
def test_read_data_success(self, mock_pubsub):
  data_encoded = u'🤷 ¯\\_(ツ)_/¯'.encode('utf-8')
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response(
      [test_utils.PullResponseMessage(data_encoded, ack_id=ack_id)])
  expected_elements = [data_encoded]
  mock_pubsub.return_value.pull.return_value = pull_response

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub('projects/fakeprj/topics/a_topic', None, None))
  assert_that(pcoll, equal_to(expected_elements))
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls(
      [mock.call(mock.ANY, [ack_id])])
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output', dest='output_file',
                        help='Output file where to write')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=main-training-project', '--streaming'])
    """
    pipeline_args.extend(['--runner=DataflowRunner',
                          '--project=yourprojectid',
                          '--staging_location=gs://yourgsbucket',
                          '--temp_location=gs://yourgsbucket',
                          '--job_name=your-job-name'])
    """

    pipeline_options = PipelineOptions(pipeline_args)
    #pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            import nonpypimodule
            import changecommentfield
            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            bq_rows['post'] = nonpypimodule.return_sentence()
            bq_rows = changecommentfield.change_field(bq_rows)
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
def test_expand_deprecated(self):
  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub('projects/fakeprj/topics/baz')
           | WriteStringsToPubSub('projects/fakeprj/topics/a_topic')
           | beam.Map(lambda x: x))

  # Apply the necessary PTransformOverrides.
  overrides = _get_transform_overrides(p.options)
  p.replace_all(overrides)

  # Note that the direct output of ReadFromPubSub will be replaced
  # by a PTransformOverride, so we use a no-op Map.
  write_transform = pcoll.producer.inputs[0].producer.transform

  # Ensure that the properties passed through correctly
  self.assertEqual('a_topic', write_transform.dofn.short_topic_name)
def test_read_message_id_label_unsupported(self, mock_pubsub):
  # id_label is unsupported in DirectRunner.
  data = 'data'
  message_id = 'message_id'
  attributes = {'time': '1337 unparseable'}
  publish_time = '2018-03-12T13:37:01.234567Z'
  payloads = [
      create_client_message(data, message_id, attributes, publish_time)
  ]

  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  _ = (p
       | ReadFromPubSub('projects/fakeprj/topics/a_topic', None, 'a_label'))
  with self.assertRaisesRegexp(NotImplementedError,
                               r'id_label is not supported'):
    p.run()
def test_read_messages_timestamp_attribute_fail_parse(self, mock_pubsub):
  data = 'data'
  message_id = 'message_id'
  attributes = {'time': '1337 unparseable'}
  publish_time = '2018-03-12T13:37:01.234567Z'
  payloads = [
      create_client_message(data, message_id, attributes, publish_time)
  ]

  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  _ = (p
       | ReadFromPubSub(
           'projects/fakeprj/topics/a_topic', None, 'a_label',
           with_attributes=True, timestamp_attribute='time'))
  with self.assertRaisesRegexp(ValueError, r'parse'):
    p.run()