def pipeline(self):
    pipeline = beam.Pipeline(options=self.options)

    # Merge and normalize the incoming messages, then group them by the configured key
    messages = self.message_sources(pipeline)
    messages = (
        messages
        | "MergeMessages" >> beam.Flatten()
        | "MessagesSsvid2Str" >> beam.Map(self.ssvid_to_str)
        | "Normalize" >> beam.ParDo(NormalizeDoFn())
        | "MessagesAddKey" >> beam.Map(self.groupby_fn)
        | "MessagesGroupByKey" >> beam.GroupByKey()
    )

    # Read the previously stored segments and group them by the same key
    segments = (
        pipeline
        | "ReadSegments" >> self.segment_source
        | "SegmentsAddKey" >> beam.Map(self.groupby_fn)
        | "SegmentsGroupByKey" >> beam.GroupByKey()
    )

    # Run the segmenter; its output carries messages and segments on separate tags
    segmenter = Segment(segments, segmenter_params=self.segmenter_params)
    segmented = messages | "Segment" >> segmenter
    messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
    segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]

    # Timestamp each output and write it to its sink
    (
        messages
        | "TimestampMessages" >> beam.ParDo(TimestampedValueDoFn())
        | "WriteMessages" >> self.message_sink
    )
    (
        segments
        | "TimestampSegments" >> beam.ParDo(TimestampedValueDoFn())
        | "WriteSegments" >> self.segment_sink(segmenter.segment_schema)
    )

    return pipeline
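# A minimal driver sketch for the pipeline() method above. The enclosing class is
# not shown here, so the SegmentPipeline name and its constructor arguments are
# assumptions; the point is only that the method returns a beam.Pipeline that the
# caller runs and waits on.
def run_segment_pipeline(options):
    segment_pipeline = SegmentPipeline(options)  # hypothetical class holding the method above
    result = segment_pipeline.pipeline().run()
    result.wait_until_finish()
    return 0 if result.state == PipelineState.DONE else 1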
def build_pipeline(options):
    source = beam.io.gcp.bigquery.BigQuerySource(query=options.query)

    pipeline = beam.Pipeline(options=options)
    (
        pipeline
        | "ReadFromBigQuery" >> ReadAsJSONDict(source)
        | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
        | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())
        | "WriteDatePartitions" >> WriteToDatePartitionedFiles(
            file_path_prefix=options.output_file_prefix,
            file_name_suffix=options.output_file_suffix,
            shards_per_day=options.shards_per_day)
    )
    return pipeline
def run(args=None):
    pipeline_options = PipelineOptions(args)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options.view_as(SetupOptions).save_main_session = True
    normalize_options = pipeline_options.view_as(NormalizeOptions)
    gcp_options = pipeline_options.view_as(GoogleCloudOptions)

    d1, d2 = parse_date_range(normalize_options.date_range)
    helper = QueryHelper(table=normalize_options.source_table,
                         first_date_ts=d1, last_date_ts=d2)
    select_fields = ['mmsi', 'timestamp', 'seg_id', 'shipname', 'callsign', 'imo']
    where_sql = 'shipname is not null or callsign is not null or imo is not null'
    if normalize_options.mmsi_quotient > 1:
        where_sql = "hash(mmsi) % {} = 0 and ({})".format(
            normalize_options.mmsi_quotient, where_sql)
    source_schema = helper.filter_table_schema(select_fields)
    source = BigQuerySource(query=helper.build_query(include_fields=select_fields,
                                                     where_sql=where_sql))

    dest_schema = TableSchema(fields=source_schema.fields)
    dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_SHIPNAME, type='STRING'))
    dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_CALLSIGN, type='STRING'))
    dest_schema.fields.append(TableFieldSchema(name=VALID_IMO, type='INTEGER'))

    pipeline = beam.Pipeline(options=pipeline_options)
    (
        pipeline
        | "ReadSource" >> ReadAsJSONDict(source)
        | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
        | "AddTimestamp" >> beam.ParDo(TimestampedValueDoFn())
        | "NormalizeNames" >> beam.ParDo(NormalizeNamesDoFn())
        | "WriteDest" >> WriteToBigQueryDatePartitioned(
            temp_gcs_location=gcp_options.temp_location,
            table=normalize_options.dest_table,
            schema=dest_schema,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
    )

    result = pipeline.run()

    success_states = {PipelineState.DONE}
    if normalize_options.wait:
        result.wait_until_finish()
    else:
        success_states.add(PipelineState.RUNNING)

    return 0 if result.state in success_states else 1
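# A minimal entry-point sketch (not part of the original module) showing how
# run() above would typically be wired up as a command-line script: forward the
# raw arguments to PipelineOptions and turn the 0/1 result into the exit code.
if __name__ == '__main__':
    import logging
    import sys

    logging.getLogger().setLevel(logging.INFO)
    sys.exit(run(sys.argv[1:]))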
def build_pipeline(options):
    source = beam.io.gcp.bigquery.BigQuerySource(query=options.query)
    temp_gcs_location = options.view_as(GoogleCloudOptions).temp_location

    pipeline = beam.Pipeline(options=options)
    (
        pipeline
        | "ReadFromBigQuery" >> ReadAsJSONDict(source)
        | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
        | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())
        | "WriteDatePartitions" >> WriteToBigQueryDatePartitioned(
            temp_gcs_location=temp_gcs_location,
            table=options.output_table,
            schema=options.schema,
            temp_shards_per_day=3)
    )
    return pipeline
def build_pipeline(options):
    standard_options = options.view_as(StandardOptions)
    if standard_options.runner is None:
        standard_options.runner = StandardOptions.DEFAULT_RUNNER

    pipeline = beam.Pipeline(options=options)

    # generate some messages to write
    generator = MessageGenerator(count=int(options.count))
    messages = pipeline | "GenerateMessages" >> GenerateMessages(generator=generator)

    # date partitioning is based on the timestamp used for windowing
    messages = messages | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())

    messages | "WriteDatePartitions" >> WriteToDatePartitionedFiles(
        file_path_prefix=options.output_file_prefix,
        file_name_suffix=options.output_file_suffix,
        shards_per_day=options.shards_per_day)

    return pipeline
def build_pipeline(options):
    pipeline = beam.Pipeline(options=options)
    temp_gcs_location = options.view_as(GoogleCloudOptions).temp_location

    # generate some messages to write
    generator = MessageGenerator(count=int(options.count))
    schema = generator.bigquery_schema()
    messages = pipeline | "GenerateMessages" >> GenerateMessages(generator=generator)

    # date partitioning is based on the timestamp used for windowing
    messages = messages | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())

    messages | "WriteDatePartitions" >> WriteToBigQueryDatePartitioned(
        temp_gcs_location=temp_gcs_location,
        table=options.output_table,
        schema=schema,
        temp_shards_per_day=1)

    return pipeline
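# A minimal usage sketch for the build_pipeline() helpers above. It assumes the
# caller already has an options object exposing the custom fields build_pipeline
# reads (count, output_table, ...), e.g. a view of a PipelineOptions subclass;
# the run_example name and wait_for_done flag are illustrative, not part of the
# original code.
def run_example(options, wait_for_done=True):
    pipeline = build_pipeline(options)
    result = pipeline.run()
    if wait_for_done:
        result.wait_until_finish()
    return 0 if result.state == PipelineState.DONE else 1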