Example #1
    def pipeline(self):
        pipeline = beam.Pipeline(options=self.options)
        messages = self.message_sources(pipeline)
        messages = (messages
                    | "MergeMessages" >> beam.Flatten()
                    | "MessagesSsvid2Str" >> beam.Map(self.ssvid_to_str)
                    | "Normalize" >> beam.ParDo(NormalizeDoFn())
                    | "MessagesAddKey" >> beam.Map(self.groupby_fn)
                    | "MessagesGroupByKey" >> beam.GroupByKey())

        segments = (pipeline
                    | "ReadSegments" >> self.segment_source
                    | "SegmentsAddKey" >> beam.Map(self.groupby_fn)
                    | "SegmentsGroupByKey" >> beam.GroupByKey())

        segmenter = Segment(segments, segmenter_params=self.segmenter_params)
        segmented = messages | "Segment" >> segmenter

        messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
        segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
        (messages
         | "TimestampMessages" >> beam.ParDo(TimestampedValueDoFn())
         | "WriteMessages" >> self.message_sink)
        (segments
         | "TimestampSegments" >> beam.ParDo(TimestampedValueDoFn())
         | "WriteSegments" >> self.segment_sink(segmenter.segment_schema))
        return pipeline
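The messages and segments branches above share the same pattern: key each element with a grouping function, then GroupByKey so that all records for one vessel end up together. A minimal, self-contained sketch of that pattern; the key function and field names here are illustrative assumptions, not the pipeline's actual groupby_fn:

import apache_beam as beam

def key_by_ssvid(msg):
    # hypothetical keying function: pair each message with its vessel id
    return (str(msg['ssvid']), msg)

with beam.Pipeline() as p:
    grouped = (p
               | beam.Create([{'ssvid': 1, 'lat': 0.0}, {'ssvid': 1, 'lat': 0.1}])
               | "AddKey" >> beam.Map(key_by_ssvid)
               | "GroupByKey" >> beam.GroupByKey())  # yields ('1', [msg, msg])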
Example #2
def build_pipeline(options):

    source = beam.io.gcp.bigquery.BigQuerySource(query=options.query)

    pipeline = beam.Pipeline(options=options)
    (pipeline
     | "ReadFromBigQuery" >> ReadAsJSONDict(source)
     | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
     | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())
     | "WriteDatePartitions" >> WriteToDatePartitionedFiles(
         file_path_prefix=options.output_file_prefix,
         file_name_suffix=options.output_file_suffix,
         shards_per_day=options.shards_per_day))

    return pipeline
Example #3
def run(args=None):
  pipeline_options = PipelineOptions(args)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True

  normalize_options = pipeline_options.view_as(NormalizeOptions)
  gcp_options = pipeline_options.view_as(GoogleCloudOptions)

  d1, d2 = parse_date_range(normalize_options.date_range)
  helper = QueryHelper(table=normalize_options.source_table, first_date_ts=d1, last_date_ts=d2)
  select_fields = ['mmsi', 'timestamp', 'seg_id', 'shipname', 'callsign', 'imo']
  where_sql = 'shipname is not null or callsign is not null or imo is not null'
  if normalize_options.mmsi_quotient > 1:
      where_sql = "hash(mmsi) % {} = 0 and ({})".format(normalize_options.mmsi_quotient, where_sql)

  source_schema = helper.filter_table_schema(select_fields)
  source = BigQuerySource(query=helper.build_query(include_fields=select_fields, where_sql=where_sql))

  dest_schema = TableSchema(fields=source_schema.fields)
  dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_SHIPNAME, type='STRING'))
  dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_CALLSIGN, type='STRING'))
  dest_schema.fields.append(TableFieldSchema(name=VALID_IMO, type='INTEGER'))

  pipeline = beam.Pipeline(options=pipeline_options)
  (
      pipeline
      | "ReadSource" >> ReadAsJSONDict(source)
      | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
      | "AddTimestamp" >> beam.ParDo(TimestampedValueDoFn())
      | "NormalizeNames" >> beam.ParDo(NormalizeNamesDoFn())
      | "WriteDest" >> WriteToBigQueryDatePartitioned(
          temp_gcs_location=gcp_options.temp_location,
          table=normalize_options.dest_table,
          schema=dest_schema,
          write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
  )

  result = pipeline.run()
  success_states = set([PipelineState.DONE])

  if normalize_options.wait:
    result.wait_until_finish()
  else:
    success_states.add(PipelineState.RUNNING)

  return 0 if result.state in success_states else 1
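Since run() returns a process-style exit code (0 on success, 1 otherwise), it is typically wired to a command-line entry point along these lines; this is a sketch, and the logging setup is an assumption rather than part of the source:

import logging
import sys

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    # exit with the 0/1 status computed by run() above
    sys.exit(run(sys.argv[1:]))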
Example #4
def build_pipeline(options):

    source = beam.io.gcp.bigquery.BigQuerySource(query=options.query)
    temp_gcs_location = options.view_as(GoogleCloudOptions).temp_location

    pipeline = beam.Pipeline(options=options)
    (pipeline
     | "ReadFromBigQuery" >> ReadAsJSONDict(source)
     | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
     | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())
     | "WriteDatePartitions" >> WriteToBigQueryDatePartitioned(
         temp_gcs_location=temp_gcs_location,
         table=options.output_table,
         schema=options.schema,
         temp_shards_per_day=3))

    return pipeline
def build_pipeline(options):

    standard_options = options.view_as(StandardOptions)
    if standard_options.runner is None:
        standard_options.runner = StandardOptions.DEFAULT_RUNNER

    pipeline = beam.Pipeline(options=options)

    # generate some messages to write
    generator = MessageGenerator(count=int(options.count))
    messages = pipeline | "GenerateMessages" >> GenerateMessages(generator=generator)
    # date partitioning is based on the timestamp used for windowing
    messages = messages | "AddTimestampedValue" >> beam.ParDo(TimestampedValueDoFn())
    messages | "WriteDatePartitions" >> WriteToDatePartitionedFiles(file_path_prefix=options.output_file_prefix,
                                                                    file_name_suffix=options.output_file_suffix,
                                                                    shards_per_day=options.shards_per_day)

    return pipeline
def build_pipeline(options):

    pipeline = beam.Pipeline(options=options)
    temp_gcs_location = options.view_as(GoogleCloudOptions).temp_location

    # generate some messages to write
    generator = MessageGenerator(count=int(options.count))
    schema = generator.bigquery_schema()
    messages = pipeline | "GenerateMessages" >> GenerateMessages(
        generator=generator)

    # date partitioning is based on the timestamp used for windowing
    messages = messages | "AddTimestampedValue" >> beam.ParDo(
        TimestampedValueDoFn())

    messages | "WriteDatePartitions" >> WriteToBigQueryDatePartitioned(
        temp_gcs_location=temp_gcs_location,
        table=options.output_table,
        schema=schema,
        temp_shards_per_day=1)

    return pipeline
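Several of these pipelines apply TimestampedValueDoFn immediately before a date-partitioned sink because, as the comments note, date partitioning is driven by the timestamp used for windowing. A minimal sketch of what such a DoFn can look like, written as an assumption for illustration rather than the actual pipe-tools implementation:

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

class ExampleTimestampedValueDoFn(beam.DoFn):
    """Attach each element's own 'timestamp' field as its Beam event timestamp."""

    def process(self, msg):
        # assumes each message dict carries a unix-epoch 'timestamp' field
        yield TimestampedValue(msg, msg['timestamp'])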