示例#1
0
文件: pubsub.py 项目: mahak/beam
 def expand(self, pcoll):
     """Convert every element to a protobuf string and write it to the sink.

     The converter is ``self.message_to_proto_str`` when
     ``self.with_attributes`` is truthy and ``self.bytes_to_proto_str``
     otherwise. The converted collection is tagged with an element type of
     ``bytes`` before it is handed to ``Write(self._sink)``.

     Args:
         pcoll: Input collection (presumably a Beam PCollection - confirm
             against the enclosing transform).

     Returns:
         Whatever applying ``Write(self._sink)`` to the converted collection
         yields.
     """
     to_proto = (
         self.message_to_proto_str
         if self.with_attributes
         else self.bytes_to_proto_str
     )
     serialized = pcoll | 'ToProtobuf' >> Map(to_proto)
     # Explicitly declare the element type of the converted collection.
     serialized.element_type = bytes
     return serialized | Write(self._sink)
示例#2
0
def run(argv=None, save_main_session=True):
    """Parse command-line flags, then build and run the counting pipeline.

    The pipeline reads text from ``--input``, applies ``WordExtractingDoFn``,
    pairs every emitted element with ``1``, sums the ones per key, formats
    each ``(key, total)`` pair with ``format_output`` and writes the result
    to ``--output`` through ``Utf8TextSink``.

    Args:
        argv: Argument list passed to ``ArgumentParser.parse_known_args``.
            Flags that are not declared here are forwarded to
            ``PipelineOptions``.
        save_main_session: Value assigned to
            ``SetupOptions.save_main_session`` on the pipeline options.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    arg_parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    flags, beam_args = arg_parser.parse_known_args(argv)

    options = PipelineOptions(beam_args)
    setup_options = options.view_as(SetupOptions)
    setup_options.save_main_session = save_main_session

    with beam.Pipeline(options=options) as pipeline:
        # Read the text file.
        lines = pipeline | 'Read' >> ReadFromText(flags.input)

        extract = beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)
        words = lines | 'Split' >> extract
        # NOTE: the step label below is a runtime string and is kept verbatim.
        ones = words | 'PairWIthOne' >> beam.Map(lambda word: (word, 1))
        totals = ones | 'GroupAndSum' >> beam.CombinePerKey(sum)
        # Formatting step, kept for logging purposes.
        formatted = totals | 'Format' >> beam.MapTuple(format_output)

        # A custom text sink is used so the output displays nicely in GCS.
        formatted | 'Write' >> Write(Utf8TextSink(flags.output))
示例#3
0
  def expand(self, pcoll):
    """Write the collection to the sink, serializing first if needed.

    When ``self.with_attributes`` is truthy, each element is passed through
    ``self.to_proto_str`` so that message data and attributes travel together
    as a serialized protobuf string (see ``PubsubMessage._to_proto_str`` for
    the exact protobuf message type). Otherwise the message data is written
    as-is.

    Args:
      pcoll: Input collection (presumably a Beam PCollection - confirm
        against the enclosing transform).

    Returns:
      Whatever applying ``Write(self._sink)`` to the collection yields.
    """
    payloads = pcoll
    if self.with_attributes:
      payloads = payloads | 'ToProtobuf' >> Map(self.to_proto_str)

    # Without attributes ``payloads`` is still the caller's collection, so
    # this assignment sets the element type on the input object itself.
    payloads.element_type = bytes
    return payloads | Write(self._sink)
示例#4
0
 def expand(self, pcoll):
     """Encode every element as UTF-8 and write the result to the sink.

     Each element has ``.encode('utf-8')`` called on it; the encoded
     collection is tagged with an element type of ``bytes`` and then handed
     to ``Write(self._sink)``.

     Args:
         pcoll: Input collection whose elements expose ``encode`` (presumably
             text strings - confirm against the caller).

     Returns:
         Whatever applying ``Write(self._sink)`` to the encoded collection
         yields.
     """
     encoded = pcoll | 'EncodeString' >> Map(
         lambda text: text.encode('utf-8'))
     encoded.element_type = bytes
     return encoded | Write(self._sink)
示例#5
0
 def expand(self, pcoll):
     """Apply a ``Write`` transform built from ``self._sink`` to ``pcoll``.

     Args:
         pcoll: Input collection to be written.

     Returns:
         Whatever applying ``Write(self._sink)`` to ``pcoll`` yields.
     """
     write_to_sink = Write(self._sink)
     return pcoll | write_to_sink
示例#6
0
 def expand(self, pcoll):
     """Write ``pcoll`` through a freshly built ``_TFRecordSink``.

     The sink is constructed from the positional arguments stored in
     ``self._args`` and wrapped in a ``Write`` transform.

     Args:
         pcoll: Input collection to be written.

     Returns:
         Whatever applying the ``Write`` transform to ``pcoll`` yields.
     """
     record_sink = _TFRecordSink(*self._args)
     return pcoll | Write(record_sink)
示例#7
0
 def expand(self, pcoll):
     """Run ``_encodeUtf8String`` over the elements, then write to the sink.

     The output of the ``ParDo`` step is tagged with an element type of
     ``bytes`` before it is handed to ``Write(self._sink)``.

     Args:
         pcoll: Input collection (presumably of text strings, judging by the
             helper's name - confirm against ``_encodeUtf8String``).

     Returns:
         Whatever applying ``Write(self._sink)`` to the processed collection
         yields.
     """
     encoded = pcoll | 'encode string' >> ParDo(_encodeUtf8String)
     encoded.element_type = bytes
     return encoded | Write(self._sink)