def run(argv=sys.argv[1:]):
    """Runs the workflow computing total points from a collection of matches."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Register the custom coder for the Player class, so that it will be used in
    # the computation.
    coders.registry.register_coder(Player, PlayerCoder)

    (p  # pylint: disable=expression-not-assigned
     | df.io.Read('read', df.io.TextFileSource(known_args.input))
     # The get_players function is annotated with a type hint above, so the type
     # system knows the output type of the following operation is a key-value pair
     # of a Player and an int. Please see the documentation for details on
     # types that are inferred automatically as well as other ways to specify
     # type hints.
     | df.Map('get players', get_players)
     # The output type hint of the previous step is used to infer that the key
     # type of the following operation is the Player type. Since a custom coder
     # is registered for the Player class above, a PlayerCoder will be used to
     # encode Player objects as keys for this combine operation.
     | df.CombinePerKey(sum) | df.Map(lambda (k, v): '%s,%d' % (k.name, v))
     | df.io.Write('write', df.io.TextFileSink(known_args.output)))
    p.run()
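
For context, the Player class, PlayerCoder, and get_players helper used above are defined elsewhere in this example module. A minimal sketch of what they might look like (the encoding format and the 'name,points' input parsing are illustrative assumptions, not the exact definitions):

import google.cloud.dataflow as df
from google.cloud.dataflow import coders


class Player(object):
    """Value class used as a key; holds only the player name (sketch)."""
    def __init__(self, name):
        self.name = name


class PlayerCoder(coders.Coder):
    """Hypothetical coder serializing Player objects to and from strings."""
    def encode(self, player):
        return 'player:%s' % player.name

    def decode(self, encoded):
        return Player(encoded.split(':', 1)[1])

    def is_deterministic(self):
        # Keys must be encoded deterministically to be grouped/combined.
        return True


@df.typehints.with_output_types(df.typehints.Tuple[Player, int])
def get_players(line):
    """Parses an assumed 'name,points' line into a (Player, points) pair."""
    name, points = line.split(',')
    return Player(name), int(points)
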
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='clouddataflow-readonly:samples.weather_stations',
        help=('Input BigQuery table to process specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the table rows into a PCollection.
    rows = p | df.io.Read('read', df.io.BigQuerySource(known_args.input))
    counts = count_tornadoes(rows)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    counts | df.io.Write(
        'write',
        df.io.BigQuerySink(
            known_args.output,
            schema='month:INTEGER, tornado_count:INTEGER',
            create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Run the pipeline (all operations are deferred until run() is called).
    p.run()
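
The count_tornadoes helper is defined in the bigquery_tornadoes module. A plausible sketch (the transform labels are assumptions), consistent with the unit test further below that expects per-month tornado counts:

import google.cloud.dataflow as df


def count_tornadoes(rows):
    """Counts how many rows per month have tornado == True (sketch)."""
    return (rows
            # Keep only rows reporting a tornado, keyed by month.
            | df.FlatMap('months with tornadoes',
                         lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
            # Sum the 1s per month.
            | df.CombinePerKey('monthly count', sum)
            # Emit rows matching the BigQuery output schema.
            | df.Map('format',
                     lambda kv: {'month': kv[0], 'tornado_count': kv[1]}))
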
Example #3
    def test_aggregation(self):

        mean = combiners.MeanCombineFn()
        mean.__name__ = 'mean'
        counter_types = [
            (sum, int, 6),
            (min, int, 0),
            (max, int, 3),
            (mean, int, 1),
            (sum, float, 6.0),
            (min, float, 0.0),
            (max, float, 3.0),
            (mean, float, 1.5),
            (any, int, True),
            (all, float, False),
        ]
        aggregators = [
            Aggregator('%s_%s' % (f.__name__, t.__name__), f, t)
            for f, t, _ in counter_types
        ]

        class UpdateAggregators(df.DoFn):
            def process(self, context):
                for a in aggregators:
                    context.aggregate_to(a, context.element)

        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create([0, 1, 2, 3]) | df.ParDo(UpdateAggregators())
        res = p.run()
        for (_, _, expected), a in zip(counter_types, aggregators):
            actual = res.aggregated_values(a).values()[0]
            self.assertEqual(expected, actual)
            self.assertEqual(type(expected), type(actual))
Example #4
  def test_run_direct(self):
    file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    pipeline = df.Pipeline('DirectPipelineRunner')
    pcoll = pipeline | df.Read(LineSource(file_name))
    assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

    pipeline.run()
Example #5
def run_count3(known_args, options):
  """Runs the third example pipeline."""

  @df.ptransform_fn
  # pylint: disable=invalid-name,unused-argument
  def Count(label, pcoll, factor=1):
    """Count as a decorated function with a side input.

    Args:
      label: optional label for this transform
      pcoll: the PCollection passed in from the previous transform
      factor: the amount by which to count

    Returns:
      A PCollection counting the number of times each unique element occurs.
    """
    return (
        pcoll
        | df.Map('Init', lambda v: (v, factor))
        | df.CombinePerKey(sum))

  logging.info('Running third pipeline')
  p = df.Pipeline(options=options)
  (p | df.io.Read(df.io.TextFileSource(known_args.input))
   | Count(2)  # pylint: disable=no-value-for-parameter
   | df.io.Write(df.io.TextFileSink(known_args.output)))
  p.run()
 def test_basics(self):
     p = df.Pipeline('DirectPipelineRunner')
     rows = (p | df.Create('create', [
         {'month': 1, 'day': 1, 'tornado': False},
         {'month': 1, 'day': 2, 'tornado': True},
         {'month': 1, 'day': 3, 'tornado': True},
         {'month': 2, 'day': 1, 'tornado': True}]))
     results = bigquery_tornadoes.count_tornadoes(rows)
     df.assert_that(
         results,
         df.equal_to([{'month': 1, 'tornado_count': 2},
                      {'month': 2, 'tornado_count': 1}]))
     p.run()
Example #7
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))
    counts = (lines
              | (df.ParDo('split',
                          WordExtractingDoFn()).with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    result = p.run()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
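
WordExtractingDoFn and the two aggregators it updates live in the same wordcount module. A hedged sketch of how they are likely defined (aggregator names and combine functions are assumptions):

import re

import google.cloud.dataflow as df

empty_line_aggregator = df.Aggregator('emptyLines')
average_word_size_aggregator = df.Aggregator('averageWordLength',
                                             df.combiners.MeanCombineFn(),
                                             float)


class WordExtractingDoFn(df.DoFn):
    """Splits each line into words while updating the aggregators (sketch)."""
    def process(self, context):
        text_line = context.element.strip()
        if not text_line:
            context.aggregate_to(empty_line_aggregator, 1)
        words = re.findall(r'[A-Za-z\']+', text_line)
        for word in words:
            context.aggregate_to(average_word_size_aggregator, len(word))
        return words
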
Example #8
def run(argv=None):
    """Constructs and runs the example filtering pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        help='BigQuery table to read from.',
        default='clouddataflow-readonly:samples.weather_stations')
    parser.add_argument('--output',
                        required=True,
                        help='BigQuery table to write to.')
    parser.add_argument('--month_filter',
                        default=7,
                        help='Numeric value of month to filter on.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    input_data = p | df.Read('input', df.io.BigQuerySource(known_args.input))

    # pylint: disable=expression-not-assigned
    (filter_cold_days(input_data, known_args.month_filter)
     | df.io.Write(
         'save to BQ',
         df.io.BigQuerySink(
             known_args.output,
             schema='year:INTEGER,month:INTEGER,day:INTEGER,mean_temp:FLOAT',
             create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
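
filter_cold_days is defined alongside this pipeline. A hedged sketch of one way it could be written: project the fields in the output schema, compute the global mean temperature as a side input, and keep rows in the requested month that fall below it (labels and the below-the-mean criterion are assumptions):

import google.cloud.dataflow as df


def filter_cold_days(input_data, month_filter):
    """Keeps rows in the given month with mean_temp below the global mean (sketch)."""
    fields_of_interest = (
        input_data
        | df.Map('project',
                 lambda row: {f: row[f]
                              for f in ('year', 'month', 'day', 'mean_temp')}))
    global_mean = df.pvalue.AsSingleton(
        fields_of_interest
        | df.Map('extract mean_temp', lambda row: row['mean_temp'])
        | df.CombineGlobally('global mean', df.combiners.MeanCombineFn()))
    return (fields_of_interest
            | df.Filter('desired month',
                        lambda row: int(row['month']) == int(month_filter))
            | df.Filter('below mean',
                        lambda row, mean: row['mean_temp'] < mean, global_mean))
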
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  (p  # pylint: disable=expression-not-assigned
   | df.Read('read', df.io.TextFileSource(known_args.input))
   | ComputeTopSessions(known_args.sampling_threshold)
   | df.io.Write('write', df.io.TextFileSink(known_args.output)))

  p.run()
    def test_compute_top_sessions(self):
        p = df.Pipeline('DirectPipelineRunner')
        edits = p | df.Create('create', self.EDITS)
        result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

        df.assert_that(result, df.equal_to(self.EXPECTED))
        p.run()
Example #11
    def preprocess(self, input_path, input_dict, output_path):
        """

    Args:
      input_path: Input specified as uri to CSV file. Each line of csv file
                  contains colon-separated GCS uri to an image and labels
      input_dict: Input dictionary. Specified as text file uri.
                  Each line of the file stores one label.
    """
        opt = self.pipeline_options.view_as(PrepareImagesOptions)
        p = df.Pipeline(options=self.pipeline_options)

        # Read input data.
        csv_data = df.io.TextFileSource(input_path,
                                        strip_trailing_newlines=True)
        dict_data = df.io.TextFileSource(input_dict,
                                         strip_trailing_newlines=True)
        labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
        content = (p | df.Read(StageName.READ_CSV, csv_data)
                   | df.Map(StageName.PARSE_CSV,
                            lambda line: csv.reader([line]).next())
                   | df.ParDo(StageName.EXTRACT_LABEL_IDS,
                              ExtractLabelIdsDoFn(), df.pvalue.AsIter(labels))
                   | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

        # Process input data using common transformations.
        image_graph_uri = os.path.join(opt.input_data_location,
                                       Default.IMAGE_GRAPH_FILENAME)
        examples = (
            content
            | df.ParDo(
                StageName.CONVERT_IMAGE,
                ResizeImageDoFn(Default.IMAGE_TYPE, opt.max_image_width,
                                opt.max_image_height))
            | df.ParDo(
                StageName.ENCODE_EXAMPLE,
                EncodeExampleDoFn(image_graph_uri,
                                  opt.image_graph_jpeg_input_tensor,
                                  opt.image_graph_output_tensor,
                                  opt.training_data_percentage)))

        # Write in JSON format to Text file.
        # Remove redundant whitespace for more compact representation.
        # Images/labels are base64 encoded so will not contain spaces.
        to_json = lambda x: re.sub(r'\s+', ' ',
                                   json_format.MessageToJson(x[0]))

        for dataset in Dataset.ALL:
            _ = (examples
                 | df.Filter(StageName.FILTER + dataset,
                             lambda x, dataset=dataset: x[1] == dataset)
                 | df.Map(StageName.TO_JSON + dataset, to_json)
                 | df.Write(
                     StageName.SAVE + dataset,
                     df.io.TextFileSink('{}.{}.json'.format(
                         output_path, dataset),
                                        num_shards=opt.output_shard_count)))

        # Execute the pipeline.
        p.run()
def run(argv=None):
  """Build and run the pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_topic', dest='input_topic', required=True,
      help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  parser.add_argument(
      '--output_topic', dest='output_topic', required=True,
      help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read the text file[pattern] into a PCollection.
  lines = p | df.io.Read(
      'read', df.io.PubSubSource(known_args.input_topic))

  # Capitalize the characters in each line.
  transformed = (lines
                 | (df.Map('capitalize', lambda x: x.upper())))

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  transformed | df.io.Write(
      'pubsub_write', df.io.PubSubSink(known_args.output_topic))

  p.run()
def run(argv=None):
    """Build and run the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        required=True,
        help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    parser.add_argument(
        '--output_topic',
        required=True,
        help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.PubSubSource(known_args.input_topic))

    # Capitalize the characters in each line.
    transformed = (lines
                   | (df.FlatMap('split', lambda x: re.findall(
                       r'[A-Za-z\']+', x)).with_output_types(unicode))
                   | df.Map('pair_with_one', lambda x: (x, 1))
                   | df.WindowInto(window.FixedWindows(15, 0))
                   | df.GroupByKey('group')
                   | df.Map('count', lambda (word, ones): (word, sum(ones)))
                   | df.Map('format', lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | df.io.Write('pubsub_write',
                              df.io.PubSubSink(known_args.output_topic))

    p.run()
Example #14
 def test_runtime_checks_on(self):
     p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
     with self.assertRaises(typehints.TypeCheckError):
         # [START type_hints_runtime_on]
         p.options.view_as(TypeOptions).runtime_type_check = True
         p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
         p.run()
Example #15
    def test_bad_types(self):
        p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)

        # [START type_hints_missing_define_numbers]
        numbers = p | df.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # [START type_hints_missing_apply]
        evens = numbers | df.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as in the snippet above.
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            p.options.view_as(TypeOptions).pipeline_type_check = True
            evens = numbers | df.Filter(lambda x: x % 2 == 0).with_input_types(
                int)
            # [END type_hints_takes]

        # Type hints can be declared on DoFns and callables as well, rather
        # than where they're used, to be more self-contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @df.typehints.with_input_types(int)
            class FilterEvensDoFn(df.DoFn):
                def process(self, context):
                    if context.element % 2 == 0:
                        yield context.element

            evens = numbers | df.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | df.Create('words', ['a', 'bb', 'c'])
        # Output type hints can also be declared on whole transforms. This
        # documents the contract and checks it at pipeline construction time.
        # [START type_hints_transform]
        T = df.typehints.TypeVariable('T')

        @df.typehints.with_input_types(T)
        @df.typehints.with_output_types(df.typehints.Tuple[int, T])
        class MyTransform(df.PTransform):
            def apply(self, pcoll):
                return pcoll | df.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | df.Map(lambda x: x).with_input_types(
                df.typehints.Tuple[int, int])
Example #16
 def test_tfidf_transform(self):
     p = df.Pipeline('DirectPipelineRunner')
     uri_to_line = p | df.Create('create sample', [('1.txt', 'abc def ghi'),
                                                   ('2.txt', 'abc def'),
                                                   ('3.txt', 'abc')])
     result = (uri_to_line
               | tfidf.TfIdf()
               | df.Map('flatten', lambda (word, (uri, tfidf)):
                        (word, uri, tfidf)))
Example #17
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectPipelineRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    counts = (lines
              | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)).
                 with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
Example #18
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileSink(temp_path,
                       file_name_suffix='.foo',
                       coder=coders.ToStringCoder())
     p = df.Pipeline('DirectPipelineRunner')
     p | df.Create([]) | df.io.Write(sink)  # pylint: disable=expression-not-assigned
     p.run()
     self.assertEqual(
         open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
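
MyFileSink is a small test double defined in the surrounding test module. Judging from the expected '[start]...[end]' output, it plausibly subclasses the SDK's file sink along these lines (a sketch; the base-class import path is an assumption):

from google.cloud.dataflow.io import fileio


class MyFileSink(fileio.FileSink):
    """Test sink writing '[start]', one '[record]' per element, then '[end]' (sketch)."""

    def open(self, temp_path):
        file_handle = super(MyFileSink, self).open(temp_path)
        file_handle.write('[start]')
        return file_handle

    def write_encoded_record(self, file_handle, encoded_value):
        file_handle.write('[%s]' % encoded_value)

    def close(self, file_handle):
        file_handle.write('[end]')
        super(MyFileSink, self).close(file_handle)
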
Example #19
def run(argv=None):
    """Runs the workflow counting the long words and short words separately."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output prefix for files to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.Read('read', df.io.TextFileSource(known_args.input))

    # with_outputs allows accessing the side outputs of a DoFn.
    split_lines_result = (
        lines
        | df.ParDo(SplitLinesToWordsFn()).with_outputs(
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
            main='words'))

    # split_lines_result is an object of type DoOutputsTuple. It supports
    # accessing result in alternative ways.
    words, _, _ = split_lines_result
    short_words = split_lines_result[
        SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
    character_count = split_lines_result.tag_character_count

    # pylint: disable=expression-not-assigned
    (character_count
     | df.Map('pair_with_key', lambda x: ('chars_temp_key', x))
     | df.GroupByKey()
     | df.Map('count chars', lambda (_, counts): sum(counts))
     | df.Write('write chars',
                df.io.TextFileSink(known_args.output + '-chars')))

    # pylint: disable=expression-not-assigned
    (short_words
     | CountWords('count short words')
     | df.Write('write short words',
                df.io.TextFileSink(known_args.output + '-short-words')))

    # pylint: disable=expression-not-assigned
    (words
     | CountWords('count words')
     | df.Write('write words',
                df.io.TextFileSink(known_args.output + '-words')))

    p.run()
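
SplitLinesToWordsFn emits words on the main output and uses two tagged side outputs. A hedged sketch consistent with the tag names used above (the short-word threshold and the short-words tag string are assumptions):

import re

import google.cloud.dataflow as df


class SplitLinesToWordsFn(df.DoFn):
    """Splits lines into words, with side outputs for short words and character counts (sketch)."""
    SIDE_OUTPUT_TAG_SHORT_WORDS = 'tag_short_words'
    SIDE_OUTPUT_TAG_CHARACTER_COUNT = 'tag_character_count'

    def process(self, context):
        # Side output: the character count of each line.
        yield df.pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                                        len(context.element))
        for word in re.findall(r'[A-Za-z\']+', context.element):
            if len(word) <= 3:
                # Side output: short words.
                yield df.pvalue.SideOutputValue(
                    self.SIDE_OUTPUT_TAG_SHORT_WORDS, word)
            else:
                # Main output: all other words.
                yield word
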
    def test_combine_per_key_with_callable(self):
        """CombinePerKey using a standard callable reducing iterables.

    A common case for Dataflow combiners is to sum (or max or min) over the
    values of each key. Such standard functions can be used directly as combiner
    functions. In fact, any function "reducing" an iterable to a single value
    can be used.
    """
        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(sum))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 30), ('c', 100)]))
        result.pipeline.run()
    def test_combine_per_key_with_custom_callable(self):
        """CombinePerKey using a custom function reducing iterables."""
        def multiply(values):
            result = 1
            for v in values:
                result *= v
            return result

        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(multiply))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 200), ('c', 100)]))
        result.pipeline.run()
Example #22
def run(argv=None):
    # pylint: disable=expression-not-assigned

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file pattern to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file pattern to write results to.')
    parser.add_argument('--checksum_output',
                        required=True,
                        help='Checksum output file pattern.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    output = (lines
              | df.Map('split', lambda x: (x[:10], x[10:99]))
              | df.GroupByKey('group')
              | df.FlatMap(
                  'format',
                  lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

    input_csum = (lines
                  | df.Map('input-csum', crc32line)
                  | df.CombineGlobally('combine-input-csum', sum)
                  | df.Map('hex-format', lambda x: '%x' % x))
    input_csum | df.io.Write(
        'write-input-csum',
        df.io.TextFileSink(known_args.checksum_output + '-input'))

    # Write the output using a "Write" transform that has side effects.
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))
    # Write the output checksum
    output_csum = (output
                   | df.Map('output-csum', crc32line)
                   | df.CombineGlobally('combine-output-csum', sum)
                   | df.Map('hex-format-output', lambda x: '%x' % x))
    output_csum | df.io.Write(
        'write-output-csum',
        df.io.TextFileSink(known_args.checksum_output + '-output'))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
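
crc32line is a small helper defined next to this pipeline. Given the summation and hex formatting above, it most likely returns an unsigned CRC32 per line (sketch):

import zlib


def crc32line(line):
    """Returns the CRC32 of a line as an unsigned 32-bit integer (sketch)."""
    return zlib.crc32(line) & 0xffffffff
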
Example #23
    def test_deferred_side_input_iterable(self):
        @typehints.with_input_types(str, typehints.Iterable[str])
        def concat(glue, items):
            return glue.join(sorted(items))

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', ['x', 'y', 'z'])
        result = main_input | df.Map(concat, pvalue.AsIter(side_input))
        assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
        p.run()

        bad_side_input = p | df.Create('bad_side', [1, 2, 3])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
Example #24
    def test_deferred_side_inputs(self):
        @typehints.with_input_types(str, int)
        def repeat(s, times):
            return s * times

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', [3])
        result = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
        assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))
        p.run()

        bad_side_input = p | df.Create('bad_side', ['z'])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('again', repeat,
                                pvalue.AsSingleton(bad_side_input))
Example #25
    def test_fixed_shard_write(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          num_shards=3,
                          shard_name_template='_NN_SSS_',
                          coder=coders.ToStringCoder())
        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create(['a', 'b']) | df.io.Write(sink)  # pylint: disable=expression-not-assigned

        p.run()

        concat = ''.join(
            open(temp_path + '_03_%03d_.foo' % shard_num).read()
            for shard_num in range(3))
        self.assertTrue('][a][' in concat, concat)
        self.assertTrue('][b][' in concat, concat)
Example #26
def run_count2(known_args, options):
  """Runs the second example pipeline."""

  @df.ptransform_fn
  def Count(label, pcoll):      # pylint: disable=invalid-name,unused-argument
    """Count as a decorated function."""
    return (
        pcoll
        | df.Map('Init', lambda v: (v, 1))
        | df.CombinePerKey(sum))

  logging.info('Running second pipeline')
  p = df.Pipeline(options=options)
  (p | df.io.Read(df.io.TextFileSource(known_args.input))
   | Count()  # pylint: disable=no-value-for-parameter
   | df.io.Write(df.io.TextFileSink(known_args.output)))
  p.run()
Example #27
def run_count1(known_args, options):
  """Runs the first example pipeline."""

  class Count(df.PTransform):
    """Count as a subclass of PTransform, with an apply method."""

    def apply(self, pcoll):
      return (
          pcoll
          | df.Map('Init', lambda v: (v, 1))
          | df.CombinePerKey(sum))

  logging.info('Running first pipeline')
  p = df.Pipeline(options=options)
  (p | df.io.Read(df.io.TextFileSource(known_args.input)) | Count()
   | df.io.Write(df.io.TextFileSink(known_args.output)))
  p.run()
Example #28
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='BigQuery request input table.')
    parser.add_argument('--output',
                        dest='output',
                        help='BigQuery output table.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    output_table = '%s' % known_args.output
    input_query = """
    SELECT
      page, url,
      DOMAIN(page) as domain,
      IF (DOMAIN(page) == DOMAIN(url), false, true) AS third_party,
    FROM [%s]
  """ % known_args.input

    classifiers = {}
    for list_name in ('ad', 'tracker', 'social'):
        # Use a context manager so the rules file is closed promptly, and
        # avoid shadowing the 'file' builtin.
        with open('local/' + list_name + '.txt') as rules_file:
            rules = [line.rstrip('\n') for line in rules_file]
        classifiers[list_name] = AdblockRules(
            rules,
            supported_options=['domain', 'third-party'],
            skip_unsupported_rules=False,
            use_re2=True)

    p = df.Pipeline(argv=pipeline_args)

    (p
     | df.Read('read', df.io.BigQuerySource(query=input_query))
     | df.ParDo('classify', EasylistClassifyDoFn(), classifiers)
     # | df.io.Write('write', df.io.TextFileSink('out')))
     | df.Write(
         'write',
         df.io.BigQuerySink(
             output_table,
             schema='page:STRING, url:STRING, type:STRING',
             create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))

    p.run()
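
EasylistClassifyDoFn receives the classifiers dict as an extra argument to ParDo. A hedged sketch of how it might label each row using adblockparser's should_block; the field names follow the query and output schema above, and the 'unclassified' fallback label is an assumption:

import google.cloud.dataflow as df


class EasylistClassifyDoFn(df.DoFn):
    """Tags each (page, url) row with the first matching Easylist category (sketch)."""

    def process(self, context, classifiers):
        row = context.element
        options = {'domain': row['domain'], 'third-party': row['third_party']}
        for rule_type, classifier in classifiers.items():
            if classifier.should_block(row['url'], options):
                yield {'page': row['page'], 'url': row['url'], 'type': rule_type}
                return
        # Assumed fallback when no rule list matches.
        yield {'page': row['page'], 'url': row['url'], 'type': 'unclassified'}
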
Example #29
def run(argv=None):
    """Main entry point; defines and runs the tfidf pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--uris', required=True, help='URIs to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    # Read documents specified by the uris command line option.
    pcoll = read_documents(p, glob.glob(known_args.uris))
    # Compute TF-IDF information for each word.
    output = pcoll | TfIdf()
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))
    p.run()
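
read_documents turns the expanded uri list into a single PCollection of (uri, line) pairs that TfIdf can consume. A minimal sketch, assuming one TextFileSource read per uri flattened together:

import google.cloud.dataflow as df


def read_documents(pipeline, uris):
    """Reads each uri into (uri, line) pairs and flattens them (sketch)."""
    pcolls = []
    for uri in uris:
        pcolls.append(
            pipeline
            | df.Read('read: %s' % uri, df.io.TextFileSource(uri))
            | df.Map('with key: %s' % uri, lambda line, uri=uri: (uri, line)))
    return pcolls | df.Flatten('flatten read pcolls')
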
Example #30
 def test_top_prefixes(self):
     p = df.Pipeline('DirectPipelineRunner')
     words = p | df.Create('create', self.WORDS)
     result = words | autocomplete.TopPerPrefix('test', 5)
     # values must be hashable for now
     result = result | df.Map(lambda (k, vs): (k, tuple(vs)))
     assert_that(
         result,
         contains_in_any_order([
             ('t', ((3, 'to'), (2, 'this'), (1, 'that'))),
             ('to', ((3, 'to'), )),
             ('th', ((2, 'this'), (1, 'that'))),
             ('thi', ((2, 'this'), )),
             ('this', ((2, 'this'), )),
             ('tha', ((1, 'that'), )),
             ('that', ((1, 'that'), )),
         ]))
     p.run()