Example #1
 def expand(self, pcoll):
     compare = self._compare
     if (not self._args and not self._kwargs
             and pcoll.windowing.is_default()):
         if self._reverse:
             if compare is None or compare is operator.lt:
                 compare = operator.gt
             else:
                 original_compare = compare
                 compare = lambda a, b: original_compare(b, a)
         # This is a more efficient global algorithm.
         top_per_bundle = pcoll | core.ParDo(
             _TopPerBundle(self._n, compare, self._key))
         # If pcoll is empty, we can't guarantee that top_per_bundle
         # won't be empty, so inject at least one empty accumulator
         # so that downstream is guaranteed to produce non-empty output.
         empty_bundle = pcoll.pipeline | core.Create([(None, [])])
         return ((top_per_bundle, empty_bundle) | core.Flatten()
                 | core.GroupByKey()
                 | core.ParDo(
                     _MergeTopPerBundle(self._n, compare, self._key)))
     else:
         if self.has_defaults:
             return pcoll | core.CombineGlobally(
                 TopCombineFn(self._n, compare, self._key, self._reverse),
                 *self._args, **self._kwargs)
         else:
             return pcoll | core.CombineGlobally(
                 TopCombineFn(self._n, compare, self._key, self._reverse),
                 *self._args, **self._kwargs).without_defaults()
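For context, a minimal sketch of how an expand method like this is typically exercised through Beam's public entry point, beam.combiners.Top.Largest (the sample data and expected output are illustrative, not from the original project):

import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create([5, 1, 9, 3, 7])
     | beam.combiners.Top.Largest(3)  # expands via code like the above
     | beam.Map(print))               # prints a single element: [9, 7, 5]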
Example #2
 def expand(self, pcoll):
     do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
     init_result_coll = do_once | 'InitializeWrite' >> core.Map(
         lambda _, sink: sink.initialize_write(), self.sink)
     if getattr(self.sink, 'num_shards', 0):
         min_shards = self.sink.num_shards
         if min_shards == 1:
             keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
         else:
             keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
         write_result_coll = (
             keyed_pcoll
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey()
             | 'WriteBundles' >> core.ParDo(
                 _WriteKeyedBundleDoFn(self.sink),
                 AsSingleton(init_result_coll)))
     else:
         min_shards = 1
         write_result_coll = (
             pcoll
             | 'WriteBundles' >> core.ParDo(_WriteBundleDoFn(self.sink),
                                            AsSingleton(init_result_coll))
             | 'Pair' >> core.Map(lambda x: (None, x))
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey()
             | 'Extract' >> core.FlatMap(lambda x: x[1]))
     return do_once | 'FinalizeWrite' >> core.FlatMap(
         _finalize_write, self.sink, AsSingleton(init_result_coll),
         AsIter(write_result_coll), min_shards)
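The _RoundRobinKeyFn referenced above is not shown in this snippet; a plausible reconstruction (an assumption, not necessarily the verbatim Beam source) is a DoFn that cycles keys through [0, count) so elements spread evenly across shards:

import random

from apache_beam.transforms import core


class _RoundRobinKeyFn(core.DoFn):
    """Keys each element with a counter cycling through [0, count)."""

    def __init__(self, count):
        self.count = count

    def start_bundle(self):
        # Start each bundle at a random offset so shards stay balanced.
        self.counter = random.randint(0, self.count - 1)

    def process(self, element):
        self.counter += 1
        if self.counter >= self.count:
            self.counter -= self.count
        yield self.counter, element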
Example #3
 def expand(self, pcoll):
     if self.has_defaults:
         return (pcoll
                 | core.ParDo(self.add_timestamp).with_output_types(
                     Tuple[T, TimestampType])
                 | core.CombineGlobally(LatestCombineFn()))
     else:
         return (pcoll
                 | core.ParDo(self.add_timestamp).with_output_types(
                     Tuple[T, TimestampType])
                 | core.CombineGlobally(
                     LatestCombineFn()).without_defaults())
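A brief usage sketch for this transform via the public combiners.Latest.Globally wrapper (sample data and timestamps are illustrative):

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

with beam.Pipeline() as p:
    (p
     | beam.Create([('a', 1), ('b', 3), ('c', 2)])
     | beam.Map(lambda kv: TimestampedValue(kv[0], kv[1]))
     | beam.combiners.Latest.Globally()
     | beam.Map(print))  # prints 'b', the element with the largest timestamp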
Example #4
def main(gcs_path, out, start=None, end=None, pipeline_args=None):
    steps = [
        apache_beam.FlatMap('Parse XML and filter', parse_xml),
        apache_beam.Map(
            'Coerce "wikitext" key to string type',
            force_string_function('wikitext')),
        apache_beam.FlatMap('Parse markdown into plaintext', parse_wikitext),
        apache_beam.Map(
            'Coerce "text" key to string type', force_string_function('text')),
        apache_beam.Map(
            'Filter out any vestigial HTML', html_to_text),

        core.ParDo('batch', BatchFn(10)),
        apache_beam.FlatMap(
            'Entities (batch)', analyze_entities_batch),
    ]

    p = apache_beam.Pipeline(argv=pipeline_args)

    if start:
        value = (
            p
            | apache_beam.Read('Pick up at step {}'.format(start),
                               apache_beam.io.TextFileSource(gcs_path))
            | apache_beam.Map('Parse JSON', json.loads))
    else:
        value = p | apache_beam.Read(
            'Read XML', custom_sources.XmlFileSource('page', gcs_path))

    for step in steps[start:end]:
        value = value | step

    if end:
        if not out.startswith('gs://'):
            raise ValueError('Output must be GCS path if an end is specified.')
        value = (
            value
            | apache_beam.Map('to JSON', json.dumps)
            | apache_beam.Write('Dump to GCS',
                                apache_beam.io.TextFileSink(out)))
    else:
        value = value | apache_beam.Write(
            'Dump metadata to BigQuery', apache_beam.io.BigQuerySink(
                out,
                schema=', '.join([
                    'article_id:STRING',
                    'article_title:STRING',
                    'article_sentiment_polarity:FLOAT',
                    'article_sentiment_magnitude:FLOAT',
                    'entity_name:STRING',
                    'entity_type:STRING',
                    'entity_wikipedia_url:STRING',
                    'entity_salience:FLOAT',
                    'entity_num_mentions:INTEGER',
                ]),
                create_disposition=(
                    apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=(
                    apache_beam.io.BigQueryDisposition.WRITE_APPEND)))

    p.run()
Example #5
 def expand(self, pcoll):
   compare = self._compare
   if (not self._args and not self._kwargs and
       not self._key and pcoll.windowing.is_default()):
     if self._reverse:
       if compare is None or compare is operator.lt:
         compare = operator.gt
       else:
         original_compare = compare
         compare = lambda a, b: original_compare(b, a)
     # This is a more efficient global algorithm.
     return (
         pcoll
         | core.ParDo(_TopPerBundle(self._n, compare))
         | core.GroupByKey()
         | core.ParDo(_MergeTopPerBundle(self._n, compare)))
   else:
     return pcoll | core.CombineGlobally(
         TopCombineFn(self._n, compare, self._key, self._reverse),
         *self._args, **self._kwargs)
Example #6
    def Of(pcoll, n, compare=None, *args, **kwargs):
        """Obtain a list of the compare-most N elements in a PCollection.

    This transform will retrieve the n greatest elements in the PCollection
    to which it is applied, where "greatest" is determined by the comparator
    function supplied as the compare argument.

    compare should be an implementation of "a < b" taking at least two arguments
    (a and b). Additional arguments and side inputs specified in the apply call
    become additional arguments to the comparator.  Defaults to the natural
    ordering of the elements.

    The arguments 'key' and 'reverse' may instead be passed as keyword
    arguments, and have the same meaning as for Python's sort functions.

    Args:
      pcoll: PCollection to process.
      n: number of elements to extract from pcoll.
      compare: as described above.
      *args: as described above.
      **kwargs: as described above.
    """
        key = kwargs.pop('key', None)
        reverse = kwargs.pop('reverse', False)
        if (not args and not kwargs and not key
                and pcoll.windowing.is_default()):
            if reverse:
                if compare is None or compare is operator.lt:
                    compare = operator.gt
                else:
                    original_compare = compare
                    compare = lambda a, b: original_compare(b, a)
            # This is a more efficient global algorithm.
            return (pcoll
                    | core.ParDo(_TopPerBundle(n, compare))
                    | core.GroupByKey()
                    | core.ParDo(_MergeTopPerBundle(n, compare)))
        else:
            return pcoll | core.CombineGlobally(
                TopCombineFn(n, compare, key, reverse), *args, **kwargs)
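A short usage sketch for Top.Of as documented above, using the 'key' keyword argument (the data and expected output are illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.Create(['a', 'bbb', 'cc'])
     | beam.combiners.Top.Of(2, key=len)
     | beam.Map(print))  # prints a single element: ['bbb', 'cc']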
Example #7
 def expand(self, pcoll):
     if pcoll.windowing.is_default():
         # This is a more efficient global algorithm.
         top_per_bundle = pcoll | core.ParDo(
             _TopPerBundle(self._n, self._key, self._reverse))
         # If pcoll is empty, we can't guarantee that top_per_bundle
         # won't be empty, so inject at least one empty accumulator
         # so that downstream is guaranteed to produce non-empty output.
         empty_bundle = pcoll.pipeline | core.Create([(None, [])])
         return ((top_per_bundle, empty_bundle) | core.Flatten()
                 | core.GroupByKey()
                 | core.ParDo(
                     _MergeTopPerBundle(self._n, self._key,
                                        self._reverse)))
     else:
         if self.has_defaults:
             return pcoll | core.CombineGlobally(
                 TopCombineFn(self._n, self._key, self._reverse))
         else:
             return pcoll | core.CombineGlobally(
                 TopCombineFn(self._n, self._key,
                              self._reverse)).without_defaults()
Example #8
import unittest

from apache_beam.transforms import core
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import IntervalWindow
from apache_beam.transforms.window import TimestampedValue
from apache_beam.transforms.window import WindowFn
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP


def context(element, timestamp):
  return WindowFn.AssignContext(timestamp, element)


class ReifyWindowsFn(core.DoFn):
  def process(self, element, window=core.DoFn.WindowParam):
    key, values = element
    yield "%s @ %s" % (key, window), values


reify_windows = core.ParDo(ReifyWindowsFn())


class WindowTest(unittest.TestCase):

  def test_timestamped_value_cmp(self):
    self.assertEqual(TimestampedValue('a', 2), TimestampedValue('a', 2))
    self.assertEqual(TimestampedValue('a', 2), TimestampedValue('a', 2.0))
    self.assertNotEqual(TimestampedValue('a', 2), TimestampedValue('a', 2.1))
    self.assertNotEqual(TimestampedValue('a', 2), TimestampedValue('b', 2))

  def test_global_window(self):
    self.assertEqual(GlobalWindow(), GlobalWindow())
    self.assertNotEqual(GlobalWindow(),
                        IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP))
    self.assertNotEqual(IntervalWindow(MIN_TIMESTAMP, MAX_TIMESTAMP),
                        GlobalWindow())
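A minimal sketch of how ReifyWindowsFn above can be used to inspect grouped output per window (the FixedWindows size and the sample data are illustrative):

import apache_beam as beam
from apache_beam.transforms.window import FixedWindows

with beam.Pipeline() as p:
    (p
     | beam.Create([('k', 0), ('k', 5)])
     | beam.Map(lambda kv: TimestampedValue(kv, kv[1]))
     | beam.WindowInto(FixedWindows(4))
     | beam.GroupByKey()
     | beam.ParDo(ReifyWindowsFn())  # the DoFn defined above
     | beam.Map(print))  # each key is printed annotated with its window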
Example #9
 def expand(self, pcoll):
     return (pcoll
             | core.ParDo(self.add_timestamp).with_output_types(
                 Tuple[K, Tuple[T, TimestampType]])
             | core.CombinePerKey(LatestCombineFn()))
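This per-key variant is exposed as combiners.Latest.PerKey; a brief illustrative sketch (sample data and timestamps assumed):

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

with beam.Pipeline() as p:
    (p
     | beam.Create([('k', 'old', 1), ('k', 'new', 2)])
     | beam.Map(lambda e: TimestampedValue((e[0], e[1]), e[2]))
     | beam.combiners.Latest.PerKey()
     | beam.Map(print))  # prints ('k', 'new'): the latest value per key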
Example #10
 def expand(self, pcoll):
     return (pcoll
             | 'DeduplicateFn' >> core.ParDo(self._create_deduplicate_fn()))
Example #11
 def expand(self, pcoll):
     return (pcoll
             | core.ParDo(self.add_timestamp).with_output_types(
                 Tuple[T, TimestampType])  # type: ignore[misc]
             | core.CombineGlobally(LatestCombineFn()))