Example #1
 def testRow(self):
     self.assertReturnType(
         row_type.RowTypeConstraint([('x', int), ('y', str)]),
         lambda x, y: beam.Row(x=x + 1, y=y), [int, str])
     self.assertReturnType(
         row_type.RowTypeConstraint([('x', int), ('y', str)]),
         lambda x: beam.Row(x=x, y=str(x)), [int])
Example #2
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)-8s: %(message)s')
    parser = argparse.ArgumentParser(description=__doc__.strip())
    parser.add_argument('filename', help='Beancount ledger filename')
    args, pipeline_args = parser.parse_known_args()

    # Read the ledger.
    logging.info("Reading ledger.")
    t1 = time.time()
    entries, errors, options_map = loader.load_file(args.filename)
    # beam.Row accepts keyword arguments only; the keywords become field names.
    postings = (beam.Row(account=posting.account,
                         number=posting.units.number,
                         currency=posting.units.currency)
                for entry in data.filter_txns(entries)
                for posting in entry.postings)
    price_map = prices.build_price_map(entries)
    t2 = time.time()
    logging.info("Read ledger in %.1fsecs.", t2 - t1)

    with CreatePipeline(pipeline_args) as pipeline:
        _ = (pipeline
             | beam.Create(postings)
             | SqlTransform("""
                 SELECT account FROM PCOLLECTION
             """,
                            dialect="zetasql")
             | beam.Map(print))
Example #3
 def reformat_record(field):
     post_id = str(field.get("id"))
     return beam.Row(
         post_id=post_id,
         title=field.get("title"),
         tags=field.get("tags"),
     )
Example #4
    def Stats(samples):
        # First, convert the Python list of floats to a numpy array, to save
        # scipy from repeatedly converting it implicitly.
        arr = np.array(samples, dtype=np.float64)

        # Immediately clear the Python list to minimise memory pressure.
        samples.clear()

        # Calculate the various summary statistics.
        stats = scipy.stats.describe(arr)
        quartiles = scipy.stats.mstats.mquantiles(arr)
        return beam.Row(
            num_samples=stats.nobs,
            kurtosis=stats.kurtosis,
            skewness=stats.skewness,
            iqr=scipy.stats.iqr(arr),
            variance=stats.variance,
            min=stats.minmax[0],
            max=stats.minmax[1],
            mean=stats.mean,
            first_quartile=quartiles[0],
            median=quartiles[1],
            third_quartile=quartiles[2],
            cv=scipy.stats.mstats.variation(arr),
            std_dev=math.sqrt(stats.variance),
            std_err=scipy.stats.sem(arr),
        )
Example #5
 def process(self, element, *args, **kwargs):
     res = beam.Row(
         user=element['actor'],
         action=element['action'],
         created_at=datetime.fromisoformat(
             element['created_at']).strftime('%Y-%m-%d %H:%M:%S.%f'))
     yield res
Example #6
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Import this here to avoid pickling the main session.
    import re

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        words = (
            lines
            | 'Split' >> beam.FlatMap(
                lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
            # Map to Row objects to generate a schema suitable for conversion
            # to a dataframe.
            | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

        df = to_dataframe(words)
        df['count'] = 1
        counted = df.groupby('word').sum()
        counted.to_csv(known_args.output)
Example #7
 def test_as_external_transform(self):
   with FullyQualifiedNamedTransform.with_filter('*'):
     with beam.Pipeline() as p:
       assert_that(
           p
           | beam.Create(['a', 'b', 'c'])
           | beam.ExternalTransform(
               PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
               ImplicitSchemaPayloadBuilder({
                   'constructor': 'apache_beam.transforms'
                   '.fully_qualified_named_transform_test._TestTransform',
                   'args': beam.Row(arg0='x'),
                   'kwargs': beam.Row(suffix='y'),
               }),
               expansion_service.ExpansionServiceServicer()),
           equal_to(['xay', 'xby', 'xcy']))
Example #8
 def test_row(self):
     with TestPipeline() as p:
         out = (p
                | beam.Create([1, 2, 10])
                | beam.Map(lambda x: beam.Row(a=x, b=str(x)))
                | SqlTransform(
                    "SELECT a*a as s, LENGTH(b) AS c FROM PCOLLECTION"))
         assert_that(out, equal_to([(1, 1), (4, 1), (100, 2)]))
Example #9
def ToRowAirport(values):

    return beam.Row(
        i94port=str(values["i94port"])
        if "i94port" in values.keys() else str("NaN"),
        municipality=str(values["municipality"])
        if "municipality" in values.keys() else str("NaN"),
    )
Example #10
 def DateTransform(yyyymmdd):
     if yyyymmdd == 'yesterday':
         d = (datetime.datetime.utcnow() -
              datetime.timedelta(days=1)).date()
     else:
         d = datetime.date(int(yyyymmdd[:4]), int(yyyymmdd[4:6]),
                           int(yyyymmdd[6:8]))
     return beam.Row(date=d)
Example #11
def normalize(x):
    if isinstance(x, tuple) and hasattr(x, '_fields'):
        # A named tuple.
        return beam.Row(**dict(zip(x._fields, x)))
    elif isinstance(x, typing.Iterable) and not isinstance(x, (str, beam.Row)):
        return UnorderedList(normalize(e) for e in x)
    else:
        return x
Example #12
 def test_instance_check_windowed_value_holder(self):
     windowed_value = WindowedValue(
         'a', Timestamp(5), [beam.window.IntervalWindow(5, 10)],
         PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
     self.assertTrue(
         isinstance(WindowedValueHolder(windowed_value),
                    WindowedValueHolder))
     self.assertTrue(
         isinstance(
             beam.Row(windowed_value=windowed_value,
                      urn=common_urns.coders.ROW.urn), WindowedValueHolder))
     self.assertFalse(
         isinstance(beam.Row(windowed_value=windowed_value),
                    WindowedValueHolder))
     self.assertFalse(isinstance(windowed_value, WindowedValueHolder))
     self.assertFalse(
         isinstance(beam.Row(x=windowed_value), WindowedValueHolder))
     self.assertFalse(
         isinstance(beam.Row(windowed_value=1), WindowedValueHolder))
Example #13
  def test_unbatching_series(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          | transforms.DataframeTransform(lambda df: df.Animal))

      assert_that(result, equal_to(['Falcon', 'Falcon', 'Parrot', 'Parrot']))
Example #14
 def process_weather_entries(el):
     key = el[0]
     weather = el[1]
     return beam.Row(
         country_code=key.country_code,
         obsdate=key.obsdate,
         temp=mean([x.temp for x in weather if x.temp < 9999.9]),
         windspeed=mean(
             [x.windspeed for x in weather if x.windspeed < 999.9]),
         tornadoes=any([x.tornado_or_funnel for x in weather]),
     )
Example #15
  def test_batching_beam_row_input(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          | transforms.DataframeTransform(
              lambda df: df.groupby('Animal').mean(), include_indexes=True))

      assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
Example #16
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)

    # Deferred DataFrames can also be converted back to schema'd PCollections
    counted_pc = to_pcollection(counted, include_indexes=True)

    # Print out every word that occurred >50 times
    _ = (
        counted_pc
        | beam.Filter(lambda row: row.count > 50)
        | beam.Map(lambda row: f'{row.word}: {row.count}')
        | beam.Map(print))
    """

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
Example #17
 def calc_max_consecutive_tornado_days(el):
     key, entries = el
     entries = sorted(entries, key=lambda x: x.obsdate, reverse=True)
     return (
         key.year,
         beam.Row(
             country_code=key.country_code,
             max_consecutive_tornado_days=max_consec_sequence_len(
                 [x.tornadoes for x in entries], True),
         ),
     )
Example #18
    def test_batching_beam_row_to_dataframe(self):
        with beam.Pipeline() as p:
            df = convert.to_dataframe(
                p
                | beam.Create([(u'Falcon', 380.), (
                    u'Falcon', 370.), (u'Parrot', 24.), (u'Parrot', 26.)])
                | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

            result = convert.to_pcollection(df.groupby('Animal').mean(),
                                            include_indexes=True)

            assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
Example #19
  def test_batching_beam_row_to_dataframe(self):
    with beam.Pipeline() as p:
      df = convert.to_dataframe(
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (
              u'Parrot', 24.), (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

      result = convert.to_pcollection(df.groupby('Animal').mean())

      assert_that(
          result,
          df_equal_to(
              pd.DataFrame({
                  'Animal': ['Falcon', 'Parrot'], 'Speed': [375., 25.]
              }).set_index('Animal')))
Example #20
  def test_batching_beam_row_input(self):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                         (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
          |
          transforms.DataframeTransform(lambda df: df.groupby('Animal').mean()))

      assert_that(
          result,
          df_equal_to(
              pd.DataFrame({
                  'Animal': ['Falcon', 'Parrot'], 'Speed': [375., 25.]
              }).set_index('Animal')))
Example #21
    def process(self, element):

        if (element["dt"] > datetime.datetime(2011, 12, 31, 23, 59, 0, 0)
            ) and (element["dt"] < datetime.datetime(2013, 1, 1, 0, 0, 0, 0)):
            return [
                beam.Row(
                    dt=element["dt"] if "dt" in element.keys() else str("NaN"),
                    avg_temp=float(element["AverageTemperature"])
                    if "AverageTemperature" in element.keys() else str("NaN"),
                    municipality=str(element["City"])
                    if "City" in element.keys() else str("NaN"),
                    country=str(element["Country"])
                    if "Country" in element.keys() else str("NaN"),
                    month=element["dt"].month
                    if "dt" in element.keys() else str("NaN"),
                )
            ]
Example #22
def ToRowImmigration(values):

    return beam.Row(
        arrdate=int(values["arrdate"]) if values["arrdate"] is not None else 0,
        depdate=int(values["depdate"]) if values["depdate"] is not None else 0,
        # int("NaN") raises ValueError; default missing integer fields to 0.
        i94mon=int(values["i94mon"]) if "i94mon" in values.keys() else 0,
        i94visa=int(values["i94visa"]) if "i94visa" in values.keys() else 0,
        i94port=str(values["i94port"])
        if "i94port" in values.keys() else str("NaN"),
        i94addr=str(values["i94addr"])
        if "i94addr" in values.keys() else str("NaN"),
        biryear=int(values["biryear"])
        if values.get("biryear") is not None else 0,
        gender=str(values["gender"])
        if "gender" in values.keys() else str("NaN"),
    )
Example #23
  def test_dataframes(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = p | beam.Create(
        [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
    df = to_dataframe(data)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
    pd.testing.assert_frame_equal(
        df_expected, ib.collect(df, n=10).reset_index(drop=True))
Example #24
def ToRowCity(values):

    return beam.Row(
        avg_median_age=float(values["avg_median_age"])
        if "avg_median_age" in values.keys() else float("NaN"),
        avg_male_population=float(values["avg_male_population"])
        if "avg_male_population" in values.keys() else float("NaN"),
        avg_female_population=float(values["avg_female_population"])
        if "avg_female_population" in values.keys() else float("NaN"),
        avg_total_population=float(values["avg_total_population"])
        if "avg_total_population" in values.keys() else float("NaN"),
        avg_number_veterans=float(values["avg_number_veterans"])
        if "avg_number_veterans" in values.keys() else float("NaN"),
        avg_foreign_born=float(values["avg_foreign_born"])
        if "avg_foreign_born" in values.keys() else float("NaN"),
        avg_average_household_size=float(values["avg_average_household_size"])
        if "avg_average_household_size" in values.keys() else float("NaN"),
        i94addr=str(values["i94addr"]),
    )
Example #25
def run(output_topic, pipeline_args):
    pipeline_options = PipelineOptions(pipeline_args,
                                       save_main_session=True,
                                       streaming=True)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime',
                timestamp_attribute="ts").with_output_types(bytes)
            | "Parse JSON payload" >> beam.Map(json.loads)
            # Use beam.Row to create a schema-aware PCollection
            | "Create beam Row" >> beam.Map(
                lambda x: beam.Row(ride_status=str(x['ride_status']),
                                   passenger_count=int(x['passenger_count'])))
            # SqlTransform computes results within each existing window
            | "15s fixed windows" >> beam.WindowInto(
                beam.window.FixedWindows(15))
            # Aggregate drop-offs and pick-ups that occur within each 15s window
            | SqlTransform("""
             SELECT
               ride_status,
               COUNT(*) AS num_rides,
               SUM(passenger_count) AS total_passengers
             FROM PCOLLECTION
             WHERE NOT ride_status = 'enroute'
             GROUP BY ride_status""")
            # SqlTransform yields python objects with attributes corresponding to
            # the outputs of the query.
            # Collect those attributes, as well as window information, into a dict
            | "Assemble Dictionary" >> beam.Map(
                lambda row, window=beam.DoFn.WindowParam: {
                    "ride_status": row.ride_status,
                    "num_rides": row.num_rides,
                    "total_passengers": row.total_passengers,
                    "window_start": window.start.to_rfc3339(),
                    "window_end": window.end.to_rfc3339()
                })
            | "Convert to JSON" >> beam.Map(json.dumps)
            | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
            | beam.io.WriteToPubSub(topic=output_topic))
Example #26
    def testInstanceToType(self):
        class MyClass(object):
            def method(self):
                pass

        test_cases = [
            (typehints.Dict[str, int], {
                'a': 1
            }),
            (typehints.Dict[str, typehints.Union[str, int]], {
                'a': 1,
                'b': 'c'
            }),
            (typehints.Dict[typehints.Any, typehints.Any], {}),
            (typehints.Set[str], {'a'}),
            (typehints.Set[typehints.Union[str, float]], {'a', 0.4}),
            (typehints.Set[typehints.Any], set()),
            (typehints.FrozenSet[str], frozenset(['a'])),
            (typehints.FrozenSet[typehints.Union[str,
                                                 float]], frozenset(['a',
                                                                     0.4])),
            (typehints.FrozenSet[typehints.Any], frozenset()),
            (typehints.Tuple[int], (1, )),
            (typehints.Tuple[int, int, str], (1, 2, '3')),
            (typehints.Tuple[()], ()),
            (typehints.List[int], [1]),
            (typehints.List[typehints.Union[int, str]], [1, 'a']),
            (typehints.List[typehints.Any], []),
            (type(None), None),
            (type(MyClass), MyClass),
            (MyClass, MyClass()),
            (type(MyClass.method), MyClass.method),
            (types.MethodType, MyClass().method),
            (row_type.RowTypeConstraint([('x', int)]), beam.Row(x=37)),
        ]
        for expected_type, instance in test_cases:
            self.assertEqual(expected_type,
                             trivial_inference.instance_to_type(instance),
                             msg=instance)
Example #27
    def test_group_by_attr_expr(self):
        # [START groupby_attr_expr]
        with beam.Pipeline() as p:
            grouped = (p | beam.Create(GROCERY_LIST)
                       | beam.GroupBy('recipe',
                                      is_berry=lambda x: 'berry' in x.fruit))
            # [END groupby_attr_expr]

            expected = [
                #[START groupby_attr_expr_result]
                (NamedTuple(recipe='pie', is_berry=True), [
                    beam.Row(recipe='pie',
                             fruit='strawberry',
                             quantity=3,
                             unit_price=1.50),
                    beam.Row(recipe='pie',
                             fruit='raspberry',
                             quantity=1,
                             unit_price=3.50),
                    beam.Row(recipe='pie',
                             fruit='blackberry',
                             quantity=1,
                             unit_price=4.00),
                    beam.Row(recipe='pie',
                             fruit='blueberry',
                             quantity=1,
                             unit_price=2.00),
                ]),
                (NamedTuple(recipe='muffin', is_berry=True), [
                    beam.Row(recipe='muffin',
                             fruit='blueberry',
                             quantity=2,
                             unit_price=2.00),
                ]),
                (NamedTuple(recipe='muffin', is_berry=False), [
                    beam.Row(recipe='muffin',
                             fruit='banana',
                             quantity=3,
                             unit_price=1.00),
                ]),
                #[END groupby_attr_expr_result]
            ]
            assert_that(grouped | beam.MapTuple(normalize_kv),
                        equal_to(expected))
Example #28
    def test_dataframe_caching(self, cell):

        # Create a pipeline that exercises the DataFrame API. This will also use
        # caching in the background.
        with cell:  # Cell 1
            p = beam.Pipeline(interactive_runner.InteractiveRunner())
            ib.watch({'p': p})

        with cell:  # Cell 2
            data = p | beam.Create([
                1, 2, 3
            ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

            with beam.dataframe.allow_non_parallel_operations():
                df = to_dataframe(data).reset_index(drop=True)

            ib.collect(df)

        with cell:  # Cell 3
            df['output'] = df['square'] * df['cube']
            ib.collect(df)

        with cell:  # Cell 4
            df['output'] = 0
            ib.collect(df)

        # We use a trace through the graph to perform an isomorphism test. The end
        # output should look like a linear graph. This indicates that the dataframe
        # transform was correctly broken into separate pieces to cache. If caching
        # isn't enabled, all the dataframe computation nodes are connected to a
        # single shared node.
        trace = []

        # Only look at the top-level transforms for the isomorphism. The test
        # doesn't care about the transform implementations, just the overall shape.
        class TopLevelTracer(beam.pipeline.PipelineVisitor):
            def _find_root_producer(self,
                                    node: beam.pipeline.AppliedPTransform):
                if node is None or not node.full_label:
                    return None

                parent = self._find_root_producer(node.parent)
                if parent is None:
                    return node

                return parent

            def _add_to_trace(self, node, trace):
                if '/' not in str(node):
                    if node.inputs:
                        producer = self._find_root_producer(
                            node.inputs[0].producer)
                        producer_name = producer.full_label if producer else ''
                        trace.append((producer_name, node.full_label))

            def visit_transform(self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

            def enter_composite_transform(
                    self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

        p.visit(TopLevelTracer())

        # Do the isomorphism test which states that the topological sort of the
        # graph yields a linear graph.
        trace_string = '\n'.join(str(t) for t in trace)
        prev_producer = ''
        for producer, consumer in trace:
            self.assertEqual(producer, prev_producer, trace_string)
            prev_producer = consumer
Example #29
def normalize(x):
    if isinstance(x, tuple) and hasattr(x, '_fields'):
        # A named tuple.
        return beam.Row(**dict(zip(x._fields, x)))
    elif isinstance(x, typing.Iterable) and not isinstance(x, (str, beam.Row)):
        return UnorderedList(normalize(e) for e in x)
    else:
        return x


def normalize_kv(k, v):
    return normalize(k), normalize(v)


# For documentation.
NamedTuple = beam.Row

# [START groupby_table]
GROCERY_LIST = [
    beam.Row(recipe='pie', fruit='strawberry', quantity=3, unit_price=1.50),
    beam.Row(recipe='pie', fruit='raspberry', quantity=1, unit_price=3.50),
    beam.Row(recipe='pie', fruit='blackberry', quantity=1, unit_price=4.00),
    beam.Row(recipe='pie', fruit='blueberry', quantity=1, unit_price=2.00),
    beam.Row(recipe='muffin', fruit='blueberry', quantity=2, unit_price=2.00),
    beam.Row(recipe='muffin', fruit='banana', quantity=3, unit_price=1.00),
]
# [END groupby_table]


class GroupByTest(unittest.TestCase):
    def test_groupby_expr(self):
        # [START groupby_expr]
        with beam.Pipeline() as p:
            grouped = (p
                       | beam.Create([
Example #30
def SQLDictToRow(pcoll):
    """Convert a dict (from ReadFromBigQuery) to a beam.Row."""
    return pcoll | beam.Map(lambda d: beam.Row(**d))
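
A minimal usage sketch for the helper above (hypothetical wiring: the sample dicts and pipeline are assumptions, not from the source, and apache_beam is assumed imported as beam as in the other examples):

# Hypothetical usage: convert plain dicts into schema'd beam.Row elements.
with beam.Pipeline() as p:
    dicts = p | beam.Create([{'word': 'cat', 'count': 3},
                             {'word': 'dog', 'count': 1}])
    _ = SQLDictToRow(dicts) | beam.Map(print)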