Example #1
    # [START combine_custom_average]
    class AverageFn(beam.CombineFn):
      def create_accumulator(self):
        return (0.0, 0)

      def add_input(self, sum_count, input):
        (sum, count) = sum_count
        return sum + input, count + 1

      def merge_accumulators(self, accumulators):
        sums, counts = zip(*accumulators)
        return sum(sums), sum(counts)

      def extract_output(self, sum_count):
        (sum, count) = sum_count
        return sum / count if count else float('NaN')
    average = pc | beam.CombineGlobally(AverageFn())
    # [END combine_custom_average]
    self.assertEqual([4.25], average)

  def test_keys(self):
    occurrences = [('cat', 1), ('cat', 5), ('dog', 5), ('cat', 9), ('dog', 2)]
    unique_keys = occurrences | snippets.Keys()
    self.assertEqual({'cat', 'dog'}, set(unique_keys))

  def test_count(self):
    occurrences = ['cat', 'dog', 'cat', 'cat', 'dog']
    perkey_counts = occurrences | snippets.Count()
    self.assertEqual({('cat', 3), ('dog', 2)}, set(perkey_counts))

  def test_setting_fixed_windows(self):
    p = TestPipeline()
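For comparison with the custom AverageFn above, here is a minimal standalone sketch using Beam's built-in mean combiner; the input values are illustrative and chosen so the global average is 4.25, matching the test's assertion.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([1, 2, 3, 11])   # (1 + 2 + 3 + 11) / 4 == 4.25
         | beam.CombineGlobally(beam.combiners.MeanCombineFn())
         | beam.Map(print))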
Example #2
    """平均以上の文字数を持つ文字列をフィルタリングする."""
    def __init__(self):
        super(FilterAboveMeanLengthFn, self).__init__()

    def process(self, element, mean_word_length):
        if element >= mean_word_length:
            yield element


if __name__ == '__main__':
    p = beam.Pipeline(options=PipelineOptions())

    inputs = ["good morning.", "good afternoon.", "good evening."]

    # Main input
    word_lengths = (
        p
        | 'create inputs' >> beam.Create(inputs)
        | 'compute word length' >> beam.Map(lambda element: len(element)))

    # Side input
    mean_word_length = word_lengths | 'compute mean word length' >> beam.CombineGlobally(
        beam.combiners.MeanCombineFn())

    (word_lengths
     | 'filter above mean length' >> beam.ParDo(
         FilterAboveMeanLengthFn(), pvalue.AsSingleton(mean_word_length))
     | 'write to text' >> beam.io.WriteToText("./output.txt"))

    p.run().wait_until_finish()
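The same side-input pattern can also be written with beam.Filter, which accepts extra arguments (including side inputs) after the predicate. A minimal sketch, equivalent in behavior to the DoFn above; the print step is only for illustration.

import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
    word_lengths = (p
                    | 'create inputs' >> beam.Create(
                        ["good morning.", "good afternoon.", "good evening."])
                    | 'compute word length' >> beam.Map(len))

    mean_word_length = (word_lengths
                        | 'compute mean word length' >> beam.CombineGlobally(
                            beam.combiners.MeanCombineFn()))

    _ = (word_lengths
         | 'filter above mean length' >> beam.Filter(
             lambda length, mean: length >= mean,
             pvalue.AsSingleton(mean_word_length))
         | 'print' >> beam.Map(print))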
Example #3
    def process(self, lines):
        with self.gcsio().open(f'{self.gcs_path}/index.csv',
                               'w',
                               mime_type='text/csv') as fp:
            fp.write(lines.encode())


job_name = f"reviewr-automl--{datetime.utcnow().strftime('%Y%m%d-%H%I%S')}"
gcs_path = f'{GCS_DESTINATION}/{job_name}'
pipeline_options = PipelineOptions(project=PROJECT_ID,
                                   region=DATAFLOW_REGION,
                                   job_name=job_name,
                                   temp_location=f'{gcs_path}/temp')

p = beam.Pipeline(runner=RUNNER, options=pipeline_options)
bq_row = p | 'ReadFromBigQuery' >> ReadFromBigQuery(
    query=
    f"SELECT * FROM `{BQ_SOURCE}`{' LIMIT 10' if RUNNER == 'DirectRunner' else ''}",
    project=PROJECT_ID,
    use_standard_sql=True,
    gcs_location=f'{gcs_path}/temp')

bq_row | 'WriteExampleFile' >> beam.ParDo(WriteExampleFile(gcs_path))

bq_row | 'CreateLine' >> beam.ParDo(CreateLine(gcs_path))\
    | 'CombineLines' >> beam.CombineGlobally(lambda lines: '\n'.join(lines))\
    | 'WriteIndexFile' >> beam.ParDo(WriteIndexFile(gcs_path))

p.run()
Example #4
def _WriteMetricsPlotsAndValidations(  # pylint: disable=invalid-name
    evaluation: evaluator.Evaluation,
    output_paths: Dict[str, str],
    eval_config: config_pb2.EvalConfig,
    add_metrics_callbacks: List[types.AddMetricsCallbackType],
    metrics_key: str,
    plots_key: str,
    attributions_key: str,
    validations_key: str,
    output_file_format: str,
    rubber_stamp: bool = False) -> beam.pvalue.PDone:
  """PTransform to write metrics and plots."""

  if output_file_format not in _SUPPORTED_FORMATS:
    raise ValueError('only "{}" formats are currently supported but got '
                     'output_file_format={}'.format(_SUPPORTED_FORMATS,
                                                    output_file_format))

  def convert_slice_key_to_parquet_dict(
      slice_key: metrics_for_slice_pb2.SliceKey) -> _SliceKeyDictPythonType:
    single_slice_key_dicts = []
    for single_slice_key in slice_key.single_slice_keys:
      kind = single_slice_key.WhichOneof('kind')
      if not kind:
        continue
      single_slice_key_dicts.append({kind: getattr(single_slice_key, kind)})
    return {_SINGLE_SLICE_KEYS_PARQUET_FIELD_NAME: single_slice_key_dicts}

  def convert_to_parquet_columns(
      value: Union[metrics_for_slice_pb2.MetricsForSlice,
                   metrics_for_slice_pb2.PlotsForSlice,
                   metrics_for_slice_pb2.AttributionsForSlice]
  ) -> Dict[str, Union[_SliceKeyDictPythonType, bytes]]:
    return {
        _SLICE_KEY_PARQUET_COLUMN_NAME:
            convert_slice_key_to_parquet_dict(value.slice_key),
        _SERIALIZED_VALUE_PARQUET_COLUMN_NAME:
            value.SerializeToString()
    }

  if metrics_key in evaluation and constants.METRICS_KEY in output_paths:
    metrics = (
        evaluation[metrics_key] | 'ConvertSliceMetricsToProto' >> beam.Map(
            convert_slice_metrics_to_proto,
            add_metrics_callbacks=add_metrics_callbacks))

    file_path_prefix = output_paths[constants.METRICS_KEY]
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          metrics
          | 'ConvertToParquetColumns' >> beam.Map(convert_to_parquet_columns)
          | 'WriteMetricsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              schema=_SLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = metrics | 'WriteMetrics' >> beam.io.WriteToTFRecord(
          file_path_prefix=file_path_prefix,
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' +
                            output_file_format if output_file_format else ''),
          coder=beam.coders.ProtoCoder(metrics_for_slice_pb2.MetricsForSlice))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  if plots_key in evaluation and constants.PLOTS_KEY in output_paths:
    plots = (
        evaluation[plots_key] | 'ConvertSlicePlotsToProto' >> beam.Map(
            convert_slice_plots_to_proto,
            add_metrics_callbacks=add_metrics_callbacks))

    file_path_prefix = output_paths[constants.PLOTS_KEY]
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          plots
          |
          'ConvertPlotsToParquetColumns' >> beam.Map(convert_to_parquet_columns)
          | 'WritePlotsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              schema=_SLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = plots | 'WritePlotsToTFRecord' >> beam.io.WriteToTFRecord(
          file_path_prefix=file_path_prefix,
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' +
                            output_file_format if output_file_format else ''),
          coder=beam.coders.ProtoCoder(metrics_for_slice_pb2.PlotsForSlice))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  if (attributions_key in evaluation and
      constants.ATTRIBUTIONS_KEY in output_paths):
    attributions = (
        evaluation[attributions_key] | 'ConvertSliceAttributionsToProto' >>
        beam.Map(convert_slice_attributions_to_proto))

    file_path_prefix = output_paths[constants.ATTRIBUTIONS_KEY]
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          attributions
          | 'ConvertAttributionsToParquetColumns' >>
          beam.Map(convert_to_parquet_columns)
          | 'WriteAttributionsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              schema=_SLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = attributions | 'WriteAttributionsToTFRecord' >> beam.io.WriteToTFRecord(
          file_path_prefix=file_path_prefix,
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' +
                            output_file_format if output_file_format else ''),
          coder=beam.coders.ProtoCoder(
              metrics_for_slice_pb2.AttributionsForSlice))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  if (validations_key in evaluation and
      constants.VALIDATIONS_KEY in output_paths):
    validations = (
        evaluation[validations_key]
        | 'MergeValidationResults' >> beam.CombineGlobally(
            CombineValidations(eval_config, rubber_stamp=rubber_stamp)))

    file_path_prefix = output_paths[constants.VALIDATIONS_KEY]
    # We only use a single shard here because validations are usually single
    # values. Setting the shard_name_template to the empty string forces this.
    shard_name_template = ''
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          validations
          | 'ConvertValidationsToParquetColumns' >> beam.Map(
              lambda v:  # pylint: disable=g-long-lambda
              {_SERIALIZED_VALUE_PARQUET_COLUMN_NAME: v.SerializeToString()})
          | 'WriteValidationsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              shard_name_template=shard_name_template,
              schema=_UNSLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = (
          validations
          | 'WriteValidationsToTFRecord' >> beam.io.WriteToTFRecord(
              file_path_prefix=file_path_prefix,
              shard_name_template=shard_name_template,
              file_name_suffix=('.' + output_file_format
                                if output_file_format else ''),
              coder=beam.coders.ProtoCoder(
                  validation_result_pb2.ValidationResult)))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  return beam.pvalue.PDone(list(evaluation.values())[0].pipeline)
Example #5
# 	# This can be used directly if print to screen not needed as
# 	# | >> beam.CombinePerKey(sum)
# 	| 'Get the total for each item' >> beam.CombinePerKey(sum)
# 	# To print to screen
# 	| 'Prints data to screen' >> beam.ParDo(Printer())
# 	)

# Option 3
number_of_transactions_per_item = (
    data_from_source
    | 'Clean the item for items count' >> beam.ParDo(Transaction())
    | 'Map record item to 1 for items count' >> beam.Map(lambda record:
                                                         (record['item'], 1))
    | 'Get the total for each item' >> beam.CombinePerKey(sum)
    | 'Convert data into List' >> (
        beam.CombineGlobally(
            beam.combiners.ToListCombineFn())  # ToDictCombineFn())
    ))

# Uses the Dictionary
# # Maximum
# most_popular_item = (
# 	number_of_transactions_per_item
# 	| 'Get the item with maximum count' >> beam.Map(lambda item: max(item, key=item.get))
# 	# To print to screen
# 	| 'Prints max. data to screen' >> beam.ParDo(Printer())
# 	)

# # Minimum
# less_popular_item = (
# 	number_of_transactions_per_item
# 	| 'Get the item with minimum count' >> beam.Map(lambda item: min(item, key=item.get))
Example #6
#   Licensed to the Apache Software Foundation (ASF) under one
#   or more contributor license agreements.  See the NOTICE file
#   distributed with this work for additional information
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import apache_beam as beam

from log_elements import LogElements

p = beam.Pipeline()

(p | beam.Create(range(1, 11)) | beam.CombineGlobally(sum) | LogElements())

p.run()
Example #7
    def expand(self, inputs):
        pcoll, = inputs
        if self._top_k is not None and self._top_k < 0:
            raise ValueError(
                'top_k for VocabularyImpl should be >= 0 or None, got '
                '{}.'.format(self._top_k))
        if self._frequency_threshold is not None and self._frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold for VocabularyImpl should be >= 0 or None, '
                'got {}.'.format(self._frequency_threshold))

        # Create a PCollection of (count, element) pairs, then iterate over
        # this to create a single-element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).

        def is_problematic_string(kv):
            string, _ = kv  # Ignore counts.
            return string and b'\n' not in string and b'\r' not in string

        if (self._vocab_ordering_type ==
                tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
            flatten_map_fn = (
                _flatten_positive_label_weights_total_weights_and_counts)

            # count_and_means is a pcollection that contains a
            # _CountAndWeightsMeansAccumulator where:
            #   `weighted_mean` is the weighted mean of positive labels
            #       for all features.
            #   `count` is the count for all features.
            #   `weights_mean` is the mean of the weights for all features.
            count_and_means = (
                pcoll
                | 'SumBatchCountAndWeightsMeans' >> beam.Map(_count_and_means)
                | 'ComputeCountAndWeightsMeansGlobally' >>
                beam.CombineGlobally(CountAndWeightsMeansCombineFn()))

            # CountAndWeightsMeansCombineFn returns a tuple of the form:
            # (feature,_CountAndWeightsMeansAccumulator) where:
            #   `feature` is a single string, which is the word in the vocabulary
            #       whose mutual information with the label is being computed.
            #   `weighted_mean` is the weighted mean of y positive given x.
            #   `count` is the count of weights for a feature.
            #   `weights_mean` is the mean of the weights for a feature.
            combine_transform = (
                'ComputeCountAndWeightsMeansPerUniqueWord' >>
                beam.CombinePerKey(CountAndWeightsMeansCombineFn())
                | 'CalculateMutualInformationPerUniqueWord' >> beam.Map(
                    _calculate_mutual_information,
                    global_accumulator=beam.pvalue.AsSingleton(
                        count_and_means),
                    use_adjusted_mutual_info=self._use_adjusted_mutual_info))
        elif (self._vocab_ordering_type ==
              tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
            flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
            combine_transform = beam.CombinePerKey(sum)
        else:
            flatten_map_fn = _flatten_value_to_list
            combine_transform = beam.combiners.Count.PerElement()

        raw_counts = (
            pcoll
            | 'FlattenStringsAndMaybeWeightsLabels' >>
            beam.FlatMap(flatten_map_fn)
            | 'CountPerString' >> combine_transform
            | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
            | 'SwapStringsAndCounts' >> beam.KvSwap())

        counts = (
            raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
                _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                    self._frequency_threshold, self._top_k)))

        return counts | 'WriteVocabFile' >> (
            _WriteVocabFile(  # pylint: disable=no-value-for-parameter
                self._base_temp_dir, self._vocab_filename,
                self._store_frequency))
Example #8
 def expand(self, stats: beam.PCollection):
     return stats | 'MergeDatasetFeatureStatisticsProtos' >> beam.CombineGlobally(
         merge_util.merge_dataset_feature_statistics_list)
Example #9
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic',
                      type=str,
                      help='Pub/Sub topic to read from')
  parser.add_argument('--subscription',
                      type=str,
                      help='Pub/Sub subscription to read from')
  parser.add_argument('--dataset',
                      type=str,
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                      'Must already exist.')
  parser.add_argument('--table_name',
                      type=str,
                      default='game_stats',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--fixed_window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration for user '
                           'analysis, in minutes')
  parser.add_argument('--session_gap',
                      type=int,
                      default=5,
                      help='Numeric value of gap between user sessions, '
                           'in minutes')
  parser.add_argument('--user_activity_window_duration',
                      type=int,
                      default=30,
                      help='Numeric value of fixed window for finding mean of '
                           'user session duration, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  fixed_window_duration = args.fixed_window_duration * 60
  session_gap = args.session_gap * 60
  user_activity_window_duration = args.user_activity_window_duration * 60

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which
    # are extracted from the data elements, and parse the data.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          topic=args.topic)
    raw_events = (
        scores
        | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Extract username/score pairs from the event stream
    user_events = (
        raw_events
        | 'ExtractUserScores' >> beam.Map(
            lambda elem: (elem['user'], elem['score'])))

    # Calculate the total score per user over fixed windows, and cumulative
    # updates for late data
    spammers_view = (
        user_events
        | 'UserFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
        # These might be robots/spammers.
        | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

        # Derive a view from the collection of spammer users. It will be used as
        # a side input in calculating the team score sums, below
        | 'CreateSpammersView' >> beam.CombineGlobally(
            beam.combiners.ToDictCombineFn()).as_singleton_view())

    # [START filter_and_calc]
    # Calculate the total score per team over fixed windows, and emit cumulative
    # updates for late data. Uses the side input derived above --the set of
    # suspected robots-- to filter out scores from those users from the sum.
    # Write the results to BigQuery.
    (raw_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoFixedWindows' >> beam.WindowInto(
         beam.window.FixedWindows(fixed_window_duration))

     # Filter out the detected spammer users, using the side input derived above
     | 'FilterOutSpammers' >> beam.Filter(
         lambda elem, spammers: elem['user'] not in spammers,
         spammers_view)
     # Extract and sum teamname/score pairs from the event data.
     | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
     # [END filter_and_calc]
     | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
     | 'WriteTeamScoreSums' >> WriteToBigQuery(
         args.table_name + '_teams', args.dataset, {
             'team': 'STRING',
             'total_score': 'INTEGER',
             'window_start': 'STRING',
             'processing_time': 'STRING',
         }, options.view_as(GoogleCloudOptions).project))

    # [START session_calc]
    # Detect user sessions-- that is, a burst of activity separated by a gap
    # from further activity. Find and record the mean session lengths.
    # This information could help the game designers track the changing user
    # engagement as their set of game changes.
    (user_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoSessions' >> beam.WindowInto(
         beam.window.Sessions(session_gap),
         timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

     # For this use, we care only about the existence of the session, not any
     # particular information aggregated over it, so we can just group by key
     # and assign a "dummy value" of None.
     | beam.CombinePerKey(lambda _: None)

     # Get the duration of the session
     | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
     # [END session_calc]

     # [START rewindow]
     # Re-window to process groups of session sums according to when the
     # sessions complete
     | 'WindowToExtractSessionMean' >> beam.WindowInto(
         beam.window.FixedWindows(user_activity_window_duration))

     # Find the mean session duration in each window
     | beam.CombineGlobally(beam.combiners.MeanCombineFn()).without_defaults()
     | 'FormatAvgSessionLength' >> beam.Map(
         lambda elem: {'mean_duration': float(elem)})
     | 'WriteAvgSessionLength' >> WriteToBigQuery(
         args.table_name + '_sessions', args.dataset, {
             'mean_duration': 'FLOAT',
         }, options.view_as(GoogleCloudOptions).project))
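As a smaller illustration of the session-windowing step above, here is a minimal batch sketch; the users and timestamps (in seconds) are made up for illustration.

import apache_beam as beam
from apache_beam.transforms.window import Sessions, TimestampedValue

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('alice', 10), ('alice', 40), ('alice', 500), ('bob', 20)])
         | beam.Map(lambda kv: TimestampedValue(kv, kv[1]))   # event time from the data
         | beam.WindowInto(Sessions(300))        # sessions separated by a 5-minute gap
         | beam.CombinePerKey(lambda _: None)    # keep only (user, session) existence
         | beam.Map(print))                      # alice appears twice: two sessions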
Example #10
def _compute_sum(examples):
    return (examples
            | beam.Map(lambda x: x[1]['label'])
            | beam.CombineGlobally(sum))
Example #11
def _compute_mean(examples):
    return (examples
            | beam.Map(lambda x: x[1]['id'])
            | beam.CombineGlobally(beam.combiners.MeanCombineFn()))
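Both fragments above imply elements shaped like (key, {'id': ..., 'label': ...}); that shape is inferred from the lambdas, not stated in the source. A minimal sketch wiring the mean computation to a toy input:

import apache_beam as beam

def _compute_mean(examples):
    return (examples
            | beam.Map(lambda x: x[1]['id'])
            | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

with beam.Pipeline() as p:
    examples = p | beam.Create([
        ('a', {'id': 1, 'label': 0}),
        ('b', {'id': 3, 'label': 1}),
    ])
    _ = _compute_mean(examples) | beam.Map(print)  # mean of the ids -> 2.0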
Example #12
 def expand(self, pcoll):
     return pcoll | beam.CombineGlobally(
         _MergeDefinitionsFn(self._definitions_merger)).without_defaults()
Example #13
 def expand(self, pcoll):
     return pcoll \
            | beam.CombineGlobally(sum).with_output_types(int)
Example #14
 def expand(self, pcoll):
     return pcoll | 'MergeHeaders' >> beam.CombineGlobally(
         _MergeHeadersFn(self._header_merger)).without_defaults()
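The transforms above call .without_defaults(). A minimal sketch of what that changes: by default CombineGlobally still emits one element (the combine of an empty input) when the input PCollection is empty, whereas .without_defaults() emits nothing.

import apache_beam as beam

with beam.Pipeline() as p:
    empty = (p
             | beam.Create([1, 2, 3])
             | beam.Filter(lambda x: x > 100))   # nothing passes, so `empty` has no elements

    # Default behaviour: one output element anyway; sum() of no inputs is 0.
    _ = (empty
         | 'WithDefault' >> beam.CombineGlobally(sum)
         | 'PrintDefault' >> beam.Map(print))    # prints 0

    # without_defaults(): an empty input produces no output at all.
    _ = (empty
         | 'NoDefault' >> beam.CombineGlobally(sum).without_defaults()
         | 'PrintNoDefault' >> beam.Map(print))  # prints nothing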
Example #15
 def test_global_sum(self):
   pc = [1, 2, 3]
   # [START global_sum]
   result = pc | beam.CombineGlobally(sum)
   # [END global_sum]
   self.assertEqual([6], result)
Example #16
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# beam-playground:
#   name: AggregationSum
#   description: Task from katas to compute the sum of all elements.
#   multifile: false
#   pipeline_options:
#   categories:
#     - Combiners

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.CombineGlobally(sum)
     | LogElements())

Example #17
from abc import ABCMeta

import apache_beam as beam


# Advanced Combiner - CombineFn
class Combiner(beam.CombineFn):
    __metaclass__ = ABCMeta

    def create_accumulator(self, *args, **kwargs):
        return 0.0, 0

    def add_input(self, mutable_accumulator, element, *args, **kwargs):
        num, count = mutable_accumulator
        return num + element, count + 1

    def merge_accumulators(self, accumulators, *args, **kwargs):
        num_values, count_values = zip(*accumulators)
        return sum(num_values), sum(count_values)

    def extract_output(self, accumulator, *args, **kwargs):
        num_values, count_values = accumulator
        return num_values / count_values if count_values else float('NaN')


with beam.Pipeline() as combiner_pipeline:
    (combiner_pipeline
     | "Create list" >> beam.Create([1, 2, 3, 4, 5, 6])
     | "Using combiner" >> beam.CombineGlobally(Combiner())
     | "Writing output" >> beam.io.WriteToText("combiner_output/output"))
Example #18
 def expand(self, estimates):
     return (estimates
             | 'ExtractFileSize' >>
             beam.Map(lambda estimate: estimate.size_in_bytes)
             | 'SumFileSizes' >> beam.CombineGlobally(sum))
Example #19
File: task.py  Project: xsm110/Beam15.0
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import apache_beam as beam

from log_elements import LogElements


def sum(numbers):
    total = 0

    for num in numbers:
        total += num

    return total


p = beam.Pipeline()

(p | beam.Create([1, 2, 3, 4, 5]) | beam.CombineGlobally(sum) | LogElements())

p.run()
Example #20
 def expand(self, estimates):
     return (estimates
             | 'ExtractVariantCount' >>
             beam.Map(lambda estimate: estimate.estimated_variant_count)
             | 'SumVariantCounts' >> beam.CombineGlobally(sum))
Example #21
def CSVToRecordBatch(lines: beam.pvalue.PCollection,
                     column_names: List[Text],
                     desired_batch_size: Optional[int],
                     delimiter: Text = ",",
                     skip_blank_lines: bool = True,
                     schema: Optional[schema_pb2.Schema] = None,
                     multivalent_columns: Optional[List[Text]] = None,
                     secondary_delimiter: Optional[Text] = None,
                     raw_record_column_name: Optional[Text] = None):
    """Decodes CSV records into Arrow RecordBatches.

  Args:
    lines: The pcollection of raw records (csv lines).
    column_names: List of feature names. Order must match the order in the CSV
      file.
    desired_batch_size: Batch size. The output Arrow RecordBatches will have as
      many rows as the `desired_batch_size`. If None, the batch size is auto
      tuned by beam.
    delimiter: A one-character string used to separate fields.
    skip_blank_lines: A boolean to indicate whether to skip over blank lines
      rather than interpreting them as missing values.
    schema: An optional schema of the input data. If this is provided, it must
      contain a subset of columns in `column_names`. If a feature is in
      `column_names` but not in the schema, it won't be in the result
      RecordBatch.
    multivalent_columns: Columns that can contain multiple values. If
      secondary_delimiter is provided, this must also be provided.
    secondary_delimiter: Delimiter used for parsing multivalent columns. If
      multivalent_columns is provided, this must also be provided.
    raw_record_column_name: Optional name for a column containing the raw csv
      lines. If this is None, then this column will not be produced. This will
      always be the last column in the record batch.

  Returns:
    RecordBatches of the CSV lines.

  Raises:
    ValueError:
      * If the columns do not match the specified csv headers.
      * If the schema has invalid feature types.
      * If the schema does not contain all columns.
      * If raw_record_column_name exists in column_names
  """
    if (raw_record_column_name is not None
            and raw_record_column_name in column_names):
        raise ValueError(
            "raw_record_column_name: {} is already an existing column name. "
            "Please choose a different name.".format(raw_record_column_name))

    csv_lines_and_raw_records = (
        lines | "ParseCSVLines" >> beam.ParDo(ParseCSVLine(delimiter)))

    if schema is not None:
        column_infos = _GetColumnInfosFromSchema(schema, column_names)
    else:
        # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
        # Do first pass to infer the feature types.
        column_infos = beam.pvalue.AsSingleton(
            csv_lines_and_raw_records
            | "ExtractParsedCSVLines" >> beam.Keys()
            | "InferColumnTypes" >> beam.CombineGlobally(
                ColumnTypeInferrer(column_names=column_names,
                                   skip_blank_lines=skip_blank_lines,
                                   multivalent_columns=multivalent_columns,
                                   secondary_delimiter=secondary_delimiter)))

    # Do second pass to generate the RecordBatches.
    return (
        csv_lines_and_raw_records
        | "BatchCSVLines" >> beam.BatchElements(
            **batch_util.GetBatchElementsKwargs(desired_batch_size))
        | "BatchedCSVRowsToArrow" >> beam.ParDo(
            BatchedCSVRowsToRecordBatch(
                skip_blank_lines=skip_blank_lines,
                multivalent_columns=multivalent_columns,
                secondary_delimiter=secondary_delimiter,
                raw_record_column_name=raw_record_column_name), column_infos))
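A hypothetical usage sketch of the API documented above. It assumes the function is exposed as a @beam.ptransform_fn (as in tfx_bsl's csv_decoder), so the `lines` PCollection is bound via the pipe operator; the column names and CSV content are illustrative.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | 'CreateCSVLines' >> beam.Create(['1,alpha', '2,beta'])
         | 'CSVToRecordBatch' >> CSVToRecordBatch(   # assumes @beam.ptransform_fn wrapping
             column_names=['id', 'name'],
             desired_batch_size=1000)
         | 'PrintRecordBatches' >> beam.Map(print))  # prints Arrow RecordBatches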
Example #22
 def expand(self, sample_map):
     return (sample_map
             | 'GetListsOfValueCounts' >> beam.Values()
             | 'SumValueCountsPerSample' >> beam.Map(sum)
             | 'SumTotalValueCounts' >> beam.CombineGlobally(sum))
Example #23
      def expand(self, pcolls):

        scalar_inputs = [expr for expr in self.stage.inputs if is_scalar(expr)]
        tabular_inputs = [
            expr for expr in self.stage.inputs if not is_scalar(expr)
        ]

        if len(tabular_inputs) == 0:
          partitioned_pcoll = next(iter(pcolls.values())).pipeline | beam.Create([{}])

        elif self.stage.partitioning != partitionings.Nothing():
          # Partitioning required for these operations.
          # Compute the number of partitions to use for the inputs based on
          # the estimated size of the inputs.
          if self.stage.partitioning == partitionings.Singleton():
            # Always a single partition, don't waste time computing sizes.
            num_partitions = 1
          else:
            # Estimate the sizes from the outputs of a *previous* stage such
            # that using these estimates will not cause a fusion break.
            input_sizes = [
                estimate_size(input, same_stage_ok=False)
                for input in tabular_inputs
            ]
            if None in input_sizes:
              # We were unable to (cheaply) compute the size of one or more
              # inputs.
              num_partitions = DEFAULT_PARTITIONS
            else:
              num_partitions = beam.pvalue.AsSingleton(
                  input_sizes
                  | 'FlattenSizes' >> beam.Flatten()
                  | 'SumSizes' >> beam.CombineGlobally(sum)
                  | 'NumPartitions' >> beam.Map(
                      lambda size: max(
                          MIN_PARTITIONS,
                          min(MAX_PARTITIONS, size // TARGET_PARTITION_SIZE))))

          # Arrange such that partitioned_pcoll is properly partitioned.
          main_pcolls = {
              expr._id: pcolls[expr._id] | 'Partition_%s_%s' %
              (self.stage.partitioning, expr._id) >> beam.FlatMap(
                  self.stage.partitioning.partition_fn, num_partitions)
              for expr in tabular_inputs
          } | beam.CoGroupByKey()
          partitioned_pcoll = main_pcolls | beam.ParDo(_ReBatch())

        else:
          # Already partitioned, or no partitioning needed.
          assert len(tabular_inputs) == 1
          tag = tabular_inputs[0]._id
          partitioned_pcoll = pcolls[tag] | beam.Map(lambda df: {tag: df})

        side_pcolls = {
            expr._id: beam.pvalue.AsSingleton(pcolls[expr._id])
            for expr in scalar_inputs
        }

        # Actually evaluate the expressions.
        def evaluate(partition, stage=self.stage, **side_inputs):
          session = expressions.Session(
              dict([(expr, partition[expr._id]) for expr in tabular_inputs] +
                   [(expr, side_inputs[expr._id]) for expr in scalar_inputs]))
          for expr in stage.outputs:
            yield beam.pvalue.TaggedOutput(expr._id, expr.evaluate_at(session))

        return partitioned_pcoll | beam.FlatMap(evaluate, **
                                                side_pcolls).with_outputs()
Example #24
def run(argv=None):

    pipeline_args = [
        '--project={0}'.format(PROJECT), '--job_name=majesticmillion1',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/temp/'.format(BUCKET), '--num_workers=4',
        '--runner=DataflowRunner',
        '--inputFile=gs://{0}/Sample_Data/majestic_million.csv'.format(BUCKET),
        '--template_location=gs://{0}/templates/majestic_million_template'.
        format(BUCKET), '--zone=australia-southeast1-a'
        #  '--region=australia-southeast1',
    ]
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    inbound_options = pipeline_options.view_as(FileLoader)
    input = inbound_options.inputFile

    with beam.Pipeline(options=pipeline_options) as p:
        TLD_Desc = (
            p
            | 'Read TLD Description File' >> beam.io.ReadFromText(TLDFile)
            | 'Parse Descriptions' >> beam.ParDo(combine_TLD())
            | 'Combine Descriptions to Dictionary' >>
            beam.CombineGlobally(combine_pdict))

        excludedTLDs = (
            p
            | 'Read excuded TLD file' >> beam.io.ReadFromText(excludedTLDFile)
            | 'Get list of excluded TLD' >> beam.ParDo(lambda x: x.split(',')))

        # Extract records as dictionaries
        records = (
            p
            | 'Read File' >> beam.io.ReadFromText(input, skip_header_lines=1)
            | 'Parse CSV' >> beam.ParDo(Split(), SCHEMA)
            | 'Add Descriptions' >> beam.ParDo(
                AddDTLDDesc(), beam.pvalue.AsSingleton(TLD_Desc)))

        # Write TLD aggregations to BigQuery
        (records | 'Aggregate TLDS' >> CountTLDs(excludedTLDs)
         | 'Write TLDs to BigQuery' >> beam.io.WriteToBigQuery(
             '{0}:{1}.TLDCounts'.format(PROJECT,
                                        DATASET),  # Enter your table name
             schema=TLD_SCHEMA,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # Write all records to BigQuery
        (records
         | 'Write Items BQ' >> beam.io.WriteToBigQuery(
             '{0}:{1}.TopSites'.format(PROJECT,
                                       DATASET),  # Enter your table name
             schema=SCHEMA + "," + DESCRIPTIONSCHEMA,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # Write metadata to Datastore
        (records
         | 'Get Record Count' >> beam.combiners.Count.Globally()
         | 'Create Metadata' >> beam.ParDo(
             GetMetaData(inbound_options.inputFile))
         | 'Create DS Entity' >> beam.Map(lambda x: create_ds_entity(x))
         | 'Write To DS' >> WriteToDatastore(PROJECT))

    # Note: the `with` block above already runs the pipeline and waits for it
    # to finish on exit, so an explicit p.run() here is not needed.