def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Generates the translation contribution stats.

    Returns:
        PCollection. A PCollection of 'SUCCESS x' results, where x is
        the number of generated stats.
    """
    suggestions_grouped_by_target = (
        self.pipeline
        | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
            suggestion_models.GeneralSuggestionModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Filter translate suggestions' >> beam.Filter(lambda m: (
            m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
        | 'Transform to suggestion domain object' >> beam.Map(
            suggestion_services.get_suggestion_from_model)
        | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
    exp_opportunities = (
        self.pipeline
        | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
            opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Transform to opportunity domain object' >> beam.Map(
            opportunity_services
            .get_exploration_opportunity_summary_from_model)
        | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

    new_user_stats_models = (
        {
            'suggestion': suggestions_grouped_by_target,
            'opportunity': exp_opportunities
        }
        | 'Merge models' >> beam.CoGroupByKey()
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
            x['suggestion'][0] if len(x['suggestion']) else [],
            list(x['opportunity'][0])[0]
            if len(x['opportunity']) else None))
        | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
        | 'Generate models from stats' >> beam.MapTuple(
            self._generate_translation_contribution_model))

    unused_put_result = (
        new_user_stats_models
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        new_user_stats_models
        | 'Count all new models' >> (
            beam.combiners.Count.Globally().without_defaults())
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS %s' % x)))
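# A minimal, self-contained sketch (not part of the job above, data invented)
# of the shape that beam.CoGroupByKey() produces in the 'Merge models' step:
# one element per key, whose value is a dict mapping each input label to a
# list of that input's grouped values. Because each GroupBy output contributes
# at most one (key, list) pair per key, the job above can safely index
# x['suggestion'][0].
import apache_beam as beam

with beam.Pipeline() as p:
    suggestions = (
        p | 'MakeSuggestions' >> beam.Create(['exp1.sugg_a', 'exp1.sugg_b'])
        | 'GroupSuggestions' >> beam.GroupBy(lambda s: s.split('.')[0]))
    opportunities = (
        p | 'MakeOpportunities' >> beam.Create(['exp1.opp_a'])
        | 'GroupOpportunities' >> beam.GroupBy(lambda s: s.split('.')[0]))
    _ = (
        {'suggestion': suggestions, 'opportunity': opportunities}
        | beam.CoGroupByKey()
        | beam.Map(print))
    # Prints roughly:
    # ('exp1', {'suggestion': [['exp1.sugg_a', 'exp1.sugg_b']],
    #           'opportunity': [['exp1.opp_a']]})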
def expand(self, pcoll):
    return (
        pcoll
        | 'Extract Fields' >> beam.ParDo(ExtractFieldsFn())
        | 'Group By' >> beam.GroupBy("post_id", "title")
            .aggregate_field(
                lambda row: len(row.tags.split("|")), sum, "tags_count")
        | 'Format Result' >> beam.ParDo(FormatCsvRowFn()))
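# ExtractFieldsFn and FormatCsvRowFn are not shown in the snippet above.
# Plausible minimal stand-ins (assumptions for illustration, not the original
# implementations) would be DoFns that map a parsed post dict to a schema'd
# beam.Row and an aggregated row to a CSV line:
import apache_beam as beam

class ExtractFieldsFn(beam.DoFn):
    """Emits a Row with the fields the GroupBy above relies on."""

    def process(self, element):
        yield beam.Row(
            post_id=element['post_id'],
            title=element['title'],
            tags=element.get('tags', ''))

class FormatCsvRowFn(beam.DoFn):
    """Formats one aggregated row as a CSV line."""

    def process(self, row):
        yield f'{row.post_id},{row.title},{row.tags_count}'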
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True)
    parser.add_argument('--output', dest='output', required=True)
    args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Input: '\t'.join((timestamp, ip, agent, url, referer))
    lines = p | 'Read' >> ReadFromText(args.input)
    requests = lines | 'Split' >> beam.Map(lambda l: l.split('\t'))
    join = requests | 'Group' >> beam.GroupBy(lambda req: req[1])
    requests = join | 'Filter' >> beam.ParDo(FilterFn())
    requests | 'Write' >> WriteToText(args.output)

    result = p.run()
    result.wait_until_finish()
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        query_result = result.metrics().query()
        for counter in query_result['counters']:
            logging.info(f'{counter.key.metric.name}: {counter.result}')
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    user_settings_models = (
        self.pipeline
        | 'Get all UserSettingsModels' >> (
            ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

    old_user_stats_models = (
        self.pipeline
        | 'Get all UserStatsModels' >> (
            ndb_io.GetModels(user_models.UserStatsModel.get_all())))

    # Creates a UserStatsModel for each user that does not have one yet.
    new_user_stats_models = (
        (user_settings_models, old_user_stats_models)
        | 'Merge models' >> beam.Flatten()
        # Returns a PCollection of
        # (model.id, (user_settings_models, user_stats_models)) or
        # (model.id, (user_settings_models,)).
        | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
        # Discards model.id from the PCollection.
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # Only keep groupings that indicate that the UserStatsModel is
        # missing.
        | 'Filter pairs of models' >> beam.Filter(
            lambda models: (
                len(list(models)) == 1 and
                isinstance(list(models)[0], user_models.UserSettingsModel)))
        # Choosing the first element.
        | 'Transform tuples into models' >> beam.Map(
            lambda models: list(models)[0])
        # Creates the missing UserStatsModels.
        | 'Create new user stat models' >> beam.ParDo(CreateUserStatsModel()))

    unused_put_result = (
        (new_user_stats_models, old_user_stats_models)
        | 'Merge new and old models together' >> beam.Flatten()
        | 'Update the dashboard stats' >> beam.ParDo(UpdateWeeklyCreatorStats())
        | 'Put models into the datastore' >> ndb_io.PutModels())

    new_user_stats_job_result = (
        new_user_stats_models
        | 'Count all new models' >> beam.combiners.Count.Globally()
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))

    old_user_stats_job_result = (
        old_user_stats_models
        | 'Count all old models' >> beam.combiners.Count.Globally()
        | 'Only create result for old models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for old models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

    return (
        (new_user_stats_job_result, old_user_stats_job_result)
        | 'Merge new and old results together' >> beam.Flatten())
def run(recognition_provider_name, ingestion_run=None, ingestion_provider=None,
        output_name=None, run_locally=False):
    """Main entry point; defines and runs the image recognition pipeline.

    Input: either an ingestion run id or an ingestion provider id. The input
    is used for querying the database for images ingested by either one of
    the optional inputs.
    """
    _validate_args(
        recognition_provider_name, ingestion_run, ingestion_provider)
    recognition_provider = get_recognition_provider(recognition_provider_name)
    job_name = generate_cloud_dataflow_job_name(
        _PIPELINE_TYPE, recognition_provider.provider_id)
    if run_locally:
        pipeline_options = PipelineOptions()
    else:
        output_name = 'gs://demo-bucket-step/results/outputs'
        pipeline_options = PipelineOptions(
            flags=None,
            runner='DataflowRunner',
            project='step-project-ellispis',
            job_name=job_name,
            temp_location='gs://demo-bucket-step/temp',
            region='europe-west2',
            setup_file='./setup.py',
        )

    try:
        with beam.Pipeline(options=pipeline_options) as pipeline:
            store_pipeline_run(job_name, recognition_provider.provider_id)
            indices_for_batching = (
                pipeline
                | 'create' >> beam.Create(create_query_indices()))
            if ingestion_run:
                dataset = (
                    indices_for_batching
                    | 'get images dataset' >> beam.ParDo(
                        GetBatchedImageDataset(), ingestion_run=ingestion_run))
            else:
                dataset = (
                    indices_for_batching
                    | 'get images dataset' >> beam.ParDo(
                        GetBatchedImageDataset(),
                        ingestion_provider=ingestion_provider))
            dataset_with_url_for_provider = (
                dataset
                | 'add url for labeling' >> beam.ParDo(
                    recognition_provider.add_url_for_recognition_api))
            filtered_dataset = (
                dataset_with_url_for_provider
                | 'filter images' >> beam.Filter(
                    recognition_provider.is_eligible))
            images_batch = (
                filtered_dataset
                | 'combine to batches' >> beam.GroupBy(
                    lambda doc: int(doc['random'] * 100))
                | beam.ParDo(lambda element: [element[1]]))

            # Labels the images by the process method of the provider.
            labeled_images_batch = (
                images_batch
                | 'label by batch' >> beam.ParDo(recognition_provider))
            labeled_images = (
                labeled_images_batch
                | beam.FlatMap(lambda elements: elements))
            # pylint: disable=expression-not-assigned
            labeled_images | 'store in database' >> beam.ParDo(
                UpdateImageLabelsInDatabase(),
                job_name, recognition_provider.provider_id)

            if output_name:  # For testing.
                def format_result(image, labels):
                    return '%s: %s' % (image['url'], labels)
                output = (
                    labeled_images
                    | 'Format' >> beam.MapTuple(format_result))
                output | 'Write' >> WriteToText(output_name)
        update_pipeline_run_when_succeeded(job_name)
    except:  # Mark the run as failed on any error, then re-raise.
        update_pipeline_run_when_failed(job_name)
        raise
def expand(self, pcoll):
    return (
        pcoll
        | 'Xml2Dict' >> beam.ParDo(ParseXmlToDictFn())
        | 'Extract Fields' >> beam.ParDo(ExtractFieldsFn())
        | 'Group By' >> beam.GroupBy("post_id", "title")
            .aggregate_field(
                lambda row: len(re.findall(r"<.*?>", row.tags)),
                sum, "tags_count")
        | 'Format Result' >> beam.ParDo(FormatCsvRowFn()))
def expand(self, executions):
    return (
        executions
        | beam.Filter(
            lambda execution:
            execution.destination.destination_type == self._destination_type)
        | beam.ParDo(self._get_bq_request_class())
        | beam.io.ReadAllFromBigQuery()
        | beam.GroupBy(lambda x: x['execution_hash'])
        | beam.ParDo(
            self._BatchElements(self._batch_size),
            beam.pvalue.AsList(executions)))
def expand(self, pcoll):
    results = (
        pcoll
        | 'ComputeStatistics' >> beam.GroupBy('loc_id')
            .aggregate_field('low_temp', min, 'record_low')
            .aggregate_field('high_temp', max, 'record_high')
            .aggregate_field('precip', sum, 'total_precip')
        | 'ToJson' >> beam.ParDo(ConvertToJson()))
    return results
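# A minimal, runnable sketch (invented inputs, not from the transform above)
# of the same chained-aggregate_field pattern: each aggregate_field call adds
# one named field to the per-key result row.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([
            beam.Row(loc_id='A', low_temp=3.0, high_temp=12.0, precip=0.5),
            beam.Row(loc_id='A', low_temp=1.0, high_temp=15.0, precip=0.2),
        ])
        | beam.GroupBy('loc_id')
            .aggregate_field('low_temp', min, 'record_low')
            .aggregate_field('high_temp', max, 'record_high')
            .aggregate_field('precip', sum, 'total_precip')
        # Emits one row per loc_id with fields loc_id, record_low,
        # record_high and total_precip.
        | beam.Map(print))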
def SampleValuesBy(pcoll, key_columns):
    """Group sample_values by key_columns.

    E.g. with this pcollection:
        col1=A col2=B col3=C sample_values=[1]
        col1=A col2=B col3=Z sample_values=[2, 3]

        p | SampleValuesBy(['col1', 'col2'])

    emits:
        col1=A col2=B all_sample_values=[1, 2, 3]
    """
    return (
        pcoll
        | beam.GroupBy(*key_columns).force_tuple_keys().aggregate_field(
            'sample_values', ConcatListCombineFn(), 'all_sample_values'))
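# ConcatListCombineFn is a project-specific helper that is not shown here. A
# plausible minimal implementation (an assumption for illustration, not the
# project's actual code) is a CombineFn that concatenates the per-row lists:
import apache_beam as beam

class ConcatListCombineFn(beam.CombineFn):
    """Concatenates list-valued inputs into a single list."""

    def create_accumulator(self):
        return []

    def add_input(self, accumulator, element):
        accumulator.extend(element)
        return accumulator

    def merge_accumulators(self, accumulators):
        merged = []
        for acc in accumulators:
            merged.extend(acc)
        return merged

    def extract_output(self, accumulator):
        return accumulator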
def test_global_aggregate(self):
    # [START global_aggregate]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy()
                .aggregate_field('unit_price', min, 'min_price')
                .aggregate_field('unit_price', MeanCombineFn(), 'mean_price')
                .aggregate_field('unit_price', max, 'max_price'))
        # [END global_aggregate]

        expected = [
            # [START global_aggregate_result]
            NamedTuple(min_price=1.00, mean_price=7 / 3, max_price=4.00),
            # [END global_aggregate_result]
        ]
        assert_that(grouped | beam.Map(normalize), equal_to(expected))
def test_expr_aggregate(self):
    # [START expr_aggregate]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy('recipe')
                .aggregate_field('quantity', sum, 'total_quantity')
                .aggregate_field(
                    lambda x: x.quantity * x.unit_price, sum, 'price'))
        # [END expr_aggregate]

        expected = [
            # [START expr_aggregate_result]
            NamedTuple(recipe='pie', total_quantity=6, price=14.00),
            NamedTuple(recipe='muffin', total_quantity=5, price=7.00),
            # [END expr_aggregate_result]
        ]
        assert_that(grouped | beam.Map(normalize), equal_to(expected))
def test_group_by_attr_expr(self):
    # [START groupby_attr_expr]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy('recipe', is_berry=lambda x: 'berry' in x.fruit))
        # [END groupby_attr_expr]

        expected = [
            # [START groupby_attr_expr_result]
            (NamedTuple(recipe='pie', is_berry=True), [
                beam.Row(recipe='pie', fruit='strawberry', quantity=3,
                         unit_price=1.50),
                beam.Row(recipe='pie', fruit='raspberry', quantity=1,
                         unit_price=3.50),
                beam.Row(recipe='pie', fruit='blackberry', quantity=1,
                         unit_price=4.00),
                beam.Row(recipe='pie', fruit='blueberry', quantity=1,
                         unit_price=2.00),
            ]),
            (NamedTuple(recipe='muffin', is_berry=True), [
                beam.Row(recipe='muffin', fruit='blueberry', quantity=2,
                         unit_price=2.00),
            ]),
            (NamedTuple(recipe='muffin', is_berry=False), [
                beam.Row(recipe='muffin', fruit='banana', quantity=3,
                         unit_price=1.00),
            ]),
            # [END groupby_attr_expr_result]
        ]
        assert_that(grouped | beam.MapTuple(normalize_kv), equal_to(expected))
def test_simple_aggregate(self):
    # [START simple_aggregate]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy('fruit')
                .aggregate_field('quantity', sum, 'total_quantity'))
        # [END simple_aggregate]

        expected = [
            # [START simple_aggregate_result]
            NamedTuple(fruit='strawberry', total_quantity=3),
            NamedTuple(fruit='raspberry', total_quantity=1),
            NamedTuple(fruit='blackberry', total_quantity=1),
            NamedTuple(fruit='blueberry', total_quantity=3),
            NamedTuple(fruit='banana', total_quantity=3),
            # [END simple_aggregate_result]
        ]
        assert_that(grouped | beam.Map(normalize), equal_to(expected))
def test_groupby_expr(self):
    # [START groupby_expr]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(
                ['strawberry', 'raspberry', 'blueberry', 'blackberry',
                 'banana'])
            | beam.GroupBy(lambda s: s[0]))
        # [END groupby_expr]

        assert_that(
            grouped | beam.MapTuple(normalize_kv),
            equal_to([
                # [START groupby_expr_result]
                ('s', ['strawberry']),
                ('r', ['raspberry']),
                ('b', ['banana', 'blackberry', 'blueberry']),
                # [END groupby_expr_result]
            ]))
def test_groupby_two_exprs(self):
    # [START groupby_two_exprs]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(
                ['strawberry', 'raspberry', 'blueberry', 'blackberry',
                 'banana'])
            | beam.GroupBy(
                letter=lambda s: s[0], is_berry=lambda s: 'berry' in s))
        # [END groupby_two_exprs]

        expected = [
            # [START groupby_two_exprs_result]
            (NamedTuple(letter='s', is_berry=True), ['strawberry']),
            (NamedTuple(letter='r', is_berry=True), ['raspberry']),
            (NamedTuple(letter='b', is_berry=True),
             ['blackberry', 'blueberry']),
            (NamedTuple(letter='b', is_berry=False), ['banana']),
            # [END groupby_two_exprs_result]
        ]
        assert_that(grouped | beam.MapTuple(normalize_kv), equal_to(expected))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
    generating SkillOpportunityModel.

    Returns:
        PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
        generating SkillOpportunityModel.
    """
    question_skill_link_models = (
        self.pipeline
        | 'Get all non-deleted QuestionSkillLinkModels' >> (
            ndb_io.GetModels(
                question_models.QuestionSkillLinkModel.get_all(
                    include_deleted=False)))
        | 'Group QuestionSkillLinkModels by skill ID' >> beam.GroupBy(
            lambda n: n.skill_id))

    skills = (
        self.pipeline
        | 'Get all non-deleted SkillModels' >> (
            ndb_io.GetModels(
                skill_models.SkillModel.get_all(include_deleted=False)))
        | 'Get skill object from model' >> beam.Map(
            skill_fetchers.get_skill_from_model)
        | 'Group skill objects by skill ID' >> beam.GroupBy(lambda m: m.id))

    skills_with_question_counts = (
        {
            'skill': skills,
            'question_skill_links': question_skill_link_models
        }
        | 'Merge by skill ID' >> beam.CoGroupByKey()
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Remove skill IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # We are using itertools.chain.from_iterable to flatten
        # question_skill_links from a 2D list into a 1D list.
        | 'Flatten skill and question_skill_links' >> beam.Map(
            lambda object: {
                'skill': list(object['skill'][0])[0],
                'question_skill_links': list(
                    itertools.chain.from_iterable(
                        object['question_skill_links']))
            }))

    opportunities_results = (
        skills_with_question_counts
        | beam.Map(
            lambda object: self._create_skill_opportunity_model(
                object['skill'], object['question_skill_links'])))

    unused_put_result = (
        opportunities_results
        | 'Filter the results with OK status' >> beam.Filter(
            lambda result: result.is_ok())
        | 'Fetch the models to be put' >> beam.Map(
            lambda result: result.unwrap())
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        opportunities_results
        | 'Transform Results to JobRunResults' >> (
            job_result_transforms.ResultsToJobRunResults()))
}  # end bigquery schema

# with beam.Pipeline(argv=sys.argv) as p:
parsed_csv = (
    p
    | 'Readfile' >> beam.io.ReadFromText(input_filename)
    | 'Parsefile' >> beam.Map(parse_file)
    | 'DefineSchema' >> beam.Map(lambda x: beam.Row(
        fecha=str(x[0].strip()),
        fruta=str(x[1].strip()),
        cantidad=int(x[2].strip())))
    # | 'SQLTransform' >> SqlTransform("""
    #       SELECT
    #           fruta,
    #           COUNT(fruta) AS Cuenta
    #       FROM PCOLLECTION
    #       GROUP BY fruta""")
    | 'Groupby' >> beam.GroupBy('fruta').aggregate_field(
        lambda x: 1 if x.fruta else 0, sum, 'Cuenta')
    | 'print' >> beam.FlatMap(print_row)
    # | 'write' >> beam.io.WriteToText(
    #       prefijoSalida, file_name_suffix='.txt',
    #       header='fecha, fruta, cantidad')
    | 'bq_insert' >> beam.io.gcp.bigquery.WriteToBigQuery(
        table='prueba_dataflow',
        dataset='CUSTOMER_EXPERIENCE',
        project='apex-dataway',
        schema=bq_schema,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_APPEND'))

p.run().wait_until_finish()
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Generates the translation contribution stats.

    Returns:
        PCollection. A PCollection of 'SUCCESS x' results, where x is
        the number of generated stats.
    """
    suggestions_grouped_by_target = (
        self.pipeline
        | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
            suggestion_models.GeneralSuggestionModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Filter translate suggestions' >> beam.Filter(lambda m: (
            m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
        | 'Transform to suggestion domain object' >> beam.Map(
            suggestion_services.get_suggestion_from_model)
        | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
    exp_opportunities = (
        self.pipeline
        | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
            opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Transform to opportunity domain object' >> beam.Map(
            opportunity_services
            .get_exploration_opportunity_summary_from_model)
        | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

    user_stats_results = (
        {
            'suggestion': suggestions_grouped_by_target,
            'opportunity': exp_opportunities
        }
        | 'Merge models' >> beam.CoGroupByKey()
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
            x['suggestion'][0] if len(x['suggestion']) else [],
            list(x['opportunity'][0])[0]
            if len(x['opportunity']) else None)))

    user_stats_models = (
        user_stats_results
        | 'Filter ok results' >> beam.Filter(
            lambda key_and_result: key_and_result[1].is_ok())
        | 'Unpack result' >> beam.MapTuple(
            lambda key, result: (key, result.unwrap()))
        | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
        | 'Generate models from stats' >> beam.MapTuple(
            self._generate_translation_contribution_model))

    user_stats_error_job_run_results = (
        user_stats_results
        | 'Filter err results' >> beam.Filter(
            lambda key_and_result: key_and_result[1].is_err())
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Remove keys' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Transform result to job run result' >> (
            job_result_transforms.ResultsToJobRunResults()))

    unused_put_result = (
        user_stats_models
        | 'Put models into the datastore' >> ndb_io.PutModels())

    user_stats_models_job_run_results = (
        user_stats_models
        | 'Create job run result' >> (
            job_result_transforms.CountObjectsToJobRunResult()))

    return (
        (user_stats_error_job_run_results,
         user_stats_models_job_run_results)
        | 'Merge job run results' >> beam.Flatten())
def pipeline(root):
    """Beam pipeline.

    Args:
        root: the root of the pipeline.
    """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (
        merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
        | 'ConflictsCSVFormat' >> beam.Map(csv_format)
        | 'ConflictsReshuffle' >> beam.Reshuffle()
        | 'WriteConflictsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_conflicts',
            header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
            num_shards=1,
            file_name_suffix='.csv'))

    cleaned_conformers = (
        merged_conformers
        | 'CleanUpConformers' >> beam.Map(clean_up_conformer))

    # Get the bond length distributions
    bond_length_dists_pcoll = (
        cleaned_conformers
        | 'ExtractBondLengths' >> beam.FlatMap(
            extract_bond_lengths,
            dist_sig_digits=_BOND_LENGTHS_SIG_DIGITS,
            unbonded_max=_BOND_LENGTHS_UNBONDED_MAX)
        | 'CountBondLengths' >> beam.combiners.Count.PerElement()
        | 'ToListBondLengths' >> beam.combiners.ToList())
    _ = (
        bond_length_dists_pcoll
        | 'WriteBondLengths' >> beam.ParDo(
            write_bond_lengths,
            filename=f'{FLAGS.output_stem}_bond_lengths.csv'))

    # Get the SMILES to id mapping needed for UpdateConformerFn
    smiles_id_pcoll = (
        root
        | 'BTInputForSmiles' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateSmilesToID' >> beam.FlatMap(smiles_to_id))
    smiles_id_dict = beam.pvalue.AsDict(smiles_id_pcoll)

    # Various per conformer processing
    update_results = (
        cleaned_conformers
        | 'UpdateConformers' >> beam.ParDo(
            UpdateConformerFn(),
            beam.pvalue.AsSingleton(bond_length_dists_pcoll),
            smiles_id_dict).with_outputs(
                UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH,
                main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header='conformer_id,compare,smiles_given,smiles_with_h,'
            'smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >> beam.FlatMap(
            generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts and write them to a file
    _ = (
        final_conformers
        | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
        | 'CountStats' >> beam.combiners.Count.PerElement()
        | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
        | 'WriteStatsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_stats',
            header='primary_key,secondary_key,count',
            num_shards=1,
            file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >> beam.FlatMap(
            bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = (
        (bare_bt_summaries, real_bt_summaries)
        | 'FlattenAllBTSummaries' >> beam.Flatten()
        | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (
        final_conformers
        | 'MakeComplete' >> beam.Map(make_complete_conformer))
    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as binary protobuf in
    # TFRecord.
    for id_str, collection in [['complete', complete_conformers],
                               ['standard', standard_conformers]]:
        _ = (
            collection
            | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
            | ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
                f'{FLAGS.output_stem}_{id_str}_tfrecord',
                coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
                num_shards=FLAGS.output_shards))
def run():
    opts = PipelineOptions()
    with beam.Pipeline(options=opts) as p:
        opts = opts.view_as(WeatherPipelineOptions)
        stations = load_dict(
            opts.input_stationlist, parse_csv_kv_row(int, str, 6, 2))
        weather_entries, weather_badrows = (
            p
            | "Read weather entries" >> beam.io.ReadFromText(
                os.path.join(opts.input_weather_dir, "*.csv"))
            | "Parse weather" >> beam.ParDo(ParseFn(stations)).with_outputs(
                "parsed", "invalid"))
        # windowed_weather_entries = (
        #     weather_entries
        #     | "Add timestamp" >> beam.ParDo(EntryAddTimestampFn())
        #     | beam.WindowInto(window.FixedWindows(24 * 60 * 60)))
        _ = weather_badrows | "Log invalid weather rows" >> beam.io.WriteToText(
            os.path.join(opts.outputdir, "invalid_input_rows"))

        def any(seq: typing.Iterable[bool]):
            result = False
            for x in seq:
                result = result or x
            return result

        def process_weather_entries(el):
            key = el[0]
            weather = el[1]
            return beam.Row(
                country_code=key.country_code,
                obsdate=key.obsdate,
                temp=mean([x.temp for x in weather if x.temp < 9999.9]),
                windspeed=mean(
                    [x.windspeed for x in weather if x.windspeed < 999.9]),
                tornadoes=any([x.tornado_or_funnel for x in weather]),
            )

        weather_by_country_by_day = (
            weather_entries
            | beam.GroupBy("country_code", "obsdate")
            | beam.Map(process_weather_entries))
        weather_by_country_by_year = weather_by_country_by_day | beam.GroupBy(
            "country_code", year=lambda x: x.obsdate.year)
        averages = weather_by_country_by_year | beam.MapTuple(lambda k, v: (
            k.year,
            beam.Row(
                country=k.country_code,
                temp=mean([y.temp for y in v]),
                windspeed=mean([y.windspeed for y in v]),
            ),
        ))
        hottest = (
            averages
            | beam.Filter(lambda x: not numpy.isnan(x[1].temp))
            | "Top hottest" >> beam.combiners.Top.PerKey(
                1, key=lambda x: x.temp)
            | beam.Map(str)
            | "Write hottest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "hottest")))
        second_windiest = (
            averages
            | beam.Filter(lambda x: not numpy.isnan(x[1].windspeed))
            | "Top two windiest" >> beam.combiners.Top.PerKey(
                2, key=lambda x: x.windspeed)
            | beam.MapTuple(lambda k, v: (k, v[1]))
            | "Write second windiest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "second_windiest")))

        def calc_max_consecutive_tornado_days(el):
            key, entries = el
            entries = sorted(entries, key=lambda x: x.obsdate, reverse=True)
            return (
                key.year,
                beam.Row(
                    country_code=key.country_code,
                    max_consecutive_tornado_days=max_consec_sequence_len(
                        [x.tornadoes for x in entries], True),
                ),
            )

        max_consec_tornado_days = (
            weather_by_country_by_year
            | beam.Map(calc_max_consecutive_tornado_days)
            | "Top consec days tornado" >> beam.combiners.Top.PerKey(
                1, key=lambda x: x.max_consecutive_tornado_days)
            | "Write consecutively tornadiest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "consecutively_tornadiest")))
def pipeline(root):
    """Beam pipeline.

    Args:
        root: the root of the pipeline.
    """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (
        merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
        | 'ConflictsCSVFormat' >> beam.Map(csv_format)
        | 'ConflictsReshuffle' >> beam.Reshuffle()
        | 'WriteConflictsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_conflicts',
            header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
            num_shards=1,
            file_name_suffix='.csv'))

    # Various per conformer processing
    update_results = (
        merged_conformers
        | 'UpdateConformers' >> beam.ParDo(UpdateConformerFn()).with_outputs(
            UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH, main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header='conformer_id,compare,smiles_given,smiles_with_h,'
            'smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >> beam.FlatMap(
            generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts and write them to a file
    _ = (
        final_conformers
        | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
        | 'CountStats' >> beam.combiners.Count.PerElement()
        | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
        | 'WriteStatsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_stats',
            header='primary_key,secondary_key,count',
            num_shards=1,
            file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >> beam.FlatMap(
            bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = (
        (bare_bt_summaries, real_bt_summaries)
        | 'FlattenAllBTSummaries' >> beam.Flatten()
        | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (
        final_conformers
        | 'MakeComplete' >> beam.Map(make_complete_conformer))
    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as JSON.
    # Bit of a hack here: the slowest part of the whole pipeline is writing
    # out the JSON for the complete conformers. So we just hard code a
    # tripling of the shards to get more parallelism.
    for id_str, collection, num_shards in [
        ['complete', complete_conformers, FLAGS.output_shards * 3],
        ['standard', standard_conformers, FLAGS.output_shards]]:
        _ = (
            collection
            | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
            | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
            | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
                f'{FLAGS.output_stem}_{id_str}_json',
                num_shards=num_shards,
                file_name_suffix='.json.gz'))
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project', required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region', required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path', required=True,
                        help='Path to events.json')
    parser.add_argument('--table_name', required=True,
                        help='BigQuery table name')
    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-user-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "user_id", "type": "STRING"},
            {"name": "page_views", "type": "INTEGER"},
            {"name": "total_bytes", "type": "INTEGER"},
            {"name": "max_bytes", "type": "INTEGER"},
            {"name": "min_bytes", "type": "INTEGER"},
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p
     | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
     | 'PerUserAggregations' >> beam.GroupBy('user_id')
         .aggregate_field('user_id', CountCombineFn(), 'page_views')
         .aggregate_field('num_bytes', sum, 'total_bytes')
         .aggregate_field('num_bytes', max, 'max_bytes')
         .aggregate_field('num_bytes', min, 'min_bytes')
         .with_output_types(PerUserAggregation)
     | 'ToDict' >> beam.Map(to_dict)
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")
    p.run()
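# CommonLog, PerUserAggregation, parse_json and to_dict are helpers defined
# outside the snippet above. CountCombineFn, however, ships with Beam itself;
# the import presumably used here is:
from apache_beam.transforms.combiners import CountCombineFn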
def reduce_node(
    k: Callable[[E], _K],
    reducer: Callable[[List[E]], Iterable[E]]) -> Tfm1:
    return lambda pcoll: (
        pcoll
        | beam.GroupBy(k)
        | beam.CombineValues(reducer)
        | beam.FlatMap(lambda e: e[1]))
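# A hypothetical usage sketch of reduce_node above (data and reducer invented
# for illustration): keep only the longest word per first letter. Note that
# reduce_node returns a plain callable over a PCollection, so it is applied
# by calling it rather than with the | operator.
import apache_beam as beam

keep_longest_per_letter = reduce_node(
    lambda word: word[0],
    lambda words: [max(words, key=len)])

with beam.Pipeline() as p:
    words = p | beam.Create(['apple', 'avocado', 'banana', 'blueberry'])
    _ = keep_longest_per_letter(words) | beam.Map(print)  # avocado, blueberry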