Example #1
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Generates the translation contributins stats.

        Returns:
            PCollection. A PCollection of 'SUCCESS x' results, where x is
            the number of generated stats.
        """
        suggestions_grouped_by_target = (
            self.pipeline
            | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
                suggestion_models.GeneralSuggestionModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Filter translate suggestions' >> beam.Filter(lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
            | 'Transform to suggestion domain object' >> beam.Map(
                suggestion_services.get_suggestion_from_model)
            | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
        exp_opportunities = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Transform to opportunity domain object' >>
            beam.Map(opportunity_services.
                     get_exploration_opportunity_summary_from_model)
            | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

        new_user_stats_models = (
            {
                'suggestion': suggestions_grouped_by_target,
                'opportunity': exp_opportunities
            }
            | 'Merge models' >> beam.CoGroupByKey()
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None))
            | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
            | 'Generate models from stats' >> beam.MapTuple(
                self._generate_translation_contribution_model))

        unused_put_result = (
            new_user_stats_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (new_user_stats_models
                | 'Count all new models' >>
                (beam.combiners.Count.Globally().without_defaults())
                | 'Only create result for new models when > 0' >>
                (beam.Filter(lambda x: x > 0))
                | 'Create result for new models' >>
                beam.Map(lambda x: job_run_result.JobRunResult(
                    stdout='SUCCESS %s' % x)))
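The join pattern above (GroupBy each side on a shared key, then merge with CoGroupByKey and unpack the per-tag lists) can be reduced to a minimal, self-contained sketch. The sample dictionaries below are assumptions for illustration, not the Oppia models used in the job.

import apache_beam as beam

with beam.Pipeline() as p:
    suggestions = (
        p
        | 'Create suggestions' >> beam.Create([
            {'target_id': 'exp1', 'text': 'hola'},
            {'target_id': 'exp1', 'text': 'bonjour'},
            {'target_id': 'exp2', 'text': 'ciao'},
        ])
        | 'Group suggestions by target' >> beam.GroupBy(
            lambda s: s['target_id']))

    opportunities = (
        p
        | 'Create opportunities' >> beam.Create([
            {'id': 'exp1', 'title': 'Fractions'},
            {'id': 'exp2', 'title': 'Ratios'},
        ])
        | 'Group opportunities by ID' >> beam.GroupBy(lambda o: o['id']))

    _ = (
        {'suggestion': suggestions, 'opportunity': opportunities}
        | 'Merge' >> beam.CoGroupByKey()
        | 'Unpack' >> beam.MapTuple(lambda key, groups: (
            key,
            # Each tag maps to a list of groupings; take the first if present.
            list(groups['suggestion'][0]) if groups['suggestion'] else [],
            list(groups['opportunity'][0])[0]
            if groups['opportunity'] else None))
        | 'Print' >> beam.Map(print))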
Example #2
 def expand(self, pcoll):
     return (pcoll
             | 'Extract Fields' >> beam.ParDo(ExtractFieldsFn())
             | 'Group By' >> beam.GroupBy("post_id", "title")
                                 .aggregate_field(lambda row: len(row.tags.split("|")), sum, "tags_count")
             | 'Format Result' >> beam.ParDo(FormatCsvRowFn())
             )
Example #3
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True)
    parser.add_argument('--output', dest='output', required=True)
    args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    p = beam.Pipeline(options=pipeline_options)

    # Input: '\t'.join((timestamp, ip, agent, url, referer))
    lines = p | 'Read' >> ReadFromText(args.input)
    requests = lines | 'Split' >> beam.Map(lambda l: l.split('\t'))
    join = requests | 'Group' >> beam.GroupBy(lambda req: req[1])
    requests = join | 'Filter' >> beam.ParDo(FilterFn())
    requests | 'Write' >> WriteToText(args.output)

    result = p.run()
    result.wait_until_finish()

    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        query_result = result.metrics().query()
        for result in query_result['counters']:
            logging.info(f'{result.key.metric.name}: {result.result}')
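FilterFn is not included in this snippet. A hypothetical sketch of what such a DoFn might look like is shown below: it receives the (ip, requests) pairs produced by beam.GroupBy and re-emits individual request records, dropping IPs whose request volume looks abnormal. The threshold is an assumption, not taken from the original code.

import apache_beam as beam

class FilterFn(beam.DoFn):
    """Hypothetical filter: drop IPs that issued too many requests."""

    MAX_REQUESTS_PER_IP = 1000  # assumed threshold

    def process(self, element):
        ip, requests = element
        requests = list(requests)
        if len(requests) <= self.MAX_REQUESTS_PER_IP:
            # Re-emit each original request record so 'Write' receives
            # individual rows rather than grouped pairs.
            for request in requests:
                yield request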
Example #4
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        user_settings_models = (
            self.pipeline
            | 'Get all UserSettingsModels' >>
            (ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

        old_user_stats_models = (
            self.pipeline
            | 'Get all UserStatsModels' >>
            (ndb_io.GetModels(user_models.UserStatsModel.get_all())))

        # Create a UserStatsModel if it does not exist.
        new_user_stats_models = (
            (user_settings_models, old_user_stats_models)
            | 'Merge models' >> beam.Flatten()
            # Returns a PCollection of
            # (model.id, (user_settings_models, user_stats_models)) or
            # (model.id, (user_settings_models,)).
            | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
            # Discards model.id from the PCollection.
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            # Only keep groupings that indicate that
            # the UserStatsModel is missing.
            | 'Filter pairs of models' >>
            beam.Filter(lambda models: (len(list(models)) == 1 and isinstance(
                list(models)[0], user_models.UserSettingsModel)))
            # Choosing the first element.
            | 'Transform tuples into models' >>
            beam.Map(lambda models: list(models)[0])
            # Creates the missing UserStatsModels.
            | 'Create new user stat models' >> beam.ParDo(
                CreateUserStatsModel()))

        unused_put_result = (
            (new_user_stats_models, old_user_stats_models)
            | 'Merge new and old models together' >> beam.Flatten()
            | 'Update the dashboard stats' >> beam.ParDo(
                UpdateWeeklyCreatorStats())
            | 'Put models into the datastore' >> ndb_io.PutModels())

        new_user_stats_job_result = (
            new_user_stats_models
            | 'Count all new models' >> beam.combiners.Count.Globally()
            | 'Only create result for new models when > 0' >>
            (beam.Filter(lambda x: x > 0))
            | 'Create result for new models' >>
            beam.Map(lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))
        old_user_stats_job_result = (
            old_user_stats_models
            | 'Count all old models' >> beam.combiners.Count.Globally()
            | 'Only create result for old models when > 0' >>
            (beam.Filter(lambda x: x > 0))
            | 'Create result for old models' >>
            beam.Map(lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

        return ((new_user_stats_job_result, old_user_stats_job_result)
                | 'Merge new and old results together' >> beam.Flatten())
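The Flatten + GroupBy + Filter sequence above is essentially an anti-join: it keeps only the users whose UserStatsModel is missing. A minimal sketch of the same idea, using assumed plain-dict stand-ins for the models:

import apache_beam as beam

with beam.Pipeline() as p:
    settings = p | 'Create settings' >> beam.Create([
        {'kind': 'settings', 'id': 'u1'},
        {'kind': 'settings', 'id': 'u2'},
        {'kind': 'settings', 'id': 'u3'},
    ])
    stats = p | 'Create stats' >> beam.Create([
        {'kind': 'stats', 'id': 'u1'},
    ])

    _ = ((settings, stats)
         | 'Merge' >> beam.Flatten()
         | 'Group by ID' >> beam.GroupBy(lambda m: m['id'])
         | 'Get rid of key' >> beam.Values()
         | 'Keep unpaired settings' >> beam.Filter(
             lambda models: (len(list(models)) == 1
                             and list(models)[0]['kind'] == 'settings'))
         | 'Take the single element' >> beam.Map(
             lambda models: list(models)[0])
         | 'Print' >> beam.Map(print))  # the 'u2' and 'u3' settings dicts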
Example #5
def run(recognition_provider_name, ingestion_run=None, ingestion_provider=None, output_name=None, run_locally=False):
    """Main entry point, defines and runs the image recognition pipeline.

    Input: either ingestion run id or ingestion provider id.
    The input is used to query the database for images ingested by
    either one of the optional inputs.
    """
    _validate_args(recognition_provider_name, ingestion_run, ingestion_provider)
    recognition_provider = get_recognition_provider(recognition_provider_name)
    job_name = generate_cloud_dataflow_job_name(_PIPELINE_TYPE, recognition_provider.provider_id)
    if run_locally:
        pipeline_options = PipelineOptions()
    else:
        output_name = 'gs://demo-bucket-step/results/outputs'
        pipeline_options = PipelineOptions(
            flags=None,
            runner='DataflowRunner',
            project='step-project-ellispis',
            job_name=job_name,
            temp_location='gs://demo-bucket-step/temp',
            region='europe-west2',
            setup_file='./setup.py',
        )
    
    try:
        with beam.Pipeline(options=pipeline_options) as pipeline:
            store_pipeline_run(job_name, recognition_provider.provider_id)
            indices_for_batching = pipeline | 'create' >> beam.Create(create_query_indices())
            if ingestion_run:
                dataset = indices_for_batching | 'get images dataset' >> \
                    beam.ParDo(GetBatchedImageDataset(), ingestion_run=ingestion_run)
            else:
                dataset = indices_for_batching | 'get images dataset' >> \
                    beam.ParDo(GetBatchedImageDataset(), ingestion_provider=ingestion_provider)
            dataset_with_url_for_provider = dataset | 'add url for labeling' >> \
                beam.ParDo(recognition_provider.add_url_for_recognition_api)
            filtered_dataset = dataset_with_url_for_provider | 'filter images' >> \
                beam.Filter(recognition_provider.is_eligible)
            images_batch = filtered_dataset | 'combine to batches' >> \
                beam.GroupBy(lambda doc: int(doc['random']*100)) |\
                    beam.ParDo(lambda element: [element[1]])
            # Labels the images by the process method of the provider.
            labeled_images_batch = images_batch | 'label by batch' >> \
                beam.ParDo(recognition_provider)
            labeled_images = labeled_images_batch | \
                beam.FlatMap(lambda elements: elements)
            # pylint: disable=expression-not-assigned
            labeled_images | 'store in database' >> beam.ParDo(UpdateImageLabelsInDatabase(),\
                job_name, recognition_provider.provider_id)

            if output_name: # For testing.
                def format_result(image, labels):
                    return '%s: %s' % (image['url'], labels)
                output = labeled_images | 'Format' >> beam.MapTuple(format_result)
                output | 'Write' >> WriteToText(output_name)
        update_pipeline_run_when_succeeded(job_name)
    except:
        update_pipeline_run_when_failed(job_name)
        raise
Example #6
 def expand(self, pcoll):
     return (pcoll
             | 'Xml2Dict' >> beam.ParDo(ParseXmlToDictFn())
             | 'Extract Fields' >> beam.ParDo(ExtractFieldsFn())
             | 'Group By' >> beam.GroupBy("post_id", "title")
                                 .aggregate_field(lambda row: len(re.findall(r"<.*?>", row.tags)), sum, "tags_count")
             | 'Format Result' >> beam.ParDo(FormatCsvRowFn())
             )
Example #7
 def expand(self, executions):
     return (executions
             | beam.Filter(lambda execution: execution.destination.
                           destination_type == self._destination_type)
             | beam.ParDo(self._get_bq_request_class())
             | beam.io.ReadAllFromBigQuery()
             | beam.GroupBy(lambda x: x['execution_hash'])
             | beam.ParDo(self._BatchElements(self._batch_size),
                          beam.pvalue.AsList(executions)))
Example #8
    def expand(self, pcoll):

        results = (
            pcoll
            | 'ComputeStatistics' >> beam.GroupBy('loc_id').aggregate_field(
                'low_temp', min, 'record_low').aggregate_field(
                    'high_temp', max, 'record_high').aggregate_field(
                        'precip', sum, 'total_precip')
            | 'ToJson' >> beam.ParDo(ConvertToJson()))

        return results
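The transform above expects schema-aware elements carrying loc_id, low_temp, high_temp and precip fields. A minimal sketch with assumed beam.Row literals, printing the aggregated statistics instead of running ConvertToJson:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([
             beam.Row(loc_id='L1', low_temp=2.0, high_temp=11.0, precip=0.3),
             beam.Row(loc_id='L1', low_temp=-1.0, high_temp=9.0, precip=0.0),
             beam.Row(loc_id='L2', low_temp=15.0, high_temp=28.0, precip=1.2),
         ])
         | 'ComputeStatistics' >> beam.GroupBy('loc_id').aggregate_field(
             'low_temp', min, 'record_low').aggregate_field(
                 'high_temp', max, 'record_high').aggregate_field(
                     'precip', sum, 'total_precip')
         | 'Print' >> beam.Map(print))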
Example #9
def SampleValuesBy(pcoll, key_columns):
    """Group sample_values by key_columns.

  E.g. with this pcollection:
    col1=A col2=B col3=C sample_values=[1]
    col1=A col2=B col3=Z sample_values=[2, 3]

  p | SampleValuesBy(['col1', 'col2']) emits:
    col1=A col2=B all_sample_values=[1, 2, 3]
  """
    return (pcoll
            | beam.GroupBy(*key_columns).force_tuple_keys().aggregate_field(
                'sample_values', ConcatListCombineFn(), 'all_sample_values'))
Example #10
    def test_global_aggregate(self):
        # [START global_aggregate]
        with beam.Pipeline() as p:
            grouped = (p
                       | beam.Create(GROCERY_LIST)
                       | beam.GroupBy().aggregate_field(
                           'unit_price', min, 'min_price').aggregate_field(
                               'unit_price', MeanCombineFn(),
                               'mean_price').aggregate_field(
                                   'unit_price', max, 'max_price'))
            # [END global_aggregate]

            expected = [
                #[START global_aggregate_result]
                NamedTuple(min_price=1.00, mean_price=7 / 3, max_price=4.00),
                #[END global_aggregate_result]
            ]
            assert_that(grouped | beam.Map(normalize), equal_to(expected))
Example #11
    def test_expr_aggregate(self):
        # [START expr_aggregate]
        with beam.Pipeline() as p:
            grouped = (p
                       | beam.Create(GROCERY_LIST)
                       | beam.GroupBy('recipe').aggregate_field(
                           'quantity', sum, 'total_quantity').aggregate_field(
                               lambda x: x.quantity * x.unit_price, sum,
                               'price'))
            # [END expr_aggregate]

            expected = [
                #[START expr_aggregate_result]
                NamedTuple(recipe='pie', total_quantity=6, price=14.00),
                NamedTuple(recipe='muffin', total_quantity=5, price=7.00),
                #[END expr_aggregate_result]
            ]
            assert_that(grouped | beam.Map(normalize), equal_to(expected))
Example #12
    def test_group_by_attr_expr(self):
        # [START groupby_attr_expr]
        with beam.Pipeline() as p:
            grouped = (p | beam.Create(GROCERY_LIST)
                       | beam.GroupBy('recipe',
                                      is_berry=lambda x: 'berry' in x.fruit))
            # [END groupby_attr_expr]

            expected = [
                #[START groupby_attr_expr_result]
                (NamedTuple(recipe='pie', is_berry=True), [
                    beam.Row(recipe='pie',
                             fruit='strawberry',
                             quantity=3,
                             unit_price=1.50),
                    beam.Row(recipe='pie',
                             fruit='raspberry',
                             quantity=1,
                             unit_price=3.50),
                    beam.Row(recipe='pie',
                             fruit='blackberry',
                             quantity=1,
                             unit_price=4.00),
                    beam.Row(recipe='pie',
                             fruit='blueberry',
                             quantity=1,
                             unit_price=2.00),
                ]),
                (NamedTuple(recipe='muffin', is_berry=True), [
                    beam.Row(recipe='muffin',
                             fruit='blueberry',
                             quantity=2,
                             unit_price=2.00),
                ]),
                (NamedTuple(recipe='muffin', is_berry=False), [
                    beam.Row(recipe='muffin',
                             fruit='banana',
                             quantity=3,
                             unit_price=1.00),
                ]),
                #[END groupby_attr_expr_result]
            ]
            assert_that(grouped | beam.MapTuple(normalize_kv),
                        equal_to(expected))
Example #13
    def test_simple_aggregate(self):
        # [START simple_aggregate]
        with beam.Pipeline() as p:
            grouped = (p
                       | beam.Create(GROCERY_LIST)
                       | beam.GroupBy('fruit').aggregate_field(
                           'quantity', sum, 'total_quantity'))
            # [END simple_aggregate]

            expected = [
                #[START simple_aggregate_result]
                NamedTuple(fruit='strawberry', total_quantity=3),
                NamedTuple(fruit='raspberry', total_quantity=1),
                NamedTuple(fruit='blackberry', total_quantity=1),
                NamedTuple(fruit='blueberry', total_quantity=3),
                NamedTuple(fruit='banana', total_quantity=3),
                #[END simple_aggregate_result]
            ]
            assert_that(grouped | beam.Map(normalize), equal_to(expected))
Example #14
    def test_groupby_expr(self):
        # [START groupby_expr]
        with beam.Pipeline() as p:
            grouped = (p
                       | beam.Create([
                           'strawberry', 'raspberry', 'blueberry',
                           'blackberry', 'banana'
                       ])
                       | beam.GroupBy(lambda s: s[0]))
            # [END groupby_expr]

            assert_that(
                grouped | beam.MapTuple(normalize_kv),
                equal_to([
                    #[START groupby_expr_result]
                    ('s', ['strawberry']),
                    ('r', ['raspberry']),
                    ('b', ['banana', 'blackberry', 'blueberry']),
                    #[END groupby_expr_result]
                ]))
Example #15
    def test_groupby_two_exprs(self):
        # [START groupby_two_exprs]
        with beam.Pipeline() as p:
            grouped = (p
                       | beam.Create([
                           'strawberry', 'raspberry', 'blueberry',
                           'blackberry', 'banana'
                       ])
                       | beam.GroupBy(letter=lambda s: s[0],
                                      is_berry=lambda s: 'berry' in s))
            # [END groupby_two_exprs]

            expected = [
                #[START groupby_two_exprs_result]
                (NamedTuple(letter='s', is_berry=True), ['strawberry']),
                (NamedTuple(letter='r', is_berry=True), ['raspberry']),
                (NamedTuple(letter='b',
                            is_berry=True), ['blackberry', 'blueberry']),
                (NamedTuple(letter='b', is_berry=False), ['banana']),
                #[END groupby_two_exprs_result]
            ]
            assert_that(grouped | beam.MapTuple(normalize_kv),
                        equal_to(expected))
Example #16
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating SkillOpportunityModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating SkillOpportunityModel.
        """
        question_skill_link_models = (
            self.pipeline
            | 'Get all non-deleted QuestionSkillLinkModels' >>
            (ndb_io.GetModels(
                question_models.QuestionSkillLinkModel.get_all(
                    include_deleted=False)))
            | 'Group QuestionSkillLinkModels by skill ID' >>
            beam.GroupBy(lambda n: n.skill_id))

        skills = (
            self.pipeline
            | 'Get all non-deleted SkillModels' >> (ndb_io.GetModels(
                skill_models.SkillModel.get_all(include_deleted=False)))
            | 'Get skill object from model' >> beam.Map(
                skill_fetchers.get_skill_from_model)
            |
            'Group skill objects by skill ID' >> beam.GroupBy(lambda m: m.id))

        skills_with_question_counts = (
            {
                'skill': skills,
                'question_skill_links': question_skill_link_models
            }
            | 'Merge by skill ID' >> beam.CoGroupByKey()
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Remove skill IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
            # We are using itertools.chain.from_iterable to flatten
            # question_skill_links from a 2D list into a 1D list.
            | 'Flatten skill and question_skill_links' >> beam.Map(
                lambda object: {
                    'skill':
                    list(object['skill'][0])[0],
                    'question_skill_links':
                    list(
                        itertools.chain.from_iterable(object[
                            'question_skill_links']))
                }))

        opportunities_results = (
            skills_with_question_counts
            | beam.Map(lambda object: self._create_skill_opportunity_model(
                object['skill'], object['question_skill_links'])))

        unused_put_result = (
            opportunities_results
            | 'Filter the results with OK status' >>
            beam.Filter(lambda result: result.is_ok())
            | 'Fetch the models to be put' >>
            beam.Map(lambda result: result.unwrap())
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (opportunities_results
                | 'Transform Results to JobRunResults' >>
                (job_result_transforms.ResultsToJobRunResults()))
Example #17
    }

    #end bigquery schema

    #with beam.Pipeline(argv=sys.argv) as p:
    parsed_csv = (
        p
        | 'Readfile' >> beam.io.ReadFromText(input_filename)
        | 'Parsefile' >> beam.Map(parse_file)
        | 'DifinirSchema' >>
        beam.Map(lambda x: beam.Row(fecha=str(x[0].strip()),
                                    fruta=str(x[1].strip()),
                                    cantidad=int(x[2].strip())))
        #| 'SQLTransform' >> SqlTransform("""
        #SELECT
        #  fruta,
        #  COUNT(fruta) AS Cuenta
        #FROM PCOLLECTION
        #GROUP BY fruta""")
        | 'Groupby' >> beam.GroupBy('fruta').aggregate_field(
            lambda x: 1 if x.fruta else 0, sum, 'Cuenta')
        | 'print' >> beam.FlatMap(print_row)
        #| 'write' >> beam.io.WriteToText(prefijoSalida, file_name_suffix='.txt', header='fecha, fruta, cantidad')
        | 'bq_insert' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='prueba_dataflow',
            dataset='CUSTOMER_EXPERIENCE',
            project='apex-dataway',
            schema=bq_schema,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND'))
    p.run().wait_until_finish()
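parse_file, print_row, bq_schema and the pipeline object p are defined earlier in the original script and are not part of this snippet. Hypothetical minimal versions of the two helpers, consistent with how they are used above, could look like this:

def parse_file(line):
    # Split one CSV line ("fecha,fruta,cantidad") into its raw string fields.
    return line.split(',')

def print_row(row):
    # Used inside beam.FlatMap: print the row and keep it flowing downstream
    # so the BigQuery write still receives it.
    print(row)
    yield row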
Example #18
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Generates the translation contributins stats.

        Returns:
            PCollection. A PCollection of 'SUCCESS x' results, where x is
            the number of generated stats.
        """
        suggestions_grouped_by_target = (
            self.pipeline
            | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
                suggestion_models.GeneralSuggestionModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Filter translate suggestions' >> beam.Filter(lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
            | 'Transform to suggestion domain object' >> beam.Map(
                suggestion_services.get_suggestion_from_model)
            | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
        exp_opportunities = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Transform to opportunity domain object' >>
            beam.Map(opportunity_services.
                     get_exploration_opportunity_summary_from_model)
            | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

        user_stats_results = (
            {
                'suggestion': suggestions_grouped_by_target,
                'opportunity': exp_opportunities
            }
            | 'Merge models' >> beam.CoGroupByKey()
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None)))

        user_stats_models = (
            user_stats_results
            | 'Filter ok results' >>
            beam.Filter(lambda key_and_result: key_and_result[1].is_ok())
            | 'Unpack result' >> beam.MapTuple(lambda key, result:
                                               (key, result.unwrap()))
            | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
            | 'Generate models from stats' >> beam.MapTuple(
                self._generate_translation_contribution_model))

        user_stats_error_job_run_results = (
            user_stats_results
            | 'Filter err results' >>
            beam.Filter(lambda key_and_result: key_and_result[1].is_err())
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Remove keys' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Transform result to job run result' >>
            (job_result_transforms.ResultsToJobRunResults()))

        unused_put_result = (
            user_stats_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        user_stats_models_job_run_results = (
            user_stats_models
            | 'Create job run result' >>
            (job_result_transforms.CountObjectsToJobRunResult()))

        return ((user_stats_error_job_run_results,
                 user_stats_models_job_run_results)
                | 'Merge job run results' >> beam.Flatten())
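The ok/err branching above relies on a Result-style value exposing is_ok(), is_err() and unwrap(). A minimal sketch of the same split, assuming the third-party result package's Ok and Err as stand-ins for whatever Result type the job actually uses:

import apache_beam as beam
from result import Ok, Err  # assumed stand-in Result implementation

with beam.Pipeline() as p:
    results = p | beam.Create([
        ('user1', Ok(3)),
        ('user2', Err('missing opportunity')),
    ])

    _ = (results
         | 'Filter ok results' >> beam.Filter(
             lambda key_and_result: key_and_result[1].is_ok())
         | 'Unpack result' >> beam.MapTuple(
             lambda key, result: (key, result.unwrap()))
         | 'Print ok' >> beam.Map(print))     # ('user1', 3)

    _ = (results
         | 'Filter err results' >> beam.Filter(
             lambda key_and_result: key_and_result[1].is_err())
         | 'Print err' >> beam.Map(print))    # ('user2', Err(...))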
Example #19
def pipeline(root):
    """Beam pipeline.

  Args:
    root: the root of the pipeline.
  """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
         | 'ConflictsCSVFormat' >> beam.Map(csv_format)
         | 'ConflictsReshuffle' >> beam.Reshuffle()
         | 'WriteConflictsCSV' >> beam.io.WriteToText(
             FLAGS.output_stem + '_conflicts',
             header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
             num_shards=1,
             file_name_suffix='.csv'))

    cleaned_conformers = (merged_conformers
                          |
                          'CleanUpConformers' >> beam.Map(clean_up_conformer))

    # Get the bond length distributions
    bond_length_dists_pcoll = (
        cleaned_conformers
        | 'ExtractBondLengths' >> beam.FlatMap(
            extract_bond_lengths,
            dist_sig_digits=_BOND_LENGTHS_SIG_DIGITS,
            unbonded_max=_BOND_LENGTHS_UNBONDED_MAX)
        | 'CountBondLengths' >> beam.combiners.Count.PerElement()
        | 'ToListBondLengths' >> beam.combiners.ToList())
    _ = (bond_length_dists_pcoll
         | 'WriteBondLengths' >> beam.ParDo(
             write_bond_lengths,
             filename=f'{FLAGS.output_stem}_bond_lengths.csv'))

    # Get the SMILES to id mapping needed for UpdateConformerFn
    smiles_id_pcoll = (
        root
        | 'BTInputForSmiles' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateSmilesToID' >> beam.FlatMap(smiles_to_id))
    smiles_id_dict = beam.pvalue.AsDict(smiles_id_pcoll)

    # Various per conformer processing
    update_results = (cleaned_conformers
                      | 'UpdateConformers' >> beam.ParDo(
                          UpdateConformerFn(),
                          beam.pvalue.AsSingleton(bond_length_dists_pcoll),
                          smiles_id_dict).with_outputs(
                              UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH,
                              main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header=
            'conformer_id,compare,smiles_given,smiles_with_h,smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >>
        beam.FlatMap(generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts write to a file
    _ = (final_conformers
         | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
         | 'CountStats' >> beam.combiners.Count.PerElement()
         | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
         | 'WriteStatsCSV' >> beam.io.WriteToText(
             FLAGS.output_stem + '_stats',
             header='primary_key,secondary_key,count',
             num_shards=1,
             file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >>
        beam.FlatMap(bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        |
        'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = ((bare_bt_summaries, real_bt_summaries)
         | 'FlattenAllBTSummaries' >> beam.Flatten()
         | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (final_conformers
                           |
                           'MakeComplete' >> beam.Map(make_complete_conformer))

    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as binary protobuf in TFRecord.
    for id_str, collection in [['complete', complete_conformers],
                               ['standard', standard_conformers]]:
        _ = (collection
             | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
             |
             ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
                 f'{FLAGS.output_stem}_{id_str}_tfrecord',
                 coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
                 num_shards=FLAGS.output_shards))
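MergeConformersFn and UpdateConformerFn above route elements to a main output plus a tagged conflict or mismatch output via with_outputs. A minimal, generic sketch of that multi-output ParDo pattern with assumed sample data:

import apache_beam as beam
from apache_beam import pvalue

class SplitEvenOddFn(beam.DoFn):
    OUTPUT_TAG_ODD = 'odd'

    def process(self, element):
        if element % 2:
            # Route odd numbers to the tagged side output.
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_ODD, element)
        else:
            yield element  # even numbers go to the main output

with beam.Pipeline() as p:
    results = (p
               | beam.Create([1, 2, 3, 4, 5])
               | beam.ParDo(SplitEvenOddFn()).with_outputs(
                   SplitEvenOddFn.OUTPUT_TAG_ODD, main='evens'))
    _ = results['evens'] | 'PrintEvens' >> beam.Map(print)
    _ = results[SplitEvenOddFn.OUTPUT_TAG_ODD] | 'PrintOdds' >> beam.Map(print)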
Example #20
def run():
    opts = PipelineOptions()
    with beam.Pipeline(options=opts) as p:

        opts = opts.view_as(WeatherPipelineOptions)

        stations = load_dict(opts.input_stationlist,
                             parse_csv_kv_row(int, str, 6, 2))

        weather_entries, weather_badrows = (
            p
            | "Read weather entries" >> beam.io.ReadFromText(
                os.path.join(opts.input_weather_dir, "*.csv"))
            | "Parse weather" >> beam.ParDo(ParseFn(stations)).with_outputs(
                "parsed", "invalid"))

        # windowed_weather_entries = (weather_entries | "Add timestamp" >> beam.ParDo(EntryAddTimestampFn())
        #                             | beam.WindowInto(window.FixedWindows(24*60*60)))

        _ = weather_badrows | "Log invalid weather rows" >> beam.io.WriteToText(
            os.path.join(opts.outputdir, "invalid_input_rows"))

        def any(seq: typing.Iterable[bool]):
            result = False
            for x in seq:
                result = result or x
            return result

        def process_weather_entries(el):
            key = el[0]
            weather = el[1]
            return beam.Row(
                country_code=key.country_code,
                obsdate=key.obsdate,
                temp=mean([x.temp for x in weather if x.temp < 9999.9]),
                windspeed=mean(
                    [x.windspeed for x in weather if x.windspeed < 999.9]),
                tornadoes=any([x.tornado_or_funnel for x in weather]),
            )

        weather_by_country_by_day = (weather_entries
                                     | beam.GroupBy("country_code", "obsdate")
                                     | beam.Map(process_weather_entries))

        weather_by_country_by_year = weather_by_country_by_day | beam.GroupBy(
            "country_code", year=lambda x: x.obsdate.year)

        averages = weather_by_country_by_year | beam.MapTuple(lambda k, v: (
            k.year,
            beam.Row(
                country=k.country_code,
                temp=mean([y.temp for y in v]),
                windspeed=mean([y.windspeed for y in v]),
            ),
        ))

        hottest = (
            averages
            | beam.Filter(lambda x: not numpy.isnan(x[1].temp))
            |
            "Top hottest" >> beam.combiners.Top.PerKey(1, key=lambda x: x.temp)
            | beam.Map(str)
            | "Write hottest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "hottest")))

        second_windiest = (
            averages
            | beam.Filter(lambda x: not numpy.isnan(x[1].windspeed))
            | "Top two windiest" >> beam.combiners.Top.PerKey(
                2, key=lambda x: x.windspeed)
            | beam.MapTuple(lambda k, v: (k, v[1]))
            | "Write second windiest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "second_windiest")))

        def calc_max_consecutive_tornado_days(el):
            key, entries = el
            entries = sorted(entries, key=lambda x: x.obsdate, reverse=True)
            return (
                key.year,
                beam.Row(
                    country_code=key.country_code,
                    max_consecutive_tornado_days=max_consec_sequence_len(
                        [x.tornadoes for x in entries], True),
                ),
            )

        max_consec_tornado_days = (
            weather_by_country_by_year
            | beam.Map(calc_max_consecutive_tornado_days)
            | "Top consec days tornado" >> beam.combiners.Top.PerKey(
                1, key=lambda x: x.max_consecutive_tornado_days)
            | "Write consecutively tornadiest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "consecutively_tornadiest")))
Example #21
def pipeline(root):
    """Beam pipeline.

  Args:
    root: the root of the pipeline.
  """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
         | 'ConflictsCSVFormat' >> beam.Map(csv_format)
         | 'ConflictsReshuffle' >> beam.Reshuffle()
         | 'WriteConflictsCSV' >> beam.io.WriteToText(
             FLAGS.output_stem + '_conflicts',
             header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
             num_shards=1,
             file_name_suffix='.csv'))

    # Various per conformer processing
    update_results = (
        merged_conformers
        | 'UpdateConformers' >> beam.ParDo(UpdateConformerFn()).with_outputs(
            UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH, main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header=
            'conformer_id,compare,smiles_given,smiles_with_h,smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >>
        beam.FlatMap(generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts write to a file
    _ = (final_conformers
         | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
         | 'CountStats' >> beam.combiners.Count.PerElement()
         | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
         | 'WriteStatsCSV' >> beam.io.WriteToText(
             FLAGS.output_stem + '_stats',
             header='primary_key,secondary_key,count',
             num_shards=1,
             file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >>
        beam.FlatMap(bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        |
        'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = ((bare_bt_summaries, real_bt_summaries)
         | 'FlattenAllBTSummaries' >> beam.Flatten()
         | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (final_conformers
                           |
                           'MakeComplete' >> beam.Map(make_complete_conformer))

    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as JSON.
    # Bit of a hack here: the slowest part of the whole pipeline is writing out
    # the JSON for the complete conformers. So we just hard code a tripling of the
    # shards to get more parallelism.
    for id_str, collection, num_shards in [[
            'complete', complete_conformers, FLAGS.output_shards * 3
    ], ['standard', standard_conformers, FLAGS.output_shards]]:
        _ = (collection
             | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
             | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
             | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
                 f'{FLAGS.output_stem}_{id_str}_json',
                 num_shards=num_shards,
                 file_name_suffix='.json.gz'))
Example #22
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path',
                        required=True,
                        help='Path to events.json')
    parser.add_argument('--table_name',
                        required=True,
                        help='BigQuery table name')

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-user-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {
                "name": "user_id",
                "type": "STRING"
            },
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "total_bytes",
                "type": "INTEGER"
            },
            {
                "name": "max_bytes",
                "type": "INTEGER"
            },
            {
                "name": "min_bytes",
                "type": "INTEGER"
            },
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
     | 'PerUserAggregations' >> beam.GroupBy('user_id').aggregate_field(
         'user_id', CountCombineFn(), 'page_views').aggregate_field(
             'num_bytes', sum, 'total_bytes').aggregate_field(
                 'num_bytes', max, 'max_bytes').aggregate_field(
                     'num_bytes', min,
                     'min_bytes').with_output_types(PerUserAggregation)
     | 'ToDict' >> beam.Map(to_dict)
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
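CommonLog, PerUserAggregation, parse_json and to_dict are defined elsewhere in the original script. Hypothetical minimal versions consistent with how they are used above (covering only the fields the aggregations actually touch) might look like this:

import json
import typing

class CommonLog(typing.NamedTuple):
    user_id: str
    num_bytes: int

class PerUserAggregation(typing.NamedTuple):
    user_id: str
    page_views: int
    total_bytes: int
    max_bytes: int
    min_bytes: int

def parse_json(line: str) -> CommonLog:
    row = json.loads(line)
    return CommonLog(user_id=row['user_id'], num_bytes=int(row['num_bytes']))

def to_dict(row) -> dict:
    # Convert the aggregated schema row into the plain dict that
    # WriteToBigQuery expects.
    return {
        'user_id': row.user_id,
        'page_views': row.page_views,
        'total_bytes': row.total_bytes,
        'max_bytes': row.max_bytes,
        'min_bytes': row.min_bytes,
    }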
Example #23
def reduce_node(k: Callable[[E], _K], reducer: Callable[[List[E]],
                                                        Iterable[E]]) -> Tfm1:
    return lambda pcoll: (pcoll
                          | beam.GroupBy(k)
                          | beam.CombineValues(reducer)
                          | beam.FlatMap(lambda e: e[1]))