def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Generates the translation contribution stats.

    Returns:
        PCollection. A PCollection of 'SUCCESS x' results, where x is
        the number of generated stats.
    """
    suggestions_grouped_by_target = (
        self.pipeline
        | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
            suggestion_models.GeneralSuggestionModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Filter translate suggestions' >> beam.Filter(lambda m: (
            m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
        | 'Transform to suggestion domain object' >> beam.Map(
            suggestion_services.get_suggestion_from_model)
        | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
    exp_opportunities = (
        self.pipeline
        | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
            opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Transform to opportunity domain object' >> beam.Map(
            opportunity_services
            .get_exploration_opportunity_summary_from_model)
        | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

    new_user_stats_models = (
        {
            'suggestion': suggestions_grouped_by_target,
            'opportunity': exp_opportunities
        }
        | 'Merge models' >> beam.CoGroupByKey()
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
            x['suggestion'][0] if len(x['suggestion']) else [],
            list(x['opportunity'][0])[0]
            if len(x['opportunity']) else None))
        | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
        | 'Generate models from stats' >> beam.MapTuple(
            self._generate_translation_contribution_model))

    unused_put_result = (
        new_user_stats_models
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        new_user_stats_models
        | 'Count all new models' >> (
            beam.combiners.Count.Globally().without_defaults())
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS %s' % x)))
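# A minimal, self-contained sketch (not part of the job above, data invented)
# of the shape that beam.CoGroupByKey() produces in the 'Merge models' step:
# one element per key, whose value is a dict mapping each input label to a
# list of that input's grouped values. Because each GroupBy output contributes
# at most one (key, list) pair per key, the job above can safely index
# x['suggestion'][0].
import apache_beam as beam

with beam.Pipeline() as p:
    suggestions = (
        p | 'MakeSuggestions' >> beam.Create(['exp1.sugg_a', 'exp1.sugg_b'])
        | 'GroupSuggestions' >> beam.GroupBy(lambda s: s.split('.')[0]))
    opportunities = (
        p | 'MakeOpportunities' >> beam.Create(['exp1.opp_a'])
        | 'GroupOpportunities' >> beam.GroupBy(lambda s: s.split('.')[0]))
    _ = (
        {'suggestion': suggestions, 'opportunity': opportunities}
        | beam.CoGroupByKey()
        | beam.Map(print))
    # Prints roughly:
    # ('exp1', {'suggestion': [['exp1.sugg_a', 'exp1.sugg_b']],
    #           'opportunity': [['exp1.opp_a']]})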
def expand(self, pcoll):
    return (
        pcoll
        | 'Extract Fields' >> beam.ParDo(ExtractFieldsFn())
        | 'Group By' >> beam.GroupBy("post_id", "title")
            .aggregate_field(
                lambda row: len(row.tags.split("|")), sum, "tags_count")
        | 'Format Result' >> beam.ParDo(FormatCsvRowFn()))
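# ExtractFieldsFn and FormatCsvRowFn are not shown in the snippet above.
# Plausible minimal stand-ins (assumptions for illustration, not the original
# implementations) would be DoFns that map a parsed post dict to a schema'd
# beam.Row and an aggregated row to a CSV line:
import apache_beam as beam

class ExtractFieldsFn(beam.DoFn):
    """Emits a Row with the fields the GroupBy above relies on."""

    def process(self, element):
        yield beam.Row(
            post_id=element['post_id'],
            title=element['title'],
            tags=element.get('tags', ''))

class FormatCsvRowFn(beam.DoFn):
    """Formats one aggregated row as a CSV line."""

    def process(self, row):
        yield f'{row.post_id},{row.title},{row.tags_count}'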
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True)
    parser.add_argument('--output', dest='output', required=True)
    args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Input: '\t'.join((timestamp, ip, agent, url, referer))
    lines = p | 'Read' >> ReadFromText(args.input)
    requests = lines | 'Split' >> beam.Map(lambda l: l.split('\t'))
    join = requests | 'Group' >> beam.GroupBy(lambda req: req[1])
    requests = join | 'Filter' >> beam.ParDo(FilterFn())
    requests | 'Write' >> WriteToText(args.output)

    result = p.run()
    result.wait_until_finish()
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        query_result = result.metrics().query()
        for counter in query_result['counters']:
            logging.info(f'{counter.key.metric.name}: {counter.result}')
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    user_settings_models = (
        self.pipeline
        | 'Get all UserSettingsModels' >> (
            ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

    old_user_stats_models = (
        self.pipeline
        | 'Get all UserStatsModels' >> (
            ndb_io.GetModels(user_models.UserStatsModel.get_all())))

    # Creates a UserStatsModel for each user that does not have one yet.
    new_user_stats_models = (
        (user_settings_models, old_user_stats_models)
        | 'Merge models' >> beam.Flatten()
        # Returns a PCollection of
        # (model.id, (user_settings_models, user_stats_models)) or
        # (model.id, (user_settings_models,)).
        | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
        # Discards model.id from the PCollection.
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # Only keep groupings that indicate that the UserStatsModel is
        # missing.
        | 'Filter pairs of models' >> beam.Filter(
            lambda models: (
                len(list(models)) == 1 and
                isinstance(list(models)[0], user_models.UserSettingsModel)))
        # Choosing the first element.
        | 'Transform tuples into models' >> beam.Map(
            lambda models: list(models)[0])
        # Creates the missing UserStatsModels.
        | 'Create new user stat models' >> beam.ParDo(CreateUserStatsModel()))

    unused_put_result = (
        (new_user_stats_models, old_user_stats_models)
        | 'Merge new and old models together' >> beam.Flatten()
        | 'Update the dashboard stats' >> beam.ParDo(UpdateWeeklyCreatorStats())
        | 'Put models into the datastore' >> ndb_io.PutModels())

    new_user_stats_job_result = (
        new_user_stats_models
        | 'Count all new models' >> beam.combiners.Count.Globally()
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))

    old_user_stats_job_result = (
        old_user_stats_models
        | 'Count all old models' >> beam.combiners.Count.Globally()
        | 'Only create result for old models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for old models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

    return (
        (new_user_stats_job_result, old_user_stats_job_result)
        | 'Merge new and old results together' >> beam.Flatten())
def run(recognition_provider_name, ingestion_run=None, ingestion_provider=None,
        output_name=None, run_locally=False):
    """Main entry point; defines and runs the image recognition pipeline.

    Input: either an ingestion run id or an ingestion provider id. The input
    is used for querying the database for images ingested by either one of
    the optional inputs.
    """
    _validate_args(
        recognition_provider_name, ingestion_run, ingestion_provider)
    recognition_provider = get_recognition_provider(recognition_provider_name)
    job_name = generate_cloud_dataflow_job_name(
        _PIPELINE_TYPE, recognition_provider.provider_id)
    if run_locally:
        pipeline_options = PipelineOptions()
    else:
        output_name = 'gs://demo-bucket-step/results/outputs'
        pipeline_options = PipelineOptions(
            flags=None,
            runner='DataflowRunner',
            project='step-project-ellispis',
            job_name=job_name,
            temp_location='gs://demo-bucket-step/temp',
            region='europe-west2',
            setup_file='./setup.py',
        )

    try:
        with beam.Pipeline(options=pipeline_options) as pipeline:
            store_pipeline_run(job_name, recognition_provider.provider_id)
            indices_for_batching = (
                pipeline
                | 'create' >> beam.Create(create_query_indices()))
            if ingestion_run:
                dataset = (
                    indices_for_batching
                    | 'get images dataset' >> beam.ParDo(
                        GetBatchedImageDataset(), ingestion_run=ingestion_run))
            else:
                dataset = (
                    indices_for_batching
                    | 'get images dataset' >> beam.ParDo(
                        GetBatchedImageDataset(),
                        ingestion_provider=ingestion_provider))
            dataset_with_url_for_provider = (
                dataset
                | 'add url for labeling' >> beam.ParDo(
                    recognition_provider.add_url_for_recognition_api))
            filtered_dataset = (
                dataset_with_url_for_provider
                | 'filter images' >> beam.Filter(
                    recognition_provider.is_eligible))
            images_batch = (
                filtered_dataset
                | 'combine to batches' >> beam.GroupBy(
                    lambda doc: int(doc['random'] * 100))
                | beam.ParDo(lambda element: [element[1]]))

            # Labels the images by the process method of the provider.
            labeled_images_batch = (
                images_batch
                | 'label by batch' >> beam.ParDo(recognition_provider))
            labeled_images = (
                labeled_images_batch
                | beam.FlatMap(lambda elements: elements))
            # pylint: disable=expression-not-assigned
            labeled_images | 'store in database' >> beam.ParDo(
                UpdateImageLabelsInDatabase(),
                job_name, recognition_provider.provider_id)

            if output_name:  # For testing.
                def format_result(image, labels):
                    return '%s: %s' % (image['url'], labels)
                output = (
                    labeled_images
                    | 'Format' >> beam.MapTuple(format_result))
                output | 'Write' >> WriteToText(output_name)
        update_pipeline_run_when_succeeded(job_name)
    except:  # Mark the run as failed on any error, then re-raise.
        update_pipeline_run_when_failed(job_name)
        raise
def expand(self, pcoll):
    return (
        pcoll
        | 'Xml2Dict' >> beam.ParDo(ParseXmlToDictFn())
        | 'Extract Fields' >> beam.ParDo(ExtractFieldsFn())
        | 'Group By' >> beam.GroupBy("post_id", "title")
            .aggregate_field(
                lambda row: len(re.findall(r"<.*?>", row.tags)),
                sum, "tags_count")
        | 'Format Result' >> beam.ParDo(FormatCsvRowFn()))
def expand(self, executions):
    return (
        executions
        | beam.Filter(
            lambda execution:
            execution.destination.destination_type == self._destination_type)
        | beam.ParDo(self._get_bq_request_class())
        | beam.io.ReadAllFromBigQuery()
        | beam.GroupBy(lambda x: x['execution_hash'])
        | beam.ParDo(
            self._BatchElements(self._batch_size),
            beam.pvalue.AsList(executions)))
def expand(self, pcoll):
    results = (
        pcoll
        | 'ComputeStatistics' >> beam.GroupBy('loc_id')
            .aggregate_field('low_temp', min, 'record_low')
            .aggregate_field('high_temp', max, 'record_high')
            .aggregate_field('precip', sum, 'total_precip')
        | 'ToJson' >> beam.ParDo(ConvertToJson()))
    return results
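# A minimal, runnable sketch (invented inputs, not from the transform above)
# of the same chained-aggregate_field pattern: each aggregate_field call adds
# one named field to the per-key result row.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([
            beam.Row(loc_id='A', low_temp=3.0, high_temp=12.0, precip=0.5),
            beam.Row(loc_id='A', low_temp=1.0, high_temp=15.0, precip=0.2),
        ])
        | beam.GroupBy('loc_id')
            .aggregate_field('low_temp', min, 'record_low')
            .aggregate_field('high_temp', max, 'record_high')
            .aggregate_field('precip', sum, 'total_precip')
        # Emits one row per loc_id with fields loc_id, record_low,
        # record_high and total_precip.
        | beam.Map(print))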
def SampleValuesBy(pcoll, key_columns):
    """Group sample_values by key_columns.

    E.g. with this pcollection:
        col1=A col2=B col3=C sample_values=[1]
        col1=A col2=B col3=Z sample_values=[2, 3]

        p | SampleValuesBy(['col1', 'col2'])

    emits:
        col1=A col2=B all_sample_values=[1, 2, 3]
    """
    return (
        pcoll
        | beam.GroupBy(*key_columns).force_tuple_keys().aggregate_field(
            'sample_values', ConcatListCombineFn(), 'all_sample_values'))
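# ConcatListCombineFn is a project-specific helper that is not shown here. A
# plausible minimal implementation (an assumption for illustration, not the
# project's actual code) is a CombineFn that concatenates the per-row lists:
import apache_beam as beam

class ConcatListCombineFn(beam.CombineFn):
    """Concatenates list-valued inputs into a single list."""

    def create_accumulator(self):
        return []

    def add_input(self, accumulator, element):
        accumulator.extend(element)
        return accumulator

    def merge_accumulators(self, accumulators):
        merged = []
        for acc in accumulators:
            merged.extend(acc)
        return merged

    def extract_output(self, accumulator):
        return accumulator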
def test_global_aggregate(self):
    # [START global_aggregate]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy()
                .aggregate_field('unit_price', min, 'min_price')
                .aggregate_field('unit_price', MeanCombineFn(), 'mean_price')
                .aggregate_field('unit_price', max, 'max_price'))
        # [END global_aggregate]

        expected = [
            # [START global_aggregate_result]
            NamedTuple(min_price=1.00, mean_price=7 / 3, max_price=4.00),
            # [END global_aggregate_result]
        ]
        assert_that(grouped | beam.Map(normalize), equal_to(expected))
def test_expr_aggregate(self):
    # [START expr_aggregate]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy('recipe')
                .aggregate_field('quantity', sum, 'total_quantity')
                .aggregate_field(
                    lambda x: x.quantity * x.unit_price, sum, 'price'))
        # [END expr_aggregate]

        expected = [
            # [START expr_aggregate_result]
            NamedTuple(recipe='pie', total_quantity=6, price=14.00),
            NamedTuple(recipe='muffin', total_quantity=5, price=7.00),
            # [END expr_aggregate_result]
        ]
        assert_that(grouped | beam.Map(normalize), equal_to(expected))
def test_group_by_attr_expr(self):
    # [START groupby_attr_expr]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy('recipe', is_berry=lambda x: 'berry' in x.fruit))
        # [END groupby_attr_expr]

        expected = [
            # [START groupby_attr_expr_result]
            (NamedTuple(recipe='pie', is_berry=True), [
                beam.Row(recipe='pie', fruit='strawberry', quantity=3,
                         unit_price=1.50),
                beam.Row(recipe='pie', fruit='raspberry', quantity=1,
                         unit_price=3.50),
                beam.Row(recipe='pie', fruit='blackberry', quantity=1,
                         unit_price=4.00),
                beam.Row(recipe='pie', fruit='blueberry', quantity=1,
                         unit_price=2.00),
            ]),
            (NamedTuple(recipe='muffin', is_berry=True), [
                beam.Row(recipe='muffin', fruit='blueberry', quantity=2,
                         unit_price=2.00),
            ]),
            (NamedTuple(recipe='muffin', is_berry=False), [
                beam.Row(recipe='muffin', fruit='banana', quantity=3,
                         unit_price=1.00),
            ]),
            # [END groupby_attr_expr_result]
        ]
        assert_that(grouped | beam.MapTuple(normalize_kv), equal_to(expected))
def test_simple_aggregate(self):
    # [START simple_aggregate]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(GROCERY_LIST)
            | beam.GroupBy('fruit')
                .aggregate_field('quantity', sum, 'total_quantity'))
        # [END simple_aggregate]

        expected = [
            # [START simple_aggregate_result]
            NamedTuple(fruit='strawberry', total_quantity=3),
            NamedTuple(fruit='raspberry', total_quantity=1),
            NamedTuple(fruit='blackberry', total_quantity=1),
            NamedTuple(fruit='blueberry', total_quantity=3),
            NamedTuple(fruit='banana', total_quantity=3),
            # [END simple_aggregate_result]
        ]
        assert_that(grouped | beam.Map(normalize), equal_to(expected))
def test_groupby_expr(self):
    # [START groupby_expr]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(
                ['strawberry', 'raspberry', 'blueberry', 'blackberry',
                 'banana'])
            | beam.GroupBy(lambda s: s[0]))
        # [END groupby_expr]

        assert_that(
            grouped | beam.MapTuple(normalize_kv),
            equal_to([
                # [START groupby_expr_result]
                ('s', ['strawberry']),
                ('r', ['raspberry']),
                ('b', ['banana', 'blackberry', 'blueberry']),
                # [END groupby_expr_result]
            ]))
def test_groupby_two_exprs(self):
    # [START groupby_two_exprs]
    with beam.Pipeline() as p:
        grouped = (
            p
            | beam.Create(
                ['strawberry', 'raspberry', 'blueberry', 'blackberry',
                 'banana'])
            | beam.GroupBy(
                letter=lambda s: s[0], is_berry=lambda s: 'berry' in s))
        # [END groupby_two_exprs]

        expected = [
            # [START groupby_two_exprs_result]
            (NamedTuple(letter='s', is_berry=True), ['strawberry']),
            (NamedTuple(letter='r', is_berry=True), ['raspberry']),
            (NamedTuple(letter='b', is_berry=True),
             ['blackberry', 'blueberry']),
            (NamedTuple(letter='b', is_berry=False), ['banana']),
            # [END groupby_two_exprs_result]
        ]
        assert_that(grouped | beam.MapTuple(normalize_kv), equal_to(expected))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
    generating SkillOpportunityModel.

    Returns:
        PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
        generating SkillOpportunityModel.
    """
    question_skill_link_models = (
        self.pipeline
        | 'Get all non-deleted QuestionSkillLinkModels' >> (
            ndb_io.GetModels(
                question_models.QuestionSkillLinkModel.get_all(
                    include_deleted=False)))
        | 'Group QuestionSkillLinkModels by skill ID' >> beam.GroupBy(
            lambda n: n.skill_id))

    skills = (
        self.pipeline
        | 'Get all non-deleted SkillModels' >> (
            ndb_io.GetModels(
                skill_models.SkillModel.get_all(include_deleted=False)))
        | 'Get skill object from model' >> beam.Map(
            skill_fetchers.get_skill_from_model)
        | 'Group skill objects by skill ID' >> beam.GroupBy(lambda m: m.id))

    skills_with_question_counts = (
        {
            'skill': skills,
            'question_skill_links': question_skill_link_models
        }
        | 'Merge by skill ID' >> beam.CoGroupByKey()
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Remove skill IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # We are using itertools.chain.from_iterable to flatten
        # question_skill_links from a 2D list into a 1D list.
        | 'Flatten skill and question_skill_links' >> beam.Map(
            lambda object: {
                'skill': list(object['skill'][0])[0],
                'question_skill_links': list(
                    itertools.chain.from_iterable(
                        object['question_skill_links']))
            }))

    opportunities_results = (
        skills_with_question_counts
        | beam.Map(
            lambda object: self._create_skill_opportunity_model(
                object['skill'], object['question_skill_links'])))

    unused_put_result = (
        opportunities_results
        | 'Filter the results with OK status' >> beam.Filter(
            lambda result: result.is_ok())
        | 'Fetch the models to be put' >> beam.Map(
            lambda result: result.unwrap())
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        opportunities_results
        | 'Transform Results to JobRunResults' >> (
            job_result_transforms.ResultsToJobRunResults()))
}  # end bigquery schema

# with beam.Pipeline(argv=sys.argv) as p:
parsed_csv = (
    p
    | 'Readfile' >> beam.io.ReadFromText(input_filename)
    | 'Parsefile' >> beam.Map(parse_file)
    | 'DefineSchema' >> beam.Map(lambda x: beam.Row(
        fecha=str(x[0].strip()),
        fruta=str(x[1].strip()),
        cantidad=int(x[2].strip())))
    # | 'SQLTransform' >> SqlTransform("""
    #       SELECT
    #           fruta,
    #           COUNT(fruta) AS Cuenta
    #       FROM PCOLLECTION
    #       GROUP BY fruta""")
    | 'Groupby' >> beam.GroupBy('fruta').aggregate_field(
        lambda x: 1 if x.fruta else 0, sum, 'Cuenta')
    | 'print' >> beam.FlatMap(print_row)
    # | 'write' >> beam.io.WriteToText(
    #       prefijoSalida, file_name_suffix='.txt',
    #       header='fecha, fruta, cantidad')
    | 'bq_insert' >> beam.io.gcp.bigquery.WriteToBigQuery(
        table='prueba_dataflow',
        dataset='CUSTOMER_EXPERIENCE',
        project='apex-dataway',
        schema=bq_schema,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_APPEND'))

p.run().wait_until_finish()
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Generates the translation contribution stats.

    Returns:
        PCollection. A PCollection of 'SUCCESS x' results, where x is
        the number of generated stats.
    """
    suggestions_grouped_by_target = (
        self.pipeline
        | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
            suggestion_models.GeneralSuggestionModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Filter translate suggestions' >> beam.Filter(lambda m: (
            m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
        | 'Transform to suggestion domain object' >> beam.Map(
            suggestion_services.get_suggestion_from_model)
        | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
    exp_opportunities = (
        self.pipeline
        | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
            opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Transform to opportunity domain object' >> beam.Map(
            opportunity_services
            .get_exploration_opportunity_summary_from_model)
        | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

    user_stats_results = (
        {
            'suggestion': suggestions_grouped_by_target,
            'opportunity': exp_opportunities
        }
        | 'Merge models' >> beam.CoGroupByKey()
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
            x['suggestion'][0] if len(x['suggestion']) else [],
            list(x['opportunity'][0])[0]
            if len(x['opportunity']) else None)))

    user_stats_models = (
        user_stats_results
        | 'Filter ok results' >> beam.Filter(
            lambda key_and_result: key_and_result[1].is_ok())
        | 'Unpack result' >> beam.MapTuple(
            lambda key, result: (key, result.unwrap()))
        | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
        | 'Generate models from stats' >> beam.MapTuple(
            self._generate_translation_contribution_model))

    user_stats_error_job_run_results = (
        user_stats_results
        | 'Filter err results' >> beam.Filter(
            lambda key_and_result: key_and_result[1].is_err())
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Remove keys' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Transform result to job run result' >> (
            job_result_transforms.ResultsToJobRunResults()))

    unused_put_result = (
        user_stats_models
        | 'Put models into the datastore' >> ndb_io.PutModels())

    user_stats_models_job_run_results = (
        user_stats_models
        | 'Create job run result' >> (
            job_result_transforms.CountObjectsToJobRunResult()))

    return (
        (user_stats_error_job_run_results,
         user_stats_models_job_run_results)
        | 'Merge job run results' >> beam.Flatten())
def pipeline(root):
    """Beam pipeline.

    Args:
        root: the root of the pipeline.
    """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (
        merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
        | 'ConflictsCSVFormat' >> beam.Map(csv_format)
        | 'ConflictsReshuffle' >> beam.Reshuffle()
        | 'WriteConflictsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_conflicts',
            header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
            num_shards=1,
            file_name_suffix='.csv'))

    cleaned_conformers = (
        merged_conformers
        | 'CleanUpConformers' >> beam.Map(clean_up_conformer))

    # Get the bond length distributions
    bond_length_dists_pcoll = (
        cleaned_conformers
        | 'ExtractBondLengths' >> beam.FlatMap(
            extract_bond_lengths,
            dist_sig_digits=_BOND_LENGTHS_SIG_DIGITS,
            unbonded_max=_BOND_LENGTHS_UNBONDED_MAX)
        | 'CountBondLengths' >> beam.combiners.Count.PerElement()
        | 'ToListBondLengths' >> beam.combiners.ToList())
    _ = (
        bond_length_dists_pcoll
        | 'WriteBondLengths' >> beam.ParDo(
            write_bond_lengths,
            filename=f'{FLAGS.output_stem}_bond_lengths.csv'))

    # Get the SMILES to id mapping needed for UpdateConformerFn
    smiles_id_pcoll = (
        root
        | 'BTInputForSmiles' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateSmilesToID' >> beam.FlatMap(smiles_to_id))
    smiles_id_dict = beam.pvalue.AsDict(smiles_id_pcoll)

    # Various per conformer processing
    update_results = (
        cleaned_conformers
        | 'UpdateConformers' >> beam.ParDo(
            UpdateConformerFn(),
            beam.pvalue.AsSingleton(bond_length_dists_pcoll),
            smiles_id_dict).with_outputs(
                UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH,
                main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header='conformer_id,compare,smiles_given,smiles_with_h,'
            'smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >> beam.FlatMap(
            generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts and write them to a file
    _ = (
        final_conformers
        | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
        | 'CountStats' >> beam.combiners.Count.PerElement()
        | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
        | 'WriteStatsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_stats',
            header='primary_key,secondary_key,count',
            num_shards=1,
            file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >> beam.FlatMap(
            bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = (
        (bare_bt_summaries, real_bt_summaries)
        | 'FlattenAllBTSummaries' >> beam.Flatten()
        | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (
        final_conformers
        | 'MakeComplete' >> beam.Map(make_complete_conformer))
    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as binary protobuf in
    # TFRecord.
    for id_str, collection in [['complete', complete_conformers],
                               ['standard', standard_conformers]]:
        _ = (
            collection
            | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
            | ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
                f'{FLAGS.output_stem}_{id_str}_tfrecord',
                coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
                num_shards=FLAGS.output_shards))
def run():
    opts = PipelineOptions()
    with beam.Pipeline(options=opts) as p:
        opts = opts.view_as(WeatherPipelineOptions)
        stations = load_dict(
            opts.input_stationlist, parse_csv_kv_row(int, str, 6, 2))
        weather_entries, weather_badrows = (
            p
            | "Read weather entries" >> beam.io.ReadFromText(
                os.path.join(opts.input_weather_dir, "*.csv"))
            | "Parse weather" >> beam.ParDo(ParseFn(stations)).with_outputs(
                "parsed", "invalid"))
        # windowed_weather_entries = (
        #     weather_entries
        #     | "Add timestamp" >> beam.ParDo(EntryAddTimestampFn())
        #     | beam.WindowInto(window.FixedWindows(24 * 60 * 60)))
        _ = weather_badrows | "Log invalid weather rows" >> beam.io.WriteToText(
            os.path.join(opts.outputdir, "invalid_input_rows"))

        def any(seq: typing.Iterable[bool]):
            result = False
            for x in seq:
                result = result or x
            return result

        def process_weather_entries(el):
            key = el[0]
            weather = el[1]
            return beam.Row(
                country_code=key.country_code,
                obsdate=key.obsdate,
                temp=mean([x.temp for x in weather if x.temp < 9999.9]),
                windspeed=mean(
                    [x.windspeed for x in weather if x.windspeed < 999.9]),
                tornadoes=any([x.tornado_or_funnel for x in weather]),
            )

        weather_by_country_by_day = (
            weather_entries
            | beam.GroupBy("country_code", "obsdate")
            | beam.Map(process_weather_entries))
        weather_by_country_by_year = weather_by_country_by_day | beam.GroupBy(
            "country_code", year=lambda x: x.obsdate.year)
        averages = weather_by_country_by_year | beam.MapTuple(lambda k, v: (
            k.year,
            beam.Row(
                country=k.country_code,
                temp=mean([y.temp for y in v]),
                windspeed=mean([y.windspeed for y in v]),
            ),
        ))
        hottest = (
            averages
            | beam.Filter(lambda x: not numpy.isnan(x[1].temp))
            | "Top hottest" >> beam.combiners.Top.PerKey(
                1, key=lambda x: x.temp)
            | beam.Map(str)
            | "Write hottest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "hottest")))
        second_windiest = (
            averages
            | beam.Filter(lambda x: not numpy.isnan(x[1].windspeed))
            | "Top two windiest" >> beam.combiners.Top.PerKey(
                2, key=lambda x: x.windspeed)
            | beam.MapTuple(lambda k, v: (k, v[1]))
            | "Write second windiest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "second_windiest")))

        def calc_max_consecutive_tornado_days(el):
            key, entries = el
            entries = sorted(entries, key=lambda x: x.obsdate, reverse=True)
            return (
                key.year,
                beam.Row(
                    country_code=key.country_code,
                    max_consecutive_tornado_days=max_consec_sequence_len(
                        [x.tornadoes for x in entries], True),
                ),
            )

        max_consec_tornado_days = (
            weather_by_country_by_year
            | beam.Map(calc_max_consecutive_tornado_days)
            | "Top consec days tornado" >> beam.combiners.Top.PerKey(
                1, key=lambda x: x.max_consecutive_tornado_days)
            | "Write consecutively tornadiest" >> beam.io.WriteToText(
                os.path.join(opts.outputdir, "consecutively_tornadiest")))
def pipeline(root):
    """Beam pipeline.

    Args:
        root: the root of the pipeline.
    """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (
        merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
        | 'ConflictsCSVFormat' >> beam.Map(csv_format)
        | 'ConflictsReshuffle' >> beam.Reshuffle()
        | 'WriteConflictsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_conflicts',
            header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
            num_shards=1,
            file_name_suffix='.csv'))

    # Various per conformer processing
    update_results = (
        merged_conformers
        | 'UpdateConformers' >> beam.ParDo(UpdateConformerFn()).with_outputs(
            UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH, main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header='conformer_id,compare,smiles_given,smiles_with_h,'
            'smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >> beam.FlatMap(
            generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts and write them to a file
    _ = (
        final_conformers
        | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
        | 'CountStats' >> beam.combiners.Count.PerElement()
        | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
        | 'WriteStatsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_stats',
            header='primary_key,secondary_key,count',
            num_shards=1,
            file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >> beam.FlatMap(
            bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = (
        (bare_bt_summaries, real_bt_summaries)
        | 'FlattenAllBTSummaries' >> beam.Flatten()
        | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (
        final_conformers
        | 'MakeComplete' >> beam.Map(make_complete_conformer))
    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as JSON.
    # Bit of a hack here: the slowest part of the whole pipeline is writing
    # out the JSON for the complete conformers. So we just hard code a
    # tripling of the shards to get more parallelism.
    for id_str, collection, num_shards in [
        ['complete', complete_conformers, FLAGS.output_shards * 3],
        ['standard', standard_conformers, FLAGS.output_shards]]:
        _ = (
            collection
            | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
            | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
            | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
                f'{FLAGS.output_stem}_{id_str}_json',
                num_shards=num_shards,
                file_name_suffix='.json.gz'))
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project', required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region', required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path', required=True,
                        help='Path to events.json')
    parser.add_argument('--table_name', required=True,
                        help='BigQuery table name')
    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-user-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "user_id", "type": "STRING"},
            {"name": "page_views", "type": "INTEGER"},
            {"name": "total_bytes", "type": "INTEGER"},
            {"name": "max_bytes", "type": "INTEGER"},
            {"name": "min_bytes", "type": "INTEGER"},
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p
     | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
     | 'PerUserAggregations' >> beam.GroupBy('user_id')
         .aggregate_field('user_id', CountCombineFn(), 'page_views')
         .aggregate_field('num_bytes', sum, 'total_bytes')
         .aggregate_field('num_bytes', max, 'max_bytes')
         .aggregate_field('num_bytes', min, 'min_bytes')
         .with_output_types(PerUserAggregation)
     | 'ToDict' >> beam.Map(to_dict)
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")
    p.run()
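# CommonLog, PerUserAggregation, parse_json and to_dict are helpers defined
# outside the snippet above. CountCombineFn, however, ships with Beam itself;
# the import presumably used here is:
from apache_beam.transforms.combiners import CountCombineFn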
def reduce_node(
    k: Callable[[E], _K],
    reducer: Callable[[List[E]], Iterable[E]]) -> Tfm1:
    return lambda pcoll: (
        pcoll
        | beam.GroupBy(k)
        | beam.CombineValues(reducer)
        | beam.FlatMap(lambda e: e[1]))
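# A hypothetical usage sketch of reduce_node above (data and reducer invented
# for illustration): keep only the longest word per first letter. Note that
# reduce_node returns a plain callable over a PCollection, so it is applied
# by calling it rather than with the | operator.
import apache_beam as beam

keep_longest_per_letter = reduce_node(
    lambda word: word[0],
    lambda words: [max(words, key=len)])

with beam.Pipeline() as p:
    words = p | beam.Create(['apple', 'avocado', 'banana', 'blueberry'])
    _ = keep_longest_per_letter(words) | beam.Map(print)  # avocado, blueberry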