def _load_data(
        self,
        partitions_using_temp_tables,
        partitions_direct_to_destination,
        load_job_name_pcv,
        schema_mod_job_name_pcv,
        copy_job_name_pcv,
        p,
        step_name):
    """Load data to BigQuery.

    Data is loaded into BigQuery in the following two ways:

    1. Single partition: when a single partition of files is destined to a
       single destination, a single load job is triggered straight into the
       destination table.
    2. Multiple partitions and/or Dynamic Destinations: when several
       partitions target one destination, or Dynamic Destinations are used,
       one load job is triggered per partition/destination. Those jobs load
       into temporary tables, which are later copied to the appropriate
       destination table. This ensures atomicity when only some of the load
       jobs would fail: if any of them fails, copy jobs are not triggered.

    Returns:
      A tuple of two PCollections: the flattened outputs of both load-job
      triggering steps (temp-table loads and direct loads), and the output
      of the copy-job triggering step.
    """
    def trigger_load_jobs(temporary_tables):
        # Both load paths share every setting except the temp-table flag.
        return TriggerLoadJobs(
            schema=self.schema,
            write_disposition=self.write_disposition,
            create_disposition=self.create_disposition,
            test_client=self.test_client,
            temporary_tables=temporary_tables,
            additional_bq_parameters=self.additional_bq_parameters,
            source_format=self._temp_file_format,
            step_name=step_name)

    def wait_for_jobs(impulse_label, wait_label, job_ids_pc):
        # A one-element impulse drives WaitForBQJobs, which receives the job
        # ids as a list side input.
        return (
            p
            | impulse_label >> beam.Create([None])
            | wait_label >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(job_ids_pc)))

    # Load data using temp tables
    temp_load_outputs = (
        partitions_using_temp_tables
        | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
            trigger_load_jobs(temporary_tables=True),
            load_job_name_pcv,
            *self.schema_side_inputs).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))

    temp_tables_load_job_ids_pc = temp_load_outputs['main']
    temp_tables_pc = temp_load_outputs[TriggerLoadJobs.TEMP_TABLES]

    finished_temp_tables_load_jobs_pc = wait_for_jobs(
        "ImpulseMonitorLoadJobs",
        "WaitForTempTableLoadJobs",
        temp_tables_load_job_ids_pc)

    # Schema modification runs off the "temp loads finished" signal.
    update_schema_fn = UpdateDestinationSchema(
        write_disposition=self.write_disposition,
        test_client=self.test_client,
        additional_bq_parameters=self.additional_bq_parameters,
        step_name=step_name)
    schema_mod_job_ids_pc = (
        finished_temp_tables_load_jobs_pc
        | beam.ParDo(update_schema_fn, schema_mod_job_name_pcv))

    finished_schema_mod_jobs_pc = wait_for_jobs(
        "ImpulseMonitorSchemaModJobs",
        "WaitForSchemaModJobs",
        schema_mod_job_ids_pc)

    # Copy jobs take the finished schema-mod jobs as a side input.
    copy_jobs_fn = TriggerCopyJobs(
        create_disposition=self.create_disposition,
        write_disposition=self.write_disposition,
        test_client=self.test_client,
        step_name=step_name)
    destination_copy_job_ids_pc = (
        finished_temp_tables_load_jobs_pc
        | beam.ParDo(
            copy_jobs_fn,
            copy_job_name_pcv,
            pvalue.AsIter(finished_schema_mod_jobs_pc)))

    finished_copy_jobs_pc = wait_for_jobs(
        "ImpulseMonitorCopyJobs",
        "WaitForCopyJobs",
        destination_copy_job_ids_pc)

    # Temp tables are deduplicated (via GroupByKey on the table) and deleted;
    # the finished copy jobs are passed in as a side input of the first step.
    _ = (
        p
        | "RemoveTempTables/Impulse" >> beam.Create([None])
        | "RemoveTempTables/PassTables" >> beam.FlatMap(
            lambda _, unused_copy_jobs, deleting_tables: deleting_tables,
            pvalue.AsIter(finished_copy_jobs_pc),
            pvalue.AsIter(temp_tables_pc))
        | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
        | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
        | "RemoveTempTables/GetTableNames" >> beam.Keys()
        | "RemoveTempTables/Delete" >> beam.ParDo(
            DeleteTablesFn(self.test_client)))

    # Load data directly to destination table
    direct_load_job_ids_pc = (
        partitions_direct_to_destination
        | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
            trigger_load_jobs(temporary_tables=False),
            load_job_name_pcv,
            *self.schema_side_inputs))

    _ = wait_for_jobs(
        "ImpulseMonitorDestinationLoadJobs",
        "WaitForDestinationLoadJobs",
        direct_load_job_ids_pc)

    all_load_job_ids_pc = (
        (temp_tables_load_job_ids_pc, direct_load_job_ids_pc)
        | beam.Flatten())

    return all_load_job_ids_pc, destination_copy_job_ids_pc
def run(bigquery_dataset, bigquery_table, storage_bucket="coffeecircle"):
    """
    Extracts the data from the provided file data.csv with e-commerce sales
    information and loads it into a BigQuery table using Dataflow.

    The input path (gs://coffeecircle/data.csv) and the project
    ('coffeecircle') are hard-coded; the destination table is truncated on
    every run and created if it does not exist. The call blocks until the
    pipeline finishes.

    :param bigquery_dataset: name of the dataset that is going to be used to load the data
    :param bigquery_table: name of the table used to load the data
    :param storage_bucket: name of the bucket used for the Dataflow staging and temp locations
    """
    # DataFlow requires to have the imports inside the function or context where the pipeline
    # definition is even though it goes against Python best practices.
    import csv
    import logging
    import apache_beam as beam
    from apache_beam.io import ReadFromText
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions
    from datetime import datetime

    class Split(beam.DoFn):
        """Parses one CSV line into a dict matching the BigQuery schema."""

        def process(self, element):
            try:
                # Only the first parsed row of the element is used.
                row = next(csv.reader(element.split('\n'), delimiter=','), None)
                if row is None:
                    return None
                return [{
                    'invoice_no': str(row[0]),
                    'stock_code': str(row[1]),
                    'description': str(row[2]),
                    'quantity': int(row[3]),
                    'invoice_date': str(datetime.strptime(row[4], '%m/%d/%Y %H:%M')),
                    'unit_price': float(row[5]),
                    'customer_id': int(row[6]) if row[6] else None,
                    'country': str(row[7])
                }]
            except (csv.Error, IndexError, ValueError) as exc:
                # Malformed rows are skipped (best effort), but recorded through
                # logging together with the offending line instead of print().
                logging.warning('Skipping malformed row %r: %s', element, exc)
                return None

    # Gets BigQuery schema definition. Adjacent literals are used instead of a
    # backslash continuation so no stray whitespace ends up inside the string.
    bigquery_schema = ('invoice_no:STRING,stock_code:STRING,description:STRING,'
                       'quantity:INTEGER,invoice_date:TIMESTAMP,unit_price:FLOAT,'
                       'customer_id:INTEGER,country:STRING')

    # Storage bucket for logs
    storage_bucket = "gs://{}/dataflow".format(storage_bucket)

    # Retrieve the project id and publish it as the module-level PROJECT
    # (kept because other code may read this global).
    global PROJECT
    PROJECT = PipelineOptions().view_as(GoogleCloudOptions).project

    # Create and set your PipelineOptions.
    options = PipelineOptions()

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'coffeecircle'
    # Underscores in the table name are replaced with dashes for the job name.
    google_cloud_options.job_name = "test-" + str(bigquery_table).replace('_', '-')
    google_cloud_options.staging_location = "%s/staging_location" % storage_bucket
    google_cloud_options.temp_location = "%s/temp" % storage_bucket
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = beam.Pipeline(options=options)

    # Transformation and loading steps
    rows = (p
            | ReadFromText('gs://coffeecircle/data.csv', skip_header_lines=1)
            | beam.ParDo(Split()))

    rows | 'Write data into Bigquery' >> beam.io.WriteToBigQuery(
        table=bigquery_table,
        dataset=bigquery_dataset,
        project='coffeecircle',
        schema=bigquery_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    result = p.run()
    result.wait_until_finish()
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline.

    Entities are loaded from the `<project>.<data_input>` dataset and the
    agent-association / sentence-status reference tables from
    `<project>.<reference_input>`. The grouped entities are classified into
    supervision time buckets, metrics are computed from those buckets, and the
    metrics are appended to the BigQuery tables under `output`.

    When `person_filter_ids` is non-empty, a warning is logged and the
    function returns before anything is written to BigQuery.
    """
    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    project = all_pipeline_options['project']
    input_dataset = project + '.' + data_input
    reference_dataset = project + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:

        def load_root_entities(label, root_entity_class, build_related_entities=True):
            # Every entity load shares the dataset, unifying id field, filter set and state code.
            return (p | label >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=root_entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=build_related_entities,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        def read_from_bigquery(label, query):
            # Reference tables are read with standard SQL queries.
            return (p | label >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        # Get StatePersons
        persons = load_root_entities('Load Persons', entities.StatePerson)

        # Get StateIncarcerationPeriods
        incarceration_periods = load_root_entities(
            'Load IncarcerationPeriods', entities.StateIncarcerationPeriod)

        # Get StateSupervisionViolations
        supervision_violations = load_root_entities(
            'Load SupervisionViolations', entities.StateSupervisionViolation)

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = load_root_entities(
            'Load SupervisionViolationResponses', entities.StateSupervisionViolationResponse)

        # Get StateSupervisionSentences
        supervision_sentences = load_root_entities(
            'Load SupervisionSentences', entities.StateSupervisionSentence)

        # Get StateIncarcerationSentences
        incarceration_sentences = load_root_entities(
            'Load IncarcerationSentences', entities.StateIncarcerationSentence)

        # Get StateSupervisionPeriods
        supervision_periods = load_root_entities(
            'Load SupervisionPeriods', entities.StateSupervisionPeriod)

        # Get StateAssessments (the only load that does not build related entities)
        assessments = load_root_entities(
            'Load Assessments', entities.StateAssessment, build_related_entities=False)

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`"

        ssvr_to_agent_associations = read_from_bigquery(
            "Read SSVR to Agent table from BigQuery", ssvr_to_agent_association_query)

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \
                                                        f"supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = read_from_bigquery(
            "Read Supervision Period to Agent table from BigQuery",
            supervision_period_to_agent_association_query)

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`"

            us_mo_sentence_statuses = read_from_bigquery(
                "Read MO sentence status table from BigQuery", us_mo_sentence_status_query)
        else:
            # Runs for other states get an empty collection in place of the MO table.
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentence_inputs = {
            'incarceration_sentences': incarceration_sentences,
            'supervision_sentences': supervision_sentences,
            'sentence_statuses': us_mo_sentence_status_rankings_as_kv,
        }
        sentences_and_statuses = (
            sentence_inputs
            | 'Group sentences to the sentence statuses for that person' >> beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >>
            beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        violation_inputs = {
            'violations': supervision_violations,
            'violation_responses': supervision_violation_responses,
        }
        supervision_violations_and_responses = (
            violation_inputs
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_period_inputs = {
            'incarceration_periods': incarceration_periods,
            'violation_responses': violation_responses_with_hydrated_violations,
        }
        incarceration_periods_and_violation_responses = (
            incarceration_period_inputs
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences
        person_inputs = {
            'person': persons,
            'assessments': assessments,
            'incarceration_periods': incarceration_periods_with_source_violations,
            'supervision_periods': supervision_periods,
            'supervision_sentences': sentences_converted.supervision_sentences,
            'incarceration_sentences': sentences_converted.incarceration_sentences,
            'violation_responses': violation_responses_with_hydrated_violations,
        }
        person_periods_and_sentences = (
            person_inputs
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_periods_and_sentences
            | 'Get SupervisionTimeBuckets' >>
            beam.ParDo(ClassifySupervisionTimeBuckets(),
                       AsDict(ssvr_agent_associations_as_kv),
                       AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >>
            GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(SupervisionMetricWritableDict()).with_outputs(
                'populations', 'revocations', 'successes',
                'successful_sentence_lengths', 'assessment_changes',
                'revocation_analyses', 'revocation_violation_type_analyses'))

        # Write the metrics to the output tables in BigQuery
        populations_table = output + '.supervision_population_metrics'
        revocations_table = output + '.supervision_revocation_metrics'
        successes_table = output + '.supervision_success_metrics'
        successful_sentence_lengths_table = output + '.successful_supervision_sentence_days_served_metrics'
        assessment_changes_table = output + '.terminated_supervision_assessment_score_change_metrics'
        revocation_analysis_table = output + '.supervision_revocation_analysis_metrics'
        revocation_violation_type_analysis_table = output + \
            '.supervision_revocation_violation_type_analysis_metrics'

        # One (tagged output, step label, destination table) entry per metric type.
        metric_writes = [
            (writable_metrics.populations,
             f"Write population metrics to BQ table: {populations_table}",
             populations_table),
            (writable_metrics.revocations,
             f"Write revocation metrics to BQ table: {revocations_table}",
             revocations_table),
            (writable_metrics.successes,
             f"Write success metrics to BQ table: {successes_table}",
             successes_table),
            (writable_metrics.successful_sentence_lengths,
             f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table}",
             successful_sentence_lengths_table),
            (writable_metrics.assessment_changes,
             f"Write assessment change metrics to BQ table: {assessment_changes_table}",
             assessment_changes_table),
            (writable_metrics.revocation_analyses,
             f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}",
             revocation_analysis_table),
            (writable_metrics.revocation_violation_type_analyses,
             f"Write revocation violation type analyses metrics to BQ table: "
             f"{revocation_violation_type_analysis_table}",
             revocation_violation_type_analysis_table),
        ]

        # Tables must already exist (CREATE_NEVER); rows are appended.
        for metrics_pcoll, write_label, destination_table in metric_writes:
            _ = (metrics_pcoll
                 | write_label >>
                 beam.io.WriteToBigQuery(
                     table=destination_table,
                     create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam Pipeline.

    A temporary file with `num_records` lines is written to GCS and read back
    so that one fake record is generated per line. The generated rows are then
    written to whichever sinks the arguments enable (CSV, Avro, Parquet,
    BigQuery). The temporary file is deleted when the function exits, whether
    or not the pipeline succeeded.

    Args:
        argv: list containing the commandline arguments for this call of the
         script.
    """
    # Keeps track if schema was inferred by input or output table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    # The temp blob lives outside this job's directory, so Dataflow will not
    # remove it for us. The try/finally guarantees the manual clean-up even
    # when building or running the pipeline fails.
    try:
        data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                                 input_bq_table=data_args.input_bq_table,
                                 p_null=data_args.p_null,
                                 n_keys=data_args.n_keys,
                                 min_date=data_args.min_date,
                                 max_date=data_args.max_date,
                                 only_pos=data_args.only_pos,
                                 max_int=data_args.max_int,
                                 max_float=data_args.max_float,
                                 float_precision=data_args.float_precision,
                                 write_disp=data_args.write_disp,
                                 key_skew=data_args.key_skew,
                                 primary_key_cols=data_args.primary_key_cols)

        # Initiate the pipeline using the pipeline arguments passed in from the
        # command line. This includes information including where Dataflow should
        # store temp files, and what the project id is and what runner to use.
        p = beam.Pipeline(options=pipeline_options)

        rows = (
            p
            # Read the file we created with num_records newlines.
            | 'Read file with num_records lines' >> beam.io.ReadFromText(
                os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

            # Use our instance of our custom DataGenerator Class to generate 1 fake
            # datum with the appropriate schema for each element in the PCollection
            # created above.
            | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
            | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

        if data_args.primary_key_cols:
            for key in data_args.primary_key_cols.split(','):
                rows |= 'Enforcing primary key: {}'.format(
                    key) >> EnforcePrimaryKeys(key)

        if data_args.csv_schema_order:
            (rows
             | 'Order fields for CSV writing.' >> beam.FlatMap(
                 lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
             | 'Write to GCS' >> beam.io.textio.WriteToText(
                 file_path_prefix=data_args.output_prefix,
                 file_name_suffix='.csv'))

        if data_args.avro_schema_file:
            # Read the schema inside a context manager so the file handle is
            # closed instead of being left to the garbage collector.
            with open(data_args.avro_schema_file, 'rb') as avsc_file:
                avsc = avro.schema.parse(avsc_file.read())
            fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

            (rows
             # Need to convert time stamps from strings to timestamp-micros
             | 'Fix date and time Types for Avro.' >>
             beam.FlatMap(lambda row: fix_record_for_avro(row, avsc))
             | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
                 file_path_prefix=data_args.output_prefix,
                 codec='null',
                 file_name_suffix='.avro',
                 use_fastavro=True,
                 schema=fastavro_avsc))

        if data_args.write_to_parquet:
            with open(data_args.schema_file, 'r') as infile:
                str_schema = json.load(infile)
            pa_schema = get_pyarrow_translated_schema(str_schema)
            (rows
             | 'Fix data and time Types for Parquet.' >>
             beam.FlatMap(lambda row: fix_record_for_parquet(row, str_schema))
             | 'Write to Parquet.' >> beam.io.WriteToParquet(
                 file_path_prefix=data_args.output_prefix,
                 codec='null',
                 file_name_suffix='.parquet',
                 schema=pa_schema))

        if data_args.output_bq_table:
            (rows
             | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
                 # The table name is a required argument for the BigQuery sink.
                 # In this case we use the value passed in from the command
                 # line.
                 data_args.output_bq_table,
                 # No explicit schema is passed when it was inferred.
                 schema=None if schema_inferred else data_gen.get_bq_schema(),
                 # Creates the table in BigQuery if it does not yet exist.
                 create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=data_gen.write_disp,
                 # Use the max recommended batch size.
                 batch_size=500))

        p.run().wait_until_finish()
    finally:
        # Manually clean up of temp_num_records.txt because it will be outside
        # this job's directory and Dataflow will not remove it for us.
        temp_blob.delete()
def run(argv=None):
    """Stream messages from Pub/Sub topics into windowed text files.

    One topic per branch (`--topic_prefix` followed by the branch index) is
    read, every message is parsed as YAML and routed by its 'type' field to
    the 'buy', 'sell' or 'error' output. Each output is flattened across
    branches, put into fixed windows, combined into one string per window and
    written under `--bucket` + '<output>/'.

    Args:
        argv: command line arguments; unknown arguments are forwarded to the
            pipeline options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic_prefix', dest='topic_prefix', default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project),
        '--streaming',
        '--experiments=allow_non_updatable_job'
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    class DiffOutputsFn(beam.DoFn):
        """Tags every parsed message as buy, sell or error."""

        # These tags will be used to tag the outputs of this DoFn.
        OUTPUT_TAG_BUY = 'buy'
        OUTPUT_TAG_SELL = 'sell'
        OUTPUT_TAG_ERROR = 'error'

        def process(self, element):
            # NOTE(security): the payload comes from Pub/Sub, i.e. external
            # input. This used to call yaml.load() without a Loader, which can
            # construct arbitrary Python objects (and is rejected outright by
            # recent PyYAML releases); safe_load only builds plain data types.
            dictionary = yaml.safe_load(element)
            dictionary['timestamp'] = datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            # .get() so that a message without a 'type' key ends up in the
            # error output instead of failing with a KeyError.
            message_type = dictionary.get('type')
            if message_type == 'buy':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_BUY, dictionary)
            elif message_type == 'sell':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_SELL, dictionary)
            else:
                # we don't drop the key here, since we want to know where the mistake was
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_ERROR, dictionary)

    def string_join(elements):
        # Stringify the whole collection and use ';' between dict entries.
        string = str(elements)
        return string.replace('},', '};')

    output_tags = (DiffOutputsFn.OUTPUT_TAG_BUY,
                   DiffOutputsFn.OUTPUT_TAG_SELL,
                   DiffOutputsFn.OUTPUT_TAG_ERROR)

    with beam.Pipeline(options=pipeline_options) as p:
        # We need to make a list for each output type
        outputs_by_tag = {tag: [] for tag in output_tags}

        for branch in range(BRANCHES):
            current_topic = known_args.topic_prefix + str(branch)
            diff_outputs = (
                p
                | "ReadTopic{}".format(branch) >>
                beam.io.ReadFromPubSub(topic=current_topic)
                | "SplitOutputs{}".format(branch) >>
                beam.ParDo(DiffOutputsFn()).with_outputs(*output_tags))

            for tag in output_tags:
                outputs_by_tag[tag].append(diff_outputs[tag])

        # Same sink chain for every output type; the step labels keep their
        # historical names (FlattenBuy, WindowSell, WriteToGCSError, ...).
        for tag in output_tags:
            suffix = tag.capitalize()
            _ = (tuple(outputs_by_tag[tag])
                 | "Flatten{}".format(suffix) >> beam.Flatten()
                 | "Window{}".format(suffix) >>
                 beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
                 | "Combine{}".format(suffix) >>
                 beam.CombineGlobally(string_join).without_defaults()
                 | "WriteToGCS{}".format(suffix) >>
                 WriteToText(file_path_prefix=known_args.bucket + tag + '/'))
def ExtractSliceKeys(extracts: beam.pvalue.PCollection,
                     slice_spec: List[slicer.SingleSliceSpec],
                     eval_config: Optional[config_pb2.EvalConfig] = None,
                     materialize: bool = True) -> beam.pvalue.PCollection:
    """Runs ExtractSliceKeysFn over `extracts`.

    Args:
        extracts: input PCollection.
        slice_spec: slice specs, forwarded to the DoFn as the `slice_spec`
            keyword argument.
        eval_config: optional eval config handed to the DoFn constructor.
        materialize: flag handed to the DoFn constructor.

    Returns:
        The PCollection produced by the ParDo.
    """
    slice_keys_fn = ExtractSliceKeysFn(eval_config, materialize)
    return extracts | beam.ParDo(slice_keys_fn, slice_spec=slice_spec)
def expand(self, deployed_model):
    """Apply the transform.

    Builds a batch prediction request, augments it with the deployed model
    (passed as a singleton side input) and submits it through
    BatchPredictionJobDo.

    Args:
        deployed_model: A PCollection should be the output of DeployVersion, or a
            tuple of (model, version).

    Returns:
        The labelled 'Read Results' ReadFromText transform over the output uri.
        NOTE(review): the transform is returned without being applied to the
        pipeline or to the prediction step's output - confirm callers expect
        this rather than a PCollection of results.

    Raises:
        ValueError: If neither a temp location nor a staging location is set.
    """
    pipeline = deployed_model.pipeline

    # For the job name use a combination of the transform label and a
    # datestamp. The datestamp is intended to make it unique.
    now = datetime.datetime.now()
    # We add some salt to the job name to avoid collisions if we try to submit
    # multiple jobs at the same time.
    # N.B. The job_name is fixed at pipeline construction time. This is
    # critical because multiple invocation of the Train transform (e.g. because
    # of retries) need to use the same job name.
    salt = '%04x' % random.getrandbits(4 * 4)

    # TODO(b/28989568): We need to lower case the name because the backend
    # only allows lower case letters for job names. The backend should probably
    # do this automatically but currently it doesn't.
    timestamp = now.strftime('%y%m%d_%H%M%S')
    raw_job_name = '{0}_{1}_{2}'.format(self.label, timestamp, salt)
    job_name = raw_job_name.lower().replace(' ', '_')

    # TODO(b/29163051) Options can be None depending on how the runner was
    # constructed.
    options = pipeline.options
    if options is None:
        options = df_options.PipelineOptions()

    cloud_options = options.view_as(df_options.GoogleCloudOptions)
    project_id = cloud_options.project

    # The temp location wins over the staging location; one of them is required.
    temp_dir = cloud_options.temp_location or cloud_options.staging_location
    if not temp_dir:
        raise ValueError(
            '--staging_location must be specified to run in the cloud')

    # An explicitly configured output uri takes precedence over the default.
    output_uri = self.output_uri or os.path.join(temp_dir, 'prediction_results')
    logging.info('Output uri : %s', output_uri)

    # Construct the batch prediction job.
    prediction_request = ml_func.PredictionJobRequest(
        project_id,
        job_name,
        self.input_uris,
        output_uri,
        self.region,
        self.data_format,
        endpoint=self.cloud_ml_endpoint,
        runtime_version=self.runtime_version)

    augment_args_do = ml_func._AugmentPredictArgsDo()  # pylint: disable=protected-access
    request = (
        pipeline
        | 'PredictRequest' >> beam.Create([prediction_request])
        | 'AugmentPredictArgs' >> beam.ParDo(
            augment_args_do, beam.pvalue.AsSingleton(deployed_model)))

    # Run the batch prediction job
    predict_do = ml_func.BatchPredictionJobDo(api_class=self.api_version)
    unused_prediction_results = (
        request | 'BatchPrediction' >> beam.ParDo(predict_do))

    # Wait until the prediction job is done, then Read the results from the file
    # to which they were written and return.
    return 'Read Results' >> beam.io.ReadFromText(output_uri, validate=False)
def expand(self, pcoll):
    """Run the ProcessTransformLog DoFn over `pcoll` in a step labelled 'ProcessTransformLog'."""
    process_log_step = beam.ParDo(ProcessTransformLog())
    return pcoll | 'ProcessTransformLog' >> process_log_step
def expand(self, pcoll):
    """Run the JoinTable DoFn over `pcoll` in a step labelled 'JoinTable'."""
    join_table_step = beam.ParDo(JoinTable())
    return pcoll | 'JoinTable' >> join_table_step
def process(self, element):
    """
    Prepares each row to be written in the csv.

    `element` is a (key, grouped) pair where `grouped` holds the 'users' and
    'timings' lists; the first entry of each list is used. The formatted line
    is appended to `output_filename` and also returned as a one-element list.
    """
    key = element[0]
    grouped = element[1]
    csv_line = "{},{},{}".format(key, grouped['users'][0], grouped['timings'][0])

    # Side effect: the line is appended to the output file as well.
    with open(output_filename, 'a') as f:
        f.write(csv_line + "\n")

    return [csv_line]


if __name__ == '__main__':
    with beam.Pipeline(options=options) as p:
        # Parse the raw input lines.
        rows = (
            p
            | ReadFromText(input_filename)
            | beam.ParDo(Split()))

        # Mean of the collected timings per key.
        mean_timings = beam.CombineValues(beam.combiners.MeanCombineFn())
        timings = (
            rows
            | beam.ParDo(CollectTimings())
            | "Grouping timings" >> beam.GroupByKey()
            | "Calculating average" >> mean_timings)

        # Number of collected users per key.
        user_counts = beam.CombineValues(beam.combiners.CountCombineFn())
        users = (
            rows
            | beam.ParDo(CollectUsers())
            | "Grouping users" >> beam.GroupByKey()
            | "Counting users" >> user_counts)

        # Join both results by key, format them and write them out.
        grouped_inputs = {'timings': timings, 'users': users}
        to_be_joined = (
            grouped_inputs
            | beam.CoGroupByKey()
            | beam.ParDo(WriteToCSV())
            | WriteToText(output_filename))
'temp_location': BUCKET + '/temp', 'staging_location': BUCKET + '/staging', 'machine_type': 'n1-standard-1', # machine types listed here: https://cloud.google.com/compute/docs/machine-types 'num_workers': 1 } opts = beam.pipeline.PipelineOptions(flags=[], **options) with beam.Pipeline('DataflowRunner', options=opts) as p: takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(beam.io.BigQuerySource(query='SELECT sid, cno, grade FROM college_split.Takes')) class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(beam.io.BigQuerySource(query='SELECT cno FROM college_split.Class')) # write PCollections to log files takes_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'takes_query_results.txt') class_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'class_query_results.txt') # apply ParDo to check cno value's referential integrity norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(NormalizeTakesFn(), beam.pvalue.AsList(class_pcoll)) # write PCollection to log file norm_takes_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'norm_takes_pcoll.txt') qualified_table_name = PROJECT_ID + ':college_normalized.Takes' table_schema = 'sid:STRING,cno:STRING,grade:STRING' # write PCollection to new BQ table norm_takes_pcoll | 'Write BQ table' >> beam.io.Write(beam.io.BigQuerySink(qualified_table_name, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
def examples_wordcount_debugging(renames):
    """DebuggingWordCount example snippets.

    Builds a test pipeline that counts the words of a text file, keeps only the
    words matching a fixed pattern (logging and counting matches and misses
    through custom metrics), asserts on the filtered counts and writes them
    out as formatted text.

    Args:
        renames: passed to ``SnippetUtils.RenameFiles`` when visiting the
            pipeline before it is run.
    """
    import re

    import apache_beam as beam

    # [START example_wordcount_debugging_logging]
    # [START example_wordcount_debugging_aggregators]
    import logging

    class FilterTextFn(beam.DoFn):
        """A DoFn that filters for a specific key based on a regular expression."""

        def __init__(self, pattern):
            self.pattern = pattern
            # A custom metric can track values in your pipeline as it runs. Create
            # custom metrics matched_word and unmatched_words.
            self.matched_words = Metrics.counter(self.__class__, 'matched_words')
            self.umatched_words = Metrics.counter(self.__class__, 'umatched_words')

        def process(self, element):
            word, _ = element
            if re.match(self.pattern, word):
                # Log at INFO level each element we match. When executing this pipeline
                # using the Dataflow service, these log lines will appear in the Cloud
                # Logging UI.
                logging.info('Matched %s', word)

                # Add 1 to the custom metric counter matched_words
                self.matched_words.inc()
                yield element
            else:
                # Log at the "DEBUG" level each element that is not matched. Different
                # log levels can be used to control the verbosity of logging providing
                # an effective mechanism to filter less important information. Note
                # currently only "INFO" and higher level logs are emitted to the Cloud
                # Logger. This log message will not be visible in the Cloud Logger.
                logging.debug('Did not match %s', word)

                # Add 1 to the custom metric counter umatched_words
                self.umatched_words.inc()

    # [END example_wordcount_debugging_logging]
    # [END example_wordcount_debugging_aggregators]

    p = TestPipeline()  # Use TestPipeline for testing.
    filtered_words = (
        p
        | beam.io.ReadFromText(
            'gs://dataflow-samples/shakespeare/kinglear.txt')
        | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
        | beam.combiners.Count.PerElement()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # [START example_wordcount_debugging_assert]
    beam.assert_that(
        filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)]))
    # [END example_wordcount_debugging_assert]

    # Tuple-parameter unpacking in a lambda signature (``lambda (word, c): ...``)
    # is a SyntaxError on Python 3, so the (word, count) pair is indexed instead.
    output = (
        filtered_words
        | 'format' >> beam.Map(
            lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
        | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions,
    PipelineOptions,
    StandardOptions,
)


def printer(data_item):
    """Print a single pipeline element to stdout."""
    print(data_item)


# Pipeline configuration: project on the Google Cloud view, runner and
# streaming mode on the standard view.
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'first-gcp-wordcount'
# google_cloud_options.job_name = 'myjob'
# google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
# google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
standard_options = options.view_as(StandardOptions)
standard_options.runner = 'Directrunner'
standard_options.streaming = True

p = beam.Pipeline(options=options)

# Read raw messages from the Pub/Sub subscription and print each one.
lines = (
    p
    | "read data from subscription :" >> beam.io.ReadFromPubSub(
        subscription="projects/first-gcp-wordcount/subscriptions/subscribe_test_twitter"
    ).with_output_types(bytes)
    # | "map the message :" >> beam.Map(lambda x:p)
    | "print the value :" >> beam.ParDo(printer)
    # | "makedataframe :" >> beam.ParDo(dataframe_val)
    # | "print the value 2:">> beam.ParDo(printer)
)

# Block until the pipeline finishes.
p.run().wait_until_finish()
import datetime

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from config import Config
from data.utils import dataloader

if __name__ == "__main__":
    # A timestamp suffix keeps the job name distinct between launches.
    launch_stamp = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")

    options = PipelineOptions(
        runner=Config.RUNNER,
        project=Config.PROJECT_ID,
        job_name='generate-tfrecords' + launch_stamp,
        temp_location='gs://raw_data_layer/temp',
        region='us-central1',
        setup_file="./setup.py")

    # Read the source TFRecords, parse and convert each record, and write the
    # converted examples back out as TFRecords.
    with beam.Pipeline(options=options) as pipeline:
        content = (
            pipeline
            | "create data" >> beam.io.ReadFromTFRecord(
                file_pattern="/Volumes/STEF-EXT/object_detection/kitti/kitti/3.2.0/kitti-train.tfrecord*")
            | "parse tfds examples" >> beam.ParDo(dataloader.ParseExample())
            | "create tf examples" >> beam.ParDo(dataloader.ConvertToExample())
            | "write to TFRecords" >> beam.io.WriteToTFRecord(
                file_path_prefix=Config.LABELS_TFRECORD))
def test_wordcount(self):
    """Run a batch word count on the interactive runner and verify every result view.

    The counts are checked three ways: as plain values from the pipeline
    result, as a DataFrame collected with window info, and as reified
    ``WindowedValue`` objects.
    """

    class WordExtractingDoFn(beam.DoFn):
        def process(self, element):
            # Emit the whitespace-separated words of the stripped line.
            return element.strip().split()

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    self.assertSetEqual(
        set(actual),
        {
            ('or', 1),
            ('that', 1),
            ('be', 2),
            ('is', 1),
            ('question', 1),
            ('to', 2),
            ('the', 1),
            ('not', 1),
        })

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
    df_counts = ib.collect(counts, include_window_info=True)
    df_expected = pd.DataFrame(
        {
            0: [e[0] for e in actual],
            1: [e[1] for e in actual],
            'event_time': [end_of_window for _ in actual],
            'windows': [[GlobalWindow()] for _ in actual],
            'pane_info': [
                PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                for _ in actual
            ],
        },
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])

    pd.testing.assert_frame_equal(df_expected, df_counts)

    # The reified view wraps every value with its timestamp, window and pane.
    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = [
        WindowedValue(
            e,
            Timestamp(micros=end_of_window),
            [GlobalWindow()],
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
        for e in actual
    ]
    self.assertEqual(actual_reified, expected_reified)
} return [total_age/age_adjusted_rate.length(), total_crude/crude_rate.length()] PROJECT_ID = os.environ['dogwood-outcome-231223'] # Project ID is required when using the BQ source options = { 'project': PROJECT_ID } opts = beam.pipeline.PipelineOptions(flags=[], **options) # Create beam pipeline using local runner with beam.Pipeline('DirectRunner', options=opts) as p: query = p | 'Read Query' >> beam.io.Read(beam.io.BigQuerySource(query='SELECT crude_rate, population,age_adjusted_rate, area FROM cancer_stats.Cancer_By_Area_Incidence LIMIT 1000')) formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(AvgCancerStat()) # write PCollections to log files p | 'Write log' >> WriteToText('input.txt') # write PCollection to log file formatted_dob_pcoll | 'Write log 2' >> WriteToText('output.txt') # write PCollection to new BQ table norm_takes_pcoll | 'Write BQ table' >> beam.io.Write(beam.io.BigQuerySink(AvgCancerArea, schema= 'area:STRING, avg_crude:FLOAT,population:INT,avg_age_adjusted_Rate:FLOAT', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
else: valenc = bytes(val) row.set_cell('cf1',colname.encode("utf-8"),valenc, datetime.now()) rows.append(row) table.mutate_rows(rows) except: logging.error("Failed with input: ", str(element)) raise options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.job_name = 'beamwordcount' google_cloud_options.staging_location = 'gs://beamgcdeveloperwipro/staging' google_cloud_options.temp_location = 'gs://beamgcdeveloperwipro/temp' options.view_as(StandardOptions).runner = 'DataflowRunner' avro_input = options.view_as(MyOptions).avro_input json_input = options.view_as(MyOptions).json_input project_id = google_cloud_options.project p = beam.Pipeline(options=options) lines_avro = p | "ReadAvroFromGCS" >> beam.io.avroio.ReadFromAvro(avro_input) lines_text = p | "ReadJsonFromGCS" >> beam.io.ReadFromText(json_input) lines_avro | "CreateHbaseRowsFromAvro" >> beam.ParDo(CreateHbaseRow(project_id, 'mybigtable','customer')) lines_json = lines_text | "ConvertToJson" >> beam.ParDo(ConvertToJson()) lines_json | "CreateHbaseRowsFromJson" >> beam.ParDo(CreateHbaseRow(project_id, 'mybigtable','customerfromjson')) p.run()
for dim, axis in item.items(): label = (title[dim].values[0]) d = { u'fbkey': '{}'.format(element['fbkeyl2']), 'axisDim': int(dim), 'axisOrder': int(idx), 'axisValue': float(axis), 'axisTitle': u'{}'.format(label) } yield json.dumps(d) except Exception as e: beam.pvalue.TaggedOutput('exception', element['physical_measurement']) if __name__ == '__main__': with beam.Pipeline(options=options) as pipeline: data, log = ( pipeline | beam.io.ReadFromText(infile, coder=JsonCoder()) | beam.Filter(lambda row: all( [row['content'] != 'notParse', row['type'] == 'measurement'])) # | beam.Map(lambda e : (e['content'],e['physical_measurement'])) | 'Print Results' >> beam.ParDo(DimTrans()).with_outputs( 'exception', main='data')) data | beam.io.WriteToText(outfile) log | 'exception' >> beam.io.WriteToText('log file.txt') pipeline.run()
def run(argv=None, save_main_session=True):
    """Runs the workflow.

    Builds a pipeline of synthetic reads followed by one synthetic step per
    entry of ``known_args.steps``. Between consecutive steps the configured
    barrier is applied to every PCollection. If an output location is given
    and exactly one PCollection remains, it is formatted and written as text.

    Args:
        argv: command-line arguments handed to ``parse_args``.
        save_main_session: value stored on ``SetupOptions.save_main_session``.
    """
    known_args, pipeline_args = parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    input_info = known_args.input

    with TestPipeline(options=pipeline_options) as p:
        source = SyntheticSource(input_info)

        # pylint: disable=expression-not-assigned
        barrier = known_args.barrier

        # Merging barriers halve the number of PCollections at every step, so
        # they start from 2**(steps - 1) roots; all other barriers use one.
        merging = barrier == 'merge-gbk' or barrier == 'merge-side-input'
        num_roots = 2**(len(known_args.steps) - 1) if merging else 1
        pc_list = [
            p | ('Read %d' % read_no) >> beam.io.Read(source)
            for read_no in range(num_roots)
        ]

        for step_no, steps in enumerate(known_args.steps):
            if step_no != 0:
                # Apply the barrier between the previous step and this one.
                after_barrier = []
                for pc_no, pc in enumerate(pc_list):
                    if barrier == 'shuffle':
                        after_barrier.append(
                            pc
                            | ('shuffle %d.%d' % (step_no, pc_no))
                            >> ShuffleBarrier())
                    elif barrier == 'side-input':
                        after_barrier.append(
                            pc
                            | ('side-input %d.%d' % (step_no, pc_no))
                            >> SideInputBarrier())
                    elif barrier == 'expand-gbk':
                        after_barrier.extend(
                            expand_using_gbk(
                                ('expand-gbk %d.%d' % (step_no, pc_no)), pc))
                    elif barrier == 'expand-second-output':
                        after_barrier.extend(
                            expand_using_second_output(
                                ('expand-second-output %d.%d' % (step_no, pc_no)),
                                pc))
                    elif barrier == 'merge-gbk':
                        # Odd-indexed collections are consumed by their even
                        # neighbour.
                        if pc_no % 2 != 0:
                            continue
                        after_barrier.append(
                            merge_using_gbk(
                                ('merge-gbk %d.%d' % (step_no, pc_no)),
                                pc,
                                pc_list[pc_no + 1]))
                    elif barrier == 'merge-side-input':
                        if pc_no % 2 != 0:
                            continue
                        after_barrier.append(
                            merge_using_side_input(
                                ('merge-side-input %d.%d' % (step_no, pc_no)),
                                pc,
                                pc_list[pc_no + 1]))
                pc_list = after_barrier

            # Apply this step to every PCollection.
            stepped = []
            for pc_no, pc in enumerate(pc_list):
                if steps['splittable']:
                    step = get_synthetic_sdf_step(
                        per_element_delay_sec=steps['per_element_delay'],
                        per_bundle_delay_sec=steps['per_bundle_delay'],
                        output_records_per_input_record=steps[
                            'output_records_per_input_record'],
                        output_filter_ratio=steps['output_filter_ratio'],
                        initial_splitting_num_bundles=steps[
                            'initial_splitting_num_bundles'],
                        initial_splitting_uneven_chunks=steps[
                            'initial_splitting_uneven_chunks'],
                        disable_liquid_sharding=steps['disable_liquid_sharding'],
                        size_estimate_override=steps['size_estimate_override'])
                else:
                    step = SyntheticStep(
                        per_element_delay_sec=steps['per_element_delay'],
                        per_bundle_delay_sec=steps['per_bundle_delay'],
                        output_records_per_input_record=steps[
                            'output_records_per_input_record'],
                        output_filter_ratio=steps['output_filter_ratio'])
                label = 'SyntheticStep %d.%d' % (step_no, pc_no)
                stepped.append(pc | label >> beam.ParDo(step))
            pc_list = stepped

        # If an output location is provided we format and write output.
        if known_args.output and len(pc_list) == 1:
            (
                pc_list[0]
                | 'FormatOutput' >> beam.Map(lambda elm: (elm[0] + elm[1]))
                | 'WriteOutput' >> WriteToText(known_args.output))

    logging.info('Pipeline run completed.')
def test_streaming_complex_timing(self):
    """Write a streamed, windowed collection to files and verify names and contents.

    A first pipeline feeds a TestStream through fixed windows into
    ``WriteToFiles``; a second pipeline matches the written files and checks
    both the recorded file names and the file contents.
    """
    # Use state on the TestCase class, since other references would be pickled
    # into a closure and not have the desired side effects.
    #
    # TODO(BEAM-5295): Use assert_that after it works for the cases here in
    # streaming mode.
    WriteFilesTest.all_records = []

    out_dir = '%s%s' % (self._new_tempdir(), os.sep)

    # Setting up the input (TestStream)
    ts = TestStream().advance_watermark_to(0)
    for elm in WriteFilesTest.LARGER_COLLECTION:
        timestamp = int(elm)

        ts.add_elements([('key', '%s' % elm)])
        if timestamp != 0 and timestamp % 5 == 0:
            # TODO(BEAM-3759): Add many firings per window after getting PaneInfo.
            ts.advance_processing_time(5)
            ts.advance_watermark_to(timestamp)

    ts.advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
        # Same names as the destination-prefix naming, with ':' replaced.
        return fileio.destination_prefix_naming()(*args).replace(':', '_')

    # The pipeline that we are testing
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
        res = (
            p
            | ts
            | beam.WindowInto(
                FixedWindows(10),
                trigger=trigger.AfterWatermark(),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.FlatMap(lambda x: x[1]))
        # Triggering after 5 processing-time seconds, and on the watermark. Also
        # discarding old elements.

        _ = (
            res
            | beam.io.fileio.WriteToFiles(
                path=out_dir,
                file_naming=no_colon_file_naming,
                max_writers_per_bundle=0)
            | beam.Map(lambda fr: FileSystems.join(out_dir, fr.file_name))
            | beam.ParDo(self.record_dofn()))

    # Verification pipeline
    with TestPipeline() as p:
        files = p | beam.io.fileio.MatchFiles(FileSystems.join(out_dir, '*'))

        file_names = files | beam.Map(lambda fm: fm.path)

        file_contents = (
            files
            | beam.io.fileio.ReadMatches()
            | beam.Map(
                lambda rf: (rf.metadata.path,
                            rf.read_utf8().strip().split('\n'))))

        content = file_contents | beam.FlatMap(
            lambda fc: [ln.strip() for ln in fc[1]])

        assert_that(
            file_names,
            equal_to(WriteFilesTest.all_records),
            label='AssertFilesMatch')
        assert_that(
            content,
            matches_all(WriteFilesTest.LARGER_COLLECTION),
            label='AssertContentsMatch')
def expand(self, train_and_test_datasets):
    """Apply the transform.

    Writes the train and test datasets to files, builds a training request
    from the transform's settings, runs the training DoFn on it and reads the
    resulting model.

    Args:
      train_and_test_datasets: A pair of (train, test) PCollections of json
        strings representing Example Protos

    Returns:
      A 2-tuple of
      A PCollection with a single TrainedModel, suitable for used by Predict
      A PCollection with a single TrainingJobResult that describes the
      result of training.

    Raises:
      ValueError: If the job should run in the cloud, no output dir is set,
        and neither a temp nor a staging location is configured.
    """
    train_dataset, test_dataset = train_and_test_datasets
    pipeline = train_dataset.pipeline

    # For the job name use a combination of the transform label and a
    # datestamp. The datestamp is intended to make it unique.
    now = datetime.datetime.now()
    # We add some salt to the job name to avoid collisions if we try to submit
    # multiple jobs at the same time.
    # N.B. The job_name is fixed at pipeline construction time. This is
    # critical because multiple invocation of the Train transform (e.g. because
    # of retries) need to use the same job name.
    salt = '%04x' % random.getrandbits(4 * 4)

    # TODO(b/28989568): We need to lower case the name because the backend
    # only allows lower case letters for job names. The backend should probably
    # do this automatically but currently it doesn't.
    timestamp = now.strftime('%y%m%d_%H%M%S')
    job_name = '{0}_{1}_{2}'.format(self.label, timestamp, salt).lower()

    options = pipeline.options
    # TODO(b/29163051) Options can be None depending on how the runner was
    # constructed.
    if options is None:
        options = df_options.PipelineOptions()

    cloud_options = options.view_as(df_options.GoogleCloudOptions)
    run_on_cloud = self.use_cloud_ml

    if run_on_cloud is None:
        # TODO(user): Remove the fallback after the next Dataflow release.
        try:
            dataflow_runner = beam.runners.DataflowRunner
        except AttributeError:
            dataflow_runner = beam.runners.DataflowPipelineRunner

        # Choose a default based on the runner.
        run_on_cloud = bool(isinstance(pipeline.runner, dataflow_runner))

    # Pick the working directory: explicit output dir first, then a cloud
    # location, otherwise a fresh local temp dir.
    if self.output_dir:
        temp_dir = self.output_dir
    elif run_on_cloud:
        cloud_options = options.view_as(df_options.GoogleCloudOptions)

        if cloud_options.temp_location:
            temp_dir = os.path.join(cloud_options.temp_location, job_name)
        elif cloud_options.staging_location:
            temp_dir = os.path.join(cloud_options.staging_location, job_name)
        else:
            raise ValueError(
                '--staging_location must be specified to run in the cloud')
    else:
        temp_dir = tempfile.mkdtemp(job_name)
    logging.info('Temp dir: %s', temp_dir)

    if run_on_cloud:
        train_do = ml_func.TrainingJobDo()
        project = cloud_options.project
    else:
        train_do = ml_func._TrainingJobLocalDo()  # pylint: disable=protected-access
        project = None

    _ = train_dataset | dfutil.CountPCollection('ml-train-input')

    # Write the train and test data to files so we can pass it to the trainer.
    train_data_path = os.path.join(temp_dir, 'training')
    test_data_path = os.path.join(temp_dir, 'testing')
    output_dir = os.path.join(temp_dir, 'model')
    # TODO(b/34839956) Make sure we can handle the tf.Transform metadata.
    metadata_path = os.path.join(output_dir, 'metadata.json')

    # This PTransform is primarily to avoid stage name collisions in writing
    # training and test data.
    # TODO(user): Figure out why i_type @beam.ptransform_fn breaks pickling.
    train_files = (
        train_dataset
        | 'WriteTrainData' >> ml_func._WrapCallable(  # pylint: disable=protected-access
            self.tf_main_spec.write_input_data, train_data_path))
    test_files = (
        test_dataset
        | 'WriteTestData' >> ml_func._WrapCallable(  # pylint: disable=protected-access
            self.tf_main_spec.write_input_data, test_data_path))
    if self.metadata:
        metadata_files = self.metadata | SaveMetadata(metadata_path)
    else:
        metadata_files = pipeline | beam.Create([None])

    # Construct and run the training job.
    train_request = self.tf_main_spec.train_request.copy()
    if not train_request.package_uris:
        train_request.package_uris = []
    if self.package_uris:
        # A single string is treated as a one-element list of URIs.
        if isinstance(self.package_uris, basestring):
            extra_uris = [self.package_uris]
        else:
            extra_uris = self.package_uris
        train_request.package_uris.extend(extra_uris)
    # remove duplicates from train_request
    train_request.package_uris = list(set(train_request.package_uris))

    train_request.job_args = self.job_args or []
    if self.python_module:
        train_request.python_module = self.python_module

    # Fill every field the request leaves unset from this transform's settings.
    if not train_request.project:
        train_request.parent = project
    if not train_request.job_name:
        train_request.job_name = job_name
    if not train_request.endpoint:
        train_request.endpoint = self.cloud_ml_endpoint
    same_named_fields = (
        'hyperparameters',
        'region',
        'scale_tier',
        'worker_count',
        'ps_count',
        'worker_type',
        'ps_type',
        'master_type',
        'runtime_version',
    )
    for field in same_named_fields:
        if not getattr(train_request, field):
            setattr(train_request, field, getattr(self, field))

    requests = (
        pipeline
        | 'CreateRequest' >> beam.Create([train_request])
        | 'AugmentTrainingArgs' >> beam.ParDo(
            ml_func._AugmentTrainArgsDo(  # pylint: disable=protected-access
                self.tf_main_spec),
            beam.pvalue.AsIter(train_files),
            beam.pvalue.AsIter(test_files),
            output_dir,
            beam.pvalue.AsSingleton(metadata_files)))

    train_results = requests | 'TrainModel' >> beam.ParDo(train_do)

    # Read and return the model directory and training results.
    model_directory = (
        train_results
        | 'CreateModel' >> beam.Map(
            self.tf_main_spec.read_model, output_dir, self.export_subdir))

    return model_directory, train_results
def run():
    """Build and start a streaming pipeline that counts taxi rides per minute.

    Reads ride events from a public Pub/Sub topic, counts them in one-minute
    fixed windows using the accumulation mode chosen on the command line, and
    appends one row per firing to a BigQuery table named after that mode.

    Raises:
        ValueError: if ``--accum_mode`` is neither 'accumulating' nor
            'discarding'.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from PubSub into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--staging_location', 'Specify Cloud Storage bucket for staging'),
        ('--temp_location', 'Specify Cloud Storage bucket for temp'),
        ('--accum_mode', 'Accumulation mode for pipeline'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    opts, pipeline_args = parser.parse_known_args()

    # Pipeline options: the parsed flags are copied onto the option views.
    options = PipelineOptions(pipeline_args, save_main_session=True)
    cloud_options = options.view_as(GoogleCloudOptions)
    cloud_options.job_name = f"{opts.accum_mode}-{time.time_ns()}"
    cloud_options.project = opts.project
    cloud_options.region = opts.region
    cloud_options.staging_location = opts.staging_location
    cloud_options.temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {"name": "taxi_events", "type": "INTEGER"},
            {"name": "timestamp", "type": "STRING"},
        ]
    }

    input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
    output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

    # Translate the flag value into the trigger accumulation mode.
    if opts.accum_mode not in ('accumulating', 'discarding'):
        raise ValueError(
            'Invalid accumulation mode value. Use \'accumulating\' or \'discarding\' '
        )
    if opts.accum_mode == 'accumulating':
        accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
    else:
        accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING

    p = beam.Pipeline(options=options)

    (
        p
        | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
        | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
        | 'WindowByMinute' >> beam.WindowInto(
            beam.window.FixedWindows(60),
            trigger=AfterWatermark(early=AfterProcessingTime(10)),
            accumulation_mode=accum_mode)
        | "CountPerMinute" >> beam.CombineGlobally(
            CountCombineFn()).without_defaults()
        | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
        | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            output_table,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
'temp_location': BUCKET + '/temp', 'staging_location': BUCKET + '/staging', 'machine_type': 'n1-standard-8', 'num_workers': 8 } opts = beam.pipeline.PipelineOptions(flags=[], **options) with beam.Pipeline('DataflowRunner', options=opts) as p: query_results = p | beam.io.Read(beam.io.BigQuerySource(query='select * from tweets.tweets_data as a LEFT OUTER JOIN tweets.stock_companyname_lookup as b on a.company_names = b.name')) # write PCollection to a log file query_results | 'Write to File 1' >> WriteToText('input_joined_table.txt') # apply a ParDo to the PCollection out_pcoll = query_results | 'Filter tweets by source handles starting with character C' >> beam.ParDo(FilterSourceFn()) # write PCollection to a log file out_pcoll | 'Write to File 2' >> WriteToText('output_joined_table_source_like_c.txt') qualified_table_name = 'avian-force-216105:beam_dataset.LeftJoin_Milestone8_Cluster' table_schema = 'a_id:INTEGER,a_text:STRING,a_timestamp:STRING,a_source:STRING,a_symbols:STRING,a_company_names:STRING,a_url:STRING,a_verified:BOOLEAN,b_ticker:STRING,b_name:STRING' #Existing Schema # id INTEGER NULLABLE # text STRING NULLABLE # timestamp STRING NULLABLE # source STRING NULLABLE # symbols STRING NULLABLE # company_names STRING NULLABLE # url STRING NULLABLE
def make_beam_pipeline(root, input_filenames, sample_rate, debug,
                       embedding_names, embedding_modules, module_output_keys,
                       audio_key, sample_rate_key, label_key, speaker_id_key,
                       average_over_time, delete_audio_from_output,
                       output_filename, input_format='tfrecord',
                       output_format='tfrecord', suffix='Main'):
    """Construct beam pipeline for mapping from audio to embeddings.

    Reads the input examples, computes one embedding table per entry of
    `embedding_names`, joins those tables with the original examples by key,
    and writes the combined result.

    Args:
      root: The beam root node.
      input_filenames: Python list. List of input files.
      sample_rate: Python int, or `None`. The sample rate for all embeddings,
        or `None` if this is a TFDS dataset, or if each example has its own
        sample rate.
      debug: Python bool. Whether to operate in debug mode. In debug mode a
        single input example is sampled and processed.
      embedding_names: Python list of embeddings.
      embedding_modules: Python list of TF-Hub modules.
      module_output_keys: Python list of strings, names of output modules.
      audio_key: Python string, the key of the audio.
      sample_rate_key: Python string or `None`, the key for the sample rate.
      label_key: Python string. Field for label.
      speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
      average_over_time: Python bool. If `True`, average over the time axis.
      delete_audio_from_output: Python bool. Whether to remove audio from
        outputs.
      output_filename: Python string. Output filename.
      input_format: Python string. Must correspond to a function in
        `reader_functions`.
      output_format: Python string. Must correspond to a function in
        `writer_functions`.
      suffix: Python string. Suffix to stage names to make them unique.
    """
    # Reserved table name under which the original examples join the embeddings.
    tf_examples_key_ = 'tf_examples'
    assert tf_examples_key_ not in embedding_names
    s = suffix  # for code brevity.

    # Read from input.
    reader = reader_functions[input_format]
    input_examples = reader(root, input_filenames, s)

    # In debug mode, take one input example.
    if debug:
        sampled = (
            input_examples
            | f'TakeOne{s}' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1))
        # Sampling generates lists, so flatten back into one collection.
        input_examples = sampled | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x)

    # Compute all the embeddings simultaneously.
    embedding_tables = {}
    signals = zip(embedding_names, embedding_modules, module_output_keys)
    for name, mod, out_key in signals:
        logging.info('Adding signal: %s %s, %s', name, mod, out_key)
        embed_fn = ComputeEmbeddingMapFn(
            name=name,
            module=mod,
            output_key=out_key,
            audio_key=audio_key,
            sample_rate_key=sample_rate_key,
            sample_rate=sample_rate,
            average_over_time=average_over_time)
        embedding_tables[name] = (
            input_examples
            | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(embed_fn))

    assert tf_examples_key_ not in embedding_tables
    embedding_tables[tf_examples_key_] = input_examples
    logging.info('embedding_tables: %s', embedding_tables)

    # Combine embeddings and tf.train.Example, using the common key.
    combined_tbl = (
        embedding_tables
        | f'CombineEmbeddingTables-{s}' >> beam.CoGroupByKey()
        | f'AddEmbeddings-{s}' >> beam.Map(
            _add_embedding_column_map_fn,
            original_example_key=tf_examples_key_,
            delete_audio_from_output=delete_audio_from_output,
            audio_key=audio_key,
            label_key=label_key,
            speaker_id_key=speaker_id_key))

    output_filename = f'{output_filename}@*'
    logging.info('Writing to %s', output_filename)
    writer = writer_functions[output_format]
    writer(combined_tbl, output_filename, s)
def expand(self, dataset):
    """Detect anomalies in each example, then generate anomaly-reason slice keys.

    Args:
        dataset: the input PCollection; every element is passed to
            ``_detect_anomalies_in_example`` together with ``self.options``.

    Returns:
        The PCollection produced by ``_GenerateAnomalyReasonSliceKeys``.
    """
    anomalies = dataset | 'DetectAnomaliesInExamples' >> beam.Map(
        _detect_anomalies_in_example, options=self.options)
    return anomalies | 'GenerateAnomalyReasonKeys' >> beam.ParDo(
        _GenerateAnomalyReasonSliceKeys())
def FilterOutSlices(  # pylint: disable=invalid-name
    values: beam.pvalue.PCollection,
    slices_count: beam.pvalue.PCollection,
    k_anonymization_count: int,
    error_metric_key: Text) -> beam.pvalue.PCollection:
    """Replace the data of small slices with an error message.

    Slices whose example count is below k_anonymization_count are not dropped
    silently: their key is kept and their aggregated data is replaced by a
    message explaining the omission, so end users can see that data was
    withheld to preserve privacy.

    Args:
      values: PCollection of aggregated data keyed at slice_key
      slices_count: PCollection of slice keys and their example count.
      k_anonymization_count: If the number of examples in a specific slice is
        less than k_anonymization_count, then an error will be returned for
        that slice.
      error_metric_key: The special metric key to indicate errors.

    Returns:
      A PCollection keyed at slice_key holding either the aggregated data or,
      for filtered out slices, a dict mapping error_metric_key to a message.
    """

    class FilterOutSmallSlicesDoFn(beam.DoFn):
        """DoFn to filter out small slices."""

        def __init__(self, error_metric_key: Text):
            self.error_metric_key = error_metric_key

        def process(
            self, element: Tuple[SliceKeyType, Dict[Text, Any]]
        ) -> Generator[Tuple[SliceKeyType, Dict[Text, Any]], None, None]:
            """Emit the slice's data, or an error message if the slice is small.

            The overall slice (empty slice key) is always passed through.
            Elements without aggregated values produce no output.

            Args:
              element: Tuple containing slice key and a dictionary containing
                corresponding elements from merged pcollections.

            Yields:
              (slice_key, aggregated_data or error message)
            """
            slice_key, grouped = element
            if not grouped['values']:
                return

            is_overall_slice = not slice_key
            if (is_overall_slice or
                    grouped['slices_count'][0] >= k_anonymization_count):
                yield (slice_key, grouped['values'][0])
                return

            message = (
                'Example count for this slice key is lower than '
                'the minimum required value: %d. No data is aggregated for '
                'this slice.' % k_anonymization_count)
            yield (slice_key, {self.error_metric_key: message})

    # Join the aggregated data with the example counts on the slice key.
    joined = (
        {'values': values, 'slices_count': slices_count}
        | 'CoGroupingSlicesCountAndAggregatedData' >> beam.CoGroupByKey())
    return joined | 'FilterOutSmallSlices' >> beam.ParDo(
        FilterOutSmallSlicesDoFn(error_metric_key))
def ComputePerSliceMetrics(  # pylint: disable=invalid-name
    slice_result: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = 1,
    random_seed: Optional[int] = None,
) -> beam.pvalue.PCollection:
    """PTransform for computing, aggregating and combining metrics.

    Args:
      slice_result: Incoming PCollection consisting of slice key and extracts.
      eval_shared_model: Shared model parameters for EvalSavedModel.
      desired_batch_size: Optional batch size for batching in Aggregate.
      num_bootstrap_samples: Number of replicas to use in calculating
        uncertainty using bootstrapping. A falsy value is treated as 1. With
        1 (default), aggregate metrics are calculated with no uncertainty.
        With a value greater than 1, sampled replicas of each slice are also
        computed and merged with the unsampled results.
      random_seed: Seed passed to the sampled combine step, for testing.

    Returns:
      DoOutputsTuple. The tuple entries are
      PCollection of (slice key, metrics) and
      PCollection of (slice key, plot metrics).

    Raises:
      ValueError: If num_bootstrap_samples is less than 1.
    """
    # TODO(ckuhn): Remove this workaround per discussions in CL/227944001
    slice_result.element_type = beam.typehints.Any

    if not num_bootstrap_samples:
        num_bootstrap_samples = 1
    if num_bootstrap_samples < 1:
        raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                         num_bootstrap_samples)

    # Bootstrapping is only used when more than one sample is requested.
    compute_with_sampling = num_bootstrap_samples > 1
    if compute_with_sampling:
        slice_result_sampled = slice_result | 'FanoutBootstrap' >> beam.ParDo(
            _FanoutBootstrapFn(num_bootstrap_samples))

    # Metrics on the unsampled data are always computed.
    output_results = (
        slice_result
        | 'CombinePerSlice' >> beam.CombinePerKey(
            _AggregateCombineFn(
                eval_shared_model=eval_shared_model,
                desired_batch_size=desired_batch_size,
                compute_with_sampling=False))
        | 'InterpretOutput' >> beam.ParDo(
            _ExtractOutputDoFn(eval_shared_model=eval_shared_model)))

    if compute_with_sampling:
        # The unsampled results are handed to the merge step as a side input.
        unsampled_results = output_results
        output_results = (
            slice_result_sampled
            | 'CombinePerSliceWithSamples' >> beam.CombinePerKey(
                _AggregateCombineFn(
                    eval_shared_model=eval_shared_model,
                    desired_batch_size=desired_batch_size,
                    compute_with_sampling=True,
                    seed_for_testing=random_seed))
            | 'InterpretSampledOutput' >> beam.ParDo(
                _ExtractOutputDoFn(eval_shared_model=eval_shared_model))
            | beam.GroupByKey()
            | beam.ParDo(_MergeBootstrap(),
                         beam.pvalue.AsIter(unsampled_results)))

    # Separate metrics and plots.
    return (
        output_results
        | beam.ParDo(_SeparateMetricsAndPlotsFn()).with_outputs(
            _SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS,
            main=_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS))
def test_streaming_wordcount(self):
    """Run a streaming word count through the InteractiveRunner and check it.

    A TestStream feeds ten words in two batches into fixed windows of size
    10. The test then collects both the windowed input and the per-word
    counts as DataFrames (including window info) and compares them with the
    expected frames.
    """
    class WordExtractingDoFn(beam.DoFn):
        # Splits each incoming line on whitespace and emits the words.
        def process(self, element):
            text_line = element.strip()
            words = text_line.split()
            return words

    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(),
        options=StandardOptions(streaming=True))

    # Six words at watermark 0 and four more at watermark 20, so the two
    # batches land in different fixed windows.
    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
        | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

    # Classic word count: split, pair each word with 1, group, then sum.
    counts = (
        data
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced PCollections
    # will be cached.
    # NOTE: this passes every local defined so far, so renaming or adding
    # locals above this line changes what is watched.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Create a fake limiter that cancels the BCJ once the main job receives the
    # expected amount of results.
    class FakeLimiter:
        def __init__(self, p, pcoll):
            self.p = p
            self.pcoll = pcoll

        def is_triggered(self):
            # Triggered only once a pipeline result exists and it holds at
            # least 10 elements for the watched PCollection.
            result = ie.current_env().pipeline_result(self.p)
            if result:
                try:
                    results = result.get(self.pcoll)
                except ValueError:
                    # Treated as "not triggered yet" (presumably the
                    # PCollection has not been computed -- TODO confirm).
                    return False
                return len(results) >= 10
            return False

    # This sets the limiters to stop reading when the test receives 10 elements.
    ie.current_env().options.capture_control.set_limiters_for_test(
        [FakeLimiter(p, data)])

    # This tests that the data was correctly cached.
    # NOTE(review): event_time values look like microseconds (watermark 20 ->
    # 20000000) -- confirm against ib.collect.
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    expected_data_df = pd.DataFrame([
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('or', 0, [IntervalWindow(0, 10)], pane_info),
        ('not', 0, [IntervalWindow(0, 10)], pane_info),
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
    ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

    data_df = ib.collect(data, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed correctly so that all the data
    # is aggregated also correctly.
    # NOTE(review): 9999999 / 29999999 are presumably the last timestamps of
    # windows [0, 10) and [20, 30) -- confirm.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
    expected_counts_df = pd.DataFrame([
        ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
    ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

    counts_df = ib.collect(counts, include_window_info=True)

    # The group by key has no guarantee of order. So we post-process the DF by
    # sorting so we can test equality.
    sorted_counts_df = (counts_df
                        .sort_values(['event_time', 0], ascending=True)
                        .reset_index(drop=True)) # yapf: disable
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
}  # NOTE(review): closes a literal opened above this excerpt -- presumably `options`; confirm.

# Build the pipeline options from the `options` mapping (no command-line flags).
opts = beam.pipeline.PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:

    # Read the Year, State and Rate columns from the source BigQuery table.
    query_results = p | beam.io.Read(
        beam.io.BigQuerySource(
            query='SELECT Year,State,Rate FROM Unemployment.unemployment_rate')
    )

    # Write the raw query results to a text file under DIR_PATH.
    query_results | 'Write to File 1' >> WriteToText(DIR_PATH + 'unemployment_query.txt')

    # Apply the StateName ParDo to the query results.
    state_pcoll = query_results | 'Create State abb' >> beam.ParDo(StateName())

    # Write the transformed PCollection to a text file under DIR_PATH.
    state_pcoll | 'Write to File 2' >> WriteToText(DIR_PATH + 'output_unemployment.txt')

    # Destination table and schema for the transformed rows.
    qualified_takes_table_name = 'han97jiayan:Unemployment.unemployment_transform_cluster'
    takes_table_schema = 'Year:INTEGER,State:STRING,Rate:FLOAT'

    # Write to BigQuery with CREATE_IF_NEEDED and WRITE_TRUNCATE dispositions.
    state_pcoll | 'Write Takes to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            qualified_takes_table_name,
            schema=takes_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
def expand(self, pcoll):
    """Wire up the file-based BigQuery load for ``pcoll``.

    The input is re-windowed, keyed by destination, written to files, and
    the files are grouped per destination and partitioned. Partitions are
    then handed to ``_load_data``, either routed through temporary tables or
    loaded directly into the destination.

    Args:
        pcoll: The PCollection of elements to write.

    Returns:
        A dict with three entries: ``DESTINATION_JOBID_PAIRS`` (load job
        ids), ``DESTINATION_FILE_PAIRS`` (written destination/file pairs)
        and ``DESTINATION_COPY_JOBID_PAIRS`` (copy job ids).
    """
    pipeline = pcoll.pipeline

    try:
        step_name = self.label
    except AttributeError:
        # No label available: fall back to a counter-based step name.
        step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
        BigQueryBatchFileLoads.COUNT += 1

    gcp_options = pipeline.options.view_as(GoogleCloudOptions)
    temp_location = gcp_options.temp_location
    job_name = gcp_options.job_name or 'AUTOMATIC_JOB_NAME'

    empty_pc = pipeline | "ImpulseEmptyPC" >> beam.Create([])
    singleton_pc = pipeline | "ImpulseSingleElementPC" >> beam.Create([None])

    # Job-name prefixes and the file prefix are each derived from the
    # single-element PCollection and exposed as singleton side inputs.
    load_job_names = singleton_pc | "LoadJobNamePrefix" >> beam.Map(
        lambda _: _generate_job_name(
            job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP'))
    load_job_name_pcv = pvalue.AsSingleton(load_job_names)

    schema_mod_job_names = singleton_pc | "SchemaModJobNamePrefix" >> beam.Map(
        lambda _: _generate_job_name(
            job_name,
            bigquery_tools.BigQueryJobTypes.LOAD,
            'SCHEMA_MOD_STEP'))
    schema_mod_job_name_pcv = pvalue.AsSingleton(schema_mod_job_names)

    copy_job_names = singleton_pc | "CopyJobNamePrefix" >> beam.Map(
        lambda _: _generate_job_name(
            job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP'))
    copy_job_name_pcv = pvalue.AsSingleton(copy_job_names)

    file_prefixes = singleton_pc | "GenerateFilePrefix" >> beam.Map(
        file_prefix_generator(
            self._validate, self._custom_gcs_temp_location, temp_location))
    file_prefix_pcv = pvalue.AsSingleton(file_prefixes)

    rewindowed_pc = pcoll | "RewindowIntoGlobal" >> self._window_fn()
    destination_data_kv_pc = (
        rewindowed_pc
        | "AppendDestination" >> beam.ParDo(
            bigquery_tools.AppendDestinationsFn(self.destination),
            *self.table_side_inputs))

    # Choose the file writer according to the auto-sharding setting.
    if self.with_auto_sharding:
        write_files = self._write_files_with_auto_sharding
    else:
        write_files = self._write_files
    all_destination_file_pairs_pc = write_files(
        destination_data_kv_pc, file_prefix_pcv)

    grouped_files_pc = (
        all_destination_file_pairs_pc
        | "GroupFilesByTableDestinations" >> beam.GroupByKey())

    partition_files = beam.ParDo(
        PartitionFiles(
            self.max_partition_size,
            self.max_files_per_partition)).with_outputs(
                PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                PartitionFiles.SINGLE_PARTITION_TAG)
    partitions = grouped_files_pc | partition_files

    multiple_partitions_per_destination_pc = partitions[
        PartitionFiles.MULTIPLE_PARTITIONS_TAG]
    single_partition_per_destination_pc = partitions[
        PartitionFiles.SINGLE_PARTITION_TAG]

    # When using dynamic destinations, elements with both single as well as
    # multiple partitions are loaded into BigQuery using temporary tables to
    # ensure atomicity. Otherwise only multi-partition destinations go
    # through temporary tables and single partitions are loaded directly.
    if self.dynamic_destinations:
        via_temp_tables_pc = (
            (multiple_partitions_per_destination_pc,
             single_partition_per_destination_pc)
            | "FlattenPartitions" >> beam.Flatten())
        direct_pc = empty_pc
    else:
        via_temp_tables_pc = multiple_partitions_per_destination_pc
        direct_pc = single_partition_per_destination_pc

    load_and_copy_job_ids = self._load_data(
        via_temp_tables_pc,
        direct_pc,
        load_job_name_pcv,
        schema_mod_job_name_pcv,
        copy_job_name_pcv,
        pipeline,
        step_name)
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        load_and_copy_job_ids)

    return {
        self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
        self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
        self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
    }