def test_set_google_cloud_options( all_options, config, update, exp_update, dataflow_endpoint, klio_cli_version, deployed_ci, user_env, gcp_options, mocker, monkeypatch, ): expected_opts = [ "project", "region", "temp_location", "staging_location", "service_account_email", "no_auth", "template_location", "enable_streaming_engine", "dataflow_kms_key", "flexrs_goal", ] # this is to be changed when running `tox`; remove when no longer # supporting beam 2.14.0 if dataflow_endpoint: all_options["dataflow_endpoint"] = dataflow_endpoint else: all_options.pop("dataflow_endpoint", None) options = pipeline_options.PipelineOptions().from_dictionary(all_options) actual_gcp_opts = options.view_as(pipeline_options.GoogleCloudOptions) monkeypatch.setattr( config.pipeline_options, "dataflow_endpoint", dataflow_endpoint ) if klio_cli_version: monkeypatch.setenv("KLIO_CLI_VERSION", klio_cli_version) klio_cli_version_clean = klio_cli_version.replace(".", "-") if deployed_ci: monkeypatch.setenv("CI", "TRUE") if not user_env: monkeypatch.delenv("USER", raising=False) kpipe = run.KlioPipeline("test-job", config, mocker.Mock(update=update)) kpipe._set_google_cloud_options(options) for opt in expected_opts: expected_value = gcp_options[opt] # getattr should explode when not setting a default value assert expected_value == getattr(actual_gcp_opts, opt) assert exp_update == actual_gcp_opts.update if dataflow_endpoint: assert dataflow_endpoint == actual_gcp_opts.dataflow_endpoint else: assert ( "https://dataflow.googleapis.com" == actual_gcp_opts.dataflow_endpoint ) user = None if deployed_ci: user = "******" elif user_env: user = os.environ["USER"] klio_exec_version_clean = klio_exec_version.replace(".", "-") klio_core_version_clean = klio_core_version.replace(".", "-") klio_lib_version_clean = klio_lib_version.replace(".", "-") exp_labels = [ "foo=bar", "baz=bla", "klio-exec={}".format(klio_exec_version_clean), "klio-core={}".format(klio_core_version_clean), "klio={}".format(klio_lib_version_clean), ] if user: exp_labels.append("deployed_by={}".format(user).lower()) if klio_cli_version: exp_labels.append("klio-cli={}".format(klio_cli_version_clean)) assert sorted(exp_labels) == sorted(actual_gcp_opts.labels)
def pipeline_options_from_dict(all_options):
    return pipeline_options.PipelineOptions().from_dictionary(all_options)
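# A minimal usage sketch for pipeline_options_from_dict above (a hedged
# example: the dict keys and values are illustrative and not taken from any
# snippet in this file; it assumes apache_beam is installed).
from apache_beam.options import pipeline_options

example_options = {
    'project': 'example-project',                 # hypothetical project id
    'region': 'us-central1',
    'temp_location': 'gs://example-bucket/temp',  # hypothetical bucket
}
opts = pipeline_options_from_dict(example_options)
gcp_opts = opts.view_as(pipeline_options.GoogleCloudOptions)
print(gcp_opts.project, gcp_opts.region, gcp_opts.temp_location)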
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(argv,
                                                          _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args, pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant). \
            with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (variants[i]
         | 'VariantToBigQuery' + table_suffix >>
         variant_to_bigquery.VariantToBigQuery(
             table_name,
             header_fields,
             variant_merger,
             processed_variant_factory,
             append=known_args.append,
             update_schema_on_append=known_args.update_schema_on_append,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
def expand(self, deployed_model): """Apply the transform. Args: deployed_model: A PCollection should be the output of DeployVersion, or a tuple of (model, version). Returns: A PCollection with a the results of the Prediction Raises: ValueError: If the arguments are invalid. """ pipeline = deployed_model.pipeline # For the job name use a combination of the transform label and a # datestamp. The datestamp is intended to make it unique. now = datetime.datetime.now() # We add some salt to the job name to avoid collisions if we try to submit # multiple jobs at the same time. # N.B. The job_name is fixed at pipeline construction time. This is # critical because multiple invocation of the Train transform (e.g. because # of retries) need to use the same job name. salt = '%04x' % random.getrandbits(4 * 4) # TODO(b/28989568): We need to lower case the name because the backend # only allows lower case letters for job names. The backend should probably # do this automatically but currently it doesn't. job_name = '{0}_{1}_{2}'.format(self.label, now.strftime('%y%m%d_%H%M%S'), salt).lower().replace(' ', '_') options = pipeline.options # TODO(b/29163051) Options can be None depending on how the runner was # constructed. if options is None: options = df_options.PipelineOptions() cloud_options = options.view_as(df_options.GoogleCloudOptions) project_id = cloud_options.project if cloud_options.temp_location: temp_dir = cloud_options.temp_location elif cloud_options.staging_location: temp_dir = cloud_options.staging_location else: raise ValueError( '--staging_location must be specified to run in the cloud') if not self.output_uri: output_uri = os.path.join(temp_dir, 'prediction_results') else: output_uri = self.output_uri logging.info('Output uri : %s', output_uri) # Construct the batch prediction job. prediction_request = ml_func.PredictionJobRequest( project_id, job_name, self.input_uris, output_uri, self.region, self.data_format, endpoint=self.cloud_ml_endpoint, runtime_version=self.runtime_version) request = ( pipeline | 'PredictRequest' >> beam.Create([prediction_request]) | 'AugmentPredictArgs' >> beam.ParDo( ml_func._AugmentPredictArgsDo(), # pylint: disable=protected-access beam.pvalue.AsSingleton(deployed_model))) # Run the batch prediction job predict_do = ml_func.BatchPredictionJobDo(api_class=self.api_version) unused_prediction_results = ( request | 'BatchPrediction' >> beam.ParDo(predict_do)) # Wait until the prediction job is done, then Read the results from the file # to which they were written and return. results = 'Read Results' >> beam.io.ReadFromText(output_uri, validate=False) return results
def main(unused_args):
  """Runs the Beam pipeline."""
  options = pipeline_options.PipelineOptions()
  p = beam.Pipeline(options=options)
  pipeline(p)
  p.run().wait_until_finish()
def _create_large_avro_file( self, blob_name, staging_table_util, destination_prefix, compression, extension ): """Creates avro files from a staging table and stores in GCS. The avro file is generated in this method using DataFlow. BigQuery extract jobs do support avro as a destination format. However, if the size of the staging table is greater than 1 GB, the generated files must be sharded and then composed into a single file. The composition process causes errors when the destination format is avro, since some of the composed avro files end up with negative row counts. Therefore, this method can be called when generating an avro file from a staging table greater than 1 GB. Args: blob_name(str): Name of the file (or blob) to be generated. Starts with 'fileType=' and end with the file extension. Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv # pylint: disable=line-too-long staging_table_util(load_benchmark_tools.table_util.TableUtil): Util object for interacting with the staging table that the avro file will be generated from. destination_prefix(str): String containing the 'gs://' prefix, the bucket name, and the path of the file, without the extension. This is needed by the WriteToParquet class. Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876 # pylint: disable=line-too-long compression(str): String representing the compression format that the generated file should have. Options are 'none' if no compression is to be used, 'snappy', or 'deflate'. extension(str): String to be used as the extension for the avro file. Options are 'avro' if no compression is to be used, 'snappy', or 'deflate'. """ pipeline_args = ['--project', self.project_id, '--staging_location', self.dataflow_staging_location, '--temp_location', self.dataflow_temp_location, '--save_main_session', '--worker_machine_type', 'n1-highcpu-32', '--runner', 'DataflowRunner', '--setup_file', './setup.py'] options = pipeline_options.PipelineOptions(pipeline_args) table_spec = beam_bigquery.TableReference( projectId=self.project_id, datasetId=self.primitive_staging_dataset_id, tableId=staging_table_util.table_id, ) codec = 'null' if compression == 'none' else compression bq_schema = staging_table_util.table.schema table_name = staging_table_util.table.table_id avro_schema = avro_util.AvroUtil( bq_schema=bq_schema, schema_name=table_name ).get_avro_translated_schema() p = beam.Pipeline(options=options) table = (p | 'ReadTable' >> beam.io.Read( beam.io.BigQuerySource(table_spec))) (table | beam.io.WriteToAvro( file_path_prefix=destination_prefix, schema=avro_schema, file_name_suffix='.' + extension, use_fastavro=True, codec=codec, num_shards=1, shard_name_template='', )) p.run().wait_until_finish() logging.info('Created file: {0:s}'.format(blob_name))
def run():
    ''' Create the Publisher '''
    publisher = pubsub_v1.PublisherClient()  # Creates a publisher client
    topic_name = 'projects/{project_id}/topics/{topic}'.format(
        project_id="famous-store-237108", topic="BQTopic")
    # Creates a fully qualified topic path; same value as topic_name above.
    topic_path = publisher.topic_path("famous-store-237108", "BQTopic")
    # Creates a fully qualified project path.
    project_path = publisher.project_path("famous-store-237108")

    # Check whether the topic already exists in the project.
    found = False
    for topic in publisher.list_topics(project_path):  # topic is a fully qualified topic path
        if topic.name == topic_name:
            found = True
    if not found:  # If not found, create it
        publisher.create_topic(topic_name)

    future = publisher.publish(topic_name, b"3,3,Three Three")  # Publish a message
    if future._completed:  # Check if the publish already completed
        print("Message sent successfully!")

    # Build and run the pipeline.
    pipeline_options = opt.PipelineOptions()
    pipeline_options.view_as(opt.StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:  # Creates a pipeline
        # Read the Pub/Sub topic into a PCollection.
        msg = p | "Read from pubSub" >> beam.io.ReadFromPubSub(topic_path)  # Read

        lines2 = (
            p | "Create from in-memory List" >> beam.Create([  # Create
                'To be, or not to be: that is the question: ',
                'Whether \'tis nobler in the mind to suffer ',
                'The slings and arrows of outrageous fortune, ',
                'Or to take arms against a sea of troubles, '
            ]))
        # PCollection: immutable, elements are of the same type, no random access.
        # Can be bounded or streaming; windows are used together with timestamps.
        # Transforms: ParDo, Combine, and composite transforms that combine core transforms.
        '''
        [Final Output PCollection] = ([Initial Input PCollection]
                                      | [First Transform]
                                      | [Second Transform]
                                      | [Third Transform])
        '''
        # Apply a ParDo to the messages read from Pub/Sub to convert each one
        # into a dictionary row for BigQuery.
        # ParDo: the "Map" phase of a Map/Shuffle/Reduce-style algorithm; use it
        # to filter, convert, pick part of the data, or do a simple computation.
        # You must supply a DoFn class.
        rows = msg | "Convert to dict" >> beam.ParDo(MyFn())
        # rows = [{"id1": 3, "id2": 3, "val1": "Three Three"}]

        rows | beam.io.WriteToBigQuery(
            table='mytable',
            dataset="mydataset",
            project="famous-store-237108",
            schema='id1:INTEGER, id2:INTEGER, val1:STRING',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)  # Could be WRITE_TRUNCATE
def _run_pipeline(self, pipeline):
  options = pipeline_options.PipelineOptions(
      runner='DirectRunner', direct_running_mode='in_memory')
  p = beam.Pipeline(options=options)
  pipeline(p)
  p.run().wait_until_finish()
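# Equivalent construction from flag strings (a sketch for comparison with the
# keyword form above; --runner and --direct_running_mode are standard Beam
# DirectRunner flags, not anything defined in this file).
from apache_beam.options import pipeline_options

direct_opts = pipeline_options.PipelineOptions(
    ['--runner=DirectRunner', '--direct_running_mode=in_memory'])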
def run(argv=None): """Main function. Main function containing the Apache Beam pipeline describing how to process the input CSV file to generate the LTV predictions. """ parser = argparse.ArgumentParser() _, pipeline_args = parser.parse_known_args(argv) options = pipeline_options.PipelineOptions(pipeline_args) runtime_options = options.view_as(RuntimeOptions) with beam.Pipeline(options=options) as pipeline: options = (pipeline | 'Create single element Stream containing options dict' >> beam.Create([options.get_all_options()]) | beam.Map( lambda x: { k: v.get() if isinstance( v, value_provider.ValueProvider) else v for (k, v) in x.items() }) | beam.Map(c.set_extra_options)) full_elog = ( pipeline | beam.io.ReadFromText(getattr(runtime_options, c._OPTION_INPUT_CSV), skip_header_lines=1) | beam.Map(lambda x: list(csv.reader([x]))[0]) | beam.FlatMap( c.csv_line_to_list, pvalue.AsSingleton(options)) # (customer_id, date_str, date, # sales, extra_dimension?) ) full_elog_merged = ( full_elog | beam.Filter(lambda x: x[3] > 0) # sales > 0 | beam.Map(lambda x: ((x[0], x[1]), x)) # key: (customer_id, date) | 'Group full elog by customer and date' >> beam.GroupByKey() | beam.Map(c.merge_full_elog_by_customer_and_date) # (customer_id, # date_str, date, # sales) ) min_max_dates = ( full_elog_merged | beam.Map(lambda x: x[2]) # date | beam.CombineGlobally(c.MinMaxDatesFn()) | beam.Map(c.min_max_dates_dict)) limits_dates = (min_max_dates | beam.FlatMap(c.limit_dates_boundaries, pvalue.AsSingleton(options))) cohort = (full_elog_merged | beam.FlatMap(c.filter_customers_in_cohort, pvalue.AsSingleton(limits_dates)) | 'Distinct Customer IDs in Cohort' >> util.Distinct()) cohort_count = ( cohort | 'Count cohort entries' >> beam.combiners.Count.Globally()) cohort_set = (cohort | beam.Map(lambda x: (x, 1))) all_customer_ids = ( full_elog_merged | beam.Map(lambda x: x[0]) # key: customer_id | 'Distinct all Customer IDs' >> util.Distinct()) all_customer_ids_count = ( all_customer_ids | 'Count all customers' >> beam.combiners.Count.Globally()) num_customers = ( pipeline | 'Create single elem Stream I' >> beam.Create([1]) | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count), pvalue.AsSingleton(all_customer_ids_count), pvalue.AsSingleton(options))) cal_hol_elog = (full_elog_merged | beam.FlatMap(c.filter_cohort_records_in_cal_hol, pvalue.AsDict(cohort_set), pvalue.AsSingleton(limits_dates))) cal_hol_elog_count = ( cal_hol_elog | 'Count cal hol elog entries' >> beam.combiners.Count.Globally()) calibration = (cal_hol_elog | beam.FlatMap(c.filter_records_in_calibration, pvalue.AsSingleton(limits_dates))) num_txns_total = ( full_elog_merged | beam.FlatMap(c.filter_records_in_cal_hol, pvalue.AsSingleton(limits_dates)) | 'Count num txns total' >> beam.combiners.Count.Globally()) num_txns = (pipeline | 'Create single elem Stream II' >> beam.Create([1]) | beam.FlatMap(c.count_txns, pvalue.AsSingleton(cal_hol_elog_count), pvalue.AsSingleton(num_txns_total), pvalue.AsSingleton(options))) calcbs = ( calibration | beam.Map(lambda x: (x[0], x)) | 'Group calibration elog by customer id' >> beam.GroupByKey() | beam.FlatMap( c.create_cal_cbs, pvalue.AsSingleton(options), pvalue.AsSingleton(limits_dates) ) # (customer_id, number_of_transactions, average_order_value, # frequency, recency, total_time_observed) ) first_transaction_dates_by_customer = ( cal_hol_elog | beam.Map(lambda x: (x[0], x)) # customer_id | 'Group cal hol elog by customer id' >> beam.GroupByKey() | beam.Map(lambda x: (x[0], 
min(map(operator.itemgetter(2), x[1]))) ) # item 2 -> date ) cal_hol_elog_repeat = ( cal_hol_elog | beam.FlatMap(c.filter_first_transaction_date_records, pvalue.AsDict(first_transaction_dates_by_customer)) | beam.FlatMap( c.calculate_time_unit_numbers, # (customer_id, date, # time_unit_number) pvalue.AsSingleton(options), pvalue.AsSingleton(limits_dates)) | beam.Map(lambda x: (x[2], 1)) # key: time_unit_number | 'Group cal hol elog repeat by time unit number' >> beam.GroupByKey() | beam.Map(lambda x: (x[0], sum(x[1]))) # (time_unit_number, occurrences) ) repeat_tx = ( pipeline | 'Create single elem Stream III' >> beam.Create([1]) | beam.FlatMap(c.calculate_cumulative_repeat_transactions, pvalue.AsIter(cal_hol_elog_repeat) ) # (time_unit_number, repeat_transactions, # repeat_transactions_cumulative) ) model_validation = ( pipeline | 'Create single elem Stream IV' >> beam.Create([1]) | beam.FlatMap( c.calculate_model_fit_validation, pvalue.AsSingleton(options), pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs), pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns))) _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape)) _ = (model_validation | beam.Map(lambda x: x[0]) | beam.FlatMap(c.calculate_model_fit_validation_to_text, pvalue.AsSingleton(options))) fullcbs_without_extra_dimension = ( full_elog_merged | beam.Map(lambda x: (x[0], x)) # key: customer_id | 'Group full merged elog by customer id' >> beam.GroupByKey() | beam.FlatMap( c.create_fullcbs, pvalue.AsSingleton(options), pvalue.AsSingleton(min_max_dates) ) # (customer_id, number_of_transactions, historical_aov, # frequency, recency, total_time_observed) ) full_elog_if_extra_dimension = ( full_elog | 'Discard records if no extra dimension' >> beam.FlatMap( c.discard_if_no_extra_dimension, pvalue.AsSingleton(options))) extra_dimensions_stats = ( full_elog_if_extra_dimension | beam.Map(lambda x: ( (x[0], x[4]), x)) # key: (customer_id, extra_dimension) | 'Group full elog by customer id and extra dimension' >> beam.GroupByKey() | beam.Map( c.create_extra_dimensions_stats ) # (customer_id, extra_dimension, dimension_count, tot_sales, # max_dimension_date) ) top_dimension_per_customer = ( extra_dimensions_stats | beam.Map(lambda x: (x[0], x)) # customer_id | 'Group extra dimension stats by customer id' >> beam.GroupByKey() | beam.Map( c.extract_top_extra_dimension ) # (customer_id, extra_dimension, dimension_count, tot_sales, # max_dimension_date) ) customer_dimension_map = ( top_dimension_per_customer | beam.Map(lambda x: (x[0], x[1])) # (customer_id, extra_dimension) ) prediction = ( pipeline | 'Create single elem Stream V' >> beam.Create([1]) | beam.FlatMap( c.calculate_prediction, pvalue.AsSingleton(options), pvalue.AsIter(fullcbs_without_extra_dimension), pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns) ) # [customer_id, p_alive, predicted_purchases, future_aov, # historical_aov, expected_value, frequency, recency, # total_time_observed], prediction_params ) prediction_by_customer_no_segments_no_extra_dimension = ( prediction | beam.FlatMap(lambda x: x[0]) # Extract predictions by customer ) prediction_by_customer_no_segments = ( prediction_by_customer_no_segments_no_extra_dimension | beam.FlatMap( c.add_top_extra_dimension_to_fullcbs, pvalue.AsSingleton(options), pvalue.AsDict(customer_dimension_map) ) # [customer_id, p_alive, predicted_purchases, future_aov # historical_aov, expected_value, frequency, recency, # total_time_observed, extra_dimension?] 
) _ = ( prediction | beam.Map(lambda x: x[1]) # Extract predictions params | beam.FlatMap(c.calculate_prediction_to_text, pvalue.AsSingleton(options))) num_rows = (full_elog_merged | 'Count num rows in full elog merged' >> beam.combiners.Count.Globally()) segment_predictions_exact = ( pipeline | 'Create single elem Stream VII' >> beam.Create([1]) | beam.FlatMap( lambda _, rows_count: [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD], pvalue.AsSingleton(num_rows))) sharded_cust_predictions_no_segments_exact, \ sharded_cust_predictions_no_segments_hash = ( prediction_by_customer_no_segments | beam.FlatMap( c.prediction_sharded, pvalue.AsSingleton(options), pvalue.AsSingleton(segment_predictions_exact) ) # [customer_id, p_alive, predicted_purchases, future_aov, # historical_aov, expected_value, frequency, recency, # total_time_observed, extra_dimension?] | beam.Partition(lambda x, _: 0 if x[1] else 1, 2) ) # BEGIN of "exact" branch prediction_by_customer_exact = ( pipeline | 'Create single elem Stream VIII' >> beam.Create([1]) | beam.FlatMap( c.split_in_ntiles_exact, pvalue.AsSingleton(options), pvalue.AsIter(sharded_cust_predictions_no_segments_exact )) # [customer_id, p_alive, predicted_purchases, # future_aov, historical_aov, expected_value, # frequency, recency, total_time_observed, # segment, extra_dimension?] ) # END of "exact" branch # BEGIN of "hash" branch customer_count_by_expected_value = ( sharded_cust_predictions_no_segments_hash | beam.Map(lambda x: (x[0][5], 1)) # (expected_value, 1) | 'Group customer predictions by expected value' >> beam.GroupByKey() | beam.Map(lambda x: (x[0], sum(x[1]))) # expected_value, customers_count ) hash_segment_limits = ( pipeline | 'Create single elem Stream IX' >> beam.Create([1]) | beam.FlatMap(c.expected_values_segment_limits, pvalue.AsSingleton(options), pvalue.AsIter(customer_count_by_expected_value), pvalue.AsSingleton(all_customer_ids_count))) prediction_by_customer_hash = ( sharded_cust_predictions_no_segments_hash | beam.Map(lambda x: x[0]) | beam.FlatMap(c.split_in_ntiles_hash, pvalue.AsSingleton(hash_segment_limits) ) # [customer_id, p_alive, predicted_purchases, # future_aov, historical_aov, expected_value, # frequency, recency, total_time_observed, # segment, extra_dimension?] 
) # END of "hash" branch prediction_by_customer = ( # only one of these two streams will contains values (prediction_by_customer_exact, prediction_by_customer_hash) | beam.Flatten()) _ = (prediction_by_customer | beam.FlatMap( lambda x, opts: [x + ['']] if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x], pvalue.AsSingleton(options)) | 'prediction_by_customer to CSV line' >> beam.Map( c.list_to_csv_line) | 'Write prediction_by_customer' >> beam.io.WriteToText( getattr(runtime_options, c._OPTION_OUTPUT_FOLDER), header='customer_id,p_alive' ',predicted_purchases' ',future_aov,historical_aov' ',expected_value,frequency,recency' ',total_time_observed,segment' ',extra_dimension', shard_name_template='', num_shards=1, file_name_suffix='prediction_by_customer.csv')) prediction_summary_temp = ( prediction_by_customer | beam.Map(lambda x: (x[9], x)) # key: segment | 'Group customer predictions by segment' >> beam.GroupByKey() | beam.FlatMap( c.generate_prediction_summary, pvalue.AsSingleton( options)) # (segment, average_retention_probability, # average_predicted_customer_value, # average_predicted_order_value, # average_predicted_purchases, total_customer_value, # number_of_customers) ) tot_equity = ( prediction_summary_temp | beam.Map(lambda x: x[5]) # total_customer_value | beam.CombineGlobally(sum)) prediction_summary = ( prediction_summary_temp | beam.FlatMap( c.calculate_perc_of_total_customer_value, pvalue.AsSingleton(tot_equity), pvalue.AsSingleton( options)) # (segment, average_retention_probability, # average_predicted_customer_value, # average_predicted_order_value, # average_predicted_purchases, # total_customer_value, number_of_customers, # perc_of_total_customer_value) ) _ = (prediction_summary | 'prediction_summary to CSV line' >> beam.Map(c.list_to_csv_line) | 'Write prediction_summary' >> beam.io.WriteToText( getattr(runtime_options, c._OPTION_OUTPUT_FOLDER), header='segment,average_retention_probability' ',average_predicted_customer_value' ',average_predicted_order_value,average_predicted_purchases' ',total_customer_value,number_of_customers' ',perc_of_total_customer_value', shard_name_template='', num_shards=1, file_name_suffix='prediction_summary.csv')) prediction_summary_extra_dimension = ( prediction_by_customer | 'Discard prediction if there is not extra dimension' >> beam.FlatMap(c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)) | beam.Map(lambda x: (x[10], x)) # extra dimension | 'Group customer predictions by extra dimension' >> beam.GroupByKey() | beam.FlatMap(c.generate_prediction_summary_extra_dimension, pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(options))) _ = (prediction_summary_extra_dimension | 'prediction_summary_extra_dimension to CSV line' >> beam.Map( c.list_to_csv_line) | 'Write prediction_summary_extra_dimension' >> beam.io.WriteToText( getattr(runtime_options, c._OPTION_OUTPUT_FOLDER), header='extra_dimension,average_retention_probability' ',average_predicted_customer_value' ',average_predicted_order_value' ',average_predicted_purchases,total_customer_value' ',number_of_customers,perc_of_total_customer_value', shard_name_template='', num_shards=1, file_name_suffix='prediction_summary_extra_dimension.csv'))
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() # Add the arguments needed for this specific Dataflow job. parser.add_argument( '--input', dest='input', required=True, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.') parser.add_argument('--output', dest='output', required=True, help='Output BQ table to write results to.') parser.add_argument('--delimiter', dest='delimiter', required=False, help='Delimiter to split input records.', default=',') parser.add_argument('--fields', dest='fields', required=True, help='Comma separated list of field names.') parser.add_argument('--load_dt', dest='load_dt', required=True, help='Load date in YYYY-MM-DD format.') known_args, pipeline_args = parser.parse_known_args(argv) row_transformer = RowTransformer(delimiter=known_args.delimiter, header=known_args.fields, filename=ntpath.basename( known_args.input), load_dt=known_args.load_dt) p_opts = pipeline_options.PipelineOptions(pipeline_args) # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information including where Dataflow should # store temp files, and what the project id is. with beam.Pipeline(options=p_opts) as pipeline: # Read the file. This is the source of the pipeline. All further # processing starts with lines read from the file. We use the input # argument from the command line. rows = pipeline | "Read from text file" >> beam.io.ReadFromText( known_args.input) # This stage of the pipeline translates from a delimited single row # input to a dictionary object consumable by BigQuery. # It refers to a function we have written. This function will # be run in parallel on different workers using input from the # previous stage of the pipeline. dict_records = rows | "Convert to BigQuery row" >> beam.Map( lambda r: row_transformer.parse(r)) # This stage of the pipeline writes the dictionary records into # an existing BigQuery table. The sink is also configured to truncate # the table if it contains any existing records. dict_records | "Write to BigQuery" >> beam.io.Write( beam.io.BigQuerySink( known_args.output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
    def add_input(self, sum_count, input):
        (sum) = sum_count
        return sum + input

    def merge_accumulators(self, accumulators):
        # Accumulators are plain running sums, so merging is just adding them up.
        return sum(accumulators)

    def extract_output(self, sum_count):
        (sum) = sum_count
        return {'Sum': sum}

    def default_label(self):
        return self.__class__.__name__


pipeline_options = opt.PipelineOptions()
pipeline_options.view_as(opt.StandardOptions).streaming = False

#options=pipeline_options
with beam.Pipeline() as p:  # Creates a pipeline
    lines = (p | "Create from in-memory List" >> beam.Create([  # Create
        'To be, or not to be: that is the question: ',
        'Whether \'tis nobler in the mind to suffer ',
        'The slings and arrows of outrageous fortune, ',
        'Or to take arms against a sea of troubles, ']))

    # Example 1
    msg = ["5, 5, Five Five"]
    words = p | beam.Create(["Cat", "Mouse", "Horse", "Chimpanzee", "Fish"])
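# The fragment above omits the enclosing class header and create_accumulator;
# a minimal self-contained CombineFn for a global sum might look like the
# sketch below (SumFn and example_p are illustrative names, not defined
# anywhere in this file).
import apache_beam as beam


class SumFn(beam.CombineFn):
    def create_accumulator(self):
        return 0

    def add_input(self, accumulator, element):
        return accumulator + element

    def merge_accumulators(self, accumulators):
        return sum(accumulators)

    def extract_output(self, accumulator):
        return {'Sum': accumulator}


with beam.Pipeline() as example_p:
    _ = (example_p
         | beam.Create([1, 2, 3, 4])
         | beam.CombineGlobally(SumFn())
         | beam.Map(print))  # Prints {'Sum': 10}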
def main():
    data = [
        ('a', 1, 100),
        ('b', 2, 100),
        ('c', 1, 100),
        ('d', 2, 100),
        ('e', 1, 100),
        ('f', 1, 100),
        ('g', 1, 100),
        ('h', 1, 100),
        ('i', 1, 100),
    ]

    with beam.Pipeline(options=pipeline_options.PipelineOptions()) as p:
        students = p | 'create_data' >> beam.Create(data)

        class CreateKeyValue(beam.DoFn):
            def process(self, element):
                return [(element[0], (element[1], element[2]))]

        fixed_window = (
            students
            | 'fixed_window' >> beam.WindowInto(beam.window.FixedWindows(60))
            | 'fixed_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'fixed_window_group' >> beam.GroupByKey())
        write_to_file(fixed_window, 'fixed_window')

        sliding_window = (
            students
            | 'sliding_window' >> beam.WindowInto(
                beam.window.SlidingWindows(30, 5))
            | 'sliding_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'sliding_window_group' >> beam.GroupByKey())
        write_to_file(sliding_window, 'sliding_window')

        session_window = (
            students
            | 'session_window' >> beam.WindowInto(beam.window.Sessions(10 * 60))
            | 'session_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'session_window_group' >> beam.GroupByKey())
        write_to_file(session_window, 'session_window')

        global_window = (
            students
            | 'global_window' >> beam.WindowInto(beam.window.GlobalWindows())
            | 'global_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'global_window_group' >> beam.GroupByKey())
        write_to_file(global_window, 'global_window')
def expand(self, train_and_test_datasets): """Apply the transform. Args: train_and_test_datasets: A pair of (train, test) PCollections of json strings representing Example Protos Returns: A 2-tuple of A PCollection with a single TrainedModel, suitable for used by Predict A PCollection with a single TrainingJobResult that describes the result of training. Raises: ValueError: If the arguments are invalid. """ train_dataset, test_dataset = train_and_test_datasets pipeline = train_dataset.pipeline # For the job name use a combination of the transform label and a # datestamp. The datestamp is intended to make it unique. now = datetime.datetime.now() # We add some salt to the job name to avoid collisions if we try to submit # multiple jobs at the same time. # N.B. The job_name is fixed at pipeline construction time. This is # critical because multiple invocation of the Train transform (e.g. because # of retries) need to use the same job name. salt = '%04x' % random.getrandbits(4 * 4) # TODO(b/28989568): We need to lower case the name because the backend # only allows lower case letters for job names. The backend should probably # do this automatically but currently it doesn't. job_name = '{0}_{1}_{2}'.format(self.label, now.strftime('%y%m%d_%H%M%S'), salt).lower() options = pipeline.options # TODO(b/29163051) Options can be None depending on how the runner was # constructed. if options is None: options = df_options.PipelineOptions() cloud_options = options.view_as(df_options.GoogleCloudOptions) run_on_cloud = self.use_cloud_ml if run_on_cloud is None: # TODO(user): Remove the fallback after the next Dataflow release. try: dataflow_runner = beam.runners.DataflowRunner except AttributeError: dataflow_runner = beam.runners.DataflowPipelineRunner # Choose a default based on the runner. if isinstance(pipeline.runner, dataflow_runner): run_on_cloud = True else: run_on_cloud = False if self.output_dir: temp_dir = self.output_dir elif run_on_cloud: cloud_options = options.view_as(df_options.GoogleCloudOptions) if cloud_options.temp_location: temp_dir = os.path.join(cloud_options.temp_location, job_name) elif cloud_options.staging_location: temp_dir = os.path.join(cloud_options.staging_location, job_name) else: raise ValueError( '--staging_location must be specified to run in the cloud') else: temp_dir = tempfile.mkdtemp(job_name) logging.info('Temp dir: %s', temp_dir) if run_on_cloud: train_do = ml_func.TrainingJobDo() project = cloud_options.project else: train_do = ml_func._TrainingJobLocalDo() # pylint: disable=protected-access project = None _ = train_dataset | dfutil.CountPCollection('ml-train-input') # Write the train and test data to files so we can pass it to the trainer. train_data_path = os.path.join(temp_dir, 'training') test_data_path = os.path.join(temp_dir, 'testing') output_dir = os.path.join(temp_dir, 'model') # TODO(b/34839956) Make sure we can handle the tf.Transform metadata. metadata_path = os.path.join(output_dir, 'metadata.json') # This PTransform is primarily to avoid stage name collisions in writing # training and test data. # TODO(user): Figure out why i_type @beam.ptransform_fn breaks pickling. 
train_files = ( train_dataset | 'WriteTrainData' >> ml_func._WrapCallable( # pylint: disable=protected-access self.tf_main_spec.write_input_data, train_data_path)) test_files = ( test_dataset | 'WriteTestData' >> ml_func._WrapCallable( # pylint: disable=protected-access self.tf_main_spec.write_input_data, test_data_path)) if self.metadata: metadata_files = self.metadata | SaveMetadata(metadata_path) else: metadata_files = pipeline | beam.Create([None]) # Construct and run the training job. train_request = self.tf_main_spec.train_request.copy() if not train_request.package_uris: train_request.package_uris = [] if self.package_uris: if isinstance(self.package_uris, basestring): train_request.package_uris.extend([self.package_uris]) else: train_request.package_uris.extend(self.package_uris) # remove duplicates from train_request train_request.package_uris = list(set(train_request.package_uris)) train_request.job_args = self.job_args or [] if self.python_module: train_request.python_module = self.python_module if not train_request.project: train_request.parent = project if not train_request.job_name: train_request.job_name = job_name if not train_request.endpoint: train_request.endpoint = self.cloud_ml_endpoint if not train_request.hyperparameters: train_request.hyperparameters = self.hyperparameters if not train_request.region: train_request.region = self.region if not train_request.scale_tier: train_request.scale_tier = self.scale_tier if not train_request.worker_count: train_request.worker_count = self.worker_count if not train_request.ps_count: train_request.ps_count = self.ps_count if not train_request.worker_type: train_request.worker_type = self.worker_type if not train_request.ps_type: train_request.ps_type = self.ps_type if not train_request.master_type: train_request.master_type = self.master_type if not train_request.runtime_version: train_request.runtime_version = self.runtime_version requests = ( pipeline | 'CreateRequest' >> beam.Create([train_request]) | 'AugmentTrainingArgs' >> beam.ParDo( ml_func._AugmentTrainArgsDo( # pylint: disable=protected-access self.tf_main_spec), beam.pvalue.AsIter(train_files), beam.pvalue.AsIter(test_files), output_dir, beam.pvalue.AsSingleton(metadata_files))) train_results = requests | 'TrainModel' >> beam.ParDo(train_do) # Read and return the model directory and training results. model_directory = ( train_results | 'CreateModel' >> beam.Map(self.tf_main_spec.read_model, output_dir, self.export_subdir)) return model_directory, train_results
def test_read_messages_timestamp_attribute_missing( mocker, patch_sub_client, patch_msg_manager, ): exp_entity_id = "entity_id" kmsg = klio_pb2.KlioMessage() kmsg.data.element = bytes(exp_entity_id, "utf-8") data = kmsg.SerializeToString() attributes = {} publish_time_secs = 1520861821 publish_time_nanos = 234567000 publish_time = "2018-03-12T13:37:01.234567Z" ack_id = "ack_id" pull_response = beam_test_utils.create_pull_response([ beam_test_utils.PullResponseMessage(data, attributes, publish_time_secs, publish_time_nanos, ack_id) ]) pmsg = b_pubsub.PubsubMessage(data, attributes) expected_elements = [ beam_testing_util.TestWindowedValue( pmsg, beam_utils.timestamp.Timestamp.from_rfc3339(publish_time), [beam_transforms.window.GlobalWindow()], ), ] patch_sub_client.pull.return_value = pull_response options = pipeline_options.PipelineOptions([]) options.view_as(pipeline_options.StandardOptions).streaming = True with beam_test_pipeline.TestPipeline(options=options) as p: pcoll = p | b_pubsub.ReadFromPubSub( "projects/fakeprj/topics/a_topic", None, None, with_attributes=True, timestamp_attribute="nonexistent", ) # Check original functionality that was kept the same beam_testing_util.assert_that( pcoll, beam_testing_util.equal_to(expected_elements), reify_windows=True, ) # Check overridden functionality: # 1. Check that auto-acking is skipped patch_sub_client.acknowledge.assert_not_called() # 2. Check that MessageManager daemon threads were started patch_msg_manager.assert_called_once_with( patch_sub_client.subscription_path()) # 3. Check that messages were added to the MessageManager patch_msg_manager.return_value.add.assert_called_once_with(ack_id, pmsg) # 4. Check that one message is handled at a time, instead of the # original 10 patch_sub_client.pull.assert_called_once_with(mocker.ANY, max_messages=1, return_immediately=True) patch_sub_client.api.transport.channel.close.assert_called_once_with()
def populate_currency_dim(): # UDM table UDM_table_spec_currency = bigquery.TableReference(projectId=PROJECT, datasetId=DATASET_UDM, tableId='currency') # Consumption tables CONS_table_spec_currency_dim = bigquery.TableReference( projectId=PROJECT, datasetId=DATASET_CONS, tableId='currency_dim') currency_schema = ({ 'fields': [{ 'name': "CURRENCY_KEY", 'type': 'INTEGER', 'mode': 'REQUIRED' }, { 'name': "CRNCY_CDE", 'type': 'STRING', 'mode': 'REQUIRED' }, { 'name': "CRNCY_NAME", 'type': "STRING", 'mode': "REQUIRED" }, { 'name': "DEL_REC_IND", 'type': "BOOLEAN", 'mode': "REQUIRED" }, { 'name': "ACTV_REC_IND", 'type': "BOOLEAN", 'mode': "REQUIRED" }, { 'name': "DCML_ADJ_NUM", 'type': "INTEGER", 'mode': "REQUIRED" }, { 'name': "REC_CREAT_DT_TM", 'type': "TIMESTAMP", 'mode': "REQUIRED" }, { 'name': "REC_UPDT_DT_TM", 'type': "TIMESTAMP", 'mode': "NULLABLE" }] }) currency_query = 'SELECT CURRENCY_ID as CURRENCY_KEY, CRNCY_CDE, CRNCY_NAME, DEL_REC_IND, ACTV_REC_IND, DCML_ADJ_NUM, REC_CREAT_DT_TM, REC_UPDT_DT_TM '\ 'FROM [famous-store-237108:UDM.currency]' # Build and run the pipeline pipeline_options = opt.PipelineOptions( ) # This is deprecated, not future proof. Replacement TBA pipeline_options.view_as( opt.StandardOptions).streaming = False # Set options first google_cloud_options = pipeline_options.view_as(opt.GoogleCloudOptions) google_cloud_options.project = PROJECT google_cloud_options.job_name = 'loadcurrency' google_cloud_options.staging_location = 'gs://csacsi/staging' google_cloud_options.temp_location = 'gs://csacsi/temp' google_cloud_options.region = 'europe-west1' with beam.Pipeline(options=pipeline_options ) as pcoll: # Creates a pipeline, PCollection instance #rows = pcoll | "Read from UDM.location" >> beam.io.Read (beam.io.BigQuerySource (UDM_table_spec_location)) # Read from UDM.location rows = pcoll | "Read from UDM.currency" >> beam.io.Read( beam.io.BigQuerySource( query=currency_query)) # Read from UDM.location #rows = rows | beam.ParDo (id2key ()) rows | "Write to Consumption.Currency_dim" >> beam.io.WriteToBigQuery( CONS_table_spec_currency_dim, schema=currency_schema, # schema variable (list) could be used create_disposition=BigQueryDisposition.CREATE_NEVER, write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
def run(argv=None): parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', required=True, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.') parser.add_argument('--output', dest='output', required=True, help='Output BQ table to write results to.') parser.add_argument('--load_dt', dest='load_dt', required=True, help='Load date in YYYY-MM-DD format.') known_args, pipeline_args = parser.parse_known_args(argv) row_transformer = RowTransformer(filename=ntpath.basename( known_args.input), load_dt=known_args.load_dt) p_opts = pipeline_options.PipelineOptions(pipeline_args) with beam.Pipeline(options=p_opts) as pipeline: rows = pipeline | "Read from text file" >> beam.io.ReadFromText( known_args.input) dict_records = rows | "Convert to BigQuery row" >> beam.Map( lambda r: row_transformer.parse(r)) bigquery_table_schema = { "fields": [{ "mode": "NULLABLE", "name": "BackorderOrderID", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "Comments", "type": "STRING" }, { "mode": "NULLABLE", "name": "ContactPersonID", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "CustomerID", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "CustomerPurchaseOrderNumber", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "DeliveryInstructions", "type": "STRING" }, { "mode": "NULLABLE", "name": "ExpectedDeliveryDate", "type": "DATE" }, { "mode": "NULLABLE", "name": "InternalComments", "type": "STRING" }, { "mode": "NULLABLE", "name": "IsUndersupplyBackordered", "type": "BOOLEAN" }, { "mode": "NULLABLE", "name": "LastEditedBy", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "LastEditedWhen", "type": "TIMESTAMP" }, { "mode": "NULLABLE", "name": "OrderDate", "type": "DATE" }, { "mode": "NULLABLE", "name": "OrderID", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "PickedByPersonID", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "PickingCompletedWhen", "type": "TIMESTAMP" }, { "mode": "NULLABLE", "name": "SalespersonPersonID", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "filename", "type": "STRING" }, { "mode": "NULLABLE", "name": "load_dt", "type": "DATE" }] } dict_records | "Write to BigQuery" >> beam.io.WriteToBigQuery( known_args.output, schema=bigquery_table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
def populate_country_dim(): # UDM table UDM_table_spec_location = bigquery.TableReference(projectId=PROJECT, datasetId=DATASET_UDM, tableId='location') # Consumption table CONS_table_spec_country_dim = bigquery.TableReference( projectId=PROJECT, datasetId=DATASET_CONS, tableId='country_dim') country_schema = ({ 'fields': [{ 'name': "LOCATION_KEY", 'type': 'INTEGER', 'mode': 'REQUIRED' }, { 'name': "CTRY_ISO2_CDE", 'type': 'STRING', 'mode': 'REQUIRED' }, { 'name': "CTRY_ISO3_CDE", 'type': "STRING", 'mode': "REQUIRED" }, { 'name': "CTRY_NAME", 'type': "STRING", 'mode': "REQUIRED" }, { 'name': "REGION_NAME", 'type': "STRING", 'mode': "REQUIRED" }, { 'name': "CPTL_CITY_NAME", 'type': "STRING", 'mode': "REQUIRED" }, { 'name': "DEL_REC_IND", 'type': "BOOLEAN", 'mode': "REQUIRED" }, { 'name': "ACTV_REC_IND", 'type': "BOOLEAN", 'mode': "REQUIRED" }, { 'name': "REC_CREAT_DT_TM", 'type': "TIMESTAMP", 'mode': "REQUIRED" }, { 'name': "REC_UPDT_DT_TM", 'type': "TIMESTAMP", 'mode': "NULLABLE" }] }) location_query = 'SELECT LOCATION_ID as LOCATION_KEY, CTRY_ISO2_CDE, CTRY_ISO3_CDE, CTRY_NAME, REGION_NAME, CPTL_CITY_NAME, DEL_REC_IND, '\ 'ACTV_REC_IND, REC_CREAT_DT_TM, REC_UPDT_DT_TM '\ 'FROM [famous-store-237108:UDM.location]' # Build and run the pipeline pipeline_options = opt.PipelineOptions( ) # This is deprecated, not future proof. Replacement TBA pipeline_options.view_as( opt.StandardOptions).streaming = False # Set options first google_cloud_options = pipeline_options.view_as(opt.GoogleCloudOptions) google_cloud_options.project = PROJECT google_cloud_options.job_name = 'loadcountryno' google_cloud_options.staging_location = 'gs://csacsi/staging' google_cloud_options.temp_location = 'gs://csacsi/temp' google_cloud_options.region = 'europe-west1' with beam.Pipeline(options=pipeline_options ) as pcoll: # Creates a pipeline, PCollection instance #rows = pcoll | "Read from UDM.location" >> beam.io.Read (beam.io.BigQuerySource (UDM_table_spec_location)) # Read from UDM.location rows = pcoll | "Read from UDM.location" >> beam.io.Read( beam.io.BigQuerySource( query=location_query)) # Read from UDM.location ''' [Final Output PCollection] = ([Initial Input PCollection] | [First Transform] | [Second Transform] | [Third Transform]) ''' #dict_rows = rows | "Convert to dictionary" >> beam.ParDo (createDict ()) # Convert tuples returned by Oracle into dictionary needed for BigQuery #rows = rows | beam.ParDo (id2key ()) rows | "Write to Consumption.Country_dim" >> beam.io.WriteToBigQuery( CONS_table_spec_country_dim, schema=country_schema, # schema variable (list) could be used create_disposition=BigQueryDisposition.CREATE_NEVER, write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
def run(argv=None): # type: (List[str]) -> None """Runs VCF to BigQuery pipeline.""" logging.info('Command: %s', ' '.join(argv or sys.argv)) known_args, pipeline_args = pipeline_common.parse_args( argv, _COMMAND_LINE_OPTIONS) if known_args.auto_flags_experiment: _get_input_dimensions(known_args, pipeline_args) annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args) all_patterns = ([annotated_vcf_pattern] if annotated_vcf_pattern else known_args.all_patterns) variant_merger = _get_variant_merge_strategy(known_args) pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns) beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args) avro_root_path = _get_avro_root_path(beam_pipeline_options) # Starts a pipeline to merge VCF headers in beam if the total files that # match the input pattern exceeds _SMALL_DATA_THRESHOLD _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path, annotated_vcf_pattern) # Retrieve merged headers prior to launching the pipeline. This is needed # since the BigQuery schema cannot yet be dynamically created based on input. # See https://issues.apache.org/jira/browse/BEAM-2801. header_fields = vcf_header_parser.get_vcf_headers( known_args.representative_header_file) counter_factory = metrics_util.CounterFactory() processed_variant_factory = processed_variant.ProcessedVariantFactory( header_fields, known_args.split_alternate_allele_info_fields, known_args.allow_malformed_records, known_args.annotation_fields, known_args.use_allele_num, known_args.minimal_vep_alt_matching, known_args.infer_annotation_types, counter_factory) schema = schema_converter.generate_schema_from_header_fields( header_fields, processed_variant_factory, variant_merger, known_args.use_1_based_coordinate, known_args.include_call_name) sharding = variant_sharding.VariantSharding( known_args.sharding_config_path) if sharding.should_keep_shard(sharding.get_residual_index()): num_shards = sharding.get_num_shards() else: num_shards = sharding.get_num_shards() - 1 if known_args.update_schema_on_append: for i in range(num_shards): table_suffix = sharding.get_output_table_suffix(i) table_name = bigquery_util.compose_table_name( known_args.output_table, table_suffix) bigquery_util.update_bigquery_schema_on_append( schema.fields, table_name) pipeline = beam.Pipeline(options=beam_pipeline_options) variants = _read_variants( all_patterns, pipeline, known_args, pipeline_mode, use_1_based_coordinate=known_args.use_1_based_coordinate) if known_args.allow_malformed_records: variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants() sharded_variants = variants | 'ShardVariants' >> beam.Partition( shard_variants.ShardVariants(sharding), sharding.get_num_shards()) variants = [] for i in range(num_shards): suffix = sharding.get_output_table_suffix(i) # Convert tuples to list variants.append(sharded_variants[i]) if variant_merger: variants[i] |= ('MergeVariants' + suffix >> merge_variants.MergeVariants(variant_merger)) variants[i] |= ( 'ProcessVariants' + suffix >> beam.Map(processed_variant_factory.create_processed_variant). \ with_output_types(processed_variant.ProcessedVariant)) _ = (variants[i] | 'VariantToAvro' + suffix >> variant_to_avro.VariantToAvroFiles( avro_root_path + suffix, schema, allow_incompatible_records=known_args. 
allow_incompatible_records, omit_empty_sample_calls=known_args.omit_empty_sample_calls, null_numeric_value_replacement=( known_args.null_numeric_value_replacement), include_call_name=known_args.include_call_name)) result = pipeline.run() try: state = result.wait_until_finish() if state != beam.runners.runner.PipelineState.DONE: logging.error( 'Dataflow pipeline terminated in an unexpected state: %s', state) raise AssertionError( 'Dataflow pipeline terminated in {} state'.format(state)) except Exception as e: logging.error('Dataflow pipeline failed.') raise e else: logging.info('Dataflow pipeline finished successfully.') metrics_util.log_all_counters(result) # After pipeline is done, create output tables and load AVRO files into them. schema_file = _write_schema_to_temp_file(schema, avro_root_path) suffixes = [] try: for i in range(num_shards): suffixes.append(sharding.get_output_table_suffix(i)) partition_range_end = sharding.get_output_table_partition_range_end( i) if not known_args.append: table_name = bigquery_util.compose_table_name( known_args.output_table, suffixes[i]) partitioning.create_bq_table( table_name, schema_file, bigquery_util.ColumnKeyConstants.START_POSITION, partition_range_end) _record_newly_created_table(table_name) logging.info('Integer range partitioned table %s was created.', table_name) if not known_args.append: _record_newly_created_table( sample_info_table_schema_generator.create_sample_info_table( known_args.output_table)) suffixes.append( sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX) load_avro = avro_util.LoadAvro(avro_root_path, known_args.output_table, suffixes, False) not_empty_variant_suffixes = load_avro.start_loading() logging.info('Following tables were loaded with at least 1 row:') for suffix in not_empty_variant_suffixes: logging.info( bigquery_util.compose_table_name(known_args.output_table, suffix)) # Remove sample_info table from both lists to avoid duplicating it when # --sample_lookup_optimized_output_table flag is set suffixes.remove( sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX) if sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in\ not_empty_variant_suffixes: not_empty_variant_suffixes.remove( sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX) except Exception as e: logging.error( 'Something unexpected happened during the loading of AVRO ' 'files to BigQuery: %s', str(e)) logging.info( 'Since the write to BigQuery stage failed, we did not delete ' 'AVRO files in your GCS bucket. You can manually import them ' 'to BigQuery. 
To avoid extra storage charges, delete them if ' 'you do not need them, AVRO files are located at: %s', avro_root_path) raise e else: logging.warning('All AVRO files were successfully loaded to BigQuery.') if known_args.keep_intermediate_avro_files: logging.info( 'Since "--keep_intermediate_avro_files" flag is set, the ' 'AVRO files are kept and stored at: %s', avro_root_path) else: if bigquery_util.delete_gcs_files(avro_root_path) != 0: logging.error( 'Deletion of intermediate AVRO files located at "%s" has ' 'failed.', avro_root_path) if known_args.sample_lookup_optimized_output_table: flatten_call_column = partitioning.FlattenCallColumn( known_args.output_table, not_empty_variant_suffixes, known_args.append) try: flatten_schema_file = tempfile.mkstemp( suffix=_BQ_SCHEMA_FILE_SUFFIX)[1] if not flatten_call_column.get_flatten_table_schema( flatten_schema_file): raise ValueError('Failed to extract schema of flatten table') # Create output flatten tables if needed if not known_args.append: # Create all sample optimized tables including those that will be empty. for suffix in suffixes: output_table_id = bigquery_util.compose_table_name( known_args.sample_lookup_optimized_output_table, suffix) partitioning.create_bq_table( output_table_id, flatten_schema_file, bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID, partitioning.MAX_RANGE_END) _record_newly_created_table(output_table_id) logging.info( 'Sample lookup optimized table %s was created.', output_table_id) # Copy to flatten sample lookup tables from the variant lookup tables. # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607). flatten_call_column.copy_to_flatten_table( known_args.sample_lookup_optimized_output_table) logging.info( 'All sample lookup optimized tables are fully loaded.') except Exception as e: logging.error( 'Something unexpected happened during the loading rows to ' 'sample optimized table stage: %s', str(e)) raise e
def preprocess_data(test_mode): import shutil, os, subprocess ### Saving the job job_name = 'preprocess-credit-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S') if test_mode: ### If running in test mode, save the job locally print('Launching job in test mode:') OUTPUT_DIR = './preproc' # delete output directory if it exists shutil.rmtree(OUTPUT_DIR, ignore_errors=True) # create the directory os.makedirs(OUTPUT_DIR) else: ### If launching a Dataflow job, save the job on Google Cloud Storage (GCS) print('Launching Dataflow job {}:'.format(job_name)) OUTPUT_DIR = 'gs://{0}/credit_default/preproc/'.format(BUCKET) try: subprocess.check_call('gsutil -m rm -r {}'.format(OUTPUT_DIR).split()) except: pass ### Let's define our own Apache Beam Options: options = { 'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'), 'temp_location': os.path.join(OUTPUT_DIR, 'tmp'), 'job_name': job_name, 'region': REGION, 'project': PROJECT, 'max_num_workers': 6, 'setup_file':'directory/to/setup.py' # change this to the directory of the setup file } opts = pipeline_options.PipelineOptions(flags = [], **options) ### Choose the runner if test_mode: ### local mode RUNNER = 'DirectRunner' else: ### Dataflow RUNNER = 'DataflowRunner' p = beam.Pipeline(RUNNER, options = opts) ### Let's create the Train and Eval Datasets: query = """ SELECT ABS(FARM_FINGERPRINT(CAST(ID AS STRING))) AS hashid, LIMIT_BAL, SEX, EDUCATION, MARRIAGE, AGE, PAY_0 AS PAY_1, PAY_2, PAY_3, PAY_4, PAY_5, PAY_6, CAST(BILL_AMT1 AS FLOAT64) AS BILL_AMT1, CAST(BILL_AMT2 AS FLOAT64) AS BILL_AMT2, CAST(BILL_AMT3 AS FLOAT64) AS BILL_AMT3, CAST(BILL_AMT4 AS FLOAT64) AS BILL_AMT4, CAST(BILL_AMT5 AS FLOAT64) AS BILL_AMT5, CAST(BILL_AMT6 AS FLOAT64) AS BILL_AMT6, CAST(PAY_AMT1 AS FLOAT64) AS PAY_AMT1, CAST(PAY_AMT2 AS FLOAT64) AS PAY_AMT2, CAST(PAY_AMT3 AS FLOAT64) AS PAY_AMT3, CAST(PAY_AMT4 AS FLOAT64) AS PAY_AMT4, CAST(PAY_AMT5 AS FLOAT64) AS PAY_AMT5, CAST(PAY_AMT6 AS FLOAT64) AS PAY_AMT6, CAST(default_payment_next_month AS INT64) AS default_payment FROM `credit-default-277316.credit_default.credit_default` """ if test_mode: query = query + ' LIMIT 100' for step in ['train', 'eval']: if step == 'train': selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashid),5) < 4'.format(query) else: selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashid),5) = 4'.format(query) (p | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query = selquery, use_standard_sql = True)) | '{}_csv'.format(step) >> beam.FlatMap(to_csv) | '{}_out'.format(step) >> beam.io.Write(beam.io.WriteToText( os.path.join(OUTPUT_DIR, '{}.csv'.format(step)))) ) job = p.run() if test_mode: job.wait_until_finish() print("Done!")
def run(argv=None): """Main function. Main function containing the Apache Beam pipeline describing how to process the input CSV file to generate the LTV predictions. """ parser = argparse.ArgumentParser() _, pipeline_args = parser.parse_known_args(argv) options = pipeline_options.PipelineOptions(pipeline_args) runtime_options = options.view_as(RuntimeOptions) with beam.Pipeline(options=options) as pipeline: options = (pipeline | 'Create single element Stream containing options dict' >> beam.Create([options.get_all_options()]) | beam.Map( lambda x: { k: v.get() if isinstance( v, value_provider.ValueProvider) else v for (k, v) in x.items() }) | beam.Map(c.set_extra_options)) full_elog = ( pipeline | bq_mod.ReadFromBigQuery( project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT), query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY), gcs_location=getattr(runtime_options, c._OPTION_TEMP_GCS_LOCATION), use_standard_sql=True) | beam.FlatMap( c.bq_row_to_list, pvalue.AsSingleton(options)) # (customer_id, date_str, date, # sales, extra_dimension?) ) full_elog_merged = ( full_elog | beam.Filter(lambda x: x[3] > 0) # sales > 0 | beam.Map(lambda x: ((x[0], x[1]), x)) # key: (customer_id, date) | 'Group full elog by customer and date' >> beam.GroupByKey() | beam.Map(c.merge_full_elog_by_customer_and_date) # (customer_id, # date_str, date, # sales) ) min_max_dates = ( full_elog_merged | beam.Map(lambda x: x[2]) # date | beam.CombineGlobally(c.MinMaxDatesFn()) | beam.Map(c.min_max_dates_dict)) limits_dates = (min_max_dates | beam.FlatMap(c.limit_dates_boundaries, pvalue.AsSingleton(options))) cohort = (full_elog_merged | beam.FlatMap(c.filter_customers_in_cohort, pvalue.AsSingleton(limits_dates)) | 'Distinct Customer IDs in Cohort' >> util.Distinct()) cohort_count = ( cohort | 'Count cohort entries' >> beam.combiners.Count.Globally()) cohort_set = (cohort | beam.Map(lambda x: (x, 1))) all_customer_ids = ( full_elog_merged | beam.Map(lambda x: x[0]) # key: customer_id | 'Distinct all Customer IDs' >> util.Distinct()) all_customer_ids_count = ( all_customer_ids | 'Count all customers' >> beam.combiners.Count.Globally()) num_customers = ( pipeline | 'Create single elem Stream I' >> beam.Create([1]) | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count), pvalue.AsSingleton(all_customer_ids_count), pvalue.AsSingleton(options))) cal_hol_elog = (full_elog_merged | beam.FlatMap(c.filter_cohort_records_in_cal_hol, pvalue.AsDict(cohort_set), pvalue.AsSingleton(limits_dates))) cal_hol_elog_count = ( cal_hol_elog | 'Count cal hol elog entries' >> beam.combiners.Count.Globally()) calibration = (cal_hol_elog | beam.FlatMap(c.filter_records_in_calibration, pvalue.AsSingleton(limits_dates))) num_txns_total = ( full_elog_merged | beam.FlatMap(c.filter_records_in_cal_hol, pvalue.AsSingleton(limits_dates)) | 'Count num txns total' >> beam.combiners.Count.Globally()) num_txns = (pipeline | 'Create single elem Stream II' >> beam.Create([1]) | beam.FlatMap(c.count_txns, pvalue.AsSingleton(cal_hol_elog_count), pvalue.AsSingleton(num_txns_total), pvalue.AsSingleton(options))) calcbs = ( calibration | beam.Map(lambda x: (x[0], x)) | 'Group calibration elog by customer id' >> beam.GroupByKey() | beam.FlatMap( c.create_cal_cbs, pvalue.AsSingleton(options), pvalue.AsSingleton(limits_dates) ) # (customer_id, number_of_transactions, average_order_value, # frequency, recency, total_time_observed) ) first_transaction_dates_by_customer = ( cal_hol_elog | beam.Map(lambda x: (x[0], x)) # customer_id | 'Group cal hol 
elog by customer id' >> beam.GroupByKey() | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1]))) ) # item 2 -> date ) cal_hol_elog_repeat = ( cal_hol_elog | beam.FlatMap(c.filter_first_transaction_date_records, pvalue.AsDict(first_transaction_dates_by_customer)) | beam.FlatMap( c.calculate_time_unit_numbers, # (customer_id, date, # time_unit_number) pvalue.AsSingleton(options), pvalue.AsSingleton(limits_dates)) | beam.Map(lambda x: (x[2], 1)) # key: time_unit_number | 'Group cal hol elog repeat by time unit number' >> beam.GroupByKey() | beam.Map(lambda x: (x[0], sum(x[1]))) # (time_unit_number, occurrences) ) repeat_tx = ( pipeline | 'Create single elem Stream III' >> beam.Create([1]) | beam.FlatMap(c.calculate_cumulative_repeat_transactions, pvalue.AsIter(cal_hol_elog_repeat) ) # (time_unit_number, repeat_transactions, # repeat_transactions_cumulative) ) model_validation = ( pipeline | 'Create single elem Stream IV' >> beam.Create([1]) | beam.FlatMap( c.calculate_model_fit_validation, pvalue.AsSingleton(options), pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs), pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns))) _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape)) _ = (model_validation | beam.Map(lambda x: x[0]) | 'Write to validation_params table' >> io.WriteToBigQuery( table=c.TableValueProvider( getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT), getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET), 'validation_params'), custom_gcs_temp_location=getattr(runtime_options, c._OPTION_TEMP_GCS_LOCATION), validate=False, schema={ 'fields': [{ 'name': 'calibration_start_date', 'type': 'STRING' }, { 'name': 'calibration_end_date', 'type': 'STRING' }, { 'name': 'cohort_start_date', 'type': 'STRING' }, { 'name': 'cohort_end_date', 'type': 'STRING' }, { 'name': 'holdout_end_date', 'type': 'STRING' }, { 'name': 'model_time_granularity', 'type': 'STRING' }, { 'name': 'model', 'type': 'RECORD', 'fields': [ { 'name': 'frequency_model', 'type': 'STRING' }, { 'name': 'num_customers_cohort', 'type': 'INTEGER' }, { 'name': 'perc_customers_cohort', 'type': 'FLOAT' }, { 'name': 'num_transactions_validation', 'type': 'INTEGER' }, { 'name': 'perc_transactions_validation', 'type': 'FLOAT' }, { 'name': 'validation_mape', 'type': 'STRING' }, ] }] }, write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED)) fullcbs_without_extra_dimension = ( full_elog_merged | beam.Map(lambda x: (x[0], x)) # key: customer_id | 'Group full merged elog by customer id' >> beam.GroupByKey() | beam.FlatMap( c.create_fullcbs, pvalue.AsSingleton(options), pvalue.AsSingleton(min_max_dates) ) # (customer_id, number_of_transactions, historical_aov, # frequency, recency, total_time_observed) ) full_elog_if_extra_dimension = ( full_elog | 'Discard records if no extra dimension' >> beam.FlatMap( c.discard_if_no_extra_dimension, pvalue.AsSingleton(options))) extra_dimensions_stats = ( full_elog_if_extra_dimension | beam.Map(lambda x: ( (x[0], x[4]), x)) # key: (customer_id, extra_dimension) | 'Group full elog by customer id and extra dimension' >> beam.GroupByKey() | beam.Map( c.create_extra_dimensions_stats ) # (customer_id, extra_dimension, dimension_count, tot_sales, # max_dimension_date) ) top_dimension_per_customer = ( extra_dimensions_stats | beam.Map(lambda x: (x[0], x)) # customer_id | 'Group extra dimension stats by customer id' >> beam.GroupByKey() | beam.Map( c.extract_top_extra_dimension ) # 
(customer_id, extra_dimension, dimension_count, tot_sales, # max_dimension_date) ) customer_dimension_map = ( top_dimension_per_customer | beam.Map(lambda x: (x[0], x[1])) # (customer_id, extra_dimension) ) prediction = ( pipeline | 'Create single elem Stream V' >> beam.Create([1]) | beam.FlatMap( c.calculate_prediction, pvalue.AsSingleton(options), pvalue.AsIter(fullcbs_without_extra_dimension), pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns) ) # [customer_id, p_alive, predicted_purchases, future_aov, # historical_aov, expected_value, frequency, recency, # total_time_observed], prediction_params ) prediction_by_customer_no_segments_no_extra_dimension = ( prediction | beam.FlatMap(lambda x: x[0]) # Extract predictions by customer ) prediction_by_customer_no_segments = ( prediction_by_customer_no_segments_no_extra_dimension | beam.FlatMap( c.add_top_extra_dimension_to_fullcbs, pvalue.AsSingleton(options), pvalue.AsDict(customer_dimension_map) ) # [customer_id, p_alive, predicted_purchases, future_aov # historical_aov, expected_value, frequency, recency, # total_time_observed, extra_dimension?] ) _ = ( prediction | beam.Map(lambda x: x[1]) # Extract prediction params | 'Write to prediction_params table' >> io.WriteToBigQuery( table=c.TableValueProvider( getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT), getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET), 'prediction_params'), custom_gcs_temp_location=getattr(runtime_options, c._OPTION_TEMP_GCS_LOCATION), validate=False, schema={ 'fields': [{ 'name': 'prediction_period', 'type': 'INTEGER' }, { 'name': 'prediction_period_unit', 'type': 'STRING' }, { 'name': 'model_time_granularity', 'type': 'STRING' }, { 'name': 'customers_modeled', 'type': 'INTEGER' }, { 'name': 'transactions_observed', 'type': 'INTEGER' }, { 'name': 'frequency_model', 'type': 'STRING' }, { 'name': 'bgnbd_model_params', 'type': 'RECORD', 'fields': [{ 'name': 'a', 'type': 'FLOAT' }, { 'name': 'b', 'type': 'FLOAT' }, { 'name': 'r', 'type': 'FLOAT' }, { 'name': 'alpha', 'type': 'FLOAT' }] }, { 'name': 'bgbb_model_params', 'type': 'RECORD', 'fields': [{ 'name': 'alpha', 'type': 'FLOAT' }, { 'name': 'beta', 'type': 'FLOAT' }, { 'name': 'gamma', 'type': 'FLOAT' }, { 'name': 'delta', 'type': 'FLOAT' }] }, { 'name': 'paretonbd_model_params', 'type': 'RECORD', 'fields': [{ 'name': 'r', 'type': 'FLOAT' }, { 'name': 's', 'type': 'FLOAT' }, { 'name': 'alpha', 'type': 'FLOAT' }, { 'name': 'beta', 'type': 'FLOAT' }] }, { 'name': 'gamma_gamma_params', 'type': 'RECORD', 'fields': [{ 'name': 'p', 'type': 'FLOAT' }, { 'name': 'q', 'type': 'FLOAT' }, { 'name': 'v', 'type': 'FLOAT' }] }] }, write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED)) num_rows = (full_elog_merged | 'Count num rows in full elog merged' >> beam.combiners.Count.Globally()) segment_predictions_exact = ( pipeline | 'Create single elem Stream VII' >> beam.Create([1]) | beam.FlatMap( lambda _, rows_count: [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD], pvalue.AsSingleton(num_rows))) sharded_cust_predictions_no_segments_exact, \ sharded_cust_predictions_no_segments_hash = ( prediction_by_customer_no_segments | beam.FlatMap( c.prediction_sharded, pvalue.AsSingleton(options), pvalue.AsSingleton(segment_predictions_exact) ) # [customer_id, p_alive, predicted_purchases, future_aov, # historical_aov, expected_value, frequency, recency, # total_time_observed, extra_dimension?] 
| beam.Partition(lambda x, _: 0 if x[1] else 1, 2) ) # BEGIN of "exact" branch prediction_by_customer_exact = ( pipeline | 'Create single elem Stream VIII' >> beam.Create([1]) | beam.FlatMap( c.split_in_ntiles_exact, pvalue.AsSingleton(options), pvalue.AsIter(sharded_cust_predictions_no_segments_exact )) # [customer_id, p_alive, predicted_purchases, # future_aov, historical_aov, expected_value, # frequency, recency, total_time_observed, # segment, extra_dimension?] ) # END of "exact" branch # BEGIN of "hash" branch customer_count_by_expected_value = ( sharded_cust_predictions_no_segments_hash | beam.Map(lambda x: (x[0][5], 1)) # (expected_value, 1) | 'Group customer predictions by expected value' >> beam.GroupByKey() | beam.Map(lambda x: (x[0], sum(x[1]))) # expected_value, customers_count ) hash_segment_limits = ( pipeline | 'Create single elem Stream IX' >> beam.Create([1]) | beam.FlatMap(c.expected_values_segment_limits, pvalue.AsSingleton(options), pvalue.AsIter(customer_count_by_expected_value), pvalue.AsSingleton(all_customer_ids_count))) prediction_by_customer_hash = ( sharded_cust_predictions_no_segments_hash | beam.Map(lambda x: x[0]) | beam.FlatMap(c.split_in_ntiles_hash, pvalue.AsSingleton(hash_segment_limits) ) # [customer_id, p_alive, predicted_purchases, # future_aov, historical_aov, expected_value, # frequency, recency, total_time_observed, # segment, extra_dimension?] ) # END of "hash" branch prediction_by_customer = ( # only one of these two streams will contains values (prediction_by_customer_exact, prediction_by_customer_hash) | beam.Flatten() | beam.Map(c.clean_nan_and_inf)) _ = (prediction_by_customer | beam.FlatMap( lambda x, opts: [x + ['']] if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x], pvalue.AsSingleton(options)) | 'prediction_by_customer to Dict' >> beam.Map(c.list_to_dict, [ 'customer_id', 'p_alive', 'predicted_purchases', 'future_aov', 'historical_aov', 'expected_value', 'frequency', 'recency', 'total_time_observed', 'segment', 'extra_dimension' ]) | 'Write to prediction_by_customer table' >> io.WriteToBigQuery( table=c.TableValueProvider( getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT), getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET), 'prediction_by_customer'), custom_gcs_temp_location=getattr(runtime_options, c._OPTION_TEMP_GCS_LOCATION), validate=False, schema='customer_id:STRING, p_alive:FLOAT64' ', predicted_purchases:FLOAT64' ', future_aov:FLOAT64, historical_aov:FLOAT64' ', expected_value:FLOAT64, frequency:INT64' ', recency:FLOAT64' ', total_time_observed:FLOAT64, segment:INT64' ', extra_dimension:STRING', write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED)) prediction_summary_temp = ( prediction_by_customer | beam.Map(lambda x: (x[9], x)) # key: segment | 'Group customer predictions by segment' >> beam.GroupByKey() | beam.FlatMap( c.generate_prediction_summary, pvalue.AsSingleton( options)) # (segment, average_retention_probability, # average_predicted_customer_value, # average_predicted_order_value, # average_predicted_purchases, total_customer_value, # number_of_customers) ) tot_equity = ( prediction_summary_temp | beam.Map(lambda x: x[5]) # total_customer_value | beam.CombineGlobally(sum)) prediction_summary = ( prediction_summary_temp | beam.FlatMap( c.calculate_perc_of_total_customer_value, pvalue.AsSingleton(tot_equity), pvalue.AsSingleton( options)) # (segment, average_retention_probability, # average_predicted_customer_value, # average_predicted_order_value, # 
average_predicted_purchases, # total_customer_value, number_of_customers, # perc_of_total_customer_value) ) _ = ( prediction_summary | 'prediction_summary to Dict' >> beam.Map(c.list_to_dict, [ 'segment', 'average_retention_probability', 'average_predicted_customer_value', 'average_predicted_order_value', 'average_predicted_purchases', 'total_customer_value', 'number_of_customers', 'perc_of_total_customer_value' ]) | 'Write to prediction_summary table' >> io.WriteToBigQuery( table=c.TableValueProvider( getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT), getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET), 'prediction_summary'), custom_gcs_temp_location=getattr(runtime_options, c._OPTION_TEMP_GCS_LOCATION), validate=False, schema='segment:INT64 ,average_retention_probability:FLOAT64' ', average_predicted_customer_value:FLOAT64' ', average_predicted_order_value:FLOAT64' ', average_predicted_purchases:FLOAT64' ', total_customer_value:FLOAT64' ', number_of_customers:FLOAT64' ', perc_of_total_customer_value:FLOAT64', write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED)) prediction_summary_extra_dimension = ( prediction_by_customer | 'Discard prediction if there is not extra dimension' >> beam.FlatMap(c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)) | beam.Map(lambda x: (x[10], x)) # extra dimension | 'Group customer predictions by extra dimension' >> beam.GroupByKey() | beam.FlatMap(c.generate_prediction_summary_extra_dimension, pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(options))) _ = (prediction_summary_extra_dimension | 'prediction_summary_extra_dimension to Dict' >> beam.Map( c.list_to_dict, [ 'extra_dimension', 'average_retention_probability', 'average_predicted_customer_value', 'average_predicted_order_value', 'average_predicted_purchases', 'total_customer_value', 'number_of_customers', 'perc_of_total_customer_value' ]) | 'Write to prediction_summary_extra_dimension table' >> io.WriteToBigQuery( table=c.TableValueProvider( getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT), getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET), 'prediction_summary_extra_dimension'), custom_gcs_temp_location=getattr(runtime_options, c._OPTION_TEMP_GCS_LOCATION), validate=False, schema='extra_dimension:STRING' ', average_retention_probability:FLOAT64' ', average_predicted_customer_value:FLOAT64' ', average_predicted_order_value:FLOAT64' ', average_predicted_purchases:FLOAT64' ', total_customer_value:FLOAT64' ', number_of_customers:INT64' ', perc_of_total_customer_value:FLOAT64', write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
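
# The pipeline above converts prediction rows to dicts with `c.list_to_dict`
# before writing them to BigQuery. That helper is not shown in this snippet; a
# minimal sketch of the behaviour it appears to need (pairing positional values
# with field names) could look like the following. The name and signature are
# assumptions.
def list_to_dict_sketch(values, keys):
  # zip() stops at the shorter input, so rows padded with '' for a missing
  # extra_dimension still map cleanly onto the schema field names.
  return dict(zip(keys, values))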
#### After this point, there are transforms used in the main pipeline

def read_data(pipeline, input_file):
  return pipeline | covidpipe.datasource.ReadFromCsv(input_file)


def select_wanted_columns(input_data, column_information, extra_columns):

  def select_wanted_columns(row: Dict[str, str],
                            column_info: Dict[str, Set[str]]):
    empty_columns = set(
        column_info[covidpipe.datasource.FindEmptyAndNonEmptyColumns.EMPTY])
    sanitized_row = {
        k: v
        for k, v in row.items()
        if (k not in empty_columns or k in extra_columns) and v
    }
    # If the row does not contain any values, then we must discard it.
    if sanitized_row:
      yield sanitized_row

  return input_data | 'SelectColumns' >> beam.FlatMap(
      select_wanted_columns, column_information)


#### After this point, the pipeline is set up to run

if __name__ == '__main__':
  import sys
  options = pipeline_options.PipelineOptions(sys.argv[1:])
  run(options)
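
# `covidpipe.datasource.ReadFromCsv` is not shown in this snippet. A common way
# to build such a source is ReadFromText plus csv parsing; the composite below
# is a hedged sketch of that pattern (naive, line-at-a-time parsing), not the
# covidpipe implementation, and its name and constructor are assumptions.
import csv
import io

import apache_beam as beam


class ReadCsvAsDictsSketch(beam.PTransform):
  """Reads a headered CSV file into a PCollection of per-row dicts."""

  def __init__(self, file_pattern, fieldnames):
    self._file_pattern = file_pattern
    self._fieldnames = fieldnames  # column names, passed explicitly

  def expand(self, pbegin):
    # Applied at the pipeline root, so `pbegin` is a PBegin.
    return (pbegin
            | beam.io.ReadFromText(self._file_pattern, skip_header_lines=1)
            | beam.Map(lambda line: next(csv.DictReader(
                io.StringIO(line), fieldnames=self._fieldnames))))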
import apache_beam as beam
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner
import google.auth
from datetime import datetime, timedelta
import json

# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=['--streaming'])
options.view_as(pipeline_options.StandardOptions).streaming = True
_, options.view_as(GoogleCloudOptions).project = google.auth.default()
options.view_as(GoogleCloudOptions).region = 'us-west1'
options.view_as(GoogleCloudOptions).staging_location = 'gs://abdul-dataflow/staging'
options.view_as(GoogleCloudOptions).temp_location = 'gs://abdul-dataflow/temp'
# options.view_as(pipeline_options.SetupOptions).sdk_location = (
#     f'/root/apache-beam-custom/packages/beam/sdks/python/dist/apache-beam-{beam.version.__version__}0.tar.gz')

topic = "projects/data228/topics/data228-hw8-in"

with beam.Pipeline(options=options) as pipeline:
    data = pipeline | "read" >> beam.io.ReadFromPubSub(topic=topic)
    # Group the stream into fixed 500-second windows.
    windowed_data = (data
                     | "window" >> beam.WindowInto(beam.window.FixedWindows(500)))
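
    # The original snippet stops here without consuming `windowed_data`. A
    # hedged continuation: decode the Pub/Sub payloads as JSON and log them.
    # The real downstream sink of this pipeline is not shown, so the steps
    # below are an illustrative assumption only.
    decoded = (windowed_data
               | "decode" >> beam.Map(lambda msg: json.loads(msg.decode("utf-8")))
               | "log" >> beam.Map(print))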
def options(): return pipeline_options.PipelineOptions()
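
# The bare `options` helper above looks like a pytest fixture whose
# @pytest.fixture decorator is not visible in this snippet; that is an
# assumption. A hedged example of how such a fixture is typically consumed in a
# test module:
from apache_beam.options import pipeline_options

def test_runner_not_set_by_default(options):
    # With no flags, PipelineOptions leaves the runner unset; Beam resolves it
    # to the DirectRunner only when the pipeline is actually run.
    assert options.view_as(pipeline_options.StandardOptions).runner is None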
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)

  pipeline_mode = pipeline_common.get_pipeline_mode(
      all_patterns, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD
  _merge_headers(known_args, pipeline_args, pipeline_mode,
                 annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  if known_args.output_table:
    for i in range(num_partitions):
      table_suffix = ''
      if partitioner and partitioner.get_partition_name(i):
        table_suffix = '_' + partitioner.get_partition_name(i)
      table_name = known_args.output_table + table_suffix
      _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=known_args.allow_incompatible_records,
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=known_args.num_bigquery_write_shards,
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))

  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    _ = (
        variants
        | 'FlattenToOnePCollection' >> beam.Flatten()
        | 'VariantToAvro' >> variant_to_avro.VariantToAvroFiles(
            known_args.output_avro_path,
            header_fields,
            processed_variant_factory,
            variant_merger=variant_merger,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            null_numeric_value_replacement=(
                known_args.null_numeric_value_replacement)))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
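
# A hedged sketch of a module entry point for run() above, mirroring common
# practice in Beam command-line tools; the logging level choice is an
# assumption, and any tool-specific flags are parsed inside run() itself.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()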
def CreatePipeline(pipeline_args):
  poptions = pipeline_options.PipelineOptions(
      pipeline_args,
      runner="directrunner",
      direct_running_mode="multi_threading")
  return beam.Pipeline(options=poptions)
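
# A hedged usage sketch for CreatePipeline above: build a tiny pipeline on the
# multi-threaded DirectRunner and block until it finishes. The elements and
# transforms are placeholders.
import apache_beam as beam

def _demo_create_pipeline():
  pipeline = CreatePipeline(pipeline_args=[])
  _ = (pipeline
       | beam.Create([1, 2, 3])
       | beam.Map(lambda x: x * x)
       | beam.Map(print))
  pipeline.run().wait_until_finish()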