Example #1
def test_set_google_cloud_options(
    all_options,
    config,
    update,
    exp_update,
    dataflow_endpoint,
    klio_cli_version,
    deployed_ci,
    user_env,
    gcp_options,
    mocker,
    monkeypatch,
):
    expected_opts = [
        "project",
        "region",
        "temp_location",
        "staging_location",
        "service_account_email",
        "no_auth",
        "template_location",
        "enable_streaming_engine",
        "dataflow_kms_key",
        "flexrs_goal",
    ]
    # this is to be changed when running `tox`; remove when no longer
    # supporting beam 2.14.0

    if dataflow_endpoint:
        all_options["dataflow_endpoint"] = dataflow_endpoint
    else:
        all_options.pop("dataflow_endpoint", None)

    options = pipeline_options.PipelineOptions().from_dictionary(all_options)

    actual_gcp_opts = options.view_as(pipeline_options.GoogleCloudOptions)

    monkeypatch.setattr(
        config.pipeline_options, "dataflow_endpoint", dataflow_endpoint
    )
    if klio_cli_version:
        monkeypatch.setenv("KLIO_CLI_VERSION", klio_cli_version)
        klio_cli_version_clean = klio_cli_version.replace(".", "-")
    if deployed_ci:
        monkeypatch.setenv("CI", "TRUE")
    if not user_env:
        monkeypatch.delenv("USER", raising=False)

    kpipe = run.KlioPipeline("test-job", config, mocker.Mock(update=update))
    kpipe._set_google_cloud_options(options)

    for opt in expected_opts:
        expected_value = gcp_options[opt]
        # getattr should explode when not setting a default value
        assert expected_value == getattr(actual_gcp_opts, opt)

    assert exp_update == actual_gcp_opts.update
    if dataflow_endpoint:
        assert dataflow_endpoint == actual_gcp_opts.dataflow_endpoint
    else:
        assert (
            "https://dataflow.googleapis.com"
            == actual_gcp_opts.dataflow_endpoint
        )
    user = None
    if deployed_ci:
        user = "******"
    elif user_env:
        user = os.environ["USER"]

    klio_exec_version_clean = klio_exec_version.replace(".", "-")
    klio_core_version_clean = klio_core_version.replace(".", "-")
    klio_lib_version_clean = klio_lib_version.replace(".", "-")
    exp_labels = [
        "foo=bar",
        "baz=bla",
        "klio-exec={}".format(klio_exec_version_clean),
        "klio-core={}".format(klio_core_version_clean),
        "klio={}".format(klio_lib_version_clean),
    ]
    if user:
        exp_labels.append("deployed_by={}".format(user).lower())
    if klio_cli_version:
        exp_labels.append("klio-cli={}".format(klio_cli_version_clean))
    assert sorted(exp_labels) == sorted(actual_gcp_opts.labels)
Example #2
def pipeline_options_from_dict(all_options):
    return pipeline_options.PipelineOptions().from_dictionary(all_options)
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(argv,
                                                          _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args, pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in Beam if the number of files
  # matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant)
        .with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
         variant_to_bigquery.VariantToBigQuery(
             table_name,
             header_fields,
             variant_merger,
             processed_variant_factory,
             append=known_args.append,
             update_schema_on_append=known_args.update_schema_on_append,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()

  metrics_util.log_all_counters(result)
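
The partition-and-filter step in run() above treats the result of beam.Partition as an indexable collection of PCollections, dropping the indexes the partitioner says to skip and flattening the rest later. A minimal, self-contained sketch of that pattern, assuming a hypothetical parity-based partition function (not part of the variant pipeline):

import apache_beam as beam
from apache_beam.options import pipeline_options


def partition_by_parity(element, num_partitions):
  # A partition function receives the element and the partition count and
  # must return an index in the range [0, num_partitions).
  return element % num_partitions


with beam.Pipeline(options=pipeline_options.PipelineOptions()) as p:
  partitions = (p
                | beam.Create([1, 2, 3, 4, 5])
                | 'PartitionByParity' >> beam.Partition(partition_by_parity, 2))
  # beam.Partition returns one PCollection per index; each can be consumed
  # independently or recombined later with beam.Flatten, as run() does.
  evens = partitions[0]
  odds = partitions[1]
  _ = odds | 'PrintOdds' >> beam.Map(print)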
Example #4
    def expand(self, deployed_model):
        """Apply the transform.

        Args:
          deployed_model: A PCollection that should be the output of
            DeployVersion, or a tuple of (model, version).

        Returns:
          A PCollection with the results of the prediction.

        Raises:
          ValueError: If the arguments are invalid.
        """
        pipeline = deployed_model.pipeline

        # For the job name use a combination of the transform label and a
        # datestamp. The datestamp is intended to make it unique.
        now = datetime.datetime.now()
        # We add some salt to the job name to avoid collisions if we try to submit
        # multiple jobs at the same time.
        # N.B. The job_name is fixed at pipeline construction time. This is
        # critical because multiple invocations of this transform (e.g. because
        # of retries) need to use the same job name.
        salt = '%04x' % random.getrandbits(4 * 4)

        # TODO(b/28989568): We need to lower case the name because the backend
        # only allows lower case letters for job names. The backend should probably
        # do this automatically but currently it doesn't.
        job_name = '{0}_{1}_{2}'.format(self.label,
                                        now.strftime('%y%m%d_%H%M%S'),
                                        salt).lower().replace(' ', '_')

        options = pipeline.options
        # TODO(b/29163051) Options can be None depending on how the runner was
        # constructed.
        if options is None:
            options = df_options.PipelineOptions()

        cloud_options = options.view_as(df_options.GoogleCloudOptions)
        project_id = cloud_options.project

        if cloud_options.temp_location:
            temp_dir = cloud_options.temp_location
        elif cloud_options.staging_location:
            temp_dir = cloud_options.staging_location
        else:
            raise ValueError(
                '--staging_location must be specified to run in the cloud')

        if not self.output_uri:
            output_uri = os.path.join(temp_dir, 'prediction_results')
        else:
            output_uri = self.output_uri

        logging.info('Output uri : %s', output_uri)

        # Construct the batch prediction job.
        prediction_request = ml_func.PredictionJobRequest(
            project_id,
            job_name,
            self.input_uris,
            output_uri,
            self.region,
            self.data_format,
            endpoint=self.cloud_ml_endpoint,
            runtime_version=self.runtime_version)
        request = (
            pipeline | 'PredictRequest' >> beam.Create([prediction_request])
            | 'AugmentPredictArgs' >> beam.ParDo(
                ml_func._AugmentPredictArgsDo(),  # pylint: disable=protected-access
                beam.pvalue.AsSingleton(deployed_model)))

        # Run the batch prediction job
        predict_do = ml_func.BatchPredictionJobDo(api_class=self.api_version)
        unused_prediction_results = (
            request | 'BatchPrediction' >> beam.ParDo(predict_do))

        # Wait until the prediction job is done, then read the results from
        # the files to which they were written and return them.
        results = (
            pipeline
            | 'Read Results' >> beam.io.ReadFromText(output_uri,
                                                     validate=False))
        return results
def main(unused_args):
  """Runs the Beam pipeline."""
  options = pipeline_options.PipelineOptions()
  p = beam.Pipeline(options=options)
  pipeline(p)
  p.run().wait_until_finish()
Example #6
    def _create_large_avro_file(
            self,
            blob_name,
            staging_table_util,
            destination_prefix,
            compression,
            extension
    ):
        """Creates an Avro file from a staging table and stores it in GCS.

        The Avro file is generated in this method using Dataflow. BigQuery
        extract jobs do support Avro as a destination format. However, if the
        size of the staging table is greater than 1 GB, the generated files
        must be sharded and then composed into a single file. The composition
        process causes errors when the destination format is Avro, since some
        of the composed Avro files end up with negative row counts. Therefore,
        this method should be used when generating an Avro file from a staging
        table greater than 1 GB.

        Args:
            blob_name(str): Name of the file (or blob) to be generated. Starts
                with 'fileType=' and ends with the file extension.
                Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv # pylint: disable=line-too-long
            staging_table_util(load_benchmark_tools.table_util.TableUtil): Util
                object for interacting with the staging table that the Avro
                file will be generated from.
            destination_prefix(str): String containing the 'gs://' prefix, the
                bucket name, and the path of the file, without the extension.
                This is needed by the WriteToAvro transform.
                Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876 # pylint: disable=line-too-long
            compression(str): String representing the compression format that
                the generated file should have. Options are 'none' if no
                compression is to be used, 'snappy', or 'deflate'.
            extension(str): String to be used as the extension for the Avro
                file. Options are 'avro' if no compression is to be used,
                'snappy', or 'deflate'.
        """
        pipeline_args = ['--project', self.project_id,
                         '--staging_location', self.dataflow_staging_location,
                         '--temp_location', self.dataflow_temp_location,
                         '--save_main_session',
                         '--worker_machine_type', 'n1-highcpu-32',
                         '--runner', 'DataflowRunner',
                         '--setup_file', './setup.py']
        options = pipeline_options.PipelineOptions(pipeline_args)
        table_spec = beam_bigquery.TableReference(
            projectId=self.project_id,
            datasetId=self.primitive_staging_dataset_id,
            tableId=staging_table_util.table_id,
        )
        codec = 'null' if compression == 'none' else compression
        bq_schema = staging_table_util.table.schema
        table_name = staging_table_util.table.table_id
        avro_schema = avro_util.AvroUtil(
            bq_schema=bq_schema,
            schema_name=table_name
        ).get_avro_translated_schema()
        p = beam.Pipeline(options=options)
        table = (p
                 | 'ReadTable' >> beam.io.Read(
                     beam.io.BigQuerySource(table_spec)))
        (table | beam.io.WriteToAvro(
            file_path_prefix=destination_prefix,
            schema=avro_schema,
            file_name_suffix='.' + extension,
            use_fastavro=True,
            codec=codec,
            num_shards=1,
            shard_name_template='',
        ))
        p.run().wait_until_finish()
        logging.info('Created file: {0:s}'.format(blob_name))
def run():
    """Publish a test message, then build and run the pipeline."""
    publisher = pubsub_v1.PublisherClient()  # Creates a publisher client
    topic_name = 'projects/{project_id}/topics/{topic}'.format(
        project_id="famous-store-237108", topic="BQTopic")
    topic_path = publisher.topic_path(
        "famous-store-237108", "BQTopic"
    )  # Creates a fully qualified topic path. Same as previous row

    project_path = publisher.project_path(
        "famous-store-237108")  # Creates a fully qualified project path

    found = False  # Check if topic exists in project
    for topic in publisher.list_topics(
            project_path):  # topic is a fully qualified topic path
        if topic.name == topic_name:
            found = True
    if not found:  # If not found, create it
        publisher.create_topic(topic_name)

    future = publisher.publish(topic_name,
                               b"3,3,Three Three")  # Publish a message
    future.result()  # Block until the publish completes; raises on failure
    print("Message sent successfully!")

    # Build and run the pipeline.

    pipeline_options = opt.PipelineOptions()
    pipeline_options.view_as(opt.StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:  # Creates a pipeline
        # Read the pubsub topic into a PCollection.
        msg = p | "Read from pubSub" >> beam.io.ReadFromPubSub(
            topic_path)  # Read

        lines2 = (
            p | "Create from in-memory List" >> beam.Create([  # Create
                'To be, or not to be: that is the question: ',
                'Whether \'tis nobler in the mind to suffer ',
                'The slings and arrows of outrageous fortune, ',
                'Or to take arms against a sea of troubles, '
            ]))

        # PCollection: immutable, elements are of the same type, no random
        # access. Can be bounded or streaming. Windows are used with
        # timestamps.

        # Transforms: ParDo, Combine, composite: combines core transforms
        ''' [Final Output PCollection] = ([Initial Input PCollection] | [First Transform]
             | [Second Transform]
             | [Third Transform]) '''

        # Apply a ParDo to the PCollection "words" to compute lengths for each word.
        # ParDo: “Map” phase of a Map/Shuffle/Reduce-style algorithm
        # Filter, convert, pick part of the data, simple computation
        # You must supply a DoFn class (a sketch of MyFn follows this example)
        rows = msg | "Convert to dict" >> beam.ParDo(MyFn())

        #rows = [{"id1": 3, "id2": 3, "val1": "Three Three"}]
        rows | beam.io.WriteToBigQuery(
            table='mytable',
            dataset="mydataset",
            project="famous-store-237108",
            schema='id1:INTEGER, id2:INTEGER, val1:STRING',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
        )  # Could be WRITE_TRUNCATE
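
MyFn is applied above but not defined in this excerpt. A plausible sketch, assuming the Pub/Sub payload is the comma-separated bytes published earlier (the field names follow the commented-out example row and the BigQuery schema; the real class may differ):

import apache_beam as beam


class MyFn(beam.DoFn):
    """Converts a comma-separated Pub/Sub payload into a BigQuery row dict."""

    def process(self, element):
        # element is the raw message payload, e.g. b"3,3,Three Three".
        id1, id2, val1 = element.decode("utf-8").split(",", 2)
        yield {"id1": int(id1), "id2": int(id2), "val1": val1.strip()}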
Example #8
  def _run_pipeline(self, pipeline):
    options = pipeline_options.PipelineOptions(
        runner='DirectRunner', direct_running_mode='in_memory')
    p = beam.Pipeline(options=options)
    pipeline(p)
    p.run().wait_until_finish()
def run(argv=None):
    """Main function.

    Main function containing the Apache Beam pipeline describing how to process
    the input CSV file to generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | beam.io.ReadFromText(getattr(runtime_options,
                                           c._OPTION_INPUT_CSV),
                                   skip_header_lines=1)
            | beam.Map(lambda x: list(csv.reader([x]))[0])
            | beam.FlatMap(
                c.csv_line_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | beam.FlatMap(c.calculate_model_fit_validation_to_text,
                            pvalue.AsSingleton(options)))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs_without_extra_dimension),
                pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed], prediction_params
        )

        prediction_by_customer_no_segments_no_extra_dimension = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        prediction_by_customer_no_segments = (
            prediction_by_customer_no_segments_no_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # [customer_id, p_alive, predicted_purchases, future_aov
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?]
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract predictions params
            | beam.FlatMap(c.calculate_prediction_to_text,
                           pvalue.AsSingleton(options)))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # only one of these two streams will contain values
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten())

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to CSV line' >> beam.Map(
                 c.list_to_csv_line)
             | 'Write prediction_by_customer' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='customer_id,p_alive'
                 ',predicted_purchases'
                 ',future_aov,historical_aov'
                 ',expected_value,frequency,recency'
                 ',total_time_observed,segment'
                 ',extra_dimension',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_by_customer.csv'))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (prediction_summary
             | 'prediction_summary to CSV line' >> beam.Map(c.list_to_csv_line)
             | 'Write prediction_summary' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='segment,average_retention_probability'
                 ',average_predicted_customer_value'
                 ',average_predicted_order_value,average_predicted_purchases'
                 ',total_customer_value,number_of_customers'
                 ',perc_of_total_customer_value',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_summary.csv'))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is not extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to CSV line' >> beam.Map(
                 c.list_to_csv_line)
             |
             'Write prediction_summary_extra_dimension' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='extra_dimension,average_retention_probability'
                 ',average_predicted_customer_value'
                 ',average_predicted_order_value'
                 ',average_predicted_purchases,total_customer_value'
                 ',number_of_customers,perc_of_total_customer_value',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_summary_extra_dimension.csv'))
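
In the LTV pipeline above, the RuntimeOptions class and the c.* option-name constants are not shown. A hedged sketch of how such an options class is typically declared, using hypothetical flag names (the real names sit behind c._OPTION_INPUT_CSV and c._OPTION_OUTPUT_FOLDER):

from apache_beam.options import pipeline_options


class RuntimeOptions(pipeline_options.PipelineOptions):
    """Custom options resolved at runtime (e.g. for Dataflow templates)."""

    @classmethod
    def _add_argparse_args(cls, parser):
        # add_value_provider_argument makes each flag a ValueProvider, which
        # is why the pipeline above unwraps values with v.get().
        parser.add_value_provider_argument(
            '--input_csv', type=str, help='Input CSV file to read.')
        parser.add_value_provider_argument(
            '--output_folder', type=str, help='Folder for the output CSVs.')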
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()

    # Add the arguments needed for this specific Dataflow job.
    parser.add_argument(
        '--input',
        dest='input',
        required=True,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.')

    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output BQ table to write results to.')

    parser.add_argument('--delimiter',
                        dest='delimiter',
                        required=False,
                        help='Delimiter to split input records.',
                        default=',')

    parser.add_argument('--fields',
                        dest='fields',
                        required=True,
                        help='Comma separated list of field names.')

    parser.add_argument('--load_dt',
                        dest='load_dt',
                        required=True,
                        help='Load date in YYYY-MM-DD format.')

    known_args, pipeline_args = parser.parse_known_args(argv)
    row_transformer = RowTransformer(delimiter=known_args.delimiter,
                                     header=known_args.fields,
                                     filename=ntpath.basename(
                                         known_args.input),
                                     load_dt=known_args.load_dt)

    p_opts = pipeline_options.PipelineOptions(pipeline_args)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information including where Dataflow should
    # store temp files, and what the project id is.
    with beam.Pipeline(options=p_opts) as pipeline:
        # Read the file.  This is the source of the pipeline.  All further
        # processing starts with lines read from the file.  We use the input
        # argument from the command line.
        rows = pipeline | "Read from text file" >> beam.io.ReadFromText(
            known_args.input)

        # This stage of the pipeline translates from a delimited single row
        # input to a dictionary object consumable by BigQuery.
        # It refers to a function we have written.  This function will
        # be run in parallel on different workers using input from the
        # previous stage of the pipeline.
        dict_records = rows | "Convert to BigQuery row" >> beam.Map(
            lambda r: row_transformer.parse(r))

        # This stage of the pipeline writes the dictionary records into
        # an existing BigQuery table. The sink is also configured to truncate
        # the table if it contains any existing records.
        dict_records | "Write to BigQuery" >> beam.io.Write(
            beam.io.BigQuerySink(
                known_args.output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
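
RowTransformer is used by the loader above (and by a similar example later) but is not included in this excerpt. A rough sketch consistent with how it is called here, splitting a delimited line into a dict keyed by the comma-separated field names and attaching the audit columns; the real implementation may differ:

class RowTransformer(object):
    """Converts a delimited text line into a BigQuery-compatible dict."""

    def __init__(self, delimiter, header, filename, load_dt):
        self.delimiter = delimiter
        self.keys = [name.strip() for name in header.split(',')]
        self.filename = filename
        self.load_dt = load_dt

    def parse(self, row):
        # Pair each field name with the corresponding value from the row.
        record = dict(zip(self.keys, row.split(self.delimiter)))
        # Add the audit columns expected by the destination table.
        record['filename'] = self.filename
        record['load_dt'] = self.load_dt
        return record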
Example #11
  def add_input(self, accumulator, input):
    # The accumulator is the running sum so far.
    return accumulator + input

  def merge_accumulators(self, accumulators):
    # Each accumulator is a partial sum; merging simply adds them together.
    return sum(accumulators)

  def extract_output(self, accumulator):
    return {'Sum': accumulator}
  def default_label(self):
    return self.__class__.__name__

pipeline_options = opt.PipelineOptions()
pipeline_options.view_as(opt.StandardOptions).streaming = False
#options=pipeline_options

with beam.Pipeline() as p:  # Creates a pipeline

    lines = (p | "Create from in-memory List" >> beam.Create([  # Create
                 'To be, or not to be: that is the question: ',
                 'Whether \'tis nobler in the mind to suffer ',
                 'The slings and arrows of outrageous fortune, ',
                 'Or to take arms against a sea of troubles, ']))

    # Example 1
    msg = ["5, 5, Five Five"]

    words = p | beam.Create(["Cat", "Mouse", "Horse", "Chimpanzee", "Fish"])
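
The accumulator methods in the fragment above only take effect inside a beam.CombineFn subclass. A minimal sketch of the complete combiner and how it would be applied; the class name SumFn and the sample data are assumptions, not part of the original:

import apache_beam as beam


class SumFn(beam.CombineFn):
    def create_accumulator(self):
        return 0

    def add_input(self, accumulator, input):
        return accumulator + input

    def merge_accumulators(self, accumulators):
        return sum(accumulators)

    def extract_output(self, accumulator):
        return {'Sum': accumulator}


with beam.Pipeline() as p:
    _ = (p
         | beam.Create([1, 2, 3, 4])
         | beam.CombineGlobally(SumFn())  # Emits a single {'Sum': 10} dict
         | beam.Map(print))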
Example #12
def main():
    data = [
        ('a', 1, 100),
        ('b', 2, 100),
        ('c', 1, 100),
        ('d', 2, 100),
        ('e', 1, 100),
        ('f', 1, 100),
        ('g', 1, 100),
        ('h', 1, 100),
        ('i', 1, 100),
    ]

    with beam.Pipeline(options=pipeline_options.PipelineOptions()) as p:
        students = p | 'create_data' >> beam.Create(data)

        class CreateKeyValue(beam.DoFn):
            def process(self, element):
                return [(element[0], (element[1], element[2]))]

        fixed_window = (
            students
            | 'fixed_window' >> beam.WindowInto(beam.window.FixedWindows(60))
            | 'fixed_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'fixed_window_group' >> beam.GroupByKey())
        write_to_file(fixed_window, 'fixed_window')

        sliding_window = (
            students
            | 'sliding_window' >> beam.WindowInto(
                beam.window.SlidingWindows(30, 5))
            | 'sliding_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'sliding_window_group' >> beam.GroupByKey())
        write_to_file(sliding_window, 'sliding_window')

        session_window = (
            students
            |
            'session_window' >> beam.WindowInto(beam.window.Sessions(10 * 60))
            | 'session_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'session_window_group' >> beam.GroupByKey())
        write_to_file(session_window, 'session_window')

        global_window = (
            students
            | 'global_window' >> beam.WindowInto(beam.window.GlobalWindows())
            | 'global_window_do_fn' >> beam.ParDo(CreateKeyValue())
            | 'global_window_group' >> beam.GroupByKey())
        write_to_file(global_window, 'global_window')
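
write_to_file is called above but not defined in this excerpt. A plausible sketch, assuming it just formats each grouped element and writes one text file per windowing strategy (the /tmp output prefix is illustrative only):

import apache_beam as beam


def write_to_file(pcoll, label):
    # Render each (key, grouped values) pair as a line of text and write a
    # single unsharded file named after the windowing strategy.
    return (pcoll
            | '{}_format'.format(label) >> beam.Map(
                lambda kv: '{}: {}'.format(kv[0], list(kv[1])))
            | '{}_write'.format(label) >> beam.io.WriteToText(
                '/tmp/{}'.format(label),
                shard_name_template='',
                num_shards=1))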
Example #13
    def expand(self, train_and_test_datasets):
        """Apply the transform.

        Args:
          train_and_test_datasets: A pair of (train, test) PCollections of
            JSON strings representing Example protos.

        Returns:
          A 2-tuple of:
            A PCollection with a single TrainedModel, suitable for use by
              Predict.
            A PCollection with a single TrainingJobResult that describes the
              result of training.

        Raises:
          ValueError: If the arguments are invalid.
        """
        train_dataset, test_dataset = train_and_test_datasets
        pipeline = train_dataset.pipeline

        # For the job name use a combination of the transform label and a
        # datestamp. The datestamp is intended to make it unique.
        now = datetime.datetime.now()
        # We add some salt to the job name to avoid collisions if we try to submit
        # multiple jobs at the same time.
        # N.B. The job_name is fixed at pipeline construction time. This is
        # critical because multiple invocations of the Train transform (e.g. because
        # of retries) need to use the same job name.
        salt = '%04x' % random.getrandbits(4 * 4)

        # TODO(b/28989568): We need to lower case the name because the backend
        # only allows lower case letters for job names. The backend should probably
        # do this automatically but currently it doesn't.
        job_name = '{0}_{1}_{2}'.format(self.label,
                                        now.strftime('%y%m%d_%H%M%S'),
                                        salt).lower()

        options = pipeline.options
        # TODO(b/29163051) Options can be None depending on how the runner was
        # constructed.
        if options is None:
            options = df_options.PipelineOptions()

        cloud_options = options.view_as(df_options.GoogleCloudOptions)
        run_on_cloud = self.use_cloud_ml

        if run_on_cloud is None:
            # TODO(user): Remove the fallback after the next Dataflow release.
            try:
                dataflow_runner = beam.runners.DataflowRunner
            except AttributeError:
                dataflow_runner = beam.runners.DataflowPipelineRunner

            # Choose a default based on the runner.
            if isinstance(pipeline.runner, dataflow_runner):
                run_on_cloud = True
            else:
                run_on_cloud = False

        if self.output_dir:
            temp_dir = self.output_dir
        elif run_on_cloud:
            cloud_options = options.view_as(df_options.GoogleCloudOptions)

            if cloud_options.temp_location:
                temp_dir = os.path.join(cloud_options.temp_location, job_name)
            elif cloud_options.staging_location:
                temp_dir = os.path.join(cloud_options.staging_location,
                                        job_name)
            else:
                raise ValueError(
                    '--staging_location must be specified to run in the cloud')
        else:
            temp_dir = tempfile.mkdtemp(job_name)
        logging.info('Temp dir: %s', temp_dir)

        if run_on_cloud:
            train_do = ml_func.TrainingJobDo()
            project = cloud_options.project
        else:
            train_do = ml_func._TrainingJobLocalDo()  # pylint: disable=protected-access
            project = None

        _ = train_dataset | dfutil.CountPCollection('ml-train-input')

        # Write the train and test data to files so we can pass it to the trainer.
        train_data_path = os.path.join(temp_dir, 'training')
        test_data_path = os.path.join(temp_dir, 'testing')
        output_dir = os.path.join(temp_dir, 'model')
        # TODO(b/34839956) Make sure we can handle the tf.Transform metadata.
        metadata_path = os.path.join(output_dir, 'metadata.json')

        # This PTransform is primarily to avoid stage name collisions in writing
        # training and test data.
        # TODO(user): Figure out why i_type @beam.ptransform_fn breaks pickling.
        train_files = (
            train_dataset | 'WriteTrainData' >> ml_func._WrapCallable(  # pylint: disable=protected-access
                self.tf_main_spec.write_input_data, train_data_path))
        test_files = (
            test_dataset | 'WriteTestData' >> ml_func._WrapCallable(  # pylint: disable=protected-access
                self.tf_main_spec.write_input_data, test_data_path))
        if self.metadata:
            metadata_files = self.metadata | SaveMetadata(metadata_path)
        else:
            metadata_files = pipeline | beam.Create([None])

        # Construct and run the training job.
        train_request = self.tf_main_spec.train_request.copy()
        if not train_request.package_uris:
            train_request.package_uris = []
        if self.package_uris:
            if isinstance(self.package_uris, basestring):
                train_request.package_uris.extend([self.package_uris])
            else:
                train_request.package_uris.extend(self.package_uris)
        # remove duplicates from train_request
        train_request.package_uris = list(set(train_request.package_uris))

        train_request.job_args = self.job_args or []
        if self.python_module:
            train_request.python_module = self.python_module
        if not train_request.project:
            train_request.parent = project
        if not train_request.job_name:
            train_request.job_name = job_name
        if not train_request.endpoint:
            train_request.endpoint = self.cloud_ml_endpoint
        if not train_request.hyperparameters:
            train_request.hyperparameters = self.hyperparameters
        if not train_request.region:
            train_request.region = self.region
        if not train_request.scale_tier:
            train_request.scale_tier = self.scale_tier
        if not train_request.worker_count:
            train_request.worker_count = self.worker_count
        if not train_request.ps_count:
            train_request.ps_count = self.ps_count
        if not train_request.worker_type:
            train_request.worker_type = self.worker_type
        if not train_request.ps_type:
            train_request.ps_type = self.ps_type
        if not train_request.master_type:
            train_request.master_type = self.master_type
        if not train_request.runtime_version:
            train_request.runtime_version = self.runtime_version

        requests = (
            pipeline | 'CreateRequest' >> beam.Create([train_request])
            | 'AugmentTrainingArgs' >> beam.ParDo(
                ml_func._AugmentTrainArgsDo(  # pylint: disable=protected-access
                    self.tf_main_spec),
                beam.pvalue.AsIter(train_files),
                beam.pvalue.AsIter(test_files),
                output_dir,
                beam.pvalue.AsSingleton(metadata_files)))

        train_results = requests | 'TrainModel' >> beam.ParDo(train_do)

        # Read and return the model directory and training results.
        model_directory = (
            train_results
            | 'CreateModel' >> beam.Map(self.tf_main_spec.read_model,
                                        output_dir, self.export_subdir))

        return model_directory, train_results
Example #14
def test_read_messages_timestamp_attribute_missing(
    mocker,
    patch_sub_client,
    patch_msg_manager,
):
    exp_entity_id = "entity_id"
    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = bytes(exp_entity_id, "utf-8")
    data = kmsg.SerializeToString()

    attributes = {}
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    publish_time = "2018-03-12T13:37:01.234567Z"
    ack_id = "ack_id"
    pull_response = beam_test_utils.create_pull_response([
        beam_test_utils.PullResponseMessage(data, attributes,
                                            publish_time_secs,
                                            publish_time_nanos, ack_id)
    ])
    pmsg = b_pubsub.PubsubMessage(data, attributes)
    expected_elements = [
        beam_testing_util.TestWindowedValue(
            pmsg,
            beam_utils.timestamp.Timestamp.from_rfc3339(publish_time),
            [beam_transforms.window.GlobalWindow()],
        ),
    ]
    patch_sub_client.pull.return_value = pull_response

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True
    with beam_test_pipeline.TestPipeline(options=options) as p:
        pcoll = p | b_pubsub.ReadFromPubSub(
            "projects/fakeprj/topics/a_topic",
            None,
            None,
            with_attributes=True,
            timestamp_attribute="nonexistent",
        )
        # Check original functionality that was kept the same
        beam_testing_util.assert_that(
            pcoll,
            beam_testing_util.equal_to(expected_elements),
            reify_windows=True,
        )

    # Check overridden functionality:
    # 1. Check that auto-acking is skipped
    patch_sub_client.acknowledge.assert_not_called()
    # 2. Check that MessageManager daemon threads were started
    patch_msg_manager.assert_called_once_with(
        patch_sub_client.subscription_path())
    # 3. Check that messages were added to the MessageManager
    patch_msg_manager.return_value.add.assert_called_once_with(ack_id, pmsg)
    # 4. Check that one message is handled at a time, instead of the
    #    original 10
    patch_sub_client.pull.assert_called_once_with(mocker.ANY,
                                                  max_messages=1,
                                                  return_immediately=True)

    patch_sub_client.api.transport.channel.close.assert_called_once_with()
Example #15
def populate_currency_dim():
    # UDM table
    UDM_table_spec_currency = bigquery.TableReference(projectId=PROJECT,
                                                      datasetId=DATASET_UDM,
                                                      tableId='currency')

    # Consumption tables
    CONS_table_spec_currency_dim = bigquery.TableReference(
        projectId=PROJECT, datasetId=DATASET_CONS, tableId='currency_dim')

    currency_schema = ({
        'fields': [{
            'name': "CURRENCY_KEY",
            'type': 'INTEGER',
            'mode': 'REQUIRED'
        }, {
            'name': "CRNCY_CDE",
            'type': 'STRING',
            'mode': 'REQUIRED'
        }, {
            'name': "CRNCY_NAME",
            'type': "STRING",
            'mode': "REQUIRED"
        }, {
            'name': "DEL_REC_IND",
            'type': "BOOLEAN",
            'mode': "REQUIRED"
        }, {
            'name': "ACTV_REC_IND",
            'type': "BOOLEAN",
            'mode': "REQUIRED"
        }, {
            'name': "DCML_ADJ_NUM",
            'type': "INTEGER",
            'mode': "REQUIRED"
        }, {
            'name': "REC_CREAT_DT_TM",
            'type': "TIMESTAMP",
            'mode': "REQUIRED"
        }, {
            'name': "REC_UPDT_DT_TM",
            'type': "TIMESTAMP",
            'mode': "NULLABLE"
        }]
    })

    currency_query = 'SELECT CURRENCY_ID as CURRENCY_KEY, CRNCY_CDE, CRNCY_NAME, DEL_REC_IND, ACTV_REC_IND, DCML_ADJ_NUM, REC_CREAT_DT_TM, REC_UPDT_DT_TM '\
                     'FROM [famous-store-237108:UDM.currency]'

    # Build and run the pipeline
    pipeline_options = opt.PipelineOptions(
    )  # This is deprecated, not future proof. Replacement TBA
    pipeline_options.view_as(
        opt.StandardOptions).streaming = False  # Set options first
    google_cloud_options = pipeline_options.view_as(opt.GoogleCloudOptions)
    google_cloud_options.project = PROJECT
    google_cloud_options.job_name = 'loadcurrency'
    google_cloud_options.staging_location = 'gs://csacsi/staging'
    google_cloud_options.temp_location = 'gs://csacsi/temp'
    google_cloud_options.region = 'europe-west1'

    with beam.Pipeline(options=pipeline_options
                       ) as pcoll:  # Creates a Pipeline (not a PCollection)

        #rows = pcoll | "Read from UDM.location" >> beam.io.Read (beam.io.BigQuerySource (UDM_table_spec_location)) # Read from UDM.location
        rows = pcoll | "Read from UDM.currency" >> beam.io.Read(
            beam.io.BigQuerySource(
                query=currency_query))  # Read from UDM.currency via query

        #rows = rows | beam.ParDo (id2key ())

        rows | "Write to Consumption.Currency_dim" >> beam.io.WriteToBigQuery(
            CONS_table_spec_currency_dim,
            schema=currency_schema,  # schema variable (list) could be used
            create_disposition=BigQueryDisposition.CREATE_NEVER,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
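
The in-code comment above flags this options setup as "deprecated, not future proof". One common alternative (an illustration, not the author's stated replacement) is to pass the same settings as flag-style arguments to PipelineOptions instead of mutating the GoogleCloudOptions view, using the values from populate_currency_dim:

from apache_beam.options import pipeline_options as opt

pipeline_options = opt.PipelineOptions([
    '--project', PROJECT,
    '--job_name', 'loadcurrency',
    '--staging_location', 'gs://csacsi/staging',
    '--temp_location', 'gs://csacsi/temp',
    '--region', 'europe-west1',
])
pipeline_options.view_as(opt.StandardOptions).streaming = False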
Example #16
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=True,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--load_dt',
                        dest='load_dt',
                        required=True,
                        help='Load date in YYYY-MM-DD format.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    row_transformer = RowTransformer(
        filename=ntpath.basename(known_args.input),
        load_dt=known_args.load_dt)
    p_opts = pipeline_options.PipelineOptions(pipeline_args)

    with beam.Pipeline(options=p_opts) as pipeline:

        rows = pipeline | "Read from text file" >> beam.io.ReadFromText(
            known_args.input)

        dict_records = rows | "Convert to BigQuery row" >> beam.Map(
            lambda r: row_transformer.parse(r))

        bigquery_table_schema = {
            "fields": [{
                "mode": "NULLABLE",
                "name": "BackorderOrderID",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "Comments",
                "type": "STRING"
            }, {
                "mode": "NULLABLE",
                "name": "ContactPersonID",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "CustomerID",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "CustomerPurchaseOrderNumber",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "DeliveryInstructions",
                "type": "STRING"
            }, {
                "mode": "NULLABLE",
                "name": "ExpectedDeliveryDate",
                "type": "DATE"
            }, {
                "mode": "NULLABLE",
                "name": "InternalComments",
                "type": "STRING"
            }, {
                "mode": "NULLABLE",
                "name": "IsUndersupplyBackordered",
                "type": "BOOLEAN"
            }, {
                "mode": "NULLABLE",
                "name": "LastEditedBy",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "LastEditedWhen",
                "type": "TIMESTAMP"
            }, {
                "mode": "NULLABLE",
                "name": "OrderDate",
                "type": "DATE"
            }, {
                "mode": "NULLABLE",
                "name": "OrderID",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "PickedByPersonID",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "PickingCompletedWhen",
                "type": "TIMESTAMP"
            }, {
                "mode": "NULLABLE",
                "name": "SalespersonPersonID",
                "type": "INTEGER"
            }, {
                "mode": "NULLABLE",
                "name": "filename",
                "type": "STRING"
            }, {
                "mode": "NULLABLE",
                "name": "load_dt",
                "type": "DATE"
            }]
        }

        dict_records | "Write to BigQuery" >> beam.io.WriteToBigQuery(
            known_args.output,
            schema=bigquery_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
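

import csv

# RowTransformer is used in run() above but is not defined in this example. A
# minimal sketch of what it might look like, assuming the input file is
# comma-delimited and its columns appear in the same order as the BigQuery
# schema above (the delimiter, column order, and _BACKORDER_COLUMNS name are
# assumptions, not from the source):
_BACKORDER_COLUMNS = [
    'BackorderOrderID', 'Comments', 'ContactPersonID', 'CustomerID',
    'CustomerPurchaseOrderNumber', 'DeliveryInstructions',
    'ExpectedDeliveryDate', 'InternalComments', 'IsUndersupplyBackordered',
    'LastEditedBy', 'LastEditedWhen', 'OrderDate', 'OrderID',
    'PickedByPersonID', 'PickingCompletedWhen', 'SalespersonPersonID',
]


class RowTransformer(object):
    """Turns one delimited text line into a BigQuery row dictionary."""

    def __init__(self, filename, load_dt):
        self.filename = filename
        self.load_dt = load_dt

    def parse(self, line):
        values = next(csv.reader([line]))
        row = {k: (v or None) for k, v in zip(_BACKORDER_COLUMNS, values)}
        row['filename'] = self.filename
        row['load_dt'] = self.load_dt
        return row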
示例#17
0
def populate_country_dim():
    # UDM table
    UDM_table_spec_location = bigquery.TableReference(projectId=PROJECT,
                                                      datasetId=DATASET_UDM,
                                                      tableId='location')

    # Consumption table
    CONS_table_spec_country_dim = bigquery.TableReference(
        projectId=PROJECT, datasetId=DATASET_CONS, tableId='country_dim')

    country_schema = ({
        'fields': [{
            'name': "LOCATION_KEY",
            'type': 'INTEGER',
            'mode': 'REQUIRED'
        }, {
            'name': "CTRY_ISO2_CDE",
            'type': 'STRING',
            'mode': 'REQUIRED'
        }, {
            'name': "CTRY_ISO3_CDE",
            'type': "STRING",
            'mode': "REQUIRED"
        }, {
            'name': "CTRY_NAME",
            'type': "STRING",
            'mode': "REQUIRED"
        }, {
            'name': "REGION_NAME",
            'type': "STRING",
            'mode': "REQUIRED"
        }, {
            'name': "CPTL_CITY_NAME",
            'type': "STRING",
            'mode': "REQUIRED"
        }, {
            'name': "DEL_REC_IND",
            'type': "BOOLEAN",
            'mode': "REQUIRED"
        }, {
            'name': "ACTV_REC_IND",
            'type': "BOOLEAN",
            'mode': "REQUIRED"
        }, {
            'name': "REC_CREAT_DT_TM",
            'type': "TIMESTAMP",
            'mode': "REQUIRED"
        }, {
            'name': "REC_UPDT_DT_TM",
            'type': "TIMESTAMP",
            'mode': "NULLABLE"
        }]
    })

    location_query = 'SELECT LOCATION_ID as LOCATION_KEY, CTRY_ISO2_CDE, CTRY_ISO3_CDE, CTRY_NAME, REGION_NAME, CPTL_CITY_NAME, DEL_REC_IND, '\
                     'ACTV_REC_IND, REC_CREAT_DT_TM, REC_UPDT_DT_TM '\
                     'FROM [famous-store-237108:UDM.location]'
    # Build and run the pipeline
    # Note: building options this way and mutating the option views below is
    # deprecated and not future proof; a kwargs-based alternative is sketched
    # after this function.
    pipeline_options = opt.PipelineOptions()
    pipeline_options.view_as(
        opt.StandardOptions).streaming = False  # Set options first
    google_cloud_options = pipeline_options.view_as(opt.GoogleCloudOptions)
    google_cloud_options.project = PROJECT
    google_cloud_options.job_name = 'loadcountryno'
    google_cloud_options.staging_location = 'gs://csacsi/staging'
    google_cloud_options.temp_location = 'gs://csacsi/temp'
    google_cloud_options.region = 'europe-west1'

    with beam.Pipeline(options=pipeline_options
                       ) as pcoll:  # Creates the pipeline; 'pcoll' is the Pipeline object, not a PCollection

        #rows = pcoll | "Read from UDM.location" >> beam.io.Read (beam.io.BigQuerySource (UDM_table_spec_location)) # Read from UDM.location
        rows = pcoll | "Read from UDM.location" >> beam.io.Read(
            beam.io.BigQuerySource(
                query=location_query))  # Read from UDM.location
        # [Final Output PCollection] = ([Initial Input PCollection] | [First Transform] | [Second Transform] | [Third Transform])

        #dict_rows = rows | "Convert to dictionary" >> beam.ParDo (createDict ()) # Convert tuples returned by Oracle into dictionary needed for BigQuery

        #rows = rows | beam.ParDo (id2key ())

        rows | "Write to Consumption.Country_dim" >> beam.io.WriteToBigQuery(
            CONS_table_spec_country_dim,
            schema=country_schema,  # schema variable (list) could be used
            create_disposition=BigQueryDisposition.CREATE_NEVER,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
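

# As noted inside populate_country_dim(), building PipelineOptions and then
# mutating its option views is flagged as deprecated there. A possible
# alternative (a sketch, not from the source) passes the same settings as
# keyword arguments when the options object is constructed, the same
# flags=[], **kwargs pattern used elsewhere in these examples:
def build_country_dim_options():
    return opt.PipelineOptions(
        flags=[],
        streaming=False,
        project=PROJECT,
        job_name='loadcountryno',
        staging_location='gs://csacsi/staging',
        temp_location='gs://csacsi/temp',
        region='europe-west1',
    )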
示例#18
0
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    if known_args.auto_flags_experiment:
        _get_input_dimensions(known_args, pipeline_args)

    annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

    all_patterns = ([annotated_vcf_pattern]
                    if annotated_vcf_pattern else known_args.all_patterns)

    variant_merger = _get_variant_merge_strategy(known_args)

    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    avro_root_path = _get_avro_root_path(beam_pipeline_options)
    # Starts a pipeline to merge VCF headers in Beam if the total number of
    # files matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    schema = schema_converter.generate_schema_from_header_fields(
        header_fields, processed_variant_factory, variant_merger,
        known_args.use_1_based_coordinate, known_args.include_call_name)

    sharding = variant_sharding.VariantSharding(
        known_args.sharding_config_path)
    if sharding.should_keep_shard(sharding.get_residual_index()):
        num_shards = sharding.get_num_shards()
    else:
        num_shards = sharding.get_num_shards() - 1

    if known_args.update_schema_on_append:
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            table_name = bigquery_util.compose_table_name(
                known_args.output_table, table_suffix)
            bigquery_util.update_bigquery_schema_on_append(
                schema.fields, table_name)

    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(
        all_patterns,
        pipeline,
        known_args,
        pipeline_mode,
        use_1_based_coordinate=known_args.use_1_based_coordinate)
    if known_args.allow_malformed_records:
        variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
    sharded_variants = variants | 'ShardVariants' >> beam.Partition(
        shard_variants.ShardVariants(sharding), sharding.get_num_shards())
    variants = []
    for i in range(num_shards):
        suffix = sharding.get_output_table_suffix(i)
        # Collect each partition output into a list of PCollections
        variants.append(sharded_variants[i])
        if variant_merger:
            variants[i] |= ('MergeVariants' + suffix >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + suffix >>
            beam.Map(processed_variant_factory.create_processed_variant). \
            with_output_types(processed_variant.ProcessedVariant))
        _ = (variants[i]
             | 'VariantToAvro' + suffix >> variant_to_avro.VariantToAvroFiles(
                 avro_root_path + suffix,
                 schema,
                 allow_incompatible_records=known_args.
                 allow_incompatible_records,
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement),
                 include_call_name=known_args.include_call_name))
    result = pipeline.run()
    try:
        state = result.wait_until_finish()
        if state != beam.runners.runner.PipelineState.DONE:
            logging.error(
                'Dataflow pipeline terminated in an unexpected state: %s',
                state)
            raise AssertionError(
                'Dataflow pipeline terminated in {} state'.format(state))
    except Exception as e:
        logging.error('Dataflow pipeline failed.')
        raise e
    else:
        logging.info('Dataflow pipeline finished successfully.')
        metrics_util.log_all_counters(result)

    # After pipeline is done, create output tables and load AVRO files into them.
    schema_file = _write_schema_to_temp_file(schema, avro_root_path)
    suffixes = []
    try:
        for i in range(num_shards):
            suffixes.append(sharding.get_output_table_suffix(i))
            partition_range_end = sharding.get_output_table_partition_range_end(
                i)
            if not known_args.append:
                table_name = bigquery_util.compose_table_name(
                    known_args.output_table, suffixes[i])
                partitioning.create_bq_table(
                    table_name, schema_file,
                    bigquery_util.ColumnKeyConstants.START_POSITION,
                    partition_range_end)
                _record_newly_created_table(table_name)
                logging.info('Integer range partitioned table %s was created.',
                             table_name)
        if not known_args.append:
            _record_newly_created_table(
                sample_info_table_schema_generator.create_sample_info_table(
                    known_args.output_table))

        suffixes.append(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        load_avro = avro_util.LoadAvro(avro_root_path, known_args.output_table,
                                       suffixes, False)
        not_empty_variant_suffixes = load_avro.start_loading()
        logging.info('The following tables were loaded with at least 1 row:')
        for suffix in not_empty_variant_suffixes:
            logging.info(
                bigquery_util.compose_table_name(known_args.output_table,
                                                 suffix))
        # Remove sample_info table from both lists to avoid duplicating it when
        # --sample_lookup_optimized_output_table flag is set
        suffixes.remove(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        if sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in\
            not_empty_variant_suffixes:
            not_empty_variant_suffixes.remove(
                sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    except Exception as e:
        logging.error(
            'Something unexpected happened during the loading of AVRO '
            'files to BigQuery: %s', str(e))
        logging.info(
            'Since the write to BigQuery stage failed, we did not delete '
            'AVRO files in your GCS bucket. You can manually import them '
            'to BigQuery. To avoid extra storage charges, delete them if '
            'you do not need them. AVRO files are located at: %s',
            avro_root_path)
        raise e
    else:
        logging.warning('All AVRO files were successfully loaded to BigQuery.')
        if known_args.keep_intermediate_avro_files:
            logging.info(
                'Since "--keep_intermediate_avro_files" flag is set, the '
                'AVRO files are kept and stored at: %s', avro_root_path)
        else:
            if bigquery_util.delete_gcs_files(avro_root_path) != 0:
                logging.error(
                    'Deletion of intermediate AVRO files located at "%s" has '
                    'failed.', avro_root_path)

    if known_args.sample_lookup_optimized_output_table:
        flatten_call_column = partitioning.FlattenCallColumn(
            known_args.output_table, not_empty_variant_suffixes,
            known_args.append)
        try:
            flatten_schema_file = tempfile.mkstemp(
                suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
            if not flatten_call_column.get_flatten_table_schema(
                    flatten_schema_file):
                raise ValueError('Failed to extract schema of flatten table')
            # Create output flatten tables if needed
            if not known_args.append:
                # Create all sample optimized tables including those that will be empty.
                for suffix in suffixes:
                    output_table_id = bigquery_util.compose_table_name(
                        known_args.sample_lookup_optimized_output_table,
                        suffix)
                    partitioning.create_bq_table(
                        output_table_id, flatten_schema_file,
                        bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
                        partitioning.MAX_RANGE_END)
                    _record_newly_created_table(output_table_id)
                    logging.info(
                        'Sample lookup optimized table %s was created.',
                        output_table_id)
            # Copy to flatten sample lookup tables from the variant lookup tables.
            # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607).
            flatten_call_column.copy_to_flatten_table(
                known_args.sample_lookup_optimized_output_table)
            logging.info(
                'All sample lookup optimized tables are fully loaded.')
        except Exception as e:
            logging.error(
                'Something unexpected happened during the loading rows to '
                'sample optimized table stage: %s', str(e))
            raise e
示例#19
0
def preprocess_data(test_mode):
    import shutil, os, subprocess
    
    ### Build a unique, timestamped job name
    job_name = 'preprocess-credit-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    if test_mode:
        ### If running in test mode, write the output locally
        print('Launching job in test mode:')
        OUTPUT_DIR = './preproc'
        
        # delete output directory if it exists
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
        
        # create the directory
        os.makedirs(OUTPUT_DIR)
    else:
        ### If launching a Dataflow job, write the output to Google Cloud Storage (GCS)
        print('Launching Dataflow job {}:'.format(job_name))
        OUTPUT_DIR = 'gs://{0}/credit_default/preproc/'.format(BUCKET)
        try:
            subprocess.check_call('gsutil -m rm -r {}'.format(OUTPUT_DIR).split())
        except subprocess.CalledProcessError:
            # gsutil exits non-zero if OUTPUT_DIR does not exist yet; ignore it
            pass
    
    
    ### Let's define our own Apache Beam Options:
    options = {
        'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
        'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
        'job_name': job_name,
        'region': REGION,
        'project': PROJECT,
        'max_num_workers': 6,
        'setup_file': 'directory/to/setup.py'  # change this to the path of your setup.py file
    }
   
    opts = pipeline_options.PipelineOptions(flags = [], **options)
    
    ### Choose the runner
    if test_mode:
        ### local mode
        RUNNER = 'DirectRunner'
    else:
        ### Dataflow
        RUNNER = 'DataflowRunner'
        
    p = beam.Pipeline(RUNNER, options = opts)
    
    ### Let's create the Train and Eval Datasets:
    query = """
        SELECT 
            ABS(FARM_FINGERPRINT(CAST(ID AS STRING))) AS hashid,
            LIMIT_BAL,
            SEX,
            EDUCATION,
            MARRIAGE,
            AGE,
            PAY_0 AS PAY_1,
            PAY_2,
            PAY_3,
            PAY_4,
            PAY_5,
            PAY_6,
            CAST(BILL_AMT1 AS FLOAT64) AS BILL_AMT1,
            CAST(BILL_AMT2 AS FLOAT64) AS BILL_AMT2,
            CAST(BILL_AMT3 AS FLOAT64) AS BILL_AMT3,
            CAST(BILL_AMT4 AS FLOAT64) AS BILL_AMT4,
            CAST(BILL_AMT5 AS FLOAT64) AS BILL_AMT5,
            CAST(BILL_AMT6 AS FLOAT64) AS BILL_AMT6,
            CAST(PAY_AMT1 AS FLOAT64) AS PAY_AMT1,
            CAST(PAY_AMT2 AS FLOAT64) AS PAY_AMT2,
            CAST(PAY_AMT3 AS FLOAT64) AS PAY_AMT3,
            CAST(PAY_AMT4 AS FLOAT64) AS PAY_AMT4,
            CAST(PAY_AMT5 AS FLOAT64) AS PAY_AMT5,
            CAST(PAY_AMT6 AS FLOAT64) AS PAY_AMT6,
            CAST(default_payment_next_month AS INT64) AS default_payment
        FROM
            `credit-default-277316.credit_default.credit_default`
        """

    if test_mode:
        query = query + ' LIMIT 100' 

    for step in ['train', 'eval']:
        if step == 'train':
            selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashid),5) < 4'.format(query)
        else:
            selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashid),5) = 4'.format(query)

        (p 
         | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query = selquery, 
                                                                     use_standard_sql = True))
         | '{}_csv'.format(step) >> beam.FlatMap(to_csv)
         | '{}_out'.format(step) >> beam.io.Write(beam.io.WriteToText(
             os.path.join(OUTPUT_DIR, '{}.csv'.format(step))))
        )

    job = p.run()
    
    
    if test_mode:
        job.wait_until_finish()
        print("Done!")
示例#20
0
def run(argv=None):
    """Main function.

    Main function containing the Apache Beam pipeline describing how to process
    the input CSV file to generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | bq_mod.ReadFromBigQuery(
                project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT),
                query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY),
                gcs_location=getattr(runtime_options,
                                     c._OPTION_TEMP_GCS_LOCATION),
                use_standard_sql=True)
            | beam.FlatMap(
                c.bq_row_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | 'Write to validation_params table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'validation_params'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema={
                     'fields': [{
                         'name': 'calibration_start_date',
                         'type': 'STRING'
                     }, {
                         'name': 'calibration_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'cohort_start_date',
                         'type': 'STRING'
                     }, {
                         'name': 'cohort_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'holdout_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'model_time_granularity',
                         'type': 'STRING'
                     }, {
                         'name':
                         'model',
                         'type':
                         'RECORD',
                         'fields': [
                             {
                                 'name': 'frequency_model',
                                 'type': 'STRING'
                             },
                             {
                                 'name': 'num_customers_cohort',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_customers_cohort',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'num_transactions_validation',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_transactions_validation',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'validation_mape',
                                 'type': 'STRING'
                             },
                         ]
                     }]
                 },
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs_without_extra_dimension),
                pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed], prediction_params
        )

        prediction_by_customer_no_segments_no_extra_dimension = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        prediction_by_customer_no_segments = (
            prediction_by_customer_no_segments_no_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # [customer_id, p_alive, predicted_purchases, future_aov
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?]
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract prediction params
            | 'Write to prediction_params table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_params'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema={
                    'fields': [{
                        'name': 'prediction_period',
                        'type': 'INTEGER'
                    }, {
                        'name': 'prediction_period_unit',
                        'type': 'STRING'
                    }, {
                        'name': 'model_time_granularity',
                        'type': 'STRING'
                    }, {
                        'name': 'customers_modeled',
                        'type': 'INTEGER'
                    }, {
                        'name': 'transactions_observed',
                        'type': 'INTEGER'
                    }, {
                        'name': 'frequency_model',
                        'type': 'STRING'
                    }, {
                        'name':
                        'bgnbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'a',
                            'type': 'FLOAT'
                        }, {
                            'name': 'b',
                            'type': 'FLOAT'
                        }, {
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'bgbb_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }, {
                            'name': 'beta',
                            'type': 'FLOAT'
                        }, {
                            'name': 'gamma',
                            'type': 'FLOAT'
                        }, {
                            'name': 'delta',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'paretonbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 's',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }, {
                            'name': 'beta',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'gamma_gamma_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'p',
                            'type': 'FLOAT'
                        }, {
                            'name': 'q',
                            'type': 'FLOAT'
                        }, {
                            'name': 'v',
                            'type': 'FLOAT'
                        }]
                    }]
                },
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # only one of these two streams will contain values
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten()
            | beam.Map(c.clean_nan_and_inf))

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to Dict' >>
             beam.Map(c.list_to_dict, [
                 'customer_id', 'p_alive', 'predicted_purchases', 'future_aov',
                 'historical_aov', 'expected_value', 'frequency', 'recency',
                 'total_time_observed', 'segment', 'extra_dimension'
             ])
             | 'Write to prediction_by_customer table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_by_customer'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='customer_id:STRING, p_alive:FLOAT64'
                 ', predicted_purchases:FLOAT64'
                 ', future_aov:FLOAT64, historical_aov:FLOAT64'
                 ', expected_value:FLOAT64, frequency:INT64'
                 ', recency:FLOAT64'
                 ', total_time_observed:FLOAT64, segment:INT64'
                 ', extra_dimension:STRING',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (
            prediction_summary
            | 'prediction_summary to Dict' >> beam.Map(c.list_to_dict, [
                'segment', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value', 'average_predicted_purchases',
                'total_customer_value', 'number_of_customers',
                'perc_of_total_customer_value'
            ])
            | 'Write to prediction_summary table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_summary'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema='segment:INT64 ,average_retention_probability:FLOAT64'
                ', average_predicted_customer_value:FLOAT64'
                ', average_predicted_order_value:FLOAT64'
                ', average_predicted_purchases:FLOAT64'
                ', total_customer_value:FLOAT64'
                ', number_of_customers:FLOAT64'
                ', perc_of_total_customer_value:FLOAT64',
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is no extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to Dict' >> beam.Map(
                 c.list_to_dict, [
                     'extra_dimension', 'average_retention_probability',
                     'average_predicted_customer_value',
                     'average_predicted_order_value',
                     'average_predicted_purchases', 'total_customer_value',
                     'number_of_customers', 'perc_of_total_customer_value'
                 ])
             | 'Write to prediction_summary_extra_dimension table' >>
             io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_summary_extra_dimension'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='extra_dimension:STRING'
                 ', average_retention_probability:FLOAT64'
                 ', average_predicted_customer_value:FLOAT64'
                 ', average_predicted_order_value:FLOAT64'
                 ', average_predicted_purchases:FLOAT64'
                 ', total_customer_value:FLOAT64'
                 ', number_of_customers:INT64'
                 ', perc_of_total_customer_value:FLOAT64',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
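

# RuntimeOptions is referenced above (options.view_as(RuntimeOptions)) but its
# definition is not part of this example. A hypothetical sketch of such a
# class, using Beam's value-provider arguments so the pipeline can be deployed
# as a Dataflow template; the flag names below are illustrative assumptions,
# the real names come from the c._OPTION_* constants used above:
class RuntimeOptionsSketch(pipeline_options.PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--input_bq_project', type=str)
        parser.add_value_provider_argument('--input_bq_query', type=str)
        parser.add_value_provider_argument('--output_bq_project', type=str)
        parser.add_value_provider_argument('--output_bq_dataset', type=str)
        parser.add_value_provider_argument('--temp_gcs_location', type=str)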
示例#21
0
#### After this point, there are transforms used in the main pipeline
def read_data(pipeline, input_file):
    return pipeline | covidpipe.datasource.ReadFromCsv(input_file)


def select_wanted_columns(input_data, column_information, extra_columns):
    def select_wanted_columns(row: Dict[str, str],
                              column_info: Dict[str, Set[str]]):
        empty_columns = set(column_info[
            covidpipe.datasource.FindEmptyAndNonEmptyColumns.EMPTY])

        sanitized_row = {
            k: v
            for k, v in row.items()
            if (k not in empty_columns or k in extra_columns) and v
        }

        # If the row does not contain any values, then we must discard it.
        if sanitized_row:
            yield sanitized_row

    return input_data | 'SelectColumns' >> beam.FlatMap(
        select_wanted_columns, column_information)


#### After this point, the pipeline is set up to run
if __name__ == '__main__':
    import sys
    options = pipeline_options.PipelineOptions(sys.argv[1:])
    run(options)
示例#22
0
import apache_beam as beam
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner

import google.auth
from datetime import datetime, timedelta
import json


# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=['--streaming'])

options.view_as(pipeline_options.StandardOptions).streaming = True
_, options.view_as(GoogleCloudOptions).project = google.auth.default()

options.view_as(GoogleCloudOptions).region = 'us-west1'
options.view_as(GoogleCloudOptions).staging_location = 'gs://abdul-dataflow/staging'
options.view_as(GoogleCloudOptions).temp_location = 'gs://abdul-dataflow/temp'

#options.view_as(pipeline_options.SetupOptions).sdk_location = (
#            f'/root/apache-beam-custom/packages/beam/sdks/python/dist/apache-beam-{beam.version.__version__}0.tar.gz' )


topic = "projects/data228/topics/data228-hw8-in"


with beam.Pipeline(options=options) as pipeline:

    data = pipeline | "read" >> beam.io.ReadFromPubSub(topic=topic)
    windowed_data = (data | "window" >> beam.WindowInto(
        beam.window.FixedWindows(500)))  # 500-second fixed windows
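
    # The original snippet ends here. A hypothetical continuation (assumed, not
    # part of the source) could count the elements in each 500-second window
    # and print the counts on the workers:
    _ = (windowed_data
         | "count per window" >> beam.CombineGlobally(
             beam.combiners.CountCombineFn()).without_defaults()
         | "print" >> beam.Map(print))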
示例#23
0
def options():
    return pipeline_options.PipelineOptions()
示例#24
0
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)

  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  all_patterns = (
      [annotated_vcf_pattern] if annotated_vcf_pattern
      else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)

  pipeline_mode = pipeline_common.get_pipeline_mode(
      all_patterns,
      known_args.optimize_for_large_inputs)
  # Starts a pipeline to merge VCF headers in Beam if the total number of
  # files matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args,
                 pipeline_mode, annotated_vcf_pattern)


  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).\
            with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  if known_args.output_table:
    for i in range(num_partitions):
      table_suffix = ''
      if partitioner and partitioner.get_partition_name(i):
        table_suffix = '_' + partitioner.get_partition_name(i)
      table_name = known_args.output_table + table_suffix
      _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=known_args.allow_incompatible_records,
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=known_args.num_bigquery_write_shards,
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))

  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    _ = (
        variants | 'FlattenToOnePCollection' >> beam.Flatten()
        | 'VariantToAvro' >>
        variant_to_avro.VariantToAvroFiles(
            known_args.output_avro_path,
            header_fields,
            processed_variant_factory,
            variant_merger=variant_merger,
            allow_incompatible_records=known_args.allow_incompatible_records,
            omit_empty_sample_calls=known_args.omit_empty_sample_calls,
            null_numeric_value_replacement=(
                known_args.null_numeric_value_replacement))
    )

  result = pipeline.run()
  result.wait_until_finish()

  metrics_util.log_all_counters(result)
示例#25
0
def CreatePipeline(pipeline_args):
    poptions = pipeline_options.PipelineOptions(
        pipeline_args,
        runner="directrunner",
        direct_running_mode="multi_threading")
    return beam.Pipeline(options=poptions)
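

# A small usage sketch (not from the source): build the multi-threaded direct
# runner pipeline via CreatePipeline, push a trivial in-memory collection
# through it, and block until it finishes.
if __name__ == '__main__':
    pipeline = CreatePipeline([])
    _ = (pipeline
         | beam.Create(['a', 'b', 'c'])
         | beam.Map(print))
    pipeline.run().wait_until_finish()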