Example #1
File: task.py Project: roger-mike/beam
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# beam-playground:
#   name: Flatten
#   description: Task from katas that merges two PCollections of words into a single PCollection.
#   multifile: false
#   categories:
#     - Flatten

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

    wordsStartingWithA = \
        p | 'Words starting with A' >> beam.Create(['apple', 'ant', 'arrow'])

    wordsStartingWithB = \
        p | 'Words starting with B' >> beam.Create(['ball', 'book', 'bow'])

    ((wordsStartingWithA, wordsStartingWithB) | beam.Flatten() | LogElements())
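
Flatten is not limited to a pair: any tuple or list of PCollections from the same pipeline can be merged. A minimal sketch of the same kata extended to three inputs, printing results instead of using the LogElements helper:

import apache_beam as beam

with beam.Pipeline() as p:
    words_a = p | 'A words' >> beam.Create(['apple', 'ant', 'arrow'])
    words_b = p | 'B words' >> beam.Create(['ball', 'book', 'bow'])
    words_c = p | 'C words' >> beam.Create(['cat', 'car', 'cone'])

    # A tuple (or list) of PCollections pipes straight into Flatten.
    merged = (words_a, words_b, words_c) | beam.Flatten()
    merged | beam.Map(print)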
Example #2
    def expand(self, inputs):
        pcoll, = inputs
        if self._top_k is not None and self._top_k < 0:
            raise ValueError(
                'top_k for VocabularyImpl should be >= 0 or None, got '
                '{}.'.format(self._top_k))
        if self._frequency_threshold is not None and self._frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold for VocabularyImpl should be >= 0 or None, '
                'got {}.'.format(self._frequency_threshold))
        if self._coverage_top_k is not None and self._coverage_top_k < 0:
            raise ValueError(
                'coverage_top_k for VocabularyImpl should be >= 0 or '
                'None, got {}.'.format(self._coverage_top_k))
        if (self._coverage_frequency_threshold is not None
                and self._coverage_frequency_threshold < 0):
            raise ValueError(
                'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
                'None, got {}.'.format(self._coverage_frequency_threshold))

        # Create a PCollection of (count, element) pairs, then iterate over
        # it to create a single-element PCollection containing the list of
        # pairs sorted by decreasing count (and by value for equal counts).

        def is_problematic_string(kv):
            string, _ = kv  # Ignore counts.
            return string and b'\n' not in string and b'\r' not in string

        if (self._vocab_ordering_type ==
                tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
            flatten_map_fn = _flatten_to_key_and_means_accumulator_list
            combine_transform = _MutualInformationTransform(  # pylint: disable=no-value-for-parameter
                self._use_adjusted_mutual_info, self._min_diff_from_avg)
        elif (self._vocab_ordering_type ==
              tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
            flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
            combine_transform = beam.CombinePerKey(sum)
        else:
            flatten_map_fn = _flatten_value_to_list
            combine_transform = beam.combiners.Count.PerElement()

        raw_counts = (
            pcoll
            | 'FlattenStringsAndMaybeWeightsLabels' >>
            beam.FlatMap(flatten_map_fn)
            | 'CountPerString' >> combine_transform
            | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
            | 'SwapStringsAndCounts' >> beam.KvSwap())

        counts = (
            raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
                _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                    self._frequency_threshold, self._top_k, None)))

        if self._key_fn:
            coverage_counts = (
                raw_counts | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
                    _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                        self._coverage_frequency_threshold,
                        self._coverage_top_k, self._key_fn)))

            counts = ((counts, coverage_counts)
                      | 'MergeStandardAndCoverageArms' >> beam.Flatten()
                      | 'RemoveDuplicates' >> beam.RemoveDuplicates())

        return counts | 'WriteVocabFile' >> (
            _WriteVocabFile(  # pylint: disable=no-value-for-parameter
                self._base_temp_dir, self._vocab_filename,
                self._store_frequency))
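
The coverage branch above shows a common shape: two PCollections of the same element type are merged with Flatten and then deduplicated. A standalone sketch of that merge-and-dedupe pattern (using beam.Distinct, the current name of the RemoveDuplicates transform used above; the example data is made up):

import apache_beam as beam

with beam.Pipeline() as p:
    standard_counts = p | 'Standard' >> beam.Create([(b'cat', 3), (b'dog', 2)])
    coverage_counts = p | 'Coverage' >> beam.Create([(b'dog', 2), (b'rare', 1)])

    merged = ((standard_counts, coverage_counts)
              | 'MergeArms' >> beam.Flatten()
              | 'RemoveDuplicates' >> beam.Distinct())
    merged | beam.Map(print)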
Example #3
def run(argv=None):
    default_db = 'beam_mongodbio_it_db'
    default_coll = 'integration_test_%d' % time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--mongo_uri',
                        default='mongodb://localhost:27017',
                        help='mongo uri string for connection')
    parser.add_argument('--mongo_db',
                        default=default_db,
                        help='mongo database name')
    parser.add_argument('--mongo_coll',
                        default=default_coll,
                        help='mongo collection name')
    parser.add_argument(
        '--num_documents',
        default=100000,
        help='The expected number of documents to be generated '
        'for write or read',
        type=int)
    parser.add_argument('--batch_size',
                        default=10000,
                        type=int,
                        help=('batch size for writing to mongodb'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Test Write to MongoDB
    with TestPipeline(options=PipelineOptions(pipeline_args)) as p:
        start_time = time.time()
        _LOGGER.info('Writing %d documents to mongodb',
                     known_args.num_documents)

        _ = (p | beam.Create([known_args.num_documents])
             | 'Create documents' >> beam.ParDo(GenerateDocs())
             | 'WriteToMongoDB' >> beam.io.WriteToMongoDB(
                 known_args.mongo_uri, known_args.mongo_db,
                 known_args.mongo_coll, known_args.batch_size))
    elapsed = time.time() - start_time
    _LOGGER.info('Writing %d documents to mongodb finished in %.3f seconds' %
                 (known_args.num_documents, elapsed))

    # Test Read from MongoDB
    total_sum = sum(range(known_args.num_documents))
    mod_3_sum = sum(num for num in range(known_args.num_documents)
                    if num % 3 == 0)
    mod_3_count = sum(1 for num in range(known_args.num_documents)
                      if num % 3 == 0)
    # yapf: disable
    read_cases = [
        # (reader_params, expected)
        (
            {
                'projection': ['number']
            },
            {
                'number_sum': total_sum,
                'docs_count': known_args.num_documents
            }
        ),
        (
            {
                'filter': {'number_mod_3': 0},
                'projection': ['number']
            },
            {
                'number_sum': mod_3_sum,
                'docs_count': mod_3_count
            }
        ),
        (
            {
                'projection': ['number'],
                'bucket_auto': True
            },
            {
                'number_sum': total_sum,
                'docs_count': known_args.num_documents
            }
        ),
        (
            {
                'filter': {'number_mod_3': 0},
                'projection': ['number'],
                'bucket_auto': True
            },
            {
                'number_sum': mod_3_sum,
                'docs_count': mod_3_count
            }
        ),
    ]
    # yapf: enable
    for reader_params, expected in read_cases:
        with TestPipeline(options=PipelineOptions(pipeline_args)) as p:
            start_time = time.time()
            _LOGGER.info('=' * 80)
            _LOGGER.info('Reading from mongodb %s:%s', known_args.mongo_db,
                         known_args.mongo_coll)
            _LOGGER.info('reader params   : %s', reader_params)
            _LOGGER.info('expected results: %s', expected)
            docs = (p | 'ReadFromMongoDB' >> beam.io.ReadFromMongoDB(
                known_args.mongo_uri, known_args.mongo_db,
                known_args.mongo_coll, **reader_params)
                    | 'Map' >> beam.Map(lambda doc: doc['number']))
            number_sum = (docs | 'Combine' >> beam.CombineGlobally(sum))
            docs_count = (docs | 'Count' >> beam.combiners.Count.Globally())
            r = ([number_sum, docs_count] | 'Flatten' >> beam.Flatten())
            assert_that(
                r, equal_to([expected['number_sum'], expected['docs_count']]))

        elapsed = time.time() - start_time
        _LOGGER.info('Reading documents from mongodb finished in %.3f seconds',
                     elapsed)

    # Clean-up
    with MongoClient(host=known_args.mongo_uri) as client:
        client.drop_database(known_args.mongo_db)
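
The read test above relies on Flatten turning two single-element PCollections (a global sum and a global count) into one two-element PCollection that a single assert_that can check. A reduced sketch of that assertion pattern with made-up data:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    nums = p | beam.Create([1, 2, 3, 4])
    total = nums | 'Sum' >> beam.CombineGlobally(sum)          # -> [10]
    count = nums | 'Count' >> beam.combiners.Count.Globally()  # -> [4]
    merged = [total, count] | 'Flatten' >> beam.Flatten()
    assert_that(merged, equal_to([10, 4]))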
Example #4
def run(argv=None):
    """Main function.

    Contains the Apache Beam pipeline that processes the input CSV file to
    generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | bq_mod.ReadFromBigQuery(
                project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT),
                query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY),
                gcs_location=getattr(runtime_options,
                                     c._OPTION_TEMP_GCS_LOCATION),
                use_standard_sql=True)
            | beam.FlatMap(
                c.bq_row_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | 'Write to validation_params table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'validation_params'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema={
                     'fields': [{
                         'name': 'calibration_start_date',
                         'type': 'STRING'
                     }, {
                         'name': 'calibration_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'cohort_start_date',
                         'type': 'STRING'
                     }, {
                         'name': 'cohort_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'holdout_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'model_time_granularity',
                         'type': 'STRING'
                     }, {
                         'name':
                         'model',
                         'type':
                         'RECORD',
                         'fields': [
                             {
                                 'name': 'frequency_model',
                                 'type': 'STRING'
                             },
                             {
                                 'name': 'num_customers_cohort',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_customers_cohort',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'num_transactions_validation',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_transactions_validation',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'validation_mape',
                                 'type': 'STRING'
                             },
                         ]
                     }]
                 },
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs_without_extra_dimension),
                pvalue.AsSingleton(num_customers), pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed], prediction_params
        )

        prediction_by_customer_no_segments_no_extra_dimension = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        prediction_by_customer_no_segments = (
            prediction_by_customer_no_segments_no_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # [customer_id, p_alive, predicted_purchases, future_aov
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?]
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract prediction params
            | 'Write to prediction_params table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_params'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema={
                    'fields': [{
                        'name': 'prediction_period',
                        'type': 'INTEGER'
                    }, {
                        'name': 'prediction_period_unit',
                        'type': 'STRING'
                    }, {
                        'name': 'model_time_granularity',
                        'type': 'STRING'
                    }, {
                        'name': 'customers_modeled',
                        'type': 'INTEGER'
                    }, {
                        'name': 'transactions_observed',
                        'type': 'INTEGER'
                    }, {
                        'name': 'frequency_model',
                        'type': 'STRING'
                    }, {
                        'name':
                        'bgnbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'a',
                            'type': 'FLOAT'
                        }, {
                            'name': 'b',
                            'type': 'FLOAT'
                        }, {
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'bgbb_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }, {
                            'name': 'beta',
                            'type': 'FLOAT'
                        }, {
                            'name': 'gamma',
                            'type': 'FLOAT'
                        }, {
                            'name': 'delta',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'paretonbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 's',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }, {
                            'name': 'beta',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'gamma_gamma_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'p',
                            'type': 'FLOAT'
                        }, {
                            'name': 'q',
                            'type': 'FLOAT'
                        }, {
                            'name': 'v',
                            'type': 'FLOAT'
                        }]
                    }]
                },
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # only one of these two streams will contain values
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten()
            | beam.Map(c.clean_nan_and_inf))

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to Dict' >>
             beam.Map(c.list_to_dict, [
                 'customer_id', 'p_alive', 'predicted_purchases', 'future_aov',
                 'historical_aov', 'expected_value', 'frequency', 'recency',
                 'total_time_observed', 'segment', 'extra_dimension'
             ])
             | 'Write to prediction_by_customer table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_by_customer'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='customer_id:STRING, p_alive:FLOAT64'
                 ', predicted_purchases:FLOAT64'
                 ', future_aov:FLOAT64, historical_aov:FLOAT64'
                 ', expected_value:FLOAT64, frequency:INT64'
                 ', recency:FLOAT64'
                 ', total_time_observed:FLOAT64, segment:INT64'
                 ', extra_dimension:STRING',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (
            prediction_summary
            | 'prediction_summary to Dict' >> beam.Map(c.list_to_dict, [
                'segment', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value', 'average_predicted_purchases',
                'total_customer_value', 'number_of_customers',
                'perc_of_total_customer_value'
            ])
            | 'Write to prediction_summary table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_summary'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema='segment:INT64 ,average_retention_probability:FLOAT64'
                ', average_predicted_customer_value:FLOAT64'
                ', average_predicted_order_value:FLOAT64'
                ', average_predicted_purchases:FLOAT64'
                ', total_customer_value:FLOAT64'
                ', number_of_customers:FLOAT64'
                ', perc_of_total_customer_value:FLOAT64',
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is no extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to Dict' >> beam.Map(
                 c.list_to_dict, [
                     'extra_dimension', 'average_retention_probability',
                     'average_predicted_customer_value',
                     'average_predicted_order_value',
                     'average_predicted_purchases', 'total_customer_value',
                     'number_of_customers', 'perc_of_total_customer_value'
                 ])
             | 'Write to prediction_summary_extra_dimension table' >>
             io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_summary_extra_dimension'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='extra_dimension:STRING'
                 ', average_retention_probability:FLOAT64'
                 ', average_predicted_customer_value:FLOAT64'
                 ', average_predicted_order_value:FLOAT64'
                 ', average_predicted_purchases:FLOAT64'
                 ', total_customer_value:FLOAT64'
                 ', number_of_customers:INT64'
                 ', perc_of_total_customer_value:FLOAT64',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
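
A recurring shape in the pipeline above is flattening two branches of which only one actually carries data (the "exact" and "hash" segmentation branches): Flatten is safe here because an empty PCollection contributes nothing to the merge. A reduced sketch of that partition-then-merge shape, with a made-up predicate:

import apache_beam as beam

with beam.Pipeline() as p:
    records = p | beam.Create(list(range(10)))

    # Route every element to exactly one of two branches; with this
    # predicate only the first branch ever receives elements.
    small, large = records | beam.Partition(
        lambda x, _num_partitions: 0 if x < 100 else 1, 2)

    # Flatten still merges the populated and the empty branch cleanly.
    merged = (small, large) | beam.Flatten()
    merged | beam.Map(print)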
Example #5
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   singleton_pc):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This ensures atomicity when only some
         of the load jobs fail: if any of them fails, the copy jobs are not
         triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters),
                load_job_name_pcv, *self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            singleton_pc
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(temp_tables_load_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            singleton_pc
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters),
                load_job_name_pcv, *self.schema_side_inputs))

        _ = (singleton_pc
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 beam.pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
Example #6
    def _run_model_inference(
        self,
        data_spec: bulk_inferrer_pb2.DataSpec,
        output_example_spec: bulk_inferrer_pb2.OutputExampleSpec,
        examples: List[types.Artifact],
        output_examples: Optional[types.Artifact],
        inference_result: Optional[types.Artifact],
        inference_endpoint: model_spec_pb2.InferenceSpecType,
    ) -> None:
        """Runs model inference on given examples data.

    Args:
      data_spec: bulk_inferrer_pb2.DataSpec instance.
      output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
      examples: List of `standard_artifacts.Examples` artifacts.
      output_examples: Optional output `standard_artifacts.Examples` artifact.
      inference_result: Optional output `standard_artifacts.InferenceResult`
        artifact.
      inference_endpoint: Model inference endpoint.
    """

        example_uris = {}
        for example_artifact in examples:
            for split in artifact_utils.decode_split_names(
                    example_artifact.split_names):
                if data_spec.example_splits:
                    if split in data_spec.example_splits:
                        example_uris[split] = artifact_utils.get_split_uri(
                            [example_artifact], split)
                else:
                    example_uris[split] = artifact_utils.get_split_uri(
                        [example_artifact], split)

        payload_format, _ = tfxio_utils.resolve_payload_format_and_data_view_uri(
            examples)

        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples,
            _TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            # We have to specify this parameter in order to create a RawRecord TFXIO
            # but we won't use the RecordBatches so the column name of the raw
            # records does not matter.
            raw_record_column_name='unused')

        if output_examples:
            output_examples.split_names = artifact_utils.encode_split_names(
                sorted(example_uris.keys()))

        with self._make_beam_pipeline() as pipeline:
            data_list = []
            for split, example_uri in example_uris.items():
                tfxio = tfxio_factory(
                    [io_utils.all_files_pattern(example_uri)])
                assert isinstance(
                    tfxio, record_based_tfxio.RecordBasedTFXIO
                ), ('Unable to use TFXIO {} as it does not support reading raw records.'
                    .format(type(tfxio)))
                # pylint: disable=no-value-for-parameter
                data = (pipeline
                        | 'ReadData[{}]'.format(split) >>
                        tfxio.RawRecordBeamSource()
                        | 'RunInference[{}]'.format(split) >> _RunInference(
                            payload_format, inference_endpoint))
                if output_examples:
                    output_examples_split_uri = artifact_utils.get_split_uri(
                        [output_examples], split)
                    logging.info('Path of output examples split `%s` is %s.',
                                 split, output_examples_split_uri)
                    _ = (data
                         | 'WriteExamples[{}]'.format(split) >> _WriteExamples(
                             output_example_spec, output_examples_split_uri))
                    # pylint: enable=no-value-for-parameter

                data_list.append(data)

            if inference_result:
                _ = (
                    data_list
                    |
                    'FlattenInferenceResult' >> beam.Flatten(pipeline=pipeline)
                    | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
                        os.path.join(inference_result.uri,
                                     _PREDICTION_LOGS_FILE_NAME),
                        file_name_suffix='.gz',
                        coder=beam.coders.ProtoCoder(
                            prediction_log_pb2.PredictionLog)))

        if output_examples:
            logging.info('Output examples written to %s.', output_examples.uri)
        if inference_result:
            logging.info('Inference result written to %s.',
                         inference_result.uri)
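
When the PCollections to merge are accumulated in a plain Python list, as with data_list above, Flatten accepts the list directly; passing pipeline= pins the transform to the right pipeline even if the list happens to be empty. A minimal sketch of that shape:

import apache_beam as beam

with beam.Pipeline() as p:
    data_list = []
    for split in ('train', 'eval'):
        data = p | 'Read[{}]'.format(split) >> beam.Create(
            ['{}-record'.format(split)])
        data_list.append(data)

    # pipeline= lets Flatten resolve its pipeline even for an empty list.
    merged = data_list | 'FlattenSplits' >> beam.Flatten(pipeline=p)
    merged | beam.Map(print)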
Example #7
    def expand(self,
               pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
        def _sum_pairwise(
            iter_of_pairs: Iterator[Tuple[Union[int, float], Union[int,
                                                                   float]]]
        ) -> Tuple[Union[int, float], Union[int, float]]:
            """Computes sum of counts and weights."""
            # We take advantage of the fact that constructing a np array from a list
            # is much faster as the length is known beforehand.
            if isinstance(iter_of_pairs, list):
                arr = np.array(iter_of_pairs,
                               dtype=[('c', np.int64), ('w', np.float)])
            else:
                arr = np.fromiter(iter_of_pairs,
                                  dtype=[('c', np.int64), ('w', np.float)])
            return arr['c'].sum(), arr['w'].sum()

        if self._weight_feature is not None:
            sum_fn = _sum_pairwise
        else:
            # For non-weighted case, use sum combine fn over integers to allow Beam
            # to use Cython combiner.
            sum_fn = sum
        top_k_tuples_combined = (
            pcoll
            | 'ToTopKTuples' >> beam.FlatMap(
                _to_topk_tuples,
                categorical_features=self._categorical_features,
                weight_feature=self._weight_feature)
            | 'CombineCountsAndWeights' >> beam.CombinePerKey(sum_fn))

        top_k = top_k_tuples_combined
        if self._weight_feature is not None:
            top_k |= 'Unweighted_DropWeights' >> beam.Map(lambda x:
                                                          (x[0], x[1][0]))
        # (slice_key, feature, v), c
        top_k |= (
            'Unweighted_Prepare' >> beam.Map(lambda x: ((x[0][0], x[0][1]),
                                                        (x[0][2], x[1])))
            # (slice_key, feature), (v, c)
            | 'Unweighted_TopK' >> beam.combiners.Top().PerKey(max(
                self._num_top_values, self._num_rank_histogram_buckets),
                                                               key=lambda x:
                                                               (x[1], x[0]))
            | 'Unweighted_ToProto' >> beam.Map(
                _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                categorical_features=self._categorical_features,
                is_weighted_stats=False,
                num_top_values=self._num_top_values,
                frequency_threshold=self._frequency_threshold,
                num_rank_histogram_buckets=self._num_rank_histogram_buckets))
        uniques = (
            top_k_tuples_combined
            | 'Uniques_DropValues' >> beam.Map(lambda x: (x[0][0], x[0][1]))
            | 'Uniques_CountPerFeatureName' >>
            beam.combiners.Count().PerElement()
            | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_uniques_for_single_feature,
                categorical_features=self._categorical_features))
        result_protos = [top_k, uniques]

        if self._weight_feature is not None:
            weighted_top_k = (
                top_k_tuples_combined
                | 'Weighted_DropCounts' >> beam.Map(lambda x: (x[0], x[1][1]))
                | 'Weighted_Prepare' >>
                # (slice_key, feature), (v, w)
                beam.Map(lambda x: ((x[0][0], x[0][1]), (x[0][2], x[1])))
                | 'Weighted_TopK' >> beam.combiners.Top().PerKey(max(
                    self._num_top_values, self._num_rank_histogram_buckets),
                                                                 key=lambda x:
                                                                 (x[1], x[0]))
                | 'Weighted_ToProto' >> beam.Map(
                    _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=True,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._weighted_frequency_threshold,
                    num_rank_histogram_buckets=self._num_rank_histogram_buckets
                ))
            result_protos.append(weighted_top_k)

        def _deserialize_sliced_feature_stats_proto(entry):
            feature_stats_proto = statistics_pb2.DatasetFeatureStatistics()
            feature_stats_proto.ParseFromString(entry[1])
            return entry[0], feature_stats_proto

        return (
            result_protos
            | 'FlattenTopKUniquesFeatureStatsProtos' >> beam.Flatten()
            # TODO(b/121152126): This deserialization stage is a workaround.
            # Remove this once it is no longer needed.
            | 'DeserializeTopKUniquesFeatureStatsProto' >>
            beam.Map(_deserialize_sliced_feature_stats_proto))
Example #8
File: run.py Project: trucnguyenlam/klio
    def _setup_data_io_filters(self, in_pcol, label_prefix=None):
        # label prefixes are required for multiple inputs (to avoid label
        # name collisions in Beam)
        if self._has_multi_data_inputs or self._has_multi_data_outputs:
            logging.error(
                "Klio does not (yet) support multiple data inputs and outputs."
            )
            raise SystemExit(1)

        data_in_config, data_out_config = None, None
        if self._has_data_inputs:
            data_in_config = self.config.job_config.data.inputs[0]
        if self._has_data_outputs:
            data_out_config = self.config.job_config.data.outputs[0]

        pfx = ""
        if label_prefix is not None:
            pfx = "[{}] ".format(label_prefix)

        def lbl(label):
            return "{}{}".format(pfx, label)

        to_process_output = in_pcol
        pass_thru = None
        if data_in_config:
            pings = in_pcol | lbl("Ping Filter") >> helpers.KlioFilterPing()
            to_process_output = pings.process
            pass_thru = pings.pass_thru

        if data_out_config and not data_out_config.skip_klio_existence_check:
            output_exists = (to_process_output
                             | lbl("Output Exists Filter") >>
                             helpers.KlioGcsCheckOutputExists())
            output_force = (
                output_exists.found
                | lbl("Output Force Filter") >> helpers.KlioFilterForce())
            to_pass_thru_tuple = (pass_thru, output_force.pass_thru)
            to_pass_thru = (to_pass_thru_tuple
                            | lbl("Flatten to Pass Thru") >> beam.Flatten())

            to_filter_input_tuple = (
                output_exists.not_found,
                output_force.process,
            )
            to_filter_input = (to_filter_input_tuple
                               | lbl("Flatten to Process") >> beam.Flatten())
        else:
            to_pass_thru = pass_thru
            to_filter_input = to_process_output

        if data_in_config and not data_in_config.skip_klio_existence_check:
            input_exists = (to_filter_input
                            | lbl("Input Exists Filter") >>
                            helpers.KlioGcsCheckInputExists())
            _ = (input_exists.not_found
                 | lbl("Drop Not Found Data") >> helpers.KlioDrop())
            to_process = input_exists.found
        else:
            to_process = to_filter_input

        return to_process, to_pass_thru
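
The Klio helpers above return tagged outputs (process/pass_thru, found/not_found), and Flatten is what stitches the pass-through branches back together. A reduced sketch of merging a tagged output with another PCollection; the DoFn and tag names here are illustrative, not Klio's:

import apache_beam as beam

class SplitEvenOdd(beam.DoFn):
    def process(self, x):
        if x % 2 == 0:
            yield beam.pvalue.TaggedOutput('even', x)
        else:
            yield x  # main output: odd numbers

with beam.Pipeline() as p:
    nums = p | beam.Create([1, 2, 3, 4, 5])
    split = nums | beam.ParDo(SplitEvenOdd()).with_outputs('even', main='odd')

    extras = p | 'Extras' >> beam.Create([100])
    merged = (split.even, extras) | 'MergeEvenAndExtras' >> beam.Flatten()
    merged | beam.Map(print)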
Example #9
    def test_instrument_mixed_streaming_batch(self):
        """Tests caching for both batch and streaming sources in the same pipeline.

    This ensures that cached bounded and unbounded sources are read from the
    TestStream.
    """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner(),
                                   options)
        streaming_cache_manager = StreamingCache(cache_dir=None)
        ie.current_env().set_cache_manager(streaming_cache_manager, p_original)
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        source_2 = p_original | 'source2' >> beam.Create([1, 2, 3, 4, 5])

        # pylint: disable=possibly-unused-variable
        pcoll_1 = ((source_1, source_2)
                   | beam.Flatten()
                   | 'square1' >> beam.Map(lambda x: x * x))

        # Watch but do not cache the PCollections.
        ib.watch(locals())

        self._mock_write_cache(p_original, [b''],
                               self.cache_key_of('source_2', source_2))
        ie.current_env().mark_pcollection_computed([source_2])

        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        source_2_cache_key = self.cache_key_of('source_2', source_2)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(streaming_cache_manager, p_expected)
        test_stream = (
            p_expected
            | TestStream(output_tags=[source_1_cache_key, source_2_cache_key]))
        # pylint: disable=expression-not-assigned
        ((test_stream[self.cache_key_of('source_1', source_1)],
          test_stream[self.cache_key_of('source_2', source_2)])
         | beam.Flatten()
         | 'square1' >> beam.Map(lambda x: x * x)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key, source_2_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(
            self, p_expected.to_runner_api(use_fake_coders=True),
            instrumenter.instrumented_pipeline_proto())
Example #10
query_results | 'Write log 1' >> WriteToText(DIR_PATH + 'query_results.txt')

# ParDo to the Lyft count
lyft_pcoll = query_results | 'Lyft Count Fn' >> beam.ParDo(LyftFn())

# checking the pcoll
lyft_pcoll | 'write log 2' >> WriteToText(DIR_PATH + 'lyft_pcoll.txt')

# ParDo to the Uber count
uber_pcoll = query_results | 'Uber Count Fn' >> beam.ParDo(UberFn())

# checking the pcoll
uber_pcoll | 'write log 3' >> WriteToText(DIR_PATH + 'uber_pcoll.txt')

# Combine both query results into one
all_rides_pcoll = (lyft_pcoll, uber_pcoll)| 'Merge pCollections' >> beam.Flatten()

# write all rides to output
all_rides_pcoll | 'Write log 4' >> WriteToText(DIR_PATH + 'all_rides_pcoll.txt')

# fix the format of the output table
fixed_rides_pcoll = all_rides_pcoll | 'Fix Format' >> beam.ParDo(FixFormatFn())

# write the output table to txt
fixed_rides_pcoll | 'Write log 5' >> WriteToText(DIR_PATH + 'fixed_rides_pcoll.txt')

dataset_id = 'rideshare_modeled'
table_id = 'Rider_Beam_DF'
schema_id = 'id:STRING, cab_type:STRING'

# write PCollection to new BQ table
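
The listing ends before that final write; a hedged sketch of how such a WriteToBigQuery step is commonly expressed with the variables defined above (PROJECT_ID is assumed to be defined elsewhere in the original script):

fixed_rides_pcoll | 'Write to BQ' >> beam.io.WriteToBigQuery(
    table=table_id,
    dataset=dataset_id,
    project=PROJECT_ID,  # assumed; not shown in the excerpt
    schema=schema_id,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)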
Example #11
File: c4.py Project: lukewheless/NLP
    def _get_page_content(self, pipeline, file_paths, dl_manager):
        """Build PCollection of un-split page content."""

        wet_file_paths = pipeline | "create_wet_files" >> beam.Create(
            file_paths["wet_files"])
        if "wet_urls" in file_paths:

            def download_url(url, downloader, pipeline):
                path = downloader.download(url)
                if not pipeline.is_local():
                    path = downloader.ship_files_with_pipeline(path, pipeline)
                return path

            dl_wet_file_paths = (
                pipeline
                | "create_wet_urls" >> beam.Create(file_paths["wet_urls"])
                | beam.Map(
                    download_url, downloader=dl_manager, pipeline=pipeline))
            wet_file_paths = (wet_file_paths,
                              dl_wet_file_paths) | beam.Flatten()

        # Parse WET files and filter by length.
        # Output: url, text
        page_content = wet_file_paths | beam.FlatMap(
            split_wet_file) | beam.Filter(is_valid_length)

        # Optionally filter for RealNews domains.
        # Output: url, text
        if self.config.realnewslike:
            with open(file_paths["realnews_domains"], "r") as f:
                realnews_domains = json.load(f)
            page_content = page_content | beam.Filter(is_realnews_domain,
                                                      realnews_domains)

        # Normalize and deduplicate by URL.
        # Output: url, text
        page_content = (page_content
                        | "normalize_url" >> beam.Map(normalize_url)
                        | "group_url" >> beam.GroupByKey()
                        | beam.Map(dedupe_urls))

        # Optionally filter for WebText-like URLs.
        # Output: url, text
        if self.config.webtextlike:
            webtextlike_urls = (
                pipeline
                | "read_webtextlike_urls" >> beam.io.ReadFromText(
                    os.path.join(file_paths["openwebtext_urls_zip"],
                                 _OPENWEBTEXT_URLS_FILE_PATTERN))
                | "add_dummy_page" >> beam.Map(lambda x: (x, ""))
                | "normal_webtext_url" >> beam.Map(normalize_url))
            page_content = ({
                "text": page_content,
                "webtextlike_urls": webtextlike_urls
            }
                            | "group_webtextlike_urls" >> beam.CoGroupByKey()
                            | beam.FlatMap(filter_by_webtextlike))

        # Optionally clean pages of badwords, boilerplate text, and duplicate
        # spans of sentences.
        # Output: url, text
        if self.config.clean:
            with open(file_paths["badwords"], "r") as f:
                badwords = [l.strip() for l in f]
            page_content = page_content | "clean_pages" >> beam.FlatMap(
                get_clean_page_fn(badwords))
            page_content = remove_duplicate_text(page_content)

        # Optionally filter out non-`language` pages. We do this after cleaning
        # since it may change the predominant language.
        if self.config.lang != "all":
            page_content |= beam.Filter(is_language, language=self.config.lang)

        return page_content
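
The normalize/group/dedupe steps above are the core of the URL deduplication. A runnable sketch of that pattern with toy data and an illustrative dedupe function (not c4's actual dedupe_urls):

import apache_beam as beam

def dedupe_pages(kv):
    # Keep one deterministic representative page per URL.
    url, pages = kv
    return url, sorted(pages)[0]

with beam.Pipeline() as p:
    pages = p | beam.Create([('example.com/a', 'text 1'),
                             ('example.com/a', 'text 1 (mirror)'),
                             ('example.com/b', 'text 2')])
    deduped = (pages
               | 'group_url' >> beam.GroupByKey()
               | 'dedupe_urls' >> beam.Map(dedupe_pages))
    deduped | beam.Map(print)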
Example #12
0
  def expand(self, tensor_pcoll_mapping):
    """Converts a dict of statistics to a transform function.

    Args:
      tensor_pcoll_mapping: A dictionary mapping `Tensor`s to a singleton
          PCollection containing a _TensorValue.

    Returns:
      A dict from tensor names to singleton `PCollection`s.
    """
    # Convert tensor_value_mapping into a DictPCollectionView so it can be
    # passed as a side input to the beam Map below.
    tensor_value_pairs = []
    for name, pcoll in six.iteritems(tensor_pcoll_mapping):
      tensor_value_pairs.append(
          pcoll
          | 'AddName[%s]' % name >> beam.Map(lambda x, name=name: (name, x)))
    tensor_value_mapping = beam.pvalue.AsDict(
        tensor_value_pairs
        | 'MergeTensorValuePairs' >> beam.Flatten(pipeline=self.pipeline))

    def compute_deferred_metadata(metadata, column_schema_overrides,
                                  saved_model_dir, tensor_value_mapping):
      """Extracts constant values from graph."""
      tensor_names = {
          tensor_name
          for override in six.itervalues(column_schema_overrides)
          for tensor_name in [override.min_value, override.max_value]}

      graph = tf.Graph()
      with graph.as_default():
        tensor_replacement_map = {}
        for orig_tensor_name, (value,
                               is_asset) in six.iteritems(tensor_value_mapping):
          new_tensor = tf.constant(value)
          if is_asset:
            # Any newly frozen constant tensors containing filenames must be
            # added to the ASSET_FILENAMES collection.
            graph.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, new_tensor)
          tensor_replacement_map[orig_tensor_name] = new_tensor

        with tf.Session(graph=graph) as session:
          tensors_by_name = (
              saved_transform_io.fetch_tensor_values(
                  saved_model_dir, tensor_replacement_map, tensor_names))
          session.run(tf.global_variables_initializer())
          session.run(tf.tables_initializer())
          tensor_values_by_name = session.run(tensors_by_name)

      new_column_schemas = {}
      for key, column_schema in six.iteritems(metadata.schema.column_schemas):
        if key in column_schema_overrides:
          override = column_schema_overrides[key]
          min_value = tensor_values_by_name[override.min_value]
          max_value = tensor_values_by_name[override.max_value]
          assert column_schema.domain.dtype == tf.int64
          assert isinstance(column_schema.domain, dataset_schema.IntDomain)
          # Create a new column schema.  An override always results in a
          # categorical column.
          new_column_schemas[key] = dataset_schema.ColumnSchema(
              dataset_schema.IntDomain(tf.int64, min_value, max_value,
                                       is_categorical=True),
              column_schema.axes,
              column_schema.representation)
        else:
          new_column_schemas[key] = column_schema

      return dataset_metadata.DatasetMetadata(dataset_schema.Schema(
          new_column_schemas))

    return (
        self.pipeline
        | 'CreateMetadata' >> beam.Create([self._metadata])
        | 'ExtractScalarConstants' >> beam.Map(
            compute_deferred_metadata,
            column_schema_overrides=self._column_schema_overrides,
            saved_model_dir=self._saved_model_dir,
            tensor_value_mapping=tensor_value_mapping))
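
The transform above tags each singleton PCollection with its tensor name, flattens them, and exposes the result as an AsDict side input. A runnable sketch of that side-input pattern with illustrative statistics instead of tensor values:

import apache_beam as beam

with beam.Pipeline() as p:
    mean = p | 'mean' >> beam.Create([3.5])
    stddev = p | 'stddev' >> beam.Create([1.2])

    named = [
        mean | 'AddName[mean]' >> beam.Map(lambda x: ('mean', x)),
        stddev | 'AddName[stddev]' >> beam.Map(lambda x: ('stddev', x)),
    ]
    stats_view = beam.pvalue.AsDict(named | 'MergePairs' >> beam.Flatten())

    data = p | 'data' >> beam.Create([1.0, 2.0, 3.0])
    normalized = data | 'Normalize' >> beam.Map(
        lambda x, stats: (x - stats['mean']) / stats['stddev'], stats=stats_view)
    normalized | beam.Map(print)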
Example #13
0
def run(in_pcol, job_config):
    # load 5 seconds of audio and get STFT
    stft = (in_pcol
            | aio.GcsLoadBinary()
            | audio.LoadAudio(offset=10, duration=5)
            | audio.GetSTFT())
    # get magnitude of audio
    magnitude = (stft | "Get magnitude" >> beam.ParDo(
        transforms.GetMagnitude()).with_outputs())
    # map the result to a key (the KlioMessage element)
    # so we can group all results by key
    magnitude_key = (
        magnitude.spectrogram
        | "element to spec" >> beam.Map(transforms.create_key_from_element))
    # get nearest neighbors and map the result to a key (the KlioMessage element)
    nn_filter = (
        magnitude.spectrogram
        | "Get nn filter" >> beam.ParDo(transforms.FilterNearestNeighbors())
        | "element to filter" >> beam.Map(transforms.create_key_from_element))
    # map together the full magnitude with its filter by key  (the KlioMessage element)
    merge = ({
        "full": magnitude_key,
        "nnfilter": nn_filter
    }
             | "merge" >> beam.CoGroupByKey())
    # calc the difference between full magnitude and the filter
    net = merge | beam.Map(transforms.subtract_filter_from_full)
    # create a mask from the filter minus the difference of full & filter
    first_mask = ({
        "first": nn_filter,
        "second": net,
        "full": magnitude_key
    }
                  | "first mask group" >> beam.CoGroupByKey()
                  |
                  "first mask" >> beam.ParDo(transforms.GetSoftMask(margin=2)))
    # create another mask from the difference of full & filter minus the filter
    second_mask = (
        {
            "first": net,
            "second": nn_filter,
            "full": magnitude_key
        }
        | "second mask group" >> beam.CoGroupByKey()
        | "second mask" >> beam.ParDo(transforms.GetSoftMask(margin=10)))
    # plot the full magnitude spectrogram
    magnitude_out = (magnitude.spectrogram
                     | "full spec" >> audio.GetSpec()
                     | "plot full spec" >> audio.SpecToPlot(
                         title="Full Spectrogam for {element}", y_axis="log")
                     | "save full" >> aio.GcsUploadPlot(suffix="-full"))
    # plot the first mask (background) spectrogram
    background_out = (
        first_mask
        | "background spec" >> audio.GetSpec()
        | "plot background spec" >> audio.SpecToPlot(
            title="Background Spectrogam for {element}", y_axis="log")
        | "save background" >> aio.GcsUploadPlot(suffix="-background"))
    # plot the second mask (foreground) spectrogram
    foreground_out = (
        second_mask
        | "foreground spec" >> audio.GetSpec()
        | "plot forground spec" >> audio.SpecToPlot(
            title="Foreground Spectrogam for {element}", y_axis="log")
        | "save foreground" >> aio.GcsUploadPlot(suffix="-foreground"))
    # flatten all outputs into one PCollection, then remove duplicates
    out_pcol = ((magnitude_out, background_out, foreground_out)
                | "flatten output paths" >> beam.Flatten()
                | "remove dups" >> beam.Distinct())
    return out_pcol
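
The run() above relies on ParDo(...).with_outputs() so that magnitude.spectrogram can be addressed by tag. A runnable sketch of that tagged-output pattern with illustrative tags (not klio's):

import apache_beam as beam
from apache_beam import pvalue

class SplitEvenOdd(beam.DoFn):
    def process(self, element):
        # Route each element to a named output tag.
        tag = 'even' if element % 2 == 0 else 'odd'
        yield pvalue.TaggedOutput(tag, element)

with beam.Pipeline() as p:
    nums = p | beam.Create([1, 2, 3, 4, 5])
    split = nums | beam.ParDo(SplitEvenOdd()).with_outputs('even', 'odd')
    split.even | 'print even' >> beam.Map(lambda x: print('even', x))
    split.odd | 'print odd' >> beam.Map(lambda x: print('odd', x))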
Example #14
0
    def _RunBeamImpl(self, inputs, outputs, preprocessing_fn,
                     input_dataset_metadata, raw_examples_data_format,
                     transform_output_path, compute_statistics,
                     materialize_output_paths):
        """Perform data preprocessing with FlumeC++ runner.

    Args:
      inputs: A dictionary of labelled input values.
      outputs: A dictionary of labelled output values.
      preprocessing_fn: The tf.Transform preprocessing_fn.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      raw_examples_data_format: A string describing the raw data format.
      transform_output_path: An absolute path to write the output to.
      compute_statistics: A bool indicating whether or not to compute statistics.
      materialize_output_paths: Paths to materialized outputs.

    Raises:
      RuntimeError: If reset() is not invoked between two consecutive run() calls.
      ValueError: If the schema is empty.

    Returns:
      Status of the execution.
    """
        raw_examples_file_format = common.GetSoleValue(
            inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
        analyze_and_transform_data_paths = common.GetValues(
            inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
        transform_only_data_paths = common.GetValues(
            inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
        stats_use_tfdv = common.GetSoleValue(
            inputs, labels.TFT_STATISTICS_USE_TFDV_LABEL)
        per_set_stats_output_paths = common.GetValues(
            outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
        temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

        tf.logging.info('Analyze and transform data patterns: %s',
                        list(enumerate(analyze_and_transform_data_paths)))
        tf.logging.info('Transform data patterns: %s',
                        list(enumerate(transform_only_data_paths)))
        tf.logging.info('Transform materialization output paths: %s',
                        list(enumerate(materialize_output_paths)))
        tf.logging.info('Transform output path: %s', transform_output_path)

        feature_spec = input_dataset_metadata.schema.as_feature_spec()
        try:
            analyze_input_columns = tft.get_analyze_input_columns(
                preprocessing_fn, feature_spec)
            transform_input_columns = (tft.get_transform_input_columns(
                preprocessing_fn, feature_spec))
        except AttributeError:
            # If using TFT 1.12, fall back to assuming all features are used.
            analyze_input_columns = feature_spec.keys()
            transform_input_columns = feature_spec.keys()
        # Use the same dataset (same columns) for AnalyzeDataset and computing
        # pre-transform stats so that the data will only be read once for these
        # two operations.
        if compute_statistics:
            analyze_input_columns = list(
                set(
                    list(analyze_input_columns) +
                    list(transform_input_columns)))
        analyze_input_dataset_metadata = copy.deepcopy(input_dataset_metadata)
        transform_input_dataset_metadata = copy.deepcopy(
            input_dataset_metadata)
        if input_dataset_metadata.schema is not _RAW_EXAMPLE_SCHEMA:
            analyze_input_dataset_metadata.schema = dataset_schema.from_feature_spec(
                {
                    feature: feature_spec[feature]
                    for feature in analyze_input_columns
                })
            transform_input_dataset_metadata.schema = (
                dataset_schema.from_feature_spec({
                    feature: feature_spec[feature]
                    for feature in transform_input_columns
                }))

        can_process_jointly = not bool(per_set_stats_output_paths
                                       or materialize_output_paths)
        analyze_data_list = self._MakeDatasetList(
            analyze_and_transform_data_paths, raw_examples_file_format,
            raw_examples_data_format, analyze_input_dataset_metadata,
            can_process_jointly)
        transform_data_list = self._MakeDatasetList(
            list(analyze_and_transform_data_paths) +
            list(transform_only_data_paths), raw_examples_file_format,
            raw_examples_data_format, transform_input_dataset_metadata,
            can_process_jointly)

        desired_batch_size = self._GetDesiredBatchSize(
            raw_examples_data_format)

        with self._CreatePipeline(outputs) as p:
            with tft_beam.Context(
                    temp_dir=temp_path,
                    desired_batch_size=desired_batch_size,
                    passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
                    use_deep_copy_optimization=True):
                # pylint: disable=expression-not-assigned
                # pylint: disable=no-value-for-parameter

                analyze_decode_fn = (self._GetDecodeFunction(
                    raw_examples_data_format,
                    analyze_input_dataset_metadata.schema))

                for (idx, dataset) in enumerate(analyze_data_list):
                    dataset.encoded = (p
                                       | 'ReadAnalysisDataset[{}]'.format(idx)
                                       >> self._ReadExamples(dataset))
                    dataset.decoded = (
                        dataset.encoded
                        | 'DecodeAnalysisDataset[{}]'.format(idx) >>
                        self._DecodeInputs(analyze_decode_fn))

                input_analysis_data = (
                    [dataset.decoded for dataset in analyze_data_list]
                    | 'FlattenAnalysisDatasets' >> beam.Flatten())
                transform_fn = ((input_analysis_data, input_dataset_metadata)
                                | 'AnalyzeDataset' >>
                                tft_beam.AnalyzeDataset(preprocessing_fn))
                # Write the raw/input metadata.
                (input_dataset_metadata
                 | 'WriteMetadata' >> tft_beam.WriteMetadata(
                     os.path.join(transform_output_path,
                                  tft.TFTransformOutput.RAW_METADATA_DIR), p))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                (transform_fn | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_output_path))

                if compute_statistics or materialize_output_paths:
                    # Do not compute pre-transform stats if the input format is raw proto,
                    # as StatsGen would treat any input as tf.Example.
                    if (compute_statistics and not self._IsDataFormatProto(
                            raw_examples_data_format)):
                        # Aggregated feature stats before transformation.
                        pre_transform_feature_stats_path = os.path.join(
                            transform_output_path, tft.TFTransformOutput.
                            PRE_TRANSFORM_FEATURE_STATS_PATH)

                        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
                        # schema. Currently input dataset schema only contains dtypes,
                        # and other metadata is dropped due to roundtrip to tensors.
                        schema_proto = schema_utils.schema_from_feature_spec(
                            analyze_input_dataset_metadata.schema.
                            as_feature_spec())
                        ([
                            dataset.decoded
                            if stats_use_tfdv else dataset.encoded
                            for dataset in analyze_data_list
                        ]
                         | 'FlattenPreTransformAnalysisDatasets' >>
                         beam.Flatten()
                         | 'GenerateAggregatePreTransformAnalysisStats' >>
                         self._GenerateStats(pre_transform_feature_stats_path,
                                             schema_proto,
                                             use_deep_copy_optimization=True,
                                             use_tfdv=stats_use_tfdv))

                    transform_decode_fn = (self._GetDecodeFunction(
                        raw_examples_data_format,
                        transform_input_dataset_metadata.schema))
                    # transform_data_list is a superset of analyze_data_list; we pay the
                    # cost of reading the same dataset (analyze_data_list) again here to
                    # prevent certain Beam runners from doing large temp materializations.
                    for (idx, dataset) in enumerate(transform_data_list):
                        dataset.encoded = (
                            p
                            | 'ReadTransformDataset[{}]'.format(idx) >>
                            self._ReadExamples(dataset))
                        dataset.decoded = (
                            dataset.encoded
                            | 'DecodeTransformDataset[{}]'.format(idx) >>
                            self._DecodeInputs(transform_decode_fn))
                        (dataset.transformed, metadata) = (
                            ((dataset.decoded,
                              transform_input_dataset_metadata), transform_fn)
                            | 'TransformDataset[{}]'.format(idx) >>
                            tft_beam.TransformDataset())

                        if materialize_output_paths or not stats_use_tfdv:
                            dataset.transformed_and_encoded = (
                                dataset.transformed
                                | 'EncodeTransformedDataset[{}]'.format(idx) >>
                                beam.ParDo(self._EncodeAsExamples(), metadata))

                    if compute_statistics:
                        # Aggregated feature stats after transformation.
                        _, metadata = transform_fn
                        post_transform_feature_stats_path = os.path.join(
                            transform_output_path, tft.TFTransformOutput.
                            POST_TRANSFORM_FEATURE_STATS_PATH)

                        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
                        # schema. Currently input dataset schema only contains dtypes,
                        # and other metadata is dropped due to roundtrip to tensors.
                        transformed_schema_proto = schema_utils.schema_from_feature_spec(
                            metadata.schema.as_feature_spec())

                        ([(dataset.transformed if stats_use_tfdv else
                           dataset.transformed_and_encoded)
                          for dataset in transform_data_list]
                         | 'FlattenPostTransformAnalysisDatasets' >>
                         beam.Flatten()
                         | 'GenerateAggregatePostTransformAnalysisStats' >>
                         self._GenerateStats(post_transform_feature_stats_path,
                                             transformed_schema_proto,
                                             use_tfdv=stats_use_tfdv))

                        if per_set_stats_output_paths:
                            assert len(transform_data_list) == len(
                                per_set_stats_output_paths)
                            # TODO(b/67632871): Remove duplicate stats gen compute that is
                            # done both on a flattened view of the data, and on each span
                            # below.
                            bundles = zip(transform_data_list,
                                          per_set_stats_output_paths)
                            for (idx, (dataset,
                                       output_path)) in enumerate(bundles):
                                if stats_use_tfdv:
                                    data = dataset.transformed
                                else:
                                    data = dataset.transformed_and_encoded
                                (data
                                 | 'GeneratePostTransformStats[{}]'.format(idx)
                                 >> self._GenerateStats(
                                     output_path,
                                     transformed_schema_proto,
                                     use_tfdv=stats_use_tfdv))

                    if materialize_output_paths:
                        assert len(transform_data_list) == len(
                            materialize_output_paths)
                        bundles = zip(transform_data_list,
                                      materialize_output_paths)
                        for (idx, (dataset,
                                   output_path)) in enumerate(bundles):
                            (dataset.transformed_and_encoded
                             | 'Materialize[{}]'.format(idx) >>
                             self._WriteExamples(raw_examples_file_format,
                                                 output_path))

        return _Status.OK()
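
The executor reads each dataset span with its own labelled step and flattens them before a single analysis stage. A toy, runnable sketch of that read-then-flatten shape, with beam.Create standing in for the real readers:

import apache_beam as beam

datasets = [['a', 'b', 'c'], ['d', 'e']]  # stand-ins for the per-span datasets

with beam.Pipeline() as p:
    decoded = [
        p | 'ReadAnalysisDataset[{}]'.format(idx) >> beam.Create(data)
        for idx, data in enumerate(datasets)
    ]
    analysis_input = decoded | 'FlattenAnalysisDatasets' >> beam.Flatten()
    (analysis_input
     | 'CountElements' >> beam.combiners.Count.Globally()
     | beam.Map(print))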
Example #15
0
    def expand(self, pcoll):
        """Computes top-k most frequent values and number of uniques."""
        # Convert input example to tuples of form
        # (slice_key, feature_name, feature_value_list, optional weight)
        # corresponding to each example.
        feature_values_with_weights = (
            pcoll
            | 'TopKUniques_ConvertInputToFeatureValuesWithWeights' >>
            beam.FlatMap(_convert_input_to_feature_values_with_weights,
                         categorical_features=self._categorical_features,
                         weight_feature=self._weight_feature))

        # Lambda to convert from ((slice_key, feature_name, feature_value), count)
        # to ((slice_key, feature_name), (feature_value, count))
        modify_key = (lambda x:
                      ((x[0][0], x[0][1]), FeatureValueCount(x[0][2], x[1])))

        # Key to order values.
        key_fn = lambda x: (x.count, x.feature_value)

        sliced_feature_name_value_count = (
            feature_values_with_weights
            # Flatten (slice_key, feature_name, feature_value_list, optional weight)
            # to (slice_key, feature_name, feature_value)
            | 'TopKUniques_FlattenToSlicedFeatureNameValueTuples' >>
            beam.FlatMap(_flatten_value_list)
            # Compute the frequency of each feature_value per slice. Output is a
            # PCollection of ((slice_key, feature_name, feature_value), count)
            | 'TopKUniques_CountSlicedFeatureNameValueTuple' >>
            beam.combiners.Count.PerElement()
            # Convert from ((slice_key, feature_name, feature_value), count) to
            # ((slice_key, feature_name), (feature_value, count))
            |
            'TopKUniques_ModifyKeyToSlicedFeatureName' >> beam.Map(modify_key))

        result_protos = []
        # Find topk values for each feature.
        topk = (
            sliced_feature_name_value_count
            # Obtain the top-k most frequent feature value for each feature in a
            # slice.
            | 'TopK_GetTopK' >> beam.combiners.Top.PerKey(max(
                self._num_top_values, self._num_rank_histogram_buckets),
                                                          key=key_fn)
            | 'TopK_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                categorical_features=self._categorical_features,
                is_weighted_stats=False,
                num_top_values=self._num_top_values,
                frequency_threshold=self._frequency_threshold,
                num_rank_histogram_buckets=self._num_rank_histogram_buckets))

        result_protos.append(topk)

        # If a weight feature is provided, find the weighted topk values for each
        # feature.
        if self._weight_feature is not None:
            weighted_topk = (
                # Flatten (slice_key, feature_name, feature_value_list, weight) to
                # ((slice_key, feature_name, feature_value), weight)
                feature_values_with_weights
                | 'TopKWeighted_FlattenToSlicedFeatureNameValueTuples' >>
                beam.FlatMap(_flatten_weighted_value_list)
                # Sum the weights of each feature_value per slice. Output is a
                # PCollection of
                # ((slice_key, feature_name, feature_value), weighted_count)
                | 'TopKWeighted_CountSlicedFeatureNameValueTuple' >>
                beam.CombinePerKey(sum)
                # Convert from
                # ((slice_key, feature_name, feature_value), weighted_count) to
                # ((slice_key, feature_name), (feature_value, weighted_count))
                | 'TopKWeighted_ModifyKeyToSlicedFeatureName' >>
                beam.Map(modify_key)
                # Obtain the top-k most frequent feature value for each feature in a
                # slice.
                | 'TopKWeighted_GetTopK' >> beam.combiners.Top.PerKey(
                    max(self._num_top_values,
                        self._num_rank_histogram_buckets),
                    key=key_fn)
                | 'TopKWeighted_ConvertToSingleFeatureStats' >> beam.Map(
                    _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=True,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._weighted_frequency_threshold,
                    num_rank_histogram_buckets=self._num_rank_histogram_buckets
                ))
            result_protos.append(weighted_topk)

        uniques = (
            sliced_feature_name_value_count
            # Drop the values to only have the slice_key and feature_name with
            # each repeated the number of unique values times.
            | 'Uniques_DropValues' >> beam.Keys()
            | 'Uniques_CountPerFeatureName' >>
            beam.combiners.Count.PerElement()
            | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_uniques_for_single_feature,
                categorical_features=self._categorical_features))
        result_protos.append(uniques)

        def _deserialize_sliced_feature_stats_proto(entry):
            feature_stats_proto = statistics_pb2.DatasetFeatureStatistics()
            feature_stats_proto.ParseFromString(entry[1])
            return entry[0], feature_stats_proto

        return (
            result_protos
            | 'FlattenTopKUniquesResults' >> beam.Flatten()
            # TODO(b/121152126): This deserialization stage is a workaround.
            # Remove this once it is no longer needed.
            | 'DeserializeTopKUniquesFeatureStatsProto' >>
            beam.Map(_deserialize_sliced_feature_stats_proto))
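
The top-k computation above hinges on beam.combiners.Top.PerKey with a custom ordering key. A runnable sketch of that combiner on toy (slice, feature) counts:

import apache_beam as beam

with beam.Pipeline() as p:
    counts = p | beam.Create([
        (('slice1', 'color'), ('red', 5)),
        (('slice1', 'color'), ('blue', 3)),
        (('slice1', 'color'), ('green', 8)),
    ])
    # Keep the 2 most frequent values per key, ordered by (count, value).
    top2 = counts | 'GetTopK' >> beam.combiners.Top.PerKey(
        2, key=lambda value_count: (value_count[1], value_count[0]))
    top2 | beam.Map(print)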
Example #16
0
    def run(self):
        """Returns a PCollection of audit errors aggregated from all models.

        Returns:
            PCollection. A PCollection of audit errors discovered during the
            audit.

        Raises:
            ValueError. When the `model_getter` option, which should be the type
                of PTransform we will use to fetch models from the datastore, is
                None.
        """
        if self.job_options.model_getter is None:
            raise ValueError('JobOptions.model_getter must not be None')

        existing_models, deleted_models = (
            self.pipeline
            | 'Get all models' >> self.job_options.model_getter()
            | 'Partition by model.deleted' >>
            (beam.Partition(lambda model, _: int(model.deleted), 2)))

        models_of_kind_by_index = (
            existing_models
            # NOTE: Partition returns a statically-sized list of PCollections.
            # Creating partitions is wasteful when there are fewer items than
            # there are partitions, like in our unit tests. In exchange, in
            # production the job will be able to take advantage of the high
            # parallelizability of PCollections, which are designed for enormous
            # datasets and parallel processing.
            #
            # Alternatively, we could have used GroupBy. However, that returns
            # an _iterable_ of items rather than a PCollection, and so it is
            # vulnerable to out-of-memory errors.
            #
            # Since this job is concerned with running audits on EVERY MODEL IN
            # STORAGE, Partition is the clear winner regardless of the overhead
            # we'll see in unit tests.
            |
            'Split models into parallelizable PCollections' >> beam.Partition(
                lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
                # NOTE: Partition requires a hard-coded number of slices; it
                # cannot be used with dynamic numbers generated in a pipeline.
                # KIND_BY_INDEX is a constant tuple so that requirement is
                # satisfied in this case.
                len(KIND_BY_INDEX),
                KIND_BY_INDEX))

        existing_key_count_pcolls = []
        missing_key_error_pcolls = []
        audit_error_pcolls = [
            deleted_models
            | 'Apply ValidateDeletedModel on deleted models' >>
            (beam.ParDo(base_model_audits.ValidateDeletedModel()))
        ]

        model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
        for kind, models_of_kind in model_groups:
            audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

            if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES:
                existing_key_count_pcolls.append(
                    models_of_kind | GetExistingModelKeyCounts(kind))

            if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR:
                missing_key_error_pcolls.extend(
                    models_of_kind | GetMissingModelKeyErrors(kind))

        existing_key_counts = (
            existing_key_count_pcolls
            | 'Flatten PCollections of existing key counts' >> beam.Flatten())
        missing_key_errors = (
            missing_key_error_pcolls
            | 'Flatten PCollections of missing key errors' >> beam.Flatten())
        audit_error_pcolls.append(
            (existing_key_counts, missing_key_errors)
            | 'Group counts and errors by key' >> beam.CoGroupByKey()
            | 'Filter keys without any errors' >>
            (beam.FlatMapTuple(self._get_model_relationship_errors)))

        return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
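
The audit job splits every stored model into per-kind PCollections with beam.Partition. A runnable sketch of that splitting pattern with illustrative kinds and data:

import apache_beam as beam

KINDS = ('UserModel', 'ExplorationModel', 'FeedbackModel')  # illustrative kinds

def by_kind(model, unused_num_partitions, kinds):
    # Partition functions receive (element, num_partitions, *extra_args).
    kind, _ = model
    return kinds.index(kind)

with beam.Pipeline() as p:
    models = p | beam.Create([('UserModel', 1), ('FeedbackModel', 2),
                              ('UserModel', 3)])
    partitions = models | beam.Partition(by_kind, len(KINDS), KINDS)
    for kind, pcoll in zip(KINDS, partitions):
        pcoll | 'print %s' % kind >> beam.Map(lambda m, k=kind: print(k, m))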
Example #17
0
    def expand(self, pcoll):
        p = pcoll.pipeline
        try:
            step_name = self.label
        except AttributeError:
            step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
            BigQueryBatchFileLoads.COUNT += 1

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location
        job_name = (p.options.view_as(GoogleCloudOptions).job_name
                    or 'AUTOMATIC_JOB_NAME')

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "LoadJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

        schema_mod_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            |
            "SchemaModJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD,
                'SCHEMA_MOD_STEP')))

        copy_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "CopyJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        if not self.with_auto_sharding:
            all_destination_file_pairs_pc = self._write_files(
                destination_data_kv_pc, file_prefix_pcv)
        else:
            all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
                destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, elements with either a single partition
        # or multiple partitions are loaded into BigQuery using temporary tables to
        # ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                                schema_mod_job_name_pcv, copy_job_name_pcv, p,
                                step_name))
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(multiple_partitions_per_destination_pc,
                                single_partition_per_destination_pc,
                                load_job_name_pcv, schema_mod_job_name_pcv,
                                copy_job_name_pcv, p, step_name))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
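
The job-name views above are built by mapping over a single-element impulse PCollection and wrapping the result in AsSingleton. A runnable sketch of that broadcast pattern, with an illustrative job-name generator:

import uuid
import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
    singleton_pc = p | 'ImpulseSingleElementPC' >> beam.Create([None])
    job_name_pcv = pvalue.AsSingleton(
        singleton_pc
        | 'LoadJobNamePrefix' >> beam.Map(
            lambda _: 'beam_load_%s' % uuid.uuid4().hex[:8]))

    rows = p | beam.Create(['row-a', 'row-b'])
    (rows
     | 'AttachJobName' >> beam.Map(lambda row, job_name: (job_name, row),
                                   job_name=job_name_pcv)
     | beam.Map(print))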
Example #18
0
    def test_flatten(self):
        with self.create_pipeline() as p:
            res = (p | 'a' >> beam.Create(['a']),
                   p | 'bc' >> beam.Create(['b', 'c']),
                   p | 'd' >> beam.Create(['d'])) | beam.Flatten()
            assert_that(res, equal_to(['a', 'b', 'c', 'd']))
Example #19
0
    def test_multiple_destinations_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)

        full_output_table_1 = '%s:%s' % (self.project, output_table_1)
        full_output_table_2 = '%s:%s' % (self.project, output_table_2)

        schema1 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }
        schema2 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'foundation',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        bad_record = {'language': 1, 'manguage': 2}

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_2,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(_ELEMENTS)

            schema_table_pcv = beam.pvalue.AsDict(
                p
                | "MakeSchemas" >> beam.Create([(full_output_table_1, schema1),
                                                (full_output_table_2,
                                                 schema2)]))

            table_record_pcv = beam.pvalue.AsDict(
                p
                | "MakeTables" >> beam.Create([('table1', full_output_table_1),
                                               ('table2',
                                                full_output_table_2)]))

            input2 = p | "Broken record" >> beam.Create([bad_record])

            input = (input, input2) | beam.Flatten()

            r = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=lambda x, tables:
                     (tables['table1']
                      if 'language' in x else tables['table2']),
                     table_side_inputs=(table_record_pcv, ),
                     schema=lambda dest, table_map: table_map.get(dest, None),
                     schema_side_inputs=(schema_table_pcv, ),
                     method='STREAMING_INSERTS'))

            assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                        equal_to([(full_output_table_1, bad_record)]))
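
The WriteToBigQuery call above routes each row with a table callable plus an AsDict side input. A runnable sketch of just that routing decision, without the BigQuery sink (table names here are placeholders):

import apache_beam as beam

with beam.Pipeline() as p:
    table_view = beam.pvalue.AsDict(
        p | 'MakeTables' >> beam.Create([('table1', 'proj:ds.languages'),
                                         ('table2', 'proj:ds.foundations')]))
    rows = p | beam.Create([{'name': 'beam', 'language': 'py'},
                            {'name': 'flink', 'foundation': 'asf'}])
    routed = rows | 'PickDestination' >> beam.Map(
        lambda row, tables: ((tables['table1'] if 'language' in row
                              else tables['table2']), row),
        tables=table_view)
    routed | beam.Map(print)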
Example #20
0
    def expand(self, pcoll):
        p = pcoll.pipeline

        self._custom_gcs_temp_location = (
            self._custom_gcs_temp_location
            or p.options.view_as(GoogleCloudOptions).temp_location)

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create(
                [self._custom_gcs_temp_location])
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate)))

        outputs = (
            pcoll
            |
            "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination))
            | beam.ParDo(WriteRecordsToFile(
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                coder=self.coder),
                         file_prefix=file_prefix_pcv).with_outputs(
                             WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                             WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination-shard is written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load jobs are triggered into temporary tables, and those are later copied
        # to the actual destination tables. This ensures atomicity when some of the
        # load jobs fail but others succeed: if any load job fails, the copy jobs
        # are not triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(schema=self.schema,
                            write_disposition=self.write_disposition,
                            create_disposition=self.create_disposition,
                            test_client=self.test_client,
                            temporary_tables=self.temp_tables),
            load_job_name_pcv).with_outputs(TriggerLoadJobs.TEMP_TABLES,
                                            main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
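
The RemoveTempTables steps above deduplicate table names by pairing each with a dummy value and grouping. A runnable sketch of that dedupe-by-GroupByKey idiom on toy table names:

import apache_beam as beam

with beam.Pipeline() as p:
    temp_tables = p | beam.Create(['ds.tmp_a', 'ds.tmp_b', 'ds.tmp_a'])
    unique_tables = (
        temp_tables
        | 'AddUselessValue' >> beam.Map(lambda t: (t, None))
        | 'DeduplicateTables' >> beam.GroupByKey()
        | 'GetTableNames' >> beam.Map(lambda kv: kv[0]))
    unique_tables | beam.Map(print)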
Example #21
0
def run(argv=None):
    """Main entry point; defines and runs the tfidf pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_baseline',
                        required=False,
                        help='baseline URIs to process.')
    parser.add_argument('--input_updates',
                        required=False,
                        help='updates URIs to process.')
    parser.add_argument('--input_enriched',
                        required=False,
                        help='enriched JSON URIs to process.')
    parser.add_argument('--output',
                        required=False,
                        help='Output file to write results to.')
    parser.add_argument('--output_enriched',
                        required=False,
                        help='Output file to write results to.')
    parser.add_argument('--output_splitted',
                        required=False,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # bq_table_schema = parse_bq_json_schema(json.load(open('schemas/medline.papers.json')))
    bq_table_schema = parse_bq_json_schema(json.loads(BQ_SCHEMA))
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        if known_args.input_baseline or known_args.input_updates:

            if known_args.input_baseline and known_args.input_updates:
                medline_articles_base = p | 'BaselineEmitXML' >> ReadMedlineFiles(
                    known_args.input_baseline)
                medline_articles_updates = p | 'UpdatesEmitXML' >> ReadMedlineFiles(
                    known_args.input_updates)

                medline_articles = (
                    (medline_articles_base, medline_articles_updates)
                    | beam.Flatten())
            elif known_args.input_baseline:
                medline_articles = p | 'BaselineEmitXML' >> ReadMedlineFiles(
                    known_args.input_baseline)
            elif known_args.input_updates:
                medline_articles = p | 'UpdatesEmitXML' >> ReadMedlineFiles(
                    known_args.input_updates)
            else:
                raise AttributeError('at least an XML input is required')

            parsed_medline_articles = medline_articles | 'ParseXMLtoDict' >> beam.ParDo(
                MedlineXMLParser())

            medline_articles_grouped_by_id = parsed_medline_articles | 'GroupByPMID' >> beam.GroupByKey(
            )

            unique_medline_articles = medline_articles_grouped_by_id | 'SortByFilename' >> beam.ParDo(
                GetLatestVersion())

            enriched_articles = unique_medline_articles | 'NLPAnalysis' >> beam.ParDo(
                NLPAnalysis())

            json_enriched_medline_articles = enriched_articles | 'EnrichedMedlineToJSON' >> beam.ParDo(
                ToJSON())

            json_enriched_medline_articles | 'WriteEnrichedJSONToGS' >> WriteToText(
                known_args.output_enriched,
                file_name_suffix='_enriched.json.gz')

        elif known_args.input_enriched:

            json_enriched_medline_articles = p | 'GetEnrichedArticles' >> ReadFromText(
                known_args.input_enriched)

        else:
            raise AttributeError('missing enriched JSON data input')

        if known_args.output_splitted:

            concepts = json_enriched_medline_articles | 'ArticleToConcepts' >> beam.ParDo(
                ExtractConcepts())
            concepts | 'WriteConceptJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_concepts.json.gz')

            bioentities = json_enriched_medline_articles | 'ArticleToBioentities' >> beam.ParDo(
                ExtractBioentities())
            bioentities | 'WriteBioentityJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_bioentities.json.gz')

            taggedtext = json_enriched_medline_articles | 'ArticleToTaggedText' >> beam.ParDo(
                ExtractTaggedText())
            taggedtext | 'WriteTaggedTextJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_taggedtext.json.gz')

            smallarticles = json_enriched_medline_articles | 'ArticleToSmallArticles' >> beam.ParDo(
                CleanPublication())
            smallarticles | 'WriteSmallArticleJSONToGS' >> WriteToText(
                known_args.output_splitted, file_name_suffix='_small.json.gz')
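
The baseline and update sources above are merged with Flatten, grouped by PMID, and reduced to the latest version per article. A toy, runnable sketch of that merge-and-keep-latest shape (the version-picking logic here is illustrative, not GetLatestVersion):

import apache_beam as beam

with beam.Pipeline() as p:
    baseline = p | 'baseline' >> beam.Create([('pmid1', ('v1', 'old text'))])
    updates = p | 'updates' >> beam.Create([('pmid1', ('v2', 'new text')),
                                            ('pmid2', ('v1', 'text'))])
    latest = ((baseline, updates)
              | beam.Flatten()
              | 'GroupByPMID' >> beam.GroupByKey()
              | 'KeepLatest' >> beam.Map(lambda kv: (kv[0], max(kv[1]))))
    latest | beam.Map(print)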
Example #22
0
            def expand(self, pcolls):

                scalar_inputs = [
                    expr for expr in self.stage.inputs if is_scalar(expr)
                ]
                tabular_inputs = [
                    expr for expr in self.stage.inputs if not is_scalar(expr)
                ]

                if len(tabular_inputs) == 0:
                    partitioned_pcoll = next(
                        iter(pcolls.values())).pipeline | beam.Create([{}])

                elif self.stage.partitioning != partitionings.Nothing():
                    # Partitioning required for these operations.
                    # Compute the number of partitions to use for the inputs based on
                    # the estimated size of the inputs.
                    if self.stage.partitioning == partitionings.Singleton():
                        # Always a single partition, don't waste time computing sizes.
                        num_partitions = 1
                    else:
                        # Estimate the sizes from the outputs of a *previous* stage such
                        # that using these estimates will not cause a fusion break.
                        input_sizes = [
                            estimate_size(input, same_stage_ok=False)
                            for input in tabular_inputs
                        ]
                        if None in input_sizes:
                            # We were unable to (cheaply) compute the size of one or more
                            # inputs.
                            num_partitions = DEFAULT_PARTITIONS
                        else:
                            num_partitions = beam.pvalue.AsSingleton(
                                input_sizes
                                | 'FlattenSizes' >> beam.Flatten()
                                | 'SumSizes' >> beam.CombineGlobally(sum)
                                | 'NumPartitions' >> beam.Map(lambda size: max(
                                    MIN_PARTITIONS,
                                    min(MAX_PARTITIONS, size //
                                        TARGET_PARTITION_SIZE))))

                    partition_fn = self.stage.partitioning.partition_fn

                    class Partition(beam.PTransform):
                        def expand(self, pcoll):
                            return (
                                pcoll
                                # Attempt to create batches of reasonable size.
                                | beam.ParDo(_PreBatch())
                                # Actually partition.
                                | beam.FlatMap(partition_fn, num_partitions)
                                # Don't bother shuffling empty partitions.
                                | beam.Filter(lambda k_df: len(k_df[1])))

                    # Arrange such that partitioned_pcoll is properly partitioned.
                    main_pcolls = {
                        expr._id: pcolls[expr._id] | 'Partition_%s_%s' %
                        (self.stage.partitioning, expr._id) >> Partition()
                        for expr in tabular_inputs
                    } | beam.CoGroupByKey()
                    partitioned_pcoll = main_pcolls | beam.ParDo(_ReBatch())

                else:
                    # Already partitioned, or no partitioning needed.
                    assert len(tabular_inputs) == 1
                    tag = tabular_inputs[0]._id
                    partitioned_pcoll = pcolls[tag] | beam.Map(
                        lambda df: {tag: df})

                side_pcolls = {
                    expr._id: beam.pvalue.AsSingleton(pcolls[expr._id])
                    for expr in scalar_inputs
                }

                # Actually evaluate the expressions.
                def evaluate(partition, stage=self.stage, **side_inputs):
                    def lookup(expr):
                        # Use proxy if there's no data in this partition
                        return expr.proxy().iloc[:0] if partition[
                            expr._id] is None else partition[expr._id]

                    session = expressions.Session(
                        dict([(expr, lookup(expr))
                              for expr in tabular_inputs] +
                             [(expr, side_inputs[expr._id])
                              for expr in scalar_inputs]))
                    for expr in stage.outputs:
                        yield beam.pvalue.TaggedOutput(
                            expr._id, expr.evaluate_at(session))

                return partitioned_pcoll | beam.FlatMap(
                    evaluate, **side_pcolls).with_outputs()
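
The partition count above is computed inside the pipeline and consumed as an AsSingleton side input. A runnable sketch of that compute-then-broadcast pattern with toy sizes (TARGET_PARTITION_SIZE below is an illustrative constant, not the dataframe module's):

import apache_beam as beam

TARGET_PARTITION_SIZE = 10  # illustrative constant

with beam.Pipeline() as p:
    input_sizes = p | 'sizes' >> beam.Create([37, 12, 4])
    num_partitions = beam.pvalue.AsSingleton(
        input_sizes
        | 'SumSizes' >> beam.CombineGlobally(sum)
        | 'NumPartitions' >> beam.Map(
            lambda size: max(1, size // TARGET_PARTITION_SIZE)))

    elements = p | 'data' >> beam.Create(range(10))
    keyed = elements | 'KeyByPartition' >> beam.Map(
        lambda x, n: (x % n, x), n=num_partitions)
    keyed | beam.Map(print)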
Example #23
0
def pipeline(config_map, dataset_config_map, preprocess_example_fn,
             input_tensors_to_example_fn):
    """Pipeline for dataset creation."""
    tf.flags.mark_flags_as_required(['output_directory'])

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options.split(','))

    config = config_map[FLAGS.config]
    hparams = config.hparams
    hparams.parse(FLAGS.hparams)

    datasets = dataset_config_map[FLAGS.dataset_config]

    if tf.gfile.Exists(FLAGS.output_directory):
        raise ValueError('Output directory %s already exists!' %
                         FLAGS.output_directory)
    tf.gfile.MakeDirs(FLAGS.output_directory)
    with tf.gfile.Open(os.path.join(FLAGS.output_directory, 'config.txt'),
                       'w') as f:
        f.write('\n\n'.join([
            'min_length: {}'.format(FLAGS.min_length),
            'max_length: {}'.format(FLAGS.max_length),
            'sample_rate: {}'.format(FLAGS.sample_rate),
            'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
            'preprocess_train_example_multiplier: {}'.format(
                FLAGS.preprocess_train_example_multiplier),
            'config: {}'.format(FLAGS.config),
            'hparams: {}'.format(hparams.to_json(sort_keys=True)),
            'dataset_config: {}'.format(FLAGS.dataset_config),
            'datasets: {}'.format(datasets),
        ]))

    with beam.Pipeline(options=pipeline_options) as p:
        for dataset in datasets:
            if isinstance(dataset.path, (list, tuple)):
                # If dataset.path is a list, then it's a list of sources to mix together
                # to form new examples. First, do the mixing, then pass the results to
                # the rest of the pipeline.
                id_exs = []
                sourceid_to_exids = []
                for source_id, stem_path in enumerate(dataset.path):
                    if dataset.num_mixes is None:
                        raise ValueError(
                            'If path is a list, num_mixes must not be None: {}'
                            .format(dataset))
                    stem_p = p | 'tfrecord_list_%s_%d' % (
                        dataset.name, source_id) >> (beam.Create(
                            data.generate_sharded_filenames(stem_path)))

                    # Note that we do not specify a coder when reading here.
                    # This is so that the hashing in key_example below can work directly
                    # on the serialized version instead of having to re-serialize it.
                    # Also, deserializing with a coder and then re-serializing does not
                    # always generate the same hash for the same example (likely due to
                    # the map fields in tf.train.Example). This is important when reading
                    # the same dataset multiple times to mix it with itself.
                    stem_p |= 'read_tfrecord_%s_%d' % (
                        dataset.name, source_id) >> (
                            beam.io.tfrecordio.ReadAllFromTFRecord())
                    stem_p |= 'shuffle_stems_%s_%d' % (
                        dataset.name, source_id) >> (beam.Reshuffle())

                    # Key all examples with a hash.
                    def key_example(ex):
                        return (hashlib.sha256(ex).hexdigest(), ex)

                    stem_p |= 'add_id_key_%s_%d' % (
                        dataset.name, source_id) >> (beam.Map(key_example))
                    id_exs.append(stem_p)

                    # Create a list of source_id to example id.
                    def sourceid_to_exid(id_ex, source_id):
                        return (source_id, id_ex[0])

                    sourceid_to_exids.append(
                        stem_p | 'key_%s_%d' % (dataset.name, source_id) >>
                        (beam.Map(sourceid_to_exid, source_id=source_id)))

                # ('example_hash', serialized_example)
                id_exs = (
                    id_exs
                    | 'id_exs_flatten_%s' % dataset.name >> beam.Flatten()
                    | 'id_exs_distinct_%s' % dataset.name >> beam.Distinct())

                # ('source_id', 'example_hash')
                sourceid_to_exids = (sourceid_to_exids
                                     | 'sourceid_to_exids_flatten_%s' %
                                     dataset.name >> beam.Flatten())

                # Pass the list of (source_id, example_id) pairs to generate_mixes,
                # which will create mixes by selecting random IDs from each source
                # (with replacement). The result maps each example ID to the mix IDs
                # it belongs to.
                # Note: beam.Create([0]) is just a single dummy value to allow the
                # sourceid_to_exids to be passed in as a python list so we can do the
                # sampling with numpy.
                exid_to_mixids = (
                    p
                    | 'create_dummy_%s' % dataset.name >> beam.Create([0])
                    | 'generate_mixes_%s' % dataset.name >> beam.Map(
                        create_dataset_lib.generate_mixes,
                        num_mixes=dataset.num_mixes,
                        sourceid_to_exids=beam.pvalue.AsList(
                            sourceid_to_exids)))

                # Create a list of (Mix ID, Full Example proto). Note: Examples may be
                # present in more than one mix. Then, group by Mix ID.
                def mixid_to_exs(id_ex, exid_to_mixids):
                    exid, ex = id_ex
                    for mixid in exid_to_mixids[exid]:
                        yield mixid, ex

                mixid_exs = (
                    id_exs
                    | 'mixid_to_exs_%s' % dataset.name >> beam.FlatMap(
                        mixid_to_exs,
                        exid_to_mixids=beam.pvalue.AsSingleton(exid_to_mixids))
                    | 'group_by_key_%s' % dataset.name >> beam.GroupByKey())
                # Take these groups of Examples, mix their audio and sequences to return
                # a single new Example. Then, carry on with the rest of the pipeline
                # like normal.
                split_p = (mixid_exs
                           | 'mix_examples_%s' % dataset.name >> beam.Map(
                               mix_examples, FLAGS.sample_rate,
                               FLAGS.load_audio_with_librosa))
            else:
                if dataset.num_mixes is not None:
                    raise ValueError(
                        'If path is not a list, num_mixes must be None: {}'.
                        format(dataset))
                split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
                    data.generate_sharded_filenames(dataset.path))
                split_p |= 'read_tfrecord_%s' % dataset.name >> (
                    beam.io.tfrecordio.ReadAllFromTFRecord(
                        coder=beam.coders.ProtoCoder(tf.train.Example)))
            split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
                split_wav,
                min_length=FLAGS.min_length,
                max_length=FLAGS.max_length,
                sample_rate=FLAGS.sample_rate,
                debug_output_directory=FLAGS.output_directory,
                split_example=dataset.process_for_training,
                load_audio_with_librosa=FLAGS.load_audio_with_librosa)
            if FLAGS.preprocess_examples:
                if dataset.process_for_training:
                    mul_name = 'preprocess_multiply_%dx_%s' % (
                        FLAGS.preprocess_train_example_multiplier,
                        dataset.name)
                    split_p |= mul_name >> beam.FlatMap(
                        multiply_example,
                        FLAGS.preprocess_train_example_multiplier)
                split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
                    preprocess_data, preprocess_example_fn,
                    input_tensors_to_example_fn, hparams,
                    dataset.process_for_training)
            split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
                os.path.join(FLAGS.output_directory,
                             '%s.tfrecord' % dataset.name),
                coder=beam.coders.ProtoCoder(tf.train.Example))
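
A recurring idiom in the mixing branch above is keying serialized records by a content hash and deduplicating them with beam.Distinct before they are reused across mixes. A small, hedged sketch of just that idiom, with made-up data:

import hashlib

import apache_beam as beam

def key_by_hash(serialized):
    # Key each serialized record by its SHA-256 digest.
    return (hashlib.sha256(serialized).hexdigest(), serialized)

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([b'record-a', b'record-b', b'record-a'])
        | 'KeyByHash' >> beam.Map(key_by_hash)
        | 'DropDuplicateKeys' >> beam.Distinct()
        | beam.Map(print))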
Example #24
0
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()

    parser.add_argument('--pid', dest='pid', help='project id')

    parser.add_argument('--mbucket', dest='mbucket', help='model bucket name')

    known_args, pipeline_args = parser.parse_known_args(
        argv)  # set the arguments

    pipeline_options = PipelineOptions(  # set the pipeline options
        flags=pipeline_args,
        project='data-engeneering-289509',
        temp_location='gs://data_engineering2020/tmp/',
        region='europe-west4')

    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(
            options=pipeline_options
    ) as p:  # All the pipeline steps are described in detail in the report.
        # Get all the tweets for Trump.
        trump_tweets = (
            p
            | 'Query Trump tweets' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query=
                    'SELECT `tweet` FROM `data-engeneering-289509.tweetdata.tweets_trump`',
                    use_standard_sql=True))
            | 'ExtractTweetsTrump' >> beam.Map(lambda elem: elem['tweet']))

        # Determine the sentiment of the Trump tweets.
        trump_sentiment = (p
                           | 'GetPID Trump' >> beam.Create([known_args.pid])
                           | 'Analyse sentiment Trump' >> beam.FlatMap(
                               sentimentAnalysis,
                               bucket_name=known_args.mbucket,
                               name="Trump",
                               tweets=beam.pvalue.AsList(trump_tweets)))

        # Get all the tweets for Biden.
        biden_tweets = (
            p
            | 'Query Biden tweets' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query=
                    'SELECT `tweet`  FROM `data-engeneering-289509.tweetdata.tweets_biden`',
                    use_standard_sql=True))
            | 'ExtractTweetsBiden' >> beam.Map(lambda elem: elem['tweet']))
        # Determine the sentiment of the Biden tweets.
        biden_sentiment = (
            p
            | 'GetProjectID Biden' >> beam.Create([known_args.pid])
            | 'Analyse sentiment Biden' >> beam.FlatMap(
                sentimentAnalysis,
                bucket_name=known_args.mbucket,
                name="Biden",
                tweets=beam.pvalue.AsList(biden_tweets)))

        # Combine the two sentiment results.
        composed_result = ((trump_sentiment, biden_sentiment)
                           | 'Merge sentiments' >> beam.Flatten())

        (p
         | 'GetBucketName' >> beam.Create([known_args.mbucket])
         | 'Create Visualization' >> beam.FlatMap(
             survey, pre_results=beam.pvalue.AsList(
                 composed_result))  # create the visualisation
         )
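
Both sentiment branches above use the same trick: drive a FlatMap from a single dummy element and hand it the full tweet PCollection as a beam.pvalue.AsList side input, so the analysis sees every tweet at once as a Python list. A minimal sketch of the trick under assumed names (sentimentAnalysis itself is not reproduced here):

import apache_beam as beam

def summarize(_, tweets):
    # 'tweets' arrives as a plain Python list thanks to AsList.
    yield {'tweet_count': len(tweets)}

with beam.Pipeline() as p:
    tweets = p | 'MakeTweets' >> beam.Create(['good day', 'bad day', 'fine day'])
    _ = (
        p
        | 'Dummy' >> beam.Create([None])
        | beam.FlatMap(summarize, tweets=beam.pvalue.AsList(tweets))
        | beam.Map(print))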
Example #25
0
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, elements with both single as well as
        # multiple partitions are loaded into BigQuery using temporary tables to
        # ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = self.\
              _load_data(all_partitions, empty_pc, load_job_name_pcv,
                         singleton_pc)
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = self.\
              _load_data(multiple_partitions_per_destination_pc,
                         single_partition_per_destination_pc,
                         load_job_name_pcv, singleton_pc)

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
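
A key building block in this expand() is the singleton side input: a value such as the load job name or the file prefix is computed once from a single-element PCollection and then viewed with pvalue.AsSingleton by downstream steps. A hedged sketch of that shape with invented names:

import uuid

import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
    singleton = p | 'ImpulseSingleElement' >> beam.Create([None])
    # Derive one value from the singleton and expose it as a side-input view.
    job_name_view = pvalue.AsSingleton(
        singleton
        | 'MakeJobName' >> beam.Map(lambda _: 'load_%s' % uuid.uuid4().hex))
    _ = (
        p
        | beam.Create(['file-1', 'file-2'])
        | beam.Map(lambda f, job_name: (job_name, f), job_name=job_name_view)
        | beam.Map(print))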
Example #26
0
    def expand(self, tensor_pcoll_mapping):
        """Converts a dict of statistics to a transform function.

    Args:
      tensor_pcoll_mapping: A dictionary mapping `Tensor`s to a singleton
          PCollection containing a _TensorValue.

    Returns:
      A single-element PCollection containing the directory name with the
          SavedModel.
    """
        transform_fn = (
            self.pipeline
            | 'CreateTransformFn' >> beam.Create([self._saved_model_dir]))

        if not tensor_pcoll_mapping:
            return transform_fn

        # Convert tensor_value_mapping into a DictPCollectionView so it can be
        # passed as a side input to the beam Map below.
        tensor_value_pairs = []
        for name, pcoll in six.iteritems(tensor_pcoll_mapping):
            tensor_value_pairs.append(
                pcoll
                | 'AddName[%s]' % name >> beam.Map(lambda x, name=name:
                                                   (name, x)))
        tensor_value_mapping = beam.pvalue.AsDict(
            tensor_value_pairs | 'MergeTensorValuePairs' >> beam.Flatten())

        def replace_tensors_with_constant_values(saved_model_dir,
                                                 tensor_value_mapping):
            """Replaces specified `Tensor`s with constant values.

      Constants are accepted as Python values; these are automatically
      wrapped in `tf.constant()`.

      This method creates its own temp dir, and is therefore idempotent
      since any retry will use a different temp dir.

      Args:
        saved_model_dir: A SavedModel directory providing a transform
          graph.  The MetaGraphDef and signature are selected from the
          SavedModel using keys defined in `../constants.py` ('transform'
          and 'transform_signature', respectively).
        tensor_value_mapping: a dict of tensor names to values to use in
          place of those tensors.

      Returns:
        The directory name containing the updated SavedModel.

      Raises:
        RuntimeError: if there is no default graph available to which to
          apply the transform.
      """

            graph = tf.get_default_graph()
            if graph is None:
                raise RuntimeError('replace_tensors_with_constant_values() '
                                   'requires a default graph.')

            tensor_replacement_map = {}
            for orig_tensor_name, (
                    value, is_asset) in six.iteritems(tensor_value_mapping):
                new_tensor = tf.constant(value)
                if is_asset:
                    # Any newly frozen constant tensors containing filenames must be
                    # added to the ASSET_FILENAMES collection.
                    graph.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS,
                                            new_tensor)
                tensor_replacement_map[orig_tensor_name] = new_tensor

            with tf.Session() as session:
                temp_dir = _make_unique_temp_dir(self._base_temp_dir)
                input_tensors, output_tensors = (
                    saved_transform_io.partially_apply_saved_transform(
                        saved_model_dir, {}, tensor_replacement_map))
                saved_transform_io.write_saved_transform_from_session(
                    session, input_tensors, output_tensors, temp_dir)
            return temp_dir

        return (transform_fn | 'ReplaceTensorsWithConstantValues' >> beam.Map(
            replace_tensors_with_constant_values,
            tensor_value_mapping=tensor_value_mapping))
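
The AsDict side input above is assembled by tagging each singleton PCollection with its name and flattening the tagged pairs into one PCollection of (name, value) tuples. A small illustrative sketch of that construction (the statistics and names are placeholders):

import apache_beam as beam

with beam.Pipeline() as p:
    mean = p | 'MakeMean' >> beam.Create([3.5])
    stddev = p | 'MakeStddev' >> beam.Create([1.2])
    tagged_pairs = [
        mean | 'NameMean' >> beam.Map(lambda x: ('mean', x)),
        stddev | 'NameStddev' >> beam.Map(lambda x: ('stddev', x)),
    ]
    stats_view = beam.pvalue.AsDict(
        tagged_pairs | 'MergePairs' >> beam.Flatten())
    _ = (
        p
        | beam.Create([1.0, 2.0, 3.0])
        | beam.Map(lambda x, stats: (x - stats['mean']) / stats['stddev'],
                   stats=stats_view)
        | beam.Map(print))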
Example #27
0
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    # Note VepRunner creates new input files, so it should be run before any
    # other access to known_args.input_pattern.
    if known_args.run_annotation_pipeline:
        runner = vep_runner.create_runner_and_update_args(
            known_args, pipeline_args)
        runner.run_on_all_files()
        runner.wait_until_done()
        logging.info('Using VEP processed files: %s', known_args.input_pattern)

    variant_merger = _get_variant_merge_strategy(known_args)
    pipeline_mode = pipeline_common.get_pipeline_mode(
        known_args.input_pattern, known_args.optimize_for_large_inputs)

    # Start a pipeline to merge VCF headers in Beam if the number of files that
    # match the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    partitioner = None
    if ((known_args.optimize_for_large_inputs and variant_merger)
            or known_args.partition_config_path):
        partitioner = variant_partition.VariantPartition(
            known_args.partition_config_path)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(pipeline, known_args)
    variants |= 'FilterVariants' >> filter_variants.FilterVariants(
        reference_names=known_args.reference_names)
    if partitioner:
        num_partitions = partitioner.get_num_partitions()
        partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
            partition_variants.PartitionVariants(partitioner), num_partitions)
        variants = []
        for i in range(num_partitions):
            if partitioner.should_keep_partition(i):
                variants.append(partitioned_variants[i])
            else:
                num_partitions -= 1
    else:
        # By default we don't partition the data, so we have only 1 partition.
        num_partitions = 1
        variants = [variants]

    for i in range(num_partitions):
        if variant_merger:
            variants[i] |= ('MergeVariants' + str(i) >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + str(i) >>
            beam.Map(processed_variant_factory.create_processed_variant).\
                with_output_types(processed_variant.ProcessedVariant))
    if partitioner and partitioner.should_flatten():
        variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
        num_partitions = 1

    if known_args.output_table:
        for i in range(num_partitions):
            table_suffix = ''
            if partitioner and partitioner.get_partition_name(i):
                table_suffix = '_' + partitioner.get_partition_name(i)
            table_name = known_args.output_table + table_suffix
            _ = (
                variants[i] | 'VariantToBigQuery' + table_suffix >>
                variant_to_bigquery.VariantToBigQuery(
                    table_name,
                    header_fields,
                    variant_merger,
                    processed_variant_factory,
                    append=known_args.append,
                    update_schema_on_append=known_args.update_schema_on_append,
                    allow_incompatible_records=known_args.
                    allow_incompatible_records,
                    omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                    num_bigquery_write_shards=known_args.
                    num_bigquery_write_shards,
                    null_numeric_value_replacement=(
                        known_args.null_numeric_value_replacement)))

    if known_args.output_avro_path:
        # TODO(bashir2): Add an integration test that outputs to Avro files and
        # also imports to BigQuery. Then import those Avro outputs using the bq
        # tool and verify that the two tables are identical.
        _ = (variants | 'FlattenToOnePCollection' >> beam.Flatten()
             | 'VariantToAvro' >> variant_to_avro.VariantToAvroFiles(
                 known_args.output_avro_path,
                 header_fields,
                 processed_variant_factory,
                 variant_merger=variant_merger,
                 allow_incompatible_records=known_args.
                 allow_incompatible_records,
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement)))

    result = pipeline.run()
    result.wait_until_finish()

    metrics_util.log_all_counters(result)
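
The partition handling above follows a common Beam shape: split one PCollection into N partitions with beam.Partition, process each partition independently, then Flatten the pieces back together. A minimal sketch of that shape with a toy partition function in place of partition_variants.PartitionVariants:

import apache_beam as beam

NUM_PARTITIONS = 3

def by_modulo(element, num_partitions):
    # Partition function: returns an index in [0, num_partitions).
    return element % num_partitions

with beam.Pipeline() as p:
    partitioned = (
        p
        | beam.Create(range(10))
        | 'PartitionByModulo' >> beam.Partition(by_modulo, NUM_PARTITIONS))
    processed = [
        partitioned[i] | 'Process%d' % i >> beam.Map(lambda x: x * 10)
        for i in range(NUM_PARTITIONS)
    ]
    _ = (processed | 'FlattenPartitions' >> beam.Flatten() | beam.Map(print))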
Example #28
0
    def expand(self, inputs):
        return inputs | beam.Flatten()
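
This two-line expand() defines a composite transform that simply flattens whatever collection of PCollections it is applied to. A hedged usage sketch (the class name here is invented for illustration):

import apache_beam as beam

class FlattenAll(beam.PTransform):
    """Composite transform whose expand just flattens its inputs."""
    def expand(self, inputs):
        return inputs | beam.Flatten()

with beam.Pipeline() as p:
    evens = p | 'Evens' >> beam.Create([0, 2, 4])
    odds = p | 'Odds' >> beam.Create([1, 3, 5])
    _ = ((evens, odds) | FlattenAll() | beam.Map(print))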
Example #29
0
    def test_multiple_destinations_transform(self):
        streaming = self.test_pipeline.options.view_as(
            StandardOptions).streaming
        if streaming and isinstance(self.test_pipeline.runner,
                                    TestDataflowRunner):
            self.skipTest("TestStream is not supported on TestDataflowRunner")

        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)

        full_output_table_1 = '%s:%s' % (self.project, output_table_1)
        full_output_table_2 = '%s:%s' % (self.project, output_table_2)

        schema1 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }
        schema2 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'foundation',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        bad_record = {'language': 1, 'manguage': 2}

        if streaming:
            pipeline_verifiers = [
                PipelineStateMatcher(PipelineState.RUNNING),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]
        else:
            pipeline_verifiers = [
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            if streaming:
                _SIZE = len(_ELEMENTS)
                test_stream = (
                    TestStream().advance_watermark_to(0).add_elements(
                        _ELEMENTS[:_SIZE // 2]).advance_watermark_to(
                            100).add_elements(
                                _ELEMENTS[_SIZE //
                                          2:]).advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)

            schema_table_pcv = beam.pvalue.AsDict(
                p
                | "MakeSchemas" >> beam.Create([(full_output_table_1, schema1),
                                                (full_output_table_2,
                                                 schema2)]))

            table_record_pcv = beam.pvalue.AsDict(
                p
                | "MakeTables" >> beam.Create([('table1', full_output_table_1),
                                               ('table2',
                                                full_output_table_2)]))

            input2 = p | "Broken record" >> beam.Create([bad_record])

            input = (input, input2) | beam.Flatten()

            r = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=lambda x, tables:
                     (tables['table1']
                      if 'language' in x else tables['table2']),
                     table_side_inputs=(table_record_pcv, ),
                     schema=lambda dest, table_map: table_map.get(dest, None),
                     schema_side_inputs=(schema_table_pcv, ),
                     method='STREAMING_INSERTS'))

            assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                        equal_to([(full_output_table_1, bad_record)]))
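
The final assertion uses Beam's testing utilities, which compare a PCollection's final contents against expected values when the pipeline finishes. A minimal, standalone sketch of assert_that/equal_to outside the BigQuery context:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    doubled = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2)
    # The matcher is checked when the pipeline exits the with-block.
    assert_that(doubled, equal_to([2, 4, 6]))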
Example #30
0
    def _execute(self, window_fn, trigger_fn, accumulation_mode,
                 timestamp_combiner, transcript, spec):

        runner_name = TestPipeline().runner.__class__.__name__
        if runner_name in spec.get('broken_on', ()):
            self.skipTest('Known to be broken on %s' % runner_name)

        # Elements are encoded as JSON strings to allow other languages to
        # decode elements while executing the test stream.
        # TODO(BEAM-8600): Eliminate these gymnastics.
        test_stream = TestStream(
            coder=coders.StrUtf8Coder()).with_output_types(str)
        for action, params in transcript:
            if action == 'expect':
                test_stream.add_elements([json.dumps(('expect', params))])
            else:
                test_stream.add_elements([json.dumps(('expect', []))])
                if action == 'input':
                    test_stream.add_elements(
                        [json.dumps(('input', e)) for e in params])
                elif action == 'watermark':
                    test_stream.advance_watermark_to(params)
                elif action == 'clock':
                    test_stream.advance_processing_time(params)
                elif action == 'state':
                    pass  # Requires inspection of implementation details.
                else:
                    raise ValueError('Unexpected action: %s' % action)
        test_stream.add_elements([json.dumps(('expect', []))])

        read_test_stream = test_stream | beam.Map(json.loads)

        class Check(beam.DoFn):
            """A StatefulDoFn that verifies outputs are produced as expected.

      This DoFn takes in two kinds of inputs, actual outputs and
      expected outputs.  When an actual output is received, it is buffered
      into state, and when an expected output is received, this buffered
      state is retrieved and compared against the expected value(s) to ensure
      they match.

      The key is ignored, but all items must be on the same key to share state.
      """
            def __init__(self, allow_out_of_order=True):
                # Some runners don't support cross-stage TestStream semantics.
                self.allow_out_of_order = allow_out_of_order

            def process(self,
                        element,
                        seen=beam.DoFn.StateParam(
                            beam.transforms.userstate.BagStateSpec(
                                'seen', beam.coders.FastPrimitivesCoder())),
                        expected=beam.DoFn.StateParam(
                            beam.transforms.userstate.BagStateSpec(
                                'expected',
                                beam.coders.FastPrimitivesCoder()))):
                _, (action, data) = element

                if self.allow_out_of_order:
                    if action == 'expect' and not list(seen.read()):
                        if data:
                            expected.add(data)
                        return
                    elif action == 'actual' and list(expected.read()):
                        seen.add(data)
                        all_data = list(seen.read())
                        all_expected = list(expected.read())
                        if len(all_data) == len(all_expected[0]):
                            expected.clear()
                            for expect in all_expected[1:]:
                                expected.add(expect)
                            action, data = 'expect', all_expected[0]
                        else:
                            return

                if action == 'actual':
                    seen.add(data)

                elif action == 'expect':
                    actual = list(seen.read())
                    seen.clear()

                    if len(actual) > len(data):
                        raise AssertionError(
                            'Unexpected output: expected %s but got %s' %
                            (data, actual))
                    elif len(data) > len(actual):
                        raise AssertionError(
                            'Unmatched output: expected %s but got %s' %
                            (data, actual))
                    else:

                        def diff(actual, expected):
                            for key in sorted(expected.keys(), reverse=True):
                                if key in actual:
                                    if actual[key] != expected[key]:
                                        return key

                        for output in actual:
                            diffs = [
                                diff(output, expected) for expected in data
                            ]
                            if all(diffs):
                                raise AssertionError(
                                    'Unmatched output: %s not found in %s (diffs in %s)'
                                    % (output, data, diffs))

                else:
                    raise ValueError('Unexpected action: %s' % action)

        with TestPipeline() as p:
            # TODO(BEAM-8601): Pass this during pipeline construction.
            p.options.view_as(StandardOptions).streaming = True
            # Split the test stream into a branch of to-be-processed elements, and
            # a branch of expected results.
            inputs, expected = (
                p
                | read_test_stream
                | beam.MapTuple(lambda tag, value: beam.pvalue.TaggedOutput(
                    tag, ('key', value))).with_outputs('input', 'expect'))
            # Process the inputs with the given windowing to produce actual outputs.
            outputs = (
                inputs
                | beam.MapTuple(lambda key, value: TimestampedValue(
                    (key, value), value))
                | beam.WindowInto(window_fn,
                                  trigger=trigger_fn,
                                  accumulation_mode=accumulation_mode,
                                  timestamp_combiner=timestamp_combiner)
                | beam.GroupByKey()
                | beam.MapTuple(
                    lambda k, vs, window=beam.DoFn.WindowParam, t=beam.DoFn.
                    TimestampParam, p=beam.DoFn.PaneInfoParam:
                    (k,
                     _windowed_value_info(
                         WindowedValue(
                             vs, windows=[window], timestamp=t, pane_info=p))))
                # Place outputs back into the global window to allow flattening
                # and share a single state in Check.
                | 'Global' >> beam.WindowInto(
                    beam.transforms.window.GlobalWindows()))
            # Feed both the expected and actual outputs to Check() for comparison.
            tagged_expected = (
                expected
                | beam.MapTuple(lambda key, value: (key, ('expect', value))))
            tagged_outputs = (
                outputs
                | beam.MapTuple(lambda key, value: (key, ('actual', value))))
            # pylint: disable=expression-not-assigned
            ([tagged_expected, tagged_outputs]
             | beam.Flatten()
             | beam.ParDo(Check(self.allow_out_of_order)))
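
Check() above leans on Beam's per-key state API: BagStateSpec declares a buffer that persists across elements with the same key, and beam.DoFn.StateParam injects it into process(). A hedged, minimal sketch of that machinery with illustrative names:

import apache_beam as beam
from apache_beam.coders import VarIntCoder
from apache_beam.transforms.userstate import BagStateSpec

class BufferAndCount(beam.DoFn):
    # Per-key bag of integers, persisted between process() calls.
    BUFFER = BagStateSpec('buffer', VarIntCoder())

    def process(self, element, buffer=beam.DoFn.StateParam(BUFFER)):
        key, value = element
        buffer.add(value)
        # Emit the running count of values buffered for this key so far.
        yield key, len(list(buffer.read()))

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('k', 1), ('k', 2), ('k', 3)])
        | beam.ParDo(BufferAndCount())
        | beam.Map(print))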