示例#1
0
    def expand(self, entities):
        """Persists the supplied NDB models in the datastore.

        Overrides expand() from the parent class. Each model is first
        converted with job_utils.get_beam_entity_from_ndb_model and the
        converted entities are then handed to the datastore writer for the
        project named by feconf.OPPIA_PROJECT_ID.

        Args:
            entities: PCollection. The NDB models that should be written to
                the datastore. A collection holding a single model is fine.

        Returns:
            PCollection. An empty PCollection, returned because every
            expand() implementation has to return some PCollection.
        """
        # Stage 1: NDB model -> Apache Beam datastore entity.
        beam_entities = (
            entities
            | 'Transforming the NDB models into Apache Beam entities' >>
            beam.Map(job_utils.get_beam_entity_from_ndb_model))

        # Stage 2: hand the converted entities to the datastore sink.
        return (
            beam_entities
            | 'Writing the NDB models to the datastore' >>
            datastoreio.WriteToDatastore(feconf.OPPIA_PROJECT_ID))
示例#2
0
def main():
    """Builds and runs the sample-value statistics pipeline.

    Reads rows with the SQL query supplied by CalcStatsOptions, computes
    stats grouped by measurement and by (bot_group, bot, measurement), and
    writes both result sets to BigQuery tables whose names embed the window
    size in days.  When the export_to_datastore option is truthy the
    per-bot stats are additionally converted with CreateEntity and written
    to Datastore.  Blocks until the pipeline has finished.
    """
    options = PipelineOptions()
    pipeline = beam.Pipeline(options=options)
    options.view_as(GoogleCloudOptions).project = 'chromeperf'
    stats_options = options.view_as(CalcStatsOptions)
    query_provider = stats_options.GetSQLQueryProvider()

    # Source: pull the sample_values out of the 'rows' table.
    read_rows = beam.io.ReadFromBigQuery(
        query=query_provider,
        use_standard_sql=True,
        validate=True,
        flatten_results=False)
    rows = pipeline | 'QueryTable' >> read_rows

    # Noisiness stats, once keyed by measurement alone and once keyed by
    # bot_group + bot + measurement.
    stats_by_m = (
        rows
        | 'CalcStats(measurement)' >> SampleValuesStatsBy('measurement'))
    stats_by_bg_b_m = (
        rows
        | 'CalcStats(bot_group,bot,measurement)' >> SampleValuesStatsBy(
            'bot_group', 'bot', 'measurement'))

    # Reference DDL for the stats_by_* destination tables in BigQuery (kept
    # as a bare string statement; it is never executed).
    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.stats_by_measurement_7d`
  (`date` DATE NOT NULL,

   measurement STRING NOT NULL,

   num_samples INT64 NOT NULL,
   kurtosis FLOAT64,
   skewness FLOAT64,
   iqr FLOAT64,
   variance FLOAT64,
   `min` FLOAT64,
   `max` FLOAT64,
   mean FLOAT64,
   first_quartile FLOAT64,
   median FLOAT64,
   third_quartile FLOAT64,
   cv FLOAT64,
   std_dev FLOAT64,
   std_err FLOAT64,
   )
  PARTITION BY `date`
  CLUSTER BY measurement;

  CREATE TABLE `chromeperf.chromeperf_dashboard_data.stats_by_botgroup_7d`
  (`date` DATE NOT NULL,

   bot_group STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING NOT NULL,

   num_samples INT64 NOT NULL,
   kurtosis FLOAT64,
   skewness FLOAT64,
   iqr FLOAT64,
   variance FLOAT64,
   `min` FLOAT64,
   `max` FLOAT64,
   mean FLOAT64,
   first_quartile FLOAT64,
   median FLOAT64,
   third_quartile FLOAT64,
   cv FLOAT64,
   std_dev FLOAT64,
   std_err FLOAT64,
   )
  PARTITION BY `date`
  CLUSTER BY bot_group, bot, measurement;
  """  # pylint: disable=pointless-string-statement

    # Nullable FLOAT stat columns shared by both output tables, in schema
    # order.
    stat_columns = (
        'kurtosis',
        'skewness',
        'iqr',
        'variance',
        'min',
        'max',
        'mean',
        'first_quartile',
        'median',
        'third_quartile',
        'cv',
        'std_dev',
        'std_err',
    )

    def MakeField(name, field_type, mode):
        # One BigQuery column description; a new dict on every call so the
        # two schemas never share mutable state.
        return {'name': name, 'type': field_type, 'mode': mode}

    def MakeSchema(key_columns):
        # Layout: date, the grouping key columns, num_samples, then stats.
        fields = [MakeField('date', 'DATE', 'REQUIRED')]
        fields.extend(
            MakeField(column, 'STRING', 'REQUIRED') for column in key_columns)
        fields.append(MakeField('num_samples', 'INT64', 'REQUIRED'))
        fields.extend(
            MakeField(column, 'FLOAT', 'NULLABLE') for column in stat_columns)
        return {'fields': fields}

    bq_stats_by_measurement_schema = MakeSchema(('measurement',))
    bq_stats_by_bg_b_m_schema = MakeSchema(
        ('bot_group', 'bot', 'measurement'))

    def GetTableNameFn(table_name):
        def TableNameFn(unused_element):
            # Tables are partitioned by end date only.  Pipelines run with
            # different window sizes (e.g. 7d vs. 28d) would therefore
            # overwrite each other, so the window size goes into the table
            # name instead of into an extra column.
            project = options.view_as(GoogleCloudOptions).project
            dataset = stats_options.dataset.get()
            days = stats_options.window_in_days.get()
            return '{project}:{dataset}.{table_name}_{days}d'.format(
                project=project,
                dataset=dataset,
                table_name=table_name,
                days=days)

        return TableNameFn

    def DateToYYYYMMDD(elem):
        return elem['date'].strftime('%Y%m%d')

    # Sink 1: stats keyed by measurement.
    bq_rows_by_m = (
        stats_by_m
        | 'ConvertToBigQueryRow(by_measurement)' >> FlattenForBQ(
            stats_options.GetFixedColumnsProvider()))
    _ = (
        bq_rows_by_m
        | 'WriteToPartitionedBigQuery(stats_by_measurement)' >>
        WriteToPartitionedBigQuery(
            GetTableNameFn('stats_by_measurement'),
            schema=bq_stats_by_measurement_schema,
            element_to_yyyymmdd_fn=DateToYYYYMMDD,
            additional_bq_parameters={
                'clustering': {
                    'fields': ['measurement']
                }
            }))

    # Sink 2: stats keyed by bot_group + bot + measurement.
    bq_rows_by_bg_b_m = (
        stats_by_bg_b_m
        | 'ConvertToBigQueryRow(by_botgroup)' >> FlattenForBQ(
            stats_options.GetFixedColumnsProvider()))
    _ = (
        bq_rows_by_bg_b_m
        | 'WriteToPartitionedBigQuery(stats_by_botgroup)' >>
        WriteToPartitionedBigQuery(
            GetTableNameFn('stats_by_botgroup'),
            schema=bq_stats_by_bg_b_m_schema,
            element_to_yyyymmdd_fn=DateToYYYYMMDD,
            additional_bq_parameters={
                'clustering': {
                    'fields': ['bot_group', 'bot', 'measurement']
                }
            }))

    # Optional sink 3: the per-bot stats as Datastore entities.
    if stats_options.export_to_datastore:
        entities = (
            stats_by_bg_b_m
            | 'CreateEntity(SignalQuality)' >> beam.Map(CreateEntity))
        reshuffled = entities | beam.Reshuffle()
        _ = reshuffled | datastoreio.WriteToDatastore('chromeperf')

    pipeline.run().wait_until_finish()