def expand(self, entities):
    """Writes the given models to the datastore.

    This overrides the expand() method from the parent class.

    Args:
        entities: PCollection. A PCollection of NDB models to write to the
            datastore. Can also contain just one model.

    Returns:
        PCollection. An empty PCollection. This is needed because all
        expand() methods need to return some PCollection.
    """
    # First convert each NDB model into the entity type Apache Beam's
    # datastore sink understands, then hand the converted collection to the
    # datastore writer.
    beam_entities = entities | (
        'Transforming the NDB models into Apache Beam entities' >> beam.Map(
            job_utils.get_beam_entity_from_ndb_model))
    datastore_writer = datastoreio.WriteToDatastore(feconf.OPPIA_PROJECT_ID)
    return beam_entities | (
        'Writing the NDB models to the datastore' >> datastore_writer)
def main():
  """Runs the sample-value noisiness-statistics pipeline.

  Reads sample values from the BigQuery 'rows' table, computes noisiness
  statistics grouped by measurement and by (bot_group, bot, measurement),
  and writes the results to date-partitioned BigQuery tables. Optionally
  also exports the per-bot stats to Datastore.
  """
  options = PipelineOptions()
  p = beam.Pipeline(options=options)
  # NOTE(review): the project is forced to 'chromeperf' after the pipeline is
  # constructed; PipelineOptions views share state, so this still takes
  # effect before any transform runs.
  options.view_as(GoogleCloudOptions).project = 'chromeperf'
  stats_options = options.view_as(CalcStatsOptions)
  query_provider = stats_options.GetSQLQueryProvider()

  # Query 'rows' table for sample_values.
  rows = p | 'QueryTable' >> beam.io.ReadFromBigQuery(
      query=query_provider,
      use_standard_sql=True,
      validate=True,
      flatten_results=False)

  # Group the sample values (by measurement, and by
  # measurement+bot+bot_group), and calculate noisiness stats.
  stats_by_m = (
      rows | 'CalcStats(measurement)' >> SampleValuesStatsBy('measurement'))
  stats_by_bg_b_m = (
      rows | 'CalcStats(bot_group,bot,measurement)' >> SampleValuesStatsBy(
          'bot_group', 'bot', 'measurement'))

  # Emit results to stats_by_* tables in BigQuery.
  """ CREATE TABLE `chromeperf.chromeperf_dashboard_data.stats_by_measurement_7d` (`date` DATE NOT NULL, measurement STRING NOT NULL, num_samples INT64 NOT NULL, kurtosis FLOAT64, skewness FLOAT64, iqr FLOAT64, variance FLOAT64, `min` FLOAT64, `max` FLOAT64, mean FLOAT64, first_quartile FLOAT64, median FLOAT64, third_quartile FLOAT64, cv FLOAT64, std_dev FLOAT64, std_err FLOAT64, ) PARTITION BY `date` CLUSTER BY measurement; CREATE TABLE `chromeperf.chromeperf_dashboard_data.stats_by_botgroup_7d` (`date` DATE NOT NULL, bot_group STRING NOT NULL, bot STRING NOT NULL, measurement STRING NOT NULL, num_samples INT64 NOT NULL, kurtosis FLOAT64, skewness FLOAT64, iqr FLOAT64, variance FLOAT64, `min` FLOAT64, `max` FLOAT64, mean FLOAT64, first_quartile FLOAT64, median FLOAT64, third_quartile FLOAT64, cv FLOAT64, std_dev FLOAT64, std_err FLOAT64, ) PARTITION BY `date` CLUSTER BY bot_group, bot, measurement; """  # pylint: disable=pointless-string-statement

  def MakeSchema(*key_fields):
    # Builds a BigQuery schema dict: the given REQUIRED key fields followed
    # by the thirteen NULLABLE FLOAT statistics columns shared by both
    # output tables. Centralizing this avoids the two hand-maintained
    # copies drifting apart (they previously duplicated every stat field).
    stat_names = (
        'kurtosis', 'skewness', 'iqr', 'variance', 'min', 'max', 'mean',
        'first_quartile', 'median', 'third_quartile', 'cv', 'std_dev',
        'std_err')
    fields = list(key_fields)
    fields.extend(
        {'name': name, 'type': 'FLOAT', 'mode': 'NULLABLE'}
        for name in stat_names)
    return {'fields': fields}

  bq_stats_by_measurement_schema = MakeSchema(
      {'name': 'date', 'type': 'DATE', 'mode': 'REQUIRED'},
      {'name': 'measurement', 'type': 'STRING', 'mode': 'REQUIRED'},
      {'name': 'num_samples', 'type': 'INT64', 'mode': 'REQUIRED'})
  bq_stats_by_bg_b_m_schema = MakeSchema(
      {'name': 'date', 'type': 'DATE', 'mode': 'REQUIRED'},
      {'name': 'bot_group', 'type': 'STRING', 'mode': 'REQUIRED'},
      {'name': 'bot', 'type': 'STRING', 'mode': 'REQUIRED'},
      {'name': 'measurement', 'type': 'STRING', 'mode': 'REQUIRED'},
      {'name': 'num_samples', 'type': 'INT64', 'mode': 'REQUIRED'})

  def GetTableNameFn(table_name):
    def TableNameFn(unused_element):
      # The tables are partitioned by end date (only), so we have to manually
      # partition by window size so that pipelines for e.g. 7d vs. 28d
      # windows don't overwrite each other. Thus we include days in the table
      # name (rather than as an extra column).
      return '{project}:{dataset}.{table_name}_{days}d'.format(
          project=options.view_as(GoogleCloudOptions).project,
          dataset=stats_options.dataset.get(),
          table_name=table_name,
          days=stats_options.window_in_days.get())
    return TableNameFn

  def DateToYYYYMMDD(elem):
    # Partition-decorator key for WriteToPartitionedBigQuery.
    return elem['date'].strftime('%Y%m%d')

  _ = (stats_by_m
       | 'ConvertToBigQueryRow(by_measurement)' >> FlattenForBQ(
           stats_options.GetFixedColumnsProvider())
       | 'WriteToPartitionedBigQuery(stats_by_measurement)' >>
       WriteToPartitionedBigQuery(
           GetTableNameFn('stats_by_measurement'),
           schema=bq_stats_by_measurement_schema,
           element_to_yyyymmdd_fn=DateToYYYYMMDD,
           additional_bq_parameters={
               'clustering': {
                   'fields': ['measurement']
               }
           }))
  _ = (stats_by_bg_b_m
       | 'ConvertToBigQueryRow(by_botgroup)' >> FlattenForBQ(
           stats_options.GetFixedColumnsProvider())
       | 'WriteToPartitionedBigQuery(stats_by_botgroup)' >>
       WriteToPartitionedBigQuery(
           GetTableNameFn('stats_by_botgroup'),
           schema=bq_stats_by_bg_b_m_schema,
           element_to_yyyymmdd_fn=DateToYYYYMMDD,
           additional_bq_parameters={
               'clustering': {
                   'fields': ['bot_group', 'bot', 'measurement']
               }
           }))

  # Optional Datastore export of the per-bot stats as SignalQuality entities.
  # Reshuffle breaks fusion so Datastore writes are spread across workers.
  if stats_options.export_to_datastore:
    _ = (stats_by_bg_b_m
         | 'CreateEntity(SignalQuality)' >> beam.Map(CreateEntity)
         | beam.Reshuffle()
         | datastoreio.WriteToDatastore('chromeperf'))

  result = p.run()
  result.wait_until_finish()