def format(paths, check=False):
    """Format SQL files."""
    if not paths:
        if sys.stdin.isatty():
            parser.print_help()
            print("Error: must specify PATH or provide input via stdin")
            sys.exit(255)
        query = sys.stdin.read()
        formatted = reformat(query, trailing_newline=True)
        if not check:
            print(formatted, end="")
        if check and query != formatted:
            sys.exit(1)
    else:
        sql_files = []
        for path in paths:
            if os.path.isdir(path):
                sql_files.extend(
                    filepath
                    for dirpath, _, filenames in os.walk(path)
                    for filename in filenames
                    if filename.endswith(".sql")
                    # skip tests/**/input.sql
                    and not (path.startswith("tests") and filename == "input.sql")
                    for filepath in [os.path.join(dirpath, filename)]
                    if filepath not in SKIP
                )
            elif path:
                sql_files.append(path)
        if not sql_files:
            print("Error: no files were found to format")
            sys.exit(255)
        sql_files.sort()
        reformatted = unchanged = 0
        for path in sql_files:
            with open(path) as fp:
                query = fp.read()
            formatted = reformat(query, trailing_newline=True)
            if query != formatted:
                if check:
                    print(f"would reformat {path}")
                else:
                    with open(path, "w") as fp:
                        fp.write(formatted)
                    print(f"reformatted {path}")
                reformatted += 1
            else:
                unchanged += 1
        print(
            ", ".join(
                f"{number} file{'s' if number > 1 else ''}"
                f"{' would be' if check else ''} {msg}"
                for number, msg in [
                    (reformatted, "reformatted"),
                    (unchanged, "left unchanged"),
                ]
                if number > 0
            )
            + "."
        )
        if check and reformatted:
            sys.exit(1)

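# Illustrative direct calls (paths are hypothetical; the CLI wiring that
# normally supplies `paths` and `--check` is not shown here):
#
#     format(["sql/moz-fx-data-shared-prod/"])   # rewrite any changed .sql files in place
#     format(["sql/"], check=True)               # report files and exit 1 if any would change
#     format([])                                 # format a query piped in on stdin
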
def generate_queries(project, path, write_dir):
    """Generate experiment monitoring views."""
    with open(Path(path) / "templating.yaml", "r") as f:
        template_config = yaml.safe_load(f) or {}

    for query, args in template_config["queries"].items():
        template_query_dir = FILE_PATH / "templates" / query
        env = Environment(
            loader=FileSystemLoader(template_query_dir),
            keep_trailing_newline=True,
        )
        sql_templates = list(template_query_dir.glob("*.sql"))
        sql_template_file = sql_templates[0].name
        sql_template = env.get_template(sql_template_file)
        metadata_template = env.get_template("metadata.yaml")
        args["destination_table"] = query
        args["search_metrics"] = template_config["search_metrics"]

        if args["per_app"]:
            # generate a separate query for each application dataset
            for dataset in template_config["applications"]:
                args["dataset"] = dataset
                write_sql(
                    write_dir / project,
                    f"{project}.{dataset}_derived.{query}",
                    sql_template_file,
                    reformat(sql_template.render(**args)),
                )
                write_path = Path(write_dir) / project / (dataset + "_derived") / query
                (write_path / "metadata.yaml").write_text(
                    metadata_template.render(**args)
                )
        else:
            # generate a single query that UNIONs application datasets
            # these queries are written to `telemetry`
            args["applications"] = template_config["applications"]
            write_sql(
                write_dir / project,
                f"{project}.telemetry_derived.{query}",
                sql_template_file,
                reformat(sql_template.render(**args)),
            )
            write_path = Path(write_dir) / project / "telemetry_derived" / query
            (write_path / "metadata.yaml").write_text(
                metadata_template.render(**args)
            )

def main(argv, out=print):
    """Print a clients_daily_scalar_aggregates query to stdout."""
    opts = vars(p.parse_args(argv[1:]))
    sql_string = ""

    if opts["agg_type"] in ("scalars", "keyed_scalars", "keyed_booleans"):
        scalar_type = (
            opts["agg_type"] if opts["agg_type"] == "scalars" else "keyed_scalars"
        )
        scalar_probes = get_scalar_probes(scalar_type)
        sql_string = get_scalar_probes_sql_strings(scalar_probes, opts["agg_type"])
    else:
        raise ValueError(
            "agg-type must be one of scalars, keyed_scalars, keyed_booleans"
        )

    sleep(opts["wait_seconds"])
    out(
        reformat(
            generate_sql(
                opts["agg_type"],
                sql_string["probes_string"],
                sql_string.get("additional_queries", ""),
                sql_string.get("additional_partitions", ""),
                sql_string["select_clause"],
                sql_string.get("querying_table", "filtered"),
                opts["json_output"],
            )
        )
    )

def create(cls, project, dataset, name, sql_dir, base_table=None):
    """
    Create a new empty view from a template.

    Use `base_table` in view definition, if provided.
    """
    path = Path(sql_dir) / project / dataset / name / "view.sql"
    dataset_path = path.parent.parent
    if not dataset_path.exists():
        # create new dataset with dataset metadata
        path.parent.mkdir(parents=True)
        dataset_metadata = DatasetMetadata(
            friendly_name=string.capwords(dataset),
            description="Please provide a dataset description.",
            dataset_base_acl="view",
            user_facing=True,
        )
        dataset_metadata.write(dataset_path / DATASET_METADATA_FILE)
    else:
        path.parent.mkdir(parents=True, exist_ok=True)

    if not base_table:
        base_table = f"{project}.{dataset}_derived.{name}_v1"

    path.write_text(
        reformat(
            f"""
            CREATE OR REPLACE VIEW `{project}.{dataset}.{name}` AS
            SELECT * FROM `{base_table}`
            """
        )
        + "\n"
    )
    return cls(path, name, dataset, project)

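# Illustrative usage, assuming this is a classmethod on a View-style class
# (the class name and the sql_dir path here are hypothetical):
#
#     view = View.create("moz-fx-data-shared-prod", "telemetry", "example", "sql")
#
# With no base_table given, the generated view selects from
# `moz-fx-data-shared-prod.telemetry_derived.example_v1`.
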
def render_main(
    header: str,
    user_data_type: str,
    user_data_attributes: List[str],
    attributes: List[str],
    extract_select_clause: str,
    join_filter: str,
    source_table: str,
    destination_table: str,
    **kwargs,
) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    main_sql = env.get_template("clients_scalar_aggregates_v1.sql")
    return reformat(
        main_sql.render(
            header=header,
            user_data_type=user_data_type,
            user_data_attributes=",".join(user_data_attributes),
            attributes=",".join(attributes),
            attributes_list=attributes,
            extract_select_clause=extract_select_clause,
            join_filter=join_filter,
            source_table=source_table,
            destination_table=destination_table,
        )
    )

def from_template(
    query_type: QueryType,
    template_name: str,
    environment: Environment,
    args: Namespace,
    dataset_path: Path,
    query_name_prefix=None,
    **kwargs,
) -> TemplateResult:
    """Fill in templates and write them to disk."""
    if query_type == QueryType.INIT:
        template = environment.get_template(f"{template_name}.init.sql")
    else:
        template = environment.get_template(f"{template_name}.sql")

    if query_name_prefix:
        table_id = f"{args.prefix}__{query_name_prefix}_{template_name}"
    else:
        table_id = f"{args.prefix}__{template_name}"

    # replaces the header, if it exists
    kwargs["header"] = f"-- {query_type} for {table_id};"

    # create the directory for the view
    (dataset_path / table_id).mkdir(exist_ok=True)
    view_path = dataset_path / table_id / f"{query_type}.sql"

    # write the query with appropriate variables
    query_text = reformat(template.render(**{**vars(args), **kwargs}))

    print(f"generated {view_path}")
    with view_path.open("w") as fp:
        print(query_text, file=fp)

    return TemplateResult(table_id, query_type, query_text)

def main(argv, out=print):
    """Print a clients_daily_histogram_aggregates query to stdout."""
    opts = vars(p.parse_args(argv[1:]))
    sql_string = ""

    if opts["agg_type"] in ("histograms", "keyed_histograms"):
        probes_and_buckets = get_histogram_probes_and_buckets(
            opts["agg_type"], opts["processes"]
        )
        sql_string = get_histogram_probes_sql_strings(
            probes_and_buckets, opts["agg_type"]
        )
    else:
        raise ValueError("agg-type must be one of histograms, keyed_histograms")

    sleep(opts["wait_seconds"])
    out(
        reformat(
            generate_sql(
                opts,
                sql_string.get("additional_queries", ""),
                sql_string["windowed_clause"],
                sql_string["select_clause"],
                opts["json_output"],
            )
        )
    )

def render(sql_filename, format=True, **kwargs) -> str:
    """Render a given template query using Jinja."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glean_usage/templates"))
    main_sql = env.get_template(sql_filename)
    rendered = main_sql.render(**kwargs)
    if format:
        rendered = reformat(rendered)
    return rendered

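# Hypothetical call: render a glean_usage template without SQL formatting. The
# template name and keyword arguments below are assumptions for illustration,
# not taken from the actual templates directory:
#
#     sql = render("baseline_clients_daily_v1.sql", format=False, daily_table="baseline_daily")
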
def runtest(self):
    """Run."""
    with open(self.fspath) as fp:
        expect = fp.read()
    try:
        with open(f"{self.fspath.dirname}/input.sql") as fp:
            query = fp.read()
    except FileNotFoundError:
        query = expect
    assert reformat(query) + "\n" == expect

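# Layout assumed by this test item (directory name is illustrative): each case
# directory holds the collected, already-formatted .sql file plus an optional
# raw input:
#
#     tests/format_sql/<case>/input.sql   # optional unformatted query
#
# When input.sql is absent, the collected file itself must equal
# reformat(<its own contents>) plus a trailing newline.
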
def generate_query(project, dataset, destination_table, write_dir):
    """Generate feature usage table query."""
    with open(TEMPLATE_CONFIG, "r") as f:
        render_kwargs = yaml.safe_load(f) or {}
    env = Environment(loader=FileSystemLoader(str(FILE_PATH / "templates")))
    template = env.get_template("query.sql")

    write_sql(
        write_dir / project,
        f"{project}.{dataset}.{destination_table}",
        "query.sql",
        reformat(template.render(**render_kwargs)),
    )

def test_generate_query_nested_deep_uneven():
    columns = ["a.b.c.d", "a.b.e"]
    res = generate_query(columns, "test")
    expect = reformat(
        """
        select
            struct(
                struct(
                    struct(a.b.c.d) as c,
                    a.b.e
                ) as b
            ) as a
        from `test`
        """
    )
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"

def test_generate_query_nested_deep_skip():
    columns = ["b.c.e", "b.d.f"]
    res = generate_query(columns, "test")
    expect = reformat(
        """
        select
            struct(
                struct(b.c.e) as c,
                struct(b.d.f) as d
            ) as b
        from `test`
        """
    )
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"

def generate_view(project, dataset, destination_table, write_dir):
    """Generate feature usage table view."""
    view_name = destination_table.split("_v")[0]
    view_dataset = dataset.split("_derived")[0]
    sql = reformat(
        f"""
        CREATE OR REPLACE VIEW `{project}.{view_dataset}.{view_name}` AS
        SELECT * FROM `{project}.{dataset}.{destination_table}`
        """
    )
    write_sql(
        write_dir / project,
        f"{project}.{view_dataset}.{view_name}",
        "view.sql",
        sql,
    )

def render_query(attributes: List[str], **kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    sql = env.get_template("scalar_percentiles_v1.sql")
    max_combinations = len(attributes) + 1
    attribute_combinations = []
    for subset_size in reversed(range(max_combinations)):
        for grouping in combinations(attributes, subset_size):
            select_expr = []
            for attribute in attributes:
                select_expr.append((attribute, attribute in grouping))
            attribute_combinations.append(select_expr)
    return reformat(
        sql.render(attribute_combinations=attribute_combinations, **kwargs)
    )

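# Illustrative sketch of the attribute_combinations structure built above, using
# a hypothetical attribute list; each entry is the list of (attribute, is_grouped)
# pairs the Jinja template consumes for one grouping combination:
#
#     attributes = ["ping_type", "os", "app_build_id"]
#     # first entry (grouped by everything):
#     #   [("ping_type", True), ("os", True), ("app_build_id", True)]
#     # last entry (grouped by nothing):
#     #   [("ping_type", False), ("os", False), ("app_build_id", False)]
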
def test_generate_query_nested_deep_ancestor_shared_descendant_names():
    columns = ["a.b.c.d", "a.f.c.g"]
    res = generate_query(columns, "test")
    expect = reformat(
        """
        select
            struct(
                struct(struct(a.b.c.d) as c) as b,
                struct(struct(a.f.c.g) as c) as f
            ) as a
        from `test`
        """
    )
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"

def main():
    """Generate mobile search clients daily query and print to stdout."""
    base_dir = Path(__file__).parent

    env = Environment(loader=FileSystemLoader(base_dir / "templates"))

    android_query_template = env.get_template("fenix_metrics.template.sql")
    ios_query_template = env.get_template("ios_metrics.template.sql")

    queries = [
        android_query_template.render(
            namespace=app_channel[0], app_name=app_channel[1], channel=app_channel[2]
        )
        if app_channel[3] == "android"
        else ios_query_template.render(
            namespace=app_channel[0], app_name=app_channel[1], channel=app_channel[2]
        )
        for app_channel in APP_CHANNEL_TUPLES
    ]

    search_query_template = env.get_template(
        "mobile_search_clients_daily.template.sql"
    )

    fenix_combined_baseline = union_statements(
        [
            f"SELECT * FROM baseline_{namespace}"
            for namespace, _, _, platform in APP_CHANNEL_TUPLES
            if platform == "android"
        ]
    )
    fenix_combined_metrics = union_statements(
        [
            f"SELECT * FROM metrics_{namespace}"
            for namespace, _, _, platform in APP_CHANNEL_TUPLES
            if platform == "android"
        ]
    )
    ios_combined_metrics = union_statements(
        [
            f"SELECT * FROM metrics_{namespace}"
            for namespace, _, _, platform in APP_CHANNEL_TUPLES
            if platform == "ios"
        ]
    )

    search_query = search_query_template.render(
        baseline_and_metrics_by_namespace="\n".join(queries),
        fenix_baseline=fenix_combined_baseline,
        fenix_metrics=fenix_combined_metrics,
        ios_metrics=ios_combined_metrics,
    )

    print(reformat(search_query))

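# For reference, a minimal sketch of the module-level data and helper this
# function relies on; the tuple values and the UNION ALL behavior are assumptions,
# only the (namespace, app_name, channel, platform) shape is implied by the code:
#
#     APP_CHANNEL_TUPLES = [
#         ("org_mozilla_fenix", "Fenix", "release", "android"),
#         ("org_mozilla_ios_firefox", "Firefox iOS", "release", "ios"),
#     ]
#
#     def union_statements(statements):
#         """Join SELECT statements with UNION ALL (assumed behavior)."""
#         return "\nUNION ALL\n".join(statements)
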
def render(self, write_path, args):
    """Render this template at the specified write_path with the specified args."""
    fpath = write_path / self.name
    print(f"...Generating {str(fpath)}")
    write_path.mkdir(parents=True, exist_ok=True)
    if "header" not in args:
        args["header"] = "Generated by bigquery_etl/events_daily/generate_queries.py"
    text = self._get_comment_char(fpath.suffix) + args["header"] + "\n\n"
    text += self.env.get_template(self.name).render(**args)
    if fpath.suffix == ".sql":
        text = reformat(text, trailing_newline=True)
    (write_path / self.name).write_text(text)

def render_query(attributes: List[str], **kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    sql = env.get_template("probe_counts_v1.sql")
    # If the set of attributes grows, max_combinations can be set to only
    # compute a shallow set of combinations for less query complexity
    max_combinations = len(attributes)
    attribute_combinations = []
    for subset_size in reversed(range(max_combinations + 1)):
        for grouping in combinations(attributes, subset_size):
            # channel and app_version are required in the GLAM frontend
            if "channel" not in grouping or "app_version" not in grouping:
                continue
            select_expr = []
            for attribute in attributes:
                select_expr.append((attribute, attribute in grouping))
            attribute_combinations.append(select_expr)
    return reformat(
        sql.render(attribute_combinations=attribute_combinations, **kwargs)
    )

def test_generate_query_nested_deep():
    columns = ["a.b", "a.c", "a.d.x.y.e", "a.d.x.y.f", "g"]
    res = generate_query(columns, "test")
    expect = reformat(
        """
        select
            struct(
                a.b,
                a.c,
                struct(
                    struct(
                        struct(a.d.x.y.e, a.d.x.y.f) as y
                    ) as x
                ) as d
            ) as a,
            g
        from `test`
        """
    )
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"

def render_init(
    header,
    destination_table,
    attributes,
    attributes_type,
    user_data_type,
    partition_clause,
    **kwargs,
) -> str:
    """Render the table initialization DDL for partitioning and clustering."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    init_sql = env.get_template("clients_scalar_aggregates_v1.init.sql")
    return reformat(
        init_sql.render(
            header=header,
            destination_table=destination_table,
            attributes_type=",".join(
                f"{name} {dtype}" for name, dtype in zip(attributes, attributes_type)
            ),
            user_data_type=user_data_type,
            partition_clause=partition_clause,
        )
    )

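# Illustrative values (hypothetical): with attributes=["app_version", "os"] and
# attributes_type=["INT64", "STRING"], the rendered attributes_type string is
# "app_version INT64,os STRING", i.e. a typed column list for the init template.
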
def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaFile):
    """If a view.sql does not already exist, write one to the target directory."""
    target_dir = (
        sql_dir
        / target_project
        / schema.bq_dataset_family
        / schema.bq_table_unversioned
    )
    target_file = target_dir / "view.sql"

    if target_file.exists():
        return

    # Exclude doctypes maintained in separate projects.
    for prefix in SKIP_PREFIXES:
        if schema.bq_dataset_family.startswith(prefix):
            return

    full_source_id = f"{target_project}.{schema.stable_table}"
    full_view_id = f"{target_project}.{schema.user_facing_view}"
    replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
    if schema.schema_id == "moz://mozilla.org/schemas/glean/ping/1":
        replacements += ["mozfun.norm.glean_ping_info(ping_info) AS ping_info"]
        if schema.bq_table == "baseline_v1":
            replacements += [
                "mozfun.norm.glean_baseline_client_info"
                "(client_info, metrics) AS client_info"
            ]
        if (
            schema.bq_dataset_family == "org_mozilla_fenix"
            and schema.bq_table == "metrics_v1"
        ):
            # todo: use mozfun udfs
            replacements += [
                "mozdata.udf.normalize_fenix_metrics"
                "(client_info.telemetry_sdk_build, metrics)"
                " AS metrics"
            ]
        if schema.bq_dataset_family == "firefox_desktop":
            # FOG does not provide an app_name, so we inject the one that
            # people already associate with desktop Firefox per bug 1672191.
            replacements += [
                "'Firefox' AS normalized_app_name",
            ]
    elif schema.schema_id.startswith("moz://mozilla.org/schemas/main/ping/"):
        replacements += ["mozdata.udf.normalize_main_payload(payload) AS payload"]
    replacements_str = ",\n  ".join(replacements)
    full_sql = reformat(
        VIEW_QUERY_TEMPLATE.format(
            target=full_source_id,
            replacements=replacements_str,
            full_view_id=full_view_id,
        )
    )
    print(f"Creating {target_file}")
    target_dir.mkdir(parents=True, exist_ok=True)
    with target_file.open("w") as f:
        f.write(full_sql)
    metadata_content = VIEW_METADATA_TEMPLATE.format(
        document_namespace=schema.document_namespace,
        document_type=schema.document_type,
    )
    metadata_file = target_dir / "metadata.yaml"
    if not metadata_file.exists():
        with metadata_file.open("w") as f:
            f.write(metadata_content)

def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaFile):
    """If a view.sql does not already exist, write one to the target directory."""
    target_dir = (
        sql_dir
        / target_project
        / schema.bq_dataset_family
        / schema.bq_table_unversioned
    )
    target_file = target_dir / "view.sql"

    if target_file.exists():
        return

    full_source_id = f"{target_project}.{schema.stable_table}"
    full_view_id = f"{target_project}.{schema.user_facing_view}"
    replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
    if schema.schema_id == "moz://mozilla.org/schemas/glean/ping/1":
        replacements += ["mozfun.norm.glean_ping_info(ping_info) AS ping_info"]
        if schema.bq_table == "baseline_v1":
            replacements += [
                "mozfun.norm.glean_baseline_client_info"
                "(client_info, metrics) AS client_info"
            ]
        if (
            schema.bq_dataset_family == "org_mozilla_fenix"
            and schema.bq_table == "metrics_v1"
        ):
            # todo: use mozfun udfs
            replacements += [
                "mozdata.udf.normalize_fenix_metrics"
                "(client_info.telemetry_sdk_build, metrics)"
                " AS metrics"
            ]
        if schema.bq_dataset_family == "firefox_desktop":
            # FOG does not provide an app_name, so we inject the one that
            # people already associate with desktop Firefox per bug 1672191.
            replacements += [
                "'Firefox' AS normalized_app_name",
            ]
    elif schema.schema_id.startswith("moz://mozilla.org/schemas/main/ping/"):
        replacements += ["mozdata.udf.normalize_main_payload(payload) AS payload"]
    replacements_str = ",\n  ".join(replacements)
    full_sql = reformat(
        VIEW_QUERY_TEMPLATE.format(
            target=full_source_id,
            replacements=replacements_str,
            full_view_id=full_view_id,
        )
    )
    print(f"Creating {target_file}")
    target_dir.mkdir(parents=True, exist_ok=True)
    with target_file.open("w") as f:
        f.write(full_sql)
    metadata_content = VIEW_METADATA_TEMPLATE.format(
        document_namespace=schema.document_namespace,
        document_type=schema.document_type,
    )
    metadata_file = target_dir / "metadata.yaml"
    if not metadata_file.exists():
        with metadata_file.open("w") as f:
            f.write(metadata_content)

    # get view schema with descriptions
    try:
        content = VIEW_CREATE_REGEX.sub("", target_file.read_text())
        content += " WHERE DATE(submission_timestamp) = '2020-01-01'"
        view_schema = Schema.from_query_file(target_file, content=content)

        stable_table_schema = Schema.from_json({"fields": schema.schema})
        view_schema.merge(stable_table_schema, add_missing_fields=False)
        view_schema.to_yaml_file(target_dir / "schema.yaml")
    except Exception as e:
        print(f"Cannot generate schema.yaml for {target_file}: {e}")

def render_main(**kwargs):
    """Create a SQL query for the clients_daily_scalar_aggregates dataset."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    main_sql = env.get_template("clients_daily_scalar_aggregates_v1.sql")
    return reformat(main_sql.render(**kwargs))

def test_generate_query_simple():
    columns = ["a", "b"]
    res = generate_query(columns, "test")
    expect = reformat("select a, b from `test`")
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"

def generate_query(columns, table):
    """Generate a SQL query given column names.

    We construct a query that selects columns into nested structs. Naive
    selection of all the columns will strip the namespace from the columns.

    The legacy core and legacy event tables are converted as subsets of the
    metrics glean ping. There may be more than one row per client, but this
    matches the existing semantics of the metrics ping. We use this method over
    joining the core and legacy pings because of the non-overlapping nature of
    these two pings and difficulty in using coalesce with a deeply nested
    structure.
    """
    # Build a string that contains the selected columns. We take the set of
    # columns and split them up by namespace. Each namespace is put inside of a
    # STRUCT call. For example, foo.a and foo.b will be translated into a
    # `STRUCT(foo.a, foo.b) as foo` nested column.
    acc = ""
    # Maintain the last element in the columns to determine when a transition
    # must be made.
    prev = []
    # Iterate over the sorted set of columns. This ensures that columns are
    # grouped together correctly. Every time the column goes into a namespace,
    # we push an opening struct statement onto the string. Every time we
    # complete a nested struct, we close out the string by aliasing the struct
    # to the namespace.
    for col in sorted(columns):
        split = col.split(".")

        # check if we go deeper
        if len(split) > 1 and len(split) > len(prev):
            # the number of times to start nesting
            if len(prev) == 0:
                k = len(split) - 1
            else:
                k = len(split) - len(prev)
            acc += "struct(" * k

        # the two structs are different now, figure out how much we need to pop
        # off before we continue
        if len(split) > 1 and len(split) == len(prev):
            # find the common ancestor
            depth = 0
            for a, b in list(zip(split[:-1], prev[:-1])):
                if a != b:
                    break
                depth += 1
            # now pop off until we reach the ancestor
            for alias in reversed(prev[depth:-1]):
                acc = acc.rstrip(",")
                acc += f") as {alias},"
            # now enter the new struct
            acc += "struct(" * (len(split) - 1 - depth)

        # pop out of the struct
        if len(split) < len(prev):
            diff = len(prev) - len(split)
            # ignore the leaf
            prev.pop()
            for _ in range(diff):
                c = prev.pop()
                acc = acc.rstrip(",")
                acc += f") as {c},"

        acc += f"{col},"
        prev = split

    # clean up any columns
    if len(prev) > 1:
        prev.pop()
        for c in reversed(prev):
            acc = acc.rstrip(",")
            acc += f") as {c},"

    acc = acc.rstrip(",")
    return reformat(f"select {acc} from `{table}`")

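# Illustrative call (the table name is hypothetical); the shared `client_info`
# prefix is folded into a single STRUCT, mirroring the tests above:
#
#     generate_query(
#         ["client_info.app_build", "client_info.client_id", "document_id"],
#         "mozdata.tmp.example",
#     )
#     # equivalent to reformat(
#     #     "select struct(client_info.app_build, client_info.client_id) as client_info,"
#     #     " document_id from `mozdata.tmp.example`"
#     # )
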
def main(project, source_dataset, destination_dataset, create_table, backfill, dryrun):
    """Generate queries and optionally create the tables in BigQuery."""
    client = bigquery.Client(project=project)

    exported_tables = [
        table.table_id
        for table in client.list_tables(source_dataset)
        if table.table_type == "TABLE"
    ]

    tables_by_dimension = defaultdict(list)
    opt_in_metrics = set()

    # group table names by the dimension they are grouped by
    for table_name in exported_tables:
        if table_name.endswith("_total"):
            dimension = None
        else:
            metric, dimension = table_name.split("_by_")
            if dimension.startswith("opt_in"):
                opt_in_metrics.add(metric)
                dimension = dimension.replace("opt_in_", "")
        tables_by_dimension[dimension].append(table_name)

    for dimension, table_names in tables_by_dimension.items():
        qualified_table_names = [
            f"`{project}.{source_dataset}.{table_name}`" for table_name in table_names
        ]

        if dimension is not None:
            fields = f"date, app_name, {dimension}"
            table_name = f"metrics_by_{dimension}"
            metrics = [table_name.split("_by_")[0] for table_name in table_names]
        else:
            fields = "date, app_name"
            table_name = "metrics_total"
            metrics = [table_name.split("_total")[0] for table_name in table_names]

        join_clauses = [
            JOIN_TEMPLATE.format(table=table_name, fields=fields)
            for table_name in qualified_table_names[1:]
        ]

        # add _opt_in to opt-in metrics
        fields_to_add_opt_in = [
            metric for metric in metrics if metric in opt_in_metrics
        ]
        excepted_fields = ",".join(fields_to_add_opt_in)
        additional_fields = [
            f"{name} AS {name}_opt_in"
            for name in fields_to_add_opt_in
            if name != "rate"
        ]

        # rename rate column to opt_in_rate
        if "rate" in metrics:
            additional_fields.append("rate AS opt_in_rate")

        query_text = QUERY_TEMPLATE.format(
            excepted_fields=excepted_fields,
            additional_fields=", ".join(additional_fields),
            first_table=qualified_table_names[0],
            joined_tables="\n".join(join_clauses),
            filter="date=@date",
        )

        query_path = os.path.join(
            SQL_DIR, destination_dataset, table_name, "query.sql"
        )

        if not os.path.exists(os.path.dirname(query_path)):
            os.makedirs(os.path.dirname(query_path))

        with open(query_path, "w") as f:
            print(f"Writing {query_path}")
            f.write(reformat(query_text))
            f.write("\n")

        if create_table:
            query_text = QUERY_TEMPLATE.format(
                excepted_fields=excepted_fields,
                additional_fields=", ".join(additional_fields),
                first_table=qualified_table_names[0],
                joined_tables="\n".join(join_clauses),
                filter="TRUE" if backfill else "FALSE",
            )
            schema_update_options = (
                []
                if backfill
                else [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]
            )
            job_config = bigquery.QueryJobConfig(
                use_legacy_sql=False,
                dry_run=dryrun,
                destination=f"{project}.{destination_dataset}.{table_name}",
                schema_update_options=schema_update_options,
                time_partitioning=bigquery.TimePartitioning(field="date"),
                create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
                if backfill
                else bigquery.WriteDisposition.WRITE_APPEND,
            )
            print(f"Creating table {table_name}")
            query_job = client.query(query_text, job_config)
            if not dryrun:
                query_job.result()

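# Illustrative example of the grouping above, using hypothetical exported table
# names (the real source_dataset contents will differ):
#
#     exported_tables = [
#         "active_users_by_country", "downloads_total", "sessions_by_opt_in_os"
#     ]
#     # -> tables_by_dimension == {
#     #        "country": ["active_users_by_country"],
#     #        None: ["downloads_total"],
#     #        "os": ["sessions_by_opt_in_os"],
#     #    }
#     # -> opt_in_metrics == {"sessions"}
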
def test_generate_query_nested():
    columns = ["a", "b.c", "b.d"]
    res = generate_query(columns, "test")
    expect = reformat("select a, struct(b.c, b.d) as b from `test`")
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"

def render_main(
    **kwargs,
) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    main_sql = env.get_template("latest_versions_v1.sql")
    return reformat(main_sql.render(**kwargs))

def render_query(**kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    sql = env.get_template("bucket_counts_v1.sql")
    return reformat(sql.render(**kwargs))

def main():
    # get the most recent schema deploy (to the nearest 15 minutes)
    bq = bigquery.Client()
    label = bq.get_dataset("moz-fx-data-shared-prod.telemetry").labels.get(
        "schemas_build_id"
    )
    print(f"last deploy: {label}")

    # get the schema corresponding to the last commit
    commit_hash = label.split("_")[-1]
    schema_url = (
        "https://raw.githubusercontent.com/mozilla-services/mozilla-pipeline-schemas/"
        f"{commit_hash}/schemas/org-mozilla-ios-firefox/metrics/metrics.1.bq"
    )
    resp = requests.get(schema_url)
    schema = resp.json()
    column_summary = get_columns(schema)
    print(json.dumps(column_summary, indent=2))

    """
    The columns take on the following form:

        "root.additional_properties STRING",
        "root.client_info.android_sdk_version STRING",
        "root.client_info.app_build STRING",
        ...

    This needs further processing before we can query it via BigQuery.
    """
    bq = bigquery.Client()
    legacy_core = (
        "moz-fx-data-shared-prod.org_mozilla_ios_firefox_derived.legacy_mobile_core_v2"
    )
    legacy_event = (
        "moz-fx-data-shared-prod.org_mozilla_ios_firefox_derived.legacy_mobile_event_counts_v2"
    )
    update_schema(bq, legacy_core, schema)
    update_schema(bq, legacy_event, schema)

    # drop the leading "root." prefix from each column name
    stripped = [c.split()[0].replace("root.", "", 1) for c in column_summary]

    query_glean = generate_query(
        ['"glean" as telemetry_system', *stripped],
        "mozdata.org_mozilla_ios_firefox.metrics",
    )
    query_legacy_core = generate_query(
        [
            '"legacy" as telemetry_system',
            *[
                # replace submission date with _PARTITIONTIME
                "DATE(_PARTITIONTIME) as submission_date"
                if c == "submission_date"
                else c
                for c in stripped
            ],
        ],
        legacy_core,
    )
    query_legacy_events = generate_query(
        [
            '"legacy" as telemetry_system',
            *[
                # replace submission date with _PARTITIONTIME
                "DATE(_PARTITIONTIME) as submission_date"
                if c == "submission_date"
                else c
                for c in stripped
            ],
        ],
        legacy_event,
    )

    view_body = reformat(
        " UNION ALL ".join([query_glean, query_legacy_core, query_legacy_events])
    )
    print(view_body)

    view_id = "moz-fx-data-shared-prod.org_mozilla_ios_firefox.unified_metrics"
    try:
        bq.delete_table(bq.get_table(view_id))
    except exceptions.NotFound:
        pass

    view = bigquery.Table(view_id)
    view.view_query = view_body
    bq.create_table(view)
    print(f"updated view at {view_id}")