Example #1
def format(paths, check=False):
    """Format SQL files."""
    if not paths:
        if sys.stdin.isatty():
            parser.print_help()
            print("Error: must specify PATH or provide input via stdin")
            sys.exit(255)
        query = sys.stdin.read()
        formatted = reformat(query, trailing_newline=True)
        if not check:
            print(formatted, end="")
        if check and query != formatted:
            sys.exit(1)
    else:
        sql_files = []
        for path in paths:
            if os.path.isdir(path):
                sql_files.extend(
                    filepath
                    for dirpath, _, filenames in os.walk(path)
                    for filename in filenames
                    if filename.endswith(".sql")
                    # skip tests/**/input.sql
                    and not (path.startswith("tests") and filename == "input.sql")
                    for filepath in [os.path.join(dirpath, filename)]
                    if filepath not in SKIP
                )
            elif path:
                sql_files.append(path)
        if not sql_files:
            print("Error: no files were found to format")
            sys.exit(255)
        sql_files.sort()
        reformatted = unchanged = 0
        for path in sql_files:
            with open(path) as fp:
                query = fp.read()
            formatted = reformat(query, trailing_newline=True)
            if query != formatted:
                if check:
                    print(f"would reformat {path}")
                else:
                    with open(path, "w") as fp:
                        fp.write(formatted)
                    print(f"reformatted {path}")
                reformatted += 1
            else:
                unchanged += 1
        print(", ".join(f"{number} file{'s' if number > 1 else ''}"
                        f"{' would be' if check else ''} {msg}"
                        for number, msg in [
                            (reformatted, "reformatted"),
                            (unchanged, "left unchanged"),
                        ] if number > 0) + ".")
        if check and reformatted:
            sys.exit(1)
Example #2
def generate_queries(project, path, write_dir):
    """Generate experiment monitoring views."""
    with open(Path(path) / "templating.yaml", "r") as f:
        template_config = yaml.safe_load(f) or {}

    for query, args in template_config["queries"].items():
        template_query_dir = FILE_PATH / "templates" / query
        env = Environment(
            loader=FileSystemLoader(template_query_dir),
            keep_trailing_newline=True,
        )
        sql_templates = list(template_query_dir.glob("*.sql"))
        sql_template_file = sql_templates[0].name
        sql_template = env.get_template(sql_template_file)
        metadata_template = env.get_template("metadata.yaml")

        args["destination_table"] = query
        args["search_metrics"] = template_config["search_metrics"]

        if args["per_app"]:
            # generate a separate query for each application dataset
            for dataset in template_config["applications"]:
                args["dataset"] = dataset

                write_sql(
                    write_dir / project,
                    f"{project}.{dataset}_derived.{query}",
                    sql_template_file,
                    reformat(sql_template.render(**args)),
                )

                write_path = Path(write_dir) / project / (dataset +
                                                          "_derived") / query
                (write_path / "metadata.yaml").write_text(
                    metadata_template.render(**args))
        else:
            # generate a single query that UNIONs application datasets
            # these queries are written to `telemetry`
            args["applications"] = template_config["applications"]

            write_sql(
                write_dir / project,
                f"{project}.telemetry_derived.{query}",
                sql_template_file,
                reformat(sql_template.render(**args)),
            )

            write_path = Path(
                write_dir) / project / "telemetry_derived" / query
            (write_path / "metadata.yaml").write_text(
                metadata_template.render(**args))
Example #3
def main(argv, out=print):
    """Print a clients_daily_scalar_aggregates query to stdout."""
    opts = vars(p.parse_args(argv[1:]))
    sql_string = ""

    if opts["agg_type"] in ("scalars", "keyed_scalars", "keyed_booleans"):
        scalar_type = (opts["agg_type"] if
                       (opts["agg_type"] == "scalars") else "keyed_scalars")
        scalar_probes = get_scalar_probes(scalar_type)
        sql_string = get_scalar_probes_sql_strings(scalar_probes,
                                                   opts["agg_type"])
    else:
        raise ValueError(
            "agg-type must be one of scalars, keyed_scalars, keyed_booleans")

    sleep(opts['wait_seconds'])
    out(
        reformat(
            generate_sql(
                opts["agg_type"],
                sql_string["probes_string"],
                sql_string.get("additional_queries", ""),
                sql_string.get("additional_partitions", ""),
                sql_string["select_clause"],
                sql_string.get("querying_table", "filtered"),
                opts["json_output"],
            )))
Example #4
    def create(cls, project, dataset, name, sql_dir, base_table=None):
        """
        Create a new empty view from a template.

        Use `base_table` in view definition, if provided.
        """
        path = Path(sql_dir) / project / dataset / name / "view.sql"
        dataset_path = path.parent.parent

        if not dataset_path.exists():
            # create new dataset with dataset metadata
            path.parent.mkdir(parents=True)
            dataset_metadata = DatasetMetadata(
                friendly_name=string.capwords(dataset),
                description="Please provide a dataset description.",
                dataset_base_acl="view",
                user_facing=True,
            )
            dataset_metadata.write(dataset_path / DATASET_METADATA_FILE)
        else:
            path.parent.mkdir(parents=True, exist_ok=True)

        if not base_table:
            base_table = f"{project}.{dataset}_derived.{name}_v1"

        path.write_text(
            reformat(
                f"""
                CREATE OR REPLACE VIEW `{project}.{dataset}.{name}` AS
                SELECT * FROM `{base_table}`
                """
            )
            + "\n"
        )
        return cls(path, name, dataset, project)
Example #5
def render_main(
    header: str,
    user_data_type: str,
    user_data_attributes: List[str],
    attributes: List[str],
    extract_select_clause: str,
    join_filter: str,
    source_table: str,
    destination_table: str,
    **kwargs,
) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    main_sql = env.get_template("clients_scalar_aggregates_v1.sql")
    return reformat(
        main_sql.render(
            header=header,
            user_data_type=user_data_type,
            user_data_attributes=",".join(user_data_attributes),
            attributes=",".join(attributes),
            attributes_list=attributes,
            extract_select_clause=extract_select_clause,
            join_filter=join_filter,
            source_table=source_table,
            destination_table=destination_table,
        )
    )
Example #6
def from_template(
    query_type: QueryType,
    template_name: str,
    environment: Environment,
    args: Namespace,
    dataset_path: Path,
    query_name_prefix=None,
    **kwargs,
) -> TemplateResult:
    """Fill in templates and write them to disk."""
    if query_type == QueryType.INIT:
        template = environment.get_template(f"{template_name}.init.sql")
    else:
        template = environment.get_template(f"{template_name}.sql")

    if query_name_prefix:
        table_id = f"{args.prefix}__{query_name_prefix}_{template_name}"
    else:
        table_id = f"{args.prefix}__{template_name}"

    # replaces the header, if it exists
    kwargs["header"] = f"-- {query_type} for {table_id};"

    # create the directory for the view
    (dataset_path / table_id).mkdir(exist_ok=True)
    view_path = dataset_path / table_id / f"{query_type}.sql"

    # write the query with appropriate variables
    query_text = reformat(template.render(**{**vars(args), **kwargs}))

    print(f"generated {view_path}")
    with view_path.open("w") as fp:
        print(query_text, file=fp)

    return TemplateResult(table_id, query_type, query_text)
Example #7
def main(argv, out=print):
    """Print a clients_daily_histogram_aggregates query to stdout."""
    opts = vars(p.parse_args(argv[1:]))
    sql_string = ""

    if opts["agg_type"] in ("histograms", "keyed_histograms"):
        probes_and_buckets = get_histogram_probes_and_buckets(opts["agg_type"], opts["processes"])
        sql_string = get_histogram_probes_sql_strings(
            probes_and_buckets, opts["agg_type"]
        )
    else:
        raise ValueError("agg-type must be one of histograms, keyed_histograms")

    sleep(opts['wait_seconds'])
    out(
        reformat(
            generate_sql(
                opts,
                sql_string.get("additional_queries", ""),
                sql_string["windowed_clause"],
                sql_string["select_clause"],
                opts["json_output"],
            )
        )
    )
Example #8
def render(sql_filename, format=True, **kwargs) -> str:
    """Render a given template query using Jinja."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glean_usage/templates"))
    main_sql = env.get_template(sql_filename)
    rendered = main_sql.render(**kwargs)
    if format:
        rendered = reformat(rendered)
    return rendered
Example #9
    def runtest(self):
        """Run."""
        with open(self.fspath) as fp:
            expect = fp.read()
        try:
            with open(f"{self.fspath.dirname}/input.sql") as fp:
                query = fp.read()
        except FileNotFoundError:
            query = expect
        assert reformat(query) + "\n" == expect
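The same check can be run outside the pytest plugin; below is a minimal sketch, assuming `reformat` is importable from `bigquery_etl.format_sql.formatter` and that each test-case directory holds an expected-output SQL file plus an optional input.sql (the file names are assumptions, not taken from the example):

# Minimal stand-alone version of the check in runtest() above.
from pathlib import Path

from bigquery_etl.format_sql.formatter import reformat  # assumed import path


def check_formatting(expect_file: Path) -> bool:
    """Return True if reformatting the input reproduces the expected file."""
    expect = expect_file.read_text()
    input_file = expect_file.parent / "input.sql"
    # fall back to the expected file itself when there is no input.sql
    query = input_file.read_text() if input_file.exists() else expect
    return reformat(query) + "\n" == expect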
Example #10
def generate_query(project, dataset, destination_table, write_dir):
    """Generate feature usage table query."""
    with open(TEMPLATE_CONFIG, "r") as f:
        render_kwargs = yaml.safe_load(f) or {}
    env = Environment(loader=FileSystemLoader(str(FILE_PATH / "templates")))
    template = env.get_template("query.sql")

    write_sql(
        write_dir / project,
        f"{project}.{dataset}.{destination_table}",
        "query.sql",
        reformat(template.render(**render_kwargs)),
    )
Example #11
def test_generate_query_nested_deep_uneven():
    columns = ["a.b.c.d", "a.b.e"]
    res = generate_query(columns, "test")
    expect = reformat("""
    select struct(struct(
            struct(
                a.b.c.d
            ) as c,
            a.b.e
        ) as b
    ) as a
    from `test`
    """)
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"
Example #12
def test_generate_query_nested_deep_skip():
    columns = ["b.c.e", "b.d.f"]
    res = generate_query(columns, "test")
    expect = reformat("""
    select struct(
        struct(
            b.c.e
        ) as c,
        struct(
            b.d.f
        ) as d
    ) as b
    from `test`
    """)
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"
Example #13
def generate_view(project, dataset, destination_table, write_dir):
    """Generate feature usage table view."""
    view_name = destination_table.split("_v")[0]
    view_dataset = dataset.split("_derived")[0]

    sql = reformat(f"""
        CREATE OR REPLACE VIEW `{project}.{view_dataset}.{view_name}` AS
        SELECT
            *
        FROM
            `{project}.{dataset}.{destination_table}`
    """)

    write_sql(write_dir / project, f"{project}.{view_dataset}.{view_name}",
              "view.sql", sql)
Example #14
def render_query(attributes: List[str], **kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    sql = env.get_template("scalar_percentiles_v1.sql")

    max_combinations = len(attributes) + 1
    attribute_combinations = []
    for subset_size in reversed(range(max_combinations)):
        for grouping in combinations(attributes, subset_size):
            select_expr = []
            for attribute in attributes:
                select_expr.append((attribute, attribute in grouping))
            attribute_combinations.append(select_expr)

    return reformat(
        sql.render(attribute_combinations=attribute_combinations, **kwargs))
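The nested loop over `combinations` produces one row per attribute subset, largest subsets first; a small illustration with hypothetical attribute names shows the structure handed to the template:

# Illustration of the attribute_combinations structure built above,
# using made-up attribute names.
from itertools import combinations

attributes = ["ping_type", "os", "app_version"]
attribute_combinations = []
for subset_size in reversed(range(len(attributes) + 1)):
    for grouping in combinations(attributes, subset_size):
        attribute_combinations.append(
            [(attribute, attribute in grouping) for attribute in attributes]
        )

# The first entry keeps every attribute and the last keeps none:
# [("ping_type", True), ("os", True), ("app_version", True)]
# ...
# [("ping_type", False), ("os", False), ("app_version", False)]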
Example #15
def test_generate_query_nested_deep_anscestor_shared_descendent_names():
    columns = ["a.b.c.d", "a.f.c.g"]
    res = generate_query(columns, "test")
    expect = reformat("""
    select
    struct(
        struct(struct(
            a.b.c.d
        ) as c) as b,
        struct(struct(
            a.f.c.g
        ) as c) as f
    ) as a
    from `test`
    """)
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"
Example #16
def main():
    """Generate mobile search clients daily query and print to stdout."""
    base_dir = Path(__file__).parent

    env = Environment(loader=FileSystemLoader(base_dir / "templates"))

    android_query_template = env.get_template("fenix_metrics.template.sql")
    ios_query_template = env.get_template("ios_metrics.template.sql")

    queries = [
        android_query_template.render(
            namespace=app_channel[0],
            app_name=app_channel[1],
            channel=app_channel[2],
        )
        if app_channel[3] == "android"
        else ios_query_template.render(
            namespace=app_channel[0],
            app_name=app_channel[1],
            channel=app_channel[2],
        )
        for app_channel in APP_CHANNEL_TUPLES
    ]

    search_query_template = env.get_template(
        "mobile_search_clients_daily.template.sql")

    fenix_combined_baseline = union_statements([
        f"SELECT * FROM baseline_{namespace}"
        for namespace, _, _, platform in APP_CHANNEL_TUPLES
        if platform == "android"
    ])
    fenix_combined_metrics = union_statements([
        f"SELECT * FROM metrics_{namespace}"
        for namespace, _, _, platform in APP_CHANNEL_TUPLES
        if platform == "android"
    ])
    ios_combined_metrics = union_statements([
        f"SELECT * FROM metrics_{namespace}"
        for namespace, _, _, platform in APP_CHANNEL_TUPLES
        if platform == "ios"
    ])

    search_query = search_query_template.render(
        baseline_and_metrics_by_namespace="\n".join(queries),
        fenix_baseline=fenix_combined_baseline,
        fenix_metrics=fenix_combined_metrics,
        ios_metrics=ios_combined_metrics,
    )

    print(reformat(search_query))
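`union_statements` is not shown in this example; a plausible stand-in (an assumption, not the repository's actual implementation) simply joins the SELECT statements with UNION ALL:

# Hypothetical stand-in for the union_statements helper used above.
def union_statements(statements):
    """Join SELECT statements with UNION ALL."""
    return "\nUNION ALL\n".join(statements)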
Example #17
    def render(self, write_path, args):
        """Render this template at the specified write_path with the specified args."""
        fpath = write_path / self.name
        print(f"...Generating {str(fpath)}")

        write_path.mkdir(parents=True, exist_ok=True)

        if "header" not in args:
            args[
                "header"
            ] = "Generated by bigquery_etl/events_daily/generate_queries.py"

        text = self._get_comment_char(fpath.suffix) + args["header"] + "\n\n"
        text += self.env.get_template(self.name).render(**args)

        if fpath.suffix == ".sql":
            text = reformat(text, trailing_newline=True)

        (write_path / self.name).write_text(text)
Example #18
def render_query(attributes: List[str], **kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    sql = env.get_template("probe_counts_v1.sql")

    # If the set of attributes grows, max_combinations can be set to only
    # compute a shallow set of subsets for less query complexity
    max_combinations = len(attributes)
    attribute_combinations = []
    for subset_size in reversed(range(max_combinations + 1)):
        for grouping in combinations(attributes, subset_size):
            # channel and app_version are required in the GLAM frontend
            if "channel" not in grouping or "app_version" not in grouping:
                continue
            select_expr = []
            for attribute in attributes:
                select_expr.append((attribute, attribute in grouping))
            attribute_combinations.append(select_expr)

    return reformat(sql.render(attribute_combinations=attribute_combinations, **kwargs))
Example #19
def test_generate_query_nested_deep():
    columns = ["a.b", "a.c", "a.d.x.y.e", "a.d.x.y.f", "g"]
    res = generate_query(columns, "test")
    expect = reformat("""
        select struct(
            a.b,
            a.c,
            struct(
                struct(
                    struct(
                        a.d.x.y.e,
                        a.d.x.y.f
                    ) as y
                ) as x
            ) as d
        ) as a,
        g
        from `test`
    """)
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"
Example #20
def render_init(
    header,
    destination_table,
    attributes,
    attributes_type,
    user_data_type,
    partition_clause,
    **kwargs,
) -> str:
    """Render the table initialization DML for partitioning and clustering."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    init_sql = env.get_template("clients_scalar_aggregates_v1.init.sql")
    return reformat(
        init_sql.render(
            header=header,
            destination_table=destination_table,
            attributes_type=",".join(
                f"{name} {dtype}" for name, dtype in zip(attributes, attributes_type)
            ),
            user_data_type=user_data_type,
            partition_clause=partition_clause,
        )
    )
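For reference, the `attributes_type` argument zips attribute names with their column types into a single string; a tiny sketch with hypothetical values:

# Hedged illustration of the attributes_type string rendered above.
attributes = ["sample_id", "client_id"]  # hypothetical names
attributes_type = ["INT64", "STRING"]  # hypothetical BigQuery types
print(",".join(
    f"{name} {dtype}" for name, dtype in zip(attributes, attributes_type)
))
# -> sample_id INT64,client_id STRING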
Example #21
def write_view_if_not_exists(target_project: str, sql_dir: Path,
                             schema: SchemaFile):
    """If a view.sql does not already exist, write one to the target directory."""
    target_dir = (sql_dir / target_project / schema.bq_dataset_family /
                  schema.bq_table_unversioned)
    target_file = target_dir / "view.sql"

    if target_file.exists():
        return

    # Exclude doctypes maintained in separate projects.
    for prefix in SKIP_PREFIXES:
        if schema.bq_dataset_family.startswith(prefix):
            return

    full_source_id = f"{target_project}.{schema.stable_table}"
    full_view_id = f"{target_project}.{schema.user_facing_view}"
    replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
    if schema.schema_id == "moz://mozilla.org/schemas/glean/ping/1":
        replacements += ["mozfun.norm.glean_ping_info(ping_info) AS ping_info"]
        if schema.bq_table == "baseline_v1":
            replacements += [
                "mozfun.norm.glean_baseline_client_info"
                "(client_info, metrics) AS client_info"
            ]
        if (schema.bq_dataset_family == "org_mozilla_fenix"
                and schema.bq_table == "metrics_v1"):
            # todo: use mozfun udfs
            replacements += [
                "mozdata.udf.normalize_fenix_metrics"
                "(client_info.telemetry_sdk_build, metrics)"
                " AS metrics"
            ]
        if schema.bq_dataset_family == "firefox_desktop":
            # FOG does not provide an app_name, so we inject the one that
            # people already associate with desktop Firefox per bug 1672191.
            replacements += [
                "'Firefox' AS normalized_app_name",
            ]
    elif schema.schema_id.startswith("moz://mozilla.org/schemas/main/ping/"):
        replacements += [
            "mozdata.udf.normalize_main_payload(payload) AS payload"
        ]
    replacements_str = ",\n    ".join(replacements)
    full_sql = reformat(
        VIEW_QUERY_TEMPLATE.format(
            target=full_source_id,
            replacements=replacements_str,
            full_view_id=full_view_id,
        ))
    print(f"Creating {target_file}")
    target_dir.mkdir(parents=True, exist_ok=True)
    with target_file.open("w") as f:
        f.write(full_sql)
    metadata_content = VIEW_METADATA_TEMPLATE.format(
        document_namespace=schema.document_namespace,
        document_type=schema.document_type,
    )
    metadata_file = target_dir / "metadata.yaml"
    if not metadata_file.exists():
        with metadata_file.open("w") as f:
            f.write(metadata_content)
Example #22
def write_view_if_not_exists(target_project: str, sql_dir: Path,
                             schema: SchemaFile):
    """If a view.sql does not already exist, write one to the target directory."""
    target_dir = (sql_dir / target_project / schema.bq_dataset_family /
                  schema.bq_table_unversioned)

    target_file = target_dir / "view.sql"

    if target_file.exists():
        return

    full_source_id = f"{target_project}.{schema.stable_table}"
    full_view_id = f"{target_project}.{schema.user_facing_view}"
    replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
    if schema.schema_id == "moz://mozilla.org/schemas/glean/ping/1":
        replacements += ["mozfun.norm.glean_ping_info(ping_info) AS ping_info"]
        if schema.bq_table == "baseline_v1":
            replacements += [
                "mozfun.norm.glean_baseline_client_info"
                "(client_info, metrics) AS client_info"
            ]
        if (schema.bq_dataset_family == "org_mozilla_fenix"
                and schema.bq_table == "metrics_v1"):
            # todo: use mozfun udfs
            replacements += [
                "mozdata.udf.normalize_fenix_metrics"
                "(client_info.telemetry_sdk_build, metrics)"
                " AS metrics"
            ]
        if schema.bq_dataset_family == "firefox_desktop":
            # FOG does not provide an app_name, so we inject the one that
            # people already associate with desktop Firefox per bug 1672191.
            replacements += [
                "'Firefox' AS normalized_app_name",
            ]
    elif schema.schema_id.startswith("moz://mozilla.org/schemas/main/ping/"):
        replacements += [
            "mozdata.udf.normalize_main_payload(payload) AS payload"
        ]
    replacements_str = ",\n    ".join(replacements)
    full_sql = reformat(
        VIEW_QUERY_TEMPLATE.format(
            target=full_source_id,
            replacements=replacements_str,
            full_view_id=full_view_id,
        ))
    print(f"Creating {target_file}")
    target_dir.mkdir(parents=True, exist_ok=True)
    with target_file.open("w") as f:
        f.write(full_sql)
    metadata_content = VIEW_METADATA_TEMPLATE.format(
        document_namespace=schema.document_namespace,
        document_type=schema.document_type,
    )
    metadata_file = target_dir / "metadata.yaml"
    if not metadata_file.exists():
        with metadata_file.open("w") as f:
            f.write(metadata_content)

    # get view schema with descriptions
    try:
        content = VIEW_CREATE_REGEX.sub("", target_file.read_text())
        content += " WHERE DATE(submission_timestamp) = '2020-01-01'"
        view_schema = Schema.from_query_file(target_file, content=content)

        stable_table_schema = Schema.from_json({"fields": schema.schema})
        view_schema.merge(stable_table_schema, add_missing_fields=False)
        view_schema.to_yaml_file(target_dir / "schema.yaml")
    except Exception as e:
        print(f"Cannot generate schema.yaml for {target_file}: {e}")
Example #23
def render_main(**kwargs):
    """Create a SQL query for the clients_daily_scalar_aggregates dataset."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    main_sql = env.get_template("clients_daily_scalar_aggregates_v1.sql")
    return reformat(main_sql.render(**kwargs))
Example #24
def test_generate_query_simple():
    columns = ["a", "b"]
    res = generate_query(columns, "test")
    expect = reformat("select a, b from `test`")
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"
Example #25
def generate_query(columns, table):
    """Generate a SQL query given column names.

    We construct a query that selects columns into nested structs. Naive
    selection of all the columns will strip the namespace from the columns.

    The legacy core and legacy event tables are converted as subsets of the
    metrics glean ping. There may be more than one row per client, but this
    matches the existing semantics of the metrics ping. We use this method over
    joining the core and legacy pings because of the non-overlapping nature of
    these two pings and difficulty in using coalesce with a deeply nested
    structure.
    """

    # Build a string that contains the selected columns. We take the set of
    # columns and split them up by namespace. Each namespace is put inside of a
    # STRUCT call. For example, foo.a and foo.b will be translated into a
    # `STRUCT(foo.a, foo.b) as foo` nested column.
    acc = ""

    # Maintain the last element in the columns to determine when a transition
    # must be made.
    prev = []

    # Iterate over the sorted set of columns. This ensures that columns are
    # grouped together correctly. Every time the column goes into a namespace,
    # we push an opening struct statement onto the string. Every time we
    # complete nested struct, we close out the string by aliasing the struct to
    # the namespace.
    for col in sorted(columns):
        split = col.split(".")
        # check if we go deeper
        if len(split) > 1 and len(split) > len(prev):
            # the number of times to start nesting
            if len(prev) == 0:
                k = len(split) - 1
            else:
                k = len(split) - len(prev)
            acc += "struct(" * k
        # the two structs are different now, figure out how much we need to pop
        # off before we continue
        if len(split) > 1 and len(split) == len(prev):
            # find the common ancestor
            depth = 0
            for a, b in list(zip(split[:-1], prev[:-1])):
                if a != b:
                    break
                depth += 1
            # now pop off until we reach the ancestor
            for alias in reversed(prev[depth:-1]):
                acc = acc.rstrip(",")
                acc += f") as {alias},"
            # now enter the new struct
            acc += "struct(" * (len(split) - 1 - depth)
        # pop out of the struct
        if len(split) < len(prev):
            diff = len(prev) - len(split)
            # ignore the leaf
            prev.pop()
            for _ in range(diff):
                c = prev.pop()
                acc = acc.rstrip(",")
                acc += f") as {c},"
        acc += f"{col},"
        prev = split
    # clean up any columns
    if len(prev) > 1:
        prev.pop()
        for c in reversed(prev):
            acc = acc.rstrip(",")
            acc += f") as {c},"
    acc = acc.rstrip(",")

    return reformat(f"select {acc} from `{table}`")
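A quick usage sketch (the table name is hypothetical, and `reformat` must be importable); the result is the reformatted equivalent of the flat query shown in the simpler tests above:

# Hypothetical usage of generate_query().
print(generate_query(["a", "b.c", "b.d"], "my_project.my_dataset.my_table"))
# equivalent to reformat("select a, struct(b.c, b.d) as b from `my_project.my_dataset.my_table`")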
Example #26
def main(project, source_dataset, destination_dataset, create_table, backfill,
         dryrun):
    """Generate queries and optionally create the tables in BigQuery."""
    client = bigquery.Client(project=project)

    exported_tables = [
        table.table_id for table in client.list_tables(source_dataset)
        if table.table_type == "TABLE"
    ]

    tables_by_dimension = defaultdict(list)
    opt_in_metrics = set()

    # group table names by the dimension it is grouped by
    for table_name in exported_tables:
        if table_name.endswith("_total"):
            dimension = None
        else:
            metric, dimension = table_name.split("_by_")
            if dimension.startswith("opt_in"):
                opt_in_metrics.add(metric)
                dimension = dimension.replace("opt_in_", "")

        tables_by_dimension[dimension].append(table_name)

    for dimension, table_names in tables_by_dimension.items():
        qualified_table_names = [
            f"`{project}.{source_dataset}.{table_name}`"
            for table_name in table_names
        ]

        if dimension is not None:
            fields = f"date, app_name, {dimension}"
            table_name = f"metrics_by_{dimension}"
            metrics = [
                table_name.split("_by_")[0] for table_name in table_names
            ]
        else:
            fields = "date, app_name"
            table_name = "metrics_total"
            metrics = [
                table_name.split("_total")[0] for table_name in table_names
            ]

        join_clauses = [
            JOIN_TEMPLATE.format(table=table_name, fields=fields)
            for table_name in qualified_table_names[1:]
        ]

        # add _opt_in to opt-in metrics
        fields_to_add_opt_in = [
            metric for metric in metrics if metric in opt_in_metrics
        ]
        excepted_fields = ",".join(fields_to_add_opt_in)
        additional_fields = [
            f"{name} AS {name}_opt_in" for name in fields_to_add_opt_in
            if name != "rate"
        ]

        # rename rate column to opt_in_rate
        if "rate" in metrics:
            additional_fields.append("rate AS opt_in_rate")

        query_text = QUERY_TEMPLATE.format(
            excepted_fields=excepted_fields,
            additional_fields=", ".join(additional_fields),
            first_table=qualified_table_names[0],
            joined_tables="\n".join(join_clauses),
            filter="date=@date",
        )
        query_path = os.path.join(SQL_DIR, destination_dataset, table_name,
                                  "query.sql")

        if not os.path.exists(os.path.dirname(query_path)):
            os.makedirs(os.path.dirname(query_path))

        with open(query_path, "w") as f:
            print(f"Writing {query_path}")
            f.write(reformat(query_text))
            f.write("\n")

        if create_table:
            query_text = QUERY_TEMPLATE.format(
                excepted_fields=excepted_fields,
                additional_fields=", ".join(additional_fields),
                first_table=qualified_table_names[0],
                joined_tables="\n".join(join_clauses),
                filter="TRUE" if backfill else "FALSE",
            )
            schema_update_options = ([] if backfill else [
                bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
            ])
            job_config = bigquery.QueryJobConfig(
                use_legacy_sql=False,
                dry_run=dryrun,
                destination=f"{project}.{destination_dataset}.{table_name}",
                schema_update_options=schema_update_options,
                time_partitioning=bigquery.TimePartitioning(field="date"),
                create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
                if backfill else bigquery.WriteDisposition.WRITE_APPEND,
            )
            print(f"Creating table {table_name}")
            query_job = client.query(query_text, job_config)
            if not dryrun:
                query_job.result()
Example #27
def test_generate_query_nested():
    columns = ["a", "b.c", "b.d"]
    res = generate_query(columns, "test")
    expect = reformat("select a, struct(b.c, b.d) as b from `test`")
    assert res == expect, f"expected:\n{expect}\ngot:\n{res}"
Example #28
def render_main(**kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    main_sql = env.get_template("latest_versions_v1.sql")
    return reformat(main_sql.render(**kwargs))
Example #29
def render_query(**kwargs) -> str:
    """Render the main query."""
    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))
    sql = env.get_template("bucket_counts_v1.sql")
    return reformat(sql.render(**kwargs))
Example #30
def main():
    # get the most recent schema deploy (to the nearest 15 minutes)
    bq = bigquery.Client()
    label = bq.get_dataset("moz-fx-data-shared-prod.telemetry").labels.get(
        "schemas_build_id")
    print(f"last deploy: {label}")

    # get the schema corresponding to the last commit
    commit_hash = label.split("_")[-1]
    schema_url = (
        "https://raw.githubusercontent.com/mozilla-services/mozilla-pipeline-schemas/"
        f"{commit_hash}/schemas/org-mozilla-ios-firefox/metrics/metrics.1.bq")
    resp = requests.get(schema_url)
    schema = resp.json()
    column_summary = get_columns(schema)

    print(json.dumps(column_summary, indent=2))
    """
    The columns take on the following form:

    "root.additional_properties STRING",
    "root.client_info.android_sdk_version STRING",
    "root.client_info.app_build STRING",
    ...

    This will need to be processed yet again so we can query via bigquery
    """

    bq = bigquery.Client()
    legacy_core = (
        "moz-fx-data-shared-prod.org_mozilla_ios_firefox_derived.legacy_mobile_core_v2"
    )
    legacy_event = (
        "moz-fx-data-shared-prod.org_mozilla_ios_firefox_derived.legacy_mobile_event_counts_v2"
    )
    update_schema(bq, legacy_core, schema)
    update_schema(bq, legacy_event, schema)

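    # note: str.lstrip("root.") strips any leading characters from the set
    # {"r", "o", "t", "."}, not the literal prefix "root."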
    stripped = [c.split()[0].lstrip("root.") for c in column_summary]
    query_glean = generate_query(
        ['"glean" as telemetry_system', *stripped],
        "mozdata.org_mozilla_ios_firefox.metrics",
    )
    query_legacy_events = generate_query(
        [
            '"legacy" as telemetry_system',
            *[
                # replace submission date with _PARTITIONTIME
                "DATE(_PARTITIONTIME) as submission_date"
                if c == "submission_date" else c for c in stripped
            ],
        ],
        legacy_core,
    )
    query_legacy_core = generate_query(
        [
            '"legacy" as telemetry_system',
            *[
                # replace submission date with _PARTITIONTIME
                "DATE(_PARTITIONTIME) as submission_date"
                if c == "submission_date" else c for c in stripped
            ],
        ],
        legacy_event,
    )

    view_body = reformat(" UNION ALL ".join(
        [query_glean, query_legacy_core, query_legacy_events]))
    print(view_body)
    view_id = "moz-fx-data-shared-prod.org_mozilla_ios_firefox.unified_metrics"
    try:
        bq.delete_table(bq.get_table(view_id))
    except exceptions.NotFound:
        pass
    view = bigquery.Table(view_id)
    view.view_query = view_body
    bq.create_table(view)
    print(f"updated view at {view_id}")