Example 1
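All four variants below assume imports along these lines — a sketch, since the snippets omit the module header; from_template and QueryType are helpers defined elsewhere in the bigquery_etl.glam package and are assumed to be in scope:

from argparse import ArgumentParser
from functools import partial
from pathlib import Path

from jinja2 import Environment, PackageLoader, TemplateNotFound
from jsonschema import validate  # used by the schema-validating variant below

from bigquery_etl.glam import models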
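This first variant generates the full set of GLAM ETL queries for a given app id prefix: it curries from_template into view/table/init helpers, then renders everything from latest_versions_v1 through the extract queries, using a hard-coded mapping from app id to build-date UDF.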
def main():
    """Generate GLAM ETL queries."""
    parser = ArgumentParser(description=main.__doc__)
    parser.add_argument("--prefix")
    parser.add_argument("--dataset", default="glam_etl")
    parser.add_argument("--sql-root", default="sql/")
    args = parser.parse_args()

    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))

    dataset_path = Path(args.sql_root) / args.dataset
    if not dataset_path.is_dir():
        raise NotADirectoryError(f"path to {dataset_path} not found")

    build_date_udf_mapping = dict(
        org_mozilla_fenix="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",
        org_mozilla_firefox="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",
        org_mozilla_firefox_beta="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",  # noqa
        org_mozilla_fennec_aurora="`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",  # noqa
    )
    if not build_date_udf_mapping.get(args.prefix):
        raise ValueError(f"build date udf for {args.prefix} was not found")

    # curry functions for convenience
    template = partial(
        from_template, environment=env, dataset_path=dataset_path, args=args
    )
    view = partial(template, QueryType.VIEW)
    table = partial(template, QueryType.TABLE)
    init = partial(template, QueryType.INIT)
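    # e.g. view("view_probe_counts_v1") is now shorthand for
    # from_template(QueryType.VIEW, "view_probe_counts_v1", environment=env,
    #               dataset_path=dataset_path, args=args)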

    [
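        # Bare list literal: each call runs for its side effect (presumably
        # writing the rendered query under dataset_path); the list itself is
        # discarded.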
        table(
            "latest_versions_v1",
            **dict(source_table=f"{args.prefix}_stable.baseline_v1"),
        ),
        init(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
                ),
            ),
        ),
        table(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
                ),
            ),
        ),
        init(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "scalar_bucket_counts_v1",
            **models.scalar_bucket_counts(
                source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
            ),
        ),
        table(
            "histogram_bucket_counts_v1",
            **models.histogram_bucket_counts(
                source_table=f"glam_etl.{args.prefix}__clients_histogram_aggregates_v1"
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="scalar",
            **models.probe_counts(
                source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1",
                is_scalar=True,
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="histogram",
            **models.probe_counts(
                source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1",
                is_scalar=False,
            ),
        ),
        table(
            "scalar_percentiles_v1",
            **models.scalar_percentiles(
                source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
            ),
        ),
        view("view_clients_daily_scalar_aggregates_v1"),
        view("view_clients_daily_histogram_aggregates_v1"),
        table("histogram_percentiles_v1"),
        view("view_probe_counts_v1"),
        view("view_user_counts_v1", **models.user_counts()),
        table(
            "extract_user_counts_v1", build_date_udf=build_date_udf_mapping[args.prefix]
        ),
        table(
            "extract_probe_counts_v1",
            build_date_udf=build_date_udf_mapping[args.prefix],
        ),
    ]
Example 2
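This variant adds a --daily-view-only flag, renders the daily views from logical_app_id/ templates when the prefix is a logical app id (falling back to the plain templates otherwise), and derives the build-date UDF mapping from a list of supported Fenix app ids.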
def main():
    """Generate GLAM ETL queries."""
    parser = ArgumentParser(description=main.__doc__)
    parser.add_argument("--prefix")
    parser.add_argument("--dataset", default="glam_etl")
    parser.add_argument("--sql-root", default="sql/")
    parser.add_argument("--daily-view-only",
                        action="store_true",
                        default=False)
    args = parser.parse_args()

    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))

    dataset_path = Path(args.sql_root) / args.dataset
    if not dataset_path.is_dir():
        raise NotADirectoryError(f"path to {dataset_path} not found")

    # curry functions for convenience
    template = partial(from_template,
                       environment=env,
                       dataset_path=dataset_path,
                       args=args)
    view = partial(template, QueryType.VIEW)
    table = partial(template, QueryType.TABLE)
    init = partial(template, QueryType.INIT)

    # If this is a logical app id, generate the view for it. Then assert that
    # the daily view for the app exists. This assumes that both scalar and
    # histogram aggregates exist; it will break down when a glean app contains
    # only one of the scalar or histogram views.
    for daily_view in [
            "view_clients_daily_scalar_aggregates_v1",
            "view_clients_daily_histogram_aggregates_v1",
    ]:
        try:
            view(f"logical_app_id/{args.prefix}__{daily_view}")
        except TemplateNotFound:
            print(f"{args.prefix} is not a logical app id")
            # generate the view for the app id directly
            view(daily_view)

        if not (dataset_path / f"{args.prefix}__{daily_view}").is_dir():
            raise ValueError(f"missing {daily_view}")

    # exit early if we're only generating a daily view
    if args.daily_view_only:
        return

    # Supported Fenix / Firefox for Android products. These are logical app
    # ids formed from the union of several app_ids (sometimes across date
    # boundaries).
    fenix_app_ids = [
        "org_mozilla_fenix_glam_nightly",
        "org_mozilla_fenix_glam_beta",
        "org_mozilla_fenix_glam_release",
    ]

    build_date_udf_mapping = dict(
        **{
            app_id: "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime"
            for app_id in fenix_app_ids
        })
    if not build_date_udf_mapping.get(args.prefix):
        raise ValueError(f"build date udf for {args.prefix} was not found")

    [
        table(
            "latest_versions_v1",
            **dict(source_table=(
                f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
            )),
        ),
        init(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
                ),
            ),
        ),
        table(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
                ),
            ),
        ),
        init(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "scalar_bucket_counts_v1",
            **models.scalar_bucket_counts(
                source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
            ),
        ),
        table(
            "histogram_bucket_counts_v1",
            **models.histogram_bucket_counts(
                source_table=f"glam_etl.{args.prefix}__clients_histogram_aggregates_v1"
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="scalar",
            **models.probe_counts(
                source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1",
                is_scalar=True,
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="histogram",
            **models.probe_counts(
                source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1",
                is_scalar=False,
            ),
        ),
        table(
            "scalar_percentiles_v1",
            **models.scalar_percentiles(
                source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
            ),
        ),
        table("histogram_percentiles_v1"),
        view("view_probe_counts_v1"),
        view("view_user_counts_v1", **models.user_counts()),
        table("extract_user_counts_v1",
              build_date_udf=build_date_udf_mapping[args.prefix]),
        table(
            "extract_probe_counts_v1",
            build_date_udf=build_date_udf_mapping[args.prefix],
        ),
    ]
Example 3
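This variant replaces the flat UDF mapping with a per-app config dict validated against a JSON schema (build_date_udf, filter_version, num_versions_to_keep), threads that config into the aggregate models and the extract queries, and changes the --sql-root default to sql/moz-fx-data-shared-prod/. A short sketch of the schema validation follows the example.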
def main():
    """Generate GLAM ETL queries."""
    parser = ArgumentParser(description=main.__doc__)
    parser.add_argument("--prefix")
    parser.add_argument("--dataset", default="glam_etl")
    parser.add_argument("--sql-root", default="sql/moz-fx-data-shared-prod/")
    parser.add_argument("--daily-view-only",
                        action="store_true",
                        default=False)
    args = parser.parse_args()

    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))

    dataset_path = Path(args.sql_root) / args.dataset
    if not dataset_path.is_dir():
        raise NotADirectoryError(f"path to {dataset_path} not found")

    # curry functions for convenience
    template = partial(from_template,
                       environment=env,
                       dataset_path=dataset_path,
                       args=args)
    view = partial(template, QueryType.VIEW)
    table = partial(template, QueryType.TABLE)
    init = partial(template, QueryType.INIT)

    # If this is a logical app id, generate the view for it. Then assert that
    # the daily view for the app exists. This assumes that both scalar and
    # histogram aggregates exist; it will break down when a glean app contains
    # only one of the scalar or histogram views.
    for daily_view in [
            "view_clients_daily_scalar_aggregates_v1",
            "view_clients_daily_histogram_aggregates_v1",
    ]:
        try:
            view(f"logical_app_id/{args.prefix}__{daily_view}")
        except TemplateNotFound:
            print(f"{args.prefix} is not a logical app id")
            # generate the view for the app id directly
            view(daily_view)

        if not (dataset_path / f"{args.prefix}__{daily_view}").is_dir():
            raise ValueError(f"missing {daily_view}")

    # exit early if we're only generating a daily view
    if args.daily_view_only:
        return

    # Supported Fenix / Firefox for Android products. These are logical app
    # ids formed from the union of several app_ids (sometimes across date
    # boundaries).
    config_schema = {
        "type": "object",
        "additionalProperties": {
            "type": "object",
            "properties": {
                "build_date_udf": {
                    "type": "string",
                    "description": (
                        "The fully qualified path to a UDF that accepts the "
                        "client_info.app_build_id field and returns a datetime."
                    ),
                },
                "filter_version": {
                    "type": "boolean",
                    "description": (
                        "Whether the integer extracted from the "
                        "client_info.app_display_version should be used to "
                        "filter incremental aggregates."
                    ),
                },
                "num_versions_to_keep": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "The number of versions to keep.",
                },
            },
            "required": ["build_date_udf", "filter_version", "num_versions_to_keep"],
        },
    }
    config = {
        "org_mozilla_fenix_glam_nightly": {
            "build_date_udf": "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",
            "filter_version": False,
            # this value is ignored due to filter version
            "num_versions_to_keep": 1000,
        },
        "org_mozilla_fenix_glam_beta": {
            "build_date_udf": "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",
            "filter_version": True,
            "num_versions_to_keep": 2,
        },
        "org_mozilla_fenix_glam_release": {
            "build_date_udf": "`moz-fx-data-shared-prod`.udf.fenix_build_to_datetime",
            "filter_version": True,
            "num_versions_to_keep": 2,
        },
    }
    validate(instance=config, schema=config_schema)

    if not config.get(args.prefix, {}).get("build_date_udf"):
        raise ValueError(f"build date udf for {args.prefix} was not found")

    [
        table(
            "latest_versions_v1",
            **dict(source_table=(
                f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
            )),
        ),
        init(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
                ),
            ),
        ),
        table(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}__view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
                ),
                **config[args.prefix],
            ),
        ),
        init(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True,
                                                  **config[args.prefix]),
        ),
        table(
            "scalar_bucket_counts_v1",
            **models.scalar_bucket_counts(
                source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
            ),
        ),
        table(
            "histogram_bucket_counts_v1",
            **models.histogram_bucket_counts(
                source_table=f"glam_etl.{args.prefix}__clients_histogram_aggregates_v1"
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="scalar",
            **models.probe_counts(
                source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1",
                is_scalar=True,
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="histogram",
            **models.probe_counts(
                source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1",
                is_scalar=False,
            ),
        ),
        table(
            "scalar_percentiles_v1",
            **models.scalar_percentiles(
                source_table=f"glam_etl.{args.prefix}__clients_scalar_aggregates_v1"
            ),
        ),
        table("histogram_percentiles_v1"),
        view("view_probe_counts_v1"),
        view("view_user_counts_v1", **models.user_counts()),
        table("extract_user_counts_v1", **config[args.prefix]),
        table("extract_probe_counts_v1", **config[args.prefix]),
    ]
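As an aside, here is a minimal sketch of how the validate call above behaves on a malformed config. config_schema refers to the schema defined in Example 3; the "app" entry and its values are hypothetical, made up for illustration:

from jsonschema import ValidationError, validate

# "filter_version" is missing, so validation fails against config_schema.
bad_config = {
    "app": {
        "build_date_udf": "`project`.udf.example",  # hypothetical UDF path
        "num_versions_to_keep": 2,
    }
}
try:
    validate(instance=bad_config, schema=config_schema)
except ValidationError as err:
    print(err.message)  # 'filter_version' is a required property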
Example 4
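This variant appears to be the oldest of the four: the prefix defaults to fenix, generated table names use a single-underscore separator (fenix_clients_scalar_aggregates_v1 rather than the double underscore above), several source tables are hard-coded to the fenix dataset, and the extract queries take no build-date UDF.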
def main():
    """Generate GLAM ETL queries."""
    parser = ArgumentParser(description=main.__doc__)
    parser.add_argument("--prefix", default="fenix")
    parser.add_argument("--dataset", default="glam_etl")
    parser.add_argument("--sql-root", default="sql/")
    args = parser.parse_args()

    env = Environment(loader=PackageLoader("bigquery_etl", "glam/templates"))

    dataset_path = Path(args.sql_root) / args.dataset
    if not dataset_path.is_dir():
        raise NotADirectoryError(f"path to {dataset_path} not found")

    # curry functions for convenience
    template = partial(from_template,
                       environment=env,
                       dataset_path=dataset_path,
                       args=args)
    view = partial(template, QueryType.VIEW)
    table = partial(template, QueryType.TABLE)
    init = partial(template, QueryType.INIT)

    [
        table(
            "latest_versions_v1",
            **dict(source_table="org_mozilla_fenix_stable.baseline_v1"),
        ),
        init(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}_view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}_clients_scalar_aggregates_v1"
                ),
            ),
        ),
        table(
            "clients_scalar_aggregates_v1",
            **models.clients_scalar_aggregates(
                source_table=(
                    f"glam_etl.{args.prefix}_view_clients_daily_scalar_aggregates_v1"
                ),
                destination_table=(
                    f"glam_etl.{args.prefix}_clients_scalar_aggregates_v1"
                ),
            ),
        ),
        init(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "clients_histogram_aggregates_v1",
            **models.clients_histogram_aggregates(parameterize=True),
        ),
        table(
            "clients_scalar_bucket_counts_v1",
            **models.clients_scalar_bucket_counts(
                source_table="glam_etl.fenix_clients_scalar_aggregates_v1"),
        ),
        table(
            "clients_histogram_bucket_counts_v1",
            **models.clients_histogram_bucket_counts(),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="clients_scalar",
            **models.probe_counts(
                source_table="glam_etl.fenix_clients_scalar_bucket_counts_v1",
                is_scalar=True,
            ),
        ),
        table(
            "probe_counts_v1",
            query_name_prefix="clients_histogram",
            **models.probe_counts(
                source_table="glam_etl.fenix_clients_histogram_bucket_counts_v1",
                is_scalar=False,
            ),
        ),
        table(
            "scalar_percentiles_v1",
            **models.scalar_percentiles(
                source_table="glam_etl.fenix_clients_scalar_aggregates_v1"),
        ),
        view("view_clients_daily_scalar_aggregates_v1"),
        view("view_clients_daily_histogram_aggregates_v1"),
        table("histogram_percentiles_v1"),
        view("view_probe_counts_v1"),
        view("view_user_counts_v1", **models.user_counts()),
        table("extract_user_counts_v1"),
        table("extract_probe_counts_v1"),
    ]