Пример #1
0
    def test_merge_empty_schema(self):
        """Check merging with an empty schema in both directions.

        Merging an empty schema into a populated one must leave the populated
        schema as it was; merging the populated schema into the empty one must
        make the empty one serialize identically to the populated schema.
        """
        populated_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            - mode: NULLABLE
              name: client_id
              type: STRING
            - fields:
              - mode: NULLABLE
                name: campaign
                type: STRING
              - mode: NULLABLE
                name: content
                type: STRING
              mode: NULLABLE
              name: attribution
              type: RECORD
            """)

        def reload_populated():
            # Build a fresh, unmerged schema from the fixture for comparison.
            return Schema.from_json(yaml.safe_load(populated_yaml))

        populated = reload_populated()
        blank = Schema.from_json({"fields": []})

        # Direction 1: empty -> populated.
        populated.merge(blank)
        assert populated.to_json() != blank.to_json()
        assert populated.to_json() == reload_populated().to_json()

        # Direction 2: populated -> empty.
        blank.merge(populated)
        assert blank.to_json() == reload_populated().to_json()
Пример #2
0
    def test_schemas_unequal_nested_record(self):
        """Check that RECORD fields with different nested columns are unequal.

        Both schemas hold a single REPEATED RECORD named `active_addons`; they
        differ only in the name of the nested BOOLEAN column.
        """
        # The mode/name/type keys of the record are indented so that they
        # belong to the list item. At the indentation of the top-level
        # `fields:` key YAML would parse them as keys of the document itself,
        # leaving the record without a name, mode or type.
        schema_1_yaml = dedent("""
            fields:
            - fields:
              - mode: NULLABLE
                name: multiprocess_compatible
                type: BOOLEAN
              mode: REPEATED
              name: active_addons
              type: RECORD
            """)

        schema_2_yaml = dedent("""
            fields:
            - fields:
              - mode: NULLABLE
                name: multiprocess
                type: BOOLEAN
              mode: REPEATED
              name: active_addons
              type: RECORD
            """)

        schema_1 = Schema.from_json(yaml.safe_load(schema_1_yaml))
        schema_2 = Schema.from_json(yaml.safe_load(schema_2_yaml))

        # Inequality must be reported whichever schema does the comparing.
        assert schema_1.equal(schema_2) is False
        assert schema_2.equal(schema_1) is False
Пример #3
0
    def test_merge_compatible_schemas(self):
        """Check that merging grows the top-level field count from 3 to 4.

        The two fixtures share `submission_date` and `attribution` and each
        has one top-level column the other lacks. The merge is exercised in
        both directions.
        """
        left_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            - mode: NULLABLE
              name: client_id
              type: STRING
            - fields:
              - mode: NULLABLE
                name: campaign
                type: STRING
              - mode: NULLABLE
                name: content
                type: STRING
              mode: NULLABLE
              name: attribution
              type: RECORD
            """)

        right_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            - mode: NULLABLE
              name: sample_id
              type: INTEGER
            - fields:
              - mode: NULLABLE
                name: description
                type: STRING
              - mode: NULLABLE
                name: content
                description: "Content description"
                type: STRING
              mode: NULLABLE
              name: attribution
              type: RECORD
            """)

        def top_level_count(candidate):
            return len(candidate.schema["fields"])

        left = Schema.from_json(yaml.safe_load(left_yaml))
        right = Schema.from_json(yaml.safe_load(right_yaml))

        # right -> left
        assert top_level_count(left) == 3
        left.merge(right)
        assert top_level_count(left) == 4

        # left -> right, using a freshly loaded (unmerged) left schema
        left = Schema.from_json(yaml.safe_load(left_yaml))
        assert top_level_count(right) == 3
        right.merge(left)
        assert top_level_count(right) == 4
Пример #4
0
    def test_equal_schemas(self):
        """Check that equal() ignores field order and descriptions.

        An empty schema must equal itself, and two schemas listing the same
        columns in a different order (one of them carrying an extra
        description) must be reported equal in both directions.
        """
        # "fields" is a list of field definitions, so the empty schema uses an
        # empty list rather than an empty dict.
        empty_schema = Schema.from_json({"fields": []})
        assert empty_schema.equal(empty_schema)

        schema_1_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            - mode: NULLABLE
              name: client_id
              type: STRING
            - fields:
              - mode: NULLABLE
                name: campaign
                type: STRING
              - mode: NULLABLE
                name: content
                type: STRING
              mode: NULLABLE
              name: attribution
              type: RECORD
            """)

        # Same columns as above, reordered at both nesting levels, with a
        # description added to `content`.
        schema_2_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: client_id
              type: STRING
            - mode: NULLABLE
              name: submission_date
              type: DATE
            - fields:
              - mode: NULLABLE
                name: content
                type: STRING
                description: "Cool content"
              - mode: NULLABLE
                name: campaign
                type: STRING
              mode: NULLABLE
              name: attribution
              type: RECORD
            """)

        schema_1 = Schema.from_json(yaml.safe_load(schema_1_yaml))
        schema_2 = Schema.from_json(yaml.safe_load(schema_2_yaml))

        assert schema_1.equal(schema_2) is True
        assert schema_2.equal(schema_1) is True
Пример #5
0
    def test_from_schema_file(self):
        """Check that a schema.yaml from the test data loads with 3 top-level fields."""
        query_dir = (TEST_DIR / "data" / "test_sql" /
                     "moz-fx-data-test-project" / "test" /
                     "incremental_query_v1")

        loaded = Schema.from_schema_file(query_dir / "schema.yaml")
        field_total = len(loaded.schema["fields"])
        assert field_total == 3
Пример #6
0
    def test_from_query_file(self):
        """Check that a schema built from a query.sql in the test data has 3 top-level fields."""
        query_dir = (TEST_DIR / "data" / "test_sql" /
                     "moz-fx-data-test-project" / "test" /
                     "incremental_query_non_incremental_export_v1")

        derived = Schema.from_query_file(query_dir / "query.sql")
        field_total = len(derived.schema["fields"])
        assert field_total == 3
Пример #7
0
    def test_schemas_unequal_attributes(self):
        """Check that a differing column type makes two schemas unequal."""
        date_column_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            """)

        integer_column_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: INTEGER
            """)

        date_schema, integer_schema = (
            Schema.from_json(yaml.safe_load(text))
            for text in (date_column_yaml, integer_column_yaml)
        )

        # Inequality must be reported whichever schema does the comparing.
        assert date_schema.equal(integer_schema) is False
        assert integer_schema.equal(date_schema) is False
Пример #8
0
    def test_schemas_different_descriptions(self):
        """Check that schemas differing only in a field description compare equal."""
        first_wording_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              description: "The submission_date"
              type: DATE
            """)

        second_wording_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              description: "Date of the submission"
              type: DATE
            """)

        first_wording, second_wording = (
            Schema.from_json(yaml.safe_load(text))
            for text in (first_wording_yaml, second_wording_yaml)
        )

        # Equality must hold whichever schema does the comparing.
        assert first_wording.equal(second_wording) is True
        assert second_wording.equal(first_wording) is True
Пример #9
0
    def test_schemas_compatible(self):
        """Check that compatible() is directional.

        The one-column schema must be compatible with the schema that has the
        same column plus an extra one, while the reverse must not hold.
        """
        narrow_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            """)

        wide_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            - mode: NULLABLE
              name: client_id
              type: STRING
            """)

        narrow = Schema.from_json(yaml.safe_load(narrow_yaml))
        wide = Schema.from_json(yaml.safe_load(wide_yaml))

        narrow_fits_wide = narrow.compatible(wide)
        assert narrow_fits_wide is True

        wide_fits_narrow = wide.compatible(narrow)
        assert wide_fits_narrow is False
Пример #10
0
    def test_merge_different_descriptions(self):
        """Check how merge() treats field descriptions.

        A description already present on the receiving schema must survive the
        merge; a field without a description must pick up the description of
        the schema merged into it.
        """
        described_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              description: "The submission_date"
              type: DATE
            """)

        incoming_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              description: "Date of the submission"
              type: DATE
            """)

        undescribed_yaml = dedent("""
            fields:
            - mode: NULLABLE
              name: submission_date
              type: DATE
            """)

        def first_description(candidate):
            return candidate.schema["fields"][0]["description"]

        incoming = Schema.from_json(yaml.safe_load(incoming_yaml))

        # Case 1: the receiving field already has its own description.
        described = Schema.from_json(yaml.safe_load(described_yaml))
        described.merge(incoming)
        assert first_description(described) == "The submission_date"

        # Case 2: the receiving field has no description yet.
        undescribed = Schema.from_json(yaml.safe_load(undescribed_yaml))
        undescribed.merge(incoming)
        assert first_description(undescribed) == "Date of the submission"
Пример #11
0
    def test_from_json(self):
        """Check that from_json() keeps both fields of a two-column schema."""
        column_names = ("amount", "amount_captured")
        json_schema = {
            "fields": [
                {"name": column, "type": "INTEGER"} for column in column_names
            ]
        }

        loaded = Schema.from_json(json_schema)
        field_total = len(loaded.schema["fields"])
        assert field_total == 2
Пример #12
0
    def publish(self, target_project=None, dry_run=False):
        """
        Publish this view to BigQuery.

        If `target_project` is set, it replaces the first occurrence of this
        view's project ID in both the view identifier and the view SQL. Views
        whose project is not `moz-fx-data-shared-prod` are skipped in that case.
        With `dry_run` set, the query is submitted with `dry_run` enabled and
        neither the result nor the schema deployment is attempted.

        Returns False if the view definition is rejected as invalid, and True
        otherwise (including when publishing is skipped).
        """
        path_str = str(self.path)

        if any(path_str.endswith(suffix) for suffix in SKIP_PUBLISHING):
            print(f"Skipping {self.path}")
            return True

        # avoid checking references since Jenkins might throw an exception:
        # https://github.com/mozilla/bigquery-etl/issues/2246
        definition_accepted = (
            any(path_str.endswith(suffix) for suffix in SKIP_VALIDATION)
            or self._valid_view_naming()
        )
        if not definition_accepted:
            print(f"Error publishing {self.path}. Invalid view definition.")
            return False

        client = bigquery.Client()
        sql = self.content
        target_view = self.view_identifier

        if target_project:
            if self.project != "moz-fx-data-shared-prod":
                print(f"Skipping {self.path} because --target-project is set")
                return True

            # target_view has to be a fully-qualified BigQuery Standard SQL table
            # identifier of the form f"{project_id}.{dataset_id}.{table_id}".
            # dataset_id and table_id may not contain "." or "`". Either each
            # component or the identifier as a whole may be backtick (`) quoted,
            # but not both.
            # Project IDs consist of 6-63 lowercase letters, digits, or dashes, may
            # include a domain name separated by a colon, must start with a letter
            # and may not end with a dash. See also
            # https://github.com/mozilla/bigquery-etl/pull/1427#issuecomment-707376291
            target_view = self.view_identifier.replace(
                self.project, target_project, 1
            )
            # Only the first occurrence is replaced, which is the one in the
            # target view name.
            sql = sql.replace(self.project, target_project, 1)

        job_config = bigquery.QueryJobConfig(use_legacy_sql=False, dry_run=dry_run)
        query_job = client.query(sql, job_config)

        if dry_run:
            print(f"Validated definition of {self.view_identifier} in {self.path}")
            return True

        try:
            query_job.result()
        except BadRequest as e:
            if "Invalid snapshot time" not in e.message:
                raise
            # This occasionally happens when dependent views are published
            # concurrently; wait briefly and try exactly once more.
            time.sleep(1)
            client.query(sql, job_config).result()

        # Deploying the schema is best effort: a failure is reported but does
        # not prevent the view from counting as published.
        try:
            schema_path = Path(self.path).parent / "schema.yaml"
            view_schema = Schema.from_schema_file(schema_path)
            view_schema.deploy(target_view)
        except Exception as e:
            print(f"Could not update field descriptions for {target_view}: {e}")
        print(f"Published view {target_view}")

        return True
def write_view_if_not_exists(target_project: str, sql_dir: Path,
                             schema: SchemaFile):
    """Write view.sql and its companion files unless view.sql already exists.

    The view directory is
    `sql_dir / target_project / schema.bq_dataset_family / schema.bq_table_unversioned`.
    If it already contains a view.sql, nothing is done. Otherwise view.sql is
    rendered from VIEW_QUERY_TEMPLATE, metadata.yaml is written if it is
    missing, and a schema.yaml is generated; a failure while generating
    schema.yaml is printed and not raised.
    """
    view_dir = (sql_dir / target_project / schema.bq_dataset_family /
                schema.bq_table_unversioned)
    view_path = view_dir / "view.sql"

    if view_path.exists():
        return

    # Column expressions handed to the view template; which ones are added
    # depends on the schema id, dataset family and table.
    select_exprs = ["mozfun.norm.metadata(metadata) AS metadata"]
    if schema.schema_id == "moz://mozilla.org/schemas/glean/ping/1":
        select_exprs.append("mozfun.norm.glean_ping_info(ping_info) AS ping_info")
        if schema.bq_table == "baseline_v1":
            select_exprs.append(
                "mozfun.norm.glean_baseline_client_info"
                "(client_info, metrics) AS client_info"
            )
        is_fenix_metrics = (
            schema.bq_dataset_family == "org_mozilla_fenix"
            and schema.bq_table == "metrics_v1"
        )
        if is_fenix_metrics:
            # todo: use mozfun udfs
            select_exprs.append(
                "mozdata.udf.normalize_fenix_metrics"
                "(client_info.telemetry_sdk_build, metrics)"
                " AS metrics"
            )
        if schema.bq_dataset_family == "firefox_desktop":
            # FOG does not provide an app_name, so we inject the one that
            # people already associate with desktop Firefox per bug 1672191.
            select_exprs.append("'Firefox' AS normalized_app_name")
    elif schema.schema_id.startswith("moz://mozilla.org/schemas/main/ping/"):
        select_exprs.append(
            "mozdata.udf.normalize_main_payload(payload) AS payload"
        )

    rendered_query = VIEW_QUERY_TEMPLATE.format(
        target=f"{target_project}.{schema.stable_table}",
        replacements=",\n    ".join(select_exprs),
        full_view_id=f"{target_project}.{schema.user_facing_view}",
    )
    full_sql = reformat(rendered_query)

    print(f"Creating {view_path}")
    view_dir.mkdir(parents=True, exist_ok=True)
    view_path.write_text(full_sql)

    metadata_content = VIEW_METADATA_TEMPLATE.format(
        document_namespace=schema.document_namespace,
        document_type=schema.document_type,
    )
    metadata_path = view_dir / "metadata.yaml"
    if not metadata_path.exists():
        # An existing metadata.yaml is never overwritten.
        metadata_path.write_text(metadata_content)

    # get view schema with descriptions
    try:
        # Strip what VIEW_CREATE_REGEX matches and restrict the query to a
        # single submission date before deriving the schema from it.
        query_text = VIEW_CREATE_REGEX.sub("", view_path.read_text())
        query_text += " WHERE DATE(submission_timestamp) = '2020-01-01'"
        view_schema = Schema.from_query_file(view_path, content=query_text)

        stable_table_schema = Schema.from_json({"fields": schema.schema})
        view_schema.merge(stable_table_schema, add_missing_fields=False)
        view_schema.to_yaml_file(view_dir / "schema.yaml")
    except Exception as e:
        print(f"Cannot generate schema.yaml for {view_path}: {e}")