def test_merge_empty_schema(self):
    """An empty schema must act as the identity element for merge()."""
    schema_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        - mode: NULLABLE
          name: client_id
          type: STRING
        - fields:
          - mode: NULLABLE
            name: campaign
            type: STRING
          - mode: NULLABLE
            name: content
            type: STRING
          mode: NULLABLE
          name: attribution
          type: RECORD
        """
    )
    populated = Schema.from_json(yaml.safe_load(schema_yaml))
    empty = Schema.from_json({"fields": []})
    expected = Schema.from_json(yaml.safe_load(schema_yaml)).to_json()

    # Merging the empty schema into a populated one changes nothing.
    populated.merge(empty)
    assert populated.to_json() != empty.to_json()
    assert populated.to_json() == expected

    # Merging a populated schema into an empty one copies every field.
    empty.merge(populated)
    assert empty.to_json() == expected
def test_schemas_unequal_nested_record(self):
    """Schemas whose nested RECORD leaves have different names are unequal."""
    first = Schema.from_json(
        yaml.safe_load(
            dedent(
                """
                fields:
                - fields:
                  - mode: NULLABLE
                    name: multiprocess_compatible
                    type: BOOLEAN
                  mode: REPEATED
                  name: active_addons
                  type: RECORD
                """
            )
        )
    )
    second = Schema.from_json(
        yaml.safe_load(
            dedent(
                """
                fields:
                - fields:
                  - mode: NULLABLE
                    name: multiprocess
                    type: BOOLEAN
                  mode: REPEATED
                  name: active_addons
                  type: RECORD
                """
            )
        )
    )
    # Inequality must hold in both comparison directions.
    assert first.equal(second) is False
    assert second.equal(first) is False
def test_merge_compatible_schemas(self):
    """merge() unions the top-level fields of two compatible schemas."""
    yaml_1 = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        - mode: NULLABLE
          name: client_id
          type: STRING
        - fields:
          - mode: NULLABLE
            name: campaign
            type: STRING
          - mode: NULLABLE
            name: content
            type: STRING
          mode: NULLABLE
          name: attribution
          type: RECORD
        """
    )
    yaml_2 = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        - mode: NULLABLE
          name: sample_id
          type: INTEGER
        - fields:
          - mode: NULLABLE
            name: description
            type: STRING
          - mode: NULLABLE
            name: content
            description: "Content description"
            type: STRING
          mode: NULLABLE
          name: attribution
          type: RECORD
        """
    )
    left = Schema.from_json(yaml.safe_load(yaml_1))
    right = Schema.from_json(yaml.safe_load(yaml_2))

    # Each schema shares two top-level fields with the other and
    # contributes one unique field, so the merge result has four.
    assert len(left.schema["fields"]) == 3
    left.merge(right)
    assert len(left.schema["fields"]) == 4

    # The same holds when merging in the opposite direction.
    left = Schema.from_json(yaml.safe_load(yaml_1))
    assert len(right.schema["fields"]) == 3
    right.merge(left)
    assert len(right.schema["fields"]) == 4
def test_equal_schemas(self):
    """equal() is reflexive and ignores field order and descriptions.

    Two schemas with the same fields (including nested RECORD fields) in a
    different order, and with differing/absent descriptions, must compare
    equal in both directions.
    """
    # Fix: BigQuery schema JSON defines "fields" as a *list* of field
    # objects, not a mapping; use [] like test_merge_empty_schema does.
    empty_schema = Schema.from_json({"fields": []})
    assert empty_schema.equal(empty_schema)
    schema_1_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        - mode: NULLABLE
          name: client_id
          type: STRING
        - fields:
          - mode: NULLABLE
            name: campaign
            type: STRING
          - mode: NULLABLE
            name: content
            type: STRING
          mode: NULLABLE
          name: attribution
          type: RECORD
        """
    )
    # Same fields as schema_1, but reordered and with an extra description.
    schema_2_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: client_id
          type: STRING
        - mode: NULLABLE
          name: submission_date
          type: DATE
        - fields:
          - mode: NULLABLE
            name: content
            type: STRING
            description: "Cool content"
          - mode: NULLABLE
            name: campaign
            type: STRING
          mode: NULLABLE
          name: attribution
          type: RECORD
        """
    )
    schema_1 = Schema.from_json(yaml.safe_load(schema_1_yaml))
    schema_2 = Schema.from_json(yaml.safe_load(schema_2_yaml))
    assert schema_1.equal(schema_2) is True
    assert schema_2.equal(schema_1) is True
def test_from_schema_file(self):
    """Schema.from_schema_file loads the fields of an on-disk schema.yaml."""
    schema_path = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "incremental_query_v1"
        / "schema.yaml"
    )
    loaded = Schema.from_schema_file(schema_path)
    assert len(loaded.schema["fields"]) == 3
def test_from_query_file(self):
    """Schema.from_query_file derives a schema from an on-disk query.sql."""
    query_path = (
        TEST_DIR
        / "data"
        / "test_sql"
        / "moz-fx-data-test-project"
        / "test"
        / "incremental_query_non_incremental_export_v1"
        / "query.sql"
    )
    derived = Schema.from_query_file(query_path)
    assert len(derived.schema["fields"]) == 3
def test_schemas_unequal_attributes(self):
    """Fields with the same name but different types make schemas unequal."""
    date_field = Schema.from_json(
        yaml.safe_load(
            dedent(
                """
                fields:
                - mode: NULLABLE
                  name: submission_date
                  type: DATE
                """
            )
        )
    )
    int_field = Schema.from_json(
        yaml.safe_load(
            dedent(
                """
                fields:
                - mode: NULLABLE
                  name: submission_date
                  type: INTEGER
                """
            )
        )
    )
    # A type mismatch breaks equality in both directions.
    assert date_field.equal(int_field) is False
    assert int_field.equal(date_field) is False
def test_schemas_different_descriptions(self):
    """Differing field descriptions do not affect schema equality."""
    first = Schema.from_json(
        yaml.safe_load(
            dedent(
                """
                fields:
                - mode: NULLABLE
                  name: submission_date
                  description: "The submission_date"
                  type: DATE
                """
            )
        )
    )
    second = Schema.from_json(
        yaml.safe_load(
            dedent(
                """
                fields:
                - mode: NULLABLE
                  name: submission_date
                  description: "Date of the submission"
                  type: DATE
                """
            )
        )
    )
    # Descriptions are metadata only; equality holds both ways.
    assert first.equal(second) is True
    assert second.equal(first) is True
def test_schemas_compatible(self):
    """A schema is compatible with a superset of itself, but not vice versa."""
    subset_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        """
    )
    superset_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        - mode: NULLABLE
          name: client_id
          type: STRING
        """
    )
    subset = Schema.from_json(yaml.safe_load(subset_yaml))
    superset = Schema.from_json(yaml.safe_load(superset_yaml))
    # compatible() is directional: the smaller schema fits into the
    # larger one, but the larger one does not fit into the smaller.
    assert subset.compatible(superset) is True
    assert superset.compatible(subset) is False
def test_merge_different_descriptions(self):
    """merge() keeps an existing description and fills in a missing one."""
    described_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          description: "The submission_date"
          type: DATE
        """
    )
    other_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          description: "Date of the submission"
          type: DATE
        """
    )
    other = Schema.from_json(yaml.safe_load(other_yaml))

    # When the target already has a description, merge() preserves it.
    target = Schema.from_json(yaml.safe_load(described_yaml))
    target.merge(other)
    assert target.schema["fields"][0]["description"] == "The submission_date"

    # When the target lacks a description, merge() adopts the other's.
    undescribed_yaml = dedent(
        """
        fields:
        - mode: NULLABLE
          name: submission_date
          type: DATE
        """
    )
    target = Schema.from_json(yaml.safe_load(undescribed_yaml))
    target.merge(other)
    assert target.schema["fields"][0]["description"] == "Date of the submission"
def test_from_json(self):
    """Schema.from_json preserves the number of top-level fields."""
    loaded = Schema.from_json(
        {
            "fields": [
                {"name": "amount", "type": "INTEGER"},
                {"name": "amount_captured", "type": "INTEGER"},
            ]
        }
    )
    assert len(loaded.schema["fields"]) == 2
def publish(self, target_project=None, dry_run=False):
    """
    Publish this view to BigQuery.

    If `target_project` is set, it will replace the project ID in the view
    definition.

    Returns True on success (including intentionally skipped views and
    dry runs), False when the view definition is invalid.
    """
    # Views on the explicit skip list are never published.
    if any(str(self.path).endswith(p) for p in SKIP_PUBLISHING):
        print(f"Skipping {self.path}")
        return True

    # avoid checking references since Jenkins might throw an exception:
    # https://github.com/mozilla/bigquery-etl/issues/2246
    if (
        any(str(self.path).endswith(p) for p in SKIP_VALIDATION)
        or self._valid_view_naming()
    ):
        client = bigquery.Client()
        sql = self.content
        target_view = self.view_identifier
        if target_project:
            # Only views defined in shared-prod may be redirected to
            # another project.
            if self.project != "moz-fx-data-shared-prod":
                print(f"Skipping {self.path} because --target-project is set")
                return True

            # target_view must be a fully-qualified BigQuery Standard SQL table
            # identifier, which is of the form
            # f"{project_id}.{dataset_id}.{table_id}". dataset_id and table_id
            # may not contain "." or "`". Each component may be a backtick (`)
            # quoted identifier, or the whole thing may be a backtick quoted
            # identifier, but not both.
            # Project IDs must contain 6-63 lowercase letters, digits, or
            # dashes. Some project IDs also include domain name separated by a
            # colon. IDs must start with a letter and may not end with a dash.
            # For more information see also
            # https://github.com/mozilla/bigquery-etl/pull/1427#issuecomment-707376291
            target_view = self.view_identifier.replace(
                self.project, target_project, 1
            )
            # We only change the first occurrence, which is in the target view
            # name.
            sql = sql.replace(self.project, target_project, 1)

        job_config = bigquery.QueryJobConfig(use_legacy_sql=False, dry_run=dry_run)
        query_job = client.query(sql, job_config)

        if dry_run:
            # Dry run: BigQuery validated the SQL when the job was created;
            # there is no result to wait for.
            print(f"Validated definition of {self.view_identifier} in {self.path}")
        else:
            try:
                query_job.result()
            except BadRequest as e:
                if "Invalid snapshot time" in e.message:
                    # This occasionally happens due to dependent views being
                    # published concurrently; we wait briefly and give it one
                    # extra try in this situation.
                    time.sleep(1)
                    client.query(sql, job_config).result()
                else:
                    raise

            # Best-effort: push field descriptions from a sibling schema.yaml
            # onto the published view; failure here does not fail publishing.
            try:
                view_schema = Schema.from_schema_file(
                    Path(self.path).parent / "schema.yaml"
                )
                view_schema.deploy(target_view)
            except Exception as e:
                print(f"Could not update field descriptions for {target_view}: {e}")

            print(f"Published view {target_view}")
    else:
        print(f"Error publishing {self.path}. Invalid view definition.")
        return False

    return True
def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaFile):
    """If a view.sql does not already exist, write one to the target directory."""
    target_dir = (
        sql_dir
        / target_project
        / schema.bq_dataset_family
        / schema.bq_table_unversioned
    )
    target_file = target_dir / "view.sql"
    if target_file.exists():
        return

    full_source_id = f"{target_project}.{schema.stable_table}"
    full_view_id = f"{target_project}.{schema.user_facing_view}"

    # Every view normalizes the metadata column; ping-type-specific
    # replacements are appended below.
    replacements = ["mozfun.norm.metadata(metadata) AS metadata"]
    if schema.schema_id == "moz://mozilla.org/schemas/glean/ping/1":
        replacements.append("mozfun.norm.glean_ping_info(ping_info) AS ping_info")
        if schema.bq_table == "baseline_v1":
            replacements.append(
                "mozfun.norm.glean_baseline_client_info"
                "(client_info, metrics) AS client_info"
            )
        if (schema.bq_dataset_family == "org_mozilla_fenix"
                and schema.bq_table == "metrics_v1"):
            # todo: use mozfun udfs
            replacements.append(
                "mozdata.udf.normalize_fenix_metrics"
                "(client_info.telemetry_sdk_build, metrics)"
                " AS metrics"
            )
        if schema.bq_dataset_family == "firefox_desktop":
            # FOG does not provide an app_name, so we inject the one that
            # people already associate with desktop Firefox per bug 1672191.
            replacements.append("'Firefox' AS normalized_app_name")
    elif schema.schema_id.startswith("moz://mozilla.org/schemas/main/ping/"):
        replacements.append("mozdata.udf.normalize_main_payload(payload) AS payload")

    full_sql = reformat(
        VIEW_QUERY_TEMPLATE.format(
            target=full_source_id,
            replacements=",\n ".join(replacements),
            full_view_id=full_view_id,
        )
    )
    print(f"Creating {target_file}")
    target_dir.mkdir(parents=True, exist_ok=True)
    with target_file.open("w") as f:
        f.write(full_sql)

    # Write companion metadata only once; an existing file is kept as-is.
    metadata_file = target_dir / "metadata.yaml"
    if not metadata_file.exists():
        with metadata_file.open("w") as f:
            f.write(
                VIEW_METADATA_TEMPLATE.format(
                    document_namespace=schema.document_namespace,
                    document_type=schema.document_type,
                )
            )

    # Best-effort: derive a schema.yaml (with descriptions) for the view by
    # dry-querying a single partition and merging in the stable-table schema.
    try:
        content = VIEW_CREATE_REGEX.sub("", target_file.read_text())
        content += " WHERE DATE(submission_timestamp) = '2020-01-01'"
        view_schema = Schema.from_query_file(target_file, content=content)
        stable_table_schema = Schema.from_json({"fields": schema.schema})
        view_schema.merge(stable_table_schema, add_missing_fields=False)
        view_schema.to_yaml_file(target_dir / "schema.yaml")
    except Exception as e:
        print(f"Cannot generate schema.yaml for {target_file}: {e}")