def module_target_schema_with_transient(
        module_source_schema, module_target_track_with_transient) -> Schema:
    """Assemble a target schema from two tracks made by the supplied factory.

    The factory is called with ("t", "temporal") and then ("i", "immutable");
    the resulting schema's ``source`` attribute is set to
    ``module_source_schema`` before it is returned.
    """
    make_track = module_target_track_with_transient
    temporal_track: Track = make_track("t", "temporal")
    immutable_track: Track = make_track("i", "immutable")
    result = Schema(temporal_track, immutable_track)
    result.source = module_source_schema
    return result
def build(cls, conf_dir, data_dir, name):
    """Create a task from the YAML file ``<tasks_dir>/<name>.yaml``.

    Parses the task spec, loads the origin and target schemas it refers to,
    instantiates ``cls`` and loads the task's steps.
    """
    logging.info("Constructing task execution plan.")
    locator = PathLocator(conf=conf_dir, data=data_dir)
    logging.info(
        "Configuration base directory is %s; data base directory is %s."
        % (conf_dir, data_dir))

    yaml_path: str = os.path.join(locator.tasks_dir, name + '.yaml')
    with open(yaml_path, 'r') as handle:
        logging.info("Task configuration loaded from %s." % yaml_path)
        spec = yaml.safe_load(handle)

    resulting_in = spec.get('resulting_in', {})
    origin_data = spec['starting_with']['data']
    origin_schema = Schema.load(locator, spec['starting_with']['schema'])
    target_data = resulting_in.get('data')
    target_schema = Schema.load(locator, resulting_in.get('schema'))

    task = cls(
        path_locator=locator,
        origin_data=origin_data,
        origin_schema=origin_schema,
        target_data=target_data,
        target_schema=target_schema,
    )
    task.load_steps(spec['steps'])

    # Target data may only be omitted when the final step is a Consume step.
    has_target_data = task.target_data is not None
    assert has_target_data or isinstance(task.steps[-1], Consume)
    return task
def schema_validate(schema_basepath: str, schema_name: str,
                    schema_source_name: Optional[str]) -> None:
    """Load a schema purely for validation; the loaded schema is discarded.

    When ``schema_source_name`` is given, that schema is loaded first and
    handed to the main load as ``source_schema``.
    """
    parent: Optional[Schema] = (
        None if schema_source_name is None
        else Schema.load(schema_source_name, schema_basepath)
    )
    Schema.load(schema_name, schema_basepath, source_schema=parent)
def build(cls, context: Context, name: str) -> "Task":
    """Create a task from ``<conf_dir>/tasks/<name>.yaml``.

    Parses the task spec, loads the origin schema (which must exist) and the
    target schema from ``context.schemas_dir``, instantiates ``cls`` and
    loads the task's steps.
    """
    logging.info("Constructing task execution plan.")
    logging.info(
        "Configuration base directory is %s; entities directory is %s."
        % (context.conf_dir, context.entities_input_dir))

    yaml_path: str = os.path.join(context.conf_dir, 'tasks', name + '.yaml')
    with open(yaml_path, 'r') as handle:
        logging.info("Task configuration loaded from %s." % yaml_path)
        spec = yaml.safe_load(handle)

    resulting_in = spec.get('resulting_in', {})
    origin = Schema.load(spec['starting_with']['schema'], context.schemas_dir)
    assert origin is not None

    origin_data = spec['starting_with']['data']
    target_data = resulting_in.get('data')
    target = Schema.load(resulting_in.get('schema'), context.schemas_dir)
    task = cls(
        context=context,
        origin_data=origin_data,
        origin_schema=origin,
        target_data=target_data,
        target_schema=target,
    )
    task.load_steps(spec['steps'])

    # Target data may only be omitted when the final step is a Consume step.
    has_target_data = task.target_data is not None
    assert has_target_data or isinstance(task.steps[-1], Consume)
    return task
def from_files(cls, schema_basepath: str, source_schema: str, target_schema: str,
               output_file: TextIO) -> None:
    """Load the source and target schemas, run the export, and close the output.

    :param schema_basepath: Base path passed to ``Schema.load``.
    :param source_schema: Name of the source schema to load.
    :param target_schema: Name of the target schema; loaded with the source
        schema as its ``source_schema``.
    :param output_file: Handle passed to ``cls``. This function takes
        ownership of it and always closes it.
    :raises AssertionError: if the target schema fails to load.
    """
    try:
        source_schema_instance: Optional[Schema] = Schema.load(
            source_schema, base_path=schema_basepath)
        target_schema_instance: Optional[Schema] = Schema.load(
            target_schema, source_schema=source_schema_instance,
            base_path=schema_basepath)
        assert target_schema_instance is not None
        export: "ExportLinkages" = cls(target_schema_instance, output_file)
        export()
    finally:
        # Close even when loading or exporting fails, so the handle never leaks.
        output_file.close()
def __init__(self, schema: Schema, list_id: VariableId, argument_id: VariableId,
             identifier_id: Optional[VariableId]):
    """Resolve the given variable IDs to paths through ``schema.get``.

    Stores the list variable's absolute path, the argument variable's
    relative path and, when ``identifier_id`` is supplied, the identifier
    variable's relative path (``None`` otherwise).
    """
    lookup = schema.get
    self.subjects_path: ListType[str] = lookup(list_id).absolute_path
    self.arg_path: ListType[str] = lookup(argument_id).relative_path
    self.identifier_path: Optional[ListType[str]] = None
    if identifier_id is None:
        return
    self.identifier_path = lookup(identifier_id).relative_path
def _make_schema(temporal: bool) -> Schema:
    """Build a schema whose spec from ``make_spec()`` sits in one track.

    :param temporal: If true, the temporal track gets the spec and the
        immutable track is empty; otherwise the roles are swapped.
    :return: A ``Schema`` built from the two tracks.
    """
    # Distinct local names avoid rebinding the boolean parameter to a Track.
    # The build order of each branch is kept as in the original.
    if temporal:
        temporal_track: Track = Track.build(make_spec(), None, "temporal")
        immutable_track: Track = Track.build({}, None, "immutable")
    else:
        immutable_track = Track.build(make_spec(), None, "immutable")
        temporal_track = Track.build({}, None, "temporal")
    return Schema(temporal_track, immutable_track)
def from_files(cls, schema_basepath: str, source_schema: str, target_schema: str,
               input_file: TextIO, suffix: str) -> None:
    """Import linkages into the target schema and serialize the result.

    The modified schema is written to ``<schema_basepath>/<target_schema>_<suffix>``,
    which is created if it does not exist.

    :param schema_basepath: Base path passed to ``Schema.load``; also the
        parent of the output directory.
    :param source_schema: Name of the source schema to load.
    :param target_schema: Name of the target schema; loaded with the source
        schema as its ``source_schema``.
    :param input_file: Handle passed to ``cls``. This function takes
        ownership of it and always closes it.
    :param suffix: Appended (after an underscore) to the target schema name
        to form the output directory name.
    :raises AssertionError: if the target schema fails to load.
    """
    try:
        source_schema_instance: Optional[Schema] = Schema.load(
            source_schema, base_path=schema_basepath)
        target_schema_instance: Optional[Schema] = Schema.load(
            target_schema, source_schema=source_schema_instance,
            base_path=schema_basepath)
        assert target_schema_instance is not None
        do_import: "ImportLinkages" = cls(target_schema_instance, input_file)
        do_import()
    finally:
        # Close even when loading or importing fails, so the handle never leaks.
        input_file.close()

    output_schema_relpath: str = "%s_%s" % (target_schema, suffix)
    output_path: str = os.path.join(schema_basepath, output_schema_relpath)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_path, exist_ok=True)
    target_schema_instance.serialize(output_path)
def test_get_conflict_raises(track_type):
    """``Schema.get`` raises ``ValueError`` when ID "A" exists in both tracks."""
    temporal_spec: Dict = {
        "A": {
            "name": "temporal variable",
            "data_type": "Text",
            "sort_order": 0,
        },
    }
    temporal_track = Track.build(temporal_spec, None, "temporal")
    # The copy is taken after the first build, as in the original sequence.
    immutable_spec = copy.deepcopy(temporal_spec)
    immutable_track = Track.build(immutable_spec, None, "immutable")
    conflicted = Schema(temporal_track, immutable_track)
    with pytest.raises(ValueError):
        conflicted.get("A", track_type=track_type)
def standalone(cls, context: Context, translate_dir: str, trace_dir: str,
               source_schema_name: str, target_schema_name: str,
               output_filename: str) -> None:
    """Load both schemas from ``context.schemas_dir`` and run a coverage pass.

    The source schema is loaded first and passed as the third argument when
    loading the target schema; both must load successfully. An instance of
    ``cls`` is then built and invoked with placeholder arguments.
    """
    schemas_dir = context.schemas_dir
    source: Optional[Schema] = Schema.load(source_schema_name, schemas_dir)
    assert source is not None
    target: Optional[Schema] = Schema.load(target_schema_name, schemas_dir, source)
    assert target is not None
    runner: "SourceCoverage" = cls(context, target, translate_dir, trace_dir,
                                   output_filename)
    runner("dummy", None)
def test_list_na_absent_match(complex_track, empty_track, entity_id):
    """An explicit |--|NA|--| in the fixture matches an item that is absent
    from the observed data."""
    schema: Schema = Schema(empty_track, complex_track)
    fixture: Dict = {
        "outer": [
            {
                "the_folder": {
                    "inner": [
                        {"some_text": "foo"},
                        {"some_text": POLYTROPOS_NA},
                    ],
                },
            },
        ],
    }
    observed: Dict = {
        "outer": [
            {
                "the_folder": {
                    "inner": [
                        {"some_text": "foo"},
                        {},
                    ],
                },
            },
        ],
    }

    # The whole outer list is expected to be reported as one match.
    whole_list_match = ValueMatch(entity_id, "immutable", "/outer", "List",
                                  json.dumps(fixture["outer"]))
    expected: Outcome = Outcome()
    expected.matches.append(whole_list_match)

    actual: Outcome = Outcome()
    crawler: CrawlImmutable = CrawlImmutable(entity_id, schema, fixture,
                                             observed, actual)
    crawler()
    assert expected == actual
def nested_list_schema() -> Schema:
    """Return a schema whose immutable track nests a text variable inside a
    list that is itself inside another list; the temporal track is empty."""
    outer_list: Dict = {
        "name": "outer_list_1",
        "data_type": "List",
        "sort_order": 0,
    }
    inner_list: Dict = {
        "name": "inner_list",
        "data_type": "List",
        "parent": "outer_list_1_id",
        "sort_order": 0,
    }
    name_text: Dict = {
        "name": "name",
        "data_type": "Text",
        "parent": "inner_list_1_id",
        "sort_order": 0,
    }
    immutable_spec: Dict = {
        "outer_list_1_id": outer_list,
        "inner_list_1_id": inner_list,
        "name_1_id": name_text,
    }
    immutable_track: Track = Track.build(immutable_spec, None, "immutable")
    # NOTE(review): the track name is capitalized ("Temporal") here; confirm
    # this is intentional.
    temporal_track: Track = Track.build({}, None, "Temporal")
    return Schema(temporal_track, immutable_track)
def variable_catalog(schema_basepath: str, schema_name: str, fh: TextIO) -> None:
    """Load a schema and write its variable catalog to ``fh``.

    :param schema_basepath: Base path passed to ``Schema.load``.
    :param schema_name: Name of the schema to load.
    :param fh: Output handle passed to ``write_catalog``. This function takes
        ownership of it and always closes it.
    :raises AssertionError: if the schema fails to load.
    """
    try:
        schema: Optional[Schema] = Schema.load(schema_name, base_path=schema_basepath)
        assert schema is not None
        write_catalog(schema, fh)
    finally:
        # Close even when loading or writing fails, so the handle never leaks.
        fh.close()
def _do_nearest_list_test(innermost: str, middle: str, outermost: str,
                          expected: str) -> None:
    """Assert ``nearest_list`` of the innermost variable in a three-level chain.

    Builds an immutable track in which "innermost" is a child of "middle" and
    "middle" is a child of "outermost", then checks the ``nearest_list``
    attribute of the "innermost" variable.

    :param innermost: Data type given to the innermost variable.
    :param middle: Data type given to the middle variable.
    :param outermost: Data type given to the outermost variable.
    :param expected: Expected value of ``nearest_list``.
    """
    spec: Dict = {
        "innermost": {
            "data_type": innermost,
            "name": "innermost",
            "sort_order": 0,
            "parent": "middle",
        },
        "middle": {
            "data_type": middle,
            "name": "middle",
            "sort_order": 0,
            "parent": "outermost",
        },
        "outermost": {
            "data_type": outermost,
            "name": "outermost",
            "sort_order": 0,
        },
    }
    immutable: Track = Track.build(spec, None, "i")
    temporal: Track = Track.build({}, None, "t")
    schema: Schema = Schema(temporal, immutable)
    # A separate name avoids rebinding the ``innermost: str`` parameter to a
    # Variable.
    innermost_var: Variable = schema.get(cast(VariableId, "innermost"))
    assert innermost_var.nearest_list == expected
def test_simple_na_present_with_value_mismatch(simple_track, empty_track, entity_id):
    """An explicit |--|NA|--| in the fixture is a mismatch when the observed
    item is present with a non-null value."""
    schema: Schema = Schema(empty_track, simple_track)
    fixture: Dict = {
        "some_multiple_text": POLYTROPOS_NA,
        "outer": {"some_multiple_text": ["foo", "bar"]},
    }
    observation: Dict = {
        "some_multiple_text": ["123"],
        "outer": {"some_multiple_text": ["foo", "bar"]},
    }

    root_mismatch = ValueMismatch(
        entity_id, "immutable", "/some_multiple_text", "MultipleText",
        POLYTROPOS_NA, json.dumps(["123"]))
    nested_match = ValueMatch(
        entity_id, "immutable", "/outer/some_multiple_text", "MultipleText",
        json.dumps(["foo", "bar"]))
    expected: Outcome = Outcome()
    expected.mismatches.append(root_mismatch)
    expected.matches.append(nested_match)

    actual: Outcome = Outcome()
    crawler: CrawlImmutable = CrawlImmutable(entity_id, schema, fixture,
                                             observation, actual)
    crawler()
    assert expected == actual
def test_nested_does_not_short_circuit_crawl():
    """Regression test comparing generated temporal coverage with expected.csv.

    Bug history:
       - Detected around 9/20/2019
       - Isolated minimum reproducible case on 9/24/2019
       - Caused by commit e23b825 (8/27/2019)
       - Regression test based on minimum reproducible case
    """
    spec: Dict = {
        "root": {
            "name": "return",
            "data_type": "Folder",
            "sort_order": 0,
        },
        "application_submissions": {
            "name": "application_submissions",
            "data_type": "List",
            "parent": "root",
            "sort_order": 0,
        },
        "award_restrict": {
            "name": "award_restrict",
            "data_type": "Text",
            "parent": "application_submissions",
            "sort_order": 0,
        },
        "filer": {
            "name": "filer",
            "data_type": "Folder",
            "parent": "root",
            "sort_order": 1,
        },
        "name_org": {
            "name": "name_org",
            "data_type": "Text",
            "parent": "filer",
            "sort_order": 0,
        },
    }
    temporal_track: Track = Track.build(spec, None, "temporal")
    immutable_track: Track = Track.build({}, None, "immutable")
    schema: Schema = Schema(temporal_track, immutable_track, name="semantic")

    here: str = os.path.dirname(os.path.abspath(__file__))
    composite_path: str = os.path.join(here, "data")

    # NOTE(review): output_path is not defined inside this function;
    # presumably it is a module-level name -- confirm.
    # Start from an empty output directory.
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path)

    with Context.build(conf_dir="dummy", data_dir="dummy") as context:
        coverage: CoverageFile = CoverageFile(
            context, schema, output_path + "/semantic", None, None)
        coverage(composite_path, "dummy")

    expected_path: str = os.path.join(here, "expected.csv")
    actual_path: str = os.path.join(output_path, "semantic_temporal.csv")
    with open(expected_path) as expected_fh, open(actual_path) as actual_fh:
        expected_reader: csv.DictReader = csv.DictReader(expected_fh)
        actual_reader: csv.DictReader = csv.DictReader(actual_fh)
        e_rows = list(expected_reader)
        a_rows = list(actual_reader)
        assert a_rows == e_rows
def outcomes(example_path) -> FixtureOutcomes:
    """Build ``FixtureOutcomes`` for the "simple" schema under ``example_path``.

    Fixtures are read from ``<example_path>/data/fixtures`` and observations
    from ``<example_path>/data/observations``.
    """
    data_dir = (example_path, "data")
    simple_schema: Schema = Schema.load("conf/schemas/simple", example_path)
    fixtures_dir: str = os.path.join(*data_dir, "fixtures")
    observations_dir: str = os.path.join(*data_dir, "observations")
    return FixtureOutcomes(simple_schema, fixtures_dir, observations_dir)
def build(
        cls,
        path_locator: PathLocator,
        schema: Schema,
        name: str,
        target_schema: str,
        id_var: str,
        input_schema_vars: Dict,
        output_schema_vars: Dict
):
    """Instantiate the aggregation registered under ``name``.

    Input variable IDs are resolved against ``schema``; output variable IDs
    are resolved against the target schema loaded via ``path_locator``. The
    resolved variables are passed to the aggregation as keyword arguments.
    """
    target_schema_instance: Schema = Schema.load(path_locator, target_schema)
    registry: Dict[str, Type] = load(cls)

    input_variables: Dict[str, Variable] = {}
    for var_name, var_id in input_schema_vars.items():
        input_variables[var_name] = schema.get(var_id)

    output_variables: Dict[str, Variable] = {}
    for var_name, var_id in output_schema_vars.items():
        output_variables[var_name] = target_schema_instance.get(var_id)

    aggregation_class = registry[name]
    # The two mappings are expanded separately so a name present in both
    # still raises, exactly as a duplicate keyword argument would.
    return aggregation_class(
        origin_schema=schema,
        target_schema=target_schema_instance,
        id_var=id_var,
        **input_variables,
        **output_variables
    )
def target_schema(source_schema) -> Schema:
    """Build a target schema whose temporal track comes from target_spec.json.

    Each track is built against the corresponding track of ``source_schema``;
    the immutable track has an empty spec.
    """
    with open(os.path.join(basepath, "target_spec.json")) as spec_file:
        temporal_spec: Dict = json.load(spec_file)
    temporal_track: Track = Track.build(temporal_spec, source_schema.temporal,
                                        "temporal")
    immutable_track: Track = Track.build({}, source_schema.immutable,
                                         "immutable")
    return Schema(temporal_track, immutable_track)
def test_list_has_sentinal_value_raises(complex_track, empty_track, entity_id,
                                        bad_value):
    """When the fixture holds an explicit |--|NA|--| and the observed item is
    ``bad_value``, crawling must raise ``ValueError``: the sentinel for
    missing data has to be truly unique."""
    schema: Schema = Schema(empty_track, complex_track)
    fixture: Dict = {
        "outer": [
            {
                "the_folder": {
                    "inner": [
                        {"some_text": "foo"},
                        {"some_text": POLYTROPOS_NA},
                    ],
                },
            },
        ],
    }
    observed: Dict = {
        "outer": [
            {
                "the_folder": {
                    "inner": [
                        {"some_text": "foo"},
                        {"some_text": bad_value},
                    ],
                },
            },
        ],
    }
    outcome: Outcome = Outcome()
    crawler: CrawlImmutable = CrawlImmutable(entity_id, schema, fixture,
                                             observed, outcome)
    with pytest.raises(ValueError):
        crawler()
def test_folder_na_present_as_none_mismatch(simple_track, empty_track, entity_id):
    """An explicit |--|NA|--| in the fixture is a mismatch when the observed
    item is present with the value ``None``."""
    schema: Schema = Schema(empty_track, simple_track)
    fixture: Dict = {
        "some_text": "bar",
        "outer": {"some_text": POLYTROPOS_NA},
    }
    observation: Dict = {
        "some_text": "bar",
        "outer": {"some_text": None},
    }

    nested_mismatch = ValueMismatch(
        entity_id, "immutable", "/outer/some_text", "Text", POLYTROPOS_NA, None)
    root_match = ValueMatch(
        entity_id, "immutable", "/some_text", "Text", "bar")
    expected: Outcome = Outcome()
    expected.mismatches.append(nested_mismatch)
    expected.matches.append(root_match)

    actual: Outcome = Outcome()
    crawler: CrawlImmutable = CrawlImmutable(entity_id, schema, fixture,
                                             observation, actual)
    crawler()
    assert expected == actual
def schema() -> Schema:
    """Return a schema with Integer, Decimal and Currency source variables in
    the temporal track and one Decimal target in the immutable track."""
    temporal_spec: Dict = {}
    # Each source variable is named after its ID; sort order follows the
    # listing order.
    source_types = (
        ("integer_source", "Integer"),
        ("decimal_source", "Decimal"),
        ("currency_source", "Currency"),
    )
    for position, (var_id, data_type) in enumerate(source_types):
        temporal_spec[var_id] = {
            "name": var_id,
            "data_type": data_type,
            "sort_order": position,
        }
    immutable_spec: Dict = {
        "target": {
            "name": "target",
            "data_type": "Decimal",
            "sort_order": 0,
        },
    }
    temporal_track: Track = Track(temporal_spec, None, "temporal")
    immutable_track: Track = Track(immutable_spec, None, "immutable")
    return Schema(temporal_track, immutable_track)
def test_underscore_folders_ignored():
    """Casting converts the Binary variable and leaves the "_folder" content
    exactly as it was."""
    immutable_spec: Dict = {
        "binary_in_root": {
            "name": "the_binary",
            "data_type": "Binary",
            "sort_order": 0,
        },
    }
    immutable_track: Track = Track.build(immutable_spec, None, "immutable")
    temporal_track: Track = Track.build({}, None, "temporal")
    schema: Schema = Schema(temporal_track, immutable_track)

    content: Dict = {
        "immutable": {
            "the_binary": "true",
            "_folder": {
                "foo": "shouldn't matter",
                "bar": "also shouldn't matter",
            },
        },
    }
    expected: Dict = {
        "immutable": {
            "the_binary": True,
            "_folder": {
                "foo": "shouldn't matter",
                "bar": "also shouldn't matter",
            },
        },
    }

    composite: Composite = Composite(schema, content)
    do_cast: Cast = Cast(schema, {})
    do_cast(composite)
    assert composite.content == expected
def _do_cast_error_test(data_type: str, raw: Optional[Any]):
    """Assert that casting ``raw`` as ``data_type`` records a cast error.

    After the cast, the composite's content is expected to contain only the
    raw value, filed under ``immutable/qc/_exceptions/cast_errors/the_var``.
    """
    immutable_spec: Dict = {
        "var": {
            "name": "the_var",
            "data_type": data_type,
            "sort_order": 0,
        },
    }
    immutable_track: Track = Track.build(immutable_spec, None, "immutable")
    temporal_track: Track = Track.build({}, None, "temporal")
    schema: Schema = Schema(temporal_track, immutable_track)

    composite: Composite = Composite(schema, {"immutable": {"the_var": raw}})
    do_cast: Cast = Cast(schema, {})
    do_cast(composite)

    cast_errors: Dict = {"the_var": raw}
    expected: Dict = {
        "immutable": {"qc": {"_exceptions": {"cast_errors": cast_errors}}},
    }
    assert composite.content == expected
def list_in_nested_folder_schema(empty_track) -> Schema:
    """Return a schema whose first track holds a list, containing one text
    variable, three folders deep (parent/child/grandchild); ``empty_track``
    is the second track."""
    folder_parent: Dict = {
        "name": "parent",
        "data_type": "Folder",
        "sort_order": 0,
    }
    folder_child: Dict = {
        "name": "child",
        "data_type": "Folder",
        "parent": "folder_in_root",
        "sort_order": 0,
    }
    folder_grandchild: Dict = {
        "name": "grandchild",
        "data_type": "Folder",
        "parent": "folder_in_folder",
        "sort_order": 0,
    }
    the_list: Dict = {
        "name": "the_list",
        "data_type": "List",
        "parent": "folder_in_folder_in_folder",
        "sort_order": 0,
    }
    list_text: Dict = {
        "name": "some_text",
        "data_type": "Text",
        "parent": "nested_list",
        "sort_order": 0,
    }
    spec: Dict = {
        "folder_in_root": folder_parent,
        "folder_in_folder": folder_child,
        "folder_in_folder_in_folder": folder_grandchild,
        "nested_list": the_list,
        "list_text": list_text,
    }
    populated_track: Track = Track.build(spec, None, "")
    return Schema(populated_track, empty_track)
def simple_schema(empty_track) -> Schema:
    """Return a schema whose first track has a root text variable and a
    folder holding a text and an integer variable; ``empty_track`` is the
    second track."""
    root_text: Dict = {
        "name": "some_text",
        "data_type": "Text",
        "sort_order": 0,
    }
    root_folder: Dict = {
        "name": "the_folder",
        "data_type": "Folder",
        "sort_order": 1,
    }
    folder_text: Dict = {
        "name": "some_text",
        "data_type": "Text",
        "parent": "folder_in_root",
        "sort_order": 0,
    }
    folder_int: Dict = {
        "name": "some_number",
        "data_type": "Integer",
        "parent": "folder_in_root",
        "sort_order": 1,
    }
    spec: Dict = {
        "text_in_root": root_text,
        "folder_in_root": root_folder,
        "text_in_folder": folder_text,
        "int_in_folder": folder_int,
    }
    populated_track: Track = Track.build(spec, None, "")
    return Schema(populated_track, empty_track)
def schema() -> Schema:
    """Return a schema with an Integer "source" in the temporal track and an
    Integer "limit" plus a Text "limit_period" in the immutable track."""
    temporal_track: Track = Track.build(
        {
            "the_subject": {
                "name": "source",
                "data_type": "Integer",
                "sort_order": 0,
            },
        },
        None,
        "temporal",
    )
    immutable_track: Track = Track.build(
        {
            "the_target": {
                "name": "limit",
                "data_type": "Integer",
                "sort_order": 0,
            },
            "the_period_id": {
                "name": "limit_period",
                "data_type": "Text",
                "sort_order": 1,
            },
        },
        None,
        "immutable",
    )
    return Schema(temporal_track, immutable_track)
def build(cls, path_locator, schema: Schema, name: str, subjects: Dict):
    """Instantiate the filter registered under ``name``.

    Each entry of ``subjects`` maps a keyword-argument name to a variable ID,
    which is resolved through ``schema.get`` before being passed to the
    filter. ``path_locator`` is accepted but not used here.
    """
    logging.info('Building instance of filter class "%s"' % name)
    registry = load(cls)
    resolved = {}
    for var_name, var_id in subjects.items():
        resolved[var_name] = schema.get(var_id)
    filter_class = registry[name]
    return filter_class(schema=schema, **resolved)
def _target_schema(source: Schema, data_type: str = "Text") -> Schema:
    """Build a schema named "target" on top of ``source``.

    Each track's spec comes from ``target_spec`` (prefix "t" for temporal,
    "i" for immutable) and is built against the matching track of ``source``.
    """
    temporal_track: Track = Track.build(
        target_spec("t", data_type), source.temporal, "temporal")
    immutable_track: Track = Track.build(
        target_spec("i", data_type), source.immutable, "immutable")
    return Schema(temporal_track, immutable_track, name="target", source=source)
def standalone(cls, context: Context, schema_name: str, output_prefix: str,
               t_group: Optional[VariableId], i_group: Optional[VariableId],
               exclude_trivial: bool = False) -> None:
    """Load a schema from ``context.schemas_dir`` and run coverage over the
    entities in ``context.entities_input_dir``.

    The schema must load successfully; an instance of ``cls`` is then built
    and invoked.
    """
    loaded: Optional[Schema] = Schema.load(schema_name, context.schemas_dir)
    assert loaded is not None
    # TODO Refactor so unnecessary arguments aren't required.
    runner: "CoverageFile" = cls(context, loaded, output_prefix, t_group,
                                 i_group, exclude_trivial)
    runner(context.entities_input_dir, None)