def stream_mapping(
    infile: Path, outfile: Path, mapping_yaml: Path, sign: bool = True
) -> None:
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))

    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()

def sign(infile: Path, outfile: Path, signature: Optional[str]) -> None:
    ns = Namespace(signature)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                signed = ns.apply(entity)
                write_entity(outfh, signed)
    except BrokenPipeError:
        raise click.Abort()

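# Illustrative only (not part of the CLI): a minimal sketch of what signing
# does to a single entity, assuming Namespace and model are imported as used
# above. The function name, dataset name, and entity values are made up.
def _example_sign_one_entity() -> None:
    ns = Namespace("my-dataset")
    entity = model.make_entity("Person")
    entity.id = "abc123"
    entity.add("name", "Jane Doe")
    # apply() re-keys the entity ID with a signature derived from "my-dataset",
    # so IDs from different datasets cannot collide after aggregation.
    signed = ns.apply(entity)
    print(signed.id)
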
def import_vis(infile: Path, outfile: Path) -> None:
    with path_writer(outfile) as outfh:
        with open(infile, "r") as infh:
            data = json.load(infh)
            if "entities" in data:
                entities = data.get("entities", data)
            if "layout" in data:
                entities = data.get("layout", {}).get("entities", data)
            for entity_data in ensure_list(entities):
                entity = EntityProxy.from_dict(model, entity_data)
                write_entity(outfh, entity)

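# Illustrative only: the two JSON shapes import_vis accepts, inferred from the
# branches above. File names, helper name, and entity values are made up.
def _example_import_vis_inputs(tmpdir: Path) -> None:
    person = {"id": "p1", "schema": "Person", "properties": {"name": ["Jane Doe"]}}
    flat = {"entities": [person]}                # entities at the top level
    nested = {"layout": {"entities": [person]}}  # entities under a "layout" key
    for name, payload in (("flat.vis", flat), ("nested.vis", nested)):
        source = tmpdir / name
        source.write_text(json.dumps(payload))
        import_vis(source, tmpdir / (name + ".ftm.json"))
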
def validate(infile: Path, outfile: Path) -> None:
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy, cleaned=False):
                clean = model.make_entity(entity.schema)
                clean.id = entity.id
                for (prop, value) in entity.itervalues():
                    clean.add(prop, value)
                write_entity(outfh, clean)
    except BrokenPipeError:
        raise click.Abort()

def sieve(
    infile: Path,
    outfile: Path,
    schema: Iterable[str],
    property: Iterable[str],
    type: Iterable[str],
) -> None:
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                sieved = sieve_entity(entity, schema, property, type)
                if sieved is not None:
                    write_entity(outfh, sieved)
    except BrokenPipeError:
        raise click.Abort()

def aggregate(infile: Path, outfile: Path) -> None:
    buffer: Dict[str, EntityProxy] = {}
    namespace = Namespace(None)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                entity = namespace.apply(entity)
                if entity.id in buffer:
                    buffer[entity.id].merge(entity)
                else:
                    buffer[entity.id] = entity

            for entity in buffer.values():
                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()

def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                ns = Namespace(dataset)
                for mapping in keys_values(meta, "queries", "query"):
                    entities = model.map_entities(mapping, key_prefix=dataset)
                    for entity in entities:
                        if sign:
                            entity = ns.apply(entity)
                        write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))

def sorted_aggregate(path: Path, outpath: Path, entity_type: Type[E]) -> None:
    """Aggregate entities based on the premise that the fragments in the
    source stream are sorted by their ID."""
    entity: Optional[E] = None
    with path_writer(outpath) as outfh:
        for next_entity in path_entities(path, entity_type=entity_type):
            if entity is None:
                entity = next_entity
                continue
            if next_entity.id == entity.id:
                entity = entity.merge(next_entity)
                continue
            write_entity(outfh, entity)
            entity = next_entity
        if entity is not None:
            write_entity(outfh, entity)
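
# Illustrative only: how the two aggregation strategies differ in practice.
# aggregate() buffers every entity in memory, so it handles unsorted input but
# its memory use grows with the number of distinct entity IDs; sorted_aggregate()
# keeps only the current entity in memory and relies on the input fragments
# already being sorted by ID. The helper name, paths, and the choice of
# EntityProxy as the entity type are assumptions for the example.
def _example_aggregation(unsorted: Path, pre_sorted: Path, outdir: Path) -> None:
    aggregate(unsorted, outdir / "merged.json")
    sorted_aggregate(pre_sorted, outdir / "merged_sorted.json", EntityProxy)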