def _build_row_types_processor(types, headers_inverse): type_processors = [] if not types: return type_processors saw_header = False for i, col_type in enumerate(types): if isinstance(col_type, parser.TypeDefData): if saw_header: raise ProcessException( "Cannot specify types for indexed columns after named columns" ) type_def = col_type optional_type = type_def.optional optional_ref = optional_type default = type_def.default elif isinstance(col_type, parser.KeyToValueData): saw_header = True type_def = col_type.value optional_type = type_def.optional default = type_def.default if isinstance(col_type.key, parser.LocData): optional_ref = col_type.key.optional i = col_type.key.value if i > 0: i -= 1 elif isinstance(col_type.key, parser.IdData): optional_ref = col_type.key.optional header = col_type.key.value i = headers_inverse[header] else: raise ProcessException(f"Unknown key type: {col_type.key}") else: raise ProcessException(f"Unknown col_type: {col_type}") row_type_processor = _build_type_processor(type_def.type) if default is not NO_DEFAULT and not (optional_ref or optional_type): optional_ref = True optional_type = True if default is NO_DEFAULT: default = None else: try: default = ast.literal_eval(default) except Exception as e: raise ProcessException( f"Invalid default literal value: {default}") from e type_processors.append( (i, row_type_processor, optional_ref, optional_type, default)) return type_processors
def _build_type_processor(t): if t in {"_", "s"}: return str elif t == "f": return float elif t == "i": return int elif t == "b": return lambda v: str(v).lower() in {"true", "yes", "y", "on", "1"} elif t == "j": return lambda v: json.loads(str(v)) elif t == "l": return lambda v: ast.literal_eval(str(v)) elif t == "d": raise ProcessException("Dates are not supported yet") else: raise ProcessException(f"Unsupported column type: {t}")
def run_jq(text, args):
    """Run the external ``jq`` program and return its decoded output.

    Args:
        text: Input document as a string. It is encoded as UTF-8 before being
            passed to the process; a falsy value passes ``input=None``.
        args: jq filter expression; a falsy value falls back to ``"."``.

    Returns:
        The decoded output of ``jq -c <filter>``. stderr is merged into
        stdout.

    Raises:
        ProcessException: If the ``jq`` executable cannot be found, or if it
            exits with a non-zero status (the message is then jq's own
            output).
    """
    command = ["jq", "-c", args or "."]
    payload = bytes(text, encoding="utf-8") if text else None
    try:
        output = subprocess.check_output(
            command,
            stderr=subprocess.STDOUT,
            input=payload,
        )
    except FileNotFoundError as e:
        # A missing binary is an environment problem the user can fix; report
        # it through the same exception type as every other jq failure.
        raise ProcessException(
            "jq executable not found; is jq installed and on PATH?") from e
    except subprocess.CalledProcessError as e:
        raise ProcessException(e.stdout.decode()) from e
    return output.decode()
def _extract_structure_field_name(field: parser.ParseData, headers):
    """Derive the name a structure field is stored under.

    Args:
        field: A ``parser.KeyToValueData`` (its key's value is the name) or a
            ``parser.RefData`` (the name is built from its path parts).
        headers: Container looked up with a column index to obtain a header
            name — presumably a mapping of index to header; verify against
            the caller.

    Returns:
        For a key/value field, the key's value. For a reference, the path
        parts joined with ``"."``, where a leading positional part is replaced
        by the matching header when one exists.

    Raises:
        ProcessException: If ``field`` or the first part of a reference has
            an unexpected kind.
    """
    if isinstance(field, parser.KeyToValueData):
        return field.key.value
    if not isinstance(field, parser.RefData):
        raise ProcessException(f"Unexpected field: {field}")

    head, tail = field.value[0], field.value[1:]
    if isinstance(head, parser.LocData):
        position = head.value
        # Positive positions are shifted down by one; the others are offset
        # by the number of headers (so -1 addresses the last header).
        position = position - 1 if position > 0 else position + len(headers)
        # Fall back to the number as written when no header matches.
        head_name = headers[position] if position in headers else str(head.value)
    elif isinstance(head, parser.IdData):
        head_name = head.value
    else:
        raise ProcessException(f"Unexpected first field: {head}")

    tail_names = [str(part.value) for part in tail]
    return ".".join([head_name] + tail_names)
def get(self, processed_cmd: ProcessedCommand, safe=False):
    """Resolve the entries configured on ``processed_cmd`` for this registry.

    The configs are read from the attribute of ``processed_cmd`` named by
    ``self.attr``; each config's ``alias`` is looked up in ``self.data``.

    Args:
        processed_cmd: Command whose ``raw`` flag selects the fallback entry
            and whose ``self.attr`` attribute lists the configs.
        safe: When true, an unregistered alias makes the whole call return
            the fallback list instead of raising.

    Returns:
        A single-element list holding the fallback entry when no configs are
        present (or on a ``safe`` miss); otherwise one item per config. Entries
        that are classes derived from ``self.type`` are instantiated with the
        config's ``args``; anything else is returned as stored.

    Raises:
        ProcessException: If an alias is not registered and ``safe`` is false.
    """
    fallback_alias = self.default_raw if processed_cmd.raw else self.default_alias
    default = [self.data[fallback_alias]]

    configs = getattr(processed_cmd, self.attr)
    if not configs:
        return default

    resolved = []
    for config in configs:
        alias, args = config.alias, config.args
        try:
            entry = self.data[alias]
        except KeyError:
            if safe:
                return default
            raise ProcessException(f"Unregistered {self.type.__name__.lower()}: {alias}")
        needs_instance = isinstance(entry, type) and issubclass(entry, self.type)
        resolved.append(entry(args) if needs_instance else entry)
    return resolved
def get(alias):
    """Return the macro registered under ``alias``.

    Raises:
        ProcessException: If ``macros`` has no entry for ``alias``.
    """
    try:
        macro = macros[alias]
    except KeyError:
        raise ProcessException(f"Unregistered macro {alias}")
    else:
        return macro
def _process_cmd(ctx, cmd) -> ProcessedCommand: cmd = cmd.strip() if cmd.startswith("@"): alias, *args = cmd.split(" ", 1) alias = alias[1:] macro = macros.get(alias) if args: args = macro["split"](args[0].strip()) cmd = macro["fn"](*args) changed = set() if cmd == ":" or (ctx.processed_command and ctx.processed_command.cmd == cmd): return ctx.processed_command, changed previous_command = ctx.processed_command result = ProcessedCommand(cmd) if cmd.startswith(":") and len(cmd) >= 2: cmd_separator = cmd[1] cmd = cmd[2:] else: cmd_separator = ";" for expression in cmd.split(cmd_separator): expression = expression.strip() if not expression: continue if expression == "h": result.has_header = True continue if expression == "r": result.raw = True continue expression_split = expression.split(":", 1) if len(expression_split) < 2: continue expression_type, expression_body = expression_split expression_type, expression_body = expression_type.strip( ), expression_body.strip() if expression_type not in "dhtsio": raise ProcessException( f"Unsupported command type: {expression_type}") elif expression_type == "d": if expression_body.startswith("\\"): expression_body = ast.literal_eval(f'"{expression_body}"') result.delimiter = expression_body or DEFAULT_CMD.delimiter elif expression_type == "t": if expression_body: result.types = parser.parse_types(expression_body) else: result.types = DEFAULT_CMD.types elif expression_type == "s": if expression_body: result.structure = parser.parse_structure(expression_body) else: result.structure = DEFAULT_CMD.structure elif expression_type == "i": if expression_body: result.inputs = parser.parse_processors(expression_body) else: result.input = DEFAULT_CMD.inputs elif expression_type == "o": if expression_body: result.outputs = parser.parse_processors(expression_body) else: result.outputs = DEFAULT_CMD.outputs else: raise ProcessException("Unexpected") previous_command.cmd = cmd ctx.processed_command = result for attr in [ "cmd", "delimiter", 
"outputs", "inputs", "structure", "types", "has_header", "raw" ]: if getattr(previous_command, attr) != getattr(result, attr): changed.add(attr) return result, changed
def impl(data: parser.ParseData, values):
    """Compile a parsed structure node into a row accessor or a key name.

    The names ``headers``, ``headers_inverse``, ``get``, ``getsafe``,
    ``replace_missing`` and ``replace_missing_from_row`` are not defined in
    this block; presumably they come from the enclosing scope.

    Args:
        data: Parsed node. ``parser.StructureData``,
            ``parser.KeyToValueData`` and ``parser.RefData`` are handled.
        values: Truthy to build the value side (a callable taking a row);
            falsy to produce the name used as a dictionary key. Structure
            nodes require a truthy ``values``.

    Returns:
        With truthy ``values``, a callable that maps a row to the list, tuple,
        set, dict or single value described by ``data``. With falsy
        ``values``, the key name for ``data``.

    Raises:
        ProcessException: For an unsupported node, structure type or path
            part, or when a reference default is not a valid Python literal.
        AssertionError: If a structure node is requested with falsy
            ``values``.
    """
    if isinstance(data, parser.StructureData):
        assert values
        if not data.fields:
            # No explicit fields: the container is built from the whole row.
            if data.type == "[]":
                return lambda row: replace_missing_from_row(row)
            if data.type == "()":
                return lambda row: tuple(replace_missing(cell) for cell in row)
            if data.type == "s()":
                return lambda row: {replace_missing(cell) for cell in row}
            if data.type in {"{}", "d()"}:
                return lambda row: dict(
                    zip(headers.values(), replace_missing_from_row(row)))
            raise ProcessException(f"Unsupported data type: {data.type}")

        # Explicit fields: compile each one, then assemble per row.
        field_values = [impl(field, values=True) for field in data.fields]
        if data.type == "[]":
            return lambda row: [build(row) for build in field_values]
        if data.type == "()":
            return lambda row: tuple(build(row) for build in field_values)
        if data.type == "s()":
            return lambda row: {build(row) for build in field_values}
        if data.type in {"{}", "d()"}:
            field_keys = [impl(field, values=False) for field in data.fields]
            return lambda row: dict(
                zip(field_keys, [build(row) for build in field_values]))
        raise ProcessException(f"Unsupported data type: {data.type}")

    if isinstance(data, parser.KeyToValueData):
        if values:
            return impl(data.value, values=True)
        return data.key.value

    if not isinstance(data, parser.RefData):
        raise ProcessException(f"Unsupported parser data: {data}")
    if not values:
        return _extract_structure_field_name(data, headers)

    path = data.value
    raw_default = data.default
    has_default = raw_default is not NO_DEFAULT
    if has_default:
        try:
            default = ast.literal_eval(raw_default)
        except Exception as e:
            raise ProcessException(
                f"Invalid default literal value: {raw_default}") from e
    else:
        default = None

    has_optional = any(
        isinstance(p, (parser.IdData, parser.LocData)) and p.optional
        for p in path)
    # A default with no explicitly optional part makes every part lenient.
    lenient = has_default and not has_optional

    steps = []
    for part in path:
        if isinstance(part, parser.IdData):
            accessor = getsafe if (part.optional or lenient) else get
            name = part.value
            # Only the leading part is applied to the row itself, so only it
            # is translated from a header name to a column index.
            target = name if steps else headers_inverse.get(name)
        elif isinstance(part, parser.LocData):
            accessor = getsafe if (part.optional or lenient) else get
            target = part.value
            # A positive leading position is shifted down by one.
            if not steps and target > 0:
                target -= 1
        else:
            raise ProcessException(f"Unexpected part {part}")
        steps.append(accessor(target, default))

    def resolve(row):
        current = row
        for step in steps:
            current = step(current)
            # Stop as soon as a step yields the default object (None when no
            # default was given).
            if current is default:
                break
        return current

    return resolve
def _parse(element, expr): try: parsed = element.parseString(expr, parseAll=True) except Exception: raise ProcessException(f"Not a valid expresion: {expr}") return parsed.asList()[0]