def render_string(template_name: str, template_type: str, compact=False) -> str:
    """
    Replace template ${strings} by configuration values and return the result.

    If the template file is JSON- or YAML-formatted, the rendered output is
    re-serialized so that it comes out as nicely formatted JSON.
    """
    templates = _find_templates(template_type)
    if template_name not in templates:
        raise InvalidArgumentError("template name not found: '{}'".format(template_name))
    filename = templates[template_name]
    logger.info("Rendering template '%s' from file '%s'", template_name, filename)
    raw_text = pkg_resources.resource_string("etl", filename).decode()
    rendered = render_from_config(raw_text, context="'{}'".format(filename))
    if not filename.endswith((".json", ".yaml", ".yml")):
        return rendered
    # Always load as YAML in order to support comments.
    parsed = yaml.safe_load(rendered)
    # But since we don't support anything that couldn't be done in JSON,
    # dump as the (prettier) JSON format.
    dump_options = {"separators": (",", ":")} if compact else {"indent": " "}
    return json.dumps(parsed, sort_keys=True, **dump_options) + "\n"
def render(template_name: str, compact=False) -> None:
    """
    Replace template ${strings} by configuration values and print the result.

    If the template file is JSON- or YAML-formatted, the rendered output is
    re-serialized so that it is printed as nicely formatted JSON.
    """
    resource_name = _find_templates()
    if template_name not in resource_name:
        raise InvalidArgumentError("template name not found: '{}'".format(template_name))
    filename = resource_name[template_name]
    logger.info("Rendering template '%s' from file '%s'", template_name, filename)
    original = pkg_resources.resource_string("etl", filename).decode()
    rendered = render_from_config(original, context="'{}'".format(filename))
    if filename.endswith((".json", ".yaml", ".yml")):
        # Always load as YAML in order to support comments.
        obj = yaml.safe_load(rendered)
        # But since we don't support anything that couldn't be done in JSON, dump the
        # (prettier) JSON format. Use the stdlib json module (not simplejson) for
        # consistency with render_string(); output is identical for these options.
        if compact:
            print(json.dumps(obj, separators=(",", ":"), sort_keys=True))
        else:
            print(json.dumps(obj, indent=" ", sort_keys=True))
    else:
        # Non-structured templates are printed verbatim (they carry their own newline).
        print(rendered, end="")
def show_vars(names: List[str]) -> None:
    """
    List "variables" with values.

    With an empty selection, every known configuration setting is shown as a
    "variable" with its value; otherwise only settings matching one of the
    given glob patterns are shown.
    """
    config_mapping = etl.config.get_config_map()
    all_keys = sorted(config_mapping)
    if not names:
        keys = all_keys
    else:
        chosen = set()
        for pattern in names:
            # fnmatch.filter is equivalent to testing each key with fnmatch.fnmatch.
            hits = fnmatch.filter(all_keys, pattern)
            if not hits:
                raise InvalidArgumentError("no matching setting for '{}'".format(pattern))
            chosen.update(hits)
        keys = sorted(chosen)
    values = [config_mapping[key] for key in keys]
    print(etl.text.format_lines(zip(keys, values), header_row=["Name", "Value"]))
def show_value(name: str, default: Optional[str]) -> None:
    """
    Print the value of a specific configuration variable.

    Raises an error if the variable is not set and no default is provided.
    """
    found = etl.config.get_config_value(name, default)
    if found is None:
        raise InvalidArgumentError("setting '{}' has no value".format(name))
    print(found)
def get_config_int(name: str, default: Optional[int] = None) -> int:
    """
    Look up a configuration value that is an integer.

    Raises an error if the value (even after applying the default) is None.
    """
    if default is None:
        value = get_config_value(name)
    else:
        # The lookup API takes string defaults, so convert before passing it down.
        value = get_config_value(name, str(default))
    if value is None:
        raise InvalidArgumentError("missing config for {}".format(name))
    return int(value)
def show_vars(name: Optional[str]) -> None:
    """
    List the known configuration settings as "variables" with their values.

    When a name (glob pattern) is given, only the matching variables are listed.
    """
    config_mapping = etl.config.get_config_map()
    all_keys = sorted(config_mapping)
    if name is None:
        keys = all_keys
    else:
        # fnmatch.filter keeps the (sorted) input order of all_keys.
        keys = fnmatch.filter(all_keys, name)
        if not keys:
            raise InvalidArgumentError("no matching setting for '{}'".format(name))
    values = [config_mapping[key] for key in keys]
    print(etl.text.format_lines(zip(keys, values), header_row=["Name", "Value"]))
def scan_etl_events(etl_id, comma_separated_columns) -> None:
    """Scan for all events belonging to a specific ETL."""
    # Connect to the DynamoDB events table; do not create it if it is missing.
    ddb = DynamoDBStorage.factory()
    table = ddb.get_table(create_if_not_exists=False)
    all_keys = ["target", "step", "event", "timestamp", "elapsed", "rowcount"]
    if comma_separated_columns:
        selected_columns = comma_separated_columns.split(",")
        invalid_columns = [key for key in selected_columns if key not in all_keys]
        if invalid_columns:
            raise InvalidArgumentError("invalid column(s): {}".format(",".join(invalid_columns)))
        # We will always select "target" and "event" to have a meaningful output.
        selected_columns = frozenset(selected_columns).union(["target", "event"])
        # Preserve the canonical column order from all_keys, not the user's order.
        keys = [key for key in all_keys if key in selected_columns]
    else:
        keys = all_keys
    # We need to scan here since the events are stored by "target" and not by "etl_id".
    # TODO Try to find all the "known" relations and query on them with a filter on the etl_id.
    client = boto3.client("dynamodb")
    paginator = client.get_paginator("scan")
    response_iterator = paginator.paginate(
        TableName=table.name,
        ConsistentRead=False,
        # "timestamp" is a DynamoDB reserved word, so it needs an attribute-name alias.
        ExpressionAttributeNames={"#timestamp": "timestamp"},
        ExpressionAttributeValues={
            ":etl_id": {
                "S": etl_id
            },
            ":marker": {
                "S": _DUMMY_TARGET
            },
            ":start_event": {
                "S": STEP_START
            },
        },
        # Skip the dummy marker rows and "start" events; only finished steps are reported.
        FilterExpression=
        "etl_id = :etl_id and target <> :marker and event <> :start_event",
        ProjectionExpression=
        "target, step, event, #timestamp, elapsed, extra.rowcount",
        ReturnConsumedCapacity="TOTAL",
        # PaginationConfig={
        #     "PageSize": 100
        # }
    )
    logger.info("Scanning events table for elapsed times")
    consumed_capacity = 0.0
    scanned_count = 0
    rows = []  # type: List[List[str]]
    for response in response_iterator:
        consumed_capacity += response["ConsumedCapacity"]["CapacityUnits"]
        scanned_count += response["ScannedCount"]
        items = [_flatten_scan_result(item) for item in response["Items"]]
        # Backwards compatibility kludge: Be careful picking out the rowcount which may not be present in older tables.
        # NOTE(review): when "extra" exists but lacks a "rowcount" key this raises KeyError — confirm
        # that every "extra" written by newer code always carries "rowcount".
        items = [{
            key:
            item.get("extra", {"rowcount": None})["rowcount"]
            if key == "rowcount" else item[key]
            for key in keys
        } for item in items]
        rows.extend([_format_output_column(key, item[key]) for key in keys]
                    for item in items)
    logger.info("Scan result: scanned count = %d, consumed capacity = %f",
                scanned_count, consumed_capacity)
    # Sort chronologically when the timestamp column was selected, otherwise by target.
    if "timestamp" in keys:
        rows.sort(key=itemgetter(keys.index("timestamp")))
    else:
        rows.sort(key=itemgetter(keys.index("target")))
    print(etl.text.format_lines(rows, header_row=keys))
def get_config_list(name: str) -> List[int]:
    """Look up a configuration value that is a comma-separated list of integers."""
    value = get_config_value(name)
    if value is None:
        raise InvalidArgumentError("missing config for {}".format(name))
    return [int(item) for item in value.split(",")]
def select_in_execution_order(
    relations: Sequence[RelationDescription],
    selector: TableSelector,
    include_dependents=False,
    include_immediate_views=False,
    continue_from: Optional[str] = None,
) -> List[RelationDescription]:
    """
    Return list of relations that were selected, optionally adding dependents or skipping forward.

    The values supported for skipping forward are:
    - '*' to start from the beginning
    - ':transformations' to only run transformations of selected relations
    - a specific relation to continue from that one in the original execution order
    - a specific schema to include all relations in that source schema as well as
      any originally selected transformation

    Note that these operate on the list of relations selected by the selector patterns.
    The option of '*' exists so we can have a default value in our pipeline definitions.
    The last option of specifying a schema is most useful with a source schema when you
    want to restart the load step followed by all transformations.

    No error is raised when the selector does not select any relations.
    An error is raised when the "continue from" condition does not resolve to a list of relations.
    """
    logger.info("Pondering execution order of %d relation(s)", len(relations))
    # Establish dependency order first; all later filtering preserves this order.
    execution_order = order_by_dependencies(relations)
    selected = find_matches(execution_order, selector)
    if not selected:
        logger.warning("Found no relations matching: %s", selector)
        return []
    if include_dependents:
        # Widen the selection with everything downstream of the selected relations.
        dependents = find_dependents(execution_order, selected)
        combined = frozenset(selected).union(dependents)
        selected = [
            relation for relation in execution_order if relation in combined
        ]
    elif include_immediate_views:
        # Only pull in views that directly depend on the selection.
        immediate_views = find_immediate_dependencies(execution_order, selector)
        combined = frozenset(selected).union(immediate_views)
        selected = [
            relation for relation in execution_order if relation in combined
        ]
    if continue_from is None or continue_from == "*":
        return selected
    transformations = [
        relation for relation in selected if relation.is_transformation
    ]
    if continue_from in (":transformations", ":transformation"):
        if transformations:
            logger.info(
                "Continuing with %d transformation(s) in selected relations",
                len(transformations))
            return transformations
        raise InvalidArgumentError("found no transformations to continue from")
    logger.info("Trying to fast forward to '%s' within %d relation(s)",
                continue_from, len(selected))
    # Case: continue_from names a specific relation -- drop everything before it.
    starting_from_match = list(
        fy.dropwhile(lambda relation: relation.identifier != continue_from,
                     selected))
    if starting_from_match:
        logger.info(
            "Continuing with %d relation(s) after skipping %d",
            len(starting_from_match),
            len(selected) - len(starting_from_match),
        )
        return starting_from_match
    # Case: continue_from names a schema -- take its relations plus all selected transformations.
    single_schema = frozenset(
        fy.filter(lambda relation: relation.source_name == continue_from,
                  selected))
    if single_schema.intersection(transformations):
        # A schema restart is only meaningful for source schemas, which hold no transformations.
        raise InvalidArgumentError(
            f"schema '{continue_from}' contains transformations")
    if single_schema:
        combined = single_schema.union(transformations)
        logger.info(
            "Continuing with %d relation(s) in '%s' and %d transformation(s)",
            len(single_schema),
            continue_from,
            len(combined) - len(single_schema),
        )
        # Re-filter against execution_order to restore dependency order.
        return [
            relation for relation in execution_order if relation in combined
        ]
    raise InvalidArgumentError("found no matching relations to continue from")