def execution_order(self) -> int:
    """All relations can be ordered to load properly in series based on their dependencies."""
    if self._execution_order is None:
        raise ETLRuntimeError(
            "execution order unknown for RelationDescription '{0.identifier}'".format(self)
        )
    return self._execution_order
def execution_level(self) -> int:
    """All relations of the same level may be loaded in parallel."""
    if self._execution_level is None:
        raise ETLRuntimeError(
            "execution level unknown for RelationDescription '{0.identifier}'".format(self)
        )
    return self._execution_level
def get_table(self, create_if_not_exists=True):
    """Get table reference from DynamoDB or create it (within a new session)."""
    session = boto3.session.Session(region_name=self.region_name)
    dynamodb = session.resource("dynamodb")
    try:
        table = dynamodb.Table(self.table_name)
        status = table.table_status
        logger.info("Found existing events table '%s' in DynamoDB (status: %s)", self.table_name, status)
    except botocore.exceptions.ClientError as exc:
        # Check whether this is just a ResourceNotFoundException (sadly a 400, not a 404).
        if exc.response["ResponseMetadata"]["HTTPStatusCode"] != 400:
            raise
        # Nullify assignment and start over.
        table = None
        status = None
    if not (status == "ACTIVE" or create_if_not_exists):
        raise ETLRuntimeError("DynamoDB table '%s' does not exist or is not active" % self.table_name)
    if table is None:
        logger.info("Creating DynamoDB table: '%s'", self.table_name)
        table = dynamodb.create_table(
            TableName=self.table_name,
            KeySchema=[
                {"AttributeName": "target", "KeyType": "HASH"},
                {"AttributeName": "timestamp", "KeyType": "RANGE"},
            ],
            AttributeDefinitions=[
                {"AttributeName": "target", "AttributeType": "S"},
                {"AttributeName": "timestamp", "AttributeType": "N"},
            ],
            ProvisionedThroughput={
                "ReadCapacityUnits": self.initial_read_capacity,
                "WriteCapacityUnits": self.initial_write_capacity,
            },
        )
        status = table.table_status
    if status != "ACTIVE":
        logger.info("Waiting for events table '%s' to become active", self.table_name)
        table.wait_until_exists()
    logger.debug("Finished creating or updating events table '%s' (arn=%s)", self.table_name, table.table_arn)
    return table
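# Hedged usage sketch (not from the source): recording one event in the table returned by
# get_table(). Only the "target" (string hash key) and "timestamp" (numeric range key) come
# from the schema above; the caller object, the extra "event" attribute, and all values are
# made up for illustration.
def put_event_example(events) -> None:
    table = events.get_table(create_if_not_exists=True)
    table.put_item(
        Item={
            "target": "www.orders",    # hash key, attribute type "S"
            "timestamp": 1700000000,   # range key, attribute type "N" (epoch seconds here)
            "event": "load finished",  # hypothetical extra attribute
        }
    )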
def alter_password(cx, user, ignore_missing_password=False):
    password = _get_encrypted_password(cx, user)
    if password is None:
        logger.warning("Failed to find password in PGPASSFILE for '%s'", user)
        if not ignore_missing_password:
            raise ETLRuntimeError("password missing from PGPASSFILE for user '{}'".format(user))
        return
    return execute(cx, """ALTER USER "{}" PASSWORD %s""".format(user), (password,))
def create_user(cx, user, group):
    password = _get_encrypted_password(cx, user)
    if password is None:
        logger.warning("Missing entry in PGPASSFILE file for '%s'", user)
        raise ETLRuntimeError("password missing from PGPASSFILE for user '{}'".format(user))
    execute(cx, """CREATE USER "{}" IN GROUP "{}" PASSWORD %s""".format(user, group), (password,))
def upload_files(files: Sequence[Tuple[str, str]], bucket_name: str, prefix: str, dry_run=False) -> None:
    """
    Upload local files to S3 from "local_name" to "s3://bucket_name/prefix/remote_name".

    The sequence of files must consist of tuples of ("local_name", "remote_name").
    """
    max_workers = min(len(files), 10)
    timer = etl.timer.Timer()
    common_path = _keep_common_path([object_key for _, object_key in files])
    description = "Uploading files to S3" if not dry_run else "Dry-run: Uploading files to S3"
    tqdm_bar = tqdm(desc=description, disable=None, leave=False, total=len(files), unit="file")
    uploader = S3Uploader(bucket_name, callback=tqdm_bar.update, dry_run=dry_run)

    # We break out the futures to be able to easily tally up errors.
    futures: List[concurrent.futures.Future] = []
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max_workers, thread_name_prefix="sync-parallel"
    ) as executor:
        for local_filename, remote_filename in files:
            futures.append(executor.submit(uploader.__call__, local_filename, f"{prefix}/{remote_filename}"))

    errors = 0
    for future in concurrent.futures.as_completed(futures):
        exception = future.exception()
        if exception is not None:
            logger.error("Failed to upload file: %s", exception)
            errors += 1
    tqdm_bar.close()

    what_happened = "Uploaded" if not dry_run else "Dry-run: Skipped uploading"
    logger.info(
        f"{what_happened} %d of %d file(s) to 's3://%s/%s/%s' using %d threads (%s)",
        len(files) - errors,
        len(files),
        bucket_name,
        prefix,
        common_path,
        max_workers,
        timer,
    )
    if errors:
        raise ETLRuntimeError(f"There were {errors} error(s) during upload")
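# Hedged usage sketch: calling upload_files() with a couple of design files. The file names,
# bucket, and prefix below are illustrative placeholders; the function expects
# ("local_name", "remote_name") tuples as documented in its docstring.
def upload_example(dry_run: bool = True) -> None:
    example_files = [
        ("schemas/www/orders.yaml", "schemas/www/orders.yaml"),
        ("schemas/www/orders.sql", "schemas/www/orders.sql"),
    ]
    upload_files(example_files, bucket_name="example-etl-bucket", prefix="envs/dev", dry_run=dry_run)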
def _get_encrypted_password(cx, user):
    dsn_complete = dict(kv.split("=") for kv in cx.dsn.split(" "))
    dsn_partial = {key: dsn_complete[key] for key in ["host", "port", "dbname"]}
    dsn_user = dict(dsn_partial, user=user)
    password = pgpasslib.getpass(**dsn_user)
    if password is None:
        logger.warning("Missing line in .pgpass file: '%(host)s:%(port)s:%(dbname)s:%(user)s:<password>'", dsn_user)
        raise ETLRuntimeError("password missing from PGPASSFILE for '{}'".format(user))
    md5 = hashlib.md5()
    md5.update((password + user).encode())
    return "md5" + md5.hexdigest()
def copy_using_manifest(
    conn: connection,
    table_name: TableName,
    column_list: List[str],
    s3_uri: str,
    aws_iam_role: str,
    data_format: Optional[str] = None,
    format_option: Optional[str] = None,
    file_compression: Optional[str] = None,
    compupdate="ON",
    dry_run=False,
) -> None:
    credentials = "aws_iam_role={}".format(aws_iam_role)
    data_format_parameters = determine_data_format_parameters(data_format, format_option, file_compression)

    copy_stmt = """
        COPY {table} (
            {columns}
        )
        FROM %s
        CREDENTIALS %s MANIFEST
        {data_format_parameters}
        TIMEFORMAT AS 'auto'
        DATEFORMAT AS 'auto'
        TRUNCATECOLUMNS
        STATUPDATE OFF
        COMPUPDATE {compupdate}
        """.format(
        table=table_name,
        columns=join_with_double_quotes(column_list),
        data_format_parameters=data_format_parameters,
        compupdate=compupdate,
    )

    if dry_run:
        logger.info("Dry-run: Skipping copying data into '%s' using '%s'", table_name.identifier, s3_uri)
        etl.db.skip_query(conn, copy_stmt, (s3_uri, credentials))
    else:
        logger.info("Copying data into '%s' using '%s'", table_name.identifier, s3_uri)
        try:
            with log_load_error(conn):
                etl.db.execute(conn, copy_stmt, (s3_uri, credentials))
        except psycopg2.InternalError as exc:
            if exc.pgcode == "XX000":
                raise ETLRuntimeError(exc) from exc
            else:
                raise TransientETLError(exc) from exc
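# Hedged usage sketch: invoking copy_using_manifest() with a manifest listing the data files
# for one table. The table name, columns, S3 URI, IAM role, and format values are illustrative
# assumptions, as is the TableName(...) construction; only the function's own signature above
# is taken from the source.
def copy_example(conn: connection) -> None:
    copy_using_manifest(
        conn,
        table_name=TableName("www", "orders"),
        column_list=["order_id", "customer_id", "ordered_at"],
        s3_uri="s3://example-etl-bucket/envs/dev/data/www/orders.manifest",
        aws_iam_role="arn:aws:iam::123456789012:role/example-redshift-copy",
        data_format="CSV",
        file_compression="GZIP",
        dry_run=True,
    )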
def _get_encrypted_password(cx, user) -> Optional[str]:
    """Return MD5-hashed password if an entry is found in PGPASSFILE or None otherwise."""
    dsn_complete = dict(kv.split("=") for kv in cx.dsn.split(" "))
    dsn_partial = {key: dsn_complete[key] for key in ["host", "port", "dbname"]}
    dsn_user = dict(dsn_partial, user=user)
    try:
        password = pgpasslib.getpass(**dsn_user)
    except pgpasslib.FileNotFound as exc:
        logger.info("Create the file using 'touch ~/.pgpass && chmod go= ~/.pgpass'")
        raise ETLRuntimeError("PGPASSFILE file is missing") from exc
    except pgpasslib.InvalidPermissions as exc:
        logger.info("Update the permissions using: 'chmod go= ~/.pgpass'")
        raise ETLRuntimeError("PGPASSFILE file has invalid permissions") from exc
    if password is None:
        return None
    md5 = hashlib.md5()
    md5.update((password + user).encode())
    return "md5" + md5.hexdigest()
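# Hedged note (a sketch, not from the source): the value returned above follows the
# PostgreSQL/Redshift convention for pre-hashed passwords, "md5" + md5(password + username),
# which is why it can be passed directly to CREATE USER / ALTER USER ... PASSWORD.
# The password and user below are made-up values.
def _md5_password_format_example() -> str:
    import hashlib

    password, user = "secret", "etl_user"  # illustrative values only
    return "md5" + hashlib.md5((password + user).encode()).hexdigest()  # 35 characters total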
def load_config(config_files: Sequence[str], default_file: str = "default_settings.yaml") -> None:
    """
    Load settings and environment from config files (starting with the default, if provided)
    and set our global settings.

    The settings are validated against their schema.
    If the config "file" is actually a directory, (try to) read all the files in that directory.
    """
    settings = dict()  # type: Dict[str, Any]
    count_settings = 0
    for filename in yield_config_files(config_files, default_file):
        if filename.endswith(".sh"):
            load_environ_file(filename)
        elif filename.endswith((".yaml", ".yml")):
            load_settings_file(filename, settings)
            count_settings += 1
        else:
            logger.info("Skipping unknown config file '%s'", filename)

    # Need to load at least the defaults and some installation-specific file:
    if count_settings < 2:
        raise ETLRuntimeError("Failed to find enough configuration files (need at least default and local config)")

    validate_with_schema(settings, "settings.schema")

    # If 'today' and 'yesterday' are not set already, pick the actual values of "today" and "yesterday" (wrt UTC).
    today = datetime.datetime.utcnow().date()
    date_settings = settings.setdefault("date", {})
    date_settings.setdefault("today", today.strftime("%Y/%m/%d"))  # Render date to look like part of a path
    date_settings.setdefault("yesterday", (today - datetime.timedelta(days=1)).strftime("%Y/%m/%d"))

    global _mapped_config
    _mapped_config = _build_config_map(settings)

    global _dw_config
    _dw_config = etl.config.dw.DataWarehouseConfig(settings)

    set_config_value("version", package_version())
def load_config(config_files: Iterable[str], default_file: str = "default_settings.yaml") -> None:
    """
    Load settings and environment from config files and set our global settings.

    The default, if provided, is always the first file to be loaded.
    If the config "file" is actually a directory, (try to) read all the files in that directory.
    The settings are validated against their schema.
    """
    settings: Dict[str, Any] = dict()
    count_settings = 0
    for filename in yield_config_files(config_files, default_file):
        if filename.endswith(".sh"):
            load_environ_file(filename)
        elif filename.endswith((".yaml", ".yml")):
            load_settings_file(filename, settings)
            count_settings += 1
        else:
            logger.debug("Skipping unknown config file '%s'", filename)

    # Need to load at least the defaults and some installation specific file:
    if count_settings < 2:
        raise ETLRuntimeError("failed to find enough configuration files (need at least default and local config)")

    validate_with_schema(settings, "settings.schema")

    # Set values for 'date.today' and 'date.yesterday' (in case they aren't set already).
    # The values are wrt current UTC and look like a path, e.g. '2017/05/16'.
    today = datetime.datetime.utcnow().date()
    date_settings = settings.setdefault("date", {})
    date_settings.setdefault("today", today.strftime("%Y/%m/%d"))
    date_settings.setdefault("yesterday", (today - datetime.timedelta(days=1)).strftime("%Y/%m/%d"))

    global _dw_config
    _dw_config = etl.config.dw.DataWarehouseConfig(settings)

    global _mapped_config
    _mapped_config = _build_config_map(settings)
    if _mapped_config is not None:
        _mapped_config["data_warehouse.owner.name"] = _dw_config.owner.name

    set_config_value("version", package_version())
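# Hedged usage sketch: loading the packaged defaults plus an installation-specific config
# directory. The "config" path is an illustrative placeholder; load_config() accepts files or
# directories and requires at least two YAML settings files (default plus local) to succeed.
def load_example_config() -> None:
    load_config(["config"], default_file="default_settings.yaml")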
def sync_with_s3(relations: List[RelationDescription], bucket_name: str, prefix: str, dry_run: bool = False) -> None:
    """Copy (validated) table design and SQL files from local directory to S3 bucket."""
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    files = []  # type: List[Tuple[str, str]]
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name:
                relation_files.append(relation.sql_file_name)
            else:
                raise MissingQueryError("Missing matching SQL file for '%s'" % relation.design_file_name)
        for file_name in relation_files:
            local_filename = relation.norm_path(file_name)
            remote_filename = os.path.join(prefix, local_filename)
            files.append((local_filename, remote_filename))

    uploader = etl.s3.S3Uploader(bucket_name, dry_run=dry_run)
    with Timer() as timer:
        futures = []  # type: List[concurrent.futures.Future]
        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader.__call__, local_filename, remote_filename))
        errors = 0
        for future in concurrent.futures.as_completed(futures):
            exception = future.exception()
            if exception is not None:
                logger.error("Failed to upload file: %s", exception)
                errors += 1

    if not dry_run:
        logger.info(
            "Uploaded %d of %d file(s) to 's3://%s/%s' (%s)",
            len(files) - errors,
            len(files),
            bucket_name,
            prefix,
            timer,
        )
    if errors:
        raise ETLRuntimeError("There were {:d} error(s) during upload".format(errors))
def initial_setup(config, with_user_creation=False, force=False, dry_run=False):
    """
    Place named data warehouse database into initial state.

    This destroys the contents of the targeted database.
    You have to set `force` to true if the name of the database doesn't start with 'validation'.
    Optionally use `with_user_creation` flag to create users and groups.
    """
    try:
        database_name = config.dsn_etl["database"]
    except (KeyError, ValueError) as exc:
        raise ETLConfigError("could not identify database initialization target") from exc

    if database_name.startswith("validation"):
        logger.info("Initializing validation database '%s'", database_name)
    elif force:
        logger.info("Initializing non-validation database '%s' forcefully as requested", database_name)
    else:
        raise ETLRuntimeError(
            "Refused to initialize non-validation database '%s' without the --force option" % database_name
        )

    # Create all defined users, which includes the ETL user needed before the next step
    # (so that the database is owned by ETL).
    if with_user_creation:
        with closing(etl.db.connection(config.dsn_admin, autocommit=True, readonly=dry_run)) as conn:
            for user in config.users:
                _create_or_update_cluster_user(conn, user, dry_run=dry_run)

    if dry_run:
        logger.info(
            "Dry-run: Skipping drop and create of database '%s' with owner '%s'", database_name, config.owner
        )
    else:
        admin_dev_conn = etl.db.connection(config.dsn_admin, autocommit=True)
        with closing(admin_dev_conn):
            logger.info("Dropping and creating database '%s' with owner '%s'", database_name, config.owner)
            etl.db.drop_and_create_database(admin_dev_conn, database_name, config.owner)

    admin_target_db_conn = etl.db.connection(config.dsn_admin_on_etl_db, autocommit=True, readonly=dry_run)
    with closing(admin_target_db_conn):
        if dry_run:
            logger.info("Dry-run: Skipping drop of PUBLIC schema in '%s'", database_name)
        else:
            logger.info("Dropping PUBLIC schema in '%s'", database_name)
            etl.db.drop_schema(admin_target_db_conn, "PUBLIC")
        if with_user_creation:
            for user in config.users:
                if user.schema:
                    _create_schema_for_user(admin_target_db_conn, user, config.groups[0], dry_run=dry_run)
                _update_search_path(admin_target_db_conn, user, dry_run=dry_run)
def is_required(self) -> bool:
    """Return whether this relation is required; raises if that state has not been determined yet."""
    if self._is_required is None:
        raise ETLRuntimeError(
            "state of 'is_required' unknown for RelationDescription '{0.identifier}'".format(self)
        )
    return self._is_required