Example #1
 def execution_order(self) -> int:
     """All relations can be ordered to load properly in series based on their dependencies."""
     if self._execution_order is None:
         raise ETLRuntimeError(
             "execution order unknown for RelationDescription '{0.identifier}'"
             .format(self))
     return self._execution_order
Example #2
 def execution_level(self) -> int:
     """All relations of the same level may be loaded in parallel."""
     if self._execution_level is None:
         raise ETLRuntimeError(
             "execution level unknown for RelationDescription '{0.identifier}'"
             .format(self))
     return self._execution_level
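Examples #1, #2, and #14 share the same guard pattern: a private attribute starts out as None and the accessor raises ETLRuntimeError instead of silently returning an unset value. A minimal, self-contained sketch of that pattern follows; the @property decorator and the stub exception class are assumptions for illustration, not code taken from the project.

class ETLRuntimeError(RuntimeError):
    """Stand-in for the project's exception class."""


class Relation:
    def __init__(self, identifier: str) -> None:
        self.identifier = identifier
        self._execution_order = None  # computed later, e.g. by a scheduler

    @property
    def execution_order(self) -> int:
        # Fail loudly if the value is read before it has been computed.
        if self._execution_order is None:
            raise ETLRuntimeError(
                "execution order unknown for relation '{0.identifier}'".format(self))
        return self._execution_order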
Example #3
 def get_table(self, create_if_not_exists=True):
     """Get table reference from DynamoDB or create it (within a new session)."""
     session = boto3.session.Session(region_name=self.region_name)
     dynamodb = session.resource("dynamodb")
     try:
         table = dynamodb.Table(self.table_name)
         status = table.table_status
         logger.info(
             "Found existing events table '%s' in DynamoDB (status: %s)",
             self.table_name, status)
     except botocore.exceptions.ClientError as exc:
         # Check whether this is just a ResourceNotFoundException (sadly a 400, not a 404)
         if exc.response["ResponseMetadata"]["HTTPStatusCode"] != 400:
             raise
         # Nullify assignment and start over
         table = None
         status = None
     if not (status == "ACTIVE" or create_if_not_exists):
         raise ETLRuntimeError(
             "DynamoDB table '%s' does not exist or is not active" %
             self.table_name)
     if table is None:
         logger.info("Creating DynamoDB table: '%s'", self.table_name)
         table = dynamodb.create_table(
             TableName=self.table_name,
             KeySchema=[
                 {
                     "AttributeName": "target",
                     "KeyType": "HASH"
                 },
                 {
                     "AttributeName": "timestamp",
                     "KeyType": "RANGE"
                 },
             ],
             AttributeDefinitions=[
                 {
                     "AttributeName": "target",
                     "AttributeType": "S"
                 },
                 {
                     "AttributeName": "timestamp",
                     "AttributeType": "N"
                 },
             ],
             ProvisionedThroughput={
                 "ReadCapacityUnits": self.initial_read_capacity,
                 "WriteCapacityUnits": self.initial_write_capacity,
             },
         )
         status = table.table_status
     if status != "ACTIVE":
         logger.info("Waiting for events table '%s' to become active",
                     self.table_name)
         table.wait_until_exists()
         logger.debug(
             "Finished creating or updating events table '%s' (arn=%s)",
             self.table_name, table.table_arn)
     return table
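The comment in example #3 points out that a missing DynamoDB table surfaces as an HTTP 400, not a 404. A common alternative, sketched below and not taken from the project, is to inspect the error code carried by the ClientError and look specifically for "ResourceNotFoundException"; the function name and parameters here are illustrative.

import boto3
import botocore.exceptions


def table_is_active(table_name: str, region_name: str) -> bool:
    """Return True if the table exists and is ACTIVE, False if it does not exist."""
    dynamodb = boto3.session.Session(region_name=region_name).resource("dynamodb")
    try:
        # Reading table_status triggers a DescribeTable call under the hood.
        return dynamodb.Table(table_name).table_status == "ACTIVE"
    except botocore.exceptions.ClientError as exc:
        if exc.response["Error"]["Code"] == "ResourceNotFoundException":
            return False
        raise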
Example #4
def alter_password(cx, user, ignore_missing_password=False):
    password = _get_encrypted_password(cx, user)
    if password is None:
        logger.warning("Failed to find password in PGPASSFILE for '%s'", user)
        if not ignore_missing_password:
            raise ETLRuntimeError(
                "password missing from PGPASSFILE for user '{}'".format(user))
        return
    execute(cx, """ALTER USER "{}" PASSWORD %s""".format(user), (password, ))
Example #5
def create_user(cx, user, group):
    password = _get_encrypted_password(cx, user)
    if password is None:
        logger.warning("Missing entry in PGPASSFILE file for '%s'", user)
        raise ETLRuntimeError(
            "password missing from PGPASSFILE for user '{}'".format(user))
    execute(
        cx,
        """CREATE USER "{}" IN GROUP "{}" PASSWORD %s""".format(user, group),
        (password, ))
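Examples #4 and #5 splice the user and group names into the statement with str.format because parameter binding in psycopg2 only covers values, not identifiers. A hedged sketch of the same statement built with psycopg2's sql module, assuming a plain psycopg2 connection instead of the project's execute helper:

from psycopg2 import sql


def create_user(cx, user: str, group: str, password: str) -> None:
    # Identifiers are quoted via sql.Identifier; the password stays a bound parameter.
    stmt = sql.SQL("CREATE USER {} IN GROUP {} PASSWORD %s").format(
        sql.Identifier(user), sql.Identifier(group))
    with cx.cursor() as cursor:
        cursor.execute(stmt, (password,))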
Example #6
def upload_files(files: Sequence[Tuple[str, str]],
                 bucket_name: str,
                 prefix: str,
                 dry_run=False) -> None:
    """
    Upload local files to S3 from "local_name" to "s3://bucket_name/prefix/remote_name".

    The sequence of files must consist of tuples of ("local_name", "remote_name").
    """
    max_workers = min(len(files), 10)
    timer = etl.timer.Timer()

    common_path = _keep_common_path([object_key for _, object_key in files])
    description = "Uploading files to S3" if not dry_run else "Dry-run: Uploading files to S3"
    tqdm_bar = tqdm(desc=description,
                    disable=None,
                    leave=False,
                    total=len(files),
                    unit="file")
    uploader = S3Uploader(bucket_name,
                          callback=tqdm_bar.update,
                          dry_run=dry_run)

    # We break out the futures to be able to easily tally up errors.
    futures: List[concurrent.futures.Future] = []
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers,
            thread_name_prefix="sync-parallel") as executor:
        for local_filename, remote_filename in files:
            futures.append(
                executor.submit(uploader.__call__, local_filename,
                                f"{prefix}/{remote_filename}"))

    errors = 0
    for future in concurrent.futures.as_completed(futures):
        exception = future.exception()
        if exception is not None:
            logger.error("Failed to upload file: %s", exception)
            errors += 1

    tqdm_bar.close()
    what_happened = "Uploaded" if not dry_run else "Dry-run: Skipped uploading"
    logger.info(
        f"{what_happened} %d of %d file(s) to 's3://%s/%s/%s' using %d threads (%s)",
        len(files) - errors,
        len(files),
        bucket_name,
        prefix,
        common_path,
        max_workers,
        timer,
    )
    if errors:
        raise ETLRuntimeError(f"There were {errors} error(s) during upload")
Example #7
def _get_encrypted_password(cx, user):
    dsn_complete = dict(kv.split('=') for kv in cx.dsn.split(" "))
    dsn_partial = {key: dsn_complete[key] for key in ["host", "port", "dbname"]}
    dsn_user = dict(dsn_partial, user=user)
    password = pgpasslib.getpass(**dsn_user)
    if password is None:
        logger.warning("Missing line in .pgpass file: '%(host)s:%(port)s:%(dbname)s:%(user)s:<password>'", dsn_user)
        raise ETLRuntimeError("password missing from PGPASSFILE for '{}'".format(user))
    md5 = hashlib.md5()
    md5.update((password + user).encode())
    return "md5" + md5.hexdigest()
Example #8
def copy_using_manifest(
    conn: connection,
    table_name: TableName,
    column_list: List[str],
    s3_uri: str,
    aws_iam_role: str,
    data_format: Optional[str] = None,
    format_option: Optional[str] = None,
    file_compression: Optional[str] = None,
    compupdate="ON",
    dry_run=False,
) -> None:
    """Copy data into the table from a manifest file in S3 using Redshift's COPY command."""
    credentials = "aws_iam_role={}".format(aws_iam_role)
    data_format_parameters = determine_data_format_parameters(
        data_format, format_option, file_compression)

    copy_stmt = """
        COPY {table} (
            {columns}
        )
        FROM %s
        CREDENTIALS %s MANIFEST
        {data_format_parameters}
        TIMEFORMAT AS 'auto'
        DATEFORMAT AS 'auto'
        TRUNCATECOLUMNS
        STATUPDATE OFF
        COMPUPDATE {compupdate}
        """.format(
        table=table_name,
        columns=join_with_double_quotes(column_list),
        data_format_parameters=data_format_parameters,
        compupdate=compupdate,
    )
    if dry_run:
        logger.info("Dry-run: Skipping copying data into '%s' using '%s'",
                    table_name.identifier, s3_uri)
        etl.db.skip_query(conn, copy_stmt, (s3_uri, credentials))
    else:
        logger.info("Copying data into '%s' using '%s'", table_name.identifier,
                    s3_uri)
        try:
            with log_load_error(conn):
                etl.db.execute(conn, copy_stmt, (s3_uri, credentials))
        except psycopg2.InternalError as exc:
            if exc.pgcode == "XX000":
                raise ETLRuntimeError(exc) from exc
            else:
                raise TransientETLError(exc) from exc
Example #9
def _get_encrypted_password(cx, user) -> Optional[str]:
    """Return MD5-hashed password if entry is found in PGPASSLIB or None otherwise."""
    dsn_complete = dict(kv.split("=") for kv in cx.dsn.split(" "))
    dsn_partial = {
        key: dsn_complete[key]
        for key in ["host", "port", "dbname"]
    }
    dsn_user = dict(dsn_partial, user=user)
    try:
        password = pgpasslib.getpass(**dsn_user)
    except pgpasslib.FileNotFound as exc:
        logger.info(
            "Create the file using 'touch ~/.pgpass && chmod go= ~/.pgpass'")
        raise ETLRuntimeError("PGPASSFILE file is missing") from exc
    except pgpasslib.InvalidPermissions as exc:
        logger.info("Update the permissions using: 'chmod go= ~/.pgpass'")
        raise ETLRuntimeError(
            "PGPASSFILE file has invalid permissions") from exc

    if password is None:
        return None
    md5 = hashlib.md5()
    md5.update((password + user).encode())
    return "md5" + md5.hexdigest()
Example #10
def load_config(config_files: Sequence[str],
                default_file: str = "default_settings.yaml") -> None:
    """
    Load settings and environment from config files (starting with the default, if provided)
    and set our global settings.

    The settings are validated against their schema.
    If the config "file" is actually a directory, (try to) read all the files in that directory.
    """
    settings = dict()  # type: Dict[str, Any]
    count_settings = 0
    for filename in yield_config_files(config_files, default_file):
        if filename.endswith(".sh"):
            load_environ_file(filename)
        elif filename.endswith((".yaml", ".yml")):
            load_settings_file(filename, settings)
            count_settings += 1
        else:
            logger.info("Skipping unknown config file '%s'", filename)

    # Need to load at least the defaults and some installation-specific file:
    if count_settings < 2:
        raise ETLRuntimeError(
            "Failed to find enough configuration files (need at least default and local config)"
        )

    validate_with_schema(settings, "settings.schema")

    # If 'today' and 'yesterday' are not set already, pick the actual values of "today" and "yesterday" (wrt UTC).
    today = datetime.datetime.utcnow().date()
    date_settings = settings.setdefault("date", {})
    date_settings.setdefault(
        "today",
        today.strftime("%Y/%m/%d"))  # Render date to look like part of a path
    date_settings.setdefault(
        "yesterday", (today - datetime.timedelta(days=1)).strftime("%Y/%m/%d"))

    global _mapped_config
    _mapped_config = _build_config_map(settings)

    global _dw_config
    _dw_config = etl.config.dw.DataWarehouseConfig(settings)

    set_config_value("version", package_version())
Example #11
def load_config(config_files: Iterable[str], default_file: str = "default_settings.yaml") -> None:
    """
    Load settings and environment from config files and set our global settings.

    The default, if provided, is always the first file to be loaded.
    If the config "file" is actually a directory, (try to) read all the files in that directory.

    The settings are validated against their schema.
    """
    settings: Dict[str, Any] = dict()
    count_settings = 0
    for filename in yield_config_files(config_files, default_file):
        if filename.endswith(".sh"):
            load_environ_file(filename)
        elif filename.endswith((".yaml", ".yml")):
            load_settings_file(filename, settings)
            count_settings += 1
        else:
            logger.debug("Skipping unknown config file '%s'", filename)

    # Need to load at least the defaults and some installation-specific file:
    if count_settings < 2:
        raise ETLRuntimeError("failed to find enough configuration files (need at least default and local config)")

    validate_with_schema(settings, "settings.schema")

    # Set values for 'date.today' and 'date.yesterday' (in case they aren't set already.)
    # The values are wrt current UTC and look like a path, e.g. '2017/05/16'.
    today = datetime.datetime.utcnow().date()
    date_settings = settings.setdefault("date", {})
    date_settings.setdefault("today", today.strftime("%Y/%m/%d"))
    date_settings.setdefault("yesterday", (today - datetime.timedelta(days=1)).strftime("%Y/%m/%d"))

    global _dw_config
    _dw_config = etl.config.dw.DataWarehouseConfig(settings)

    global _mapped_config
    _mapped_config = _build_config_map(settings)
    if _mapped_config is not None:
        _mapped_config["data_warehouse.owner.name"] = _dw_config.owner.name

    set_config_value("version", package_version())
Example #12
def sync_with_s3(relations: List[RelationDescription], bucket_name: str, prefix: str, dry_run: bool = False) -> None:
    """
    Copy (validated) table design and SQL files from local directory to S3 bucket.
    """
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    files = []  # type: List[Tuple[str, str]]
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name:
                relation_files.append(relation.sql_file_name)
            else:
                raise MissingQueryError("Missing matching SQL file for '%s'" % relation.design_file_name)
        for file_name in relation_files:
            local_filename = relation.norm_path(file_name)
            remote_filename = os.path.join(prefix, local_filename)
            files.append((local_filename, remote_filename))

    uploader = etl.s3.S3Uploader(bucket_name, dry_run=dry_run)
    with Timer() as timer:
        futures = []  # type: List[concurrent.futures.Future]
        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader.__call__, local_filename, remote_filename))
        errors = 0
        for future in concurrent.futures.as_completed(futures):
            exception = future.exception()
            if exception is not None:
                logger.error("Failed to upload file: %s", exception)
                errors += 1
    if not dry_run:
        logger.info(
            "Uploaded %d of %d file(s) to 's3://%s/%s (%s)", len(files) - errors, len(files), bucket_name, prefix, timer
        )
    if errors:
        raise ETLRuntimeError("There were {:d} error(s) during upload".format(errors))
Example #13
def initial_setup(config,
                  with_user_creation=False,
                  force=False,
                  dry_run=False):
    """
    Place named data warehouse database into initial state.

    This destroys the contents of the targeted database.
    You have to set `force` to true if the name of the database doesn't start with 'validation'.

    Optionally use `with_user_creation` flag to create users and groups.
    """
    try:
        database_name = config.dsn_etl['database']
    except (KeyError, ValueError) as exc:
        raise ETLConfigError(
            "could not identify database initialization target") from exc

    if database_name.startswith('validation'):
        logger.info("Initializing validation database '%s'", database_name)
    elif force:
        logger.info(
            "Initializing non-validation database '%s' forcefully as requested",
            database_name)
    else:
        raise ETLRuntimeError(
            "Refused to initialize non-validation database '%s' without the --force option"
            % database_name)
    # Create all defined users, including the ETL user needed in the next step (so the database is owned by ETL).
    if with_user_creation:
        with closing(
                etl.db.connection(config.dsn_admin,
                                  autocommit=True,
                                  readonly=dry_run)) as conn:
            for user in config.users:
                _create_or_update_cluster_user(conn, user, dry_run=dry_run)

    if dry_run:
        logger.info(
            "Dry-run: Skipping drop and create of database '%s' with owner '%s'",
            database_name, config.owner)
    else:
        admin_dev_conn = etl.db.connection(config.dsn_admin, autocommit=True)
        with closing(admin_dev_conn):
            logger.info("Dropping and creating database '%s' with owner '%s'",
                        database_name, config.owner)
            etl.db.drop_and_create_database(admin_dev_conn, database_name,
                                            config.owner)

    admin_target_db_conn = etl.db.connection(config.dsn_admin_on_etl_db,
                                             autocommit=True,
                                             readonly=dry_run)
    with closing(admin_target_db_conn):
        if dry_run:
            logger.info("Dry-run: Skipping drop of PUBLIC schema in '%s'",
                        database_name)
        else:
            logger.info("Dropping PUBLIC schema in '%s'", database_name)
            etl.db.drop_schema(admin_target_db_conn, "PUBLIC")
        if with_user_creation:
            for user in config.users:
                if user.schema:
                    _create_schema_for_user(admin_target_db_conn,
                                            user,
                                            config.groups[0],
                                            dry_run=dry_run)
                _update_search_path(admin_target_db_conn,
                                    user,
                                    dry_run=dry_run)
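Example #13 wraps each connection in closing() so that it is released even when one of the steps raises. A minimal sketch of that idiom with a bare psycopg2 connection; the connection parameters are placeholders, and the project's etl.db.connection helper presumably layers options such as autocommit and readonly on top of this:

from contextlib import closing

import psycopg2

with closing(psycopg2.connect(dbname="validation_db", host="localhost")) as conn:
    conn.autocommit = True
    with conn.cursor() as cursor:
        cursor.execute("SELECT 1")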
Example #14
 def is_required(self) -> bool:
     if self._is_required is None:
         raise ETLRuntimeError(
             "state of 'is_required' unknown for RelationDescription '{0.identifier}'"
             .format(self))
     return self._is_required