def test_from_conn_not_registered():
    """
    A connection whose class is not registered with the factory should be
    rejected with a helpful error message.
    """
    fake_conn = Mock()
    fake_conn.__class__ = "Not a real class"
    expected_message = r'Unsupported connection type.*'

    with pytest.raises(ETLHelperHelperError, match=expected_message):
        DB_HELPER_FACTORY.from_conn(fake_conn)
def test_from_db_params_not_registered():
    """
    A db_params object whose dbtype is not registered with the factory
    should be rejected with a helpful error message.
    """
    fake_params = MagicMock(DbParams)
    fake_params.dbtype = 'Not a real type'
    expected_message = r'Unsupported DbParams.dbtype.*'

    with pytest.raises(ETLHelperHelperError, match=expected_message):
        DB_HELPER_FACTORY.from_db_params(fake_params)
示例#3
0
def test_sqlalchemy_conn_string(monkeypatch, db_params, expected):
    """
    Check the SQLAlchemy connection string that the helper for db_params
    builds when the password is read from the DB_PASSWORD variable.
    """
    password_variable = 'DB_PASSWORD'
    monkeypatch.setenv(password_variable, 'mypassword')

    helper = DB_HELPER_FACTORY.from_db_params(db_params)
    actual = helper.get_sqlalchemy_connection_string(db_params,
                                                     password_variable)

    assert actual == expected
    def validate_params(self):
        """
        Check that the stored parameters are valid for this object's dbtype.

        The dbtype must be known to DB_HELPER_FACTORY, every parameter
        required by the helper for that dbtype must be present, and no
        parameters other than the required ones and 'dbtype' may be present.

        :raises ETLHelperDbParamsError: Error if params are invalid
        """
        # Names of the parameters that have actually been supplied
        given = set(self.keys())

        try:
            helper = DB_HELPER_FACTORY.from_dbtype(self.dbtype)
            required_params = helper.required_params
        except ETLHelperHelperError:
            msg = f'{self.dbtype} not in valid types ({DB_HELPER_FACTORY.helpers.keys()})'
            # "from None" hides the factory's lower-level error from the
            # printed traceback; it is still available via __context__.
            raise ETLHelperDbParamsError(msg) from None

        # Required names found on only one side, narrowed to the required
        # ones, i.e. required parameters that were not given.
        differing = given ^ required_params
        missing = differing & required_params
        if missing:
            msg = f'{missing} not set. Required parameters are {required_params}'
            raise ETLHelperDbParamsError(msg)

        # Anything that does not line up with required params + 'dbtype'
        allowed = required_params.union({'dbtype'})
        unexpected = given ^ allowed
        if unexpected:
            msg = f"Invalid parameter(s): {unexpected}"
            raise ETLHelperDbParamsError(msg)
示例#5
0
def execute(query, conn, parameters=()):
    """
    Run SQL query against connection and commit the result.

    If the query (or the commit) raises one of the helper's SQL exceptions,
    the transaction is rolled back and ETLHelperQueryError is raised with the
    driver error chained as its cause.

    :param query: str, SQL query to execute
    :param conn: dbapi connection
    :param parameters: sequence or dict of bind variables to insert in the query
    :raises ETLHelperQueryError: if the database driver rejects the query
    """
    logger.info("Executing query")
    # Lazy %-style arguments: the query, parameters and connection are only
    # formatted when debug logging is actually enabled.
    logger.debug("Executing:\n\n%s\n\nwith parameters:\n\n%s\n\nagainst\n\n%s",
                 query, parameters, conn)

    helper = DB_HELPER_FACTORY.from_conn(conn)
    with helper.cursor(conn) as cursor:
        # Run query
        try:
            cursor.execute(query, parameters)
            conn.commit()
        except helper.sql_exceptions as exc:
            # Even though we haven't modified data, we have to rollback to
            # clear the failed transaction before any others can be started.
            conn.rollback()
            msg = (f"SQL query raised an error.\n\n{query}\n\n"
                   f"Required paramstyle: {helper.paramstyle}\n\n{exc}\n")
            # Chain explicitly so the driver error is recorded as the cause
            raise ETLHelperQueryError(msg) from exc
    def from_environment(cls, prefix='ETLHelper_'):
        """
        Create DbParams object from parameters specified by environment
        variables e.g. ETLHelper_dbtype, ETLHelper_host, ETLHelper_port, etc.

        Variable names have the prefix removed and are lower-cased to give the
        parameter names.  Only the parameters required by the helper for the
        given dbtype (plus dbtype itself) are passed on.

        :param prefix: str, prefix to environment variable names
        :raises ETLHelperDbParamsError: if no dbtype variable is set
        :raises KeyError: if a required parameter has no matching variable
        """
        # Strip only the leading prefix.  str.replace would also remove any
        # later occurrence of the prefix text inside the variable name.
        prefix_length = len(prefix)
        dbparams_from_env = {
            key[prefix_length:].lower(): value
            for key, value in os.environ.items()
            if key.startswith(prefix)
        }

        # Ensure dbtype has been set
        dbtype = dbparams_from_env.get('dbtype')
        if dbtype is None:
            dbtype_var = f'{prefix}dbtype'
            msg = f"{dbtype_var} environment variable is not set"
            raise ETLHelperDbParamsError(msg)

        # Only include the required params
        # This prevents something like ETLHelper_password being added
        required_params = DB_HELPER_FACTORY.from_dbtype(
            dbtype).required_params | {'dbtype'}
        dbparams_from_env = {
            key: dbparams_from_env[key]
            for key in required_params
        }

        return cls(**dbparams_from_env)
def test_from_dbparams(dbtype_keyword, expected_helper):
    """
    The factory should return the expected helper class for a db params
    object with the given dbtype.
    """
    fake_params = MagicMock(DbParams)
    fake_params.dbtype = dbtype_keyword

    result = DB_HELPER_FACTORY.from_db_params(fake_params)

    assert isinstance(result, expected_helper)
示例#8
0
def executemany(query, rows, conn, commit_chunks=True):
    """
    Use query to insert/update data from rows to database at conn.  This
    method uses the executemany or execute_batch (PostgreSQL) commands to
    process the data in chunks and avoid creating a new database connection for
    each row.  Row data are passed as parameters into query.

    commit_chunks controls if the transaction should be committed after
    each chunk has been inserted.  Committing chunks means that errors during
    a long-running insert do not require all data to be loaded again.  The
    disadvantage is that investigation may be required to determine exactly
    which records have been successfully transferred.

    :param query: str, SQL insert command with placeholders for data
    :param rows: List of tuples containing data to be inserted/updated
    :param conn: dbapi connection
    :param commit_chunks: bool, commit after each chunk has been inserted/updated
    :return row_count: int, number of rows inserted/updated
    :raises ETLHelperInsertError: if the database driver rejects a chunk
    """
    logger.info(f"Executing many (chunksize={CHUNKSIZE})")
    logger.debug(f"Executing:\n\n{query}\n\nagainst\n\n{conn}")

    helper = DB_HELPER_FACTORY.from_conn(conn)
    processed = 0

    with helper.cursor(conn) as cursor:
        for chunk in _chunker(rows, CHUNKSIZE):
            # Run query
            try:
                # Chunker pads to whole chunk with None; remove these
                chunk = [row for row in chunk if row is not None]

                # Show first row as example of data
                if processed == 0:
                    logger.debug(f"First row: {chunk[0]}")

                # Execute query
                helper.executemany(cursor, query, chunk)
                processed += len(chunk)

            except helper.sql_exceptions as exc:
                # Rollback to clear the failed transaction before any others
                # can be started.
                conn.rollback()
                msg = f"SQL query raised an error.\n\n{query}\n\n{exc}\n"
                # Chain explicitly so the driver error is recorded as the cause
                raise ETLHelperInsertError(msg) from exc

            logger.info(f'{processed} rows processed')

            # Commit changes so far
            if commit_chunks:
                conn.commit()

    # Commit changes where not already committed
    if not commit_chunks:
        conn.commit()

    logger.info(f'{processed} rows processed in total')

    # Return the count that the docstring has always promised
    return processed
def test_from_conn(expected_helper, db_class):
    """
    The factory should return the expected helper class for a connection
    object of the given driver class.
    """
    fake_conn = Mock()
    # Make the mock report db_class as its class
    fake_conn.__class__ = db_class

    result = DB_HELPER_FACTORY.from_conn(fake_conn)

    assert isinstance(result, expected_helper)
示例#10
0
def get_connection_string(db_params, password_variable):
    """
    Build a connection string using the helper that matches db_params.

    :param db_params: DbParams object or similar with appropriate attributes
    :param password_variable: str, name of environment variable with password
    :return: str, Connection string
    """
    # Delegate to the helper registered for db_params
    return DB_HELPER_FACTORY.from_db_params(db_params).get_connection_string(
        db_params, password_variable)
示例#11
0
    def __setattr__(self, item, value):
        """
        Store value under item, accepting only names valid for this dbtype.

        Valid names are the required_params of the helper for self.dbtype,
        plus 'dbtype' itself (which is used to determine required_params).

        :raises AttributeError: if item is not a valid name
        """
        helper = DB_HELPER_FACTORY.from_dbtype(self.dbtype)
        valid_params = helper.required_params.union({'dbtype'})

        if item in valid_params:
            self[item] = value
            return

        msg = f"'{item}' is not a valid DbParams attribute: {valid_params}"
        raise AttributeError(msg)
示例#12
0
def test_connect(monkeypatch, db_params, driver, expected):
    """
    The helper's connect() should call the driver's connect function with
    the expected argument.
    """
    # Arrange: provide the password and replace the driver's connect function
    password_variable = 'DB_PASSWORD'
    monkeypatch.setenv(password_variable, 'mypassword')
    fake_connect = Mock()
    monkeypatch.setattr(driver, 'connect', fake_connect)
    helper = DB_HELPER_FACTORY.from_db_params(db_params)

    # Act
    helper.connect(db_params, password_variable)

    # Assert
    fake_connect.assert_called_with(expected)
示例#13
0
def connect(db_params, password_variable=None, **kwargs):
    """
    Open a database connection using the helper that matches db_params.

    :param db_params: DbParams object or similar with appropriate attributes
    :param password_variable: str, name of environment variable with password
    :param kwargs: connection specific keyword arguments e.g. row_factory
    :return: Connection object
    """
    # Helpers will raise ETLHelperConnectionError if connection fails
    return DB_HELPER_FACTORY.from_db_params(db_params).connect(
        db_params, password_variable, **kwargs)
示例#14
0
def generate_insert_sql(table, row, conn):
    """Build an INSERT statement for table from an example row.

    Column names are taken from `row`, which is either a dictionary or a
    namedtuple.  The placeholder style comes from the helper for `conn`:
    dictionaries use the helper's named paramstyle, namedtuples use its
    positional paramstyle.

    :raises ETLHelperInsertError: if row is neither a dictionary nor a
                                  namedtuple, or if a dictionary is given but
                                  the helper has no named paramstyle
    """
    helper = DB_HELPER_FACTORY.from_conn(conn)
    templates = {
        "qmark": "?",
        "numeric": ":{number}",
        "named": ":{name}",
        "format": "%s",
        "pyformat": "%({name})s"
    }

    if hasattr(row, 'keys'):
        # Dictionaries use a query with named placeholders
        style = helper.named_paramstyle
        if not style:
            msg = (
                f"Database connection ({str(conn.__class__)}) doesn't support named parameters.  "
                "Pass data as namedtuples instead.")
            raise ETLHelperInsertError(msg)

        columns = row.keys()
        placeholders = [
            templates[style].format(name=column) for column in columns
        ]
    else:
        # Namedtuples use a query with positional placeholders
        style = helper.positional_paramstyle

        # Convert namedtuple to dictionary to easily access keys
        try:
            row = row._asdict()
        except AttributeError:
            msg = f"Row is not dictionary or namedtuple ({type(row)})"
            raise ETLHelperInsertError(msg)

        columns = row.keys()
        if style == "numeric":
            # Numeric placeholders are numbered from 1
            placeholders = [
                templates[style].format(number=position)
                for position, _ in enumerate(columns, start=1)
            ]
        else:
            placeholders = [templates[style]] * len(columns)

    column_list = ', '.join(columns)
    placeholder_list = ', '.join(placeholders)
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholder_list})"
示例#15
0
    def validate_params(self):
        """
        Check that the stored parameters are valid for this object's dbtype.

        The dbtype must be known to DB_HELPER_FACTORY and every parameter
        required by the helper for that dbtype must be present.

        :raises ETLHelperDbParamsError: Error if params are invalid
        """
        # Names of the parameters that have actually been supplied
        given = set(self.keys())

        try:
            helper = DB_HELPER_FACTORY.from_dbtype(self.dbtype)
            required_params = helper.required_params
        except ETLHelperHelperError:
            msg = f'{self.dbtype} not in valid types ({DB_HELPER_FACTORY.helpers.keys()})'
            raise ETLHelperDbParamsError(msg)

        # Names found on only one side, narrowed to the required ones,
        # i.e. required parameters that were not given.
        differing = given ^ required_params
        missing = differing & required_params
        if missing:
            msg = f'{missing} not set. Required parameters are {required_params}'
            raise ETLHelperDbParamsError(msg)
示例#16
0
def executemany(query,
                conn,
                rows,
                on_error=None,
                commit_chunks=True,
                chunk_size=CHUNKSIZE):
    """
    Use query to insert/update data from rows to database at conn.  This
    method uses the executemany or execute_batch (PostgreSQL) commands to
    process the data in chunks and avoid creating a new database connection for
    each row.  Row data are passed as parameters into query.

    Default behaviour is to raise an exception in the case of SQL errors such
    as primary key violations.  If the on_error parameter is specified, the
    exception will be caught then the rows of each chunk re-tried individually.
    Further errors will be caught and appended to a list of (row, exception)
    tuples.  on_error is a function that is called at the end of each chunk
    that raised an error, with the list as the only argument.

    commit_chunks controls if the transaction should be committed after
    each chunk has been inserted.  Committing chunks means that errors during
    a long-running insert do not require all data to be loaded again.  The
    disadvantage is that investigation may be required to determine exactly
    which records have been successfully transferred.

    :param query: str, SQL insert command with placeholders for data
    :param conn: dbapi connection
    :param rows: List of tuples containing data to be inserted/updated
    :param on_error: Function to be applied to failed rows in each chunk
    :param commit_chunks: bool, commit after each chunk has been inserted/updated
    :param chunk_size: int, size of chunks to group data by
    :return row_count: int, number of rows inserted/updated (rows processed
                       minus rows reported as failed)
    :raises ETLHelperInsertError: if a chunk fails and on_error is not given
    """
    logger.info("Executing many (chunk_size=%s)", chunk_size)
    logger.debug("Executing:\n\n%s\n\nagainst\n\n%s", query, conn)

    helper = DB_HELPER_FACTORY.from_conn(conn)
    processed = 0
    failed = 0

    with helper.cursor(conn) as cursor:
        for chunk in _chunker(rows, chunk_size):
            # Run query
            try:
                # Chunker pads to whole chunk with None; remove these
                chunk = [row for row in chunk if row is not None]

                # Show first row as example of data
                if processed == 0:
                    logger.debug(f"First row: {chunk[0]}")

                # Execute query
                helper.executemany(cursor, query, chunk)

            except helper.sql_exceptions as exc:
                # Rollback to clear the failed transaction before any others can
                # be started.
                conn.rollback()

                # Collect and process failed rows if on_error function provided
                if on_error:
                    # Temporarily raise the logger's level to ERROR while the
                    # chunk is re-tried row by row
                    old_level = logger.level
                    logger.setLevel(logging.ERROR)

                    try:
                        failed_rows = _execute_by_row(query, conn, chunk)
                    finally:
                        # Restore logging
                        logger.setLevel(old_level)

                    failed += len(failed_rows)
                    # Report this chunk's failures, not the running total
                    logger.debug("Calling on_error function on %s failed rows",
                                 len(failed_rows))
                    on_error(failed_rows)
                else:
                    msg = (
                        f"SQL query raised an error.\n\n{query}\n\n"
                        f"Required paramstyle: {helper.paramstyle}\n\n{exc}\n")
                    # Chain explicitly so the driver error is the cause
                    raise ETLHelperInsertError(msg) from exc

            processed += len(chunk)
            logger.info('%s rows processed (%s failed)', processed, failed)

            # Commit changes so far
            if commit_chunks:
                conn.commit()

    # Commit changes where not already committed
    if not commit_chunks:
        conn.commit()

    logger.info(f'{processed} rows processed in total')

    # Return the count that the docstring has always promised
    return processed - failed
示例#17
0
def iter_chunks(select_query,
                conn,
                parameters=(),
                row_factory=namedtuple_row_factory,
                transform=None,
                read_lob=False,
                chunk_size=CHUNKSIZE):
    """
    Run SQL query against connection and yield the results in batches of up
    to chunk_size rows (fetched with cursor.fetchmany).

    row_factory is called once with the cursor and returns the function used
    to convert each fetched row, so it controls the output format of rows.

    transform, when given, is applied to each batch of converted rows before
    the batch is yielded.

    read_lob passes each fetched batch through _read_lob first; it is
    intended for converting Oracle LOB objects to strings.

    When the results are exhausted a summary is logged and the connection is
    committed to close the active transaction.

    :param select_query: str, SQL query to execute
    :param conn: dbapi connection
    :param parameters: sequence or dict of bind variables to insert in the query
    :param row_factory: function that accepts a cursor and returns a function
                        for parsing each row
    :param transform: function that accepts an iterable (e.g. list) of rows and
                      returns an iterable of rows (possibly of different shape)
    :param read_lob: bool, convert Oracle LOB objects to strings
    :param chunk_size: int, size of chunks to group data by
    :raises ETLHelperExtractError: if the database driver rejects the query
    """
    logger.info("Fetching rows (chunk_size=%s)", chunk_size)
    logger.debug(f"Fetching:\n\n{select_query}\n\nwith parameters:\n\n"
                 f"{parameters}\n\nagainst\n\n{conn}")

    helper = DB_HELPER_FACTORY.from_conn(conn)
    with helper.cursor(conn) as cursor:
        try:
            cursor.execute(select_query, parameters)
        except helper.sql_exceptions as exc:
            # A failed statement leaves the transaction unusable, so it must
            # be rolled back even though no data were modified.
            conn.rollback()
            msg = (f"SQL query raised an error.\n\n{select_query}\n\n"
                   f"Required paramstyle: {helper.paramstyle}\n\n{exc}\n")
            raise ETLHelperExtractError(msg)

        # Function used to convert each fetched row
        create_row = row_factory(cursor)

        any_yielded = False
        while True:
            fetched = cursor.fetchmany(chunk_size)

            if fetched:
                # Convert Oracle LOBs to strings if required
                if read_lob:
                    fetched = _read_lob(fetched)

                # Convert rows lazily, then apply the optional transform
                batch = (create_row(raw) for raw in fetched)
                if transform:
                    batch = transform(batch)

                yield batch
                any_yielded = True
                continue

            # Results exhausted: choose the summary message
            if not any_yielded:
                msg = "No rows returned"
            elif cursor.rowcount == -1:
                # SQLite3 driver doesn't support row count (always -1)
                msg = "All rows returned"
            else:
                msg = f"{cursor.rowcount} rows returned"
            logger.info(msg)

            # Close the active transaction
            conn.commit()
            return
示例#18
0
def iter_chunks(select_query,
                conn,
                parameters=(),
                row_factory=namedtuple_rowfactory,
                transform=None,
                read_lob=False):
    """
    Run SQL query against connection and yield the results in batches of up
    to CHUNKSIZE rows (fetched with cursor.fetchmany).

    row_factory is called once with the cursor and returns the function used
    to convert each fetched row, so it controls the output format of rows.

    transform, when given, is applied to each batch of converted rows before
    the batch is yielded.

    read_lob passes each fetched batch through _read_lob first; it is
    intended for converting Oracle LOB objects to strings.

    :param select_query: str, SQL query to execute
    :param conn: dbapi connection
    :param parameters: sequence or dict of bind variables to insert in the query
    :param row_factory: function that accepts a cursor and returns a function
                        for parsing each row
    :param transform: function that accepts an iterable (e.g. list) of rows and
                      returns an iterable of rows (possibly of different shape)
    :param read_lob: bool, convert Oracle LOB objects to strings
    :raises ETLHelperExtractError: if the database driver rejects the query
    """
    helper = DB_HELPER_FACTORY.from_conn(conn)
    with helper.cursor(conn) as cursor:
        try:
            cursor.execute(select_query, parameters)
        except helper.sql_exceptions as exc:
            # A failed statement leaves the transaction unusable, so it must
            # be rolled back even though no data were modified.
            conn.rollback()
            msg = f"SQL query raised an error.\n\n{select_query}\n\n{exc}\n"
            raise ETLHelperExtractError(msg)

        # Function used to convert each fetched row
        create_row = row_factory(cursor)

        while True:
            fetched = cursor.fetchmany(CHUNKSIZE)

            # Decide whether the results are finished.  The rowcount check
            # comes first, as cursor.rowcount is the number of records
            # transferred from the server.
            finished_message = None
            if cursor.rowcount == 0:
                finished_message = "iter_chunks: No records returned"
            elif not fetched:
                finished_message = (
                    f"iter_chunks: {cursor.rowcount} records returned")

            if finished_message is not None:
                logging.debug(finished_message)
                return

            # Convert Oracle LOBs to strings if required
            if read_lob:
                fetched = _read_lob(fetched)

            # Convert rows lazily, then apply the optional transform
            batch = (create_row(raw) for raw in fetched)
            if transform:
                batch = transform(batch)

            yield batch
def test_from_db_params_bad_type():
    """
    Passing a plain string instead of a DbParams-like object should raise
    a helpful error.
    """
    expected_message = r'Expected DbParams-like object.*'
    with pytest.raises(ETLHelperHelperError, match=expected_message):
        DB_HELPER_FACTORY.from_db_params('some string')
def test_from_conn_bad_type():
    """
    Passing a plain string instead of a connection-like object should raise
    a helpful error.
    """
    expected_message = r'Expected connection-like object.*'
    with pytest.raises(ETLHelperHelperError, match=expected_message):
        DB_HELPER_FACTORY.from_conn('some string')
 def paramstyle(self):
     """Return the paramstyle of the helper registered for this dbtype"""
     helper = DB_HELPER_FACTORY.from_dbtype(self.dbtype)
     return helper.paramstyle