Example #1
def createTable(df: pd.DataFrame,
                engine: sqlalchemy.engine.base.Engine,
                tablename: str,
                dbname: str = None,
                ifExists: str = "error",
                indexList: list = None,
                isIndexUnique: bool = True,
                dtype: dict = {}) -> None:
    """
    ===example===
    createTable(df_iris, engine, "iris", "db_data",
                ifExists="replace")
    >>> None
    """
    # make query string
    if dbname is None:
        dbname = engine.url.database
    new_names = renameColumns(df.columns.to_series())
    col_dtype = [
        dtypeParser(col) if name not in dtype else dtype[name]
        for name, col in df.items()
    ]

    column_query_part = "\n    ,".join(
        [f"{name} {dtype}" for name, dtype in zip(new_names, col_dtype)])
    if indexList:
        idxSeries = pd.Series(indexList).astype(str)
        idx_str = ",".join(renameColumns(idxSeries))
        q_unique = "unique " if isIndexUnique else ""
        index_query_part = q_unique + f"primary index ({idx_str})"
    else:
        index_query_part = "no primary index"
    query = """
    create table {dbname}.{tablename} (
        {column_query_part}
    ) {index_query_part}
    """.format(tablename=tablename,
               dbname=dbname,
               column_query_part=column_query_part,
               index_query_part=index_query_part)

    # execute query
    if ifExists == "replace":
        dropIfExists(tablename, dbname, engine)
    elif ifExists == "error":
        if tableIsExist(tablename, dbname, engine):
            raise ValueError(f"{dbname}.{tablename} already exists.")
    elif ifExists == "insert":
        if tableIsExist(tablename, dbname, engine):
            return

    engine.execute(query)
    print("created table \n" + query)
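
For reference, a call sketch with an explicit primary index (the engine URL, DataFrame, and column names are illustrative; the helper functions renameColumns, dtypeParser, tableIsExist, and dropIfExists used above are assumed to be importable):

# Hedged usage sketch, not part of the original example.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("teradata://user:password@hostip:1025")
df_sales = pd.DataFrame({
    "store_id": [1, 2],
    "sales_date": ["2020-01-01", "2020-01-02"],
    "amount": [10.5, 20.0],
})
# Creates db_data.sales with a unique primary index on (store_id, sales_date)
createTable(df_sales, engine, "sales", "db_data",
            ifExists="replace",
            indexList=["store_id", "sales_date"],
            isIndexUnique=True)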
Example #2
def dropIfExists(tablename: str, dbname: str,
                 engine: sqlalchemy.engine.base.Engine) -> None:
    """
    ===example===
    from sqlalchemy import create_engine
    engine = create_engine("teradata://*****:*****@hostip:1025")
    dropIfExists("iris", "db_data", engine)
    dropped table db_data.iris
    >>> None
    """
    if tableIsExist(tablename, dbname, engine):
        engine.execute(f"drop table {dbname}.{tablename}")
        print(f"dropped table {dbname}.{tablename}")
Example #3
def query_and_decrypt_data(
    db: sqlalchemy.engine.base.Engine,
    env_aead: tink.aead.KmsEnvelopeAead,
    table_name: str,
) -> None:
    with db.connect() as conn:
        # Execute the query and fetch all results
        recent_votes = conn.execute(
            f"SELECT team, time_cast, voter_email FROM {table_name} "
            "ORDER BY time_cast DESC LIMIT 5").fetchall()

        print("Team\tEmail\tTime Cast")

        for row in recent_votes:
            team = row[0]

            # Postgres pads CHAR fields with spaces. These will need to be removed before
            # decrypting.
            aad = team.rstrip()

            # Use the envelope AEAD primitive to decrypt the email, using the team name as
            # associated data. Encryption with associated data ensures authenticity
            # (who the sender is) and integrity (the data has not been tampered with) of that
            # data, but not its secrecy. (see RFC 5116 for more info)
            email = env_aead.decrypt(row[2], aad.encode()).decode()
            time_cast = row[1]

            # Print recent votes
            print(f"{team}\t{email}\t{time_cast}")
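
A sketch of how the db and env_aead arguments above might be constructed (a hedged example: the tink GCP KMS calls and the connection URL are assumptions based on tink-python's documented API, not part of the original code):

from tink import aead
from tink.integration import gcpkms
import sqlalchemy

key_uri = "gcp-kms://projects/my-project/locations/global/keyRings/my-ring/cryptoKeys/my-key"

aead.register()
# Remote AEAD backed by Cloud KMS, wrapped in an envelope AEAD that generates
# a fresh data-encryption key per plaintext (API names assumed from tink-python).
remote_aead = gcpkms.GcpKmsClient(key_uri, "").get_aead(key_uri)
env_aead = aead.KmsEnvelopeAead(aead.aead_key_templates.AES256_GCM, remote_aead)

db = sqlalchemy.create_engine("postgresql+pg8000://user:password@127.0.0.1:5432/votes")
query_and_decrypt_data(db, env_aead, "votes")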
Example #4
def usage_file_is_processed(file_name: str,
                            db_engine: sqlalchemy.engine.base.Engine) -> bool:
    """
    Verifies if the current file has been previously processed

    Parameters
    ----------
    file_name: str
        Name of the file
    db_engine: sqlalchemy.engine.base.Engine

    Returns
    -------
    bool
        True if the file has been processed, False if it has not.
    """

    _create_usage_table_if_it_does_not_exist(db_engine)

    query = f"select exists(select 1 from {USAGE_TABLE_NAME} where FILE_NAME='{file_name}') as 'exists'"

    db_item = {"exists": False}
    with db_engine.connect() as con:
        result: Union[ResultProxy, None] = con.execute(query)
        db_item = _map_single_result_to_dict(result)

    return db_item["exists"] == 1
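
The query above interpolates file_name directly into the SQL string; a sketch of the same lookup with a bound parameter instead (the function name is illustrative; USAGE_TABLE_NAME and _create_usage_table_if_it_does_not_exist are the same objects used above):

import sqlalchemy

def usage_file_is_processed_bound(file_name: str,
                                  db_engine: sqlalchemy.engine.base.Engine) -> bool:
    _create_usage_table_if_it_does_not_exist(db_engine)
    query = sqlalchemy.text(
        f"SELECT EXISTS(SELECT 1 FROM {USAGE_TABLE_NAME} "
        "WHERE FILE_NAME = :file_name) AS has_row")
    with db_engine.connect() as con:
        # scalar() returns the single EXISTS value (0/1 on SQLite)
        return bool(con.execute(query, {"file_name": file_name}).scalar())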
Example #5
def encrypt_and_insert_data(
    db: sqlalchemy.engine.base.Engine,
    env_aead: tink.aead.KmsEnvelopeAead,
    table_name: str,
    team: str,
    email: str,
) -> None:
    time_cast = datetime.datetime.now(tz=datetime.timezone.utc)
    # Use the envelope AEAD primitive to encrypt the email, using the team name as
    # associated data. Encryption with associated data ensures authenticity
    # (who the sender is) and integrity (the data has not been tampered with) of that
    # data, but not its secrecy. (see RFC 5116 for more info)
    encrypted_email = env_aead.encrypt(email.encode(), team.encode())
    # Verify that the team is one of the allowed options
    if team != "TABS" and team != "SPACES":
        logger.error(f"Invalid team specified: {team}")
        return

    # Preparing a statement beforehand can help protect against SQL injection.
    stmt = sqlalchemy.text(
        f"INSERT INTO {table_name} (time_cast, team, voter_email)"
        " VALUES (:time_cast, :team, :voter_email)"
    )

    # Using a with statement ensures that the connection is always released
    # back into the pool at the end of statement (even if an error occurs)
    with db.connect() as conn:
        conn.execute(
            stmt,
            time_cast=time_cast,
            team=team,
            voter_email=encrypted_email)
    print(f"Vote successfully cast for '{team}' at time {time_cast}!")
Example #6
def getUserByEmail(email: str, engine: sa.engine.base.Engine) -> User:
    ## should check username validity
    user = list(engine.connect().execute(
        f"select * from {User.__tablename__} where email = '{email}'"))
    if (len(user) == 0): return None
    else: user = user[0]
    user = User.from_row_to_obj(user)
    return user
Example #7
def _table_to_csv(engine: sqlalchemy.engine.base.Engine, table_name: str, file_path: str):
    with open(file_path, 'w') as fh:
        outcsv = csv.writer(fh)

        with engine.connect() as con:
            records = con.execute(f"SELECT * FROM {table_name}")
            outcsv.writerow(records.keys())
            outcsv.writerows(records)
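
A minimal usage sketch (database URL, table name, and output path are illustrative):

import sqlalchemy

engine = sqlalchemy.create_engine("sqlite:///app.db")
_table_to_csv(engine, "users", "users.csv")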
Example #8
    def getUSerById(id: int, engine: sa.engine.base.Engine) -> User:

        user = list(engine.connect().execute(
            f"select * from {User.__tablename__} where id = '{id}'"))
        if (len(user) == 0): return None
        else: user = user[0]
        user = User.from_row_to_obj(user)
        return user
Example #9
def insert_usage_file_name(file_name: str,
                           db_engine: sqlalchemy.engine.base.Engine) -> None:
    """
    Inserts the name of the file in the table for processed files

    Parameters
    ----------
    file_name: str
        Name of the file
    db_engine: sqlalchemy.engine.base.Engine

    Returns
    -------
    None
    """

    _create_usage_table_if_it_does_not_exist(db_engine)
    db_engine.execute(f"INSERT INTO {USAGE_TABLE_NAME} VALUES('{file_name}')")
Example #10
def tearDown(file: str, started_at: str, engine: sqlalchemy.engine.base.Engine,
             session: sqlalchemy.orm.session.Session):
    """
    close session & engine, banner

    :param file: caller, usually __file__
    :param started_at: e.g., str(datetime.now())
    :param engine: e.g., from nw.logic import session, engine
    :param session: e.g., from nw.logic import session, engine
    :return:
    """
    session.close()
    engine.dispose()
    print("\n")
    print("**********************")
    print("** Test complete, SQLAlchemy session/engine closed for: " + file)
    print("** Started: " + started_at + " Ended: " + str(datetime.now()))
    print("**********************")
Example #11
def last_sync_date(
        sync_db: sqlalchemy.engine.base.Engine) -> Optional[datetime]:
    with sync_db.connect() as con:
        try:
            usage_df = read_sql("SELECT asOfDate FROM Usage", con)
            if usage_df["asOfDate"].count() == 0:
                return None
            return date_parse(usage_df["asOfDate"].max())  # type: ignore
        except OperationalError:
            logger.debug("No Usage table yet")
            return None
Example #12
def create_tables(config: list, engine: sqlalchemy.engine.base.Engine):
    con = engine.connect()

    for table in config:
        name = table.get('name')
        schema = table.get('schema')
        ddl = f"""DROP TABLE IF EXISTS {name}"""
        con.execute(ddl)

        ddl = f"""CREATE TABLE {name} ({schema})"""
        con.execute(ddl)
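
The expected shape of config is only implied by the .get calls above; a sketch with illustrative table names and schemas:

import sqlalchemy

engine = sqlalchemy.create_engine("sqlite:///example.db")
config = [
    {"name": "users", "schema": "id INTEGER PRIMARY KEY, email TEXT"},
    {"name": "events", "schema": "id INTEGER PRIMARY KEY, user_id INTEGER, kind TEXT"},
]
create_tables(config, engine)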
Example #13
    def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict:

        self.setup(engine)

        # Create a connection to the database
        conn = engine.connect()

        # Record the time spent
        duration = {
            'querying': {},
        }

        # Sample each relation
        self.relations = {}
        sampling_method = {
            True: 'SYSTEM',
            False: 'BERNOULLI'
        }[self.block_sampling]
        for rel_name in self.rel_names:

            # Sample the relation if the number of rows is high enough
            query = 'SELECT * FROM {}'.format(rel_name)
            # Add a sampling statement if the sampling ratio is lower than 1
            sampling_ratio = max(self.sampling_ratio,
                                 self.min_rows / self.rel_cards[rel_name])
            if sampling_ratio < 1:
                # Make sure there won't be fewer samples than the minimum number of allowed rows
                query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
                    sampling_method, sampling_ratio * 100, self.seed)
            date_atts = [
                att for att, typ in self.att_types[rel_name].items()
                if typ == 'date'
            ]
            tic = time.time()
            rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts)
            duration['querying'][rel_name] = time.time() - tic

            # Convert the datetimes to ISO formatted strings
            for att in date_atts:
                rel[att] = rel[att].map(lambda x: x.isoformat())

            # Strip the whitespace from the string columns
            for att in rel.columns:
                if rel[att].dtype == 'object':
                    rel[att] = rel[att].str.rstrip()

            # Store the relation
            self.relations[rel_name] = rel

        # Close the connection to the database
        conn.close()

        return duration
Example #14
def delete_table(engine: sa.engine.base.Engine) -> bool:
    conn = engine.connect()
    try:
        conn.execute(f"drop table {User.__tablename__}")
        return True
    except sa.exc.OperationalError as err:
        if f"no such table: {User.__tablename__}" in err._message():
            if (engine.echo):
                print(f"[!] Table '{User.__tablename__}' does not exist!")
            return False
        else:
            raise err
Example #15
def cleanup_after_sync(resource_name: str, sync_db: sqlalchemy.engine.base.Engine):
    """
    Delete sync temporary tables if they exist

    Parameters
    ----------
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections
    """
    with sync_db.connect() as con:
        con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}")
        con.execute(f"DROP TABLE IF EXISTS Unmatched_{resource_name}")
Example #16
def temp_ids_con(engine: sa.engine.base.Engine, ids: set):
    """Create database connection that makes temp.ids available as single column temp table
    """

    with engine.connect() as con:
        rows = ", ".join([f"({sql_clause_format(id)})" for id in ids])
        queries = [
            "DROP TABLE IF EXISTS temp.ids",
            "CREATE TEMP TABLE temp.ids(id INTEGER)",
            f"INSERT INTO temp.ids(id) VALUES {rows}",
        ]
        for query in queries:
            con.execute(query)
        yield con
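
temp_ids_con is a generator, so in its original module it is presumably wrapped with contextlib.contextmanager; a usage sketch under that assumption, with the unshown sql_clause_format helper stubbed out as plain str for integer ids:

import contextlib
import sqlalchemy as sa

sql_clause_format = str  # assumption: stand-in for the unshown helper used above
temp_ids = contextlib.contextmanager(temp_ids_con)

engine = sa.create_engine("sqlite://")
with temp_ids(engine, {1, 2, 3}) as con:
    print(con.execute("SELECT id FROM temp.ids ORDER BY id").fetchall())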
Example #17
def _create_sync_table_from_resource_df(
    resource_df: DataFrame,
    identity_columns: List[str],
    resource_name: str,
    sync_db: sqlalchemy.engine.base.Engine,
):
    """
    Take fetched data and push to a new temporary sync table.  Includes
    hash and tentative extractor CreateDate/LastModifiedDates.

    Parameters
    ----------
    resource_df: DataFrame
        a DataFrame with current fetched data.
    identity_columns: List[str]
        a List of the identity columns for the resource dataframe.
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections
    """
    with sync_db.connect() as con:
        # ensure sync table exists, need column ordering to be identical to regular table
        con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}")
        con.execute(
            f"""
            CREATE TABLE IF NOT EXISTS Sync_{resource_name} (
                {SYNC_COLUMNS_SQL}
            )
            """
        )

    sync_df: DataFrame = resource_df.copy()
    sync_df = add_hash_and_json_to(sync_df)

    # add (possibly composite) primary key, sorting for consistent ordering
    add_sourceid_to(sync_df, identity_columns)

    now: datetime = datetime.now()
    sync_df["CreateDate"] = now
    sync_df["LastModifiedDate"] = now
    sync_df["SyncNeeded"] = 1

    sync_df = sync_df[SYNC_COLUMNS]
    sync_df.set_index("SourceId", inplace=True)
    # push to temporary sync table
    sync_df.to_sql(
        f"Sync_{resource_name}", sync_db, if_exists="append", index=True, chunksize=1000
    )
Example #18
    def setup(self, engine: sqlalchemy.engine.base.Engine):

        # Retrieve the metadata to know what tables and joins are available
        metadata = tools.get_metadata(engine)
        self.rel_names = tuple(metadata.tables.keys())

        # Create a connection to the database
        conn = engine.connect()

        # Retrieve relation cardinalities
        self.rel_cards = {}
        query = '''
        SELECT relname, reltuples
        FROM pg_class
        WHERE relname IN :rel_names
        '''
        rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
        for (rel_name, card) in rows:
            self.rel_cards[rel_name] = card

        # Retrieve attribute cardinalities and number of nulls
        self.att_cards = defaultdict(dict)
        self.null_fracs = defaultdict(dict)
        query = '''
        SELECT tablename, attname, n_distinct, null_frac
        FROM pg_stats
        WHERE tablename IN :rel_names
        '''
        rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
        for (rel_name, att_name, card, null_frac) in rows:
            self.att_cards[rel_name][att_name] = -card * self.rel_cards[
                rel_name] if card < 0 else card
            self.null_fracs[rel_name][att_name] = null_frac

        # Retrieve the type of each attribute
        self.att_types = defaultdict(dict)
        query = '''
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_name IN :rel_names
        '''
        rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names)
        for (rel_name, att_name, att_type) in rows:
            self.att_types[rel_name][att_name] = att_type

        # Close the connection to the database
        conn.close()
Example #19
def sync_to_db_without_cleanup(
    resource_df: DataFrame,
    identity_columns: List[str],
    resource_name: str,
    sync_db: sqlalchemy.engine.base.Engine,
):
    """
    Take fetched data and sync with database. Creates tables when necessary;
    it is fine if temporary tables already exist. Does not delete temporary tables when finished.

    Parameters
    ----------
    resource_df: DataFrame
        a DataFrame with current fetched data
    identity_columns: List[str]
        a List of the identity columns for the resource dataframe.
    resource_name: str
        the name of the API resource, e.g. "Courses", to be used in SQL
    sync_db: sqlalchemy.engine.base.Engine
        an Engine instance for creating database connections

    Returns
    -------
    DataFrame
        a DataFrame with current fetched data and reconciled CreateDate/LastModifiedDate
    """
    assert (Series(identity_columns).isin(
        resource_df.columns).all()), "Identity columns missing from dataframe"

    # In certain cases we can end up with duplicate records, for example
    # in Canvas when a course belongs to a sub-account. De-duplicate the
    # DataFrame based on the identity_columns
    resource_df.drop_duplicates(subset=identity_columns, inplace=True)

    _create_sync_table_from_resource_df(resource_df, identity_columns,
                                        resource_name, sync_db)

    with sync_db.connect() as con:
        _ensure_main_table_exists(resource_name, con)
        _create_unmatched_records_temp_table(resource_name, con)
        _get_true_create_dates_for_unmatched_records(resource_name, con)
        _update_resource_table_with_changes(resource_name, con)
        result_df: DataFrame = _update_dataframe_with_true_dates(
            resource_df, identity_columns, resource_name, con)

    return result_df
Example #20
def _table_exist(
    table_name: str,
    db_engine: sqlalchemy.engine.base.Engine,
) -> bool:
    with db_engine.connect() as con:
        result: Union[ResultProxy, None] = con.execute(
            """
            SELECT name
            FROM sqlite_master
            WHERE type='table' AND name=?;
        """,
            (table_name, ),
        )

        if result is None:
            return False
        if result.first() is None:
            return False
        return True
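
The check above reads sqlite_master and is therefore SQLite-specific; a dialect-agnostic sketch of the same idea using SQLAlchemy's inspector (the function name is illustrative):

import sqlalchemy

def table_exists_any_dialect(table_name: str,
                             db_engine: sqlalchemy.engine.base.Engine) -> bool:
    # Inspector.has_table is available on SQLAlchemy 1.4+;
    # on older versions db_engine.has_table(table_name) is the equivalent call.
    return sqlalchemy.inspect(db_engine).has_table(table_name)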
Example #21
def load_features(engine: sqlalchemy.engine.base.Engine, processed_data,
                  processed_labels):
    processed_data.to_sql('features',
                          con=engine,
                          index_label='user_id',
                          if_exists='replace',
                          method=psql_insert_copy)

    processed_labels = pd.DataFrame({'is_fraudster': processed_labels},
                                    index=processed_labels.index)

    processed_labels.to_sql('labels',
                            con=engine,
                            index_label='user_id',
                            if_exists='replace',
                            method=psql_insert_copy)

    with engine.connect() as con:
        con.execute('ALTER TABLE features ADD PRIMARY KEY (user_id);')
        con.execute('ALTER TABLE labels ADD PRIMARY KEY (user_id);')
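
psql_insert_copy is not shown above; it is presumably the COPY-based insertion callable from the pandas to_sql documentation. A sketch of such a callable, adapted from that documentation (psycopg2-specific):

import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    # Bulk-load rows with PostgreSQL COPY instead of row-by-row INSERTs.
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        buf = StringIO()
        csv.writer(buf).writerows(data_iter)
        buf.seek(0)
        columns = ", ".join(f'"{k}"' for k in keys)
        table_name = f"{table.schema}.{table.name}" if table.schema else table.name
        cur.copy_expert(f"COPY {table_name} ({columns}) FROM STDIN WITH CSV", buf)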
Example #22
File: utils.py Project: loglabs/mltrace
def _drop_everything(engine: sqlalchemy.engine.base.Engine):
    """(On a live db) drops all foreign key constraints before dropping all
    tables. Workaround for SQLAlchemy not doing DROP ## CASCADE for drop_all()
    (https://github.com/pallets/flask-sqlalchemy/issues/722)
    """

    con = engine.connect()
    trans = con.begin()
    inspector = Inspector.from_engine(engine)

    # We need to re-create a minimal metadata with only the required things to
    # successfully emit drop constraints and tables commands for
    # postgres (based on the actual schema of the running instance)
    meta = MetaData()
    tables = []
    all_fkeys = []

    for view_name in inspector.get_view_names():
        con.execute(
            "DROP MATERIALIZED VIEW IF EXISTS {} CASCADE".format(view_name))

    for table_name in inspector.get_table_names():
        fkeys = []

        for fkey in inspector.get_foreign_keys(table_name):
            if not fkey["name"]:
                continue

            fkeys.append(ForeignKeyConstraint((), (), name=fkey["name"]))

        tables.append(Table(table_name, meta, *fkeys))
        all_fkeys.extend(fkeys)

    for fkey in all_fkeys:
        con.execute(DropConstraint(fkey))

    for table in tables:
        con.execute(DropTable(table))

    trans.commit()
    Base.metadata.drop_all(engine)
Example #23
def request_all_usage_as_df(
    resource: Optional[Resource],
    sync_db: sqlalchemy.engine.base.Engine,
    env_start_date: str,
    env_end_date: str,
) -> DataFrame:
    usage_df: DataFrame = request_latest_usage_as_df(
        resource, start_date(sync_db, env_start_date), end_date(env_end_date))
    if usage_df.empty:
        return usage_df

    usage_df.to_sql("Usage",
                    sync_db,
                    if_exists="append",
                    index=False,
                    chunksize=500)
    # remove duplicates - leave only the most recent
    with sync_db.connect() as con:
        con.execute("DELETE from Usage "
                    "WHERE rowid not in (select max(rowid) "
                    "FROM Usage "
                    "GROUP BY email, asOfDate)")

    return usage_df
Example #24
def create_user(info: dict, engine: sa.engine.base.Engine) -> bool:
    user_table = User.get_table_obj(engine)
    ins = user_table.insert()
    conn = engine.connect()
    ## incomplete fields check integrity hashing, email/uname/pass format validation etc
    try:
        #user = list(
        conn.execute(
            ins.values(
                name=info['name'],
                username=info['username'],
                email=info['email'],
                password=bcrypt.hashpw(info['password'].encode(),
                                       bcrypt.gensalt()).decode(),
                role=info['role'],
            ))  #.returning(sa.literal_column('*'))))
        #if(len(user) == 0): return False
        #else: user = User.from_row_to_obj(engine)
        #print(user)
        return True
    except sa.exc.IntegrityError as err:
        if (engine.echo):
            print(f"[!] {err._message()}")
        return False
Example #25
    def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict:

        self.setup(engine)

        # Create a connection to the database
        conn = engine.connect()

        # Record the time spent
        duration = {'querying': {}, 'parameters': {}}

        # Create histograms per attribute
        self.histograms = {}
        self.n_in_bin = {}
        sampling_method = {
            True: 'SYSTEM',
            False: 'BERNOULLI'
        }[self.block_sampling]

        for rel_name in self.rel_names:

            self.histograms[rel_name] = {}
            self.n_in_bin[rel_name] = {}

            rel_card = self.rel_cards[rel_name]

            # Sample the relation if the number of rows is high enough
            query = 'SELECT * FROM {}'.format(rel_name)
            # Add a sampling statement if the sampling ratio is lower than 1
            sampling_ratio = max(self.sampling_ratio, self.min_rows / rel_card)
            if sampling_ratio < 1:
                # Make sure there won't be fewer samples than the minimum number of allowed rows
                query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format(
                    sampling_method, sampling_ratio * 100, self.seed)
            date_atts = [
                att for att, typ in self.att_types[rel_name].items()
                if typ == 'date'
            ]
            tic = time.time()
            rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts)
            duration['querying'][rel_name] = time.time() - tic

            # Convert the datetimes to ISO formatted strings
            for att in date_atts:
                rel[att] = rel[att].map(lambda x: x.isoformat())

            # Strip the whitespace from the string columns
            for att in rel.columns:
                if rel[att].dtype == 'object':
                    rel[att] = rel[att].str.rstrip()

            # Blacklist the ID columns
            blacklist = [
                att for att in rel.columns
                if '_id' in att or 'id_' in att or att == 'id' or '_sk' in att
                or self.att_types[rel_name][att] == 'character varying'
                or round(rel_card * self.null_fracs[rel_name][att] +
                         self.att_cards[rel_name][att]) == rel_card
            ]

            # Create one histogram per attribute
            tic = time.time()
            for att in set(rel.columns) - set(blacklist):
                rel[att], self.n_in_bin[rel_name][
                    att] = tools.discretize_series(rel[att],
                                                   n_mcv=self.n_mcv,
                                                   n_bins=self.n_bins)
                self.histograms[rel_name][att] = distribution.Distribution(
                    on=att, by=None)
                self.histograms[rel_name][att].build_from_df(
                    rel, types=self.att_types[rel_name])

            duration['parameters'][rel_name] = time.time() - tic

        # Close the connection to the database
        conn.close()

        return duration
Example #26
def update_on_table(df: pd.DataFrame, keys: update_key_type,
                    values: update_key_type, table_name: str,
                    engine: sa.engine.base.Engine, schema: str) -> int:
    """

    :param df: a dataframe with data that needs to be updated. Must have columns to be used as key and some for values
    :param keys: the set of columns to use as key, i.e. update when matched
    :param values: the set of columns to update, i.e. set when matched
    :param table_name: a table name as in util_function
    :param engine: the sqlalchemy engine for the database
    :param schema: a schema of interest - None if default schema of database is ok
    :return: the number of records updated
    """

    # get table
    tbl = util_function(table_name, engine, schema)

    # change nan to None, make sure columns are modified so that we can easily bindparam
    df_ = df.copy()
    df_.columns = [f"{el.lower()}_updt" for el in df_.columns]
    groups = toolz.partition_all(
        CHUNK_SIZE,
        df_.where(pd.notnull(df_), None).to_dict(orient='records'))

    if not isinstance(keys, tuple) and not isinstance(keys, dict):
        raise BadArgumentType(
            "keys and values must either be both tuples or both dicts", None)

    # create where clause, and update statement
    update_statement: dml.Update
    if isinstance(keys, tuple):
        if not isinstance(values, tuple):
            raise BadArgumentType(
                "keys and values must either be both tuples or both dicts",
                None)

        where = [
            tbl.c[el] == sa.bindparam(f"{el.lower()}_updt") for el in keys
        ]
        update_statement = tbl.update().where(sa.and_(*where)).values(
            dict((a, sa.bindparam(f"{a.lower()}_updt")) for a in values))

    if isinstance(keys, dict):
        if not isinstance(values, dict):
            raise BadArgumentType(
                "keys and values must either be both tuples or both dicts",
                None)
        where = [
            tbl.c[k] == sa.bindparam(f"{v.lower()}_updt")
            for k, v in keys.items()
        ]
        update_statement = tbl.update().where(sa.and_(*where)).values(
            dict((k, sa.bindparam(f"{v.lower()}_updt"))
                 for k, v in values.items()))

    # update
    count, last_successful_update = 0, None
    with engine.connect() as connection:
        for group in groups:
            try:
                result = connection.execute(update_statement, group)
                last_successful_update = group[-1]
                count += result.rowcount
            except exc.OperationalError as _:
                # try again
                time.sleep(2)

                try:
                    result = connection.execute(update_statement, group)
                    last_successful_update = group[-1]
                    count += result.rowcount
                except exc.OperationalError as e:
                    raise OperationalError(
                        f"Failed to update records. Last successful update: {last_successful_update}",
                        e)

    return count
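
A call sketch for the tuple form of keys/values (the engine URL, table, and column names are illustrative; util_function, CHUNK_SIZE, and the exception classes used above come from the same unshown module):

import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("postgresql+psycopg2://user:password@localhost/mydb")
df = pd.DataFrame({"ID": [1, 2], "STATUS": ["done", "failed"]})
# Updates jobs.STATUS for rows whose ID matches, in CHUNK_SIZE batches
n_updated = update_on_table(df, keys=("ID",), values=("STATUS",),
                            table_name="jobs", engine=engine, schema=None)
print(f"{n_updated} rows updated")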
Example #27
def atb_fixed_var_om_existing(
    results: pd.DataFrame,
    atb_hr_df: pd.DataFrame,
    settings: dict,
    pudl_engine: sqlalchemy.engine.base.Engine,
) -> pd.DataFrame:
    """Add fixed and variable O&M for existing power plants

    ATB O&M data for new power plants are used as reference values. Fixed and variable
    O&M for each technology and heat rate are calculated. Assume that O&M scales with
    heat rate from new plants to existing generators. A separate multiplier for fixed
    O&M is specified in the settings file.

    Parameters
    ----------
    results : DataFrame
        Compiled results of clustered power plants with weighted average heat rates.
        Note that column names should include "technology", "Heat_rate_MMBTU_per_MWh",
        and "region". Technology names should not yet be converted to snake case.
    atb_hr_df : DataFrame
        Heat rate data from NREL ATB
    settings : dict
        User-defined parameters from a settings file
    pudl_engine : sqlalchemy.engine.base.Engine
        Connection to the database containing NREL ATB cost data
        (the technology_costs_nrelatb table)

    Returns
    -------
    DataFrame
        Same as incoming "results" dataframe but with new columns
        "Fixed_OM_cost_per_MWyr" and "Var_OM_cost_per_MWh"
    """
    logger.info("Adding fixed and variable O&M for existing plants")
    techs = settings["eia_atb_tech_map"]
    existing_year = settings["atb_existing_year"]

    # ATB string is <technology>_<tech_detail>
    techs = {eia: atb.split("_") for eia, atb in techs.items()}
    df_list = []
    grouped_results = results.reset_index().groupby(
        ["plant_id_eia", "technology"], as_index=False)
    for group, _df in grouped_results:

        plant_id, eia_tech = group
        existing_hr = _df["heat_rate_mmbtu_mwh"].mean()
        try:
            atb_tech, tech_detail = techs[eia_tech]
        except KeyError:
            if eia_tech in settings["tech_groups"]:
                raise KeyError(
                    f"{eia_tech} is defined in 'tech_groups' but doesn't have a "
                    "corresponding ATB technology in 'eia_atb_tech_map'")

            else:
                raise KeyError(
                    f"{eia_tech} doesn't have a corresponding ATB technology in "
                    "'eia_atb_tech_map'")

        try:
            new_build_hr = (atb_hr_df.query(
                "technology==@atb_tech & tech_detail==@tech_detail"
                "& basis_year==@existing_year").squeeze().at["heat_rate"])
        except ValueError:
            # Not all technologies have a heat rate. If they don't, just set both values
            # to 1
            existing_hr = 1
            new_build_hr = 1
        try:
            s = f"""
                select parameter_value
                from technology_costs_nrelatb
                where
                    technology == "{atb_tech}"
                    AND tech_detail == "{tech_detail}"
                    AND basis_year == "{existing_year}"
                    AND financial_case == "Market"
                    AND cost_case == "Mid"
                    AND atb_year == "{settings['atb_data_year']}"
                    AND parameter == "variable_o_m_mwh"
                
                """
            atb_var_om_mwh = pudl_engine.execute(s).fetchall()[0][0]
        except IndexError:
            # logger.warning(f"No variable O&M for {atb_tech}")
            atb_var_om_mwh = 0

        try:
            s = f"""
                select parameter_value
                from technology_costs_nrelatb
                where
                    technology == "{atb_tech}"
                    AND tech_detail == "{tech_detail}"
                    AND basis_year == "{existing_year}"
                    AND financial_case == "Market"
                    AND cost_case == "Mid"
                    AND atb_year == "{settings['atb_data_year']}"
                    AND parameter == "fixed_o_m_mw"
                
                """
            atb_fixed_om_mw_yr = pudl_engine.execute(s).fetchall()[0][0]
        except IndexError:
            # logger.warning(f"No fixed O&M for {atb_tech}")
            atb_fixed_om_mw_yr = 0

        nems_o_m_techs = [
            "Combined Cycle",
            "Combustion Turbine",
            "Coal",
            "Steam Turbine",
            "Hydroelectric",
            "Geothermal",
            "Nuclear",
        ]
        if any(t in eia_tech for t in nems_o_m_techs):
            # Change CC and CT O&M to EIA NEMS values, which are much higher for CCs and
            # lower for CTs than a heat rate & linear multiplier correction to the ATB
            # values.
            # Add natural gas steam turbine O&M.
            # Also using the new values for coal plants, assuming 40-50 yr age and half
            # FGD
            # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf
            # logger.info(f"Using NEMS values for {eia_tech} fixed/variable O&M")
            target_usd_year = settings["target_usd_year"]
            simple_o_m = {
                "Combined Cycle": {
                    # "fixed_o_m_mw": inflation_price_adjustment(
                    #     13.08 * 1000, 2017, target_usd_year
                    # ),
                    # "variable_o_m_mwh": inflation_price_adjustment(
                    #     3.91, 2017, target_usd_year
                    # )
                },
                # "Combustion Turbine": {
                #     # This includes both the Fixed O&M and Capex. Capex includes
                #     # variable O&M, which is split out in the calculations below.
                #     "fixed_o_m_mw": inflation_price_adjustment(
                #         (5.33 + 6.90) * 1000, 2017, target_usd_year
                #     ),
                #     "variable_o_m_mwh": 0,
                # },
                "Natural Gas Steam Turbine": {
                    # NEMS documentation splits capex and fixed O&M across 2 tables
                    # "fixed_o_m_mw": inflation_price_adjustment(
                    #     (15.96 + 24.68) * 1000, 2017, target_usd_year
                    # ),
                    "variable_o_m_mwh":
                    inflation_price_adjustment(1.0, 2017, target_usd_year)
                },
                "Coal": {
                    # "fixed_o_m_mw": inflation_price_adjustment(
                    #     ((22.2 + 27.88) / 2 + 46.01) * 1000, 2017, target_usd_year
                    # ),
                    "variable_o_m_mwh":
                    inflation_price_adjustment(1.78, 2017, target_usd_year)
                },
                "Conventional Hydroelectric": {
                    "fixed_o_m_mw":
                    inflation_price_adjustment(44.56 * 1000, 2017,
                                               target_usd_year),
                    "variable_o_m_mwh":
                    0,
                },
                "Geothermal": {
                    "fixed_o_m_mw":
                    inflation_price_adjustment(198.04 * 1000, 2017,
                                               target_usd_year),
                    "variable_o_m_mwh":
                    0,
                },
                "Pumped Hydro": {
                    "fixed_o_m_mw":
                    inflation_price_adjustment((23.63 + 14.83) * 1000, 2017,
                                               target_usd_year),
                    "variable_o_m_mwh":
                    0,
                },
            }

            if "Combined Cycle" in eia_tech:
                # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf
                plant_capacity = _df[settings["capacity_col"]].sum()
                assert plant_capacity > 0
                if plant_capacity < 500:
                    fixed = 15.62 * 1000
                    variable = 4.31
                elif 500 <= plant_capacity < 1000:
                    fixed = 9.27 * 1000
                    variable = 3.42
                else:
                    fixed = 11.68 * 1000
                    variable = 3.37

                _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment(
                    fixed, 2017, target_usd_year)
                _df["Var_OM_cost_per_MWh"] = inflation_price_adjustment(
                    variable, 2017, target_usd_year)

            if "Combustion Turbine" in eia_tech:
                # need to adjust the EIA fixed/variable costs because they have no
                # variable cost per MWh for existing CTs but they do have per MWh for
                # new build. Assume $11/MWh from new-build and 4% CF:
                # (11*8760*0.04/1000)=$3.85/kW-yr. Scale the new-build variable
                # (~$11/MWh) by relative heat rate and subtract a /kW-yr value as
                # calculated above from the FOM.
                # Based on conversation with Jesse J. on Dec 20, 2019.
                plant_capacity = _df[settings["capacity_col"]].sum()
                op, op_value = (settings.get("atb_modifiers", {}).get(
                    "ngct", {}).get("Var_OM_cost_per_MWh", (None, None)))

                if op:
                    f = operator.attrgetter(op)
                    atb_var_om_mwh = f(operator)(atb_var_om_mwh, op_value)

                variable = atb_var_om_mwh  # * (existing_hr / new_build_hr)

                if plant_capacity < 100:
                    annual_capex = 9.0 * 1000
                    fixed = annual_capex + 5.96 * 1000
                elif 100 <= plant_capacity <= 300:
                    annual_capex = 6.18 * 1000
                    fixed = annual_capex + 6.43 * 1000
                else:
                    annual_capex = 6.95 * 1000
                    fixed = annual_capex + 3.99 * 1000

                fixed = fixed - (variable * 8760 * 0.04)

                _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment(
                    fixed, 2017, target_usd_year)
                _df["Var_OM_cost_per_MWh"] = inflation_price_adjustment(
                    variable, 2017, target_usd_year)

            if "Natural Gas Steam Turbine" in eia_tech:
                # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf
                plant_capacity = _df[settings["capacity_col"]].sum()
                assert plant_capacity > 0
                if plant_capacity < 500:
                    annual_capex = 18.86 * 1000
                    fixed = annual_capex + 29.73 * 1000
                elif 500 <= plant_capacity < 1000:
                    annual_capex = 11.57 * 1000
                    fixed = annual_capex + 17.98 * 1000
                else:
                    annual_capex = 10.82 * 1000
                    fixed = annual_capex + 14.51 * 1000

                _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment(
                    fixed, 2017, target_usd_year)
                _df["Var_OM_cost_per_MWh"] = simple_o_m[
                    "Natural Gas Steam Turbine"]["variable_o_m_mwh"]

            if "Coal" in eia_tech:

                plant_capacity = _df[settings["capacity_col"]].sum()
                assert plant_capacity > 0

                age = settings["model_year"] - _df.operating_date.dt.year
                age = age.fillna(age.mean())
                age = age.fillna(40)

                # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf
                annual_capex = (16.53 + (0.126 * age) + (5.68 * 0.5)) * 1000

                if plant_capacity < 500:
                    fixed = 44.21 * 1000
                elif 500 <= plant_capacity < 1000:
                    fixed = 34.02 * 1000
                elif 1000 <= plant_capacity < 2000:
                    fixed = 28.52 * 1000
                else:
                    fixed = 33.27 * 1000

                _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment(
                    fixed + annual_capex, 2017, target_usd_year)
                _df["Var_OM_cost_per_MWh"] = simple_o_m["Coal"][
                    "variable_o_m_mwh"]
            if "Hydroelectric" in eia_tech:
                _df["Fixed_OM_cost_per_MWyr"] = simple_o_m[
                    "Conventional Hydroelectric"]["fixed_o_m_mw"]
                _df["Var_OM_cost_per_MWh"] = simple_o_m[
                    "Conventional Hydroelectric"]["variable_o_m_mwh"]
            if "Geothermal" in eia_tech:
                _df["Fixed_OM_cost_per_MWyr"] = simple_o_m["Geothermal"][
                    "fixed_o_m_mw"]
                _df["Var_OM_cost_per_MWh"] = simple_o_m["Geothermal"][
                    "variable_o_m_mwh"]
            if "Pumped" in eia_tech:
                _df["Fixed_OM_cost_per_MWyr"] = simple_o_m["Pumped Hydro"][
                    "fixed_o_m_mw"]
                _df["Var_OM_cost_per_MWh"] = simple_o_m["Pumped Hydro"][
                    "variable_o_m_mwh"]
            if "Nuclear" in eia_tech:
                num_units = len(_df)
                plant_capacity = _df[settings["capacity_col"]].sum()

                # Operating costs for different size/num units in 2016 INL report
                # "Economic and Market Challenges Facing the U.S. Nuclear Fleet"
                # https://gain.inl.gov/Shared%20Documents/Economics-Nuclear-Fleet.pdf,
                # table 1. Average of the two costs are used in each case.
                # The costs in that report include fuel and VOM. Assume $0.66/mmbtu
                # and $2.32/MWh plus 90% CF (ATB 2020) to get the costs below.
                # The INL report doesn't give a dollar year for costs, assume 2015.
                if num_units == 1 and plant_capacity < 900:
                    fixed = 315000
                elif num_units == 1 and plant_capacity >= 900:
                    fixed = 252000
                else:
                    fixed = 177000
                # age = (settings["model_year"] - _df.operating_date.dt.year).values
                # age = age.fillna(age.mean())
                # age = age.fillna(40)
                # EIA, 2020, "Assumptions to Annual Energy Outlook, Electricity Market Module,"
                # Available: https://www.eia.gov/outlooks/aeo/assumptions/pdf/electricity.pdf
                # fixed = np.ones_like(age)
                # fixed[age < 30] *= 27 * 1000
                # fixed[age >= 30] *= (27+37) * 1000

                _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment(
                    fixed, 2015, target_usd_year)
                _df["Var_OM_cost_per_MWh"] = atb_var_om_mwh * (existing_hr /
                                                               new_build_hr)

        else:
            _df["Fixed_OM_cost_per_MWyr"] = atb_fixed_om_mw_yr
            _df["Var_OM_cost_per_MWh"] = atb_var_om_mwh * (existing_hr /
                                                           new_build_hr)

        df_list.append(_df)

    mod_results = pd.concat(df_list, ignore_index=True)
    # mod_results = mod_results.sort_values(["model_region", "technology", "cluster"])
    mod_results.loc[:, "Fixed_OM_cost_per_MWyr"] = (
        mod_results.loc[:, "Fixed_OM_cost_per_MWyr"].astype(int))
    mod_results.loc[:, "Var_OM_cost_per_MWh"] = mod_results.loc[:, "Var_OM_cost_per_MWh"]

    return mod_results
Example #28
def _create_usage_table_if_it_does_not_exist(
        db_engine: sqlalchemy.engine.base.Engine) -> None:
    if _table_exist(USAGE_TABLE_NAME, db_engine) is False:
        db_engine.execute(f"""CREATE TABLE {USAGE_TABLE_NAME} (
            FILE_NAME TEXT PRIMARY KEY
        );""")
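
Taken together with usage_file_is_processed and insert_usage_file_name above, a minimal processing-loop sketch (the engine URL and file names are illustrative; USAGE_TABLE_NAME is the same unshown constant):

import sqlalchemy

db_engine = sqlalchemy.create_engine("sqlite:///usage.db")
for file_name in ["usage_2021_01.csv", "usage_2021_02.csv"]:
    if usage_file_is_processed(file_name, db_engine):
        continue
    # ... process the file here ...
    insert_usage_file_name(file_name, db_engine)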
Example #29
def fetch_atb_costs(
    pudl_engine: sqlalchemy.engine.base.Engine,
    settings: dict,
    offshore_spur_costs: pd.DataFrame = None,
) -> pd.DataFrame:
    """Get NREL ATB power plant cost data from database, filter where applicable.

    This function can also remove NREL ATB offshore spur costs if more accurate costs
    will be included elsewhere (e.g. as part of total interconnection costs).

    Parameters
    ----------
    pudl_engine : sqlalchemy.Engine
        A sqlalchemy connection for use by pandas
    settings : dict
        User-defined parameters from a settings file. Needs to have keys
        `atb_data_year`, `atb_new_gen`, and `target_usd_year`. If the key
        `atb_financial_case` is not included, the default value will be "Market".
    offshore_spur_costs : pd.DataFrame
        An optional dataframe with spur costs for offshore wind resources. These costs
        are included in ATB for a fixed distance (same for all sites). PowerGenome
        interconnection costs for offshore sites include a spur cost calculated
        using actual distance from shore.

    Returns
    -------
    pd.DataFrame
        Power plant cost data with columns:
        ['technology', 'cap_recovery_years', 'cost_case', 'financial_case',
       'basis_year', 'tech_detail', 'fixed_o_m_mw', 'variable_o_m_mwh', 'capex', 'cf',
       'fuel', 'lcoe', 'wacc_nominal']
    """
    logger.info("Loading NREL ATB data")

    col_names = [
        "technology",
        "tech_detail",
        "cost_case",
        "parameter",
        "basis_year",
        "parameter_value",
        "dollar_year",
    ]
    atb_year = settings["atb_data_year"]
    fin_case = settings.get("atb_financial_case", "Market")

    # Fetch cost data from sqlite and create dataframe. Only get values for techs/cases
    # listed in the settings file.
    all_rows = []
    wacc_rows = []
    tech_list = []
    techs = settings["atb_new_gen"]
    mod_techs = []
    if settings.get("modified_atb_new_gen"):
        for _, m in settings.get("modified_atb_new_gen").items():
            mod_techs.append([
                m["atb_technology"], m["atb_tech_detail"], m["atb_cost_case"],
                None
            ])

    cost_params = (
        "capex_mw",
        "fixed_o_m_mw",
        "variable_o_m_mwh",
        "capex_mwh",
        "fixed_o_m_mwh",
    )
    # add_pv_wacc = True
    for tech in techs + mod_techs:
        tech, tech_detail, cost_case, _ = tech
        # if tech == "UtilityPV":
        #     add_pv_wacc = False

        s = f"""
        SELECT technology, tech_detail, cost_case, parameter, basis_year, parameter_value, dollar_year
        from technology_costs_nrelatb
        where
            technology == "{tech}"
            AND tech_detail == "{tech_detail}"
            AND financial_case == "{fin_case}"
            AND cost_case == "{cost_case}"
            AND atb_year == {atb_year}
            AND parameter IN ({','.join('?'*len(cost_params))})
        """
        all_rows.extend(pudl_engine.execute(s, cost_params).fetchall())

        if tech not in tech_list:
            # ATB2020 summary file provides a single WACC for each technology and a single
            # tech detail of "*", so need to fetch this separately from other cost params.
            # Only need to fetch once per technology.
            wacc_s = f"""
            select technology, cost_case, basis_year, parameter_value
            from technology_costs_nrelatb
            where
                technology == "{tech}"
                AND financial_case == "{fin_case}"
                AND cost_case == "{cost_case}"
                AND atb_year == {atb_year}
                AND parameter == "wacc_nominal"
            """
            wacc_rows.extend(pudl_engine.execute(wacc_s).fetchall())

        tech_list.append(tech)

    if "Battery" not in tech_list:
        df = pd.DataFrame(all_rows, columns=col_names)
        wacc_df = pd.DataFrame(
            wacc_rows,
            columns=["technology", "cost_case", "basis_year", "wacc_nominal"])
    else:
        # ATB doesn't have a WACC for battery storage. We use UtilityPV WACC as a default
        # stand-in -- make sure we have it in case.
        s = 'SELECT DISTINCT("technology") from technology_costs_nrelatb WHERE parameter == "wacc_nominal"'
        atb_techs = [x[0] for x in pudl_engine.execute(s).fetchall()]
        battery_wacc_standin = settings.get("atb_battery_wacc")
        battery_tech = [x for x in techs if x[0] == "Battery"][0]
        if isinstance(battery_wacc_standin, float):
            if battery_wacc_standin > 0.1:
                logger.warning(
                    f"You defined a battery WACC of {battery_wacc_standin}, which seems"
                    " very high. Check settings parameter `atb_battery_wacc`.")
            battery_wacc_rows = [(battery_tech[0], battery_tech[2], year,
                                  battery_wacc_standin)
                                 for year in range(2017, 2051)]
            wacc_rows.extend(battery_wacc_rows)
        elif battery_wacc_standin in atb_techs:
            # if battery_wacc_standin in tech_list:
            #     pass
            # else:
            logger.info(
                f"Using {battery_wacc_standin} {fin_case} WACC for Battery storage."
            )
            wacc_s = f"""
            select technology, cost_case, basis_year, parameter_value
            from technology_costs_nrelatb
            where
                technology == "{battery_wacc_standin}"
                AND financial_case == "{fin_case}"
                AND cost_case == "Mid"
                AND atb_year == {atb_year}
                AND parameter == "wacc_nominal"
            
            """
            b_rows = pudl_engine.execute(wacc_s).fetchall()
            battery_wacc_rows = [(battery_tech[0], battery_tech[2], b_row[2],
                                  b_row[3]) for b_row in b_rows]
            wacc_rows.extend(battery_wacc_rows)
        else:
            raise ValueError(
                f"The settings key `atb_battery_wacc` value is {battery_wacc_standin}. It "
                f"should either be a float or a string from the list {atb_techs}."
            )

        df = pd.DataFrame(all_rows, columns=col_names)
        wacc_df = pd.DataFrame(
            wacc_rows,
            columns=["technology", "cost_case", "basis_year", "wacc_nominal"])

    # Transform from tidy to wide dataframe, which makes it easier to fill generator
    # rows with the correct values.
    atb_costs = (df.drop_duplicates().set_index([
        "technology",
        "tech_detail",
        "cost_case",
        "dollar_year",
        "basis_year",
        "parameter",
    ]).unstack(level=-1))
    atb_costs.columns = atb_costs.columns.droplevel(0)
    atb_costs = (atb_costs.reset_index().merge(
        wacc_df, on=["technology", "cost_case",
                     "basis_year"], how="left").drop_duplicates())
    atb_costs = atb_costs.fillna(0)

    usd_columns = [
        "fixed_o_m_mw",
        "fixed_o_m_mwh",
        "variable_o_m_mwh",
        "capex_mw",
        "capex_mwh",
    ]
    for col in usd_columns:
        if col not in atb_costs.columns:
            atb_costs[col] = 0

    atb_target_year = settings["target_usd_year"]
    atb_costs[usd_columns] = atb_costs.apply(
        lambda row: inflation_price_adjustment(row[usd_columns],
                                               base_year=row["dollar_year"],
                                               target_year=atb_target_year),
        axis=1,
    )

    if any("PV" in tech for tech in tech_list) and atb_year == 2019:
        print("Inflating ATB 2019 PV costs from DC to AC")
        atb_costs.loc[atb_costs["technology"].str.contains("PV"),
                      ["capex_mw", "fixed_o_m_mw", "variable_o_m_mwh"],
                      ] *= settings.get("pv_ac_dc_ratio", 1.3)
    elif atb_year > 2019:
        logger.info(
            "PV costs are already in AC units, not inflating the cost.")

    if offshore_spur_costs is not None and "OffShoreWind" in atb_costs[
            "technology"]:
        idx_cols = ["technology", "tech_detail", "cost_case", "basis_year"]
        offshore_spur_costs = offshore_spur_costs.set_index(idx_cols)
        atb_costs = atb_costs.set_index(idx_cols)

        atb_costs.loc[idx["OffShoreWind", :, :, :], "capex_mw"] = (
            atb_costs.loc[idx["OffShoreWind", :, :, :], "capex_mw"]  # .values
            - offshore_spur_costs.loc[idx["OffShoreWind", :, :, :],
                                      "capex_mw"]  # .values
        )
        atb_costs = atb_costs.reset_index()

    return atb_costs