def process(self, debug=False): start = time.time() self.logger.info("Running BaDataPyoCleaner for %d rows" % len(self.d.df)) self.d.df = self.d.df.fillna(0) if not debug: self.r = self.d.df.apply(self._process, axis=1) else: r_list = [] delta_list = [] for idx, row in self.d.df.iterrows(): _, r, deltas = self._process(row, debug=True) r_list.append(r) delta_list.append(deltas) self.r = pd.concat(r_list, axis=1).transpose() self.deltas = pd.concat(delta_list, axis=1).transpose() self.deltas.index = self.d.df.index self.r.index = self.d.df.index # Make sure the cleaning step performed as expected self.r = BaData(df=self.r) self.logger.info("Checking BAs...") for ba in self.r.regions: self.r.checkBA(ba) self.logger.info("Execution took %.2f seconds" % (time.time() - start))
def update_d3map(folder_in, folder_out, file_name, thresh_date="2000-01-01"):
    poll = BaData(fileNm=join(folder_in, f"{file_name}_co2.csv"), variable="CO2")
    elec = BaData(fileNm=join(folder_in, f"{file_name}_elec.csv"), variable="E")

    # Remove old map data
    shutil.rmtree(folder_out)
    os.makedirs(folder_out, exist_ok=True)

    for ts in poll.df.loc[thresh_date:, :].index:
        _ = create_graph(poll, elec, ts, folder_out=folder_out, save_data=True)

def process(self): """ Compute emissions production, consumption and flows. Compute (i) production emissions, and (ii) consumption-based emissions factors Then recreate a BaData object for emissions and check physical balances. """ self.logger.info("Running BaDataEmissionsCalc for %d rows" % len(self.df)) cnt_na = self.df.isna().any().sum() if cnt_na > 0: self.logger.warning(f"Setting {cnt_na} NaNs to zero") self.logger.debug( f"Dumping cols with NaNs: {self.df.columns[self.df.isna().any()]}" ) self._add_production_emissions() self._add_consumption_efs() # Create columns for demand for ba in self.regions: self.df.loc[:, "%s_%s_D" % (self.poll, ba)] = ( self.df.loc[:, "%si_%s_D" % (self.poll, ba)] * self.df.loc[:, self.ba_data.get_cols(r=ba, field="D")[0]] ) # Create columns for pairwise trade for ba in self.regions: for ba2 in self.ba_data.get_trade_partners(ba): imp = self.df.loc[:, self.KEY_E["ID"] % (ba, ba2)].apply( lambda x: min(x, 0) ) exp = self.df.loc[:, self.KEY_E["ID"] % (ba, ba2)].apply( lambda x: max(x, 0) ) self.df.loc[:, self.KEY_poll["ID"] % (ba, ba2)] = ( imp * self.df.loc[:, "%si_%s_D" % (self.poll, ba2)] + exp * self.df.loc[:, "%si_%s_D" % (self.poll, ba)] ) # Create columns for total trade for ba in self.regions: self.df.loc[:, self.KEY_poll["TI"] % ba] = self.df.loc[ :, [ self.KEY_poll["ID"] % (ba, ba2) for ba2 in self.ba_data.get_trade_partners(ba) ], ].sum(axis=1) # Create BaData object for pollutant self.poll_data = BaData( df=self.df.loc[ :, [col for col in self.df.columns if "%s_" % self.poll in col] ], variable=self.poll, ) # Check balances self.logger.warn("Consumption calcs - unimplemented balance check!")
def process(self, debug=False, with_ng_src=True):
    start = time.time()
    self.logger.info("Running BaDataCvxCleaner for %d rows" % len(self.d.df))
    self.d.df = self.d.df.fillna(0)
    results = []

    def cvx_solve(row, regions, debug=False):
        if row.isna().sum() > 0:
            raise ValueError("Cannot call this method on data with NaNs")

        n_regions = len(regions)

        D = row[[KEYS["E"]["D"] % r for r in regions]].values
        D_W = [
            el**0.5
            for el in row[[KEYS["E"]["D"] % r + "_W" for r in regions]].values
        ]
        NG = row[[KEYS["E"]["NG"] % r for r in regions]].values
        NG_W = [
            el**0.5
            for el in row[[KEYS["E"]["NG"] % r + "_W" for r in regions]].values
        ]
        TI = row[[KEYS["E"]["TI"] % r for r in regions]].values
        TI_W = [
            el**0.5
            for el in row[[KEYS["E"]["TI"] % r + "_W" for r in regions]].values
        ]
        delta_D = cp.Variable(n_regions, name="delta_D")
        delta_NG = cp.Variable(n_regions, name="delta_NG")
        delta_TI = cp.Variable(n_regions, name="delta_TI")

        obj = (
            cp.sum_squares(cp.multiply(D_W, delta_D))
            + cp.sum_squares(cp.multiply(NG_W, delta_NG))
            + cp.sum_squares(cp.multiply(TI_W, delta_TI))
        )

        ID = {}
        ID_W = {}
        for i, ri in enumerate(regions):
            for j, rj in enumerate(regions):
                if KEYS["E"]["ID"] % (ri, rj) in row.index:
                    ID[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj)]
                    ID_W[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj) + "_W"]
        delta_ID = {k: cp.Variable(name=f"{k}") for k in ID}

        constraints = [
            D + delta_D >= 1.0,
            NG + delta_NG >= 1.0,
            D + delta_D + TI + delta_TI - NG - delta_NG == 0.0,
        ]

        if with_ng_src:
            NG_SRC = {}
            NG_SRC_W = {}
            for i, src in enumerate(SRC):
                for j, r in enumerate(regions):
                    if KEYS["E"][f"SRC_{src}"] % r in row.index:
                        NG_SRC[(src, r)] = row[KEYS["E"][f"SRC_{src}"] % r]
                        NG_SRC_W[(src, r)] = row[KEYS["E"][f"SRC_{src}"] % r + "_W"]
            delta_NG_SRC = {k: cp.Variable(name=f"{k}") for k in NG_SRC}

            for k in NG_SRC:
                constraints += [NG_SRC[k] + delta_NG_SRC[k] >= 1.0]
                obj += NG_SRC_W[k] * delta_NG_SRC[k] ** 2

        # Adding the antisymmetry constraints twice is less efficient, but
        # not a huge deal
        for ri, rj in ID:  # if (ri, rj) is in ID, then (rj, ri) must also be
            constraints += [
                ID[(ri, rj)]
                + delta_ID[(ri, rj)]
                + ID[(rj, ri)]
                + delta_ID[(rj, ri)]
                == 0.0
            ]
            obj += ID_W[(ri, rj)] * delta_ID[(ri, rj)] ** 2

        for i, ri in enumerate(regions):
            if with_ng_src:
                constraints += [
                    NG[i]
                    + delta_NG[i]
                    - cp.sum(
                        [
                            NG_SRC[(src, ri)] + delta_NG_SRC[(src, ri)]
                            for src in SRC
                            if (src, ri) in NG_SRC
                        ]
                    )
                    == 0.0
                ]
            constraints += [
                TI[i]
                + delta_TI[i]
                - cp.sum(
                    [
                        ID[(ri, rj)] + delta_ID[(ri, rj)]
                        for rj in regions
                        if (ri, rj) in ID
                    ]
                )
                == 0.0
            ]

        objective = cp.Minimize(obj)
        prob = cp.Problem(objective, constraints)
        prob.solve()

        if with_ng_src:
            r = pd.concat(
                [
                    pd.Series(
                        NG + delta_NG.value,
                        index=[KEYS["E"]["NG"] % r for r in regions],
                    ),
                    pd.Series(
                        D + delta_D.value,
                        index=[KEYS["E"]["D"] % r for r in regions],
                    ),
                    pd.Series(
                        TI + delta_TI.value,
                        index=[KEYS["E"]["TI"] % r for r in regions],
                    ),
                    pd.Series(
                        {KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value for k in ID}
                    ),
                    pd.Series(
                        {
                            KEYS["E"][f"SRC_{s}"] % r: NG_SRC[(s, r)]
                            + delta_NG_SRC[(s, r)].value
                            for (s, r) in NG_SRC
                        }
                    ),
                    pd.Series({"CleaningObjective": prob.value}),
                ]
            )
        else:
            r = pd.concat(
                [
                    pd.Series(
                        NG + delta_NG.value,
                        index=[KEYS["E"]["NG"] % r for r in regions],
                    ),
                    pd.Series(
                        D + delta_D.value,
                        index=[KEYS["E"]["D"] % r for r in regions],
                    ),
                    pd.Series(
                        TI + delta_TI.value,
                        index=[KEYS["E"]["TI"] % r for r in regions],
                    ),
                    pd.Series(
                        {KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value for k in ID}
                    ),
                    pd.Series({"CleaningObjective": prob.value}),
                ]
            )

        if not debug:
            return r

        if with_ng_src:
            deltas = pd.concat(
                [
                    pd.Series(
                        delta_NG.value, index=[KEYS["E"]["NG"] % r for r in regions]
                    ),
                    pd.Series(
                        delta_D.value, index=[KEYS["E"]["D"] % r for r in regions]
                    ),
                    pd.Series(
                        delta_TI.value, index=[KEYS["E"]["TI"] % r for r in regions]
                    ),
                    pd.Series({KEYS["E"]["ID"] % k: delta_ID[k].value for k in ID}),
                    pd.Series(
                        {
                            KEYS["E"][f"SRC_{s}"] % r: delta_NG_SRC[(s, r)].value
                            for (s, r) in NG_SRC
                        }
                    ),
                ]
            )
        else:
            deltas = pd.concat(
                [
                    pd.Series(
                        delta_NG.value, index=[KEYS["E"]["NG"] % r for r in regions]
                    ),
                    pd.Series(
                        delta_D.value, index=[KEYS["E"]["D"] % r for r in regions]
                    ),
                    pd.Series(
                        delta_TI.value, index=[KEYS["E"]["TI"] % r for r in regions]
                    ),
                    pd.Series({KEYS["E"]["ID"] % k: delta_ID[k].value for k in ID}),
                ]
            )
        return pd.concat([r, deltas.rename(lambda x: x + "_Delta")])

    cvx_solve = dask.delayed(cvx_solve)
    for idx, row in self.d.df.iterrows():
        results.append(cvx_solve(row, self.d.regions, debug=debug))
    results = dask.compute(*results, scheduler="processes")
    df = pd.DataFrame(results, index=self.d.df.index)

    self.r = df.loc[
        :,
        [
            c
            for c in df.columns
            if "Delta" not in c and "CleaningObjective" not in c
        ],
    ]
    self.CleaningObjective = df.CleaningObjective
    self.deltas = df.loc[:, [c for c in df.columns if "Delta" in c]]

    # Make sure the cleaning step performed as expected
    self.r = BaData(df=self.r)
    self.logger.info("Checking BAs...")
    for ba in self.r.regions:
        self.r.checkBA(ba)
    self.logger.info("Execution took %.2f seconds" % (time.time() - start))

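# A minimal, self-contained sketch of the reconciliation idea implemented
# above, on a toy two-region system (hypothetical numbers, plain unweighted
# penalties; the real cleaner uses per-column weights and source-level
# constraints). Each region must satisfy D + TI = NG, the pairwise exchange
# must be antisymmetric, and TI must equal the sum of exchanges; we look for
# the smallest adjustment that restores all of these.
import cvxpy as cp
import numpy as np

D = np.array([100.0, 52.0])  # demand
NG = np.array([120.0, 40.0])  # net generation
TI = np.array([18.0, -10.0])  # total interchange
ID_ab, ID_ba = 18.0, -22.0  # pairwise exchange, not quite antisymmetric

dD, dNG, dTI = cp.Variable(2), cp.Variable(2), cp.Variable(2)
dAB, dBA = cp.Variable(), cp.Variable()
obj = (
    cp.sum_squares(dD)
    + cp.sum_squares(dNG)
    + cp.sum_squares(dTI)
    + dAB**2
    + dBA**2
)
constraints = [
    D + dD + TI + dTI - NG - dNG == 0,  # energy balance per region
    ID_ab + dAB + ID_ba + dBA == 0,  # antisymmetry of exchanges
    TI[0] + dTI[0] == ID_ab + dAB,  # TI is the sum of pairwise exchanges
    TI[1] + dTI[1] == ID_ba + dBA,
    D + dD >= 1.0,
    NG + dNG >= 1.0,
]
cp.Problem(cp.Minimize(obj), constraints).solve()
print(np.round(D + dD.value, 1), np.round(NG + dNG.value, 1), np.round(TI + dTI.value, 1))
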
def process(self): self.logger.info("Running BaDataBasicCleaner") start = time.time() data = self.d missing_D_cols = [ col for col in data.NG_cols if col not in data.D_cols ] self.logger.info("Adding demand columns for %d bas" % len(missing_D_cols)) for ba in missing_D_cols: data.df.loc[:, data.KEY["D"] % ba] = 1.0 data.df.loc[:, data.KEY["NG"] % ba] -= 1.0 data.df.loc[:, data.KEY["TI"] % ba] -= 1.0 # AVRN only exports to BPAT - this is missing for now if "AVRN" not in data.ID_cols: self.logger.info("Adding trade columns for AVRN") ba = "AVRN" ba2 = "BPAT" data.df.loc[:, data.KEY["ID"] % (ba, ba2)] = (data.df.loc[:, data.KEY["NG"] % ba] - 1.0) data.df.loc[:, data.KEY["ID"] % (ba2, ba)] = (-data.df.loc[:, data.KEY["NG"] % ba] + 1.0) # Add columns for biomass and geothermal for CISO # We are assuming constant generation for each of these sources # based on historical data. Before updating this, need to # contact the EIA API maintainers to understand why this isn't # reported and where to find it self.logger.info("Adding GEO and BIO columns for CISO") data.df.loc[:, "EBA.CISO-ALL.NG.GEO.H"] = 900.0 data.df.loc[:, "EBA.CISO-ALL.NG.BIO.H"] = 600.0 # data.df.loc[:, "EBA.CISO-ALL.NG.H"] += 600.0 + 900.0 # Add columns for the BAs that are outside of the US foreign_bas = list( set([col for col in data.ID_cols2 if col not in data.NG_cols])) self.logger.info( "Adding demand, generation and TI columns for %d foreign bas" % len(foreign_bas)) for ba in foreign_bas: trade_cols = [ col for col in data.df.columns if "%s.ID.H" % ba in col ] TI = -data.df.loc[:, trade_cols].sum(axis=1) data.df.loc[:, data.KEY["TI"] % ba] = TI exports = TI.apply(lambda x: max(x, 0)) imports = TI.apply(lambda x: min(x, 0)) data.df.loc[:, data.KEY["D"] % ba] = -imports data.df.loc[:, data.KEY["NG"] % ba] = exports if ba in ["BCHA", "HQT", "MHEB"]: # Assume for these Canadian BAs generation is hydro data.df.loc[:, data.KEY["SRC_WAT"] % ba] = exports else: # And all others are OTH (other) data.df.loc[:, data.KEY["SRC_OTH"] % ba] = exports for col in trade_cols: ba2 = re.split(r"\.|-|_", col)[1] data.df.loc[:, data.KEY["ID"] % (ba, ba2)] = -data.df.loc[:, col] # Make sure that trade columns exist both ways for col in data.get_cols(field="ID"): ba = re.split(r"\.|-|_", col)[1] ba2 = re.split(r"\.|-|_", col)[2] othercol = data.KEY["ID"] % (ba2, ba) if othercol not in data.df.columns: self.logger.info("Adding %s" % othercol) data.df.loc[:, othercol] = -data.df.loc[:, col] # Filter unrealistic values using self.reject_dict self._create_reject_dict() cols = (data.get_cols(field="D") + data.get_cols(field="NG") + data.get_cols(field="TI") + data.get_cols(field="ID")) for col in cols: s = data.df.loc[:, col] data.df.loc[:, col] = s.where((s >= self.reject_dict[col][0]) & (s <= self.reject_dict[col][1])) # Do the same for the generation by source columns # If there is no generation by source, add one that is OTH # Edge case for solar: # There are a lot of values at -50 MWh or so during the night. 
We want # to set those to 0, but consider that very negative values (below # -1GW) are rejected for ba in data.regions: missing = True for src in SRC: col = data.KEY["SRC_%s" % src] % ba if col in data.df.columns: missing = False s = data.df.loc[:, col] if src == "SUN": self.reject_dict[col] = (-1e3, 200e3) data.df.loc[:, col] = s.where( (s >= self.reject_dict[col][0]) & (s <= self.reject_dict[col][1])) if src == "SUN": data.df.loc[:, col] = data.df.loc[:, col].apply( lambda x: max(x, 0)) if missing: data.df.loc[:, data.KEY["SRC_OTH"] % ba] = data.df.loc[:, data.KEY["NG"] % ba] # Reinitialize fields self.logger.info("Reinitializing fields") data = BaData(df=data.df) self.r = data self.logger.info("Basic cleaning took %.2f seconds" % (time.time() - start))
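# Minimal illustration of the filtering idiom used above (toy series and
# bounds; in the cleaner the bounds come from self.reject_dict): values
# outside the accepted range become NaN so that later steps can impute them.
import pandas as pd

s = pd.Series([500.0, -60.0, 1e7])
lo, hi = 0.0, 200e3
print(s.where((s >= lo) & (s <= hi)))  # 500.0 kept, -60.0 and 1e7 -> NaN
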
class BaDataPyoCleaner(BaDataCleaner):
    """
    Optimization-based cleaning class.

    Uses pyomo to build the model and Gurobi as the default solver.
    """

    def __init__(self, ba_data, weights=None, solver="gurobi"):
        super().__init__(ba_data)

        import pyomo.environ as pyo
        from pyomo.opt import SolverFactory

        self.m = BaDataPyoCleaningModel().m
        self.opt = SolverFactory(solver)
        self.weights = weights
        if weights is not None:
            self.d.df = pd.concat(
                [self.d.df, weights.rename(lambda x: x + "_W", axis=1)], axis=1
            )

    def process(self, debug=False):
        start = time.time()
        self.logger.info("Running BaDataPyoCleaner for %d rows" % len(self.d.df))
        self.d.df = self.d.df.fillna(0)
        if not debug:
            self.r = self.d.df.apply(self._process, axis=1)
        else:
            r_list = []
            delta_list = []
            for idx, row in self.d.df.iterrows():
                _, r, deltas = self._process(row, debug=True)
                r_list.append(r)
                delta_list.append(deltas)
            self.r = pd.concat(r_list, axis=1).transpose()
            self.deltas = pd.concat(delta_list, axis=1).transpose()
            self.deltas.index = self.d.df.index

        self.r.index = self.d.df.index

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))

    def _process(self, row, debug=False):
        if row.isna().sum() > 0:
            raise ValueError("Cannot call this method on data with NaNs")
        i = self._create_instance(row)
        self.opt.solve(i)

        r = pd.concat(
            [
                pd.Series(
                    {
                        self.d.KEY["NG"] % k: (i.NG[k] + pyo.value(i.delta_NG[k]))
                        for k in i.regions
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["D"] % k: (i.D[k] + pyo.value(i.delta_D[k]))
                        for k in i.regions
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["TI"] % k: (i.TI[k] + pyo.value(i.delta_TI[k]))
                        for k in i.regions
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["ID"] % (k1, k2): (
                            i.ID[k1, k2] + pyo.value(i.delta_ID[k1, k2])
                        )
                        for (k1, k2) in i.regions2
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["SRC_%s" % s] % k: (
                            i.NG_SRC[k, s] + pyo.value(i.delta_NG_SRC[k, s])
                        )
                        for (k, s) in i.regions_srcs
                    }
                ),
            ]
        )

        deltas = pd.concat(
            [
                pd.Series(
                    {self.d.KEY["NG"] % k: pyo.value(i.delta_NG[k]) for k in i.regions}
                ),
                pd.Series(
                    {self.d.KEY["D"] % k: pyo.value(i.delta_D[k]) for k in i.regions}
                ),
                pd.Series(
                    {self.d.KEY["TI"] % k: pyo.value(i.delta_TI[k]) for k in i.regions}
                ),
                pd.Series(
                    {
                        self.d.KEY["ID"] % (k1, k2): pyo.value(i.delta_ID[k1, k2])
                        for (k1, k2) in i.regions2
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["SRC_%s" % s] % k: pyo.value(i.delta_NG_SRC[k, s])
                        for (k, s) in i.regions_srcs
                    }
                ),
            ]
        )

        if not debug:
            return r
        return i, r, deltas

    def _create_instance(self, row):
        def append_W(x):
            return [c + "_W" for c in x]

        NG_SRC_data = self._get_ng_src(row)
        NG_SRC_data_W = self._get_ng_src(row, weights=True)
        opt_data = {
            None: {
                "regions": {None: self.d.regions},
                "srcs": {None: SRC},
                "regions2": {
                    None: list(
                        set(
                            [
                                (
                                    re.split(r"\.|-|_", el)[1],
                                    re.split(r"\.|-|_", el)[2],
                                )
                                for el in self.d.df.columns
                                if "ID" in re.split(r"\.|-|_", el)
                            ]
                        )
                    )
                },
                "regions_srcs": {None: list(NG_SRC_data.keys())},
                "D": self._reduce_cols(
                    row.loc[self.d.get_cols(field="D")].to_dict()
                ),
                "NG": self._reduce_cols(
                    row.loc[self.d.get_cols(field="NG")].to_dict()
                ),
                "TI": self._reduce_cols(
                    row.loc[self.d.get_cols(field="TI")].to_dict()
                ),
                "ID": self._reduce_cols(
                    row.loc[self.d.get_cols(field="ID")].to_dict(), nfields=2
                ),
                "NG_SRC": NG_SRC_data,
            }
        }

        if self.weights is not None:
            opt_data[None]["D_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="D"))].to_dict()
            )
            opt_data[None]["NG_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="NG"))].to_dict()
            )
            opt_data[None]["TI_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="TI"))].to_dict()
            )
            opt_data[None]["ID_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="ID"))].to_dict(), nfields=2
            )
            opt_data[None]["NG_SRC_W"] = NG_SRC_data_W

        instance = self.m.create_instance(opt_data)
        return instance

    def _reduce_cols(self, mydict, nfields=1):
        """
        Helper function to simplify the names in a dictionary
        """
        newdict = {}
        for k in mydict:
            if nfields == 1:
                newk = re.split(r"\.|-|_", k)[1]
            elif nfields == 2:
                newk = (re.split(r"\.|-|_", k)[1], re.split(r"\.|-|_", k)[2])
            else:
                raise ValueError("Unexpected argument")
            newdict[newk] = mydict[k]
        return newdict

    def _get_ng_src(self, r, weights=False):
        """
        Helper function to get the NG_SRC data.
        """
        mydict = {}
        for ba in self.d.regions:
            for src in SRC:
                col = self.d.KEY["SRC_%s" % src] % ba
                if weights:
                    col += "_W"
                if col in self.d.df.columns:
                    mydict[(ba, src)] = r[col]
        return mydict

def process(self, file_name="", folder_hist="", nruns=2): """ Processor function for the cleaner object. Parameters ---------- file_name : str Base name of the file from which to read historical data. Data is read from "%s_basic.csv" % file_name folder_hist : str Folder from which to read historical data nruns : int Number of times to apply the rolling window procedure Notes ----- If we are not processing a large amount of data at a time, we may not have enough data to appropriately estimate the rolling mean and standard deviation for the rolling window procedure. If values are given for `file_name` and `folder_hist`, data will be read from a historical dataset to estimate the rolling mean and standard deviation. If there are very large outliers, they can 'mask' smaller outliers. Running the rolling window procedure a couple of times helps with this issue. """ self.logger.info("Running BaDataRollingCleaner (%d runs)" % nruns) start = time.time() data = self.d # Remember what part we are cleaning idx_cleaning = data.df.index try: # Load the data we already have in memory df_hist = pd.read_csv( os.path.join(folder_hist, "%s_basic.csv" % file_name), index_col=0, parse_dates=True, ) # Only take the last 1,000 rows # Note that if df_hist has less than 1,000 rows, # pandas knows to select df_hist without raising an error. df_hist = df_hist.iloc[-1000:] # Overwrite with the new data old_rows = df_hist.index.difference(data.df.index) df_hist = data.df.append(df_hist.loc[old_rows, :], sort=True) df_hist.sort_index(inplace=True) except FileNotFoundError: self.logger.info("No history file") df_hist = data.df # Apply rolling horizon threshold procedure # 20200206 update: don't try replacing NaNs anymore, leave that to the # next step for _ in range(nruns): df_hist = rolling_window_filter(df_hist, replace_nan_with_mean=False) # Deal with NaNs # First deal with NaNs by taking the average of the previous day and # next day. In general we observe strong daily patterns so this seems # to work well. Limit the filling procedure to one day at a time. If # there are multiple missing days, this makes for a smoother transition # between the two valid days. If we had to do this more than 4 times, # give up and use forward and backward fills without limits for col in df_hist.columns: npasses = 0 while (df_hist.loc[:, col].isna().sum() > 0) and (npasses < 4): npasses += 1 df_hist.loc[:, col] = pd.concat( [ df_hist.loc[:, col].groupby( df_hist.index.hour).ffill(limit=1), df_hist.loc[:, col].groupby( df_hist.index.hour).bfill(limit=1), ], axis=1, ).mean(axis=1) if npasses == 4: self.logger.debug("A lot of bad data for %s" % col) df_hist.loc[:, col] = pd.concat( [ df_hist.loc[:, col].groupby( df_hist.index.hour).ffill(), df_hist.loc[:, col].groupby( df_hist.index.hour).bfill(), ], axis=1, ).mean(axis=1) # All bad data columns if df_hist.loc[:, col].isna().sum() == len(df_hist): df_hist.loc[:, col] = 0.0 # Some NaNs will still remain - try using the rolling mean average df_hist, mean_ = rolling_window_filter(df_hist, replace_nan_with_mean=True, return_mean=True) if df_hist.isna().sum().sum() > 0: self.logger.warning("There are still some NaNs. Unexpected") # Just keep the indices we are working on currently data = BaData(df=df_hist.loc[idx_cleaning, :]) self.r = data self.weights = mean_.loc[idx_cleaning, :].applymap( lambda x: A / max(GAMMA, abs(x))) self.logger.info("Rolling window cleaning took %.2f seconds" % (time.time() - start))
def run_test(i="", level=0.2, debug=False): # Load raw data and restrict to a 2 day test period file_name_raw = join(gridemissions.config["APEN_PATH"], "data", "EBA_raw.csv") data_raw = BaData(fileNm=file_name_raw) start = pd.to_datetime("2020-11-01T00:00Z") end = pd.to_datetime("2020-11-03T00:00Z") data_raw.df = data_raw.df.loc[start:end] # Create a copy of the test dataset and modify it data_raw_copy = BaData(df=data_raw.df.copy(deep=True)) data_raw_copy.df.loc[ :, data_raw_copy.get_cols("CISO", "D")[0] ] *= np.random.uniform(1 - level, 1 + level, len(data_raw_copy.df)) # Set up test folder and save data to the folder tmp_folder = join(gridemissions.config["APEN_PATH"], "si_test4", f"{i}", "tmp") os.makedirs(tmp_folder, exist_ok=True) data_raw_copy.df.to_csv(join(tmp_folder, "EBA_raw.csv")) # Load historical data and restrict to 15 days before when we are testing folder_hist = join(gridemissions.config["APEN_PATH"], f"si_test4", "hist") if ~isdir(folder_hist): file_name_basic = join( gridemissions.config["APEN_PATH"], "data", "EBA_basic.csv" ) data_basic = BaData(fileNm=file_name_basic) end_hist = start start_hist = end_hist - pd.Timedelta("15D") data_basic.df = data_basic.df.loc[start_hist:end_hist] os.makedirs(folder_hist, exist_ok=True) data_basic.df.to_csv(join(folder_hist, "EBA_basic.csv")) # Run workflow on fake dataset make_dataset( tmp_folder=tmp_folder, folder_hist=folder_hist, scrape=False, ) # Reload results file_name = join(tmp_folder, "EBA_%s.csv") raw = BaData(fileNm=file_name % "raw") opt = BaData(fileNm=file_name % "opt") # Compute error d_col = raw.get_cols("CISO", "D")[0] error = ( (data_raw.df.loc[start:end, d_col] - opt.df.loc[:, d_col]).abs() / data_raw.df.loc[start:end, d_col] ).mean() if debug: basic = BaData(fileNm=file_name % "basic") rolling = BaData(fileNm=file_name % "rolling") return error, raw, basic, rolling, opt, data_raw return error
def make_dataset(
    start,
    end,
    file_name="EBA",
    tmp_folder=None,
    folder_hist=None,
    scrape=True,
):
    """
    Make dataset between two dates.

    Pull fresh data from the EIA API between `start` and `end`, then run the
    data through the cleaning workflow before computing consumption
    emissions. Uses historical data if available.
    """
    start_time = time.time()

    if tmp_folder is None:
        tmp_folder = config["TMP_PATH"]
    tmp_folder.mkdir(exist_ok=True)

    file_name_raw = tmp_folder / f"{file_name}_raw.csv"
    file_name_basic = tmp_folder / f"{file_name}_basic.csv"

    eia_columns = load_eia_columns()

    if scrape:  # else: assume that the file exists
        # Scrape EIA data
        logger.info("Scraping EIA data from %s to %s" % (start, end))
        scraper = EBA_data_scraper()
        df = scraper.scrape(eia_columns, start=start, end=end, split_calls=True)
        df.to_csv(file_name_raw)

    # Basic data cleaning
    logger.info("Basic data cleaning")
    data = BaData(fileNm=file_name_raw)
    if len(data.df) == 0:
        raise ValueError(f"Aborting make_dataset: no new data in {file_name_raw}")
    cleaner = BaDataBasicCleaner(data)
    cleaner.process()
    cleaner.r.df.to_csv(file_name_basic)
    data = cleaner.r

    weights = None
    if folder_hist is not None:
        # Rolling-window-based data cleaning
        logger.info("Rolling window data cleaning")
        data = BaData(fileNm=file_name_basic)
        cleaner = BaDataRollingCleaner(data)
        cleaner.process(file_name, folder_hist)
        cleaner.r.df.to_csv(join(tmp_folder, "%s_rolling.csv" % file_name))
        data = cleaner.r
        weights = cleaner.weights
        weights.to_csv(join(tmp_folder, "%s_weights.csv" % file_name))
    else:
        logger.warning("No rolling window data cleaning!")

    if len(data.df.loc[:THRESH_DATE, :]) > 0:
        logger.info(
            f"Optimization-based cleaning without src data: pre {THRESH_DATE}"
        )
        ba_data = BaData(df=data.df.loc[:THRESH_DATE, :])
        if weights is not None:
            cleaner = BaDataCvxCleaner(ba_data, weights=weights.loc[:THRESH_DATE, :])
        else:
            cleaner = BaDataCvxCleaner(ba_data)
        cleaner.process(debug=False, with_ng_src=False)
        cleaner.r.df.to_csv(join(tmp_folder, "%s_opt_no_src.csv" % file_name))
        cleaner.CleaningObjective.to_csv(
            join(tmp_folder, "%s_objective_no_src.csv" % file_name)
        )

    # Only keep going if we have data post THRESH_DATE
    if len(data.df.loc[THRESH_DATE:, :]) == 0:
        return

    logger.info(f"Optimization-based cleaning with src data: post {THRESH_DATE}")
    data.df = data.df.loc[THRESH_DATE:, :]
    if weights is not None:
        cleaner = BaDataCvxCleaner(data, weights=weights.loc[THRESH_DATE:, :])
    else:
        cleaner = BaDataCvxCleaner(data)
    cleaner.process(debug=False)
    cleaner.r.df.to_csv(join(tmp_folder, "%s_opt.csv" % file_name))
    cleaner.CleaningObjective.to_csv(
        join(tmp_folder, "%s_objective.csv" % file_name)
    )

    # Post-processing (none for now)
    cleaner.r.df.to_csv(join(tmp_folder, "%s_elec.csv" % file_name))
    data = cleaner.r

    # Consumption-based emissions
    logger.info("Computing consumption-based emissions")
    co2_calc = BaDataEmissionsCalc(data)
    co2_calc.process()
    co2_calc.poll_data.df.to_csv(join(tmp_folder, "%s_co2.csv" % file_name))

    logger.info(
        "gridemissions.workflows.make_dataset took %.2f seconds"
        % (time.time() - start_time)
    )

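# A hedged usage sketch for make_dataset above (illustrative dates, default
# paths from config; guarded so it only runs when executed directly).
if __name__ == "__main__":
    make_dataset(
        start="2021-01-01T00:00Z",
        end="2021-01-03T00:00Z",
        folder_hist=None,  # skip the rolling-window step (logs a warning)
        scrape=True,  # pull fresh data from the EIA API
    )
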
def main():
    # Setup plotting
    register_matplotlib_converters()
    plt.style.use("seaborn-paper")
    plt.rcParams["figure.figsize"] = [6.99, 2.5]
    plt.rcParams["grid.color"] = "k"
    plt.rcParams["axes.grid"] = True
    plt.rcParams["grid.linestyle"] = ":"
    plt.rcParams["grid.linewidth"] = 0.5
    plt.rcParams["figure.dpi"] = 200
    plt.rcParams["font.size"] = 10
    cmap = cmocean.cm.cmap_d["phase"]
    colors = sns.color_palette("colorblind")

    # Parse args
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--report", default="1", help="Which report to make")
    argparser.add_argument(
        "--year", default="2021", help='Which year, for report "heatmap"'
    )
    args = argparser.parse_args()

    # Configure logging
    logger = logging.getLogger("gridemissions")

    FIG_PATH = gridemissions.config["FIG_PATH"]

    # Load data
    file_name = join(
        gridemissions.config["DATA_PATH"], "analysis", "webapp", "EBA_%s.csv"
    )
    co2 = BaData(fileNm=file_name % "co2", variable="CO2")
    elec = BaData(fileNm=file_name % "elec", variable="E")

    # Do work
    if args.report == "1":
        logger.info("Creating full hourly report")
        fig_folder = join(FIG_PATH, "hourly_full")
        for ba in elec.regions:
            annual_plot_hourly(elec, co2, ba, save=True, fig_folder=fig_folder)
    elif args.report == "2":
        logger.info("Creating full weekly report")
        fig_folder = join(FIG_PATH, "weekly_full")
        for ba in elec.regions:
            annual_plot_weekly(elec, co2, ba, save=True, fig_folder=fig_folder)
    elif args.report == "3":
        logger.info("Creating hourly report for last 2 weeks")
        fig_folder = join(FIG_PATH, "hourly_2weeks")
        now = datetime.utcnow()
        start = now - timedelta(hours=14 * 30)
        end = now
        small_elec = BaData(df=elec.df.loc[start:end])
        small_co2 = BaData(df=co2.df.loc[start:end], variable="CO2")
        for ba in elec.regions:
            annual_plot_hourly(
                small_elec, small_co2, ba, save=True, fig_folder=fig_folder
            )
    elif args.report == "heatmap":
        logger.info(f"Running report heatmap for year {args.year}")
        fig_folder = pathlib.Path(FIG_PATH) / "heatmap_report"
        heatmap_report(co2, elec, year=args.year, fig_folder=fig_folder)
        _generate_contents_heatmap(fig_folder)
    elif args.report == "timeseries":
        logger.info("Running report timeseries")
        fig_folder = pathlib.Path(FIG_PATH) / "timeseries_report"
        timeseries_report(co2, elec, fig_folder=fig_folder)
        _generate_contents_timeseries(fig_folder)
    else:
        logger.error("Unknown report option! %s" % args.report)