def main():
    """Retrieve the 'TOTAL.SOT5PUS.A' series and store it in the global ``df``."""
    global df
    # NOTE(review): the API key is committed in source; consider loading it
    # from configuration instead.
    client = eia.API("3c109e8bc5897c4015d86e77e699ebc6")
    # Desired series ID is passed straight to the retrieval helper.
    df = retrieve_time_series(client, 'TOTAL.SOT5PUS.A')
def __call_api(report, key="", series=None):
    """Download one EIA series and return it as a DataFrame.

    Args:
        report: Key looked up in ``commonData.dictEiaReports`` to obtain the
            series ID. Only consulted when ``series`` is None.
        key: EIA API key handed to ``eia.API``.
        series: Explicit series ID. When given, ``report`` is ignored.

    Returns:
        pandas.DataFrame built from the ``data_by_series`` response.
    """
    api = eia.API(key)
    # Identity test instead of `== None`: equality can be overridden by the
    # compared object, identity cannot.
    if series is None:
        series = commonData.dictEiaReports.get(report)
    return pd.DataFrame(api.data_by_series(series=series))
def main():
    """Retrieve two EIA series and hand both to ``plot_time_series``."""
    client = eia.API("YOUR_API_KEY_HERE")
    # Total US inventories
    # Previously used for the first series:
    #   'http://api.eia.gov/series/?api_key=YOUR_API_KEY_HER&series_id=PET.WCESTUS1.W'
    # NOTE(review): these are full URLs, not bare series IDs - confirm that
    # retrieve_time_series expects this form.
    series_ids = (
        'http://api.eia.gov/series/?YOUR_API_KEY_HER&series_id=PET.W_EPC0_SAX_YCUOK_MBBL.W',
        'http://api.eia.gov/series/?api_key=YOUR_API_KEY_HER&series_id=PET.RWTC.D',
    )
    frames = [retrieve_time_series(client, series_id) for series_id in series_ids]
    plot_time_series(frames[0], frames[1])
def main():
    """
    Entry point: download one EIA series and print it
    """
    # The EIA client is created from your own API key
    client = eia.API("YOUR API KEY HERE")
    # Download the requested series ID
    df = retrieve_time_series(client, 'EMISS.CO2-TOTV-TT-NG-TX.A')
    # Show the returned dataframe
    print(df)
def main():
    """Download the 'EBA.PSEI-ALL.D.HL' series and rebuild its date column.

    The retrieved index is moved into a 'Date' column, the second column is
    renamed to 'Electricity Demand', and 'Date' is re-assembled from a day
    part plus an 'Hour' offset sliced out of the original date strings.
    """
    client = eia.API("5fbe8e00551266c048f84d7d28961828")
    df = retrieve_time_series(client, 'EBA.PSEI-ALL.D.HL')

    # Cleaning the data
    df.reset_index(level=0, inplace=True)
    value_column = df.columns[1]
    column_names = {'index': 'Date', value_column: 'Electricity Demand'}
    df.rename(columns=column_names, inplace=True)

    # Characters 10-11 of the raw string are kept as the hour; the trailing
    # nine characters are dropped before the day is parsed.
    raw_dates = df['Date']
    df['Hour'] = raw_dates.str[10:12]
    df['Date'] = pd.to_datetime(raw_dates.str[:-9], format='%Y %m%d')
    hour_offset = df['Hour'].astype('timedelta64[h]')
    df['Date'] = pd.to_datetime(df['Date']) + hour_offset
# Load the balancing-authority code table. `sheet_name` is the supported
# keyword; the older `sheetname` spelling was removed from pandas.
df_BAA = pd.read_excel('data/BA_Codes_930.xlsx', sheet_name='Table 1')
# The first three rows are dropped before the columns are given usable names.
df_BAA.drop(df_BAA.index[:3], inplace=True)
df_BAA.rename(columns={
    'HOURLY AND DAILY BALANCING AUTHORITY': 'BAA_Acronym',
    'Unnamed: 1': 'BAA_Name',
    'Unnamed: 2': 'NRC_ID',
    'Unnamed: 3': 'Region',
},
              inplace=True)
# `pd.np` no longer exists in pandas; Series.to_numpy(copy=True) produces the
# same independent array that `pd.np.array(...)` used to return.
BAA = df_BAA['BAA_Acronym'].to_numpy(copy=True)

#%%
# Use the EIA python call to pull an example dataset from the EIA API.
# Use this example dataset to figure out how to format dates.
api_key = "d365fe67a9ec71960d69102951ae474f"
api = eia.API(api_key)
series_search = api.data_by_series(series='EBA.PJM-ALL.TI.H')
df = pd.DataFrame(series_search)
df.index
# Convert dataframe index of date strings to a list of date strings for processing
date_list = df.index.tolist()
# Remove the last three characters from the date string, e.g. "2015 0701T05Z 01".
# datetime won't process two instances of the day of the month, so the
# trailing ' 01' has to be removed.
dates_trimmed = [x[:-3] for x in date_list]
# Use datetime.strptime to parse the string into a datetime object
dates_formatted = [
    datetime.strptime(date, '%Y %m%dT%HZ') for date in dates_trimmed
]
def main():
    """Fetch a single EIA series and pass it to ``plot_time_series``."""
    client = eia.API("YOUR_API_KEY_HERE")
    df = retrieve_time_series(
        client,
        'http://api.eia.gov/series/?api_key=6569647505cfb6f73e1aa9363045abc3&series_id=PET.WCESTUS1.W',
    )
    plot_time_series(df)
import pandas as pd
import eia
import utils
import api_keys

if __name__ == "__main__":
    # (C)rude (O)il (E)xport - (M)onthly
    time_series_label = "TOTAL.COEXPUS.M"

    # Download the series and wrap the response in a DataFrame.
    eia_api = eia.API(api_keys.eia)
    raw_series = eia_api.data_by_series(series=time_series_label)
    eia_data = pd.DataFrame(raw_series)

    # Clean the frame via the project helper, then plot it.
    utils.clean_EIA_series(eia_data, column_label=time_series_label)
    plot_options = {
        "x_label": "Date",
        "x_unit": "Year",
        "y_label": "U.S Exports of Crude Oil (Monthly)",
        "y_unit": "Thousand Barrels Per Day",
        "column_name": time_series_label,
    }
    utils.plot_time_series(eia_data, **plot_options)
def getEIAData(series_ID):
    """Return whatever ``retrieve_time_series`` yields for ``series_ID``."""
    client = eia.API("776d3a3fe9bf6dfbd47b9141d0059f79")
    return retrieve_time_series(client, series_ID)
#pip install EIA-python
#pip install networkx
import numpy as np
import pandas as pd
import eia
import networkx as nx
import matplotlib.pyplot as plt

# Get an API key from the EIA website and pass it into eia.API()
apiKey = "5f54b3e66477e22ec068066b1de8026d"
api = eia.API(apiKey)

series_id_list = [
    "INTL.57-1-DZA-TBPD.M",
    "INTL.57-1-AGO-TBPD.M",
    "INTL.57-1-COG-TBPD.M",
    "INTL.57-1-COD-TBPD.M",
    "INTL.57-1-ECU-TBPD.M",
    "INTL.57-1-GNQ-TBPD.M",
    "INTL.57-1-GAB-TBPD.M",
    "INTL.57-1-IRN-TBPD.M",
    "INTL.57-1-IRQ-TBPD.M",
    "INTL.57-1-KWT-TBPD.M",
    "INTL.57-1-LBY-TBPD.M",
    "INTL.57-1-NGA-TBPD.M",
    "INTL.57-1-QAT-TBPD.M",
    "INTL.57-1-RUS-TBPD.M",
    "INTL.57-1-SAU-TBPD.M",
    "INTL.57-1-ARE-TBPD.M",
    "INTL.57-1-VEN-TBPD.M",
    "INTL.57-1-USA-TBPD.M",
]

# One DataFrame per series, each built from api.data_by_series()
df_list = list(
    map(lambda series_id: pd.DataFrame(api.data_by_series(series_id)),
        series_id_list))

# Place the series side by side, one column each
oil_data = pd.concat(df_list, axis=1)

# "--" entries become NaN so that incomplete rows can be dropped
oil_data = oil_data.replace("--", np.nan)
oil_data_reduced = oil_data.dropna()
oil_data_reduced
def fetch_eia(api_key, plant_id, file_path):
    """
    Read in EIA data of wind farm of interest
    - from EIA API for monthly productions, return monthly net energy generation time series
    - from local Excel files for wind farm metadata, return dictionary of metadata

    Args:
        api_key(:obj:`string`): 32-character user-specific API key, obtained from EIA
        plant_id(:obj:`string`): 5-character EIA power plant code
        file_path(:obj:`string`): directory with EIA metadata .xlsx files in 2017;
            it is concatenated directly with the file names, so it must end
            with a path separator

    Returns:
        :obj:`pandas.DataFrame`: monthly net energy generation in MWh, in a single
            column named ``eia_monthly_mwh`` with a DatetimeIndex
        :obj:`dictionary`: metadata of the wind farm with 'plant_id'

    Raises:
        ValueError: if ``plant_id`` cannot be converted to an integer
        Exception: if ``plant_id`` is not present in a metadata workbook
    """
    # The `np.int` alias was removed from NumPy; the builtin int performs the
    # same conversion. Converting once up front also fails fast on a bad ID.
    plant_code = int(plant_id)

    # EIA metadata
    plant_var_list = [
        "City",
        "Latitude",
        "Longitude",
        "Balancing Authority Name",
        "Transmission or Distribution System Owner",
    ]
    wind_var_list = [
        "Utility Name",
        "Plant Name",
        "State",
        "County",
        "Nameplate Capacity (MW)",
        "Operating Month",
        "Operating Year",
        "Number of Turbines",
        "Predominant Turbine Manufacturer",
        "Predominant Turbine Model Number",
        "Turbine Hub Height (Feet)",
    ]

    def meta_dic_fn(metafile, sheet, var_list):
        """Return the ``var_list`` columns of this plant's row as a dictionary."""
        all_plant = pd.read_excel(file_path + metafile,
                                  sheet_name=sheet,
                                  skiprows=1)
        # specific wind farm
        eia_plant = all_plant.loc[all_plant["Plant Code"] == plant_code]
        if eia_plant.shape[0] == 0:  # Couldn't locate EIA ID in database
            raise Exception("Plant ID not found in EIA database")
        eia_info = eia_plant[var_list]  # select column
        eia_info = eia_info.reset_index(drop=True)  # reset index to 0
        eia_dic = eia_info.T.to_dict()  # convert to dictionary
        # remove extra level of dictionary, "0" in this case
        return eia_dic[0]

    # file path with 2017 EIA metadata files
    plant_dic = meta_dic_fn("2___Plant_Y2017.xlsx", "Plant", plant_var_list)
    wind_dic = meta_dic_fn("3_2_Wind_Y2017.xlsx", "Operable", wind_var_list)

    # convert feet to meter
    hubheight_meter = np.round(
        unit_conversion.convert_feet_to_meter(
            wind_dic["Turbine Hub Height (Feet)"]))
    wind_dic.update({"Turbine Hub Height (m)": hubheight_meter})
    wind_dic.pop("Turbine Hub Height (Feet)", None)  # delete hub height in feet

    out_dic = plant_dic.copy()
    out_dic.update(wind_dic)  # append dictionary

    # EIA monthly energy production data
    api = eia.API(api_key)  # get data from EIA
    series_search_m = api.data_by_series(series="ELEC.PLANT.GEN.%s-ALL-ALL.M" %
                                         plant_id)
    # net monthly energy generation of wind farm in MWh
    eia_monthly = pd.DataFrame(series_search_m)
    eia_monthly.columns = ["eia_monthly_mwh"]  # rename column
    # convert to DatetimeIndex
    eia_monthly = eia_monthly.set_index(pd.DatetimeIndex(eia_monthly.index))

    return eia_monthly, out_dic
def _load_dated_series(api, series_id, value_column):
    """Retrieve one EIA series as a frame with a 'Date' column, a DatetimeIndex and a renamed value column."""
    frame = retrieve_time_series(api, series_id)
    frame.reset_index(level=0, inplace=True)
    #Rename the columns for easier analysis
    frame.rename(columns={
        'index': 'Date',
        frame.columns[1]: value_column
    },
                 inplace=True)
    #Convert the Date column into a date object
    frame['Date'] = pd.to_datetime(frame['Date'])
    #Set Date as a Pandas DatetimeIndex
    frame.index = pd.DatetimeIndex(frame['Date'])
    return frame


def main():
    """
    Run main script

    Pulls the 'ELEC.PRICE.TX-ALL.M' and 'NG.N3035TX3.M' series, decomposes and
    plots them, log-transforms and differences both, fits a VAR model on the
    first 95% of the observations, forecasts the remaining 5%, converts the
    electricity forecast back to price levels and reports accuracy metrics.
    """
    #Create EIA API using your specific API key
    api_key = "YOR API KEY HERE"
    api = eia.API(api_key)

    #Pull the electricity price data and decompose the time series into parts
    electricity_df = _load_dated_series(api, 'ELEC.PRICE.TX-ALL.M',
                                        'Electricity_Price')
    decompose_time_series(electricity_df['Electricity_Price'])

    #Pull in natural gas time series data and decompose it as well
    nat_gas_df = _load_dated_series(api, 'NG.N3035TX3.M', 'Nat_Gas_Price_MCF')
    decompose_time_series(nat_gas_df['Nat_Gas_Price_MCF'])

    #Merge the two time series together based on Date Index
    master_df = pd.merge(electricity_df['Electricity_Price'],
                         nat_gas_df['Nat_Gas_Price_MCF'],
                         left_index=True,
                         right_index=True)
    master_df.reset_index(level=0, inplace=True)

    #Plot the two variables in the same plot
    plt.plot(master_df['Date'],
             master_df['Electricity_Price'],
             label="Electricity_Price")
    plt.plot(master_df['Date'],
             master_df['Nat_Gas_Price_MCF'],
             label="Nat_Gas_Price")
    # Place a legend to the right of this smaller subplot.
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.title('Natural Gas Price vs. TX Electricity Price over Time')
    plt.show()

    #Transform the columns using natural log
    master_df['Electricity_Price_Transformed'] = np.log(
        master_df['Electricity_Price'])
    master_df['Nat_Gas_Price_MCF_Transformed'] = np.log(
        master_df['Nat_Gas_Price_MCF'])

    #In order to make the time series stationary, difference the data by 1 month
    n = 1
    master_df['Electricity_Price_Transformed_Differenced'] = master_df[
        'Electricity_Price_Transformed'] - master_df[
            'Electricity_Price_Transformed'].shift(n)
    master_df['Nat_Gas_Price_MCF_Transformed_Differenced'] = master_df[
        'Nat_Gas_Price_MCF_Transformed'] - master_df[
            'Nat_Gas_Price_MCF_Transformed'].shift(n)

    #Run each differenced time series thru the Augmented Dickey Fuller test
    print('Augmented Dickey-Fuller Test: Electricity Price Time Series')
    augmented_dickey_fuller_statistics(
        master_df['Electricity_Price_Transformed_Differenced'].dropna())
    print('Augmented Dickey-Fuller Test: Natural Gas Price Time Series')
    augmented_dickey_fuller_statistics(
        master_df['Nat_Gas_Price_MCF_Transformed_Differenced'].dropna())

    #Convert the dataframe to a numpy array
    master_array = np.array(master_df[[
        'Electricity_Price_Transformed_Differenced',
        'Nat_Gas_Price_MCF_Transformed_Differenced'
    ]].dropna())

    #Generate a training and test set for building the model: 95/5 split
    split_at = int(0.95 * (len(master_array)))
    training_set = master_array[:split_at]
    test_set = master_array[split_at:]

    #Fit to a VAR model
    model = VAR(endog=training_set)
    model_fit = model.fit()
    #Print a summary of the model results (the summary object was previously
    #built and thrown away without being shown)
    print(model_fit.summary())

    #Compare the forecasted results to the real data. The forecast is seeded
    #with the training observations directly; this is the same data the
    #deprecated `model_fit.y` alias exposed.
    prediction = model_fit.forecast(training_set, steps=len(test_set))

    #Merge the array data back into the master dataframe, and un-difference and back-transform
    data_with_predictions = pd.DataFrame(np.vstack(
        (training_set, prediction))).rename(
            columns={
                0: 'Electricity_Price_Transformed_Differenced_PostProcess',
                1: 'Nat_Gas_Price_MCF_Transformed_Differenced_PostProcess'
            })

    #Define which data is predicted and which isn't in the 'Predicted' column
    data_with_predictions.loc[:, 'Predicted'] = 1
    data_with_predictions.loc[(data_with_predictions.index >= 0) &
                              (data_with_predictions.index <=
                               (len(training_set) - 1)), 'Predicted'] = 0

    #Add a row of NaN at the beginning of the df: differencing consumed the
    #first observation, so this re-aligns the rows with master_df
    data_with_predictions.loc[-1] = [None, None, None]  # adding a row
    data_with_predictions.index = data_with_predictions.index + 1  # shifting index
    data_with_predictions.sort_index(inplace=True)

    #Add back into the original dataframe
    master_df.loc[:,
                  'Electricity_Price_Transformed_Differenced_PostProcess'] = data_with_predictions[
                      'Electricity_Price_Transformed_Differenced_PostProcess']
    master_df.loc[:, 'Predicted'] = data_with_predictions['Predicted']

    #Un-difference the data. The range runs through the final row; stopping at
    #len - 1 left the last forecast row holding the actual value, so it was
    #reported as a perfect prediction.
    for i in range(1, len(master_df.index)):
        master_df.at[i, 'Electricity_Price_Transformed'] = master_df.at[
            i - 1, 'Electricity_Price_Transformed'] + master_df.at[
                i, 'Electricity_Price_Transformed_Differenced_PostProcess']

    #Back-transform the data
    master_df.loc[:, 'Predicted_Electricity_Price'] = np.exp(
        master_df['Electricity_Price_Transformed'])

    #Compare the forecasted data to the real data
    forecast_rows = master_df[master_df['Predicted'] == 1]
    print(forecast_rows[[
        'Date', 'Electricity_Price', 'Predicted_Electricity_Price'
    ]])

    #Evaluate the accuracy of the results on the un-differenced,
    #back-transformed prices
    calculate_model_accuracy_metrics(
        list(forecast_rows['Electricity_Price']),
        list(forecast_rows['Predicted_Electricity_Price']))
class UpdateParams:
    """
    The intention of this class is to obtain the latest LCOH parameters
    that are from online databases.

    Expects inputs of NG, PETRO or, COAL
    NG -> industrial price ($ per thousand cubic feet)
    COAL -> other industrial use price ($ per short ton)
    Petroleum -> residual fuel oil prices by area -> wholesale/resale price
    by all sellers annual ($ per gallon)

    CHECK UNITS BETWEEN SERIES _ HENRY HUB , etc.

    All functions are static and are called on the class itself, e.g.
    ``UpdateParams.get_fuel_price("TX")``. Defining the class already creates
    the EIA client and reads ``EERC_Fuel_Esc.csv``.
    """

    path = "./calculation_data"
    today = datetime.datetime.now()
    eia_api_key = "68ea6b4094e685e32ec986a8053568d9"
    api = eia.API(eia_api_key)
    eerc_esc = pd.read_csv(os.path.join(path, "EERC_Fuel_Esc.csv"),
                           index_col=["State"])

    # Upper bound on how far back the price lookups walk. The previous
    # open-ended loops never terminated when no year matched.
    _MAX_LOOKBACK_YEARS = 200

    @staticmethod
    def _latest_value(yearly_values, year):
        """Return ``(value, years_back)`` for the newest usable entry at or before ``year``.

        Keys are looked up as ``str(year) + " "``. Entries that are missing
        (KeyError) or cannot be divided by 1.0, e.g. None (TypeError), are
        skipped.

        Raises:
            LookupError: if nothing usable exists within
                ``_MAX_LOOKBACK_YEARS`` years before ``year``.
        """
        for years_back in range(UpdateParams._MAX_LOOKBACK_YEARS + 1):
            try:
                value = yearly_values[str(year - years_back) + " "] / 1.0
            except (KeyError, TypeError):
                continue
            return value, years_back
        raise LookupError(
            "No usable value found at or before year {}".format(year))

    @staticmethod
    def get_max_fp(state_abbr, fuel_type="NG", year=False):
        """Obtains max state-level fuel price

        Args:
            state_abbr: state abbreviation inserted into the EIA series ID.
            fuel_type: "NG", "COAL" or "PETRO" (case-insensitive).
            year: latest year to consider; defaults to the current year.

        Returns:
            The most recent value at or before ``year``, or 0.0 when the
            series is unavailable (KeyError) or contains only None values.

        Raises:
            AssertionError: if ``fuel_type`` is not recognized.
            LookupError: if the series has no usable value before ``year``.
        """
        if not year:
            year = UpdateParams.today.year

        fuel = fuel_type.upper()
        if fuel == "NG":
            series_ID = "NG.N3035" + state_abbr + "3.A"
        elif fuel == "COAL":
            series_ID = "COAL.COST." + state_abbr + "-10.A"
        elif fuel == "PETRO":
            series_ID = "PET.EMA_EPPR_PWA_S" + state_abbr + "_DPG.A"
        else:
            raise AssertionError("Please input a valid fuel_type")

        # Check if state-level data is available, if not return 0.0
        try:
            fuel_series = UpdateParams.api.data_by_series(series=series_ID)
            dict_key = list(fuel_series.keys())[0]
            # an entirely empty state series also yields 0.0
            if all(v is None for v in list(fuel_series[dict_key].values())):
                return 0.0
        except KeyError:
            return 0.0

        value, _ = UpdateParams._latest_value(fuel_series[dict_key], year)
        return value

    @staticmethod
    def get_fuel_price(state_abbr, fuel_type="NG", year=False):
        """Obtain fuel avgs on the annual, state scale from the EIA database.

        Args:
            state_abbr: state abbreviation inserted into the EIA series ID.
            fuel_type: "NG", "COAL" or "PETRO" (case-insensitive).
            year: latest year to consider; defaults to the current year.

        Returns:
            Tuple ``(price, year_of_price)``. The national price is returned
            when the state series is unavailable or empty.

        Raises:
            AssertionError: if ``fuel_type`` is not recognized.
            LookupError: if a series has no usable value before ``year``.
        """
        if not year:
            year = UpdateParams.today.year

        fuel = fuel_type.upper()
        if fuel == "NG":
            series_ID = "NG.N3035" + state_abbr + "3.A"
            series_USA = "NG.RNGWHHD.A"
            # The Louisiana series is used further down to scale the state price.
            series_LA = UpdateParams.api.data_by_series(series="NG.N3035" +
                                                        "LA" + "3.A")
            dict_key_LA = list(series_LA.keys())[0]
        elif fuel == "COAL":
            series_ID = "COAL.COST." + state_abbr + "-10.A"
            series_USA = "COAL.COST.US-10.A"
        elif fuel == "PETRO":
            # state level wholesale/resale price data ends 2011
            series_ID = "PET.EMA_EPPR_PWA_S" + state_abbr + "_DPG.A"
            series_USA = "PET.EMA_EPPR_PWG_NUS_DPG.A"
        else:
            raise AssertionError("Please input a valid fuel_type")

        fuel_series_USA = UpdateParams.api.data_by_series(series=series_USA)
        dict_key_USA = list(fuel_series_USA.keys())[0]

        # find latest USA value; `i` is how many years back it was found
        fp_USA, i = UpdateParams._latest_value(fuel_series_USA[dict_key_USA],
                                               year)

        # Check if state-level available, if not return USA price
        try:
            fuel_series = UpdateParams.api.data_by_series(series=series_ID)
            dict_key = list(fuel_series.keys())[0]
            # if fuel price in state is empty return national price
            if all(v is None for v in list(fuel_series[dict_key].values())):
                return (fp_USA, year - i)
        except KeyError:
            return (fp_USA, year - i)

        # find latest year for state; `j` is how many years back it was found
        fp_state, j = UpdateParams._latest_value(fuel_series[dict_key], year)

        if fuel == "NG":
            # series_LA is just the actual series not a series ID
            fp_mult = fp_state / series_LA[dict_key_LA][str(year - j) + " "]
            return (fp_mult * fp_USA / 1.037, year - j)

        # return USA value if 2 years more recent vs state
        if ((year - i) - (year - j) >= 2) or (fp_state >= fp_USA):
            return (fp_USA / 1.037, year - i)

        return (fp_state, year - j)

    @staticmethod
    def get_esc(state_abbr, fuel_type="NG"):
        """Grabs fuel esc from EERC

        ``fuel_type`` is upper-cased before the lookup, as in the price
        functions.
        """
        temp_dict = {"NG": "Natural Gas", "COAL": "Coal", "PETRO": "Residual"}
        return UpdateParams.eerc_esc.loc[state_abbr,
                                         temp_dict[fuel_type.upper()]]

    @staticmethod
    def create_index():
        """
        Build the combined cost index table and write it to
        ``cost_index_data.csv`` inside ``UpdateParams.path``.

        https://fred.stlouisfed.org/series/WPU061 - producer price index csv
        https://www.chemengonline.com/pci - chemical eng cost index - by year
        """
        path = UpdateParams.path

        def remove_nonnumeric(string):
            """Strip everything except digits and dots, then convert to float."""
            dummy_var = float(re.sub(r'[^\d.]', '', string))
            return dummy_var

        def get_CE_index():
            """Parse ``cost_index.txt`` into a DataFrame with one column per year."""
            cost_dict = {}
            index_list = [
                "CE INDEX", "Equipment", "Heat Exchangers and Tanks",
                "Process Machinery", "Pipe, valves and fittings",
                "Process Instruments", "Pumps and Compressors",
                "Electrical equipment", "Structural supports",
                "Construction Labor", "Buildings", "Engineering Supervision"
            ]

            # grab raw txt; the context manager closes the file even if the
            # read fails
            with open(os.path.join(path, "cost_index.txt"), "r") as file:
                text = file.read()

            # NOTE(review): the parsing below depends on the exact layout of
            # cost_index.txt, which is not visible here - logic kept as-is.
            # modify initial year here
            data = text.split("1978")
            # Remove the initial few words
            data.pop(0)
            cost_dict['1978'] = data[0:12]
            del data[0:12]
            data = data[0]

            # Go through text and grab data points as a function of year
            for i in range(1979, 2019):
                data = data.split(str(i))
                data.pop(0)
                cost_dict[str(i)] = data[0:12]
                del data[0:12]
                data = data[0]

            df = pd.DataFrame(cost_dict, index=index_list)
            return df.applymap(remove_nonnumeric)

        ce_index = get_CE_index()

        def get_ppi_inds():
            """https://www.bls.gov/developers/api_signature_v2.htm"""
            # noyears is the maximum number of years you can pull from api in 1 query
            noyears = 20
            # the last year that you want the data from - default is this year -1
            endyear = UpdateParams.today.year - 1
            # the first year that you want the data from, if not available NaN will be the value
            startyear = 1970

            # Full-size chunks followed by the remainder. The remainder used a
            # hard-coded 20 that only matched because noyears is 20.
            span = endyear - startyear
            noyear_list = [noyears] * (span // noyears) + [span % noyears + 1]

            year_tracker = endyear
            df = pd.DataFrame(
                columns=list(map(str, list(range(startyear, endyear + 1)))))

            # Request constants do not change between chunks.
            headers = {'Content-type': 'application/json'}
            # please label your series as "PPI series id" : "df label name"
            series_list = \
                OrderedDict({
                    'WPU061': "Industrial Chemicals",
                    "PCU33241033241052": "Boiler",
                    "PCU333994333994": "Furnace",
                    "PCU333414333414": "Solar Field",
                    "PCU33361133361105": "CHP",
                    "WPU10250105": "Aluminum",
                    "WPU11790105": "BatteryStorage"
                })

            # Walk backwards from endyear, one request per chunk of years.
            for chunk_years in noyear_list:
                first_year = year_tracker - chunk_years + 1
                data = json.dumps({
                    "seriesid": list(series_list.keys()),
                    "annualaverage": "true",
                    "startyear": str(first_year),
                    "endyear": str(year_tracker),
                    "registrationkey": "2ad8d1d2aa574a05a389c070bee5e070"
                })
                p = requests.post(
                    'https://api.bls.gov/publicAPI/v2/timeseries/data/',
                    data=data,
                    headers=headers)
                json_data = json.loads(p.text)

                # keep only the annual-average records of every series
                pd_dict = {}
                for series in json_data["Results"]["series"]:
                    pd_dict[series["seriesID"]] = [
                        j for j in series["data"]
                        if j["periodName"] == "Annual"
                    ]

                chunk_columns = list(
                    map(str, range(first_year, year_tracker + 1)))
                for series_id, label in series_list.items():
                    # records are reversed before use, then left-padded with
                    # NaN so the list has one entry per year in the chunk
                    ser_vals = [j["value"] for j in pd_dict[series_id][::-1]]
                    ser_vals = [float("nan")
                                ] * (chunk_years - len(ser_vals)) + ser_vals
                    df.loc[label, chunk_columns] = ser_vals

                year_tracker -= chunk_years

            return df

        ppi_index = get_ppi_inds()
        comb_index = pd.concat([ce_index, ppi_index], join="outer", sort=True)
        comb_index.to_csv(os.path.join(path, "cost_index_data.csv"))
def __init__(self, **kwargs):
    """Set up the instance: metadata, parcel polygons, API clients, canopy polygons.

    Recognized keyword arguments:
        logger: stored as ``self._p``.
        load_canopy_polys: when false, initialization returns right after the
            API clients are created and no canopy data is read (default True).
    """
    import eia
    import googlemaps
    import zillow
    from shapely.geometry import Polygon

    def usable_rings(coordinate_groups):
        """Keep rings with at least three points and drop groups left empty."""
        trimmed = [[ring for ring in group if len(ring) >= 3]
                   for group in coordinate_groups]
        return list(filter(None, trimmed))

    def first_line_of(file_name):
        """Return the stripped first line of a file next to the package directory."""
        with open(os.path.join(self._dir_name, '..', file_name), 'r') as handle:
            return handle.readline().strip()

    self._p = kwargs.get('logger')
    load_canopy_polys = kwargs.get('load_canopy_polys', True)
    self._dir_name = os.path.dirname(os.path.realpath(__file__))

    # Load meta.json (silently skipped when the file is absent).
    self.prt('Loading meta file...')
    meta_path = os.path.join(self._dir_name, '..', 'meta.json')
    if os.path.exists(meta_path):
        with open(meta_path, 'r') as f:
            meta = json.load(f)
        self._imgs_mean = meta['mean']
        self._imgs_std = meta['std']
        self._train_count = meta['train_count']

    # Load parcel data.
    self._parcels_fname = self.download_file(self._PARCELS_URL)
    with open(self._parcels_fname, 'r') as f:
        self._parcels = json.load(f)
    parcel_coords = []
    for feature in self._parcels.get('features', []):
        parcel_coords.append(
            feature.get('geometry', {}).get('coordinates', []))
    self._parcel_polygons = [[Polygon(ring) for ring in group if len(ring) >= 3]
                             for group in usable_rings(parcel_coords)]

    # API clients; each key is the first line of its own key file.
    self._google_key = first_line_of('google.key')
    self._google_client = googlemaps.Client(key=self._google_key)
    self._eia_key = first_line_of('eia.key')
    self._eia_client = eia.API(self._eia_key)
    self._zillow_key = first_line_of('zillow.key')
    self._zillow_client = zillow.ValuationApi()

    self._cropsize = self._INPUT_IMG_SIZE - 2 * self._CROPPIX

    # Load canopy data.
    if not load_canopy_polys:
        return
    self._canopy_fname = os.path.join(self._dir_name, '..', 'geo',
                                      'ENVIRONMENTAL_TreeCanopy2014.json')
    with open(self._canopy_fname, 'r') as f:
        self._canopies = json.load(f)

    canopy_coords = []
    for feature in self._canopies.get('features', []):
        if feature.get('geometry', {}) is None:
            continue
        canopy_coords.append(
            feature.get('geometry', {}).get('coordinates', []))

    # Every second coordinate is shifted by _LAT_OFFSET while the polygons
    # are built.
    raw_canpols = [[
        Polygon([(a, b + self._LAT_OFFSET) for a, b in ring])
        for ring in group if len(ring) >= 3
    ] for group in usable_rings(canopy_coords)]

    self._canopy_polygons = []
    for canpoly in tqdm(raw_canpols, desc='Extracting canopy polygons'):
        for polygon in canpoly:
            repaired = polygon.buffer(0)
            # Multi-part results are flattened into their member geometries.
            if 'multi' in str(type(repaired)).lower():
                self._canopy_polygons.extend(list(repaired.geoms))
            else:
                self._canopy_polygons.append(repaired)

    self._model = None
def main():
    """
    Run main script

    Pulls the 'EBA.TEX-ALL.D.H' series, converts its timestamps from UTC to
    US/Central, plots yearly/monthly/weekly slices, derives calendar features,
    finds the peak-demand hour of every day and trains a random forest
    classifier that predicts that hour from day of week, week and month.
    """
    #Create EIA API using your specific API key
    api_key = 'API KEY HERE'
    api = eia.API(api_key)

    #Pull the electricity demand data
    series_ID = 'EBA.TEX-ALL.D.H'
    electricity_demand_df = retrieve_time_series(api, series_ID)
    electricity_demand_df.reset_index(level=0, inplace=True)
    #Rename the columns for easier analysis
    electricity_demand_df.rename(columns={
        'index': 'Date_Time',
        electricity_demand_df.columns[1]: 'Electricity_Demand_MWh'
    },
                                 inplace=True)

    #Format the 'Date' column: drop the last four characters
    electricity_demand_df['Date_Time'] = electricity_demand_df[
        'Date_Time'].astype(str).str[:-4]
    #Remove the 'T' from the Date column
    electricity_demand_df['Date_Time'] = electricity_demand_df[
        'Date_Time'].str.replace('T', ' ')
    #Convert the Date column into a date object
    electricity_demand_df['Date_Time'] = pd.to_datetime(
        electricity_demand_df['Date_Time'], format='%Y %m%d %H')
    #Convert from UTC to Central Standard Time (result is timezone-naive)
    electricity_demand_df['Date_Time'] = electricity_demand_df[
        'Date_Time'].dt.tz_localize('UTC')
    electricity_demand_df['Date_Time'] = pd.to_datetime(
        electricity_demand_df['Date_Time'].dt.tz_convert(
            'US/Central').dt.strftime("%Y-%m-%d %H:%M:%S"))

    #Plot the data on a yearly basis, using 2019 as an example year
    plot_data(df=electricity_demand_df[
        (electricity_demand_df['Date_Time'] >= pd.to_datetime('2019-01-01'))
        & (electricity_demand_df['Date_Time'] < pd.to_datetime('2020-01-01'))],
              x_variable='Date_Time',
              y_variable='Electricity_Demand_MWh',
              title='TX Electricity Demand: 2019')
    #Plot the data on a monthly basis, using December 2017 as an example
    plot_data(df=electricity_demand_df[
        (electricity_demand_df['Date_Time'] >= pd.to_datetime('2017-12-01'))
        & (electricity_demand_df['Date_Time'] < pd.to_datetime('2018-01-01'))],
              x_variable='Date_Time',
              y_variable='Electricity_Demand_MWh',
              title='TX Electricity Demand: December 2017')
    #Plot the data on a weekly basis, using July 1-7, 2019 as an example.
    #The upper bound is exclusive, so it must be July 8 for Sunday July 7 to
    #be part of the plot.
    plot_data(df=electricity_demand_df[
        (electricity_demand_df['Date_Time'] >= pd.to_datetime('2019-07-01'))
        & (electricity_demand_df['Date_Time'] < pd.to_datetime('2019-07-08'))],
              x_variable='Date_Time',
              y_variable='Electricity_Demand_MWh',
              title='TX Electricity Demand: Monday-Sunday July 1-7, 2019')

    #Pull the hour into an individual column
    electricity_demand_df['Hour'] = electricity_demand_df['Date_Time'].dt.hour
    #Pull the day of month for each reading
    electricity_demand_df['Day_Of_Month'] = electricity_demand_df[
        'Date_Time'].dt.day
    #Pull day of week for each reading
    electricity_demand_df['Day_Of_Week'] = electricity_demand_df[
        'Date_Time'].dt.day_name()
    #Pull the numeric value for day of the week
    electricity_demand_df['Day_Of_Week_Numeric'] = electricity_demand_df[
        'Date_Time'].dt.dayofweek + 1
    #Pull the date in terms of week. `.dt.week` was removed from pandas;
    #isocalendar().week is its documented replacement. It is cast back to
    #int64 to keep the column dtype unchanged.
    electricity_demand_df['Week'] = electricity_demand_df[
        'Date_Time'].dt.isocalendar().week.astype('int64')
    #Pull the month of the year
    electricity_demand_df['Month'] = electricity_demand_df[
        'Date_Time'].dt.month.apply(lambda x: calendar.month_abbr[x])
    #Pull the numeric value for month
    electricity_demand_df['Month_Numeric'] = electricity_demand_df[
        'Date_Time'].dt.month
    #Pull the year
    electricity_demand_df['Year'] = electricity_demand_df['Date_Time'].dt.year

    #Calculate the max hourly demand for each date in the data set
    electricity_demand_df[
        'Peak_Demand_Hour_MWh_For_Day'] = electricity_demand_df.groupby(
            ['Day_Of_Month', 'Month', 'Year'],
            sort=False)['Electricity_Demand_MWh'].transform('max')
    #Create time series with just peak hourly data
    peak_demand_hour_df = electricity_demand_df[
        electricity_demand_df['Electricity_Demand_MWh'] ==
        electricity_demand_df['Peak_Demand_Hour_MWh_For_Day']]
    #Rename the 'Hour' column to 'Peak_Demand_Hour'
    peak_demand_hour_df = peak_demand_hour_df.rename(
        columns={'Hour': 'Peak_Demand_Hour'})

    #Create a histogram of counts by hour
    ax = peak_demand_hour_df['Peak_Demand_Hour'].value_counts().plot(
        kind='bar', title='Peak Demand Hour by Number of Occurrences')
    ax.set_xlabel("Demand Hour (0-23 hour)")
    ax.set_ylabel("Number of Occurrences")

    #Create a histogram of counts by peak demand hour, grouped by day of the week
    generate_histogram_of_aggregated_counts(
        peak_demand_hour_df,
        peak_demand_hour_column='Peak_Demand_Hour',
        group_by_column='Day_Of_Week_Numeric')
    #Create a histogram of counts by peak demand hour, grouped by month
    generate_histogram_of_aggregated_counts(
        peak_demand_hour_df,
        peak_demand_hour_column='Peak_Demand_Hour',
        group_by_column='Month_Numeric')

    #Subset the dataframe to only include the features and labels that we're going to use
    #in the random forest model. An explicit copy keeps the assignments below
    #from being chained assignments on a slice of peak_demand_hour_df.
    peak_demand_hour_model = peak_demand_hour_df[[
        'Peak_Demand_Hour', 'Day_Of_Week', 'Week', 'Month'
    ]].copy()
    #Convert the Week and Peak_Demand_Hour variables into categoric string variables (from numeric)
    peak_demand_hour_model['Week'] = peak_demand_hour_model['Week'].apply(str)
    peak_demand_hour_model[
        'Peak_Demand_Hour'] = 'Hour ' + peak_demand_hour_model[
            'Peak_Demand_Hour'].apply(str)

    #Pull the counts per peak demand hour category
    counts_by_category = pd.DataFrame(
        peak_demand_hour_model.groupby('Peak_Demand_Hour')
        ['Peak_Demand_Hour'].count())
    #Isolate peak hour occurrences that occur more than 15 times
    more_than_15_occurrences = counts_by_category[
        counts_by_category['Peak_Demand_Hour'] > 15]
    #Filter the data set to only include instances with more than 15 occurrences--this is just to remove
    #any super anomalous cases from the model
    peak_demand_hour_model = peak_demand_hour_model[
        peak_demand_hour_model['Peak_Demand_Hour'].isin(
            list(more_than_15_occurrences.index))]

    #Remove the labels from the features
    features = peak_demand_hour_model.drop('Peak_Demand_Hour', axis=1)
    #One hot encode the categorical features
    features = pd.get_dummies(features)
    #Create labels
    labels = np.array(peak_demand_hour_model['Peak_Demand_Hour'])
    #Saving feature names for later use
    feature_list = list(features.columns)
    # Convert to numpy array
    features = np.array(features)

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.25, random_state=5)

    #Create the parameter grid, which is plugged into
    #GridSearchCV, where all hyperparameter combos are tested to find the optimal parameters combination
    parameter_grid = {
        'max_depth': [80, 90, 100, 110],
        'n_estimators': [700, 800, 900, 1000, 1100, 1200]
    }
    grid_search_rf(parameter_grid, train_features, train_labels)
    # Grid Search Outputs:
    # Fitting 3 folds for each of 24 candidates, totalling 72 fits
    # [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
    # [Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 25.3s
    # [Parallel(n_jobs=-1)]: Done 72 out of 72 | elapsed: 54.0s finished
    # {'max_depth': 100, 'n_estimators': 1100}

    #Plug in optimized model parameters into final RF model
    rf = RandomForestClassifier(n_estimators=1100,
                                max_depth=100,
                                random_state=1500)
    #Fit the model
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data (predict once, reuse)
    test_predictions = rf.predict(test_features)
    print(
        confusion_matrix(test_labels,
                         test_predictions,
                         labels=[
                             'Hour 8', 'Hour 9', 'Hour 10', 'Hour 14',
                             'Hour 15', 'Hour 16', 'Hour 17', 'Hour 18',
                             'Hour 19', 'Hour 20', 'Hour 21'
                         ]))
    # The accuracy was previously computed and discarded; print it.
    print(
        accuracy_score(test_labels,
                       test_predictions,
                       normalize=True,
                       sample_weight=None))

    #Obtain feature importances in the model
    feature_importances = pd.DataFrame(rf.feature_importances_,
                                       index=feature_list,
                                       columns=['importance'
                                                ]).sort_values('importance',
                                                               ascending=False)
    print(feature_importances)