# Excerpts of data-client methods (MISO, PJM, CAISO OASIS, EIA) from a
# pyiso-style scraping library. The original module headers are not shown;
# these are the imports the excerpts below rely on (reconstructed):
import copy
import json
import logging
import zipfile
from datetime import datetime, timedelta
from io import BytesIO, StringIO
from time import sleep

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)


def _format_start_end(self, data):
    formatted_sliced = []
    if 'gen' not in self.options['data']:
        formatted_sliced = [
            i for i in data
            if self.options['start_at'] <= i['timestamp'] <= self.options['end_at']
        ]
    else:
        try:
            # allowed generation window: 00:00 local time two days ago
            # through 23:00 local time tomorrow
            yesterday = (self.local_now() - timedelta(days=2)).replace(
                hour=0, minute=0, second=0, microsecond=0)
            tomorrow = (self.local_now() + timedelta(days=1)).replace(
                hour=23, minute=0, second=0, microsecond=0)
            assert ((self.options['start_at'] >= yesterday) and
                    (self.options['end_at'] <= tomorrow))
            formatted_sliced = [
                i for i in data
                if self.options['start_at'] <= i['timestamp'] <= self.options['end_at']
            ]
        except (AssertionError, TypeError):
            LOGGER.error('Generation data error for %s' % self.BA)
            raise ValueError('Generation data is available for the '
                             'previous and current day.', self.options)
    return formatted_sliced
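# Illustrative sketch (not part of the client): the window check above accepts
# generation queries from 00:00 local time two days ago through 23:00 local
# time tomorrow. A standalone rendering of that logic, using naive datetimes
# for brevity:
def _in_generation_window(start_at, end_at, now):
    earliest = (now - timedelta(days=2)).replace(hour=0, minute=0,
                                                 second=0, microsecond=0)
    latest = (now + timedelta(days=1)).replace(hour=23, minute=0,
                                               second=0, microsecond=0)
    return earliest <= start_at and end_at <= latest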
def latest_fuel_mix(self):
    # set up request
    url = self.base_url + '/ria/FuelMix.aspx?CSV=True'

    # carry out request
    response = self.request(url)
    if not response:
        return pd.DataFrame()

    # test for valid content
    if 'The page cannot be displayed' in response.text:
        LOGGER.error('MISO: Error in source data for generation')
        return pd.DataFrame()

    # preliminary parsing
    df = pd.read_csv(BytesIO(response.content), header=0, index_col=0,
                     parse_dates=True)

    # set index
    df.index = self.utcify_index(df.index)
    df.index.set_names(['timestamp'], inplace=True)

    # set names and labels
    df['fuel_name'] = df.apply(lambda x: self.fuels[x['CATEGORY']], axis=1)
    df['gen_MW'] = df['ACT']

    # return
    return df[['fuel_name', 'gen_MW']]
def handle_options(self, **kwargs):
    """ Process and store keyword argument options. """
    super(EIAClient, self).handle_options(**kwargs)

    if not hasattr(self, 'BA'):
        LOGGER.error('Balancing authority not set.')
        raise ValueError('Balancing authority not set.')

    if 'market' not in self.options:
        if self.options['forecast']:
            self.options['market'] = self.MARKET_CHOICES.dam
        elif self.options['sliceable'] and self.options['data'] == 'gen':
            self.options['market'] = self.MARKET_CHOICES.dam
        else:
            self.options['market'] = self.MARKET_CHOICES.hourly

    if 'freq' not in self.options:
        # every branch of the original conditional resolved to hourly
        self.options['freq'] = self.FREQUENCY_CHOICES.hourly

    if 'yesterday' not in self.options:
        self.options['yesterday'] = False
def unzip(self, content):
    """
    Unzip encoded data.
    Returns the unzipped content as an array of strings, each representing
    one file's content, or None if an error was encountered.
    ***Previous behavior: Only returned the content from the first file***
    """
    # create zip file
    try:
        filecontent = BytesIO(content)
    except TypeError:
        filecontent = StringIO(content)

    try:
        # have zipfile
        z = zipfile.ZipFile(filecontent)
    except zipfile.BadZipfile:
        LOGGER.error('%s: unzip failure for content:\n%s' % (self.NAME, content))
        return None

    # have unzipped content
    unzipped = [z.read(thisfile) for thisfile in z.namelist()]
    z.close()

    # return
    return unzipped
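# Self-contained sketch of the unzip() contract above: build an in-memory zip
# with two members and read every member back as bytes, mirroring the
# list-of-file-contents return value. (`unzip` itself is a method, so this
# demo re-creates its core loop rather than calling it.)
def _demo_unzip():
    buf = BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr('a.csv', 'x,y\n1,2\n')
        zf.writestr('b.csv', 'x,y\n3,4\n')
    z = zipfile.ZipFile(BytesIO(buf.getvalue()))
    unzipped = [z.read(name) for name in z.namelist()]
    z.close()
    assert unzipped == [b'x,y\n1,2\n', b'x,y\n3,4\n']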
def get_generation(self, latest=False, yesterday=False, start_at=False,
                   end_at=False, **kwargs):
    """
    Scrape and parse generation fuel mix data.
    Note: Generation may be quite low for HST and NSB BAs.
    """
    self.handle_options(data='gen', latest=latest, yesterday=yesterday,
                        start_at=start_at, end_at=end_at, **kwargs)
    self.handle_ba_limitations()
    self.format_url()
    result = self.request(self.url)
    if result is not None:
        result_json = json.loads(result.text)
        result_formatted = self.format_result(result_json)
        return result_formatted
    else:
        LOGGER.error('No results for %s' % self.BA)
        return []
def get_ancillary_market_mcp(self, latest=False, **kwargs):
    # set up request
    url = self.base_url + '?messageType=getAncillaryMarketMCP&returnType=json'

    # carry out request
    response = self.request(url)
    if not response:
        return None

    # test for valid content
    if 'The page cannot be displayed' in response.text:
        LOGGER.error('MISO: Error in source data for Ancillary Market MCP')
        return None
    ammcp = json.loads(response.content)

    # set args
    self.handle_options(latest=latest, **kwargs)

    # get data
    if self.options['latest']:
        data = self.parse_latest_ammcp_data(ammcp)
    elif self.options['forecast']:
        data = self.parse_forecast_ammcp_data(ammcp)
    else:
        raise ValueError('Either latest or forecast must be True')

    # return good
    return self.serialize_faster(data)
def parse_latest_ammcp_data(self, content):
    if not content:
        return pd.DataFrame()

    # add a timestamp to each of the 8 zone records
    if self.options['latest']:
        for i in range(0, 8):
            content['MCPData']['RealTimeMCP']['Zone'][i]['time'] = (
                content['MCPData']['MktDay'] + ' ' +
                content['MCPData']['RealTimeMCP']['HourAndMin'] + ':00')

    df = pd.DataFrame(content['MCPData']['RealTimeMCP']['Zone'],
                      columns=['time', 'number', 'GenRegMCP', 'GenSpinMCP',
                               'GenSuppMCP', 'DemSuppMCP', 'RegMileageMCP'])
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)

    # set index
    try:
        df.index = self.utcify_index(df.index)
    except AttributeError:
        LOGGER.error('MISO: Error in source data for Ancillary Market MCP data %s'
                     % content)
        return pd.DataFrame()
    df.index.set_names(['timestamp'], inplace=True)
    return df
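# Hedged sketch of the payload shape parse_latest_ammcp_data() expects,
# inferred only from the keys accessed above; the real MISO response may
# carry more fields, and the example values are invented:
_example_ammcp = {
    'MCPData': {
        'MktDay': '2017-06-01',
        'RealTimeMCP': {
            'HourAndMin': '14:05',
            'Zone': [{'number': zone, 'GenRegMCP': 8.0, 'GenSpinMCP': 2.0,
                      'GenSuppMCP': 0.0, 'DemSuppMCP': 0.0, 'RegMileageMCP': 1.5}
                     for zone in range(1, 9)],  # 8 zones, matching range(0, 8) above
        },
    },
}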
def parse_latest_fuel_mix(self, content):
    # handle bad input
    if not content:
        return pd.DataFrame()

    # preliminary parsing
    df = pd.read_csv(BytesIO(content), header=0, index_col=0, parse_dates=True)

    # set index
    try:
        df.index = self.utcify_index(df.index)
    except AttributeError:
        LOGGER.error('MISO: Error in source data for generation %s' % content)
        return pd.DataFrame()
    df.index.set_names(['timestamp'], inplace=True)

    # set names and labels
    df['fuel_name'] = df.apply(lambda x: self.fuels[x['CATEGORY']], axis=1)
    df['gen_MW'] = df['ACT']

    # return
    return df[['fuel_name', 'gen_MW']]
def get_load(self, latest=False, yesterday=False, start_at=False,
             end_at=False, forecast=False, **kwargs):
    """ Scrape and parse load data. """
    # forward all accepted options to the option handler
    self.handle_options(data='load', latest=latest, yesterday=yesterday,
                        start_at=start_at, end_at=end_at, forecast=forecast,
                        **kwargs)
    self.handle_ba_limitations()
    self.format_url()
    result = self.request(self.url)
    if result is not None:
        result_json = json.loads(result.text)
        result_formatted = self.format_result(result_json)
        return result_formatted
    else:
        LOGGER.error('No results for %s' % self.BA)
        return []
def fetch_oasis(self, payload=None, return_all_files=False):
    """
    Returns a list of report data elements, or an empty list if an error
    was encountered.

    If return_all_files=False, returns only the content from the first file
    in the .zip - this is the default behavior and was used in earlier
    versions of this function.

    If return_all_files=True, will return an array representing the content
    from each file. This is useful for processing LMP data or other fields
    where multiple price components are returned in a zip.
    """
    # avoid a mutable default argument
    if payload is None:
        payload = {}

    # set up storage
    raw_data = []
    if return_all_files is True:
        default_return_val = []
    else:
        default_return_val = ''

    # try get
    response = self.request(self.base_url_oasis, params=payload)
    if not response:
        return default_return_val

    # read data from zip
    # This will be an array of content if successful, and None if unsuccessful
    content = self.unzip(response.content)
    if not content:
        return default_return_val

    # check xml content for errors
    soup = BeautifulSoup(content[0], 'lxml')
    error = soup.find('m:error')
    if error:
        code = error.find('m:err_code')
        desc = error.find('m:err_desc')
        msg = 'XML error for CAISO OASIS with payload %s: %s %s' % (payload, code, desc)
        LOGGER.error(msg)
        return default_return_val

    # return xml or csv data
    if payload.get('resultformat', False) == 6:
        # If we requested CSV files
        if return_all_files:
            return content
        else:
            return content[0]
    else:
        # Return XML content
        if return_all_files:
            raw_data = [BeautifulSoup(thisfile, 'lxml').find_all('report_data')
                        for thisfile in content]
            return raw_data
        else:
            raw_data = soup.find_all('report_data')
            return raw_data
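# Hedged usage sketch for fetch_oasis(), assuming `client` is a CAISO client
# instance. Key names follow the public CAISO OASIS SingleZip API; the report
# name and datetimes here are invented for illustration.
#
#   payload = {'queryname': 'SLD_REN_FCST',
#              'startdatetime': '20170601T07:00-0000',
#              'enddatetime': '20170602T07:00-0000',
#              'version': 1}
#   xml_elements = client.fetch_oasis(payload=payload)
#   csv_text = client.fetch_oasis(payload=dict(payload, resultformat=6))
#   all_csvs = client.fetch_oasis(payload=dict(payload, resultformat=6),
#                                 return_all_files=True)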
def time_from_soup(self, soup):
    """
    Returns a UTC timestamp if one is found in the soup,
    or None if an error was encountered.
    """
    ts_elt = soup.find(class_='ts')
    if not ts_elt:
        LOGGER.error('PJM: Timestamp not found in soup:\n%s' % soup)
        return None
    return self.utcify(ts_elt.string)
def request(self, url, mode='get', retry_sec=5, **kwargs):
    """
    Get or post to a URL with the provided kwargs.
    Returns the response, or None if an error was encountered.
    If the mode is not 'get' or 'post', raises ValueError.
    """
    # check args
    allowed_modes = ['get', 'post']
    if mode not in allowed_modes:
        raise ValueError('Invalid request mode %s' % mode)

    # check for session
    try:
        session = getattr(self, 'session')
    except AttributeError:
        self.session = requests.Session()
        session = self.session

    # carry out request
    try:
        response = getattr(session, mode)(url, verify=False,
                                          timeout=self.TIMEOUT_SECONDS,
                                          **kwargs)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
        # eg max retries exceeded
        msg = '%s: connection error for %s, %s:\n%s' % (self.NAME, url, kwargs, e)
        LOGGER.error(msg)
        return None

    if response.status_code == 200:
        # success
        LOGGER.debug('%s: request success for %s, %s with cache hit %s'
                     % (self.NAME, url, kwargs,
                        getattr(response, 'from_cache', None)))
    elif response.status_code == 429:
        # retry on throttle; note this recurses with no retry limit
        LOGGER.warning('%s: retrying in %d seconds, throttled for %s, %s'
                       % (self.NAME, retry_sec, url, kwargs))
        sleep(retry_sec)
        return self.request(url, mode=mode, retry_sec=retry_sec, **kwargs)
    else:
        # non-throttle error
        LOGGER.error('%s: request failure with code %s for %s, %s'
                     % (self.NAME, response.status_code, url, kwargs))

    return response
def parse_oasis_renewable(self, raw_data):
    """Parse raw data output of fetch_oasis for renewables."""
    # set up storage
    preparsed_data = {}
    parsed_data = []

    # extract values from xml
    for raw_soup_dp in raw_data:
        # set up storage for timestamp
        ts = self.utcify(raw_soup_dp.find(
            ['INTERVAL_START_GMT', 'interval_start_gmt']).string)
        if ts not in preparsed_data:
            preparsed_data[ts] = {'wind': 0, 'solar': 0}

        # store generation value
        try:
            fuel_name = raw_soup_dp.find(
                ['RENEWABLE_TYPE', 'renewable_type']).string.lower()
            gen_MW = float(raw_soup_dp.find(['VALUE', 'value']).string)
            preparsed_data[ts][fuel_name] += gen_MW
        except TypeError:
            LOGGER.error('Error in schema for CAISO OASIS result %s'
                         % raw_soup_dp.prettify())
            continue

    # collect values into dps
    freq = self.options.get('freq', self.FREQUENCY_CHOICES.hourly)
    market = self.options.get('market', self.MARKET_CHOICES.hourly)
    for ts, preparsed_dp in preparsed_data.items():
        # set up base
        base_parsed_dp = {'timestamp': ts, 'freq': freq, 'market': market,
                          'gen_MW': 0, 'ba_name': self.NAME}

        # collect data
        for fuel_name in ['wind', 'solar']:
            parsed_dp = copy.deepcopy(base_parsed_dp)
            parsed_dp['fuel_name'] = fuel_name
            parsed_dp['gen_MW'] += preparsed_dp[fuel_name]
            parsed_data.append(parsed_dp)

    # return
    return parsed_data
def _assert_entries_1hr_apart(self, result_ts):
    prev_entry = None
    for entry in result_ts:
        if prev_entry:
            seconds_delta = (entry['timestamp'] -
                             prev_entry['timestamp']).total_seconds()
            if seconds_delta > 3600:
                LOGGER.error('prev_entry timestamp: ' + str(
                    prev_entry['timestamp'].astimezone(
                        pytz.timezone(self.nbpower_client.TZ_NAME))))
                LOGGER.error('entry timestamp: ' + str(
                    entry['timestamp'].astimezone(
                        pytz.timezone(self.nbpower_client.TZ_NAME))))
            self.assertEqual(3600, seconds_delta)
        prev_entry = entry
def handle_ba_limitations(self):
    """Handle BA limitations"""
    today = pytz.utc.localize(datetime.utcnow()).astimezone(
        pytz.timezone(self.TZ_NAME))
    two_days_ago = today - timedelta(days=2)
    load_not_supported_bas = ['DEAA', 'EEI', 'GRIF', 'GRMA', 'GWA',
                              'HGMA', 'SEPA', 'WWA', 'YAD']
    delay_bas = ['AEC', 'DOPD', 'GVL', 'HST', 'NSB', 'PGE', 'SCL',
                 'TAL', 'TIDC', 'TPWR']
    canada_mexico = ['IESO', 'BCTC', 'MHEB', 'AESO', 'HQT', 'NBSO',
                     'CFE', 'SPC']

    if self.BA in delay_bas:
        if self.options['end_at'] and self.options['end_at'] > two_days_ago:
            LOGGER.error('No data for %s due to 2 day delay' % self.BA)
            raise ValueError('No data: 2 day delay for this BA.')
        elif self.options['yesterday']:
            LOGGER.error('No data for %s due to 2 day delay' % self.BA)
            raise ValueError('No data: 2 day delay for this BA.')
        elif self.options['forecast']:
            raise ValueError('No data: 2 day delay for this BA.')

    if self.BA in load_not_supported_bas:
        if self.options['data'] == 'load':
            LOGGER.error('Load data not supported for %s' % self.BA)
            raise ValueError('Load data not supported for this BA.')

    if self.BA in canada_mexico:
        LOGGER.error('Data not supported for %s' % self.BA)
        raise ValueError('Data not currently supported for Canada and Mexico')
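# Illustrative failure path (assumes an EIAClient instance and that
# handle_options() fills in the yesterday/end_at options): requesting recent
# data for a BA in delay_bas raises before any network call is made.
#
#   client = EIAClient()
#   client.set_ba('HST')                      # in delay_bas
#   client.handle_options(data='gen', yesterday=True)
#   client.handle_ba_limitations()            # ValueError: 'No data: 2 day delay for this BA.'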
def _assert_entries_5min_apart(self, result_ts):
    prev_entry = None
    for entry in result_ts:
        if prev_entry:
            seconds_delta = (entry['timestamp'] -
                             prev_entry['timestamp']).total_seconds()
            if seconds_delta > 300:
                LOGGER.error('prev_entry timestamp: ' + str(
                    prev_entry['timestamp'].astimezone(
                        pytz.timezone(self.ieso_client.TZ_NAME))))
                LOGGER.error('entry timestamp: ' + str(
                    entry['timestamp'].astimezone(
                        pytz.timezone(self.ieso_client.TZ_NAME))))
            self.assertEqual(300, seconds_delta)
        prev_entry = entry
def get_latest_fuel_mix(self):
    # set up request
    url = self.base_url + '/ria/FuelMix.aspx?CSV=True'

    # carry out request
    response = self.request(url)
    if not response:
        return None

    # test for valid content
    if 'The page cannot be displayed' in response.text:
        LOGGER.error('MISO: Error in source data for generation')
        return None

    # return good
    return response.content
def get_latest_fuel_mix(self):
    # set up request
    url = self.base_url + '?messageType=getfuelmix&returnType=csv'

    # carry out request
    response = self.request(url)
    if not response:
        return None

    # test for valid content
    if 'The page cannot be displayed' in response.text:
        LOGGER.error('MISO: Error in source data for generation')
        return None

    # return good
    return response.content
def parse_ace_data(self, content):
    if not content:
        return pd.DataFrame()

    # preliminary parsing
    df = pd.DataFrame(content, columns=['instantEST', 'value'])
    df['instantEST'] = pd.to_datetime(df['instantEST'])
    df.set_index('instantEST', inplace=True)

    # set index
    try:
        df.index = self.utcify_index(df.index)
    except AttributeError:
        LOGGER.error('MISO: Error in source data for ACE %s' % content)
        return pd.DataFrame()
    df.index.set_names(['timestamp'], inplace=True)
    return df
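# Hedged sketch of the list shape parse_ace_data() consumes (the value passed
# as ace['ACE'] in get_latest_ace() below); the field formats are inferred
# from the column names above and the values are invented:
_example_ace = [
    {'instantEST': '2017-06-01 09:05:00', 'value': -12.3},
    {'instantEST': '2017-06-01 09:06:00', 'value': 4.1},
]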
def val_from_soup(self, soup, key):
    """
    Returns a float value if one is found in the soup for the provided key,
    or None if an error was encountered.
    """
    for elt in soup.find_all('td'):
        try:
            if elt.find('a').string == key:
                # numbers may have commas in the thousands
                val_str = elt.next_sibling.string.replace(',', '')
                return float(val_str)
        except AttributeError:
            # no 'a' child
            continue

    # no value found
    LOGGER.error('PJM: Value for %s not found in soup:\n%s' % (key, soup))
    return None
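# Self-contained illustration of the markup val_from_soup() walks: the key
# lives in an <a> inside a <td>, and the value sits in the next sibling cell.
# The HTML is invented for the example; requires bs4 and lxml.
def _demo_val_from_soup():
    html = ('<table><tr><td><a>PJM RTO Total</a></td>'
            '<td>92,417.3</td></tr></table>')
    soup = BeautifulSoup(html, 'lxml')
    for elt in soup.find_all('td'):
        a = elt.find('a')
        if a and a.string == 'PJM RTO Total':
            return float(elt.next_sibling.string.replace(',', ''))  # 92417.3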
def get_latest_ace(self):
    # set up request
    url = self.base_url + '?messageType=getACE&returnType=json'

    # carry out request
    response = self.request(url)
    if not response:
        return None

    # test for valid content
    if 'The page cannot be displayed' in response.text:
        LOGGER.error('MISO: Error in source data for ACE')
        return None
    ace = json.loads(response.content)

    # return good
    data = self.parse_ace_data(ace['ACE'])
    return self.serialize_faster(data)
def get_generation(self, latest=False, **kwargs):
    # set args
    self.handle_options(data='gen', **kwargs)

    # get data
    load_ts, load_val = self.fetch_edata_point('instLoad', 'PJM RTO Total')
    imports_ts, imports_val = self.fetch_edata_point('tieFlow', 'PJM RTO')
    wind_ts, wind_gen = self.fetch_edata_point('wind', 'RTO Wind Power')

    # compute nonwind gen
    try:
        total_gen = load_val - imports_val
        nonwind_gen = total_gen - wind_gen
    except TypeError:
        # value was None
        LOGGER.error('PJM: No timestamps found for options %s' % str(self.options))
        return []

    # choose best time to use
    if load_ts:
        ts = load_ts
    elif imports_ts:
        ts = imports_ts
    elif wind_ts:
        ts = wind_ts
    else:
        LOGGER.error('PJM: No timestamps found for options %s' % str(self.options))
        return []

    # set up storage
    parsed_data = []
    base_dp = {'timestamp': ts,
               'freq': self.FREQUENCY_CHOICES.fivemin,
               'market': self.MARKET_CHOICES.fivemin,
               'gen_MW': 0, 'ba_name': self.NAME}

    # collect data
    for gen_MW, fuel_name in [(wind_gen, 'wind'), (nonwind_gen, 'nonwind')]:
        parsed_dp = copy.deepcopy(base_dp)
        parsed_dp['fuel_name'] = fuel_name
        parsed_dp['gen_MW'] = gen_MW
        parsed_data.append(parsed_dp)

    # return
    return parsed_data
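# Worked example of the nonwind arithmetic above, with invented numbers:
load_val_ex, imports_val_ex, wind_gen_ex = 92000.0, -3000.0, 5500.0
total_gen_ex = load_val_ex - imports_val_ex    # 92000 - (-3000) = 95000.0
nonwind_gen_ex = total_gen_ex - wind_gen_ex    # 95000 - 5500 = 89500.0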
def parse_oasis_renewable(self, raw_data):
    """Parse raw data output of fetch_oasis for renewables."""
    # set up storage
    preparsed_data = {}
    parsed_data = []

    # extract values from xml
    for raw_soup_dp in raw_data:
        # set up storage for timestamp
        ts = self.utcify(raw_soup_dp.find('interval_start_gmt').string)
        if ts not in preparsed_data:
            preparsed_data[ts] = {'wind': 0, 'solar': 0}

        # store generation value
        try:
            fuel_name = raw_soup_dp.find('renewable_type').string.lower()
            gen_MW = float(raw_soup_dp.find('value').string)
            preparsed_data[ts][fuel_name] += gen_MW
        except TypeError:
            LOGGER.error('Error in schema for CAISO OASIS result %s'
                         % raw_soup_dp.prettify())
            continue

    # collect values into dps
    freq = self.options.get('freq', self.FREQUENCY_CHOICES.hourly)
    market = self.options.get('market', self.MARKET_CHOICES.hourly)
    for ts, preparsed_dp in preparsed_data.items():
        # set up base
        base_parsed_dp = {'timestamp': ts, 'freq': freq, 'market': market,
                          'gen_MW': 0, 'ba_name': self.NAME}

        # collect data
        for fuel_name in ['wind', 'solar']:
            parsed_dp = copy.deepcopy(base_parsed_dp)
            parsed_dp['fuel_name'] = fuel_name
            parsed_dp['gen_MW'] += preparsed_dp[fuel_name]
            parsed_data.append(parsed_dp)

    # return
    return parsed_data
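# Hedged sketch of one <report_data> element this parser consumes,
# reconstructed from the tag names accessed above; real OASIS responses
# carry additional fields, and the values are invented:
_example_report_data = '''
<report_data>
  <renewable_type>Wind</renewable_type>
  <interval_start_gmt>2017-06-01T07:00:00-00:00</interval_start_gmt>
  <value>1234.5</value>
</report_data>
'''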
def format_url(self):
    """Set EIA API URL based on options"""
    if self.options['data'] == 'gen':
        if self.options['forecast']:
            LOGGER.error('Forecast not supported for generation.')
            raise ValueError('Forecast not supported for generation.')
        else:
            self.set_url('series', '-ALL.NG.H')
    elif self.options['data'] == 'load':
        if self.options['forecast']:
            self.set_url('series', '-ALL.DF.H')
        else:
            self.set_url('series', '-ALL.D.H')
    elif self.options['data'] == 'trade':
        if self.options['forecast']:
            LOGGER.error('Forecast not supported for trade.')
            raise ValueError('Forecast not supported for trade.')
        elif self.options['end_at']:
            if self.options['end_at'] > pytz.utc.localize(datetime.utcnow()):
                LOGGER.error('Forecast not supported for trade.')
                raise ValueError('Forecast not supported for trade.')
            else:
                self.set_url('series', '-ALL.TI.H')
        else:
            self.set_url('series', '-ALL.TI.H')
def format_result(self, data):
    """Output EIA API results in pyiso format"""
    # a malformed or error response has no 'series' key
    if 'series' not in data:
        LOGGER.error('Unable to format result for %s' % data.get('request'))
        raise ValueError('Query error for %s:' % data.get('request'))

    market = self._set_market()
    data_type = self._set_data_type()
    data_formatted = []

    if self.options['latest']:
        data_formatted = self._format_latest(data, data_type, market)
    elif self.options['yesterday']:
        data_formatted = self._format_yesterday(data, data_type, market)
    else:
        data_formatted = self._format_general(data, data_type, market)

    if self.options['start_at'] and self.options['end_at']:
        data_formatted = self._format_start_end(data_formatted)
    if self.options['data'] == 'gen':
        data_formatted = self.add_gen_data(data_formatted)
    return data_formatted
def parse_latest_fuel_mix(self, content):
    # handle bad input
    if not content:
        return pd.DataFrame()

    # preliminary parsing; skiprows=2 drops the two preamble rows in this CSV
    df = pd.read_csv(BytesIO(content), header=0, index_col=0, skiprows=2,
                     parse_dates=True)

    # set index
    try:
        df.index = self.utcify_index(df.index)
    except AttributeError:
        LOGGER.error('MISO: Error in source data for generation %s' % content)
        return pd.DataFrame()
    df.index.set_names(['timestamp'], inplace=True)

    # set names and labels
    df['fuel_name'] = df.apply(lambda x: self.fuels[x['CATEGORY']], axis=1)
    df['gen_MW'] = df['ACT']

    # return
    return df[['fuel_name', 'gen_MW']]
def time_as_of(self, content):
    """
    Returns a UTC timestamp if one is found in the html content,
    or None if an error was encountered.
    """
    # soup it up
    soup = BeautifulSoup(content, 'lxml')

    # like 12.11.2015 17:15
    ts_elt = soup.find(id='ctl00_ContentPlaceHolder1_DateAndTime')
    if not ts_elt:
        LOGGER.error('PJM: Timestamp not found in soup:\n%s' % soup)
        return None
    ts_str = ts_elt.string

    # EDT or EST
    tz_elt = ts_elt.next_sibling
    tz_str = tz_elt.string.strip()
    is_dst = tz_str == 'EDT'

    # utcify and return
    return self.utcify(ts_str, is_dst=is_dst)
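# Illustrative markup for time_as_of(), reconstructed from the element id and
# the '12.11.2015 17:15' comment above; the sibling element carrying EDT/EST
# is an assumption about the page layout:
_example_time_html = ('<span id="ctl00_ContentPlaceHolder1_DateAndTime">'
                      '12.11.2015 17:15</span><span>EST</span>')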
def set_ba(self, bal_auth):
    if bal_auth in self.EIA_BAs:
        self.BA = bal_auth
    else:
        LOGGER.error('Unknown BA: %s' % bal_auth)
        raise ValueError('Unknown BA: %s' % bal_auth)
def request(self, url, mode='get', retry_sec=5, retries_remaining=5, **kwargs):
    """
    Get or post to a URL with the provided kwargs.
    Returns the response, or None if an error was encountered.
    If the mode is not 'get' or 'post', raises ValueError.
    """
    # check args
    allowed_modes = ['get', 'post']
    if mode not in allowed_modes:
        raise ValueError('Invalid request mode %s' % mode)

    # check for session
    try:
        session = getattr(self, 'session')
    except AttributeError:
        self.session = requests.Session()
        session = self.session

    # carry out request
    try:
        response = getattr(session, mode)(url, verify=False,
                                          timeout=self.timeout_seconds,
                                          **kwargs)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
        # eg max retries exceeded
        msg = '%s: connection error for %s, %s:\n%s' % (self.NAME, url, kwargs, e)
        LOGGER.error(msg)
        return None

    if response.status_code == 200:
        # success
        LOGGER.debug('%s: request success for %s, %s with cache hit %s'
                     % (self.NAME, url, kwargs,
                        getattr(response, 'from_cache', None)))
    elif response.status_code == 429:
        if retries_remaining > 0:
            # retry on throttle, doubling the wait each time
            LOGGER.warning('%s: retrying in %d seconds (%d retries remaining), '
                           'throttled for %s, %s'
                           % (self.NAME, retry_sec, retries_remaining, url, kwargs))
            sleep(retry_sec)
            retries_remaining -= 1
            return self.request(url, mode=mode, retry_sec=retry_sec * 2,
                                retries_remaining=retries_remaining, **kwargs)
        else:
            # exhausted retries
            LOGGER.warning('%s: exhausted retries for %s, %s'
                           % (self.NAME, url, kwargs))
            return None
    else:
        # non-throttle error
        LOGGER.error('%s: request failure with code %s for %s, %s'
                     % (self.NAME, response.status_code, url, kwargs))

    return response
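# Hedged usage sketch (assumes `client` is an instance exposing the method
# above, with NAME and timeout_seconds set; `process` is a placeholder):
#
#   resp = client.request('http://example.com/data.csv')          # GET
#   resp = client.request('http://example.com/api', mode='post',
#                         data={'key': 'value'})                  # POST
#   if resp is not None and resp.status_code == 200:
#       process(resp.content)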
def _generation_historical(self):
    # set up storage
    parsed_data = []

    # collect data
    request_date = self.options['start_at'].astimezone(self.ca_tz).date()
    local_end_at = self.options['end_at'].astimezone(self.ca_tz).date()
    while request_date <= local_end_at:
        # set up request
        url_file = request_date.strftime('%Y%m%d_DailyRenewablesWatch.txt')
        url = self.base_url_gen + url_file

        # carry out request
        response = self.request(url)
        if not response:
            request_date += timedelta(days=1)
            continue

        dst_error_text = ('The supplied DateTime represents an invalid time. '
                          'For example, when the clock is adjusted forward, '
                          'any time in the period that is skipped is invalid.')
        header_idx = 1
        for part in [1, 2]:  # process both halves of page (i.e. two parts)
            num_data_rows = 24
            # The day transitioning to daylight saving time adds extra
            # erroneous lines of text.
            if part == 1 and dst_error_text in response.text:
                num_data_rows = 29
            df = self.parse_to_df(response.text, nrows=num_data_rows,
                                  header=header_idx, delimiter='\t+')

            # The day transitioning to daylight saving time has errors in
            # part two of the file that need removal.
            if part == 2:
                df = df[df.THERMAL.map(str) != '#VALUE!']

            # combine date with hours to index
            try:
                indexed = self.set_dt_index(df, request_date, df['Hour'])
            except Exception as e:
                LOGGER.error(e)
                continue

            # original header is fuel names
            indexed.rename(columns=self.fuels, inplace=True)

            # remove non-fuel cols
            fuel_cols = list(set(self.fuels.values()) & set(indexed.columns))
            subsetted = indexed[fuel_cols]

            # pivot
            pivoted = self.unpivot(subsetted)
            pivoted.rename(columns={'level_1': 'fuel_name', 0: 'gen_MW'},
                           inplace=True)

            # slice times
            sliced = self.slice_times(pivoted)

            # store
            parsed_data += self.serialize(
                sliced,
                header=['timestamp', 'fuel_name', 'gen_MW'],
                extras={'ba_name': self.NAME,
                        'market': self.MARKET_CHOICES.hourly,
                        'freq': self.FREQUENCY_CHOICES.hourly})

            # If processing the first part, set the header index for second part.
            if part == 1:
                header_idx = num_data_rows + 3

        # finish day
        request_date += timedelta(days=1)

    # return
    return parsed_data