def test_build_db_ts_range_unsupported_frequency():
    cur_date = datetime.datetime(2020, 8, 5)
    request_year = 2019
    frequency = 'bad_frequency_string'
    with pytest.raises(NotImplementedError):
        utils.fetch_ts_start_end(cur_date, request_year, frequency)

def test_build_db_ts_range_bad_year():
    cur_date = datetime.datetime(2020, 8, 5)
    request_year = 2021  # later than current date
    frequency = 'H'
    with pytest.raises(ValueError):
        utils.fetch_ts_start_end(cur_date=cur_date, request_year=request_year, frequency=frequency)

def test_build_db_ts_range_general():
    cur_date = datetime.datetime(2020, 8, 5)
    request_year = 2019
    frequency = 'H'
    exp_start = f'{request_year}-01-01 00:00:00'
    exp_end = f'{request_year}-12-31 23:00:00'
    actual = utils.fetch_ts_start_end(cur_date, request_year, frequency)
    expected = (exp_start, exp_end)
    # sort for testing ease
    expected = sorted(expected)
    actual = sorted(actual)
    assert actual == expected
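
# For reference, a minimal sketch of the contract the three tests above pin
# down. This is an assumption based only on these tests; the real
# utils.fetch_ts_start_end may support more frequencies than 'H', and this
# helper name is illustrative only.
def _fetch_ts_start_end_sketch(cur_date, request_year, frequency):
    if frequency != 'H':  # hourly is the only frequency exercised here
        raise NotImplementedError(f'Unsupported frequency: {frequency}')
    if request_year > cur_date.year:
        raise ValueError(f'Requested year {request_year} is in the future')
    # First and last hourly timestamps of the requested year
    return f'{request_year}-01-01 00:00:00', f'{request_year}-12-31 23:00:00'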

def construct_database(self):
    """Constructs the database from raw datafiles and saves it in UTC."""
    # Determine expected timestamps for the dataset
    self.curr_date = datetime.now(tz=pytz.timezone('US/Eastern'))  # update current time after download
    start, end = utils.fetch_ts_start_end(self.curr_date, self.year, self.dataset_details.f)
    timestamps = pd.date_range(start, end, freq=self.dataset_details.f, tz='US/Eastern')

    # Construct database
    print('Constructing DB...')
    files = sorted(pl.Path(self.download_dir).glob('*.csv'))
    if not files:
        print('Warning: No raw datafiles found!')
        return  # skip the rest
    else:
        # Concatenate all CSVs into a DataFrame
        frames = [pd.read_csv(file, index_col=0) for file in files]
        df = pd.concat(frames, sort=False)
        df.index = pd.to_datetime(df.index)

        # If self.dataset_details.col is None then there is no need to pivot
        if ('Time Zone' in df.columns) or (self.dataset_details.col is None):
            # Make index timezone aware (US/Eastern)
            if 'Time Zone' in df.columns:
                # True flags the DST (EDT) occurrence of the repeated fall-back hour
                df = df.tz_localize('US/Eastern', ambiguous=df['Time Zone'] == 'EDT')
            elif self.dataset_details.col is None:
                df = df.tz_localize('US/Eastern', ambiguous='infer')
            # Convert to UTC so that pivot can work without throwing an error
            # for duplicate indices (due to DST)
            df = df.sort_index(axis='index').tz_convert('UTC')
            if 'Time Zone' in df.columns:
                print('Pivoting Data...')
                df = df.pivot(columns=self.dataset_details.col,
                              values=self.dataset_details.val_col)  # make columns
            print('Resampling...')
            df = df.resample(self.dataset_details.f).mean()
            df = utils.check_and_interpolate_nans(df)
        # When there is no timezone column and the data is 'stacked'
        else:
            print('Data is stacked...')
            frames = []
            for ctype, subdf in df.groupby(by=self.dataset_details.col):
                subdf = subdf.tz_localize('US/Eastern', ambiguous='infer').tz_convert('UTC')
                subdf = subdf.resample(self.dataset_details.f).mean()
                subdf = utils.check_and_interpolate_nans(subdf)
                subdf.loc[:, self.dataset_details.col] = ctype
                frames.append(subdf)
            df = pd.concat(frames)

            # Check that every region/interface flow name appears the same number of times
            if df[self.dataset_details.col].value_counts().nunique() > 1:
                print('Warning: There seems to be underlying missing data.\n{}'.format(
                    df[self.dataset_details.col].value_counts()))

        if self.dataset_details.type == 'load':
            df['NYCA'] = df.sum(axis='columns')  # statewide load based on interpolated values
        if self.dataset_details.type == 'interface_flows':
            # Remap external interface names to match the website
            df['Interface Name'] = df['Interface Name'].map(EXTERNAL_TFLOWS_MAP).fillna(df['Interface Name'])
            df = df.rename(columns={'Flow (MWH)': 'Flow (MW)',
                                    'Positive Limit (MWH)': 'Positive Limit (MW)',
                                    'Negative Limit (MWH)': 'Negative Limit (MW)'})

        # Convert back to US/Eastern to select the time period based on local time
        df = df.tz_convert('US/Eastern')
        df = df.loc[start:end]

        # Check that all the expected timestamps exist
        assert timestamps[~timestamps.isin(df.index)].empty, 'Index is missing data! {}'.format(
            timestamps[~timestamps.isin(df.index)])
        assert not df.isnull().values.any(), 'NaNs found! Resampling and interpolation should have handled this.'

        # Save and return the dataset in UTC
        df = df.tz_convert('UTC')
        filepath = pl.Path(self.output_dir, f'{self.year}_{self.dataset}.pkl')
        df.to_pickle(filepath)  # pickle preserves timezone and frequency information
        if self.create_csvs:
            df.to_csv(filepath.with_suffix('.csv'))  # write the CSV alongside the pickle
        self.df = df
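
# A toy illustration (not part of the class) of the ambiguous-hour handling
# above: on the November fall-back date the 01:00 local hour occurs twice,
# and pandas needs a per-row DST flag, like the one the raw 'Time Zone'
# column provides, to localize it unambiguously. The function and variable
# names here are illustrative only.
def _dst_localization_demo():
    import numpy as np
    import pandas as pd
    # Two raw rows carrying the same naive local timestamp
    idx = pd.to_datetime(['2019-11-03 01:00', '2019-11-03 01:00'])
    tz_flags = np.array(['EDT', 'EST'])  # values as found in the raw files
    # True marks the DST (EDT) occurrence, mirroring ambiguous=... above
    localized = idx.tz_localize('US/Eastern', ambiguous=(tz_flags == 'EDT'))
    print(localized)
    # DatetimeIndex(['2019-11-03 01:00:00-04:00', '2019-11-03 01:00:00-05:00'], ...)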