def censusdp1tract_to_sqlite(pudl_settings=None, year=2010):
    """
    Use GDAL's ogr2ogr utility to convert the Census DP1 GeoDB to an SQLite DB.

    The Census DP1 GeoDB is read from the datastore, where it is stored as a
    zipped archive. This archive is unzipped into a temporary directory so
    that ogr2ogr can operate on the ESRI GeoDB and convert it to SQLite. The
    resulting SQLite DB file is put in the PUDL output directory alongside the
    ferc1 and pudl SQLite databases.

    Args:
        pudl_settings (dict): A PUDL settings dictionary.
        year (int): Year of Census data to extract (currently must be 2010).

    Returns:
        None

    """
    if pudl_settings is None:
        pudl_settings = pudl.workspace.setup.get_defaults()
    ds = Datastore(local_cache_path=pudl_settings["data_dir"])

    # If we're in a conda environment, use the version of ogr2ogr that has
    # been installed by conda. Otherwise, try to use a system-installed
    # version at /usr/bin/ogr2ogr. This allows us to avoid simply running
    # whatever program happens to be in the user's path and named ogr2ogr.
    # This is a fragile solution that will not work on all platforms, but
    # should cover conda environments, Docker, and continuous integration on
    # GitHub.
    ogr2ogr = os.environ.get("CONDA_PREFIX", "/usr") + "/bin/ogr2ogr"

    # Extract the zipped GeoDB archive from the Datastore into a temporary
    # directory so that ogr2ogr can operate on it. Output the resulting SQLite
    # database into the user's PUDL workspace. We do not need to keep the
    # unzipped GeoDB around after this conversion. Using a temporary directory
    # makes the cleanup automatic.
    with TemporaryDirectory() as tmpdir:
        # Use the datastore to grab the Census DP1 zipfile
        tmpdir_path = Path(tmpdir)
        zip_ref = ds.get_zipfile_resource("censusdp1tract", year=year)
        extract_root = tmpdir_path / Path(zip_ref.filelist[0].filename)
        out_path = Path(pudl_settings["sqlite_dir"]) / "censusdp1tract.sqlite"
        logger.info("Extracting the Census DP1 GeoDB to %s", out_path)
        zip_ref.extractall(tmpdir_path)
        logger.info("extract_root = %s", extract_root)
        logger.info("out_path = %s", out_path)
        subprocess.run(  # nosec: B603 Trying to use absolute paths.
            [ogr2ogr, str(out_path), str(extract_root)], check=True)
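# Hypothetical usage sketch (not part of the module above): convert the 2010
# Census DP1 GeoDB and list the tables in the resulting SQLite DB. Assumes a
# PUDL workspace has already been set up and GDAL's ogr2ogr is installed.
from pathlib import Path

import sqlalchemy as sa

import pudl

pudl_settings = pudl.workspace.setup.get_defaults()
censusdp1tract_to_sqlite(pudl_settings=pudl_settings, year=2010)

census_engine = sa.create_engine(
    "sqlite:///"
    + str(Path(pudl_settings["sqlite_dir"]) / "censusdp1tract.sqlite")
)
print(sa.inspect(census_engine).get_table_names())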
def extract(epaipm_tables: List[str], ds: Datastore) -> Dict[str, pd.DataFrame]:
    """Extract data from the EPA IPM files.

    Args:
        epaipm_tables (iterable): A tuple or list of table names to extract.
        ds (:class:`Datastore`): Initialized datastore.

    Returns:
        dict: dictionary of DataFrames with extracted (but not yet
        transformed) data from each file.

    """
    # Prep for ingesting EPA IPM
    logger.info('Beginning ETL for EPA IPM.')
    ds = EpaIpmDatastore(ds)

    if "plant_region_map_epaipm" in epaipm_tables:
        # NEEDS is the only IPM data file with multiple sheets. Keeping the
        # overall code simpler but adding this if statement to read both
        # sheets (active and retired by 2021).
        epaipm_tables.remove("plant_region_map_epaipm")
        epaipm_tables.extend([
            "plant_region_map_epaipm_active",
            "plant_region_map_epaipm_retired",
        ])

    return {f: ds.get_dataframe(f) for f in epaipm_tables}
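# Illustrative usage sketch: pull the raw IPM plant/region map through the
# extract step. The Datastore import path and local_cache_path argument follow
# the patterns used elsewhere in this codebase and are assumptions here.
import pudl
from pudl.workspace.datastore import Datastore  # import path assumed

pudl_settings = pudl.workspace.setup.get_defaults()
ds = Datastore(local_cache_path=pudl_settings["data_dir"])
raw_ipm_dfs = extract(epaipm_tables=["plant_region_map_epaipm"], ds=ds)
for name, df in raw_ipm_dfs.items():
    print(f"{name}: {df.shape}")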
def extract(epacems_years, states, ds: Datastore):
    """
    Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): The years of CEMS data to extract, as 4-digit
            integers.
        states (list): The states whose CEMS data we want to extract,
            indicated by 2-letter US state codes.
        ds (:class:`Datastore`): Initialized datastore

    Yields:
        dict: a dictionary with a single EPA CEMS tabular data resource name
        as the key, having the form "hourly_emissions_epacems_YEAR_STATE"
        where YEAR is a 4 digit number and STATE is a lower case 2-letter
        code for a US state. The value is a :class:`pandas.DataFrame`
        containing all the raw EPA CEMS hourly emissions data for the
        indicated state and year.

    """
    ds = EpaCemsDatastore(ds)
    for year in epacems_years:
        # The keys of the us_states dictionary are the state abbrevs
        for state in states:
            partition = EpaCemsPartition(state=state, year=year)
            logger.info(f"Performing ETL for EPA CEMS hourly {state}-{year}")
            # Return a dictionary where the key identifies this dataset
            # (just like the other extract functions), but unlike the
            # others, this is yielded as a generator (and it's a one-item
            # dictionary).
            yield {
                ("hourly_emissions_epacems_" + str(year) + "_" + state.lower()):
                    ds.get_data_frame(partition)
            }
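# Usage sketch for the generator above: extract one year of raw CEMS data for
# two states and report how many records each resource contains. The state
# codes and year shown are illustrative.
import pudl
from pudl.workspace.datastore import Datastore  # import path assumed

pudl_settings = pudl.workspace.setup.get_defaults()
ds = Datastore(local_cache_path=pudl_settings["data_dir"])
for resource in extract(epacems_years=[2019], states=["ID", "ME"], ds=ds):
    for name, df in resource.items():
        print(f"{name}: {len(df)} rows")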
def main():  # noqa: C901
    """Clone the FERC Form 1 FoxPro database into SQLite."""
    # Display logged output from the PUDL package:
    pudl_logger = logging.getLogger("pudl")
    log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
    coloredlogs.install(fmt=log_format, level='INFO', logger=pudl_logger)

    args = parse_command_line(sys.argv)
    if args.logfile:
        file_logger = logging.FileHandler(args.logfile)
        file_logger.setFormatter(logging.Formatter(log_format))
        pudl_logger.addHandler(file_logger)
    with pathlib.Path(args.settings_file).open() as f:
        script_settings = yaml.safe_load(f)

    defaults = pudl.workspace.setup.get_defaults()
    pudl_in = script_settings.get("pudl_in", defaults["pudl_in"])
    pudl_out = script_settings.get("pudl_out", defaults["pudl_out"])

    pudl_settings = pudl.workspace.setup.derive_paths(
        pudl_in=pudl_in, pudl_out=pudl_out)

    script_settings = Ferc1ToSqliteSettings.parse_obj(
        script_settings["ferc1_to_sqlite_settings"])

    pudl_settings["sandbox"] = args.sandbox
    pudl.extract.ferc1.dbf2sqlite(
        tables=script_settings.tables,
        years=script_settings.years,
        refyear=script_settings.refyear,
        pudl_settings=pudl_settings,
        bad_cols=script_settings.bad_cols,
        clobber=args.clobber,
        datastore=Datastore(
            local_cache_path=(Path(pudl_in) / "data"),
            sandbox=args.sandbox))
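# Sketch of the settings dictionary this script expects after yaml.safe_load()
# of the settings file. The nested field names mirror the attributes read from
# Ferc1ToSqliteSettings above; the specific years and table names shown here
# are illustrative only.
example_script_settings = {
    "pudl_in": "~/pudl_work",   # optional; defaults come from the workspace
    "pudl_out": "~/pudl_work",  # optional; defaults come from the workspace
    "ferc1_to_sqlite_settings": {
        "years": [2018, 2019],
        "refyear": 2019,
        "tables": ["f1_respondent_id", "f1_steam"],  # table names illustrative
    },
}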
def _etl_epaipm(etl_params, datapkg_dir, pudl_settings, ds_kwargs):
    """Extract, transform and load CSVs for EPA IPM.

    Args:
        etl_params (dict): ETL parameters required by this data source.
        datapkg_dir (path-like): The location of the directory for this
            package, which will contain a datapackage.json file and a data
            directory in which the CSV files are stored.
        pudl_settings (dict): a dictionary filled with settings that mostly
            describe paths to various resources and outputs.
        ds_kwargs (dict): Keyword arguments for instantiating a PUDL
            datastore, so that the ETL can access the raw input data.

    Returns:
        list: Names of PUDL DB tables output by the ETL for this data source.

    """
    epaipm_dict = _validate_params_epaipm(etl_params)
    epaipm_tables = epaipm_dict['epaipm_tables']
    if not epaipm_tables:
        logger.info('Not ingesting EPA IPM.')
        return []
    static_tables = _load_static_tables_epaipm(datapkg_dir)

    # Extract IPM tables
    ds = pudl.extract.epaipm.EpaIpmDatastore(Datastore(**ds_kwargs))
    epaipm_raw_dfs = pudl.extract.epaipm.extract(epaipm_tables, ds)

    epaipm_transformed_dfs = pudl.transform.epaipm.transform(
        epaipm_raw_dfs, epaipm_tables)

    pudl.load.csv.dict_dump(
        epaipm_transformed_dfs,
        "EPA IPM",
        datapkg_dir=datapkg_dir)

    return list(epaipm_transformed_dfs.keys()) + static_tables
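# Illustrative etl_params for this data source. The "epaipm_tables" key name
# matches the one read from the validated dictionary above; the table list is
# an assumption. datapkg_dir, pudl_settings, and ds_kwargs would come from the
# surrounding ETL pipeline.
example_etl_params = {
    "epaipm_tables": [
        "plant_region_map_epaipm",
        "transmission_single_epaipm",  # table name assumed for illustration
    ],
}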
def main():  # noqa: C901
    """Clone the FERC Form 1 FoxPro database into SQLite."""
    # Display logged output from the PUDL package:
    pudl_logger = logging.getLogger("pudl")
    log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
    coloredlogs.install(fmt=log_format, level='INFO', logger=pudl_logger)

    args = parse_command_line(sys.argv)

    with pathlib.Path(args.settings_file).open() as f:
        script_settings = yaml.safe_load(f)

    try:
        pudl_in = script_settings["pudl_in"]
    except KeyError:
        pudl_in = pudl.workspace.setup.get_defaults()["pudl_in"]
    try:
        pudl_out = script_settings["pudl_out"]
    except KeyError:
        pudl_out = pudl.workspace.setup.get_defaults()["pudl_out"]

    pudl_settings = pudl.workspace.setup.derive_paths(
        pudl_in=pudl_in, pudl_out=pudl_out)

    # Check args for basic validity:
    for table in script_settings['ferc1_to_sqlite_tables']:
        if table not in pc.ferc1_tbl2dbf:
            raise ValueError(
                f"{table} was not found in the list of "
                f"available FERC Form 1 tables.")
    if (script_settings['ferc1_to_sqlite_refyear']
            not in pc.data_years['ferc1']):
        raise ValueError(
            f"Reference year {script_settings['ferc1_to_sqlite_refyear']} "
            f"is outside the range of available FERC Form 1 data "
            f"({min(pc.data_years['ferc1'])}-"
            f"{max(pc.data_years['ferc1'])}).")
    for year in script_settings['ferc1_to_sqlite_years']:
        if year not in pc.data_years['ferc1']:
            raise ValueError(
                f"Requested data from {year} is outside the range of "
                f"available FERC Form 1 data "
                f"({min(pc.data_years['ferc1'])}-"
                f"{max(pc.data_years['ferc1'])}).")
    try:
        # This field is optional and generally unused...
        bad_cols = script_settings['ferc1_to_sqlite_bad_cols']
    except KeyError:
        bad_cols = ()

    pudl_settings["sandbox"] = args.sandbox
    pudl.extract.ferc1.dbf2sqlite(
        tables=script_settings['ferc1_to_sqlite_tables'],
        years=script_settings['ferc1_to_sqlite_years'],
        refyear=script_settings['ferc1_to_sqlite_refyear'],
        pudl_settings=pudl_settings,
        bad_cols=bad_cols,
        clobber=args.clobber,
        datastore=Datastore(
            local_cache_path=(Path(pudl_in) / "data"),
            sandbox=args.sandbox))
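# Sketch of the flat settings structure this older version of the script
# expects after yaml.safe_load(). The key names come from the dictionary
# lookups above; the years, reference year, and table names are illustrative.
example_script_settings = {
    "pudl_in": "~/pudl_work",   # optional
    "pudl_out": "~/pudl_work",  # optional
    "ferc1_to_sqlite_refyear": 2019,
    "ferc1_to_sqlite_years": [2018, 2019],
    "ferc1_to_sqlite_tables": ["f1_respondent_id", "f1_steam"],
    # "ferc1_to_sqlite_bad_cols" is optional and generally omitted.
}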
def _etl_epacems(etl_params, datapkg_dir, pudl_settings, ds_kwargs):
    """Extract, transform and load CSVs for EPA CEMS.

    Args:
        etl_params (dict): ETL parameters required by this data source.
        datapkg_dir (path-like): The location of the directory for this
            package, which will contain a datapackage.json file and a data
            directory in which the CSV files are stored.
        pudl_settings (dict): a dictionary filled with settings that mostly
            describe paths to various resources and outputs.
        ds_kwargs (dict): Keyword arguments for instantiating a PUDL
            datastore, so that the ETL can access the raw input data.

    Returns:
        list: Names of PUDL DB tables output by the ETL for this data source.

    """
    epacems_dict = pudl.etl._validate_params_epacems(etl_params)
    epacems_years = epacems_dict['epacems_years']
    epacems_states = epacems_dict['epacems_states']
    # If we're not doing CEMS, just stop here to avoid printing messages like
    # "Reading EPA CEMS data...", which could be confusing.
    if not epacems_states or not epacems_years:
        logger.info('Not ingesting EPA CEMS.')
        return []

    # NOTE: This is a generator for raw dataframes
    epacems_raw_dfs = pudl.extract.epacems.extract(
        epacems_years, epacems_states, Datastore(**ds_kwargs))
    # NOTE: This is a generator for transformed dataframes
    epacems_transformed_dfs = pudl.transform.epacems.transform(
        epacems_raw_dfs=epacems_raw_dfs,
        datapkg_dir=datapkg_dir)

    logger.info("Loading tables from EPA CEMS into PUDL:")
    if logger.isEnabledFor(logging.INFO):
        start_time = time.monotonic()
    epacems_tables = []
    # Run the CEMS generator dfs through the load step
    for transformed_df_dict in epacems_transformed_dfs:
        pudl.load.csv.dict_dump(
            transformed_df_dict,
            "EPA CEMS",
            datapkg_dir=datapkg_dir)
        epacems_tables.append(list(transformed_df_dict.keys())[0])
    if logger.isEnabledFor(logging.INFO):
        delta_t = time.strftime(
            "%H:%M:%S", time.gmtime(time.monotonic() - start_time))
        time_message = f"Loading EPA CEMS took {delta_t}"
        logger.info(time_message)
        start_time = time.monotonic()

    return epacems_tables
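# Illustrative etl_params for this function. The key names follow the
# dictionary returned by _validate_params_epacems above; the year and state
# values are examples only.
example_etl_params = {
    "epacems_years": [2019],
    "epacems_states": ["ID", "ME"],
}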
def etl_ferc714(self, update: bool = False):
    """
    A single function that runs the temporary FERC 714 ETL and sets all DFs.

    This is an interim solution, so that we can have a (relatively) standard
    way of accessing the FERC 714 data prior to getting it integrated into
    the PUDL DB. Some of these are not yet cleaned up, but there are dummy
    transform functions which pass through the raw DFs with some minor
    alterations, so all the data is available as it exists right now.

    An attempt to access *any* of the dataframes results in all of them being
    populated, since generating all of them is almost the same amount of work
    as generating one of them.

    Args:
        update: Whether to overwrite the existing dataframes if they exist.

    """
    if isinstance(self.ds, Datastore):
        pass
    elif self.ds is None:
        pudl_settings = pudl.workspace.setup.get_defaults()
        if pudl_settings["pudl_in"] is None:
            raise FileNotFoundError(
                "In order to run the ad-hoc FERC-714 ETL PUDL needs a valid "
                "Datastore, but none was found. Run 'pudl_setup --help' "
                "to see how to create one.")
        self.ds = Datastore(local_cache_path=pudl_settings["data_dir"])
    else:
        raise TypeError(
            "PudlTabl needs a PUDL Datastore object, but we got "
            f"a {type(self.ds)}.")

    if update or self._dfs["respondent_id_ferc714"] is None:
        logger.warning("Running the interim FERC 714 ETL process!")
        ferc714_raw_dfs = pudl.extract.ferc714.extract(ds=self.ds)
        ferc714_tfr_dfs = pudl.transform.ferc714.transform(ferc714_raw_dfs)
        self._dfs.update(ferc714_tfr_dfs)
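# Hedged usage sketch: run the interim FERC 714 ETL through a PudlTabl output
# object. The PudlTabl import path and constructor arguments shown here are
# assumptions based on typical PUDL usage, not taken from the code above.
import sqlalchemy as sa

import pudl

pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)
pudl_out.etl_ferc714()  # populates all of the interim FERC 714 dataframes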
def extract(epacems_years, states, ds: Datastore):
    """
    Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): The years of CEMS data to extract, as 4-digit
            integers.
        states (list): The states whose CEMS data we want to extract,
            indicated by 2-letter US state codes.
        ds (:class:`Datastore`): Initialized datastore

    Yields:
        pandas.DataFrame: A single state-year of EPA CEMS hourly emissions
        data.

    """
    ds = EpaCemsDatastore(ds)
    for year in epacems_years:
        for state in states:
            partition = EpaCemsPartition(state=state, year=year)
            logger.info(f"Processing EPA CEMS hourly data for {state}-{year}")
            # We have to assign the reporting year for partitioning purposes
            df = ds.get_data_frame(partition).assign(year=year)
            yield df
def etl_eia861(self, update: bool = False):
    """
    A single function that runs the temporary EIA 861 ETL and sets all DFs.

    This is an interim solution that provides a (somewhat) standard way of
    accessing the EIA 861 data prior to its being fully integrated into the
    PUDL database. Accessing any one of the dataframes causes all of them to
    be populated. Only the tables that have actual transform functions are
    included, and as new transform functions are completed they will need to
    be added to the list below. Surely there is a way to do this
    automatically / magically, but that's beyond my knowledge right now.

    Args:
        update: Whether to overwrite the existing dataframes if they exist.

    """
    if isinstance(self.ds, Datastore):
        pass
    elif self.ds is None:
        pudl_settings = pudl.workspace.setup.get_defaults()
        if pudl_settings["pudl_in"] is None:
            raise FileNotFoundError(
                "In order to run the ad-hoc EIA-861 ETL PUDL needs a valid "
                "Datastore, but none was found. Run 'pudl_setup --help' "
                "to see how to create one.")
        self.ds = Datastore(local_cache_path=pudl_settings["data_dir"])
    else:
        raise TypeError(
            "PudlTabl needs a PUDL Datastore object, but we got "
            f"a {type(self.ds)}.")

    if update or self._dfs["balancing_authority_eia861"] is None:
        logger.warning("Running the interim EIA 861 ETL process!")
        eia861_raw_dfs = (
            pudl.extract.eia861.Extractor(self.ds)
            .extract(year=pc.WORKING_PARTITIONS["eia861"]["years"])
        )
        self._dfs.update(pudl.transform.eia861.transform(eia861_raw_dfs))
def test_prod_datapackages(self):
    """All datasets point to valid descriptors with 1 or more resources."""
    ds = Datastore(sandbox=False)
    for dataset in ds.get_known_datasets():
        desc = ds.get_datapackage_descriptor(dataset)
        assert list(desc.get_resources())
def _etl_eia(
    etl_settings: EiaSettings,
    ds_kwargs: Dict[str, Any]
) -> Dict[str, pd.DataFrame]:
    """Extract, transform and load CSVs for the EIA datasets.

    Args:
        etl_settings: Validated ETL parameters required by this data source.
        ds_kwargs: Keyword arguments for instantiating a PUDL datastore,
            so that the ETL can access the raw input data.

    Returns:
        A dictionary of EIA dataframes ready for loading into the PUDL DB.

    """
    eia860_tables = etl_settings.eia860.tables
    eia860_years = etl_settings.eia860.years
    eia860m = etl_settings.eia860.eia860m
    eia923_tables = etl_settings.eia923.tables
    eia923_years = etl_settings.eia923.years

    if (
        (not eia923_tables or not eia923_years)
        and (not eia860_tables or not eia860_years)
    ):
        logger.info('Not loading EIA.')
        return {}

    # Generate dataframes for the static EIA tables
    out_dfs = _read_static_tables_eia()

    ds = Datastore(**ds_kwargs)
    # Extract EIA forms 923, 860
    eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(
        year=eia923_years)
    eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(
        year=eia860_years)
    # If we are trying to add the EIA 860M YTD data, extract it and append
    if eia860m:
        eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
            year_month=pc.WORKING_PARTITIONS['eia860m']['year_month'])
        eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
            eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs)

    # Transform EIA forms 923, 860
    eia860_transformed_dfs = pudl.transform.eia860.transform(
        eia860_raw_dfs, eia860_tables=eia860_tables)
    eia923_transformed_dfs = pudl.transform.eia923.transform(
        eia923_raw_dfs, eia923_tables=eia923_tables)
    # Create an EIA transformed dfs dictionary
    eia_transformed_dfs = eia860_transformed_dfs.copy()
    eia_transformed_dfs.update(eia923_transformed_dfs.copy())

    # Convert types
    eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
        eia_transformed_dfs, 'eia')

    entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
        eia_transformed_dfs,
        eia860_years=eia860_years,
        eia923_years=eia923_years,
        eia860m=eia860m,
    )
    # Convert types
    entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')

    for table in entities_dfs:
        entities_dfs[table] = PUDL_META.get_resource(table).encode(
            entities_dfs[table])

    out_dfs.update(entities_dfs)
    out_dfs.update(eia_transformed_dfs)
    return out_dfs
def etl_epacems(
    etl_settings: EpaCemsSettings,
    pudl_settings: Dict[str, Any],
    ds_kwargs: Dict[str, Any],
) -> None:
    """Extract, transform and load CSVs for EPA CEMS.

    Args:
        etl_settings: Validated ETL parameters required by this data source.
        pudl_settings: a dictionary filled with settings that mostly describe
            paths to various resources and outputs.
        ds_kwargs: Keyword arguments for instantiating a PUDL datastore,
            so that the ETL can access the raw input data.

    Returns:
        Unlike the other ETL functions, the EPA CEMS ETL writes its output to
        Parquet as it goes, since the dataset is too large to hold in memory.
        So it doesn't return a dictionary of dataframes.

    """
    epacems_years = etl_settings.years
    epacems_states = etl_settings.states
    # If we're not doing CEMS, just stop here to avoid printing messages like
    # "Reading EPA CEMS data...", which could be confusing.
    if not epacems_states or not epacems_years:
        logger.info('Not ingesting EPA CEMS.')
        return

    pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

    # Verify that we have a PUDL DB with plant attributes:
    inspector = sa.inspect(pudl_engine)
    if "plants_eia860" not in inspector.get_table_names():
        raise RuntimeError(
            "No plants_eia860 available in the PUDL DB! Have you run the ETL? "
            f"Trying to access PUDL DB: {pudl_engine}"
        )

    eia_plant_years = pd.read_sql(
        """
        SELECT DISTINCT strftime('%Y', report_date) AS year
        FROM plants_eia860
        ORDER BY year ASC
        """, pudl_engine).year.astype(int)
    missing_years = list(set(epacems_years) - set(eia_plant_years))
    if missing_years:
        logger.info(
            f"EPA CEMS years with no EIA plant data: {missing_years} "
            "Some timezones may be estimated based on plant state."
        )

    # NOTE: This is a generator for raw dataframes
    epacems_raw_dfs = pudl.extract.epacems.extract(
        epacems_years, epacems_states, Datastore(**ds_kwargs))

    # NOTE: This is a generator for transformed dataframes
    epacems_transformed_dfs = pudl.transform.epacems.transform(
        epacems_raw_dfs=epacems_raw_dfs,
        pudl_engine=pudl_engine,
    )

    logger.info("Processing EPA CEMS data and writing it to Apache Parquet.")
    if logger.isEnabledFor(logging.INFO):
        start_time = time.monotonic()

    # Run the CEMS generator dfs through the load step
    for df in epacems_transformed_dfs:
        pudl.load.parquet.epacems_to_parquet(
            df,
            root_path=Path(pudl_settings["parquet_dir"]) / "epacems",
        )

    if logger.isEnabledFor(logging.INFO):
        delta_t = time.strftime(
            "%H:%M:%S", time.gmtime(time.monotonic() - start_time))
        time_message = f"Processing EPA CEMS took {delta_t}"
        logger.info(time_message)
        start_time = time.monotonic()
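# Hedged usage sketch: run the CEMS ETL for a single small state-year subset.
# EpaCemsSettings is assumed to accept `years` and `states` keyword arguments
# matching the attributes accessed above; those keyword names, and the
# requirement that the PUDL DB already exist, are assumptions.
import pudl

pudl_settings = pudl.workspace.setup.get_defaults()
ds_kwargs = {"local_cache_path": pudl_settings["data_dir"]}
cems_settings = EpaCemsSettings(years=[2019], states=["ID"])
etl_epacems(cems_settings, pudl_settings, ds_kwargs)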
def _etl_eia(etl_params, datapkg_dir, pudl_settings, ds_kwargs):
    """Extract, transform and load CSVs for the EIA datasets.

    Args:
        etl_params (dict): ETL parameters required by this data source.
        datapkg_dir (path-like): The location of the directory for this
            package, which will contain a datapackage.json file and a data
            directory in which the CSV files are stored.
        pudl_settings (dict): a dictionary filled with settings that mostly
            describe paths to various resources and outputs.
        ds_kwargs (dict): Keyword arguments for instantiating a PUDL
            datastore, so that the ETL can access the raw input data.

    Returns:
        list: Names of PUDL DB tables output by the ETL for this data source.

    """
    eia_inputs = _validate_params_eia(etl_params)
    eia860_tables = eia_inputs["eia860_tables"]
    eia860_years = eia_inputs["eia860_years"]
    eia860_ytd = eia_inputs["eia860_ytd"]
    eia923_tables = eia_inputs["eia923_tables"]
    eia923_years = eia_inputs["eia923_years"]

    if ((not eia923_tables or not eia923_years)
            and (not eia860_tables or not eia860_years)):
        logger.info('Not loading EIA.')
        return []

    # Generate CSVs for the static EIA tables, return the list of tables
    static_tables = _load_static_tables_eia(datapkg_dir)

    ds = Datastore(**ds_kwargs)
    # Extract EIA forms 923, 860
    eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(
        year=eia923_years)
    eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(
        year=eia860_years)
    # If we are trying to add the EIA 860M YTD data, extract it and append
    if eia860_ytd:
        eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
            year_month=pc.working_partitions['eia860m']['year_month'])
        eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
            eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs)

    # Transform EIA forms 923, 860
    eia860_transformed_dfs = pudl.transform.eia860.transform(
        eia860_raw_dfs, eia860_tables=eia860_tables)
    eia923_transformed_dfs = pudl.transform.eia923.transform(
        eia923_raw_dfs, eia923_tables=eia923_tables)
    # Create an EIA transformed dfs dictionary
    eia_transformed_dfs = eia860_transformed_dfs.copy()
    eia_transformed_dfs.update(eia923_transformed_dfs.copy())

    # Add EIA-EPA crosswalk tables
    eia_transformed_dfs = _add_eia_epacems_crosswalk(eia_transformed_dfs)

    # Convert types
    eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
        eia_transformed_dfs, 'eia')

    entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
        eia_transformed_dfs,
        eia860_years=eia860_years,
        eia923_years=eia923_years,
        eia860_ytd=eia860_ytd,
    )
    # Convert types
    entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')

    # Compile transformed dfs for loading...
    transformed_dfs = {"Entities": entities_dfs, "EIA": eia_transformed_dfs}
    # Load step
    for data_source, transformed_df in transformed_dfs.items():
        pudl.load.csv.dict_dump(
            transformed_df,
            data_source,
            datapkg_dir=datapkg_dir)

    return (
        list(eia_transformed_dfs.keys())
        + list(entities_dfs.keys())
        + static_tables
    )
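# Illustrative etl_params for the EIA ETL. The key names mirror those pulled
# out of eia_inputs above (whether the raw etl_params dictionary uses exactly
# these keys is an assumption), and the tables and years are examples only.
example_etl_params = {
    "eia860_tables": ["generators_eia860"],
    "eia860_years": [2018, 2019],
    "eia860_ytd": False,
    "eia923_tables": ["generation_fuel_eia923"],
    "eia923_years": [2018, 2019],
}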
def test_sandbox_datapackages(self):
    """All datasets point to valid descriptors, each with non-zero resources."""
    ds = Datastore(sandbox=True)
    for dataset in ds.get_known_datasets():
        desc = ds.get_datapackage_descriptor(dataset)
        assert list(desc.get_resources())