Example #1
# Imports used by this snippet; the module paths below reflect the PUDL
# package layout this example appears to come from and may differ in
# newer releases:
import logging
import pathlib
import sys
import warnings

import coloredlogs

import pudl
import pudl.constants as pc
from pudl.datastore import datastore


def main():
    """Manage and update the PUDL datastore."""
    # Display logged output from the PUDL package:
    logger = logging.getLogger(pudl.__name__)
    log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
    coloredlogs.install(fmt=log_format, level='INFO', logger=logger)

    args = parse_command_line(sys.argv)

    # Generate a list of valid years of data to download for each data source.
    # If no years were specified, use the full set of valid years.
    # If years were specified, keep only the years which are valid for that
    # data source, and optionally output a message saying which years are
    # being ignored because they aren't valid.
    years_by_source = {}
    for source in args.sources:
        if source == "epaipm":
            # EPA IPM data isn't organized by year; use the full set of
            # valid "years" regardless of what was requested:
            years_by_source[source] = pc.data_years[source]
            continue
        if not args.years:
            years_by_source[source] = pc.data_years[source]
        else:
            years_by_source[source] = [
                int(year) for year in args.years
                if int(year) in pc.data_years[source]
            ]
            bad_years = [
                int(year) for year in args.years
                if int(year) not in pc.data_years[source]
            ]
            if args.verbose and bad_years:
                warnings.warn(f"Invalid {source} years ignored: {bad_years}.")

    datastore.parallel_update(
        sources=args.sources,
        years_by_source=years_by_source,
        states=args.states,
        data_dir=str(pathlib.Path(args.datastore_dir, "data")),
        clobber=args.clobber,
        unzip=args.unzip,
        dl=args.download,
    )
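
The snippet above calls a parse_command_line helper that isn't shown. The sketch below is a hypothetical reconstruction based solely on the attributes the code reads from args (sources, years, states, datastore_dir, clobber, unzip, download, and verbose); the flag names and defaults are assumptions, not the actual PUDL script:

import argparse


def parse_command_line(argv):
    """Hypothetical reconstruction of the argument parser used above."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-s', '--sources', nargs='+', default=[],
                        help='Data sources to update, e.g. eia860 ferc1.')
    parser.add_argument('-y', '--years', nargs='+', default=[],
                        help='Years of data to download (defaults to all).')
    parser.add_argument('-t', '--states', nargs='+', default=[],
                        help='States for which EPA CEMS data is fetched.')
    parser.add_argument('-d', '--datastore_dir', default='.',
                        help='Directory in which to build the datastore.')
    parser.add_argument('-c', '--clobber', action='store_true',
                        help='Overwrite any pre-existing files.')
    parser.add_argument('-u', '--unzip', action='store_true',
                        help='Unzip the downloaded archives.')
    parser.add_argument('--no-download', dest='download',
                        action='store_false', default=True,
                        help='Organize previously downloaded files only.')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Warn about requested years that are invalid.')
    # main() passes sys.argv, so strip the program name before parsing:
    return parser.parse_args(argv[1:])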
Example #2
# Imports assumed by this test snippet (module paths may differ by version):
import os

import pudl
from pudl.datastore import datastore


def test_datastore(pudl_settings_fixture, data_scope):
    """Download sample data for each available data source."""
    sources_to_update = ['eia860', 'eia923', 'epaipm']
    years_by_source = {
        'eia860': data_scope['eia860_years'],
        'eia923': data_scope['eia923_years'],
        'epacems': [],
        'epaipm': [
            None,
        ],
        'ferc1': [],
    }
    # Sadly, FERC & EPA only provide access to their data via FTP, and it's
    # not possible to use FTP from within the Travis CI environment:
    if os.getenv('TRAVIS'):
        states = []
    else:
        # Idaho has the least data of any CEMS state.
        states = data_scope['epacems_states']
        sources_to_update.extend(['ferc1', 'epacems'])
        years_by_source['ferc1'] = data_scope['ferc1_years']
        years_by_source['epacems'] = data_scope['epacems_years']

    datastore.parallel_update(
        sources=sources_to_update,
        years_by_source=years_by_source,
        states=states,
        data_dir=pudl_settings_fixture['data_dir'],
    )

    pudl.helpers.verify_input_files(
        ferc1_years=years_by_source['ferc1'],
        epacems_years=years_by_source['epacems'],
        epacems_states=states,
        # Currently no mechanism for automatically verifying EPA IPM files...
        pudl_settings=pudl_settings_fixture,
    )
    data_dir = pudl_settings_fixture['data_dir']
    pudl.extract.eia860.Extractor(data_dir).verify_years(
        years_by_source['eia860'])
    pudl.extract.eia923.Extractor(data_dir).verify_years(
        years_by_source['eia923'])
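
The data_scope fixture consumed above isn't shown. Judging by the keys the test reads, it is a dict mapping each data source to the years (and states) covered by the test run. A hypothetical sketch, with purely illustrative values:

import pytest


@pytest.fixture(scope='session')
def data_scope():
    """Hypothetical sketch of the fixture the test above consumes."""
    return {
        'eia860_years': [2018],    # illustrative, not the real test years
        'eia923_years': [2018],
        'ferc1_years': [2018],
        'epacems_years': [2018],
        'epacems_states': ['ID'],  # Idaho has the least CEMS data
    }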
Example #3
# Imports assumed by this snippet (module paths may differ by version):
import logging
import os
import pathlib
import shutil

import pudl
from pudl.datastore import datastore

logger = logging.getLogger(__name__)


def datastore_fixture(pudl_settings_fixture, data_scope):
    """
    Populate a minimal PUDL datastore for the Travis CI tests to access.

    Downloads from FERC & EPA appear to be throttled, so we can no longer
    pull the data from them directly. To deal with that we have checked a
    small amount of FERC Form 1 and EPA CEMS data into the PUDL repository.

    For the EIA 860 and EIA 923 data, we can use the datastore management
    library directly. It's important that we do this from within the tests,
    and not by using the update_datastore script during setup for the Travis
    tests, for two reasons:
     * the pudl module is not installed and importable until after the tox
       run has begun, so unless we import pudl from the filesystem, we can't
       use the script beforehand... and doing so would contaminate the
       environment.
     * Calling the datastore management functions from within the tests will
       add that code to our measurement of test coverage, which is good!
    """
    sources_to_update = [
        'epaipm',
        'eia860',
        'eia923'
    ]

    years_by_source = {
        'eia860': data_scope['eia860_years'],
        'eia923': data_scope['eia923_years'],
        'epacems': [],
        'epaipm': [None],
        'ferc1': [],
    }
    # Sadly, FERC & EPA only provide access to their data via FTP, and it's
    # not possible to use FTP from within the Travis CI environment:
    if os.getenv('TRAVIS'):
        logger.info(f"Building a special Travis CI datastore for PUDL.")
        # Simulate having downloaded the data...
        dl_dir = pathlib.Path(pudl_settings_fixture['data_dir'], 'tmp')
        # Make sure the staging directory exists before copying into it:
        dl_dir.mkdir(parents=True, exist_ok=True)

        epacems_files = (
            pathlib.Path(os.getenv('TRAVIS_BUILD_DIR'),
                         'test/data/epa/cems/epacems2018/').
            glob('*.zip')
        )
        # Copy the files over to the test-run proto-datastore:
        for file in epacems_files:
            logger.info(
                f"Faking download of {os.path.basename(file)} to {dl_dir}")
            shutil.copy(file, dl_dir)

        # The datastore knows what to do with a file it finds in this dir:
        datastore.organize(
            source='epacems',
            year=2018,
            states=['ID'],
            data_dir=pudl_settings_fixture['data_dir'],
            unzip=True
        )

        ferc1_files = (
            pathlib.Path(os.getenv('TRAVIS_BUILD_DIR'),
                         'test/data/ferc/form1/f1_2018/').
            glob('*.zip')
        )
        # Copy the files over to the test-run proto-datastore:
        for file in ferc1_files:
            logger.info(f"Faking the download of {file} to {dl_dir}")
            shutil.copy(file, dl_dir)

        # The datastore knows what to do with a file it finds in this dir:
        datastore.organize(
            source='ferc1',
            year=2018,
            states=None,
            data_dir=pudl_settings_fixture['data_dir'],
            unzip=True
        )
        states = []
    else:
        sources_to_update.extend(["ferc1", "epacems"])
        years_by_source["ferc1"] = data_scope["ferc1_years"]
        years_by_source["epacems"] = data_scope["epacems_years"]
        states = ["id"]

    # Download the test year for each dataset that we're downloading...
    datastore.parallel_update(
        sources=sources_to_update,
        years_by_source=years_by_source,
        states=states,
        data_dir=pudl_settings_fixture["data_dir"],
    )

    pudl.helpers.verify_input_files(
        ferc1_years=years_by_source["ferc1"],
        epacems_years=years_by_source["epacems"],
        epacems_states=states,
        pudl_settings=pudl_settings_fixture,
    )
    data_dir = pudl_settings_fixture['data_dir']
    pudl.extract.eia860.Extractor(data_dir).verify_years(
        years_by_source['eia860'])
    pudl.extract.eia923.Extractor(data_dir).verify_years(
        years_by_source['eia923'])
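
Since its arguments are themselves fixtures, datastore_fixture is presumably registered with @pytest.fixture in the original conftest. Assuming that, a dependent test only needs to list it as a parameter; this consuming test is illustrative, not from the PUDL suite:

import pathlib


def test_datastore_populated(datastore_fixture, pudl_settings_fixture):
    """Illustrative test: listing the fixture populates the datastore."""
    data_dir = pathlib.Path(pudl_settings_fixture['data_dir'])
    # After the fixture has run, the datastore directory should exist
    # and contain at least one downloaded or copied file:
    assert data_dir.exists()
    assert any(data_dir.rglob('*'))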