def test_fetch_hitemp_all_molecules(molecule, verbose=False, *args, **kwargs):
    """Check that the complete HITEMP line list of ``molecule`` can be fetched.

    The line list is retrieved with ``fetch_hitemp``. The test then verifies
    that a ``HITEMP-{molecule}`` databank is listed by ``getDatabankList`` and
    that the number of rows equals ``INFO_HITEMP_LINE_COUNT[molecule]``.

    .. warning::
        this downloads gigabytes of data. It is unselected by default by
        Pytest (see radis/setup.cfg)

    The bz2 compression factor gives about 17 MB / million lines :

    - OH (57 k lines) is only 900 kb
    - CO (0.7 M lines) is only ~14 Mb
    - CH4 (114 M lines) is 435 MB

    If it fails, check the databases downloaded in ~/.radisdb

    Notes
    -----
    Performance tests of chunksize tested on CO :

    - chunksize=1000000  > 22s , 1 iteration ~ 22s
    - chunksize=100000   > 18s , 1 iteration ~ 4s
    - chunksize=50000    > 19s , 1 iteration ~ 2s
    - chunksize=1000    --> 90s , 1 iteration << 1s
    """
    line_list = fetch_hitemp(molecule, verbose=verbose)

    # The databank must have been registered under its default name
    registered_databanks = getDatabankList()
    assert f"HITEMP-{molecule}" in registered_databanks

    # ... and every expected line must be present
    expected_line_count = INFO_HITEMP_LINE_COUNT[molecule]
    assert len(line_list) == expected_line_count
def test_fetch_hitemp_OH(verbose=True, *args, **kwargs):
    """Check that the HITEMP OH database is downloaded and parsed properly.

    Good to test ``fetch_hitemp``: ``13_HITEMP2020.par.bz2`` is only 900 kb,
    so it can be run on online tests without burning the planet.

    .. warning::
        if using the default ``chunksize=100000``, it uncompresses in one pass
        (only 57k lines for OH) and we cannot test that appending to the same
        HDF5 works. So here we use a smaller chunksize of 20,000.
    """
    # Force a rebuild, in several chunks, so the HDF5 append path is exercised
    oh_lines = fetch_hitemp(
        "OH",
        cache="regen",
        chunksize=20000,
        verbose=3 * verbose,
    )

    registered_databanks = getDatabankList()
    assert "HITEMP-OH" in registered_databanks
    assert len(oh_lines) == 57019

    # Load again and make sure it works (ex: metadata properly loaded etc.):
    fetch_hitemp("OH")
def build_test_databases(verbose=True):
    """Build test databases and add them in ~/.radis.

    Generate the file if it doesnt exist.

    In particular:

    - HITRAN-CO2-TEST: CO2, HITRAN 2016, 4165-4200 nm
    - HITRAN-CO-TEST: CO, HITRAN 2016, 2000-2300 cm-1

    These test databases are used to run the different test routines. They can
    obviously be used by Users to run simulations, but we suggest Users to
    download their own line databases files and add them to ~/.radis so they
    have more control on it

    Parameters
    ----------
    verbose: bool
        if ``True``, print a message for every test database that is added.

    Raises
    ------
    ValueError
        if a test database is already declared but ``diffDatabankEntries``
        reports a difference with the expected entries.
    """
    # Get list of databases. A missing configuration file simply means that
    # nothing is registered yet.
    try:
        dbnames = getDatabankList()
    except FileNotFoundError:
        dbnames = []

    # %% Add test databases
    for dbname, dbentries in TEST_DATABASES.items():

        if dbname not in dbnames:
            # add them (create ~/.radis file if doesnt exist yet)
            # The message used to live in a nested helper that was never
            # called, so `verbose` had no effect.
            if verbose:
                print("Adding '{0}' database in ~/.radis".format(dbname))
            addDatabankEntries(dbname, dbentries)
            continue

        # Already declared: check entries are correct
        current_entries = getDatabankEntries(dbname)
        diff = diffDatabankEntries(current_entries, dbentries, verbose=False)
        if diff is not None:
            raise ValueError(
                "{diff}"
                "\nIn ~/.radis\n----------\n{current}"
                "\n\nExpected\n---------\n{expected}\n\n"
                "Test Database {name} doesnt match expected "
                "entries for key `{diff}`. See comparison above. "
                "To regenerate test databases just delete the {name} "
                "entry in your ~/.radis".format(
                    diff=diff,
                    current=current_entries,
                    expected=dbentries,
                    name=dbname,
                )
            )

    return
def setup_test_line_databases(verbose=True):
    """Build :py:data:`~radis.test.utils.TEST_DATABASES` and add them in
    ~/.radis. Generate the file if it doesnt exist.

    In particular:

    - HITRAN-CO2-TEST: CO2, HITRAN 2016, 4165-4200 nm
    - HITRAN-CO-TEST: CO, HITRAN 2016, 2000-2300 cm-1
    - HITEMP-CO2-TEST: CO2, HITEMP-2010, 2283.7-2285.1 cm-1, 3 isotopes
    - HITEMP-CO2-HAMIL-TEST: same as previous, with (some) energy levels computed
      from Tashkun effective Hamiltonian.

    These test databases are used to run the different test routines. They can
    obviously be used by Users to run simulations, but we suggest Users to
    download their own line databases files and add them to ~/.radis so they
    have more control on it

    Parameters
    ----------
    verbose: bool
        if ``True``, print a message for every test database that is added.

    Raises
    ------
    ValueError
        if a test database is already declared but ``diffDatabankEntries``
        reports a difference with the expected entries.

    Examples
    --------
    Initialize the Line databases::

        from radis import setup_test_line_databases
        setup_test_line_databases()

    Plot a CO2 spectrum at high temperature::

        from radis import calc_spectrum
        calc_spectrum(2284,
                      2285,
                      Tgas=2000,
                      pressure=1,
                      molecule='CO2',
                      isotope=1,
                      databank='HITEMP-CO2-TEST').plot()

    Note that 'HITEMP-CO2-TEST' is defined on 2283.7-2285.1 cm-1 only, as
    can be shown by reading the Database information::

        from radis.misc.config import printDatabankEntries
        printDatabankEntries('HITEMP-CO2-TEST')

        >>> HITEMP-CO2-TEST
        >>> -------
        >>> info : HITEMP-2010, CO2, 3 main isotope (CO2-626, 636, 628), 2283.7-2285.1 cm-1
        >>> path : ['/USER/PATH/TO\\radis\\radis\\test\\files\\cdsd_hitemp_09_fragment.txt']
        >>> format : cdsd-hitemp
        >>> parfuncfmt : hapi
        >>> levelsfmt : radis

    See Also
    --------
    :ref:`Configuration file <label_lbl_config_file>`,
    :py:func:`~radis.misc.config.getDatabankList`,
    :py:func:`~radis.misc.config.printDatabankEntries`
    """
    # TODO: generate large band databases for the main species (let's say CO2,
    # H2O and CH4) and main isotopes by fetching the HITRAN 2016 database.

    # Get list of databases. A missing configuration file simply means that
    # nothing is registered yet.
    try:
        dbnames = getDatabankList()
    except FileNotFoundError:
        dbnames = []

    # %% Add test databases
    for dbname, dbentries in TEST_DATABASES.items():

        if dbname not in dbnames:
            # add them (create ~/.radis file if doesnt exist yet)
            # The message used to live in a nested helper that was never
            # called, so `verbose` had no effect.
            if verbose:
                print("Adding '{0}' database in ~/.radis".format(dbname))
            addDatabankEntries(dbname, dbentries)
            continue

        # Already declared: check entries are correct
        current_entries = getDatabankEntries(dbname)
        diff = diffDatabankEntries(current_entries, dbentries, verbose=False)
        if diff is not None:
            raise ValueError(
                "{diff}"
                "\nIn ~/.radis\n----------\n{current}"
                "\n\nExpected\n---------\n{expected}\n\n"
                "Test Database {name} doesnt match expected "
                "entries for key `{diff}`. See comparison above. "
                "To regenerate test databases just delete the {name} "
                "entry in your ~/.radis".format(
                    diff=diff,
                    current=current_entries,
                    expected=dbentries,
                    name=dbname,
                )
            )

    return
def setup_test_line_databases(verbose=True):
    """Build :py:data:`~radis.test.utils.TEST_DATABASES` and add them in
    ~/.radis. Generate the file if it doesnt exist.

    In particular:

    - HITRAN-CO2-TEST: CO2, HITRAN 2016, 4165-4200 nm
    - HITRAN-CO-TEST: CO, HITRAN 2016, 2000-2300 cm-1
    - HITEMP-CO2-TEST: CO2, HITEMP-2010, 2283.7-2285.1 cm-1, 3 isotopes

    These test databases are used to run the different test routines. They can
    obviously be used by Users to run simulations, but we suggest Users to
    download their own line databases files and add them to ~/.radis so they
    have more control on it

    Parameters
    ----------
    verbose: bool
        if ``True``, print a message for every test database that is added.

    Raises
    ------
    ValueError
        if a test database is already declared but ``diffDatabankEntries``
        reports a difference with the expected entries.

    See Also
    --------
    :ref:`Configuration file <label_lbl_config_file>`
    """
    # TODO: generate large band databases for the main species (let's say CO2,
    # H2O and CH4) and main isotopes by fetching the HITRAN 2016 database.

    # Get list of databases. A missing configuration file simply means that
    # nothing is registered yet.
    try:
        dbnames = getDatabankList()
    except FileNotFoundError:
        dbnames = []

    # %% Add test databases
    for dbname, dbentries in TEST_DATABASES.items():

        if dbname not in dbnames:
            # add them (create ~/.radis file if doesnt exist yet)
            # The message used to live in a nested helper that was never
            # called, so `verbose` had no effect.
            if verbose:
                print("Adding '{0}' database in ~/.radis".format(dbname))
            addDatabankEntries(dbname, dbentries)
            continue

        # Already declared: check entries are correct
        current_entries = getDatabankEntries(dbname)
        diff = diffDatabankEntries(current_entries, dbentries, verbose=False)
        if diff is not None:
            raise ValueError(
                "{diff}"
                "\nIn ~/.radis\n----------\n{current}"
                "\n\nExpected\n---------\n{expected}\n\n"
                "Test Database {name} doesnt match expected "
                "entries for key `{diff}`. See comparison above. "
                "To regenerate test databases just delete the {name} "
                "entry in your ~/.radis".format(
                    diff=diff,
                    current=current_entries,
                    expected=dbentries,
                    name=dbname,
                )
            )

    return
def fetch_hitemp(
    molecule,
    local_databases="~/.radisdb/",
    databank_name="HITEMP-{molecule}",
    isotope=None,
    load_wavenum_min=None,
    load_wavenum_max=None,
    cache=True,
    verbose=True,
    chunksize=100000,
    clean_cache_files=True,
):
    """Stream HITEMP file from HITRAN website. Unzip and build a HDF5 file directly.

    Returns a Pandas DataFrame containing all lines.

    Parameters
    ----------
    molecule: `"CO2", "N2O", "CO", "CH4", "NO", "NO2", "OH"`
        HITEMP molecule. See :py:attr:`~radis.io.hitemp.HITEMP_SOURCE_FILES`
    local_databases: str
        where to create the RADIS HDF5 files. Default ``"~/.radisdb/"``
    databank_name: str
        name of the databank in RADIS :ref:`Configuration file <label_lbl_config_file>`
        Default ``"HITEMP-{molecule}"``
    isotope: str
        load only certain isotopes : ``'2'``, ``'1,2'``, etc. If ``None``, loads
        everything. Default ``None``.
    load_wavenum_min, load_wavenum_max: float (cm-1)
        load only specific wavenumbers.

    Other Parameters
    ----------------
    cache: bool, or ``'regen'``
        if ``True``, use existing HDF5 file. If ``False`` or ``'regen'``, rebuild it.
    verbose: bool or int
        print progress information. With ``verbose >= 3`` the removal of the
        downloaded cache file is also reported.
    chunksize: int
        number of lines to process at a same time. Higher is usually faster
        but can create Memory problems and keep the user uninformed of the progress.
    clean_cache_files: bool
        if ``True`` clean downloaded cache files after HDF5 are created.

    Returns
    -------
    df: pd.DataFrame
        Line list
        A HDF5 file is also created in ``local_databases`` and referenced
        in the :ref:`RADIS config file <label_lbl_config_file>` with name
        ``databank_name``

    Raises
    ------
    NotImplementedError
        for ``"H2O"`` and ``"CO2"`` (multiple source files).
    KeyError
        if ``molecule`` is not a key of ``HITEMP_SOURCE_FILES``.
    ValueError
        if a local HDF5 file exists but is not declared as ``databank_name``,
        and either its number of rows is not the expected one or its metadata
        lacks a key needed to register it automatically.
    AssertionError
        if the number of lines of the freshly built HDF5 file differs from
        ``INFO_HITEMP_LINE_COUNT[molecule]``.

    Notes
    -----
    if using ``load_wavenum_min/max`` or ``isotope``, the whole
    database is anyway downloaded and uncompressed to ``local_databases``
    fast access .HDF5 files (which will take a long time on first call). Only
    the expected wavenumber range & isotopes are returned. The .HDF5 parsing uses
    :py:func:`~radis.io.hdf5.hdf2df`

    See Also
    --------
    :py:func:`~radis.io.hdf5.hdf2df`
    """
    # TODO ? : unzip only parts of the database
    # see https://github.com/radis/radis/pull/194

    if databank_name == "HITEMP-{molecule}":
        databank_name = databank_name.format(**{"molecule": molecule})
    local_databases = abspath(local_databases.replace("~", expanduser("~")))

    if molecule in ["H2O", "CO2"]:
        raise NotImplementedError(
            "Automatic HITEMP download not implemented for {0} : multiple files. Download manually on https://hitran.org/hitemp/ ".format(
                molecule
            )
        )

    try:
        inputf = HITEMP_SOURCE_FILES[molecule]
    except KeyError as err:
        raise KeyError(
            f"Please choose one of HITEMP molecules : {list(HITEMP_SOURCE_FILES.keys())}. Got '{molecule}'"
        ) from err
    urlname = BASE_URL + inputf

    # Best-effort creation of the local folder (it usually exists already)
    try:
        os.mkdir(local_databases)
    except OSError:
        pass
    else:
        if verbose:
            print("Created folder :", local_databases)

    output = abspath(
        join(local_databases, molecule + "-" + inputf.replace(".par.bz2", ".h5"))
    )

    if not cache or cache == "regen":
        # Delete existing HDF5 file
        if exists(output):
            if verbose:
                print("Removing existing file ", output)
                # TODO: also clean the getDatabankList? Todo once it is in JSON format. https://github.com/radis/radis/issues/167
            os.remove(output)

    if exists(output):
        # check metadata :
        check_not_deprecated(
            output,
            metadata_is={},
            metadata_keys_contain=["wavenumber_min", "wavenumber_max"],
        )
        # check database is registered in ~/.radis
        if databank_name not in getDatabankList():
            # if not, check number of rows is correct :
            error_msg = ""
            with pd.HDFStore(output, "r") as store:
                nrows = store.get_storer("df").nrows
                if nrows != INFO_HITEMP_LINE_COUNT[molecule]:
                    error_msg += (
                        f"\nNumber of lines in local database ({nrows:,}) "
                        + "differ from the expected number of lines for "
                        + f"HITEMP {molecule}: {INFO_HITEMP_LINE_COUNT[molecule]}"
                    )
                file_metadata = store.get_storer("df").attrs.metadata
                for k in [
                    "wavenumber_min",
                    "wavenumber_max",
                    "download_url",
                    "download_date",
                ]:
                    if k not in file_metadata:
                        error_msg += (
                            "\nMissing key in file metadata to register the database "
                            + f"automatically : {k}"
                        )

            if error_msg:
                # Every fragment holding a placeholder must be an f-string,
                # else the braces are printed literally in the message.
                raise ValueError(
                    f"{databank_name} not declared in your RADIS ~/.config file although "
                    + f"{output} exists. {error_msg}\n"
                    + "If you know this file, add it to ~/.radisdb manually. "
                    + "Else regenerate the database with:\n\t"
                    + ">>> radis.SpectrumFactory().fetch_databank(..., use_cached='regen')"
                    + "\nor\n\t"
                    + f">>> radis.io.hitemp.fetch_hitemp({molecule!r}, cache='regen')"
                    + "\n\n⚠️ It will re-download & uncompress the whole database "
                    + f"from HITEMP.\n\nList of declared databanks: {getDatabankList()}.\n"
                    + f"{output} metadata: {file_metadata}"
                )

            # Else database looks ok : register it
            if verbose:
                print(
                    f"{databank_name} not declared in your RADIS ~/.config file although "
                    + f"{output} exists. Registering the database automatically."
                )

            register_database(
                databank_name,
                [output],
                molecule=molecule,
                wmin=file_metadata["wavenumber_min"],
                wmax=file_metadata["wavenumber_max"],
                download_date=file_metadata["download_date"],
                urlname=file_metadata["download_url"],
                verbose=verbose,
            )

        if verbose:
            print(f"Using existing database {databank_name}")
        return hdf2df(
            output,
            isotope=isotope,
            load_wavenum_min=load_wavenum_min,
            load_wavenum_max=load_wavenum_max,
            verbose=verbose,
        )

    # Doesnt exist : download
    ds = DataSource(join(local_databases, "downloads"))

    if verbose:
        print(f"Downloading {inputf} for {molecule}.")
    download_date = date.today().strftime("%d %b %Y")

    columns = columns_2004

    # Get linereturn (depends on OS, but file may also have been generated
    # on a different OS. Here we simply read the file to find out)
    with ds.open(urlname) as gfile:  # locally downloaded file

        dt = _create_dtype(
            columns, "a2"
        )  # 'a2' allocates space to get \n or \n\r for linereturn character
        b = np.zeros(1, dtype=dt)
        gfile.readinto(b)
        linereturnformat = _get_linereturnformat(b, columns)

    with ds.open(urlname) as gfile:  # locally downloaded file

        dt = _create_dtype(columns, linereturnformat)
        b = np.zeros(chunksize, dtype=dt)  # receives the HITRAN 160-character data.
        wmin = np.inf
        wmax = 0
        if verbose:
            print(f"Download complete. Building {molecule} database to {output}")

        with pd.HDFStore(output, mode="a", complib="blosc", complevel=9) as f:
            Nlines = 0
            Ntotal_lines_expected = INFO_HITEMP_LINE_COUNT[molecule]
            pb = ProgressBar(N=Ntotal_lines_expected, active=verbose)
            # The lambda looks `b` up at call time, so every iteration reads
            # into the buffer reallocated at the end of the previous one.
            # Iteration stops when `readinto` returns 0.
            for _ in iter(lambda: gfile.readinto(b), 0):

                if not b[-1]:
                    # End of file flag within the chunk (but does not start
                    # with End of file flag) so nbytes != 0
                    b = get_last(b)

                df = _ndarray2df(b, columns, linereturnformat)

                # df.to_hdf(
                #     output, "df", format="table", append=True, complib="blosc", complevel=9
                # )
                f.put(
                    key="df",
                    value=df,
                    append=True,
                    format="table",
                    data_columns=DATA_COLUMNS,
                )

                wmin = np.min((wmin, df.wav.min()))
                wmax = np.max((wmax, df.wav.max()))
                Nlines += len(df)
                pb.update(
                    Nlines,
                    message=f"Parsed {Nlines:,} / {Ntotal_lines_expected:,} lines. Wavenumber range {wmin:.2f}-{wmax:.2f} cm-1 is complete.",
                )

                # Reinitialize for next read
                b = np.zeros(
                    chunksize, dtype=dt
                )  # receives the HITRAN 160-character data.

            # Stored with the table; read back on later calls to register the
            # database automatically if it is not declared.
            f.get_storer("df").attrs.metadata = {
                "wavenumber_min": wmin,
                "wavenumber_max": wmax,
                "download_date": download_date,
                "download_url": urlname,
                "version": radis.__version__,
            }
            pb.done()

    # Done: add final checks
    # ... check on the created file that all lines are there :
    with pd.HDFStore(output, "r") as store:
        nrows = store.get_storer("df").nrows
        assert nrows == Nlines
        if nrows != INFO_HITEMP_LINE_COUNT[molecule]:
            raise AssertionError(
                f"Number of lines in local database ({nrows:,}) "
                + "differ from the expected number of lines for "
                + f"HITEMP {molecule}: {INFO_HITEMP_LINE_COUNT[molecule]}"
                + ". Check that there was no recent update on HITEMP. "
                + "Else it may be a download error ?"
            )

    # Add database to  ~/.radis
    register_database(
        databank_name, [output], molecule, wmin, wmax, download_date, urlname, verbose
    )

    # Fully unzipped : clean
    if clean_cache_files:
        os.remove(ds._findfile(urlname))
        if verbose >= 3:
            from radis.misc.printer import printg

            printg("... removed downloaded cache file")

    return hdf2df(
        output,
        isotope=isotope,
        load_wavenum_min=load_wavenum_min,
        load_wavenum_max=load_wavenum_max,
        verbose=verbose,
    )