def hit2df(fname, count=-1, cache=False, verbose=True, drop_non_numeric=True):
    """Convert a HITRAN/HITEMP [1]_ file to a Pandas dataframe.

    Parameters
    ----------
    fname: str
        HITRAN-HITEMP file name
    count: int
        number of items to read (-1 means all file)
    cache: boolean, or ``'regen'`` or ``'force'``
        if ``True``, a pandas-readable HDF5 file is generated on first access,
        and later used. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If False, no database is used. If ``'regen'``, temp
        file are reconstructed. Default ``False``. The value is forwarded
        as-is to ``check_cache_file``.

    Other Parameters
    ----------------
    verbose: boolean, or int
        if truthy, print progress messages; if ``>= 2``, also print the
        name of the file being opened. Default ``True``.
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves performances,
        but make sure all the columns you need are converted to numeric formats
        before hand. Default ``True``. Note that if a cache file is loaded it
        will be left untouched.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------
    ValueError
        if ``fname`` cannot be decoded as text (binary file), if the parsed
        databank contains no molecule id, or if it contains more than one
        molecule id.

    References
    ----------
    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__

    Notes
    -----
    Performances: see CDSD-HITEMP parser

    See Also
    --------
    :func:`~radis.io.cdsd.cdsd2df`
    """
    if verbose >= 2:
        print("Opening file {0} (cache={1})".format(fname, cache))

    columns = columns_2004

    # Use cache file if possible
    fcache = splitext(fname)[0] + ".h5"
    check_cache_file(fcache=fcache, use_cached=cache, verbose=verbose)
    if cache and exists(fcache):
        return get_cache_file(fcache, verbose=verbose)

    # Detect the molecule by reading the start of the file: the first two
    # characters are parsed as the integer molecule identifier.
    try:
        with open(fname) as f:
            mol = get_molecule(int(f.read(2)))
    except UnicodeDecodeError as err:
        raise ValueError(
            "You're trying to read a binary file {0} ".format(fname)
            + "instead of an HITRAN file"
        ) from err

    # %% Start reading the full file
    df = parse_hitran_file(fname, columns, count)

    # %% Post processing

    # Assert one molecule per database only. Else the quanta parsing below
    # (which is done for a single molecule ``mol``) does not make sense.
    nmol = len(set(df["id"]))
    if nmol == 0:
        raise ValueError("Databank looks empty")
    elif nmol != 1:
        # Crash, give explicit error messages
        try:
            secondline = df.iloc[1]
        except IndexError:
            secondline = ""
        raise ValueError(
            "Multiple molecules in database ({0}). Current ".format(nmol)
            + "spectral code only computes 1 species at the time. Use MergeSlabs. "
            + "Verify the parsing was correct by looking at the first row below: "
            + "\n{0}".format(df.iloc[0])
            + "\n----------------\nand the second row "
            + "below: \n{0}".format(secondline)
        )

    # Add local quanta attributes, based on the HITRAN group
    df = parse_local_quanta(df, mol)

    # Add global quanta attributes, based on the HITRAN class
    df = parse_global_quanta(df, mol)

    # Remove non numerical attributes
    if drop_non_numeric:
        if "branch" in df:
            replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        if verbose:
            print("Generating cached file: {0}".format(fcache))
        try:
            save_to_hdf(
                df,
                fcache,
                metadata={},
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except Exception:
            # Cache generation is best-effort: a failure here must not prevent
            # returning the parsed dataframe. ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            if verbose:
                print(sys.exc_info())
                print(
                    "An error occured in cache file generation. Lookup access rights"
                )

    return df
def cdsd2df(
    fname, version="hitemp", count=-1, cache=False, verbose=True, drop_non_numeric=True
):
    """Convert a CDSD-HITEMP [1]_ or CDSD-4000 [2]_ file to a Pandas dataframe.

    Parameters
    ----------
    fname: str
        CDSD file name
    version: str ('4000', 'hitemp')
        CDSD version
    count: int
        number of items to read (-1 means all file)
    cache: boolean, or 'regen'
        if ``True``, a pandas-readable HDF5 file is generated on first access,
        and later used. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If ``False``, no database is used. If 'regen', temp
        file are reconstructed. Default ``False``.

    Other Parameters
    ----------------
    verbose: boolean, or int
        if truthy, print progress messages; if ``>= 2``, also print the
        name of the file being opened. Default ``True``.
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves performances,
        but make sure all the columns you need are converted to numeric formats
        before hand. Default ``True``. Note that if a cache file is loaded it
        will be left untouched.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------
    ValueError
        if ``version`` is neither ``'hitemp'`` nor ``'4000'``.

    Notes
    -----
    CDSD-4000 Database can be downloaded from [3]_

    Performances: I had huge performance trouble with this function, because the files are
    huge (500k lines) and the format is too special (no space between numbers...)
    to apply optimized methods such as pandas's. A line by line reading isn't
    so bad, using struct to parse each line. However, we waste typing determining
    what every line is. I ended up using the fromfiles functions from numpy,
    not considering *\\n* (line return) as a special character anymore, and a second call
    to numpy to cast the correct format. That ended up being twice as fast.

        - initial:                      20s / loop
        - with mmap:                    worse
        - w/o readline().rstrip('\\n'):  still 20s
        - numpy fromfiles:              17s
        - no more readline, 2x fromfile 9s

    Think about using cache mode too:

        - no cache mode                 9s
        - cache mode, first time        22s
        - cache mode, then              2s

    Moving to HDF5:

    On cdsd_02069_02070 (56 Mb)

    Reading::

        cdsd2df(): 9.29 s
        cdsd2df(cache=True [old .txt version]): 2.3s
        cdsd2df(cache=True [new h5 version, table]): 910ms
        cdsd2df(cache=True [new h5 version, fixed]): 125ms

    Storage::

        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="fixed")  337ms
        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="table")  1.03s

    References
    ----------
    Note that CDSD-HITEMP is used as the line database for CO2 in HITEMP 2010

    .. [1] `HITEMP 2010, Rothman et al., 2010 <https://www.sciencedirect.com/science/article/pii/S002240731000169X>`_

    .. [2] `CDSD-4000 article, Tashkun et al., 2011 <https://www.sciencedirect.com/science/article/pii/S0022407311001154>`_

    .. [3] `CDSD-4000 database <ftp://ftp.iao.ru/pub/CDSD-4000/>`_

    See Also
    --------
    :func:`~radis.io.hitran.hit2df`
    """
    if verbose >= 2:
        print(
            "Opening file {0} (format=CDSD {1}, cache={2})".format(
                fname, version, cache
            )
        )

    # Select the column layout matching the requested CDSD flavour
    if version == "hitemp":
        columns = columns_hitemp
    elif version == "4000":
        columns = columns_4000
    else:
        raise ValueError("Unknown CDSD version: {0}".format(version))

    # Use cache file if possible
    fcache = splitext(fname)[0] + ".h5"
    check_cache_file(fcache=fcache, use_cached=cache, verbose=verbose)
    if cache and exists(fcache):
        return get_cache_file(fcache, verbose=verbose)

    # %% Start reading the full file
    df = parse_binary_file(fname, columns, count)

    # Remove non numerical attributes
    if drop_non_numeric:
        replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        if verbose:
            print("Generating cached file: {0}".format(fcache))
        try:
            save_to_hdf(
                df,
                fcache,
                metadata={},
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except Exception as err:
            # Cache generation is best-effort: a failure here must not prevent
            # returning the parsed dataframe. ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            if verbose:
                # Report what actually went wrong instead of hiding it
                print("{0}: {1}".format(type(err).__name__, err))
                print("An error occured in cache file generation. Lookup access rights")

    return df
def fetch_astroquery(molecule, isotope, wmin, wmax, verbose=True, cache=True, metadata=None):
    """Wrapper to Astroquery [1]_ fetch function to download a line database

    Notes
    -----
    Astroquery [1]_ is itself based on [HAPI]_

    Parameters
    ----------
    molecule: str, or int
        molecule name or identifier
    isotope: int
        isotope number
    wmin, wmax: float  (cm-1)
        wavenumber min and max

    Other Parameters
    ----------------
    verbose: boolean
        Default ``True``
    cache: boolean
        if ``True``, tries to find a ``.h5`` cache file in the Astroquery
        :py:attr:`~astroquery.query.BaseQuery.cache_location`, that would match
        the requirements. If not found, downloads it and saves the line dataframe
        as a ``.h5`` file in the Astroquery.
    metadata: dict, or ``None``
        if ``cache=True``, check that the metadata in the cache file correspond
        to these attributes. Arguments ``molecule``, ``isotope``, ``wmin``, ``wmax``
        are already added by default. The dictionary given by the caller is
        copied and never modified. Default ``None`` (no extra attributes).

    Returns
    -------
    df: pandas Dataframe
        line dataframe, with the Astroquery column names renamed to the RADIS
        ones. If the server reports no lines in the range (404 "Not Found"),
        an empty dataframe with the RADIS column names is returned.

    Raises
    ------
    KeyError
        if Astroquery raises a ``KeyError`` during the query.
    ValueError
        if the server answers 404 with a reason other than "Not Found", or
        answers 500.

    References
    ----------
    .. [1] `Astroquery <https://astroquery.readthedocs.io>`_

    See Also
    --------
    :py:func:`astroquery.hitran.reader.download_hitran`,
    :py:func:`astroquery.hitran.reader.read_hitran_file`,
    :py:attr:`~astroquery.query.BaseQuery.cache_location`
    """
    # Check input: accept either a molecule name or its numeric identifier
    if not is_float(molecule):
        mol_id = get_molecule_identifier(molecule)
    else:
        mol_id = molecule
        molecule = get_molecule(mol_id)
    assert is_float(isotope)

    # Work on a private copy: the original signature used a mutable default
    # (``metadata={}``) that was updated in place, leaking state across calls
    # and silently modifying the caller's dictionary.
    metadata = {} if metadata is None else dict(metadata)

    empty_range = False

    # If cache, tries to find from Astroquery:
    if cache:
        query_params = {
            'molecule': molecule,
            'isotope': isotope,
            'wmin': wmin,
            'wmax': wmax
        }
        # Update metadata with physical properties from the database.
        metadata.update(query_params)

        fcache = join(Hitran.cache_location,
                      CACHE_FILE_NAME.format(**query_params))
        check_cache_file(fcache=fcache,
                         use_cached=cache,
                         metadata=metadata,
                         verbose=verbose)
        if exists(fcache):
            try:
                return get_cache_file(fcache, verbose=verbose)
            except Exception as err:
                # Unreadable cache file: drop it and fall through to a
                # fresh download.
                if verbose:
                    printr(
                        'Problem reading cache file {0}:\n{1}\nDeleting it!'.
                        format(fcache, str(err)))
                os.remove(fcache)

    # Download using the astroquery library
    try:
        response = Hitran.query_lines_async(molecule_number=mol_id,
                                            isotopologue_number=isotope,
                                            min_frequency=wmin / u.cm,
                                            max_frequency=wmax / u.cm)
    except KeyError as err:
        raise KeyError(
            str(err) +
            ' <<w this error occured in Astroquery. Maybe these molecule ' +
            '({0}) and isotope ({1}) are not supported'.format(
                molecule, isotope)) from err

    # Deal with usual errors
    if response.status_code == 404:
        # Maybe there are just no lines for this species in this range
        # In that case we usually end up with errors like:

        # (<class 'Exception'>, Exception('Query failed: 404 Client Error:
        # Not Found for url: http://hitran.org/lbl/api?numax=25000&numin=19000&iso_ids_list=69\n',),
        # <traceback object at 0x7f0967c91708>)

        if response.reason == 'Not Found':
            # Let's bet it's just that there are no lines in this range
            empty_range = True
            if verbose:
                print((
                    'No lines for {0} (id={1}), iso={2} in range {3:.2f}-{4:.2f}cm-1. '
                    .format(molecule, mol_id, isotope, wmin, wmax)))
        else:
            raise ValueError(
                'An error occured during the download of HITRAN files ' +
                'for {0} (id={1}), iso={2} between {3:.2f}-{4:.2f}cm-1. '.
                format(molecule, mol_id, isotope, wmin, wmax) +
                'Are you online?\n' +
                'See details of the error below:\n\n {0}'.format(
                    response.reason))
    elif response.status_code == 500:
        raise ValueError(
            '{0} while querying the HITRAN server: '.format(
                response.status_code) +
            '\n\n{0}'.format(response.text))

    # Process response

    # Rename columns from Astroquery to RADIS format
    rename_columns = {
        'molec_id': 'id',
        'local_iso_id': 'iso',
        'nu': 'wav',
        'sw': 'int',
        'a': 'A',
        'gamma_air': 'airbrd',
        'gamma_self': 'selbrd',
        'elower': 'El',
        'n_air': 'Tdpair',
        'delta_air': 'Pshft',
        'global_upper_quanta': 'globu',
        'global_lower_quanta': 'globl',
        'local_upper_quanta': 'locu',
        'local_lower_quanta': 'locl',
        'line_mixing_flag': 'lmix',
        'gp': 'gp',
        'gpp': 'gpp',
    }

    if not empty_range:
        # _fix_astroquery_file_format(filename)

        # Note: as of 0.9.16 we're not fixing astroquery_file_format anymore.
        # maybe we should.

        tbl = Hitran._parse_result(response)
        df = tbl.to_pandas()
        df = df.rename(columns=rename_columns)
    else:
        # No lines in range: build an empty dataframe with the RADIS columns
        df = pd.DataFrame(columns=list(rename_columns.values()))

    # Cast type to float64
    cast_type = {
        'wav': np.float64,
        'int': np.float64,
        'A': np.float64,
        'airbrd': np.float64,
        'selbrd': np.float64,
        'El': np.float64,
        'Tdpair': np.float64,
        'Pshft': np.float64,
    }
    for c, typ in cast_type.items():
        df[c] = df[c].astype(typ)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        if verbose:
            print('Generating cached file: {0}'.format(fcache))
        try:
            save_to_hdf(df,
                        fcache,
                        metadata=metadata,
                        version=radis.__version__,
                        key='df',
                        overwrite=True,
                        verbose=verbose)
        except Exception:
            # Cache generation is best-effort: a failure here must not prevent
            # returning the dataframe. ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            if verbose:
                print(sys.exc_info())
                print(
                    'An error occured in cache file generation. Lookup access rights'
                )

    return df