def _get_linereturnformat(data, columns, fname=""):
    """Return the dtype code matching the line-return characters of a file.

    Parameters
    ----------
    data: indexable
        records already read from the file; only the last field of the
        first record (``data[0][-1]``) is inspected.
    columns: dict
        not used by this function.
    fname: str
        file name, only inserted in the error message.

    Returns
    -------
    linereturnformat: str
        ``"a2"`` if the inspected field contains ``"\\r\\n"``, ``"a1"`` if it
        contains a single ``"\\n"`` or ``"\\r"``.

    Raises
    ------
    ValueError
        if none of the known line-return characters is found.

    Notes
    -----
    The line return cannot simply be inferred from the OS: a file may have
    been written on one OS and read on another (for instance, line return
    characters are not converted when read from .egg files). The first line
    of the file is therefore used to detect it.
    """
    from radis.misc.basics import to_str

    first_record = data[0]
    ending = to_str(first_record[-1])

    # Test the 2-character sequence first: it also contains "\n" and "\r"
    if to_str("\r\n") in ending:
        return "a2"
    if to_str("\n") in ending or to_str("\r") in ending:
        return "a1"

    raise ValueError(
        "Unknown Line return format: {0}. Check that your file {1} has the HITRAN format. First line : {2}".format(
            ending, fname, first_record
        )
    )
def hit2df(fname, count=-1, cache=False, verbose=True):
    """Convert a HITRAN/HITEMP [1]_ file to a Pandas dataframe

    Parameters
    ----------
    fname: str
        HITRAN-HITEMP file name
    count: int
        number of items to read (-1 means all file)
    cache: boolean, or ``'regen'``
        if True, a pandas-readable HDF5 file (same name as ``fname`` with a
        ``.h5`` extension) is generated on first access, and read directly on
        later calls. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If False, no cache file is read or written. If
        ``'regen'``, an existing cache file is deleted and regenerated.
        Default False.
    verbose: boolean
        if True, print messages about cache file usage, deletion, generation
        and generation errors. Default True.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------
    ValueError
        if the line return character of the file is not recognized, if no
        line was read, or if the file contains more than one molecule id.

    References
    ----------

    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__

    Notes
    -----
    Performances: see CDSD-HITEMP parser

    A failure while writing the cache file is not fatal: it is reported (if
    ``verbose``) and the dataframe is still returned.
    """
    columns = columns_2004

    if cache:
        # Look up whether a cached file already exists
        fcache = splitext(fname)[0] + '.h5'
        if exists(fcache):
            if cache == 'regen':
                os.remove(fcache)
                if verbose:
                    print('Deleted h5 cache file : {0}'.format(fcache))
            else:
                if verbose:
                    print('Using h5 file: {0}'.format(fcache))
                return pd.read_hdf(fcache, 'df')

    # Detect the molecule by reading the start of the file
    with open(fname) as f:
        mol = get_molecule(int(f.read(2)))

    # %% Start reading the full file

    # To be faster, we read file totally in bytes mode with fromfiles. But that
    # requires to properly decode the line return character:
    # problem arise when file was written in an OS and read in another OS (for
    # instance, line return characters are not converted when read from .egg
    # files). Here we read the first line and infer the line return character
    # for it

    # ... Create a dtype with the binary data format and the desired column names
    # ... _linereturn is to capture the line return symbol. We delete it afterwards
    dtype = [(k, c[0]) for (k, c) in columns.items()] + [('_linereturn', 'a2')]
    dt = _format_dtype(dtype)
    data = np.fromfile(fname, dtype=dt, count=1)  # just read the first line

    # get format of line return
    from radis.misc.basics import to_str
    linereturn = to_str(data[0][-1])
    if to_str('\r\n') in linereturn:
        linereturnformat = 'a2'
    elif to_str('\n') in linereturn or to_str('\r') in linereturn:
        linereturnformat = 'a1'
    else:
        raise ValueError(
            'Line return format unknown: {0}. Please update RADIS'.format(
                linereturn))

    # Now re-read with correct line return character
    dtype = [(k, c[0]) for (k, c) in columns.items()
             ] + [('_linereturn', linereturnformat)]
    dt = _format_dtype(dtype)
    data = np.fromfile(fname, dtype=dt, count=count)

    # ... Cast to new type
    # This requires to recast all the data already read, but is still the fastest
    # method I found to read a file directly (for performance benchmark see
    # CDSD-HITEMP parser)
    newtype = [c[0] if (c[1] == str) else c[1] for c in columns.values()]
    dtype = list(zip(list(columns.keys()), newtype)) + \
        [('_linereturn', linereturnformat)]
    data = _cast_to_dtype(data, dtype)

    # %% Create dataframe
    df = pd.DataFrame(data.tolist(), columns=list(
        columns.keys()) + ['_linereturn'])

    # assert one molecule per database only. Else the groupbase data reading
    # above doesnt make sense
    nmol = len(set(df['id']))
    if nmol == 0:
        raise ValueError('Databank looks empty')
    elif nmol != 1:
        # Crash, give explicit error messages
        try:
            secondline = df.iloc[1]
        except IndexError:
            secondline = ''
        raise ValueError(
            'Multiple molecules in database ({0}). Current '.format(nmol)
            + 'spectral code only computes 1 species at the time. Use MergeSlabs. '
            + 'Verify the parsing was correct by looking at the first row below: '
            + '\n{0}'.format(df.iloc[0])
            + '\n----------------\nand the second row '
            + 'below: \n{0}'.format(secondline))

    # Columns declared as str were read as bytes: decode them
    for k, c in columns.items():
        if c[1] == str:
            df[k] = df[k].str.decode("utf-8")

    # %% Add local quanta attributes, based on the HITRAN group
    df = parse_local_quanta(df, mol)

    # %% Add global quanta attributes, based on the HITRAN class
    df = parse_global_quanta(df, mol)

    # Strip whitespaces around PQR columns (due to 2 columns jumped)
    if 'branch' in df:
        df['branch'] = df.branch.str.strip()

    # Delete dummy column than handled the line return character
    del df['_linereturn']

    if cache:
        # cached file mode but cached file doesn't exist yet (else we had returned)
        if verbose:
            print('Generating cached file: {0}'.format(fcache))
        try:
            _generate_cache_file(fcache, df)
        except Exception:
            # Best effort: a failed cache write must not prevent returning the
            # dataframe. Only Exception is caught so that KeyboardInterrupt
            # and SystemExit still propagate.
            if verbose:
                print(sys.exc_info())
                print(
                    'An error occured in cache file generation. Lookup access rights'
                )

    return df
def parse_hitran_file(fname, columns, count):
    """Read a HITRAN ``par``-formatted file into a dataframe.

    The file is read in binary mode with :py:func:`numpy.fromfile`, which
    is why the line-return characters have to be detected explicitly.

    Parameters
    ----------
    fname: str
        filename
    columns: dict
        maps each column name to its format; element ``[0]`` of a format is
        the dtype used to read the raw file, element ``[1]`` the type the
        column is cast to (``str`` columns keep their raw dtype and are
        utf-8 decoded afterwards).
    count: int
        number of lines to read

    Returns
    -------
    df: pandas DataFrame
        dataframe with lines

    Raises
    ------
    ValueError
        if the line return character of the first line is not recognized.

    Notes
    -----
    Part common to hit2df and cdsd2df
    """
    names = list(columns.keys())
    raw_fields = [(name, fmt[0]) for name, fmt in columns.items()]

    # First pass: read a single line, assuming a 2-character line return.
    # The line return cannot be deduced from the OS, because the file may have
    # been written on another OS (for instance, line return characters are
    # not converted when read from .egg files).
    probe_dtype = _format_dtype(raw_fields + [("_linereturn", "a2")])
    probe = np.fromfile(fname, dtype=probe_dtype, count=1)

    from radis.misc.basics import to_str

    ending = to_str(probe[0][-1])
    if to_str("\r\n") in ending:
        linereturnformat = "a2"
    elif to_str("\n") in ending or to_str("\r") in ending:
        linereturnformat = "a1"
    else:
        raise ValueError(
            "Unknown `Line return` format: {0}. Check that your file {1} has the HITRAN format.".format(
                ending, fname
            )
        )

    # Second pass: read the requested lines with the right line-return size.
    # The extra _linereturn field captures the line return symbol and is
    # dropped once the dataframe is built.
    tail = [("_linereturn", linereturnformat)]
    data = np.fromfile(fname, dtype=_format_dtype(raw_fields + tail), count=count)

    # Recast everything that was read. Costly, but still the fastest way found
    # to read a file directly (for performance benchmark see CDSD-HITEMP parser)
    cast_fields = [
        (name, fmt[0] if fmt[1] == str else fmt[1]) for name, fmt in columns.items()
    ]
    data = _cast_to_dtype(data, cast_fields + tail)

    # %% Create dataframe, without the dummy line-return column
    df = pd.DataFrame(data.tolist(), columns=names + ["_linereturn"])
    del df["_linereturn"]

    # str columns were read as bytes: decode them
    text_columns = [name for name, fmt in columns.items() if fmt[1] == str]
    for name in text_columns:
        df[name] = df[name].str.decode("utf-8")

    # Strip whitespaces around PQR columns (due to 2 columns jumped)
    if "branch" in df:  # (only in CDSD)
        df["branch"] = df["branch"].str.strip()

    return df