def h5_tables_gen(db_path, tables, tables_log, db=None) -> Iterator[Tuple[str, str, pd.HDFStore]]:
    """
    Generate table names together with their log table names and the opened store
    :param db_path: hdf5 file path, opened read-only if `db` is not provided
    :param tables: table names search pattern or sequence of table names
    :param tables_log: table names for metadata of data in `tables`
    :param db: already opened store (kept to match the h5_dispenser_and_names_gen() argument
        convention); if not None then `db_path` is not opened
    :return: iterator of (table name, log table name, store) tuples
    """
    # default log table pattern, formatted with each table name
    tbl_log_pattern = (tables_log[0] or '{}/logRuns') if len(tables_log) == 1 else tables_log[0]
    with FakeContextIfOpen(lambda f: pd.HDFStore(f, mode='r'), file=db_path, opened_file_object=db) as store:
        if len(tables) == 1:
            tables = h5find_tables(store, tables[0])  # expand search pattern to the tables found
        for tbl, tbl_log in zip_longest(tables, tables_log, fillvalue=tbl_log_pattern):
            yield tbl, tbl_log.format(tbl), store

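# A minimal usage sketch for h5_tables_gen(). The store path and table patterns below are
# hypothetical, shown only to illustrate the call shape: a single-element `tables` list is
# treated as a search pattern and expanded by h5find_tables() inside the generator.
def _example_h5_tables_gen():
    for tbl, tbl_log, store in h5_tables_gen(
            db_path='inclinometers.h5',   # hypothetical store
            tables=['incl.*'],            # pattern, expanded inside the generator
            tables_log=['{}/logRuns']):   # '{}' is replaced with each table name
        print(f'{store.filename}: {tbl} (log: {tbl_log})')
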
def h5copy_coef(h5file_source=None, h5file_dest=None, tbl=None, tbl_source=None, tbl_dest=None,
                dict_matrices: Union[Mapping[str, np.ndarray], Iterable[str], None] = None,
                ok_to_replace_group=False):
    """
    Copy tbl from h5file_source to h5file_dest, then overwrite the copied coefficients
    (e.g. tbl + '/coef/H/A' and '/coef/H/C') with the values of `dict_matrices` if provided
    :param h5file_source: name of any hdf5 file with existing coef to copy the structure from
    :param h5file_dest: name of hdf5 file to paste the structure to
    :param tbl: default for both `tbl_source` and `tbl_dest`
    :param dict_matrices: dict of numpy arrays to write, or list of paths to coefs
        (to matrices) under tbl - to copy them
    :param ok_to_replace_group: allow deleting and replacing an existing destination group

    Example - save H and C, the 3x3 rotation and 1x3 shift matrices:
    >>> h5copy_coef(h5file_source, h5file_dest, tbl,
    ...             dict_matrices={'//coef//H//A': H, '//coef//H//C': C})
    """
    if h5file_dest is None:
        h5file_dest = h5file_source
    if h5file_source is None:
        if h5file_dest is None:
            print('skipping: output not specified')
            return
        h5file_source = h5file_dest
    if tbl_source is None:
        tbl_source = tbl
    if tbl_dest is None:
        tbl_dest = tbl

    def path_h5(file):
        return Path(file.filename if isinstance(file, h5py._hl.files.File) else file)

    def save_operation(h5source=None):
        """
        Update dict_matrices in h5file_dest. h5source may be used to copy the structure from
        :param h5source: opened h5py.File; if not None, copy h5file_source//tbl_source//coef
            to h5file_dest//tbl_dest//coef before the update
        Uses enclosing scope: h5file_dest, tbl_dest, tbl_source, dict_matrices
        """
        nonlocal dict_matrices
        with FakeContextIfOpen(lambda f: h5py.File(f, 'a'), h5file_dest) as h5dest:
            try:
                if h5source is None:
                    if tbl_dest != tbl_source:
                        h5source = h5dest
                    else:
                        raise FileExistsError(f'Can not copy to itself {h5dest.filename}//{tbl_dest}')
                elif path_h5(h5dest) == path_h5(h5source) and tbl_dest == tbl_source:
                    raise FileExistsError(f'Can not copy to itself {h5dest.filename}//{tbl_dest}')

                # Copy using provided paths:
                if h5source:
                    path_coef = f'//{tbl_source}//coef'
                    l.info(f'copying "coef" from {path_h5(h5source)}//{tbl_source} to {h5dest.filename}//{tbl_dest}')
                    # Reuse previous calibration structure:
                    try:
                        h5source.copy(path_coef, h5dest[tbl_dest])
                    except RuntimeError as e:  # Unable to copy object (destination object already exists)
                        replace_coefs_group_on_error(h5source, h5dest, path_coef, e)
                    except KeyError:  # Unable to open object (object 'incl_b11' doesn't exist)
                        l.warning('Creating "%s"', tbl_source)
                        try:
                            h5dest.create_group(tbl_source)
                        except (ValueError, KeyError) as e:  # already exists
                            replace_coefs_group_on_error(h5source, h5dest, tbl_source, e)
                        else:
                            h5source.copy(path_coef, h5dest[tbl_dest])
            except FileExistsError:
                if dict_matrices is None:
                    raise

            if dict_matrices:
                have_values = isinstance(dict_matrices, dict)
                l.info(f'updating {h5file_dest}/{tbl_dest}/{dict_matrices}')
                if have_values:
                    # Save provided values:
                    for k in dict_matrices.keys():
                        path = f'{tbl_dest}{k}'
                        data = dict_matrices[k]
                        if isinstance(dict_matrices[k], (int, float)):
                            data = np.atleast_1d(data)  # Veusz can't load 0d single values
                        try:
                            b_isnan = np.isnan(data)
                            if np.any(b_isnan):
                                l.warning('not writing NaNs: %s%s...', k, np.flatnonzero(b_isnan))
                                h5dest[path][~b_isnan] = data[~b_isnan]
                            else:
                                h5dest[path][...] = data
                        except TypeError as e:
                            l.error('Replacing dataset "%s" TypeError: %s -> recreating...', path,
                                    '\n==> '.join([a for a in e.args if isinstance(a, str)]))
                            # to replace the dataset with another one of different shape:
                            del h5dest[path]
                            h5dest.create_dataset(path, data=data, dtype=np.float64)
                        except KeyError:  # Unable to open object (component not found)
                            l.warning('Creating "%s"', path)
                            h5dest.create_dataset(path, data=data, dtype=np.float64)
                else:
                    # dict_matrices is a list of paths: load values from source, write to dest
                    paths = list(dict_matrices)
                    dict_matrices = {}
                    for rel_path in paths:
                        path = tbl_source + rel_path
                        try:
                            dict_matrices[path] = h5source[path][...]
                        except AttributeError as e:  # 'ellipsis' object has no attribute 'encode'
                            l.error('Skip update coef: dict_matrices must be None or its items must point to matrices %s',
                                    '\n==> '.join(a for a in e.args if isinstance(a, str)))
                            continue
                        h5dest[path][...] = dict_matrices[path]
                h5dest.flush()
            else:
                dict_matrices = {}

    def replace_coefs_group_on_error(h5source, h5dest, path, e=None):
        if ok_to_replace_group:
            l.warning('Replacing group "%s"', path)
            del h5dest[path]
            h5source.copy(path, h5dest[tbl_dest])
        else:
            l.error('Skip copy coef' + (f': {standard_error_info(e)}!' if e else '!'))

    with FakeContextIfOpen(
            (lambda f: h5py.File(f, 'r')) if h5file_source != h5file_dest else None,
            h5file_source) as h5source:
        save_operation(h5source)

    # Confirm the changes were properly made and saved:
    b_ok = True
    with FakeContextIfOpen(lambda f: h5py.File(f, 'r'), h5file_dest) as h5dest:
        for k, v in dict_matrices.items():
            if not np.allclose(h5dest[tbl_dest + k][...], v, equal_nan=True):
                l.error(f'h5copy_coef(): coef. {tbl_dest + k} not updated!')
                b_ok = False
    if b_ok and dict_matrices:
        print('h5copy_coef() has updated coef. Ok>')

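# A minimal usage sketch for h5copy_coef(), following the docstring example. File and table
# names are hypothetical; H and C stand in for real calibration matrices.
def _example_h5copy_coef():
    H = np.eye(3)         # hypothetical 3x3 rotation matrix
    C = np.zeros((1, 3))  # hypothetical 1x3 shift matrix
    h5copy_coef('190710incl.h5', '200901incl.h5', 'incl01',
                dict_matrices={'//coef//H//A': H, '//coef//H//C': C})
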
def main(new_arg=None):
    """
    :param new_arg: returns cfg if new_arg == '<cfg_from_args>', but it will be None if
        argument argv[1:] == '-h' or '-v' is passed to this code
        argv[1] is cfgFile. It was used with cfg files:
            'csv2h5_nav_supervisor.ini'
            'csv2h5_IdrRedas.ini'
            'csv2h5_Idronaut.ini'
    :return:
    """
    global l
    cfg = cfg_from_args(my_argparser(), new_arg)
    if not cfg or not cfg['program'].get('return'):
        print('Can not initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'], cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')
    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in']['path'] = init_file_names(
            **{**cfg['in'], 'path': cfg['in']['db_path']}, b_interact=cfg['program']['b_interact'])
        set_field_if_no(cfg['in'], 'tables_log', '{}/logFiles')  # will be formatted with each table from cfg['in']['tables']
        cfg['in']['query'] = query_time_range(**cfg['in'])
        set_field_if_no(cfg['out'], 'db_path', cfg['in']['db_path'])
    except Ex_nothing_done as e:
        print(e.message)
        return ()

    # Open text log
    if 'log' in cfg['program'].keys():
        dir_create_if_need(os_path.dirname(cfg['program']['log']))
        flog = open(cfg['program']['log'], 'a+', encoding='cp1251')

    cfg['out']['log'] = OrderedDict({'fileName': None, 'fileChangeTime': None})

    # Prepare saving to csv
    if 'file_names_add_fun' in cfg['out']:
        file_names_add = eval(compile(cfg['out']['file_names_add_fun'], '', 'eval'))
    else:
        file_names_add = lambda i: '.csv'  # f'_{i}.csv'

    # Prepare data for output store and open it
    if cfg['out']['tables'] == ['None']:
        # will not write a new data table and its log
        cfg['out']['tables'] = None
        # cfg['out']['tables_log'] = None  # for _runs cfg will be redefined (the only None case that makes sense?)
    h5init(cfg['in'], cfg['out'])

    cfg_fileN = os_path.splitext(cfg['in']['cfgFile'])[0]
    out_tables_log = cfg['out'].get('tables_log')
    if cfg_fileN.endswith('_runs') or (bool(out_tables_log) and 'logRuns' in out_tables_log[0]):
        # Will calculate only after filter
        # todo: calculate derived parameters before they become bad (or replace all of them if any are bad?)
        func_before_cycle = lambda x: None
        func_before_filter = lambda df, log_row, cfg: df
        func_after_filter = lambda df, cfg: log_runs(df, cfg, cfg['out']['log'])

        # this table will be added:
        cfg['out']['tables_log'] = [cfg['out']['tables'][0] + '/logRuns']
        cfg['out']['b_log_ready'] = True  # to not update time range in h5_append()

        # Settings to not affect the main data table and to switch off incompatible options:
        cfg['out']['tables'] = []
        cfg['out']['b_skip_if_up_to_date'] = False
        # todo: If False, check it: need to delete all previous results of CTD_calc() or set min_time > its last log time. True not implemented?
        cfg['program']['b_log_display'] = False  # can not display multiple log rows
        if 'b_save_images' in cfg['extract_runs']:
            cfg['extract_runs']['path_images'] = cfg['out']['db_path'].with_name('_subproduct')
            dir_create_if_need(cfg['extract_runs']['path_images'])
    else:
        if 'brown' in cfg_fileN.lower():
            func_before_cycle = load_coef
            if 'Lat' in cfg['in']:
                func_before_filter = lambda *args, **kwargs: add_ctd_params(
                    process_brown(*args, **kwargs), kwargs['cfg'])
            else:
                func_before_filter = process_brown
        else:
            func_before_cycle = lambda x: None

            def ctd_coord_and_params(df: pd.DataFrame, log_row, cfg):
                coord_data_col_ensure(df, log_row)
                return add_ctd_params(df, cfg)

            func_before_filter = ctd_coord_and_params
        func_after_filter = lambda df, cfg: df  # nothing after filter

    func_before_cycle(cfg)  # prepare: usually assigns data to cfg['for']
    if cfg['out'].get('path_csv'):
        dir_create_if_need(cfg['out']['path_csv'])

    # Load data: main cycle ########################################
    # Open input store and cycle through input table log records
    qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
    iSt = 1

    dfLogOld, cfg['out']['db'], cfg['out']['b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    b_out_db_is_different = cfg['out']['db'] is not None and cfg['out']['db_path_temp'] != cfg['in']['db_path']
    # Cycle for each table, for each row in log:
    # for path_csv in gen_names_and_log(cfg['out'], dfLogOld):
    with FakeContextIfOpen(
            lambda f: pd.HDFStore(f, mode='r'),
            cfg['in']['db_path'],
            None if b_out_db_is_different else cfg['out']['db']
            ) as cfg['in']['db']:  # does not reopen ['in']['db'] if it is already opened for writing
        for tbl in cfg['in']['tables']:
            if False:  # Show table info
                nodes = sorted(cfg['out']['db'].root.__members__)  # , key=number_key
                print(nodes)
            print(tbl, end='. ')

            df_log = cfg['in']['db'].select(cfg['in']['tables_log'].format(tbl) or tbl,
                                            where=cfg['in']['query'])
            if True:  # try:
                if 'log' in cfg['program'].keys():
                    nRows = df_log.rows.size
                    flog.writelines(datetime.now().strftime('\n\n%d.%m.%Y %H:%M:%S> processed ')
                                    + f'{nRows} row' + ('s:' if nRows > 1 else ':'))

                for ifile, r in enumerate(df_log.itertuples(), start=iSt):  # name=None
                    print('.', end='')
                    sys_stdout.flush()

                    path_raw = PurePath(r.fileName)
                    cfg['out']['log'].update(fileName=path_raw.name, fileChangeTime=r.fileChangeTime)
                    # save current state
                    cfg['in']['file_stem'] = cfg['out']['log']['fileName']  # for example, to be able to extract the date in a subprogram
                    cfg['in']['fileChangeTime'] = cfg['out']['log']['fileChangeTime']

                    if cfg['in']['b_skip_if_up_to_date']:
                        have_older_data, have_duplicates = h5del_obsolete(
                            cfg['out'], cfg['out']['log'], dfLogOld)
                        if have_older_data:
                            continue
                        if have_duplicates:
                            cfg['out']['b_remove_duplicates'] = True
                    print('{}. {}'.format(ifile, path_raw.name), end=': ')

                    # Load data
                    qstr = qstr_trange_pattern.format(r.Index, r.DateEnd)
                    df_raw = cfg['in']['db'].select(tbl, qstr)
                    cols = df_raw.columns.tolist()

                    # cfg['in']['lat'] and ['lon'] may be needed in add_ctd_params() if Lat is not in df_raw
                    if 'Lat_en' in df_log.columns and 'Lat' not in cols:
                        cfg['in']['lat'] = np.nanmean((r.Lat_st, r.Lat_en))
                        cfg['in']['lon'] = np.nanmean((r.Lon_st, r.Lon_en))

                    df = func_before_filter(df_raw, log_row=r, cfg=cfg)

                    if df.size:  # size is zero means save only log but not data
                        # filter, updates cfg['out']['log']['rows']
                        df, _ = set_filterGlobal_minmax(df, cfg['filter'], cfg['out']['log'])
                        if 'rows' not in cfg['out']['log']:
                            l.warning('no data!')
                            continue
                        elif isinstance(cfg['out']['log']['rows'], int):
                            print('filtered out {rows_filtered}, remains {rows}'.format_map(cfg['out']['log']))
                            if cfg['out']['log']['rows']:
                                print('.', end='')
                            else:
                                l.warning('no data!')
                                continue

                    df = func_after_filter(df, cfg=cfg)

                    # Append to store
                    h5_append(cfg['out'], df, cfg['out']['log'], log_dt_from_utc=cfg['in']['dt_from_utc'])

                    # Copy to csv
                    if cfg['out'].get('path_csv'):
                        fname = '{:%y%m%d_%H%M}-{:%d_%H%M}'.format(r.Index, r.DateEnd) + file_names_add(ifile)
                        if 'data_columns' not in cfg['out']:
                            cfg['out']['data_columns'] = slice(0, -1)  # all cols
                        df.to_csv(  # [cfg['out']['data_columns']]
                            cfg['out']['path_csv'] / fname,
                            date_format=cfg['out']['text_date_format'],
                            float_format='%5.6g',
                            index_label='Time')  # to_string, line_terminator='\r\n'

                    # Log to screen (if not prohibited explicitly)
                    if cfg['out']['log'].get('Date0') is not None and (
                            ('b_log_display' not in cfg['program']) or cfg['program']['b_log_display']):
                        str_log = '{fileName}:\t{Date0:%d.%m.%Y %H:%M:%S}-' \
                                  '{DateEnd:%d. %H:%M:%S%z}\t{rows}rows'.format_map(
                                      cfg['out']['log'])  # \t{Lat}\t{Lon}\t{strOldVal}->\t{mag}
                        l.info(str_log)
                    else:
                        str_log = str(cfg['out']['log'].get('rows', '0'))
                    # Log to logfile
                    if 'log' in cfg['program'].keys():
                        flog.writelines('\n' + str_log)

    if b_out_db_is_different:
        try:
            if cfg['out']['tables'] is not None:
                print('')
                if cfg['out']['b_remove_duplicates']:
                    h5remove_duplicates(cfg['out'], cfg_table_keys=('tables', 'tables_log'))
                # Create full indexes. Must be done because of using ptprepack in h5move_tables() below
                l.debug('Create index')
                for tblName in (cfg['out']['tables'] + cfg['out']['tables_log']):
                    try:
                        cfg['out']['db'].create_table_index(tblName, columns=['index'], kind='full')
                    except Exception as e:
                        l.warning('table %s. Index not created - error: %s', tblName,
                                  '\n==> '.join([s for s in e.args if isinstance(s, str)]))
        except Exception as e:
            l.exception('The end. There is an error ')

            import traceback, code
            from sys import exc_info as sys_exc_info
            tb = sys_exc_info()[2]  # type, value,
            traceback.print_exc()
            last_frame = lambda tb=tb: last_frame(tb.tb_next) if tb.tb_next else tb
            frame = last_frame().tb_frame
            ns = dict(frame.f_globals)
            ns.update(frame.f_locals)
            code.interact(local=ns)
        finally:
            cfg['out']['db'].close()
            if cfg['program']['log']:
                flog.close()
            if cfg['out']['db'].is_open:
                print('Wait store is closing...')
                sleep(2)

            failed_storages = h5move_tables(cfg['out'])
            print('Finishing...' if failed_storages else 'Ok.', end=' ')
            h5index_sort(cfg['out'],
                         out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
                         in_storages=failed_storages)

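# A minimal usage sketch for main(). Assuming new_arg accepts an argv-style list, as parsed by
# cfg_from_args(my_argparser(), ...); the cfg file name and option values here are hypothetical.
def _example_main():
    cfg = main(['cfg/ctd_calc_runs.ini', '--verbose', 'INFO'])  # hypothetical arguments
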
def get_runs_parameters(df_raw, times_min, times_max, cols_good_data: Union[str, Sequence[str], None],
                        dt_from_utc: timedelta = timedelta(0), db=None, db_path=None, table_nav=None,
                        table_nav_cols=('Lat', 'Lon', 'DepEcho', 'Speed', 'Course'),
                        dt_search_nav_tolerance=timedelta(minutes=2)):
    """
    Collect run parameters at run start/end times, filling gaps from the navigation table
    :param df_raw: source data
    :param times_min: run start times
    :param times_max: run end times
    :param cols_good_data: cols of essential data that must be good (depth)
    :param dt_from_utc:
    :param db: opened store; if not None then `db_path` is not opened
    :param db_path:
    :param table_nav: 'navigation' table to find data absent in df_raw. Note: tries to find only positive vals
    :param table_nav_cols:
    :param dt_search_nav_tolerance:
    :return: log: dict of parameter arrays with '_st'/'_en' suffixes
    """
    log = {}
    log_update = {}  # {_st: DataFrame, _en: DataFrame} - dataframes of parameters for imin and imax
    for times_lim, suffix, log_time_col, i_search in ((times_min, '_st', 'Date0', 0),
                                                      (times_max, '_en', 'DateEnd', -1)):
        log_update[suffix] = df_raw.asof(times_lim, subset=cols_good_data)  # rows of last good data
        log[log_time_col] = timzone_view(log_update[suffix].index, dt_from_utc)
        # Search for the nearest good values if parameter p has bad values
        for (p, *isnan) in log_update[suffix].isna().T.itertuples(name=None):
            if i_search == -1:
                log_update[suffix].loc[isnan, p] = df_raw[p].asof(times_max[isnan])
            else:
                # "asof()"-alternative for 1st notna: take the 1st good element in each interval
                for time_nan, time_min, time_max in zip(times_lim[isnan], times_min[isnan], times_max[isnan]):
                    s_search = df_raw.loc[time_min:time_max, p]
                    try:
                        log_update[suffix].at[time_nan, p] = s_search[s_search.notna()].iat[0]  # same as .at[s_search.first_valid_index()]
                    except IndexError:
                        l.warning('no good values for parameter "%s" in run started %s', p, time_nan)
                        continue
        log_update[suffix] = log_update[suffix].add_suffix(suffix)

    log.update(
        dict([(k, v.values)
              for st_en in zip(log_update['_st'].items(), log_update['_en'].items())
              for k, v in st_en]))  # flatten pairs

    if table_nav:
        time_points = log_update['_st'].index.append(log_update['_en'].index)
        with FakeContextIfOpen(lambda f: pd.HDFStore(f, mode='r'), db_path, db) as store:
            df_nav, dt = h5select(  # all starts then all ends in a row
                store, table_nav,
                columns=table_nav_cols,
                time_points=time_points,
                dt_check_tolerance=dt_search_nav_tolerance)
            # todo: allow filtering for individual columns. Solution: use multiple calls for
            # columns that need filtering with an appropriate query_range_pattern argument of
            # h5select() (a per-column h5select() call with such a pattern did not work here,
            # so select directly below).
            isnan = df_nav.isna()
            for col in df_nav.columns[isnan.any(axis=0)]:
                # Note: tries to find only positive vals:
                df_nav_col = store.select(
                    table_nav,
                    where="index>=Timestamp('{}') & index<=Timestamp('{}') & {} > 0".format(
                        *(time_points[[0, -1]] + np.array((-dt_search_nav_tolerance, dt_search_nav_tolerance))),
                        col),
                    columns=[col])
                try:
                    vals = df_nav_col[col].values
                    vals = vals[inearestsorted(df_nav_col.index, time_points[isnan[col]])]
                except IndexError:
                    continue  # not found
                if vals.any():
                    df_nav.loc[isnan[col], col] = vals
        # df_nav['nearestNav'] = dt.astype('m8[s]').view(np.int64)

        df_edges_items_list = [
            df_edge.add_suffix(suffix).items() for suffix, df_edge in (
                ('_st', df_nav.iloc[:len(log_update['_st'])]),
                ('_en', df_nav.iloc[len(log_update['_st']):len(df_nav)]))
            ]
        for st_en in zip(*df_edges_items_list):
            for name, series in st_en:
                # If the parameter is already in log from the data table => update needed elements only
                if name in log:
                    b_need = np.isnan(log.get(name))
                    if b_need.any():
                        b_have = np.isfinite(series.values)  # from loaded nav in points
                        b_use = b_need & b_have
                        if b_use.any():
                            log[name][b_use] = series.values[b_use]
                        # # from all nav (not loaded): could load a range around st_en to search
                        # # nearest good values for specified fields and tolerance
                    continue
                # Else update all elements at once
                log[name] = series.values
    return log

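# A minimal usage sketch for get_runs_parameters() on synthetic data. The navigation arguments
# are omitted (table_nav=None keeps the function from opening any store), so only the df_raw
# part of the logic runs; column names are hypothetical.
def _example_get_runs_parameters():
    idx = pd.date_range('2020-01-01', periods=100, freq='s')
    df_raw = pd.DataFrame({'Pres': np.linspace(0, 50, 100), 'Temp': 10.0}, index=idx)
    log = get_runs_parameters(
        df_raw,
        times_min=idx[[0, 50]],   # run starts
        times_max=idx[[49, 99]],  # run ends
        cols_good_data='Pres')    # essential column that must be good
    print(log['Date0'], log['Pres_st'], log['Pres_en'])
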
def csv_to_h5_vaex(read_csv_args, to_hdf_args, dates_formats: Mapping[str, str],
                   correct_fun: Optional[Callable[[pd.DataFrame], None]] = None,
                   processing: Optional[Mapping[Tuple[Tuple[str], Tuple[str]], Callable[[Any], Any]]] = None,
                   out_cols: Optional[Sequence] = None,
                   continue_row=False):
    """
    Read csv and write to vaex hdf5
    :param read_csv_args: dict, must have keys: filepath_or_buffer, chunksize
    :param to_hdf_args:
        path_or_buf: default = read_csv_args['filepath_or_buffer'].with_suffix('.vaex.h5')
        mode: default = 'w' if not continue_row else 'a',
        key: hdf5 group name in the hdf5 file where to store data
        ...
    :param dates_formats:
        column: csv column name which needs to be converted from str to DateTime,
        date_format: date format
    :param processing: dict with keys: ((_input cols_), (_output cols_)) and values:
        function(_input cols_) that will be used returning _output cols_
    :param out_cols: default is all, excluding columns that are in inputs but not in outputs of
        custom param:processing
    :param continue_row: csv row number (excluding header) to start with, shifting the index.
        If the output file exists and continue_row = True then continue converting starting from the
        row equal to the last index in it - useful to continue after program interruption or csv
        appending. If it does not exist then start from row 0, giving it index 0.
        If continue_row = integer then start from this row, giving starting index = continue_row
    :param correct_fun: function applied to each chunk (which is a frame of column data of type str)
        immediately after reading by read_csv()
    :return:
    """
    from astropy.io import ascii

    if not to_hdf_args.get('path_or_buf'):  # give default name to output file
        to_hdf_args['path_or_buf'] = Path(read_csv_args['filepath_or_buffer']).with_suffix('.vaex.h5')

    # prepare vaex storing
    open_for_pandas_to_hdf = None
    tmp_save_pattern, tmp_search_pattern = h5pandas_to_vaex_file_names(path_out_str=str(to_hdf_args['path_or_buf']))
    ichunk = None

    # find csv row to start
    msg_start = f'Converting in chunks of {read_csv_args["chunksize"]} rows.'
    if continue_row is True:  # isinstance(continue_same_csv, bool)
        try:
            hdf5_list = glob.glob(tmp_search_pattern)
            if len(hdf5_list):  # continue interrupted csv_to_h5()
                hdf5_list.sort()
                file_last = hdf5_list[-1]
                lf.info('Found {:d} temporary files, continue from index found in last file', len(hdf5_list))
            else:  # add next csv data
                file_last = to_hdf_args['path_or_buf']
            with h5py.File(file_last, mode='r') as to_hdf_buf:
                # the index is stored at "table/columns/index"
                continue_row = to_hdf_buf['table/columns/index/data'][-1]
        except OSError:
            msg_start += ' No output file.'
            continue_row = None
        except KeyError:
            msg_start += ' No data in output file.'
            continue_row = None
        else:
            msg_start += ' Starting from last csv row in output file:'
    elif continue_row:
        msg_start += ' Starting from specified csv data row:'
    if continue_row:
        lf.info('{:s} {:d}...', msg_start, continue_row)
        read_csv_args['skiprows'] = read_csv_args.get('skiprows', 0) + continue_row
    else:
        lf.info('{:s} Beginning from csv row 0, giving it index 0...', msg_start)

    dtypes = read_csv_args['dtype']
    # Set default output cols
    if out_cols is None and processing:
        cols_in_used = set()
        cols_out_used = set()
        for (c_in, c_out) in processing.keys():
            cols_in_used.update(c_in)
            cols_out_used.update(c_out)
        cols2del = cols_in_used.difference(cols_out_used)
        out_cols = [col for col in dtypes.keys() if col not in cols2del]
    cols_out_used = set(out_cols if out_cols is not None else dtypes.keys())

    # prepare conversion to user specified types
    str_cols = []
    int_and_nans_cols = []
    other_cols = []
    for col, typ in dtypes.items():
        if out_cols and col not in cols_out_used:
            continue
        kind = typ[0]
        (str_cols if kind == 'S' else int_and_nans_cols if kind == 'I' else other_cols).append(col)
    str_not_dates = list(set(str_cols).difference(dates_formats.keys()))
    min_itemsize = {col: int(dtypes[col][1:]) for col in str_not_dates}

    with open(read_csv_args['filepath_or_buffer'], 'r') as read_csv_buf, \
            FakeContextIfOpen(open_for_pandas_to_hdf, to_hdf_args['path_or_buf']) as to_hdf_buf:
        read_csv_args.update({
            'filepath_or_buffer': read_csv_buf,
            'memory_map': True,
            'dtype': 'string'  # switch off read_csv dtypes conversion (if it fails it is hard to
            })                 # correct: would need to read the same csv place by pandas)
        to_hdf_args.update({
            'path_or_buf': to_hdf_buf,
            'format': 'table',
            'data_columns': True,
            'append': True,
            'min_itemsize': min_itemsize
            })
        rows_processed = 0
        rows_in_chunk = read_csv_args['chunksize']

        # alternative to pd.read_csv(**read_csv_args) but without dataframes
        tbls = ascii.read(read_csv_buf, format='csv', guess=False,
                          delimiter=read_csv_args['delimiter'],
                          data_start=read_csv_args.get('skiprows', 0),
                          names=read_csv_args['names'],
                          fast_reader={'chunk_size': read_csv_args['chunksize'], 'chunk_generator': True})

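# A minimal usage sketch for csv_to_h5_vaex(). The csv layout (column names, packed dtype
# strings like 'S19'/'f8'/'I4' and the date format) is a hypothetical example of the encoding
# this function expects in read_csv_args['dtype'].
def _example_csv_to_h5_vaex():
    csv_to_h5_vaex(
        read_csv_args={'filepath_or_buffer': 'data.csv', 'chunksize': 100_000,
                       'delimiter': ',', 'names': ['Time', 'Pres', 'SN'],
                       'dtype': {'Time': 'S19', 'Pres': 'f8', 'SN': 'I4'}},
        to_hdf_args={},  # output defaults to data.vaex.h5
        dates_formats={'Time': '%Y-%m-%dT%H:%M:%S'})
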
def csv_to_h5(
        read_csv_args, to_hdf_args, dates_formats: Mapping[str, str],
        correct_fun: Optional[Callable[[pd.DataFrame], None]] = None,
        processing: Optional[Mapping[Tuple[Tuple[str], Tuple[str]], Callable[[Any], Any]]] = None,
        out_cols: Optional[Sequence] = None,
        continue_row=False,
        vaex_format: Optional[bool] = None
        ):
    """
    Read csv and write to hdf5
    :param read_csv_args: dict, must have keys: filepath_or_buffer, chunksize
    :param to_hdf_args:
        path_or_buf: default = read_csv_args['filepath_or_buffer'].with_suffix('.vaex.h5' if vaex_format else '.h5')
        mode: default = 'w' if not continue_row else 'a',
        key: hdf5 group name in the hdf5 file where to store data
        ...
    :param dates_formats:
        column: csv column name which needs to be converted from str to DateTime,
        date_format: date format
    :param processing: dict with keys: ((_input cols_), (_output cols_)) and values:
        function(_input cols_) that will return _output cols_
    :param out_cols: default is all, excluding columns that are in inputs but not in outputs of
        custom param:processing
    :param continue_row: csv row number (excluding header) to start with, shifting the index.
        If the output file exists and continue_row = True then continue converting starting from the
        row equal to the last index in it - useful to continue after program interruption or csv
        appending. If it does not exist then start from row 0, giving it index 0.
        If continue_row = integer then start from this row, giving starting index = continue_row
    :param correct_fun: function applied to each chunk returned by read_csv(), which is a frame of
        column data of type str
    :param vaex_format: bool, how to write chunks:
        - True: to many vaex hdf5 files which at the end will be converted to a single vaex hdf5 file
        - False: appending to a single pandas hdf5 table
        - None: evaluates to True if to_hdf_args['path_or_buf'] has next to last suffix ".vaex", else to False
    :return:
    """
    if to_hdf_args.get('path_or_buf'):
        if vaex_format is None:
            vaex_format = Path(str(to_hdf_args['path_or_buf']).strip()).suffixes[:-1] == ['.vaex']
    else:  # give default name to output file
        to_hdf_args['path_or_buf'] = Path(read_csv_args['filepath_or_buffer']).with_suffix(
            f'{".vaex" if vaex_format else ""}.h5'
            )

    # Deal with vaex/pandas storing difference
    if vaex_format:
        open_for_pandas_to_hdf = None
        tmp_save_pattern, tmp_search_pattern = h5pandas_to_vaex_file_names(
            path_out_str=str(to_hdf_args['path_or_buf'])
            )
        ichunk = None
    else:
        def open_for_pandas_to_hdf(path_or_buf):
            return pd.HDFStore(
                to_hdf_args['path_or_buf'],
                to_hdf_args.get('mode', 'a' if continue_row else 'w')
                )

    # Find csv row to start
    msg_start = f'Converting in chunks of {read_csv_args["chunksize"]} rows.'
    if continue_row is True:  # isinstance(continue_same_csv, bool)
        try:
            if vaex_format:
                hdf5_list = glob.glob(tmp_search_pattern)
                if len(hdf5_list):  # continue interrupted csv_to_h5()
                    hdf5_list.sort()
                    file_last = hdf5_list[-1]
                    lf.info('Found {:d} temporary files, continue from index found in last file', len(hdf5_list))
                else:  # add next csv data
                    file_last = to_hdf_args['path_or_buf']
                with h5py.File(file_last, mode='r') as to_hdf_buf:
                    # the index is stored at "table/columns/index"
                    continue_row = to_hdf_buf['table/columns/index/data'][-1] + 1
            else:
                with pd.HDFStore(to_hdf_args['path_or_buf'], mode='r') as to_hdf_buf:
                    continue_row = to_hdf_buf.select(to_hdf_args['key'], columns=[], start=-1).index[-1] + 1
        except OSError:
            msg_start += ' No output file.'
            continue_row = None
        except KeyError:
            msg_start += ' No data in output file.'
            continue_row = None
        else:
            msg_start += ' Starting from next to last loaded csv row:'
    elif continue_row:
        msg_start += ' Starting from specified csv data row:'
    if continue_row:
        lf.info('{:s} {:d}...', msg_start, continue_row)
        read_csv_args['skiprows'] = read_csv_args.get('skiprows', 0) + continue_row
    else:
        lf.info('{:s} Beginning from csv row 0, giving it index 0...', msg_start)

    dtypes = read_csv_args['dtype']
    # Set default output cols if not set: output all columns we will have, except processing inputs
    # if they are not mentioned in processing outputs
    if out_cols is None and processing:
        cols_in_used = set()
        cols_out_used = set()
        for (c_in, c_out) in processing.keys():
            cols_in_used.update(c_in)
            cols_out_used.update(c_out)
        cols2del = cols_in_used.difference(cols_out_used)
        out_cols = [col for col in dtypes.keys() if col not in cols2del]
    cols_out_used = set(out_cols if out_cols is not None else dtypes.keys())

    # Group cols for conversion by the types specified
    str_cols = []
    int_and_nans_cols = []
    other_cols = []
    for col, typ in dtypes.items():
        if out_cols and col not in cols_out_used:
            continue
        kind = typ[0]
        (str_cols if kind == 'S' else int_and_nans_cols if kind == 'I' else other_cols).append(col)
    str_not_dates = list(set(str_cols).difference(dates_formats.keys()))
    min_itemsize = {col: int(dtypes[col][1:]) for col in str_not_dates}

    # Read csv, process, write hdf5
    with open(read_csv_args['filepath_or_buffer'], 'r') as read_csv_buf, \
            FakeContextIfOpen(open_for_pandas_to_hdf, to_hdf_args['path_or_buf']) as to_hdf_buf:
        read_csv_args.update({
            'filepath_or_buffer': read_csv_buf,
            'memory_map': True,
            'dtype': 'string'  # switch off read_csv dtypes conversion (if it fails it is hard to
            })                 # correct: would need to read the same csv place by pandas)
        to_hdf_args.update({
            'path_or_buf': to_hdf_buf,
            'format': 'table',
            'data_columns': True,
            'append': True,
            'min_itemsize': min_itemsize
            })

        for ichunk, chunk in enumerate(pd.read_csv(**read_csv_args)):
            if continue_row:
                if chunk.size == 0:
                    ichunk = np.ceil(continue_row / read_csv_args['chunksize']).astype(int) - 1
                    break  # continue_row is > data rows
                else:
                    chunk.index += continue_row
            lf.extra['id'] = f'chunk start row {chunk.index[0]:d}'
            if ichunk % 10 == 0:
                print(f'{ichunk}', end=' ')
            else:
                print('.', end='')
            if correct_fun:
                correct_fun(chunk)

            # Convert to user specified types

            # 1. dates str to DateTime
            for col, f in dates_formats.items():
                # the conversion of 'bytes' to 'strings' is needed for pd.to_datetime()
                try:
                    chunk[col] = pd.to_datetime(chunk[col], format=f)
                except ValueError as e:
                    lf.error('Conversion to datetime("{:s}" formatted as "{:s}") {:s} -> '
                             'Replacing malformed strings by NaT...', col, f, standard_error_info(e))
                    chunk[col] = pd.to_datetime(chunk[col], format=f, exact=False, errors='coerce')

            # 2. str to numeric for other_cols and int_and_nans_cols (pandas extension dtypes have
            # limited support, so we use numpy types instead, replacing NaNs by -1 to be able to
            # write to hdf5)
            chunk[other_cols] = chunk[other_cols].fillna('NaN')  # <NA> to a numpy-recognized equivalent string
            chunk[int_and_nans_cols] = chunk[int_and_nans_cols].fillna('-1')
            for col in (int_and_nans_cols + other_cols):
                typ = dtypes[col]
                if col in int_and_nans_cols:
                    is_integer = True
                    typ = f'i{typ[1:]}'  # typ.numpy_dtype
                else:
                    is_integer = np.dtype(typ).kind == 'i'
                try:
                    chunk[col] = chunk[col].astype(typ)
                    continue
                except (ValueError, OverflowError) as e:
                    # Cleaning. In case of OverflowError we do it here to prevent a ValueError
                    # while handling the OverflowError below.
                    pattern_match = r'^[\d]$' if is_integer else r'^-?[\d.]$'
                    ibad = ~chunk[col].str.match(pattern_match)
                    rep_val = '-1' if is_integer else 'NaN'
                    lf.error('Conversion {:s}("{:s}") {:s} -> replacing {:d} values not matching '
                             'pattern "{:s}" with "{:s}" and trying again...',
                             typ, col, standard_error_info(e), ibad.sum(), pattern_match, rep_val)
                    chunk.loc[ibad, col] = rep_val
                    # astype(str).replace(regex=True, to_replace=r'^.*[^\d.].*$', value=
                    try:
                        chunk[col] = chunk[col].astype(typ)
                    except (OverflowError, ValueError) as e:
                        # May be a bad value built from good symbols, e.g. r'^\d*\.\d*\.+\d*$', but
                        # instead of checking for it we do coerce_to_exact_dtype() on ValueError here too
                        lf.error('Conversion {:s}("{:s}") {:s} -> Replacing malformed strings and big numbers'
                                 ' by NaN ...', typ, col, standard_error_info(e))
                        chunk[col] = coerce_to_exact_dtype(chunk[col], dtype=typ)

            # Limit big strings' length and convert StringDtype to str to be able to save by to_hdf()
            for col, max_len in min_itemsize.items():
                chunk[col] = chunk[col].str.slice(stop=max_len)  # .apply(lambda x: x[:max_len]) does not handle <NA>
            chunk[str_not_dates] = chunk[str_not_dates].astype(str)

            # Apply specified data processing
            if processing:
                for (cols_in, c_out), fun in processing.items():
                    cnv_result = fun(chunk[list(cols_in)])
                    chunk[list(c_out)] = cnv_result

            # Save
            try:
                if vaex_format:
                    df = vaex.from_pandas(chunk if out_cols is None else chunk[out_cols])
                    df.export_hdf5(tmp_save_pattern.format(ichunk))
                else:  # better to move this command upper and process by vaex instead of pandas
                    (chunk if out_cols is None else chunk[out_cols]).to_hdf(**to_hdf_args)
            except Exception:
                lf.exception('write error')

        try:
            del lf.extra['id']
        except KeyError:
            lf.info('was no more data rows to read')

    # If the vaex store was specified then we have chunk files that we now combine by export_hdf5():
    if vaex_format:
        h5pandas_to_vaex_combine(tmp_search_pattern, str(to_hdf_args['path_or_buf']),
                                 check_files_number=ichunk + 1)

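# A minimal usage sketch for csv_to_h5() in pandas mode. As above, the csv layout and formats
# are hypothetical; continue_row=True makes repeated runs append after the last saved index.
def _example_csv_to_h5():
    csv_to_h5(
        read_csv_args={'filepath_or_buffer': 'data.csv', 'chunksize': 100_000,
                       'names': ['Time', 'Pres', 'Comment'],
                       'dtype': {'Time': 'S19', 'Pres': 'f8', 'Comment': 'S30'}},
        to_hdf_args={'path_or_buf': 'data.h5', 'key': 'data'},
        dates_formats={'Time': '%Y-%m-%dT%H:%M:%S'},
        continue_row=True,
        vaex_format=False)  # append to a single pandas hdf5 table
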