Example #1
def h5_tables_gen(db_path,
                  tables,
                  tables_log,
                  db=None) -> Iterator[Tuple[str, pd.HDFStore]]:
    """
    Generate table names with associated coefficients
    :param tables: tables names search pattern or sequence of table names
    :param tables_log: tables names for metadata of data in `tables`
    :param db_path:
    :param cfg_out: not used but kept for the requirement of h5_dispenser_and_names_gen() argument
    :return: iterator that returns (table name, coefficients)
    updates cfg_in['tables'] - sets to list of found tables in store
    """
    # log table name pattern: will be filled with each table name
    tbl_log_pattern = (tables_log[0] or
                       '{}/logRuns') if len(tables_log) == 1 else tables_log[0]
    with FakeContextIfOpen(lambda f: pd.HDFStore(f, mode='r'),
                           file=db_path,
                           opened_file_object=db) as store:
        if len(tables) == 1:
            tables = h5find_tables(store, tables[0])
        for tbl, tbl_log in zip_longest(tables,
                                        tables_log,
                                        fillvalue=tbl_log_pattern):
            yield tbl, tbl_log.format(tbl), store
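
# Usage sketch (added, not from the original module): consume the generator, which
# yields (data table, log table, opened store) triples. The store name 'incl.h5'
# and the search pattern 'incl.*' are assumptions for illustration only.
for tbl, tbl_log, store in h5_tables_gen('incl.h5', tables=['incl.*'], tables_log=['{}/logRuns']):
    df_log = store.select(tbl_log)  # run/file metadata for this table
    print(f'{tbl}: {len(df_log)} log rows in {tbl_log}')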
Example #2
    def save_operation(h5source=None):
        """
        Update dict_matrices in h5file_dest. h5source may be used as the source to copy from.
        :param h5source: opened h5py.File; if not None, copy h5file_source//tbl_source//coef to h5file_dest//tbl_dest//coef before the update
        uses from the enclosing scope:
            h5file_dest
            tbl_dest, tbl_source
            dict_matrices
        """
        nonlocal dict_matrices

        with FakeContextIfOpen(lambda f: h5py.File(f, 'a'), h5file_dest) as h5dest:
            try:
                if h5source is None:
                    if tbl_dest != tbl_source:
                        h5source = h5dest
                    else:
                        raise FileExistsError(f'Cannot copy to itself {h5dest.filename}//{tbl_dest}')
                elif path_h5(h5dest) == path_h5(h5source) and tbl_dest == tbl_source:
                    raise FileExistsError(f'Cannot copy to itself {h5dest.filename}//{tbl_dest}')

                # Copy using provided paths:
                if h5source:
                    path_coef = f'//{tbl_source}//coef'
                    l.info(f'copying "coef" from {path_h5(h5source)}//{tbl_source} to {h5dest.filename}//{tbl_dest}')
                    # Reuse previous calibration structure:
                    # import pdb; pdb.set_trace()
                    # h5source.copy('//' + tbl_source + '//coef', h5dest[tbl_dest + '//coef'])
                    try:
                        h5source.copy(path_coef, h5dest[tbl_dest])
                        # h5source[tbl_source].copy('', h5dest[tbl_dest], name='coef')
                    except RuntimeError as e:  # Unable to copy object (destination object already exists)
                        replace_coefs_group_on_error(h5source, h5dest, path_coef, e)
                    except KeyError:  # Unable to open object (object 'incl_b11' doesn't exist)
                        l.warning('Creating "%s"', tbl_source)

                        try:
                            h5dest.create_group(tbl_source)
                        except (ValueError, KeyError) as e:  # already exists
                            replace_coefs_group_on_error(h5source, h5dest, tbl_source, e)
                        else:
                            h5source.copy(path_coef, h5dest[tbl_dest])

            except FileExistsError:
                if dict_matrices is None:
                    raise

            if dict_matrices:  # not None and not empty
                have_values = isinstance(dict_matrices, dict)
                l.info(f'updating {h5file_dest}/{tbl_dest}/{dict_matrices}')  # .keys()

                if have_values:  # Save provided values:
                    for k, data in dict_matrices.items():
                        path = f'{tbl_dest}{k}'
                        if isinstance(data, (int, float)):
                            data = np.atleast_1d(data)  # Veusz can't load 0d single values
                        try:
                            b_isnan = np.isnan(data)
                            if np.any(b_isnan):
                                l.warning('not writing NaNs: %s%s...', k, np.flatnonzero(b_isnan))
                                h5dest[path][~b_isnan] = data[~b_isnan]
                            else:
                                h5dest[path][...] = data
                        except TypeError as e:
                            l.error('Replacing dataset "%s" TypeError: %s -> recreating...', path,
                                    '\n==> '.join([a for a in e.args if isinstance(a, str)]))
                            # or if you want to replace the dataset with some other dataset of different shape:
                            del h5dest[path]
                            h5dest.create_dataset(path, data=data, dtype=np.float64)
                        except KeyError as e:  # Unable to open object (component not found)
                            l.warning('Creating "%s"', path)
                            h5dest.create_dataset(path, data=data, dtype=np.float64)
                else:
                    paths = list(dict_matrices)
                    dict_matrices = {}
                    for rel_path in paths:
                        path = tbl_source + rel_path
                        try:
                            dict_matrices[path] = h5source[path][...]
                        except AttributeError as e:  # 'ellipsis' object has no attribute 'encode'
                            l.error(
                                'Skip update coef: dict_matrices must be None or its items must point to matrices %s',
                                '\n==> '.join(a for a in e.args if isinstance(a, str)))
                            continue
                        h5dest[path][...] = dict_matrices[path]

                h5dest.flush()
            else:
                dict_matrices = {}
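
# Sketch (added) of the behaviour that FakeContextIfOpen is relied upon for in these
# examples (the real helper lives elsewhere in the project, so this is an assumption):
# open `file` with `fun_open` and close it on exit, or pass an already opened object
# (or the raw `file` when `fun_open` is None) through unchanged, leaving closing to the caller.
from contextlib import contextmanager


@contextmanager
def fake_context_if_open(fun_open, file, opened_file_object=None):
    if opened_file_object is not None or fun_open is None:
        yield file if opened_file_object is None else opened_file_object
    else:
        handle = fun_open(file)
        try:
            yield handle
        finally:
            handle.close()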
Example #3
def h5copy_coef(h5file_source=None, h5file_dest=None, tbl=None, tbl_source=None, tbl_dest=None,
                dict_matrices: Union[Mapping[str, np.ndarray], Iterable[str], None] = None, ok_to_replace_group=False):
    """
    Copy tbl from h5file_source to h5file_dest overwriting tbl + '/coef/H/A and '/coef/H/C' with H and C if provided
    :param h5file_source: name of any hdf5 file with existed coef to copy structure
    :param h5file_dest: name of hdf5 file to paste structure
    :param dict_matrices: dict of numpy arrays - to write or list of paths to coefs (to matrices) under tbl - to copy them
    # Example save H and C: 3x3 and 1x3, rotation and shift matrices
    >>> h5copy_coef(h5file_source,h5file_dest,tbl)
            dict_matrices={'//coef//H//A': H,
                           '//coef//H//C': C})
    """

    if h5file_dest is None:
        h5file_dest = h5file_source
    if h5file_source is None:
        if h5file_dest is None:
            print('skipping: output not specified')
            return
        h5file_source = h5file_dest

    if tbl_source is None:
        tbl_source = tbl
    if tbl_dest is None:
        tbl_dest = tbl

    # class File_context:
    #     """
    #     If input is string filename then acts like usual open context manager
    #     else treat input as opened file object and do nothing
    #     """
    #
    #     def __init__(self, h5file_init):
    #         self.h5file_init = h5file_init
    #
    #     def __enter__(self):
    #         if isinstance(self.h5file_init, str):
    #             self.h5file = h5py.File(self.h5file_init, 'a')
    #             return self.h5file
    #         else:
    #             self.h5file = self.h5file_init
    #
    #     def __exit__(self, exc_type, ex_value, ex_traceback):
    #         if exc_type is None and isinstance(self.h5file_init, str):
    #             self.h5file.close()
    #         return False

    def path_h5(file):
        return Path(file.filename if isinstance(file, h5py.File) else file)

    def save_operation(h5source=None):
        """
        Update dict_matrices in h5file_dest. h5source may be used as the source to copy from.
        :param h5source: opened h5py.File; if not None, copy h5file_source//tbl_source//coef to h5file_dest//tbl_dest//coef before the update
        uses from the enclosing scope:
            h5file_dest
            tbl_dest, tbl_source
            dict_matrices
        """
        nonlocal dict_matrices

        with FakeContextIfOpen(lambda f: h5py.File(f, 'a'), h5file_dest) as h5dest:
            try:
                if h5source is None:
                    if tbl_dest != tbl_source:
                        h5source = h5dest
                    else:
                        raise FileExistsError(f'Cannot copy to itself {h5dest.filename}//{tbl_dest}')
                elif path_h5(h5dest) == path_h5(h5source) and tbl_dest == tbl_source:
                    raise FileExistsError(f'Cannot copy to itself {h5dest.filename}//{tbl_dest}')

                # Copy using provided paths:
                if h5source:
                    path_coef = f'//{tbl_source}//coef'
                    l.info(f'copying "coef" from {path_h5(h5source)}//{tbl_source} to {h5dest.filename}//{tbl_dest}')
                    # Reuse previous calibration structure:
                    # import pdb; pdb.set_trace()
                    # h5source.copy('//' + tbl_source + '//coef', h5dest[tbl_dest + '//coef'])
                    try:
                        h5source.copy(path_coef, h5dest[tbl_dest])
                        # h5source[tbl_source].copy('', h5dest[tbl_dest], name='coef')
                    except RuntimeError as e:  # Unable to copy object (destination object already exists)
                        replace_coefs_group_on_error(h5source, h5dest, path_coef, e)
                    except KeyError:  # Unable to open object (object 'incl_b11' doesn't exist)
                        l.warning('Creating "%s"', tbl_source)

                        try:
                            h5dest.create_group(tbl_source)
                        except (ValueError, KeyError) as e:  # already exists
                            replace_coefs_group_on_error(h5source, h5dest, tbl_source, e)
                        else:
                            h5source.copy(path_coef, h5dest[tbl_dest])

            except FileExistsError:
                if dict_matrices is None:
                    raise

            if dict_matrices:  # not None and not empty
                have_values = isinstance(dict_matrices, dict)
                l.info(f'updating {h5file_dest}/{tbl_dest}/{dict_matrices}')  # .keys()

                if have_values:  # Save provided values:
                    for k, data in dict_matrices.items():
                        path = f'{tbl_dest}{k}'
                        if isinstance(data, (int, float)):
                            data = np.atleast_1d(data)  # Veusz can't load 0d single values
                        try:
                            b_isnan = np.isnan(data)
                            if np.any(b_isnan):
                                l.warning('not writing NaNs: %s%s...', k, np.flatnonzero(b_isnan))
                                h5dest[path][~b_isnan] = data[~b_isnan]
                            else:
                                h5dest[path][...] = data
                        except TypeError as e:
                            l.error('Replacing dataset "%s" TypeError: %s -> recreating...', path,
                                    '\n==> '.join([a for a in e.args if isinstance(a, str)]))
                            # or if you want to replace the dataset with some other dataset of different shape:
                            del h5dest[path]
                            h5dest.create_dataset(path, data=data, dtype=np.float64)
                        except KeyError as e:  # Unable to open object (component not found)
                            l.warning('Creating "%s"', path)
                            h5dest.create_dataset(path, data=data, dtype=np.float64)
                else:
                    paths = list(dict_matrices)
                    dict_matrices = {}
                    for rel_path in paths:
                        path = tbl_source + rel_path
                        try:
                            dict_matrices[path] = h5source[path][...]
                        except AttributeError as e:  # 'ellipsis' object has no attribute 'encode'
                            l.error(
                                'Skip update coef: dict_matrices must be None or its items must point to matrices %s',
                                '\n==> '.join(a for a in e.args if isinstance(a, str)))
                            continue
                        h5dest[path][...] = dict_matrices[path]

                h5dest.flush()
            else:
                dict_matrices = {}

            # or if you want to replace the dataset with some other dataset of different shape:
            # del f1['meas/frame1/data']
            # h5dest.create_dataset(tbl_dest + '//coef_cal//H//A', data= A  , dtype=np.float64)
            # h5dest.create_dataset(tbl_dest + '//coef_cal//H//C', data= C, dtype=np.float64)
            # h5dest[tbl_dest + '//coef//H//C'][:] = C

    def replace_coefs_group_on_error(h5source, h5dest, path, e=None):
        if ok_to_replace_group:
            l.warning('Replacing group "%s"', path)
            del h5dest[path]
            h5source.copy(path, h5dest[tbl_dest])
        else:
            l.error('Skip copy coef' + (f': {standard_error_info(e)}!' if e else '!'))

    # try:
    with FakeContextIfOpen(
            (lambda f: h5py.File(f, 'r')) if h5file_source != h5file_dest else None,
            h5file_source) as h5source:
        save_operation(h5source)

    # if h5file_source != h5file_dest:
    #     with h5py.File(h5file_source, 'r') as h5source:
    #         save_operation(h5source)
    # else:
    #     save_operation()
    # except Exception as e:
    #     raise e.__class__('Error in save_operation()')

    # Confirm the changes were properly made and saved:
    b_ok = True
    with FakeContextIfOpen(lambda f: h5py.File(f, 'r'), h5file_dest) as h5dest:
        for k, v in dict_matrices.items():
            if not np.allclose(h5dest[tbl_dest + k][...], v, equal_nan=True):
                l.error(f'h5copy_coef(): coef. {tbl_dest + k} not updated!')
                b_ok = False
    if b_ok and dict_matrices:
        print('h5copy_coef() has updated coef. Ok>')
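
# Usage sketch (added): writing explicit coefficient matrices, as in the docstring
# example. File and table names and the matrix values are assumptions for illustration.
import numpy as np

H = np.eye(3)
C = np.zeros((1, 3))
h5copy_coef('coefs_source.h5', 'coefs_dest.h5', tbl='incl_b11',
            dict_matrices={'//coef//H//A': H, '//coef//H//C': C})
# Passing a list of paths instead (e.g. ['//coef//H//A']) copies those matrices
# from the source table rather than writing new values.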
Example #4
def main(new_arg=None):
    """

    :param new_arg: returns cfg if new_arg == '<cfg_from_args>', but it will be None if
     '-h' or '-v' is passed in the arguments (argv[1:])
    argv[1] is cfgFile. It was used with cfg files:
        'csv2h5_nav_supervisor.ini'
        'csv2h5_IdrRedas.ini'
        'csv2h5_Idronaut.ini'
    :return:
    """

    global l
    cfg = cfg_from_args(my_argparser(), new_arg)
    if not cfg or not cfg['program'].get('return'):
        print('Cannot initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')
    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in'][
            'path'] = init_file_names(**{
                **cfg['in'], 'path': cfg['in']['db_path']
            },
                                      b_interact=cfg['program']['b_interact'])
        set_field_if_no(
            cfg['in'], 'tables_log', '{}/logFiles'
        )  # the pattern will be filled with each table name from cfg['in']['tables']
        cfg['in']['query'] = query_time_range(**cfg['in'])
        set_field_if_no(cfg['out'], 'db_path', cfg['in']['db_path'])
        # cfg['out'] = init_file_names(cfg['out'], , path_field='db_path')
    except Ex_nothing_done as e:
        print(e.message)
        return ()

    # args = parser.parse_args()
    # args.verbose= args.verbose[0]
    # try:
    #     cfg= ini2dict(args.cfgFile)
    #     cfg['in']['cfgFile']= args.cfgFile
    # except IOError as e:
    #     print('\n==> '.join([a for a in e.args if isinstance(a,str)])) #e.message
    #     raise(e)
    # Open text log
    if 'log' in cfg['program'].keys():
        dir_create_if_need(os_path.dirname(cfg['program']['log']))
        flog = open(cfg['program']['log'], 'a+', encoding='cp1251')

    cfg['out']['log'] = OrderedDict({'fileName': None, 'fileChangeTime': None})

    # Prepare saving to csv
    if 'file_names_add_fun' in cfg['out']:
        file_names_add = eval(
            compile(cfg['out']['file_names_add_fun'], '', 'eval'))
    else:
        file_names_add = lambda i: '.csv'  # f'_{i}.csv'

    # Prepare data for output store and open it
    if cfg['out']['tables'] == ['None']:
        # will not write new data table and its log
        cfg['out']['tables'] = None
        # cfg['out']['tables_log'] = None  # for _runs cfg will be redefined (this only None case that have sense?)

    h5init(cfg['in'], cfg['out'])
    # store, dfLogOld = h5temp_open(**cfg['out'])

    cfg_fileN = os_path.splitext(cfg['in']['cfgFile'])[0]
    out_tables_log = cfg['out'].get('tables_log')
    if cfg_fileN.endswith('_runs') or (bool(out_tables_log)
                                       and 'logRuns' in out_tables_log[0]):

        # Will calculate only after filter  # todo: calculate derived parameters before rows where they are bad are removed (or replace all of them if any are bad?)
        func_before_cycle = lambda x: None
        func_before_filter = lambda df, log_row, cfg: df
        func_after_filter = lambda df, cfg: log_runs(df, cfg, cfg['out']['log']
                                                     )

        # this table will be added:
        cfg['out']['tables_log'] = [cfg['out']['tables'][0] + '/logRuns']
        cfg['out'][
            'b_log_ready'] = True  # so that h5_append() will not update the time range

        # Settings to not affect main data table and switch off not compatible options:
        cfg['out']['tables'] = []
        cfg['out'][
            'b_skip_if_up_to_date'] = False  # todo: if False, need to delete all previous results of CTD_calc() or set min_time > its last log time. True is not implemented?
        cfg['program'][
            'b_log_display'] = False  # cannot display a multi-row log
        if 'b_save_images' in cfg['extract_runs']:
            cfg['extract_runs']['path_images'] = cfg['out'][
                'db_path'].with_name('_subproduct')
            dir_create_if_need(cfg['extract_runs']['path_images'])
    else:
        if 'brown' in cfg_fileN.lower():
            func_before_cycle = load_coef
            if 'Lat' in cfg['in']:
                func_before_filter = lambda *args, **kwargs: add_ctd_params(
                    process_brown(*args, **kwargs), kwargs['cfg'])
            else:
                func_before_filter = process_brown
        else:
            func_before_cycle = lambda x: None

            def ctd_coord_and_params(df: pd.DataFrame, log_row, cfg):
                coord_data_col_ensure(df, log_row)
                return add_ctd_params(df, cfg)

            func_before_filter = ctd_coord_and_params
        func_after_filter = lambda df, cfg: df  # nothing after filter

    func_before_cycle(cfg)  # prepare: usually assign data to cfg['for']
    if cfg['out'].get('path_csv'):
        dir_create_if_need(cfg['out']['path_csv'])
    # Load data. Main cycle #########################################
    # Open input store and cycle through input table log records
    qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
    iSt = 1

    dfLogOld, cfg['out']['db'], cfg['out'][
        'b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    b_out_db_is_different = cfg['out']['db'] is not None and cfg['out'][
        'db_path_temp'] != cfg['in']['db_path']
    # Cycle for each table, for each row in log:
    # for path_csv in gen_names_and_log(cfg['out'], dfLogOld):
    with FakeContextIfOpen(
            lambda f: pd.HDFStore(f, mode='r'), cfg['in']['db_path'],
            None if b_out_db_is_different else cfg['out']['db']
    ) as cfg['in']['db']:  # does not open ['in']['db'] if it is already opened for writing

        for tbl in cfg['in']['tables']:
            if False:  # Show table info
                nodes = sorted(
                    cfg['out']['db'].root.__members__)  # , key=number_key
                print(nodes)
            print(tbl, end='. ')

            df_log = cfg['in']['db'].select(cfg['in']['tables_log'].format(tbl)
                                            or tbl,
                                            where=cfg['in']['query'])
            if True:  # try:
                if 'log' in cfg['program'].keys():
                    nRows = len(df_log)
                    flog.writelines(datetime.now().strftime(
                        '\n\n%d.%m.%Y %H:%M:%S> processed ') + f'{nRows} row' +
                                    ('s:' if nRows > 1 else ':'))

                for ifile, r in enumerate(df_log.itertuples(),
                                          start=iSt):  # name=None
                    print('.', end='')
                    sys_stdout.flush()

                    path_raw = PurePath(r.fileName)
                    cfg['out']['log'].update(fileName=path_raw.name,
                                             fileChangeTime=r.fileChangeTime)
                    # save current state
                    cfg['in']['file_stem'] = cfg['out']['log'][
                        'fileName']  # e.g. so that the date can be extracted in a subprogram
                    cfg['in']['fileChangeTime'] = cfg['out']['log'][
                        'fileChangeTime']

                    if cfg['in']['b_skip_if_up_to_date']:
                        have_older_data, have_duplicates = h5del_obsolete(
                            cfg['out'], cfg['out']['log'], dfLogOld)
                        if have_older_data:
                            continue
                        if have_duplicates:
                            cfg['out']['b_remove_duplicates'] = True
                    print('{}. {}'.format(ifile, path_raw.name), end=': ')

                    # Load data
                    qstr = qstr_trange_pattern.format(r.Index, r.DateEnd)
                    df_raw = cfg['in']['db'].select(tbl, qstr)
                    cols = df_raw.columns.tolist()

                    # cfg['in']['lat'] and ['lon'] may be needed in add_ctd_params() if Lat not in df_raw
                    if 'Lat_en' in df_log.columns and 'Lat' not in cols:
                        cfg['in']['lat'] = np.nanmean((r.Lat_st, r.Lat_en))
                        cfg['in']['lon'] = np.nanmean((r.Lon_st, r.Lon_en))

                    df = func_before_filter(df_raw, log_row=r, cfg=cfg)

                    if df.size:  # zero size means: save only the log, not the data
                        # filter, updates cfg['out']['log']['rows']
                        df, _ = set_filterGlobal_minmax(
                            df, cfg['filter'], cfg['out']['log'])
                    if 'rows' not in cfg['out']['log']:
                        l.warning('no data!')
                        continue
                    elif isinstance(cfg['out']['log']['rows'], int):
                        print('filtered out {rows_filtered}, remains {rows}'.
                              format_map(cfg['out']['log']))
                        if cfg['out']['log']['rows']:
                            print('.', end='')
                        else:
                            l.warning('no data!')
                            continue

                    df = func_after_filter(df, cfg=cfg)

                    # Append to Store
                    h5_append(cfg['out'],
                              df,
                              cfg['out']['log'],
                              log_dt_from_utc=cfg['in']['dt_from_utc'])

                    # Copy to csv
                    if cfg['out'].get('path_csv'):
                        fname = '{:%y%m%d_%H%M}-{:%d_%H%M}'.format(
                            r.Index, r.DateEnd) + file_names_add(ifile)
                        if 'data_columns' not in cfg['out']:
                            cfg['out']['data_columns'] = slice(0,
                                                               -1)  # all cols
                        df.to_csv(  # [cfg['out']['data_columns']]
                            cfg['out']['path_csv'] / fname,
                            date_format=cfg['out']['text_date_format'],
                            float_format='%5.6g',
                            index_label='Time'
                        )  # to_string, line_terminator='\r\n'

                    # Log to screen (if not prohibited explicitly)
                    if cfg['out']['log'].get('Date0') is not None and (
                        ('b_log_display' not in cfg['program'])
                            or cfg['program']['b_log_display']):
                        str_log = '{fileName}:\t{Date0:%d.%m.%Y %H:%M:%S}-' \
                                  '{DateEnd:%d. %H:%M:%S%z}\t{rows}rows'.format_map(
                            cfg['out']['log'])  # \t{Lat}\t{Lon}\t{strOldVal}->\t{mag}
                        l.info(str_log)
                    else:
                        str_log = str(cfg['out']['log'].get('rows', '0'))
                    # Log to logfile
                    if 'log' in cfg['program'].keys():
                        flog.writelines('\n' + str_log)

    if b_out_db_is_different:
        try:
            if cfg['out']['tables'] is not None:
                print('')
                if cfg['out']['b_remove_duplicates']:
                    h5remove_duplicates(cfg['out'],
                                        cfg_table_keys=('tables',
                                                        'tables_log'))
                # Create full indexes. Must be done because of using ptprepack in h5move_tables() below
                l.debug('Create index')
                for tblName in (cfg['out']['tables'] +
                                cfg['out']['tables_log']):
                    try:
                        cfg['out']['db'].create_table_index(tblName,
                                                            columns=['index'],
                                                            kind='full')
                    except Exception as e:
                        l.warning(
                            'table %s. Index not created - error: %s', tblName,
                            '\n==> '.join(
                                [s for s in e.args if isinstance(s, str)]))
        except Exception as e:
            l.exception('The end. There is an error ')

            import traceback, code
            from sys import exc_info as sys_exc_info
            tb = sys_exc_info()[2]  # type, value,
            traceback.print_exc()
            last_frame = lambda tb=tb: last_frame(tb.tb_next
                                                  ) if tb.tb_next else tb
            frame = last_frame().tb_frame
            ns = dict(frame.f_globals)
            ns.update(frame.f_locals)
            code.interact(local=ns)
        finally:

            cfg['out']['db'].close()
            if cfg['program'].get('log'):
                flog.close()
            if cfg['out']['db'].is_open:
                print('Wait store is closing...')
                sleep(2)

            failed_storages = h5move_tables(cfg['out'])
            print('Finishing...' if failed_storages else 'Ok.', end=' ')
            h5index_sort(
                cfg['out'],
                out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
                in_storages=failed_storages)
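
# Sketch (added) of the time-range query pattern used in the cycle above to load one
# log record's data from the input HDFStore; the table name and timestamps are
# illustrative assumptions.
qstr = "index>=Timestamp('{}') & index<=Timestamp('{}')".format(
    '2020-06-01 12:00:00', '2020-06-01 14:00:00')
# df_raw = store.select('CTD_Idronaut', qstr)  # store: an opened pd.HDFStore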
Example #5
def get_runs_parameters(df_raw,
                        times_min,
                        times_max,
                        cols_good_data: Union[str, Sequence[str], None],
                        dt_from_utc: timedelta = timedelta(0),
                        db=None,
                        db_path=None,
                        table_nav=None,
                        table_nav_cols=('Lat', 'Lon', 'DepEcho', 'Speed',
                                        'Course'),
                        dt_search_nav_tolerance=timedelta(minutes=2)):
    """

    :param df_raw:
    :param times_min:
    :param times_max:
    :param cols_good_data: cols of essential data that must be good (depth)
    :param dt_from_utc:
    :param db:
    :param db_path:
    :param table_nav: 'navigation' table to find data absent in df_raw. Note: tries to find only positive vals
    :param table_nav_cols:
    :param dt_search_nav_tolerance:
    :return:
    """

    log = {}
    log_update = {
    }  # {'_st': DataFrame, '_en': DataFrame} - dataframes of parameters at start and end times
    for times_lim, suffix, log_time_col, i_search in ((times_min, '_st',
                                                       'Date0', 0),
                                                      (times_max, '_en',
                                                       'DateEnd', -1)):
        log_update[suffix] = df_raw.asof(
            times_lim, subset=cols_good_data)  # rows of last good data
        log[log_time_col] = timzone_view(log_update[suffix].index, dt_from_utc)

        # Search for nearest good values if have bad parameter p
        for (p, *isnan) in log_update[suffix].isna().T.itertuples(name=None):
            if i_search == -1:
                log_update[suffix].loc[isnan,
                                       p] = df_raw[p].asof(times_max[isnan])
            else:
                # "asof()"-alternative for 1st notna: take 1st good element in each interval
                for time_nan, time_min, time_max in zip(
                        times_lim[isnan], times_min[isnan], times_max[isnan]):
                    s_search = df_raw.loc[time_min:time_max, p]

                    try:
                        log_update[suffix].at[time_nan, p] = s_search[
                            s_search.notna()].iat[
                                0]  # same as .at[s_search.first_valid_index()]
                    except IndexError:
                        l.warning(
                            'no good values for parameter "%s" in run started %s',
                            p, time_nan)
                        continue
        log_update[suffix] = log_update[suffix].add_suffix(suffix)
    log.update(  # pd.DataFrame(, index=log_update['_st'].index).rename_axis('Date0')
        dict((k, v.values)  # flatten pairs
             for st_en in zip(log_update['_st'].items(), log_update['_en'].items())
             for k, v in st_en))
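    # (added note) after the update, `log` holds start/end values under alternating keys,
    # e.g. 'Lat_st', 'Lat_en', 'Lon_st', 'Lon_en', ... for each parameter column present in df_raw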

    if table_nav:
        time_points = log_update['_st'].index.append(log_update['_en'].index)
        with FakeContextIfOpen(lambda f: pd.HDFStore(f, mode='r'), db_path,
                               db) as store:
            df_nav, dt = h5select(  # all starts then all ends in row
                store,
                table_nav,
                columns=table_nav_cols,
                time_points=time_points,
                dt_check_tolerance=dt_search_nav_tolerance)

            # {:0.0f}s'.format(cfg['out']['dt_search_nav_tolerance'].total_seconds())
            # todo: allow filter for individual columns. solution: use multiple calls for columns that need filtering with appropriate query_range_pattern argument of h5select()
            isnan = df_nav.isna()
            for col in df_nav.columns[isnan.any(axis=0)]:

                # does not work:
                # df_nav_col, dt_col = h5select(  # for current parameter's name
                #         cfg['in']['db'], cfg['in']['table_nav'],
                #         columns=[col],
                #         query_range_lims=time_points[[0,-1]],
                #         time_points=time_points[isnan[col]],
                #         query_range_pattern = f"index>=Timestamp('{{}}') & index<=Timestamp('{{}}') & {col} > 0 ",
                #         dt_check_tolerance=cfg['out']['dt_search_nav_tolerance']
                #         )

                # Note: tries to find only positive vals:
                df_nav_col = store.select(
                    table_nav,
                    where="index>=Timestamp('{}') & index<=Timestamp('{}') & {} > 0"
                    .format(
                        *(time_points[[0, -1]] + np.array(
                            (-dt_search_nav_tolerance, dt_search_nav_tolerance))),
                        col),
                    columns=[col])
                try:
                    vals = df_nav_col[col].values
                    vals = vals[inearestsorted(df_nav_col.index,
                                               time_points[isnan[col]])]
                except IndexError:
                    continue  # not found
                if vals.any():
                    df_nav.loc[isnan[col], col] = vals

        # df_nav['nearestNav'] = dt.astype('m8[s]').view(np.int64)
        df_edges_items_list = [
            df_edge.add_suffix(suffix).items() for suffix, df_edge in (
                ('_st', df_nav.iloc[:len(log_update['_st'])]),
                ('_en', df_nav.iloc[len(log_update['_st']):len(df_nav)]))
        ]

        for st_en in zip(*df_edges_items_list):
            for name, series in st_en:
                # If already present from the data table => update only the needed elements
                if name in log:
                    b_need = np.isnan(log.get(name))
                    if b_need.any():
                        b_have = np.isfinite(series.values)
                        # from loaded nav in points
                        b_use = b_need & b_have
                        if b_use.any():
                            log[name][b_use] = series.values[b_use]
                        # # from all nav (not loaded)
                        # b_need &= ~b_have
                        #
                        # if b_need.any():
                        #     # load range to search nearest good val. for specified fields and tolerance
                        #     df = cfg['in']['db'].select(cfg['in']['table_nav'], where=query_range_pattern.format(st_en.index), columns=name)

                        # df_nav = h5select(  # for current parameter's name
                        #     cfg['in']['db'], cfg['in']['table_nav'],
                        #     columns=name,
                        #     query_range_lims=st_en
                        #     time_points=log_update['_st'].index.append(log_update['_en'].index),
                        #     dt_check_tolerance=cfg['out']['dt_search_nav_tolerance']
                        #     )
                    continue
                # else:
                #     b_need = np.isnan(series.values)
                #     for

                # Else update all elements at once
                log[name] = series.values
    return log
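
# Usage sketch (added): collect start/end parameters for one run without the optional
# navigation lookup. The data values and the column name 'Pres' are assumptions.
import pandas as pd
from datetime import timedelta

df_raw = pd.DataFrame({'Pres': [1.0, 10.0, 2.0]},
                      index=pd.DatetimeIndex(['2020-06-01 12:00', '2020-06-01 13:00',
                                              '2020-06-01 14:00']))
log = get_runs_parameters(df_raw,
                          times_min=pd.DatetimeIndex(['2020-06-01 12:00']),
                          times_max=pd.DatetimeIndex(['2020-06-01 14:00']),
                          cols_good_data='Pres',
                          dt_from_utc=timedelta(hours=2))
# log now has 'Date0'/'DateEnd' (local time) and 'Pres_st'/'Pres_en' values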
Example #6
def csv_to_h5_vaex(read_csv_args, to_hdf_args, dates_formats: Mapping[str, str],
                   correct_fun: Optional[Callable[[pd.DataFrame], None]] = None,
                   processing: Optional[Mapping[Tuple[Tuple[str], Tuple[str]], Callable[[Any], Any]]] = None,
                   out_cols: Optional[Sequence] = None, continue_row=False):
    """
    Read csv and write to hdf5
    :param read_csv_args: dict, must have keys:
        filepath_or_buffer, chunksize
    :param to_hdf_args:
        vaex_format: bool, how to write chunks:
            True: to many vaex hdf5 files which will be combined into a single vaex hdf5 file at the end
            False: appending to a single pandas hdf5 table
        path_or_buf: default = read_csv_args['filepath_or_buffer'].with_suffix('.vaex.h5' if vaex_format else '.h5')
        mode: default = 'w' if not continue_row else 'a',
        key: hdf5 group name in the hdf5 file where data is stored
        ...
    :param dates_formats:
        column: csv column name which needs to be converted from str to DateTime,
        date_format: date format
    :param processing: dict with
        keys: ((_input cols_), (_output cols_)) and
        values: function(_input cols_) that will be used to return _output cols_
    :param out_cols: default is all columns, excluding those that are in the inputs of `processing` but not in its outputs
    :param continue_row: csv row number (excluding header) to start from, shifting the index.
    If the output file exists and continue_row = True then continue converting starting from the row equal to the last index in it -
    useful to continue after the program was interrupted or the csv was appended to. If it does not exist then start from row 0, giving it index 0.
    If continue_row = integer then start from this row, giving it starting index = continue_row
    :param correct_fun: function applied to each chunk (a frame of column data of type str) immediately after it is read by read_csv()
    :return:
    """
    from astropy.io import ascii

    if not to_hdf_args.get('path_or_buf'):  # give default name to output file
        to_hdf_args['path_or_buf'] = Path(read_csv_args['filepath_or_buffer']).with_suffix('.vaex.h5')

    # prepare vaex/pandas storing
    open_for_pandas_to_hdf = None
    tmp_save_pattern, tmp_search_pattern = h5pandas_to_vaex_file_names(path_out_str=str(to_hdf_args['path_or_buf']))
    ichunk = None

    # find csv row to start
    msg_start = f'Converting in chunks of {read_csv_args["chunksize"]} rows.'
    if continue_row is True:  # isinstance(continue_same_csv, bool)
        try:
            hdf5_list = glob.glob(tmp_search_pattern)
            if len(hdf5_list):      # continue interrupted csv_to_h5()
                hdf5_list.sort()
                file_last = hdf5_list[-1]
                lf.info('Found {:d} temporary files, continue from index found in last file', len(hdf5_list))
                "table/columns/index"
            else:                   # add next csv data
                file_last = to_hdf_args['path_or_buf']
            with h5py.File(file_last, mode='r') as to_hdf_buf:
                continue_row = to_hdf_buf['table/columns/index/data'][-1]
        except (OSError) as e:
            msg_start += ' No output file.'
            continue_row = None
        except KeyError as e:
            msg_start += ' No data in output file.'
            continue_row = None
        else:
            msg_start += ' Starting from last csv row in output file:'
    elif continue_row:
        msg_start += ' Starting from specified csv data row:'
    if continue_row:
        lf.info('{:s} {:d}...', msg_start, continue_row)
        read_csv_args['skiprows'] = read_csv_args.get('skiprows', 0) + continue_row
    else:
        lf.info('{:s} Beginning from csv row 0, giving it index 0...', msg_start)

    dtypes = read_csv_args['dtype']

    # Set default output cols
    if out_cols is None and processing:
        cols_in_used = set()
        cols_out_used = set()
        for (c_in, c_out) in processing.keys():
            cols_in_used.update(c_in)
            cols_out_used.update(c_out)
        cols2del = cols_in_used.difference(cols_out_used)
        out_cols = [col for col in dtypes.keys() if col not in cols2del]
    cols_out_used = set(out_cols if out_cols is not None else dtypes.keys())

    # prepare conversion to user specified types
    str_cols = []
    int_and_nans_cols = []
    other_cols = []
    for col, typ in dtypes.items():
        if out_cols and col not in cols_out_used:
            continue
        kind = typ[0]
        (str_cols if kind == 'S' else
         int_and_nans_cols if kind == 'I' else
         other_cols).append(col)

    str_not_dates = list(set(str_cols).difference(dates_formats.keys()))
    min_itemsize = {col: int(dtypes[col][1:]) for col in str_not_dates}

    with open(read_csv_args['filepath_or_buffer'], 'r') as read_csv_buf, \
            FakeContextIfOpen(open_for_pandas_to_hdf, to_hdf_args['path_or_buf']) as to_hdf_buf:
        read_csv_args.update({
            'filepath_or_buffer': read_csv_buf,
            'memory_map': True,
            'dtype': 'string'  # switch off read_csv dtype conversion (because if it fails it is hard
            })                 # to re-read the same csv place with pandas to correct it)
        to_hdf_args.update({
            'path_or_buf': to_hdf_buf,
            'format': 'table',
            'data_columns': True,
            'append': True,
            'min_itemsize': min_itemsize
            })
        rows_processed = 0
        rows_in_chunk = read_csv_args['chunksize']

        # alternative to pd.read_csv(**read_csv_args) but without dataframes
        tbls = ascii.read(read_csv_buf, format='csv', guess=False, delimiter=read_csv_args['delimiter'],
                          data_start=read_csv_args['skiprows'], names=read_csv_args['names'],
                          fast_reader={'chunk_size': read_csv_args['chunksize'],
                                       'chunk_generator': True})
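        # (added sketch) the chunk generator above would then be consumed roughly like the
        # pandas loop in csv_to_h5() below, converting each astropy Table chunk to pandas:
        # for ichunk, tbl_chunk in enumerate(tbls):
        #     chunk = tbl_chunk.to_pandas()
        #     ...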
Example #7
def csv_to_h5(
        read_csv_args,
        to_hdf_args,
        dates_formats: Mapping[str, str],
        correct_fun: Optional[Callable[[pd.DataFrame], None]] = None,
        processing: Optional[Mapping[Tuple[Tuple[str], Tuple[str]], Callable[[Any], Any]]] = None,
        out_cols: Optional[Sequence] = None,
        continue_row=False,
        vaex_format: Optional[bool]=None
        ):
    """
    Read csv and write to hdf5
    :param read_csv_args: dict, must have keys:
        filepath_or_buffer, chunksize
    :param to_hdf_args:
        path_or_buf: default = read_csv_args['filepath_or_buffer'].with_suffix('.vaex.h5' if vaex_format else '.h5')
        mode: default = 'w' if not continue_row else 'a',
        key: hdf5 group name in the hdf5 file where data is stored
        ...
    :param dates_formats:
        column: csv column name which needs to be converted from str to DateTime,
        date_format: date format
    :param processing: dict with
        keys: ((_input cols_), (_output cols_)) and
        values: function(_input cols_) that will return _output cols_
    :param out_cols: default is all columns, excluding those that are in the inputs of `processing` but not in its outputs
    :param continue_row: csv row number (excluding header) to start from, shifting the index.
    If the output file exists and continue_row = True then continue converting starting from the row equal to the last index in it -
    useful to continue after the program was interrupted or the csv was appended to. If it does not exist then start from row 0, giving it index 0.
    If continue_row = integer then start from this row, giving it starting index = continue_row
    :param correct_fun: function applied to each chunk returned by read_csv(), which is a frame of column data of type str
    :param vaex_format: bool, how to write chunks:
    - True: to many vaex hdf5 files which will be combined into a single vaex hdf5 file at the end
    - False: appending to a single pandas hdf5 table
    - None: evaluates to True if to_hdf_args['path_or_buf'] has the next to last suffix ".vaex", else to False

    :return:
    """
    if to_hdf_args.get('path_or_buf'):
        if vaex_format is None:
            vaex_format = Path(str(to_hdf_args['path_or_buf']).strip()).suffixes[:-1] == ['.vaex']
    else:  # give default name to output file
        to_hdf_args['path_or_buf'] = Path(read_csv_args['filepath_or_buffer']).with_suffix(
            f'{".vaex" if vaex_format else ""}.h5'
            )
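    # (added note) e.g. Path('data.vaex.h5').suffixes[:-1] == ['.vaex'] -> vaex_format True,
    # while Path('data.h5').suffixes[:-1] == [] -> vaex_format False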

    # Deal with vaex/pandas storing difference
    if vaex_format:
        open_for_pandas_to_hdf = None
        tmp_save_pattern, tmp_search_pattern = h5pandas_to_vaex_file_names(
            path_out_str=str(to_hdf_args['path_or_buf'])
            )
        ichunk = None
    else:
        def open_for_pandas_to_hdf(path_or_buf):
            return pd.HDFStore(
                to_hdf_args['path_or_buf'],
                to_hdf_args.get('mode', 'a' if continue_row else 'w')
                )

    # Find csv row to start
    msg_start = f'Converting in chunks of {read_csv_args["chunksize"]} rows.'
    if continue_row is True:  # isinstance(continue_same_csv, bool)
        try:
            if vaex_format:

                hdf5_list = glob.glob(tmp_search_pattern)
                if len(hdf5_list):      # continue interrupted csv_to_h5()
                    hdf5_list.sort()
                    file_last = hdf5_list[-1]
                    lf.info('Found {:d} temporary files, continue from index found in last file', len(hdf5_list))
                    "table/columns/index"
                else:                   # add next csv data
                    file_last = to_hdf_args['path_or_buf']
                with h5py.File(file_last, mode='r') as to_hdf_buf:
                    continue_row = to_hdf_buf['table/columns/index/data'][-1] + 1
            else:
                with pd.HDFStore(to_hdf_args['path_or_buf'], mode='r') as to_hdf_buf:
                    continue_row = to_hdf_buf.select(to_hdf_args['key'], columns=[], start=-1).index[-1] + 1
        except (OSError) as e:
            msg_start += ' No output file.'
            continue_row = None
        except KeyError as e:
            msg_start += ' No data in output file.'
            continue_row = None
        else:
            msg_start += ' Starting from the row after the last loaded csv row:'
    elif continue_row:
        msg_start += ' Starting from specified csv data row:'
    if continue_row:
        lf.info('{:s} {:d}...', msg_start, continue_row)
        read_csv_args['skiprows'] = read_csv_args.get('skiprows', 0) + continue_row
    else:
        lf.info('{:s} Beginning from csv row 0, giving it index 0...', msg_start)

    dtypes = read_csv_args['dtype']

    # Set default output cols if not set
    if out_cols is None and processing:
        # output all columns we have, except processing inputs that are not also mentioned in processing outputs
        cols_in_used = set()
        cols_out_used = set()
        for (c_in, c_out) in processing.keys():
            cols_in_used.update(c_in)
            cols_out_used.update(c_out)
        cols2del = cols_in_used.difference(cols_out_used)
        out_cols = [col for col in dtypes.keys() if col not in cols2del]
    cols_out_used = set(out_cols if out_cols is not None else dtypes.keys())

    # Group cols for conversion by types specified
    str_cols = []
    int_and_nans_cols = []
    other_cols = []
    for col, typ in dtypes.items():
        if out_cols and col not in cols_out_used:
            continue
        kind = typ[0]
        (str_cols if kind == 'S' else
         int_and_nans_cols if kind == 'I' else
         other_cols).append(col)

    str_not_dates = list(set(str_cols).difference(dates_formats.keys()))
    min_itemsize = {col: int(dtypes[col][1:]) for col in str_not_dates}
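    # (added note) e.g. a declared dtype 'S10' for a non-date string column gives
    # min_itemsize == {col: 10}, fixing that column's width in the pandas 'table' format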

    # Read csv, process, write hdf5
    with open(read_csv_args['filepath_or_buffer'], 'r') as read_csv_buf, \
            FakeContextIfOpen(open_for_pandas_to_hdf, to_hdf_args['path_or_buf']) as to_hdf_buf:
        read_csv_args.update({
            'filepath_or_buffer': read_csv_buf,
            'memory_map': True,
            'dtype': 'string'  # switch off read_csv dtype conversion (because if it fails it is hard
            })                 # to re-read the same csv place with pandas to correct it)
        to_hdf_args.update({
            'path_or_buf': to_hdf_buf,
            'format': 'table',
            'data_columns': True,
            'append': True,
            'min_itemsize': min_itemsize
            })
        # rows_processed = 0
        # rows_in_chunk = read_csv_args['chunksize']

        for ichunk, chunk in enumerate(pd.read_csv(**read_csv_args)):
            if continue_row:
                if chunk.size == 0:
                    ichunk = np.ceil(continue_row / read_csv_args['chunksize']).astype(int) - 1
                    break  # continue_row is > data rows
                else:
                    chunk.index += continue_row

            lf.extra['id'] = f'chunk start row {chunk.index[0]:d}'
            if ichunk % 10 == 0:
                print(f'{ichunk}', end=' ')
            else:
                print('.', end='')

            if correct_fun:
                correct_fun(chunk)

            # Convert to user specified types

            # 1. dates str to DateTime
            for col, f in dates_formats.items():
                # the conversion of 'bytes' to 'strings' is needed for pd.to_datetime()
                try:
                    chunk[col] = pd.to_datetime(chunk[col], format=f)
                except ValueError as e:
                    lf.error(
                        'Conversion to datetime("{:s}" formatted as "{:s}") {:s} -> '
                        'Replacing malformed strings by NaT...', col, f, standard_error_info(e))
                    chunk[col] = pd.to_datetime(chunk[col], format=f, exact=False, errors='coerce')

            # 2. str to numeric for other_cols and int_and_nans_cols (the latter have only limited support as
            # pandas extension dtypes, so we use numpy types instead, replacing NaNs with -1 to be able to write to hdf5)
            chunk[other_cols] = chunk[other_cols].fillna('NaN')  # <NA> to a string that numpy recognizes with the same meaning
            chunk[int_and_nans_cols] = chunk[int_and_nans_cols].fillna('-1')
            for col in (int_and_nans_cols + other_cols):  # for col, typ in zip(nans.columns, chunk[nans.columns].dtypes):
                typ = dtypes[col]
                if col in int_and_nans_cols:
                    is_integer = True
                    typ = f'i{typ[1:]}'  # typ.numpy_dtype
                else:
                    is_integer = np.dtype(typ).kind == 'i'
                try:
                    chunk[col] = chunk[col].astype(typ)
                    continue
                except (ValueError, OverflowError) as e:
                    # Clean bad values. In the OverflowError case we do it here to prevent a ValueError
                    # while handling the OverflowError below.
                    pattern_match = r'^-?\d+$' if is_integer else r'^-?[\d.]+$'
                    ibad = ~chunk[col].str.match(pattern_match)
                    rep_val = '-1' if is_integer else 'NaN'
                    # ibad = np.flatnonzero(chunk[col] == re.search(r'(?:")(.*)(?:")', e.args[0]).group(1), 'ascii')
                    lf.error('Conversion {:s}("{:s}") {:s} -> replacing {:d} values not matching pattern "{:s}" '
                             'with "{:s}" and trying again...',
                             typ, col, standard_error_info(e), ibad.sum(), pattern_match, rep_val)
                    chunk.loc[ibad, col] = rep_val
                    # astype(str).replace(regex=True, to_replace=r'^.*[^\d.].*$', value=
                try:
                    chunk[col] = chunk[col].astype(typ)
                except (OverflowError, ValueError) as e:
                    # May be a bad value made of good symbols (e.g. r'^\d*\.\d*\.+\d*$'), but instead of
                    # checking for that we run coerce_to_exact_dtype() on ValueError here too
                    lf.error('Conversion {:s}("{:s}") {:s} -> Replacing malformed strings and big numbers'
                             ' by NaN ...', typ, col, standard_error_info(e))
                    chunk[col] = coerce_to_exact_dtype(chunk[col], dtype=typ)

            # Limit length of big strings and convert StringDtype to str so to_hdf() can save them
            for col, max_len in min_itemsize.items():
                chunk[col] = chunk[col].str.slice(stop=max_len)  # apply(lambda x: x[:max_len]) does not handle <NA>
            chunk[str_not_dates] = chunk[str_not_dates].astype(str)

            # Apply specified data processing
            if processing:
                for (cols_in, c_out), fun in processing.items():
                    cnv_result = fun(chunk[list(cols_in)])
                    chunk[list(c_out)] = cnv_result

            # # Bad rows check
            # is_different = chunk['wlaWID'].fillna('') != chunk['wlaAPIHartStandard'].fillna('')
            # if is_different.any():
            #     i_bad = np.flatnonzero(is_different.values)
            #     lf.debug('have wlaWID != wlaAPIHartStandard in rows {:s}', chunk.index[i_bad])
            #     # chunk= chunk.drop(chunk.index[i_bad])   # - deleting
            #     pass

            # Check unique index
            # if chunk['wlaWID'].duplicated()

            try:
                if vaex_format:
                    df = vaex.from_pandas(chunk if out_cols is None else chunk[out_cols])
                    df.export_hdf5(tmp_save_pattern.format(ichunk))
                else:  # better to move this step earlier and process with vaex instead of pandas
                    (chunk if out_cols is None else chunk[out_cols]).to_hdf(**to_hdf_args)
                # rows_processed += rows_in_chunk  # chunks have the same length except the last one, whose length is not used

            except Exception as e:
                lf.exception('write error')
                pass
        try:
            del lf.extra['id']
        except KeyError:
            lf.info('there were no more data rows to read')

    # If vaex storing was specified then we have per-chunk files (written by export_hdf5()) that we combine now:
    if vaex_format:
        h5pandas_to_vaex_combine(tmp_search_pattern, str(to_hdf_args['path_or_buf']), check_files_number=ichunk+1)
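
# Usage sketch (added): convert a csv to a single pandas hdf5 table in chunks.
# File name, column names, dtypes and the date format are assumptions for illustration.
csv_to_h5(
    read_csv_args=dict(filepath_or_buffer='data.csv', chunksize=100_000, delimiter=',',
                       header=0, names=['Time', 'Pres', 'Temp'],
                       dtype={'Time': 'S19', 'Pres': 'f8', 'Temp': 'f8'}),
    to_hdf_args=dict(key='ctd'),
    dates_formats={'Time': '%Y-%m-%d %H:%M:%S'},
    vaex_format=False)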