예제 #1
0
파일: gpx2h5.py 프로젝트: cycle13/h5toGrid
def df_filter_and_save_to_h5(df,
                             input,
                             out,
                             filter=None,
                             sort_time=None) -> Union[str, int]:
    """
    Correct the time index of ``df``, drop rejected rows and append the rest to the store.

    :param df: data frame to save. Its index is split by multiindex_timeindex(), the time part
      is corrected by time_corr() and put back with multiindex_replace()
    :param input: cfg dict passed to time_corr(); its field 'dt_from_utc' is forwarded to
      h5_append() as ``log_dt_from_utc``
    :param out: out cfg, must have field 'log' (dict). out['log']['rows'] is set here to the
      number of rows kept; ``out`` and ``out['log']`` are then passed to h5_append()
    :param filter: (optional) if truthy it is passed to filterGlobal_minmax() to get an
      additional mask of good rows
    :param sort_time: passed to time_corr()
    :return: 'continue' if no data else 0
    NOTE(review): original doc says that ``out`` gets field 'tables_have_wrote':
    Set[Tuple[str, str]] - presumably h5_append() adds it, it is not set here.
    """
    time_index, itm = multiindex_timeindex(df.index)
    # Original author's open question: will sorting break the multiindex?
    # (sorting is needed in tracks/segments only)
    time_index, b_ok = time_corr(time_index, input, sort_time)
    df.index = multiindex_replace(df.index, time_index, itm)

    if not filter:
        # Only rows with bad time are removed
        df = df[b_ok]
        out['log']['rows'] = len(df)
    else:
        n_before = len(df)
        b_in_range = filterGlobal_minmax(df, df.index, filter)
        df = df[b_in_range & b_ok]
        n_after = out['log']['rows'] = len(df)
        print('filtered out {} from {}.'.format(n_before - n_after, n_before))

    if df.empty:
        print('No data => skip file')
        return 'continue'

    # Time range for the log is determined by h5_append() from ``tim``
    h5_append(out,
              df,
              out['log'],
              log_dt_from_utc=input['dt_from_utc'],
              tim=time_index)
    return 0
예제 #2
0
def main(new_arg=None):
    """
    Load data from the input HDF5 store piece by piece as described by its log table(s),
    process and filter each piece and append the result to the output store (and optionally
    to csv files).

    Processing functions are selected by the name of the cfg file (cfg['in']['cfgFile']):
    - name ends with '_runs' (or first item of cfg['out']['tables_log'] contains 'logRuns'):
      only the log of runs is calculated (log_runs()) after filtering, main data table is not
      written;
    - name contains 'brown': load_coef() is called before the cycle and process_brown() before
      filtering;
    - else: coord_data_col_ensure() and add_ctd_params() are applied before filtering.

    argv[1] is cfgFile. It was used with cfg files:
        'csv2h5_nav_supervisor.ini'
        'csv2h5_IdrRedas.ini'
        'csv2h5_Idronaut.ini'

    :param new_arg: passed to cfg_from_args(). Original note: returns cfg if
     new_arg=='<cfg_from_args>' but it will be None if argument argv[1:] == '-h' or '-v'
     passed to this code
    :return: cfg if it can not be initialised or if cfg['program']['return'] is
     '<cfg_from_args>'; () if Ex_nothing_done was raised during initialisation; else None
    """

    global l
    cfg = cfg_from_args(my_argparser(), new_arg)
    if not cfg or not cfg['program'].get('return'):
        print('Can not initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')
    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in']['path'] = init_file_names(
            **{**cfg['in'], 'path': cfg['in']['db_path']},
            b_interact=cfg['program']['b_interact'])
        # '{}' will be filled by each table from cfg['in']['tables']
        set_field_if_no(cfg['in'], 'tables_log', '{}/logFiles')
        cfg['in']['query'] = query_time_range(**cfg['in'])
        set_field_if_no(cfg['out'], 'db_path', cfg['in']['db_path'])
    except Ex_nothing_done as e:
        print(e.message)
        return ()

    # Open text log. ``flog`` stays None if text log is not configured
    flog = None
    if 'log' in cfg['program']:
        dir_create_if_need(os_path.dirname(cfg['program']['log']))
        flog = open(cfg['program']['log'], 'a+', encoding='cp1251')

    cfg['out']['log'] = OrderedDict({'fileName': None, 'fileChangeTime': None})

    # Prepare saving to csv
    if 'file_names_add_fun' in cfg['out']:
        # NOTE(security): evaluates expression from configuration. Must not be used with
        # configuration that comes from untrusted source.
        file_names_add = eval(
            compile(cfg['out']['file_names_add_fun'], '', 'eval'))
    else:
        file_names_add = lambda i: '.csv'  # f'_{i}.csv'

    # Prepare data for output store and open it
    if cfg['out']['tables'] == ['None']:
        # will not write new data table and its log
        cfg['out']['tables'] = None

    h5init(cfg['in'], cfg['out'])

    cfg_fileN = os_path.splitext(cfg['in']['cfgFile'])[0]
    out_tables_log = cfg['out'].get('tables_log')
    if cfg_fileN.endswith('_runs') or (bool(out_tables_log)
                                       and 'logRuns' in out_tables_log[0]):

        # Will calculate only after filter
        # todo: calculate derived parameters before were they are bad (or replace all of them if any bad?)
        func_before_cycle = lambda x: None
        func_before_filter = lambda df, log_row, cfg: df
        func_after_filter = lambda df, cfg: log_runs(df, cfg, cfg['out']['log'])

        # this table will be added:
        cfg['out']['tables_log'] = [cfg['out']['tables'][0] + '/logRuns']
        cfg['out']['b_log_ready'] = True  # to not update time range in h5_append()

        # Settings to not affect main data table and switch off not compatible options:
        cfg['out']['tables'] = []
        # todo: If False check it: need delete all previous result of CTD_calc() or set
        #  min_time > its last log time. True not implemented?
        cfg['out']['b_skip_if_up_to_date'] = False
        cfg['program']['b_log_display'] = False  # can not display multiple rows log
        if 'b_save_images' in cfg['extract_runs']:
            cfg['extract_runs']['path_images'] = cfg['out'][
                'db_path'].with_name('_subproduct')
            dir_create_if_need(cfg['extract_runs']['path_images'])
    else:
        if 'brown' in cfg_fileN.lower():
            func_before_cycle = load_coef
            if 'Lat' in cfg['in']:
                func_before_filter = lambda *args, **kwargs: add_ctd_params(
                    process_brown(*args, **kwargs), kwargs['cfg'])
            else:
                func_before_filter = process_brown
        else:
            func_before_cycle = lambda x: None

            def ctd_coord_and_params(df: pd.DataFrame, log_row, cfg):
                coord_data_col_ensure(df, log_row)
                return add_ctd_params(df, cfg)

            func_before_filter = ctd_coord_and_params
        func_after_filter = lambda df, cfg: df  # nothing after filter

    func_before_cycle(cfg)  # prepare: usually assign data to cfg['for']
    if cfg['out'].get('path_csv'):
        dir_create_if_need(cfg['out']['path_csv'])

    # Load data. Main cycle #########################################
    # Open input store and cycle through input table log records
    qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
    iSt = 1  # number of the first log row (numbering restarts for each table)

    dfLogOld, cfg['out']['db'], cfg['out'][
        'b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    b_out_db_is_different = cfg['out']['db'] is not None and cfg['out'][
        'db_path_temp'] != cfg['in']['db_path']
    # Cycle for each table, for each row in log:
    with FakeContextIfOpen(
            lambda f: pd.HDFStore(f, mode='r'), cfg['in']['db_path'],
            None if b_out_db_is_different else cfg['out']['db']
    ) as cfg['in']['db']:  # not opens ['in']['db'] if already opened to write

        for tbl in cfg['in']['tables']:
            print(tbl, end='. ')

            df_log = cfg['in']['db'].select(
                cfg['in']['tables_log'].format(tbl) or tbl,
                where=cfg['in']['query'])
            if flog is not None:
                nRows = df_log.rows.size
                flog.write(
                    datetime.now().strftime(
                        '\n\n%d.%m.%Y %H:%M:%S> processed ')
                    + f'{nRows} row' + ('s:' if nRows > 1 else ':'))

            for ifile, r in enumerate(df_log.itertuples(), start=iSt):
                print('.', end='')
                sys_stdout.flush()

                path_raw = PurePath(r.fileName)
                cfg['out']['log'].update(fileName=path_raw.name,
                                         fileChangeTime=r.fileChangeTime)
                # save current state, for example to can extract date in subprogram
                cfg['in']['file_stem'] = cfg['out']['log']['fileName']
                cfg['in']['fileChangeTime'] = cfg['out']['log'][
                    'fileChangeTime']

                if cfg['in']['b_skip_if_up_to_date']:
                    have_older_data, have_duplicates = h5del_obsolete(
                        cfg['out'], cfg['out']['log'], dfLogOld)
                    if have_older_data:
                        continue
                    if have_duplicates:
                        cfg['out']['b_remove_duplicates'] = True
                print('{}. {}'.format(ifile, path_raw.name), end=': ')

                # Load data
                qstr = qstr_trange_pattern.format(r.Index, r.DateEnd)
                df_raw = cfg['in']['db'].select(tbl, qstr)
                cols = df_raw.columns.tolist()

                # cfg['in']['lat'] and ['lon'] may be need in add_ctd_params() if Lat not in df_raw
                if 'Lat_en' in df_log.columns and 'Lat' not in cols:
                    cfg['in']['lat'] = np.nanmean((r.Lat_st, r.Lat_en))
                    cfg['in']['lon'] = np.nanmean((r.Lon_st, r.Lon_en))

                df = func_before_filter(df_raw, log_row=r, cfg=cfg)

                if df.size:  # size is zero means save only log but not data
                    # filter, updates cfg['out']['log']['rows']
                    df, _ = set_filterGlobal_minmax(
                        df, cfg['filter'], cfg['out']['log'])
                if 'rows' not in cfg['out']['log']:
                    l.warning('no data!')
                    continue
                elif isinstance(cfg['out']['log']['rows'], int):
                    print('filtered out {rows_filtered}, remains {rows}'.
                          format_map(cfg['out']['log']))
                    if cfg['out']['log']['rows']:
                        print('.', end='')
                    else:
                        l.warning('no data!')
                        continue

                df = func_after_filter(df, cfg=cfg)

                # Append to Store
                h5_append(cfg['out'],
                          df,
                          cfg['out']['log'],
                          log_dt_from_utc=cfg['in']['dt_from_utc'])

                # Copy to csv
                if cfg['out'].get('path_csv'):
                    fname = '{:%y%m%d_%H%M}-{:%d_%H%M}'.format(
                        r.Index, r.DateEnd) + file_names_add(ifile)
                    if 'data_columns' not in cfg['out']:
                        cfg['out']['data_columns'] = slice(0, -1)  # all cols
                    df.to_csv(  # [cfg['out']['data_columns']]
                        cfg['out']['path_csv'] / fname,
                        date_format=cfg['out']['text_date_format'],
                        float_format='%5.6g',
                        index_label='Time'
                    )  # to_string, line_terminator='\r\n'

                # Log to screen (if not prohibited explicitly)
                if (cfg['out']['log'].get('Date0') is not None
                        and cfg['program'].get('b_log_display', True)):
                    str_log = '{fileName}:\t{Date0:%d.%m.%Y %H:%M:%S}-' \
                              '{DateEnd:%d. %H:%M:%S%z}\t{rows}rows'.format_map(
                        cfg['out']['log'])  # \t{Lat}\t{Lon}\t{strOldVal}->\t{mag}
                    l.info(str_log)
                else:
                    str_log = str(cfg['out']['log'].get('rows', '0'))
                # Log to logfile
                if flog is not None:
                    flog.write('\n' + str_log)

    if b_out_db_is_different:
        try:
            if cfg['out']['tables'] is not None:
                print('')
                if cfg['out']['b_remove_duplicates']:
                    h5remove_duplicates(cfg['out'],
                                        cfg_table_keys=('tables',
                                                        'tables_log'))
                # Create full indexes. Must be done because of using ptprepack in h5move_tables() below
                l.debug('Create index')
                for tblName in (cfg['out']['tables'] +
                                cfg['out']['tables_log']):
                    try:
                        cfg['out']['db'].create_table_index(tblName,
                                                            columns=['index'],
                                                            kind='full')
                    except Exception as e:
                        # Whole message is formatted here: extra positional arguments
                        # without placeholders in the message break the logging call
                        l.warning(
                            ': table {}. Index not created - error\n==> {}'.format(
                                tblName, '\n==> '.join(
                                    [s for s in e.args if isinstance(s, str)])))
        except Exception as e:
            l.exception('The end. There are error ')

            # Debugging aid: open interactive console in the frame where error occurred
            import traceback, code
            from sys import exc_info as sys_exc_info
            tb = sys_exc_info()[2]  # type, value,
            traceback.print_exc()
            last_frame = lambda tb=tb: last_frame(tb.tb_next
                                                  ) if tb.tb_next else tb
            frame = last_frame().tb_frame
            ns = dict(frame.f_globals)
            ns.update(frame.f_locals)
            code.interact(local=ns)
        finally:

            cfg['out']['db'].close()
            if flog is not None:
                flog.close()
            if cfg['out']['db'].is_open:
                print('Wait store is closing...')
                sleep(2)

            failed_storages = h5move_tables(cfg['out'])
            print('Finishing...' if failed_storages else 'Ok.', end=' ')
            h5index_sort(
                cfg['out'],
                out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
                in_storages=failed_storages)
    elif flog is not None:
        # Text log was closed only together with the separate output store before
        flog.close()
예제 #3
0
def main(config: ConfigType) -> None:  #
    """
    Load one column of HDF5 table, convert it with polynomial coefficients stored near the
    table, subtract battery dependent correction, remove periodic spikes burst by burst and
    save remaining rows to the output HDF5 store.

    :param config: mapping with fields used here:
      - 'col_out': name of the calculated column,
      - 'cols_order': if not empty then only these columns are kept (in this order), else
        the source column is dropped,
      - 'db_path' (optional): output store path, default is input store name with
        '_filt_s.h5' instead of extension
    NOTE(review): ``cfg_in`` is not defined in this function - presumably it is a module level
    input configuration (fields 'db_path', 'table', 'min_date', 'max_date', 'col', 'b_show'
    are read here); confirm against the rest of the file.
    """
    with pd.HDFStore(cfg_in['db_path'], mode='r') as h5_in:
        df = h5_in[cfg_in['table']][cfg_in['min_date']:cfg_in['max_date']]
        poly_coef = h5_in.get_node(
            f"{cfg_in['table']}/coef")[cfg_in['col']].read()

    n_rows_before = len(df)
    lf.info('Loaded data {0[0]} - {0[1]}: {1} rows. Filtering {2[col_out]}...',
            df.index[[0, -1]], n_rows_before, config)
    p_name = config['col_out']
    df[p_name] = np.polyval(poly_coef, df[cfg_in['col']])

    # Battery compensation
    battery_coef = [1.7314032932363, -11.9301097967443]
    df[p_name] -= np.polyval(battery_coef, df['Battery'])
    # Values of p_name below MIN_P are removed only in the cycle below: to not delete spikes
    # that may be used to find other spikes using ~constant period
    MIN_P = 6

    if not config['cols_order']:
        df.drop(cfg_in['col'], axis='columns', inplace=True)
    else:
        df = df.loc[:, config['cols_order']]

    i_burst, mean_burst_size, max_hole = i_bursts_starts(df.index)
    i_col = df.columns.get_loc(p_name)

    # Axes to display filtering (only if requested)
    ax = None
    if cfg_in['b_show']:
        from matplotlib import pyplot as plt
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.grid(True, alpha=.85, color='white', axis='y', linestyle='-')
        fig.subplots_adjust(top=.89)

        fig.show()

    # Output configuration
    if 'db_path' in config:
        out_db_path = Path(config['db_path'])
    else:
        out_db_path = cfg_in['db_path'].with_name(
            f"{cfg_in['db_path'].stem}_filt_s.h5")
    cfg_out = {
        'table': cfg_in['table'],
        'table_log': f"{cfg_in['table']}/logFiles",
        'log': {},
        'db_path': out_db_path,
    }

    def h5_names_gen(cfg_in, cfg_out: Mapping[str, Any],
                     **kwargs) -> Iterator[None]:
        """Yield once, having set log's 'fileChangeTime' to modification time of input db."""
        mtime = cfg_in['db_path'].stat().st_mtime
        cfg_out['log']['fileChangeTime'] = datetime.fromtimestamp(mtime)
        yield None

    # cfg_in is for full path if cfg_out['db_path'] is only name
    h5init(cfg_in, cfg_out)
    n_rows_after = 0
    # h5_dispenser_and_names_gen() handles temporary db for h5_append()
    for _i_name, _name in h5_dispenser_and_names_gen(cfg_in, cfg_out,
                                                      h5_names_gen):
        try:
            b_removed = h5remove_table(cfg_out['db'], cfg_in['table'])
            if b_removed:
                lf.info('previous table removed')
        except Exception:  # no such table?
            pass

        for i_start, i_end in pairwise(i_burst):
            cfg_out['log']['fileName'] = str(i_start)
            ind_ok = filter_periodic_spike(df.iloc[i_start:i_end, i_col],
                                           ax=ax)

            # Filtering
            bad_p = df.loc[ind_ok, p_name] < MIN_P
            n_bad = bad_p.sum()
            if n_bad:
                lf.info('filtering {} > {}: deleting {} values in frame {}',
                        p_name, MIN_P, n_bad,
                        pattern_log_dt.format(*df.index[[0, -1]]))
                ind_ok = ind_ok[~bad_p]
                if not ind_ok.size:
                    continue

            # save result
            h5_append(cfg_out, df.loc[ind_ok], cfg_out['log'])
            n_rows_after += ind_ok.size

    # Temporary db to compressed db with pandas index.
    # Check is needed because ``ptprepack`` in h5index_sort() not closes hdf5 source if it
    # not finds data
    if n_rows_after:
        failed_storages = h5move_tables(cfg_out)
        print('Ok.', end=' ')
        resorted_name = f"{cfg_out['db_path'].stem}-resorted.h5"
        h5index_sort(cfg_out,
                     out_storage_name=resorted_name,
                     in_storages=failed_storages)

    lf.info(
        f'Removed {n_rows_before - n_rows_after} rows. Saved {n_rows_after} rows to {cfg_out["db_path"]}...'
    )
예제 #4
0
def main(new_arg=None, **kwargs):
    """
    Load framed binary files (readBinFramed()), assign time to frames, filter data and
    append it as int32 columns to the output HDF5 store.

    Start time of each file is taken from the file modification time (if
    cfg['in']['b_time_fromtimestamp'] or cfg['in']['b_time_fromtimestamp_source']) or parsed
    from the file name with cfg['in']['filename2timestart_format']; time of frames is
    calculated from the frame index and cfg['in']['fs'].

    :param new_arg: list of strings, command line arguments
    :kwargs: dicts for each section: to overwrite values in them (overwrites even high priority values, other values remains)
    Note: if new_arg=='<cfg_from_args>' returns cfg but it will be None if argument
     argv[1:] == '-h' or '-v' passed to this code
    argv[1] is cfgFile. It was used with cfg files:
        'csv2h5_nav_supervisor.ini'
        'csv2h5_IdrRedas.ini'
        'csv2h5_Idronaut.ini'

    :return: cfg if it can not be initialised or if cfg['program']['return'] is
     '<cfg_from_args>'; () if Ex_nothing_done was raised by init_file_names(); else None
    """
    global l

    cfg = cfg_from_args(my_argparser(), new_arg, **kwargs)
    if not cfg or not cfg['program'].get('return'):
        print('Can not initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')
    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in']['path'] = init_file_names(
            **cfg['in'], b_interact=cfg['program']['b_interact'])
    except Ex_nothing_done as e:
        print(e.message)
        return ()

    # Old format (loaded by readFF00FF()) is switched off. It was used with settings:
    # 'header': 'TERM', 'dt_from_utc': timedelta(hours=-1), 'fs': 1,
    # 'b_time_fromtimestamp': True, 'b_time_fromtimestamp_source': False
    bOld_FF00FF = False

    # Default dtype: unsigned integer of size determined by cfg['in']['data_word_len']
    set_field_if_no(
        cfg['in'], 'dtype', 'uint{:d}'.format(2**(3 + np.searchsorted(
            2**np.array([3, 4, 5, 6, 7]) > np.array(
                8 * (cfg['in']['data_word_len'] - 1)), 1))))

    # Prepare specific format loading and writing
    set_field_if_no(cfg['in'], 'coltime', [])
    cfg['in'] = init_input_cols(cfg['in'])
    cfg['out']['names'] = np.array(
        cfg['in']['dtype'].names)[cfg['in']['cols_loaded_save_b']]
    cfg['out']['formats'] = [
        cfg['in']['dtype'].fields[n][0] for n in cfg['out']['names']
    ]
    cfg['out']['dtype'] = np.dtype({
        'formats': cfg['out']['formats'],
        'names': cfg['out']['names']
    })
    h5init(cfg['in'], cfg['out'])

    # log table of loaded files. columns: Start time, file name, and its index in array off all loaded data:
    # fields will have: 'fileName': None, 'fileChangeTime': None, 'rows': 0
    log_item = cfg['out']['log'] = {}

    strLog = ''

    dfLogOld, cfg['out']['db'], cfg['out'][
        'b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    if 'log' in cfg['program']:
        n_files = cfg['in']['nfiles']
        # File is closed right after writing the header: it is not used below.
        with open(PurePath(sys_argv[0]).parent / cfg['program']['log'],
                  'a',
                  encoding='cp1251') as f:
            # Plural suffix is selected in parentheses: else conditional expression replaces
            # the whole header (including time) with ':' when there is only one file
            f.write(
                datetime.now().strftime('\n\n%d.%m.%Y %H:%M:%S> processed ')
                + f'{n_files} file' + ('s:' if n_files > 1 else ':'))
    b_remove_duplicates = False  # normally no duplicates but will if detect
    # Config specially for readBinFramed
    set_field_if_no(cfg['in'], 'b_byte_order_is_big_endian', True)
    set_field_if_no(cfg['in'], 'b_baklan', False)
    set_field_if_no(cfg['in'], 'b_time_fromtimestamp_source', False)
    cfg['out']['fs'] = cfg['in']['fs']

    ## Main cycle ############################################################
    for i1_file, path_in in h5_dispenser_and_names_gen(cfg['in'], cfg['out']):
        l.info('{}. {}: '.format(i1_file, path_in.name))

        # Loading data
        if bOld_FF00FF:
            V = readFF00FF(path_in, cfg)
            iFrame = np.arange(len(V))
        else:
            V, iFrame = readBinFramed(path_in, cfg['in'])

        # Start time of the file
        if (cfg['in'].get('b_time_fromtimestamp')
                or cfg['in'].get('b_time_fromtimestamp_source')):
            # NOTE(review): hard-coded directory of source files with original timestamps
            path_in_rec = os_path.join(
                'd:\\workData\\_source\\BalticSea\\151021_T1Grunt_Pregol\\_source\\not_corrected',
                os_path.basename(path_in)[:-3] + 'txt'
            ) if cfg['in']['b_time_fromtimestamp_source'] else path_in
            log_item['Date0'] = datetime.fromtimestamp(
                os_path.getmtime(path_in_rec))  # getctime is bad
            # use for computer filestamp at end of recording
            log_item['Date0'] -= iFrame[-1] * timedelta(
                seconds=1 / cfg['in']['fs'])
        else:
            log_item['Date0'] = datetime.strptime(
                path_in.stem, cfg['in']['filename2timestart_format'])
        log_item['Date0'] += cfg['in']['dt_from_utc']
        tim = log_item['Date0'] + iFrame * timedelta(
            seconds=1 / cfg['in']['fs'])
        df = pd.DataFrame(
            V.view(dtype=cfg['out']['dtype']),  # np.uint16
            columns=cfg['out']['names'],
            index=tim)
        if df.empty:  # log['rows']==0
            print('No data => skip file')
            continue

        df, tim = set_filterGlobal_minmax(df,
                                          cfg_filter=cfg['filter'],
                                          log=log_item,
                                          dict_to_save_last_time=cfg['in'])
        if log_item['rows_filtered']:
            print('filtered out {}, remains {}'.format(
                log_item['rows_filtered'], log_item['rows']))
        if not log_item['rows']:
            l.warning('no data! => skip file')
            continue
        print('.', end='')

        # Append to Store
        h5_append(cfg['out'], df.astype('int32'), log_item)

        if 'txt' in cfg['program']:  # can be saved as text too
            np.savetxt(cfg['program']['txt'],
                       V,
                       delimiter='\t',
                       newline='\n',
                       header=cfg['in']['header'] + log_item['fileName'],
                       fmt='%d',
                       comments='')

    try:
        if b_remove_duplicates:
            for tblName in (cfg['out']['table'] +
                            cfg['out']['tableLog_names']):
                cfg['out']['db'][tblName].drop_duplicates(
                    keep='last', inplace=True)  # subset='fileName',?
        if len(strLog):
            print('Create index', end=', ')
            for tblName in (cfg['out']['table'] +
                            cfg['out']['tableLog_names']):
                cfg['out']['db'].create_table_index(tblName,
                                                    columns=['index'],
                                                    kind='full')
        else:
            print('done nothing')
    except Exception as e:
        l.exception('The end. There are error ')

        # Debugging aid: open interactive console in the frame where error occurred
        import traceback, code
        from sys import exc_info as sys_exc_info

        tb = sys_exc_info()[2]  # type, value,
        traceback.print_exc()
        last_frame = lambda tb=tb: last_frame(tb.tb_next) if tb.tb_next else tb
        frame = last_frame().tb_frame
        ns = dict(frame.f_globals)
        ns.update(frame.f_locals)
        code.interact(local=ns)
    # sort index if have any processed data (needed because ``ptprepack`` not closes hdf5 source if it not finds data)
    if cfg['in'].get('time_last'):
        failed_storages = h5move_tables(cfg['out'])
        print('Ok.', end=' ')
        h5index_sort(
            cfg['out'],
            out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
            in_storages=failed_storages)
예제 #5
0
# Save ``df`` to the output store through temporary store, then move tables to the
# destination and sort index.
change_db_path(cfg['out'])
log = {}
try:  # set chunks to mean data interval between holes
    cfg['out']['chunksize'] = int(mean_burst_size)  # np.median(np.diff(i_burst[:-1]))
except ValueError:  # some default value if no holes
    cfg['out']['chunksize'] = 100000

h5init(cfg['in'], cfg['out'])  # cfg['in'] = {}
# Defined before ``try`` so that failure of h5temp_open() is not masked by NameError in
# ``finally`` and ``b_appended`` is always set
store = None
b_appended = False
try:
    cfg['out']['b_skip_if_up_to_date'] = False  # not copy prev data: True not implemented
    dfLogOld, store, cfg['out']['b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    # Append to Store
    if df.empty:  # log['rows']==0
        # NOTE(review): message says "skip" but h5_append() is still called below -
        # confirm whether appending of empty data (log only?) is intended
        print('No data => skip file')
    h5_append(cfg['out'], df, log)
    b_appended = True
except Exception as e:
    # Best effort: result is not moved to destination below, but error must be visible
    print(f'Data not saved: {e!r}')
finally:
    if store is not None:
        store.close()

if b_appended:
    if store.is_open:
        print('Wait store is closing...')
    failed_storages = h5move_tables(cfg['out'])
    print('Ok.', end=' ')
    h5index_sort(cfg['out'], out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
                 in_storages=failed_storages)