Example #1
def load_coef(cfg):
    set_field_if_no(cfg, 'for', {})
    cfg['for']['k_names'], cfg['for']['kk'] = \
        np.loadtxt(cfg['in']['path_coef'],
                   dtype=[('name', 'S10'), ('k', '4<f4')],
                   skiprows=1, unpack=True)  # file columns: k0, k1*x, k2*x^2, k3*x^3, CKO (standard deviation)
    cfg['for']['kk'] = np.fliplr(cfg['for']['kk'])
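All of these examples rely on the helper `set_field_if_no`. Judging from its usage here (the default value argument is optional and missing keys are filled in before use), it behaves like `dict.setdefault`; a minimal sketch of such a helper, assuming that semantics, could be:

def set_field_if_no(dictlike, dictfield, value=None):
    """Set dictlike[dictfield] = value only if the field is missing or None (assumed behaviour)."""
    if dictlike.get(dictfield) is None:
        dictlike[dictfield] = value

In Example #1 the `np.fliplr` call presumably reorders the loaded polynomial coefficients from k0..k3 to highest-degree-first, the order expected by `np.polyval`.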
Example #2
def h5_names_gen(cfg_in, cfg_out: Mapping[str, Any],
                 **kwargs) -> Iterator[Path]:
    """
    Yield table names from cfg_out['tables'], filling cfg_out['log'] fields
    'fileName' and 'fileChangeTime' from cfg_in['time_interval'] before each yield
    """
    set_field_if_no(cfg_out, 'log', {})
    for tbl in cfg_out['tables']:
        cfg_out['log']['fileName'], cfg_out['log']['fileChangeTime'] = cfg_in[
            'time_interval']
        try:
            yield tbl  # Traceback error line pointing here is wrong
        except GeneratorExit:
            print('Something wrong?')
            return
Example #3
def parse_csv(filename, cfg_in):
    """
    Guess csv structure

    :param filename:
    :param cfg_in:
    :param known_structure: list of format strings in column order, starting from the
    first column; may cover only the leading columns (the rest are auto detected)
    :return: lst_types, offset, headers 


    * quotechar - specifies a one-character string to use as the
        quoting character.  It defaults to '"'.
    * delimiter - specifies a one-character string to use as the
        field separator.  It defaults to ','.
    * skipinitialspace - specifies how to interpret whitespace which
        immediately follows a delimiter.  It defaults to False, which
        means that whitespace immediately following a delimiter is part
        of the following field.
    * lineterminator -  specifies the character sequence which should
        terminate rows.
    * quoting - controls when quotes should be generated by the writer.
        It can take on any of the following module constants:

        csv.QUOTE_MINIMAL means only when required, for example, when a
            field contains either the quotechar or the delimiter
        csv.QUOTE_ALL means that quotes are always placed around fields.
        csv.QUOTE_NONNUMERIC means that quotes are always placed around
            fields which do not parse as integers or floating point
            numbers.
        csv.QUOTE_NONE means that quotes are never placed around fields.
    * escapechar - specifies a one-character string used to escape
        the delimiter when quoting is set to QUOTE_NONE.
    * doublequote - controls the handling of quotes inside fields.  When
        True, two consecutive quotes are interpreted as one during read,
        and when writing, each quote character embedded in the data is
        written as two quotes
    Example:
    parse_csv(filename, ['%H:%M:%S'])
    """
    set_field_if_no(cfg_in, 'types', [])
    set_field_if_no(cfg_in, 'delimiter')
    with open(filename, 'rb') as fh:
        ext = os_path.splitext(filename)[1]
        # Load a file object:
        try:
            # If you are sure that file is csv use CSVTableSet(fh)
            from magic import MagicException  # because any_tableset uses libmagic
            table_set = any_tableset(fh,
                                     mimetype=None,
                                     extension=ext,
                                     delimiter=cfg_in['delimiter'])
        except (ImportError, MagicException) as e:
            print('There is an error ', standard_error_info(e),
                  '\n=> Loading file as csv without trying other formats')
            table_set = CSVTableSet(fh, delimiter=cfg_in['delimiter'])

        # A table set is a collection of tables:
        row_set = table_set.tables[0]
        # A row set is an iterator over the table, but it can only
        # be run once. To peek, a sample is provided:

        # guess header names and the offset of the header:
        offset, headers = headers_guess(row_set.sample)  # tolerance=1
        row_set.register_processor(headers_processor(headers))
        # add one to begin with content, not the header:
        row_set.register_processor(offset_processor(offset + 1))
        # guess column types:
        lst_types = type_guess(row_set.sample, strict=True)
        row_sample = next(row_set.sample)

        # check not detected types
        def formats2types(formats_str):
            for f in formats_str:
                if f:
                    if is_date_format(f):
                        yield (types.DateType(f))
                    else:
                        yield (TimeType())
                else:
                    yield (None)

        known_types = formats2types(cfg_in['types'])

        for n, (t, s, kt) in enumerate(zip(lst_types, row_sample,
                                           known_types)):
            if t.result_type == types.StringType.result_type:
                # not auto detected? -> check known_types
                if kt is not None and kt.test(s.value):
                    lst_types[n] = kt  # t= kt
                else:  # provided known type does not fit the element
                    print(
                        'column #{:d} value "{}" does not match the provided '
                        'type {}'.format(n, s.value, type(kt)))
                    # kt = types.DateType('mm/dd/yyyy')
                    # kt.test('0'+s.value)
                    # detect?
            else:
                pass
        # does not work for the time type:
        # print(jts.headers_and_typed_as_jts(headers,
        #       list(map(jts.celltype_as_string, lst_types))).as_json())
        return lst_types, offset, headers
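The dialect options documented in the docstring above are the standard ones of Python's csv module. For reference, a tiny self-contained illustration of passing them to csv.reader (the sample data is arbitrary):

import csv
import io

sample = io.StringIO('t;"a;b";1\n')
reader = csv.reader(sample, delimiter=';', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL, skipinitialspace=False)
print(next(reader))  # -> ['t', 'a;b', '1']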
Example #4
            # Check obtained format is the same
            if cfg['in']['types'] != lst_types_cur:
                if not len(cfg['in']['types']):
                    cfg['in']['types'] = lst_types_cur
                else:
                    print('file {} has another format!'.format(nameFE))

            if cfg['in']['skiprows'] != skiprows_cur:
                if cfg['in']['skiprows'] is None:
                    cfg['in']['skiprows'] = skiprows_cur
                else:
                    print('file {} has another format!'.format(nameFE))

            # Convert to numpy types
            set_field_if_no(cfg['in'], 'max_text_width',
                            2000)  # big width for 2D blocks
            types2numpy = {
                t.result_type: np_type
                for t, np_type in zip(types.TYPES, '')
            }

            #     {
            #     types.StringType.result_type: '|S{:.0f}'.format(cfg['in']['max_text_width'])
            # }
            cfg['in']['dtype'] = np.array([np.float64] *
                                          len(cfg['in']['types']))
            for k, typ in enumerate(cfg['in']['types']):
                if typ.result_type == types.StringType.result_type:
                    pass
                cfg['in']['dtype'][k] = types2numpy[typ.result_type]
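Note that `zip(types.TYPES, '')` above iterates over an empty string, so `types2numpy` ends up empty and the lookup fails; the commented-out dict hints at the intent. A hedged sketch of what such a mapping could look like (the numpy dtypes chosen per messytables type are an assumption, not taken from the source; `cfg` is the dict from the snippet above):

import messytables.types as types
import numpy as np

# default every guessed type to float64, then override the special cases
types2numpy = {typ.result_type: np.float64 for typ in types.TYPES}
types2numpy[types.StringType.result_type] = '|S{:.0f}'.format(
    cfg['in']['max_text_width'])  # fixed-width bytes for strings
types2numpy[types.DateType.result_type] = 'M8[s]'  # datetime64 with second resolution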
Example #5
def main(new_arg=None):
    """

    :param new_arg: command line arguments; if new_arg == '<cfg_from_args>' the function
     returns cfg, but cfg will be None if '-h' or '-v' is passed in argv[1:]
    argv[1] is cfgFile. It was used with cfg files:
        'csv2h5_nav_supervisor.ini'
        'csv2h5_IdrRedas.ini'
        'csv2h5_Idronaut.ini'
    :return:
    """

    global l
    cfg = cfg_from_args(my_argparser(), new_arg)
    if not cfg or not cfg['program'].get('return'):
        print('Can not initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')
    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in'][
            'path'] = init_file_names(**{
                **cfg['in'], 'path': cfg['in']['db_path']
            },
                                      b_interact=cfg['program']['b_interact'])
        set_field_if_no(
            cfg['in'], 'tables_log', '{}/logFiles'
        )  # will be filled by each table from cfg['in']['tables']
        cfg['in']['query'] = query_time_range(**cfg['in'])
        set_field_if_no(cfg['out'], 'db_path', cfg['in']['db_path'])
        # cfg['out'] = init_file_names(cfg['out'], , path_field='db_path')
    except Ex_nothing_done as e:
        print(e.message)
        return ()

    # args = parser.parse_args()
    # args.verbose= args.verbose[0]
    # try:
    #     cfg= ini2dict(args.cfgFile)
    #     cfg['in']['cfgFile']= args.cfgFile
    # except IOError as e:
    #     print('\n==> '.join([a for a in e.args if isinstance(a,str)])) #e.message
    #     raise(e)
    # Open text log
    if 'log' in cfg['program'].keys():
        dir_create_if_need(os_path.dirname(cfg['program']['log']))
        flog = open(cfg['program']['log'], 'a+', encoding='cp1251')

    cfg['out']['log'] = OrderedDict({'fileName': None, 'fileChangeTime': None})

    # Prepare saving to csv
    if 'file_names_add_fun' in cfg['out']:
        file_names_add = eval(
            compile(cfg['out']['file_names_add_fun'], '', 'eval'))
    else:
        file_names_add = lambda i: '.csv'  # f'_{i}.csv'

    # Prepare data for output store and open it
    if cfg['out']['tables'] == ['None']:
        # will not write new data table and its log
        cfg['out']['tables'] = None
        # cfg['out']['tables_log'] = None  # for _runs cfg will be redefined (this is the only None case that makes sense?)

    h5init(cfg['in'], cfg['out'])
    # store, dfLogOld = h5temp_open(**cfg['out'])

    cfg_fileN = os_path.splitext(cfg['in']['cfgFile'])[0]
    out_tables_log = cfg['out'].get('tables_log')
    if cfg_fileN.endswith('_runs') or (bool(out_tables_log)
                                       and 'logRuns' in out_tables_log[0]):

        # Will calculate only after filtering  # todo: calculate derived parameters before filtering where they are bad (or replace all of them if any are bad?)
        func_before_cycle = lambda x: None
        func_before_filter = lambda df, log_row, cfg: df
        func_after_filter = lambda df, cfg: log_runs(df, cfg, cfg['out']['log']
                                                     )

        # this table will be added:
        cfg['out']['tables_log'] = [cfg['out']['tables'][0] + '/logRuns']
        cfg['out'][
            'b_log_ready'] = True  # to not update time range in h5_append()

        # Settings to not affect main data table and switch off not compatible options:
        cfg['out']['tables'] = []
        cfg['out'][
            'b_skip_if_up_to_date'] = False  # todo: if False, check it: need to delete all previous results of CTD_calc() or set min_time > its last log time. Is True not implemented?
        cfg['program'][
            'b_log_display'] = False  # cannot display a multi-row log
        if 'b_save_images' in cfg['extract_runs']:
            cfg['extract_runs']['path_images'] = cfg['out'][
                'db_path'].with_name('_subproduct')
            dir_create_if_need(cfg['extract_runs']['path_images'])
    else:
        if 'brown' in cfg_fileN.lower():
            func_before_cycle = load_coef
            if 'Lat' in cfg['in']:
                func_before_filter = lambda *args, **kwargs: add_ctd_params(
                    process_brown(*args, **kwargs), kwargs['cfg'])
            else:
                func_before_filter = process_brown
        else:
            func_before_cycle = lambda x: None

            def ctd_coord_and_params(df: pd.DataFrame, log_row, cfg):
                coord_data_col_ensure(df, log_row)
                return add_ctd_params(df, cfg)

            func_before_filter = ctd_coord_and_params
        func_after_filter = lambda df, cfg: df  # nothing after filter

    func_before_cycle(cfg)  # prepare: usually assign data to cfg['for']
    if cfg['out'].get('path_csv'):
        dir_create_if_need(cfg['out']['path_csv'])
    # Load data. Main cycle #########################################
    # Open input store and cycle through input table log records
    qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
    iSt = 1

    dfLogOld, cfg['out']['db'], cfg['out'][
        'b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    b_out_db_is_different = cfg['out']['db'] is not None and cfg['out'][
        'db_path_temp'] != cfg['in']['db_path']
    # Cycle for each table, for each row in log:
    # for path_csv in gen_names_and_log(cfg['out'], dfLogOld):
    with FakeContextIfOpen(
            lambda f: pd.HDFStore(f, mode='r'), cfg['in']['db_path'],
            None if b_out_db_is_different else cfg['out']['db']
    ) as cfg['in']['db']:  # does not open ['in']['db'] if it is already opened for writing

        for tbl in cfg['in']['tables']:
            if False:  # Show table info
                nodes = sorted(
                    cfg['out']['db'].root.__members__)  # , key=number_key
                print(nodes)
            print(tbl, end='. ')

            df_log = cfg['in']['db'].select(cfg['in']['tables_log'].format(tbl)
                                            or tbl,
                                            where=cfg['in']['query'])
            if True:  # try:
                if 'log' in cfg['program'].keys():
                    nRows = df_log.shape[0]
                    flog.writelines(datetime.now().strftime(
                        '\n\n%d.%m.%Y %H:%M:%S> processed ') + f'{nRows} row' +
                                    ('s:' if nRows > 1 else ':'))

                for ifile, r in enumerate(df_log.itertuples(),
                                          start=iSt):  # name=None
                    print('.', end='')
                    sys_stdout.flush()

                    path_raw = PurePath(r.fileName)
                    cfg['out']['log'].update(fileName=path_raw.name,
                                             fileChangeTime=r.fileChangeTime)
                    # save current state
                    cfg['in']['file_stem'] = cfg['out']['log'][
                        'fileName']  # for example, to allow extracting the date in a subprogram
                    cfg['in']['fileChangeTime'] = cfg['out']['log'][
                        'fileChangeTime']

                    if cfg['in']['b_skip_if_up_to_date']:
                        have_older_data, have_duplicates = h5del_obsolete(
                            cfg['out'], cfg['out']['log'], dfLogOld)
                        if have_older_data:
                            continue
                        if have_duplicates:
                            cfg['out']['b_remove_duplicates'] = True
                    print('{}. {}'.format(ifile, path_raw.name), end=': ')

                    # Load data
                    qstr = qstr_trange_pattern.format(r.Index, r.DateEnd)
                    df_raw = cfg['in']['db'].select(tbl, qstr)
                    cols = df_raw.columns.tolist()

                    # cfg['in']['lat'] and ['lon'] may be need in add_ctd_params() if Lat not in df_raw
                    if 'Lat_en' in df_log.columns and 'Lat' not in cols:
                        cfg['in']['lat'] = np.nanmean((r.Lat_st, r.Lat_en))
                        cfg['in']['lon'] = np.nanmean((r.Lon_st, r.Lon_en))

                    df = func_before_filter(df_raw, log_row=r, cfg=cfg)

                    if df.size:  # zero size means save only the log, not the data
                        # filter, updates cfg['out']['log']['rows']
                        df, _ = set_filterGlobal_minmax(
                            df, cfg['filter'], cfg['out']['log'])
                    if 'rows' not in cfg['out']['log']:
                        l.warning('no data!')
                        continue
                    elif isinstance(cfg['out']['log']['rows'], int):
                        print('filtered out {rows_filtered}, remains {rows}'.
                              format_map(cfg['out']['log']))
                        if cfg['out']['log']['rows']:
                            print('.', end='')
                        else:
                            l.warning('no data!')
                            continue

                    df = func_after_filter(df, cfg=cfg)

                    # Append to Store
                    h5_append(cfg['out'],
                              df,
                              cfg['out']['log'],
                              log_dt_from_utc=cfg['in']['dt_from_utc'])

                    # Copy to csv
                    if cfg['out'].get('path_csv'):
                        fname = '{:%y%m%d_%H%M}-{:%d_%H%M}'.format(
                            r.Index, r.DateEnd) + file_names_add(ifile)
                        if 'data_columns' not in cfg['out']:
                            cfg['out']['data_columns'] = slice(0,
                                                               -1)  # all cols
                        df.to_csv(  # [cfg['out']['data_columns']]
                            cfg['out']['path_csv'] / fname,
                            date_format=cfg['out']['text_date_format'],
                            float_format='%5.6g',
                            index_label='Time'
                        )  # to_string, line_terminator='\r\n'

                    # Log to screen (if not prohibited explicitly)
                    if cfg['out']['log'].get('Date0') is not None and (
                        ('b_log_display' not in cfg['program'])
                            or cfg['program']['b_log_display']):
                        str_log = '{fileName}:\t{Date0:%d.%m.%Y %H:%M:%S}-' \
                                  '{DateEnd:%d. %H:%M:%S%z}\t{rows}rows'.format_map(
                            cfg['out']['log'])  # \t{Lat}\t{Lon}\t{strOldVal}->\t{mag}
                        l.info(str_log)
                    else:
                        str_log = str(cfg['out']['log'].get('rows', '0'))
                    # Log to logfile
                    if 'log' in cfg['program'].keys():
                        flog.writelines('\n' + str_log)

    if b_out_db_is_different:
        try:
            if cfg['out']['tables'] is not None:
                print('')
                if cfg['out']['b_remove_duplicates']:
                    h5remove_duplicates(cfg['out'],
                                        cfg_table_keys=('tables',
                                                        'tables_log'))
                # Create full indexes. Must be done because of using ptprepack in h5move_tables() below
                l.debug('Create index')
                for tblName in (cfg['out']['tables'] +
                                cfg['out']['tables_log']):
                    try:
                        cfg['out']['db'].create_table_index(tblName,
                                                            columns=['index'],
                                                            kind='full')
                    except Exception as e:
                        l.warning(
                            'table %s. Index not created - error: %s', tblName,
                            '\n==> '.join(
                                [s for s in e.args if isinstance(s, str)]))
        except Exception as e:
            l.exception('The end. There is an error ')

            import traceback, code
            from sys import exc_info as sys_exc_info
            tb = sys_exc_info()[2]  # type, value,
            traceback.print_exc()
            last_frame = lambda tb=tb: last_frame(tb.tb_next
                                                  ) if tb.tb_next else tb
            frame = last_frame().tb_frame
            ns = dict(frame.f_globals)
            ns.update(frame.f_locals)
            code.interact(local=ns)
        finally:

            cfg['out']['db'].close()
            if cfg['program']['log']:
                flog.close()
            if cfg['out']['db'].is_open:
                print('Wait store is closing...')
                sleep(2)

            failed_storages = h5move_tables(cfg['out'])
            print('Finishing...' if failed_storages else 'Ok.', end=' ')
            h5index_sort(
                cfg['out'],
                out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
                in_storages=failed_storages)
Example #6
def main(new_arg=None):
    cfg = cfg_from_args(my_argparser(), new_arg)
    if not cfg or not cfg['program'].get('return'):
        print('Can not initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')

    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in'][
            'path'] = init_file_names(**cfg['in'],
                                      b_interact=cfg['program']['b_interact'],
                                      cfg_search_parent=cfg['out'])
        h5init(cfg['in'], cfg['out'])
    except Ex_nothing_done as e:
        print(e.message)
        exit()

    df_dummy = pd.DataFrame(
        np.full(1,
                np.NaN,
                dtype=np.dtype({
                    'formats': ['float64', 'float64'],
                    'names': cfg['out']['tracks_cols'][1:]
                })),
        index=(pd.NaT, ))  # used for insert separator lines

    if 'routes_cols' not in cfg['in']:
        cfg['in']['routes_cols'] = cfg['in']['waypoints_cols']
    if 'routes_cols' not in cfg['out']:
        cfg['out']['routes_cols'] = cfg['out'][
            'waypoints_cols']  # cfg['in']['routes_cols']  #
    # Writing
    if True:  # try:
        l.warning('processing ' + str(cfg['in']['nfiles']) + ' file' +
                  ('s:' if cfg['in']['nfiles'] > 1 else ':'))
        cfg['out']['log'] = {}
        set_field_if_no(cfg['out'], 'table_prefix',
                        PurePath(cfg['in']['path']).stem)
        cfg['out']['table_prefix'] = cfg['out']['table_prefix'].replace(
            '-', '')
        if len([t for t in cfg['out']['tables'] if len(t)]) > 1:
            cfg['out']['tables'] = \
                [cfg['out']['table_prefix'] + '_' + s for s in cfg['out']['tables']]
            cfg['out']['tables_log'] = \
                [cfg['out']['table_prefix'] + '_' + s for s in cfg['out']['tables_log']]

        tables = dict(zip(df_names, cfg['out']['tables']))
        tables_log = dict(zip(df_names, cfg['out']['tables_log']))
        # Cannot save path to DB (useless?), so only set max file name length for it:
        set_field_if_no(cfg['out'], 'logfield_fileName_len', 50)
        cfg['out']['index_level2_cols'] = cfg['in']['routes_cols'][0]

        # ###############################################################
        # ## Accumulate all data in cfg['out']['path_temp'] ##################
        ## Main cycle ############################################################
        for i1_file, path_gpx in h5_dispenser_and_names_gen(
                cfg['in'], cfg['out']):
            l.info('{}. {}: '.format(i1_file, path_gpx.name))
            # Loading data
            dfs = gpxConvert(cfg, path_gpx)
            print('write', end=': ')
            sys_stdout.flush()
            for key, df in dfs.items():
                if (not tables.get(key)) or df.empty:
                    continue
                elif key == 'tracks':
                    # Save last time so the next file can be filtered
                    cfg['in']['time_last'] = df.index[-1]

                sort_time = False if key in {'waypoints', 'routes'} else None

                # monkey patching
                if 'tracker' in tables[key]:
                    # Also {} must be in tables[key]. todo: better key+'_fun_tracker' in cfg['out']?
                    # Trackers processing
                    trackers_numbers = {
                        '0-3106432': '1',
                        '0-2575092': '2',
                        '0-3124620': '3',
                        '0-3125300': '4',
                        '0-3125411': '5',
                        '0-3126104': '6'
                    }
                    tables_pattern = tables[key]
                    tables_log_pattern = tables_log[key]

                    df['comment'] = df['comment'].str.split(" @",
                                                            n=1,
                                                            expand=True)[0]
                    # split data and save to multiple tables
                    df_all = df.set_index(['comment', df.index])
                    for sn, n in trackers_numbers.items(
                    ):  # set(df_all.index.get_level_values(0))
                        try:
                            df = df_all.loc[sn]
                        except KeyError:
                            continue
                        # redefine saving parameters
                        cfg['out']['table'] = tables_pattern.format(
                            trackers_numbers[sn])
                        cfg['out']['table_log'] = tables_log_pattern.format(
                            trackers_numbers[sn])
                        call_with_valid_kwargs(df_filter_and_save_to_h5,
                                               df,
                                               **cfg,
                                               input=cfg['in'],
                                               sort_time=sort_time)
                else:
                    cfg['out']['table'] = tables[key]
                    cfg['out']['table_log'] = tables_log[key]
                    call_with_valid_kwargs(df_filter_and_save_to_h5,
                                           df,
                                           **cfg,
                                           input=cfg['in'],
                                           sort_time=sort_time)

    # try:
    # if cfg['out']['b_remove_duplicates']:
    #     for tbls in cfg['out']['tables_have_wrote']:
    #         for tblName in tbls:
    #             cfg['out']['db'][tblName].drop_duplicates(keep='last', inplace= True)
    # print('Create index', end=', ')

    # create_table_index calls create_table whose docs say "cannot index Time64Col() or ComplexCol"
    # so load it, index, then save
    # level2_index = None
    # df = cfg['out']['db'][tblName] # last commented
    # df.set_index([navp_all_index, level2_index])
    # df.sort_index()

    # cfg['out']['db'][tblName].sort_index(inplace=True)

    # if df is not None:  # resave
    #     df_log = cfg['out']['db'][tblName]
    #     cfg['out']['db'].remove(tbls[0])
    #     cfg['out']['db'][tbls[0]] = df
    #     cfg['out']['db'][tbls[1]] = df_log

    try:
        pass
    except Exception as e:
        print('The end. There are error ', standard_error_info(e))

    #     import traceback, code
    #     from sys import exc_info as sys_exc_info
    #
    #     tb = sys_exc_info()[2]  # type, value,
    #     traceback.print_exc()
    #     last_frame = lambda tb=tb: last_frame(tb.tb_next) if tb.tb_next else tb
    #     frame = last_frame().tb_frame
    #     ns = dict(frame.f_globals)
    #     ns.update(frame.f_locals)
    #     code.interact(local=ns)
    # finally:
    #     cfg['out']['db'].close()
    #     failed_storages= h5move_tables(cfg['out'], cfg['out']['tables_have_wrote'])

    try:
        failed_storages = h5move_tables(cfg['out'],
                                        tbl_names=cfg['out'].get(
                                            'tables_have_wrote', set()))
        print('Finishing...' if failed_storages else 'Ok.', end=' ')
        # Sort if we have any processed data that needs it (not the case for routes and waypoints); otherwise skip, because ``ptprepack`` does not close the hdf5 source if it finds no data
        if cfg['in'].get('time_last'):
            cfg['out']['b_remove_duplicates'] = True
            h5index_sort(
                cfg['out'],
                out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
                in_storages=failed_storages,
                tables=cfg['out'].get('tables_have_wrote', set()))
    except Ex_nothing_done:
        print('ok')
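The tracker branch above splits one GPX dataframe into per-tracker tables via a two-level index and `.loc` selection. A tiny self-contained illustration of that pandas pattern (the sample data is made up):

import pandas as pd

df = pd.DataFrame({'comment': ['0-3106432 @some text', '0-2575092 @other'],
                   'Lat': [54.1, 54.2]},
                  index=pd.to_datetime(['2020-01-01', '2020-01-02']))
df['comment'] = df['comment'].str.split(' @', n=1, expand=True)[0]  # keep tracker id only
df_all = df.set_index(['comment', df.index])                        # (tracker id, time) index
print(df_all.loc['0-3106432'])  # rows of one tracker, time index restored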
Example #7
def h5_append(cfg_out: Dict[str, Any],
              df: Union[pd.DataFrame, dd.DataFrame],
              log,
              log_dt_from_utc=pd.Timedelta(0),
              tim=None):
    '''
    Append dataframe to Store: df to the cfg_out['table'] ``table`` node, and
    append a child table with one row of metadata including 'index' and 'DateEnd',
    taken as the first and last elements of df.index

    :param df: pandas or dask dataframe to append. If dask then log_dt_from_utc must be None (do not assign log metadata here)
    :param log: dict which will be appended to child tables, cfg_out['tables_log']
    :param cfg_out: dict with fields:
        db: opened hdf5 store in write mode
        table: name of table to update (or tables: list, then only 1st element is used)
        table_log: name of child table (or tables_log: list, then only 1st element is used)
        tables: None - return having done nothing!
                list of str - to assign cfg_out['table'] = cfg_out['tables'][0]
        tables_log: list of str - to assign cfg_out['table_log'] = cfg_out['tables_log'][0]
        b_insert_separator: (optional), freq (optional)
        data_columns: optional, list of column names to write.
        chunksize: may be None, but then chunksize_percent must be defined to calculate it:
            chunksize = len(df) * chunksize_percent / 100
    :param log_dt_from_utc: 0 or pd.Timedelta - to correct start and end time: index and DateEnd.
        Note: if log_dt_from_utc is None then start and end time: 'Date0' and 'DateEnd' fields of log must be filled right already
    :return: None
    :updates:
        log:
            'Date0' and 'DateEnd'
        cfg_out: only if not defined already:
            cfg_out['table_log'] = cfg_out['tables_log'][0]
            tables_have_wrote set appended (or created) with tuple `(table, table_log)`
    '''

    df_len = len(df) if tim is None else len(
        tim)  # use computed values if possible for faster dask
    if df_len:  # dask.dataframe.empty is not implemented
        if cfg_out.get('b_insert_separator'):
            # Add separation row of NaN
            msg_func = f'{df_len}rows+1dummy'
            cfg_out.setdefault('fs')
            df = h5_append_dummy_row(df, cfg_out['fs'], tim)
            df_len += 1
        else:
            msg_func = f'{df_len}rows'

        # Save to store
        # check/set tables names
        if 'tables' in cfg_out:
            if cfg_out['tables'] is None:
                l.info('selected(%s)... ', msg_func)
                return
            set_field_if_no(cfg_out, 'table', cfg_out['tables'][0])

        l.info('h5_append(%s)... ', msg_func)
        set_field_if_no(cfg_out, 'nfiles', 1)

        if 'chunksize' in cfg_out and cfg_out['chunksize'] is None:
            if ('chunksize_percent' in cfg_out):  # based on first file
                cfg_out['chunksize'] = int(
                    df_len * cfg_out['chunksize_percent'] / 1000) * 10
                if cfg_out['chunksize'] < 10000:
                    cfg_out['chunksize'] = 10000
            else:
                cfg_out['chunksize'] = 10000

                if df_len <= 10000 and isinstance(df, dd.DataFrame):
                    df = df.compute()  # dask does not write all-NaN rows

            # , compute=False
            # cfg_out['db'].append(cfg_out['table'], df, data_columns=True, index=False,
            #              chunksize=cfg_out['chunksize'])
        table = None
        try:
            table = df_data_append_fun(df, cfg_out['table'], cfg_out)
        except ValueError as e:
            table = h5append_on_inconsistent_index(cfg_out, cfg_out['table'],
                                                   df, df_data_append_fun, e,
                                                   msg_func)
        except TypeError as e:  # (, AttributeError)?
            if isinstance(df, dd.DataFrame):
                last_nan_row = df.loc[df.index.compute()[-1]].compute()
                # df.compute().query("index >= Timestamp('{}')".format(df.index.compute()[-1].tz_convert(None))) ??? works
                # df.query("index > Timestamp('{}')".format(t_end.tz_convert(None)), meta) #df.query(f"index > {t_end}").compute()
                if all(last_nan_row.isna()):
                    l.exception(
                        f'{msg_func}: dask does not write the separator? Retrying using pandas'
                    )
                    table = df_data_append_fun(
                        last_nan_row,
                        cfg_out['table'],
                        cfg_out,
                        min_itemsize={
                            c: 1
                            for c in (cfg_out['data_columns'] if cfg_out.
                                      get('data_columns', True
                                          ) is not True else df.columns)
                        })
                    # sometimes pandas/dask hit a bug (thinks int is a str?): when adding a row of NaNs it tries to find ``min_itemsize`` and obtains NaN (for float too, why?) which leads to an error
                else:
                    l.exception(msg_func)
            else:
                l.error('%s: Can not write to store. %s', msg_func,
                        standard_error_info(e))
                raise (e)
        except Exception as e:
            l.error('%s: Can not write to store. %s', msg_func,
                    standard_error_info(e))
            raise e

    # run even if df is empty because we may need to write only the log
    table_log = h5add_log(cfg_out, df, log, tim, log_dt_from_utc)

    _t = (table, table_log)
    if 'tables_have_wrote' in cfg_out:
        cfg_out['tables_have_wrote'].add(_t)
    else:
        cfg_out['tables_have_wrote'] = {_t}
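A minimal usage sketch of the interface described in the docstring (assuming `h5_append` is importable from the surrounding module; the store and table names are made up):

import pandas as pd

cfg_out = {
    'db': pd.HDFStore('out.h5', mode='a'),  # opened hdf5 store in write mode
    'tables': ['ctd'],                      # -> cfg_out['table'] = 'ctd'
    'tables_log': ['ctd/log'],              # -> cfg_out['table_log'] = 'ctd/log'
}
df = pd.DataFrame({'Temp': [10.1, 10.2]},
                  index=pd.date_range('2020-01-01', periods=2, freq='s'))
log = {'fileName': 'example.csv', 'fileChangeTime': pd.Timestamp('2020-01-01')}
h5_append(cfg_out, df, log)  # appends df and a one-row log entry
cfg_out['db'].close()
# cfg_out['tables_have_wrote'] now holds {('ctd', 'ctd/log')}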
Example #8
def h5add_log(cfg_out: Dict[str, Any], df,
              log: Union[pd.DataFrame, Mapping, None], tim, log_dt_from_utc):
    """
    Updates (or creates if need) metadata table
    :param cfg_out: dict with fields:
     - b_log_ready: if False or '' then updates log['Date0'], log['DateEnd'].
     - db: handle of opened hdf5 store
     - some of following fields (next will be tried if previous not defined):
         - table_log: str, path of log table
         - tables_log: List[str], path of log table in first element
         - table: str, path of log table will be constructed by adding '/log'
         - tables: List[str], path of log table will be constructed by adding '/log' to first element
     - logfield_fileName_len: optional, fixed length of string format of 'fileName' hdf5 column
    :param df:
    :param log: Mapping of records or a dataframe. 'Date0' and 'DateEnd' are updated if 'Date0' is absent, {} or None
    :param tim:
    :param log_dt_from_utc:
    :return:
    """
    if cfg_out.get('b_log_ready') and (isinstance(log, Mapping) and not log):
        return

    # synchronise "tables_log" with the more user friendly but less universal "table_log"

    if cfg_out.get('table_log'):
        table_log = cfg_out['table_log']
    else:
        table_log = cfg_out.get('tables_log')
        if table_log:
            if '{}' in table_log[0]:
                table_log = table_log[0].format(cfg_out['table'])
            else:
                table_log = table_log[0]

        else:  # set default for (1st) data table
            try:
                table_log = f"{cfg_out['table']}/log"
            except KeyError:
                table_log = f"{cfg_out['tables'][0]}/log"

    set_field_if_no(cfg_out, 'logfield_fileName_len', 255)

    if (log.get('Date0') is None) or not cfg_out.get('b_log_ready'):
        # or (table_log.split('/')[-1].startswith('logFiles')):
        log['Date0'], log['DateEnd'] = timzone_view(
            (tim if tim is not None else df.index.compute() if isinstance(
                df, dd.DataFrame) else df.index)[[0, -1]], log_dt_from_utc)
    # dfLog = pd.DataFrame.from_dict(log, np.dtype(np.unicode_, cfg_out['logfield_fileName_len']))
    if not isinstance(log, pd.DataFrame):
        try:
            log = pd.DataFrame(log).set_index('Date0')
        except ValueError as e:  # , Exception
            log = pd.DataFrame.from_records(
                log,
                exclude=['Date0'],
                index=log['Date0']
                if isinstance(log['Date0'], pd.DatetimeIndex) else
                [log['Date0']])  # index='Date0' does not work for dict

    try:
        return df_log_append_fun(log, table_log, cfg_out)
    except ValueError as e:
        return h5append_on_inconsistent_index(cfg_out, table_log, log,
                                              df_log_append_fun, e,
                                              'append log')
    except ClosedFileError as e:
        l.warning('Check code: On reopen store update store variable')
Example #9
def main(new_arg=None, **kwargs):
    """

    :param new_arg: list of strings, command line arguments
    :kwargs: dicts for each section: to overwrite values in them (overwrites even high priority values, other values remain)
    Note: if new_arg == '<cfg_from_args>' the function returns cfg, but cfg will be None
     if '-h' or '-v' is passed in argv[1:]
    argv[1] is cfgFile. It was used with cfg files:
        'csv2h5_nav_supervisor.ini'
        'csv2h5_IdrRedas.ini'
        'csv2h5_Idronaut.ini'

    :return:
    """
    global l

    cfg = cfg_from_args(my_argparser(), new_arg, **kwargs)
    if not cfg or not cfg['program'].get('return'):
        print('Can not initialise')
        return cfg
    elif cfg['program']['return'] == '<cfg_from_args>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    print('\n' + this_prog_basename(__file__), end=' started. ')
    try:
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in'][
            'path'] = init_file_names(**cfg['in'],
                                      b_interact=cfg['program']['b_interact'])
    except Ex_nothing_done as e:
        print(e.message)
        return ()

    bOld_FF00FF = False
    # if 'TermGrunt' in sys.argv[1] FF00FF' in str(cfg['in']['path']):  # 'TermGrunt.h5'  ? args.path.endswith ('bin'):
    #     bOld_FF00FF = True
    #     cfg['in'].update({
    #     'header': 'TERM',
    #     'dt_from_utc': timedelta(hours=-1),
    #     'fs': 1, 'b_time_fromtimestamp': True,
    #     'b_time_fromtimestamp_source': False})
    # else:  # 'Katran.h5'
    #     cfg['in'].update({
    #     'delimiter_hex': '000000E6',
    #     'header': 'P, Temp, Cond',
    #     'dt_from_utc': timedelta(hours=0),
    #     'fs': 10, 'b_time_fromtimestamp': False,
    #     'b_time_fromtimestamp_source': False})

    set_field_if_no(
        cfg['in'], 'dtype', 'uint{:d}'.format(2**(3 + np.searchsorted(
            2**np.array([3, 4, 5, 6, 7]) > np.array(
                8 * (cfg['in']['data_word_len'] - 1)), 1))))
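    # The expression above picks the narrowest power-of-two unsigned integer dtype that
    # holds data_word_len bytes: data_word_len=1 -> 'uint8', 2 -> 'uint16',
    # 3 or 4 -> 'uint32' (worked out from the formula; not stated in the source).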

    # Prepare specific format loading and writing
    set_field_if_no(cfg['in'], 'coltime', [])
    cfg['in'] = init_input_cols(cfg['in'])
    cfg['out']['names'] = np.array(cfg['in']['dtype'].names)[ \
        cfg['in']['cols_loaded_save_b']]
    cfg['out']['formats'] = [
        cfg['in']['dtype'].fields[n][0] for n in cfg['out']['names']
    ]
    cfg['out']['dtype'] = np.dtype({
        'formats': cfg['out']['formats'],
        'names': cfg['out']['names']
    })
    h5init(cfg['in'], cfg['out'])

    # cfg['Period'] = 1.0 / cfg['in']['fs']  # instead Second can use Milli / Micro / Nano:
    # cfg['pdPeriod'] = pd.to_timedelta(cfg['Period'], 's')
    # #pd.datetools.Second(cfg['Period'])\
    #     if 1 % cfg['in']['fs'] == 0 else\
    #     pd.datetools.Nano(cfg['Period'] * 1e9)

    # log table of loaded files. columns: Start time, file name, and its index in the array of all loaded data:
    log_item = cfg['out']['log'] = {
    }  # fields will have: 'fileName': None, 'fileChangeTime': None, 'rows': 0

    strLog = ''
    # from collections import namedtuple
    # type_log_files = namedtuple('type_log_files', ['label','iStart'])
    # log.sort(axis=0, order='log_item['Date0']')#sort files by time

    dfLogOld, cfg['out']['db'], cfg['out'][
        'b_skip_if_up_to_date'] = h5temp_open(**cfg['out'])
    if 'log' in cfg['program'].keys():
        f = open(PurePath(sys_argv[0]).parent / cfg['program']['log'],
                 'a',
                 encoding='cp1251')
        f.writelines(
            datetime.now().strftime('\n\n%d.%m.%Y %H:%M:%S> processed ' +
                                    str(cfg['in']['nfiles']) + ' file' +
                                    ('s:' if cfg['in']['nfiles'] > 1 else ':')))
    b_remove_duplicates = False  # normally no duplicates but will if detect
    # Config specially for readBinFramed
    set_field_if_no(cfg['in'], 'b_byte_order_is_big_endian', True)
    set_field_if_no(cfg['in'], 'b_baklan', False)
    set_field_if_no(cfg['in'], 'b_time_fromtimestamp_source', False)
    cfg['out']['fs'] = cfg['in']['fs']
    if True:
        ## Main cycle ############################################################
        for i1_file, path_in in h5_dispenser_and_names_gen(
                cfg['in'], cfg['out']):
            l.info('{}. {}: '.format(i1_file, path_in.name))

            # Loading data
            if bOld_FF00FF:
                V = readFF00FF(path_in, cfg)
                iFrame = np.arange(len(V))
            else:
                V, iFrame = readBinFramed(path_in, cfg['in'])
            if ('b_time_fromtimestamp' in cfg['in'] and cfg['in']['b_time_fromtimestamp']) or \
                    ('b_time_fromtimestamp_source' in cfg['in'] and cfg['in']['b_time_fromtimestamp_source']):
                path_in_rec = os_path.join(
                    'd:\\workData\\_source\\BalticSea\\151021_T1Grunt_Pregol\\_source\\not_corrected',
                    os_path.basename(path_in)[:-3] + 'txt'
                ) if cfg['in']['b_time_fromtimestamp_source'] else path_in
                log_item['Date0'] = datetime.fromtimestamp(
                    os_path.getmtime(path_in_rec))  # getctime is bad
                log_item['Date0'] -= iFrame[-1] * timedelta(
                    seconds=1 / cfg['in']['fs']
                )  # assumes the computer file timestamp marks the end of recording
            else:
                log_item['Date0'] = datetime.strptime(
                    path_in.stem, cfg['in']['filename2timestart_format'])
            log_item['Date0'] += cfg['in']['dt_from_utc']
            tim = log_item['Date0'] + iFrame * timedelta(
                seconds=1 / cfg['in']['fs']
            )  # tim = pd.date_range(log_item['Date0'], periods=np.size(V, 0), freq=cfg['pdPeriod'])
            df = pd.DataFrame(
                V.view(dtype=cfg['out']['dtype']),  # np.uint16
                columns=cfg['out']['names'],
                index=tim)
            # pd.DataFrame(V, columns=cfg['out']['names'], dtype=cfg['out']['formats'], index=tim)
            if df.empty:  # log['rows']==0
                print('No data => skip file')
                continue

            df, tim = set_filterGlobal_minmax(df,
                                              cfg_filter=cfg['filter'],
                                              log=log_item,
                                              dict_to_save_last_time=cfg['in'])
            if log_item['rows_filtered']:
                print('filtered out {}, remains {}'.format(
                    log_item['rows_filtered'], log_item['rows']))
            if not log_item['rows']:
                l.warning('no data! => skip file')
                continue
            elif log_item['rows']:
                print(
                    '.', end=''
                )  # , divisions=d.divisions), divisions=pd.date_range(tim[0], tim[-1], freq='1D')
            else:
                l.warning('no data! => skip file')
                continue

            # Append to Store
            h5_append(cfg['out'], df.astype('int32'), log_item)

            if 'txt' in cfg['program'].keys():  # can be saved as text too
                np.savetxt(cfg['program']['txt'],
                           V,
                           delimiter='\t',
                           newline='\n',
                           header=cfg['in']['header'] + log_item['fileName'],
                           fmt='%d',
                           comments='')

    try:
        if b_remove_duplicates:
            for tblName in (cfg['out']['table'] +
                            cfg['out']['tableLog_names']):
                cfg['out']['db'][tblName].drop_duplicates(
                    keep='last', inplace=True)  # subset='fileName',?
        if len(strLog):
            print('Create index', end=', ')
            for tblName in (cfg['out']['table'] +
                            cfg['out']['tableLog_names']):
                cfg['out']['db'].create_table_index(tblName,
                                                    columns=['index'],
                                                    kind='full')
        else:
            print('done nothing')
    except Exception as e:
        l.exception('The end. There is an error ')

        import traceback, code
        from sys import exc_info as sys_exc_info

        tb = sys_exc_info()[2]  # type, value,
        traceback.print_exc()
        last_frame = lambda tb=tb: last_frame(tb.tb_next) if tb.tb_next else tb
        frame = last_frame().tb_frame
        ns = dict(frame.f_globals)
        ns.update(frame.f_locals)
        code.interact(local=ns)
    # sort index if we have any processed data (needed because ``ptprepack`` does not close the hdf5 source if it finds no data)
    if cfg['in'].get('time_last'):
        failed_storages = h5move_tables(cfg['out'])
        print('Ok.', end=' ')
        h5index_sort(
            cfg['out'],
            out_storage_name=f"{cfg['out']['db_path'].stem}-resorted.h5",
            in_storages=failed_storages)
Example #10
def main(new_arg=None, veusze=None):
    """
    Note: if the vsz data source has an 'Ag_old_inv' variable then do not invert the coef; else invert it for use in vsz files which do not invert coefs
    :param new_arg:
    :return:
    """
    global l
    p = veuszPropagate.my_argparser()
    p_groups = {
        g.title: g
        for g in p._action_groups if g.title.split(' ')[-1] != 'arguments'
    }  # skips special argparse groups
    p_groups['in'].add(
        '--channels_list',
        help=
        'channels that need zero calibration: "magnetometer" or "M" for magnetometer, anything else for accelerometer; use "M, A" for both, empty to skip '
    )
    p_groups['in'].add(
        '--widget',
        help=
        'path to Veusz widget property which contains coefficients. For example "/fitV(force)/grid1/graph/fit1/values"'
    )
    p_groups['in'].add(
        '--data_for_coef',
        default='max_incl_of_fit_t',
        help=
        'Veusz data to use as coef. If used with widget then this data is appended to data from widget'
    )

    p_groups['out'].add('--out.path', help='path to db where write coef')
    p_groups['out'].add(
        '--re_tbl_from_vsz_name',
        help=
        'regex to extract hdf5 table name from the Veusz file name (last used "\D*\d*")'
        # ? why not simply specify the table name?
    )
    # todo: the "b_update_existed" arg will be used here for exported images. Check whether False works or prevents opening vsz

    cfg = cfg_from_args(p, new_arg)

    if not Path(cfg['program']['log']).is_absolute():
        cfg['program']['log'] = str(
            Path(__file__).parent.joinpath(
                cfg['program']['log']))  # l.root.handlers[0].baseFilename
    if not cfg:
        return
    if new_arg == '<return_cfg>':  # to help testing
        return cfg

    l = init_logging(logging, None, cfg['program']['log'],
                     cfg['program']['verbose'])
    veuszPropagate.l = l
    print('\n' + this_prog_basename(__file__), 'started', end=' ')
    if cfg['out']['b_images_only']:
        print('in images only mode.')
    try:
        print('Output pattern ')
        # Using cfg['out'] to store pattern information
        if not Path(cfg['in']['pattern_path']).is_absolute():
            cfg['in']['pattern_path'] = str(cfg['in']['path'].parent.joinpath(
                cfg['in']['pattern_path']))
        set_field_if_no(cfg['out'], 'path', cfg['in']['pattern_path'])
        cfg['out']['paths'], cfg['out']['nfiles'], cfg['out'][
            'path'] = init_file_names(**cfg['out'],
                                      b_interact=cfg['program']['b_interact'])
    except Ex_nothing_done as e:
        print(e.message, ' - no pattern')
        return  # or raise FileNotFoundError?
    try:
        print(end='Data ')
        cfg['in']['paths'], cfg['in']['nfiles'], cfg['in'][
            'path'] = init_file_names(
                **cfg['in'], b_interact=False)  # do not bother user 2nd time
    except Ex_nothing_done as e:
        print(e.message)
        return  # or raise FileNotFoundError?
    if not cfg['out']['export_dir']:
        cfg['out']['export_dir'] = Path(cfg['out']['path']).parent
    if cfg['program']['before_next'] and 'restore_config' in cfg['program'][
            'before_next']:
        cfg['in_saved'] = cfg['in'].copy()
    # cfg['loop'] = asyncio.get_event_loop()
    # cfg['export_timeout_s'] = 600
    cfg['out']['export_dir'] = dir_from_cfg(cfg['out']['path'].parent,
                                            cfg['out']['export_dir'])

    veuszPropagate.load_vsz = veuszPropagate.load_vsz_closure(
        cfg['program']['veusz_path'],
        b_execute_vsz=cfg['program']['b_execute_vsz'])
    gen_veusz_and_logs = veuszPropagate.load_to_veusz(
        veuszPropagate.ge_names(cfg), cfg, veusze)

    names_get = ['Inclination_mean_use1', 'logVext1_m__s'
                 ]  # \, 'Inclination_mean_use2', 'logVext2_m__s'
    names_get_fits = ['fit']  # , 'fit2'
    vsz_data = {n: [] for n in names_get}
    for n in names_get_fits:
        vsz_data[n] = []

    # prepare collecting all coef in text also
    names_get_txt_results = ['fit1result']  # , 'fit2result'
    txt_results = {n: {} for n in names_get_txt_results}

    i_file = 0
    for veusze, log in gen_veusz_and_logs:
        if not veusze:
            continue
        i_file += 1
        print(i_file)
        if cfg['out']['re_tbl_from_vsz_name']:
            table = cfg['out']['re_tbl_from_vsz_name'].match(
                log['out_name']).group()
        else:
            table = re.sub(
                '^[\d_]*', '',
                log['out_name'])  # delete all first digits (date part)

        for n in names_get:
            vsz_data[n].append(veusze.GetData(n)[0])
        for n in [cfg['in']['data_for_coef']]:
            vsz_data[n] = list(veusze.GetData(n)[0])

        # Save velocity coefficients into //{table}//coef//Vabs{i} where i is the fit number enumerated from 0
        for i, name_out in enumerate(names_get_fits):  # ['fit1', 'fit2']
            coef = veusze.Get(
                cfg['in']['widget']
            )  # veusze.Root['fitV(inclination)']['grid1']['graph'][name_out].values.val
            if 'a' in coef:
                coef_list = [
                    coef[k] for k in ['d', 'c', 'b', 'a'] if k in coef
                ]
            else:
                coef_list = [
                    coef[k] for k in sorted(coef.keys(), key=digits_first)
                ]
            if cfg['in']['data_for_coef']:
                coef_list += vsz_data[cfg['in']['data_for_coef']]

            vsz_data[name_out].append(coef_list)
            h5copy_coef(None,
                        cfg['out']['path'],
                        table,
                        dict_matrices={
                            f'//coef//Vabs{i}':
                            coef_list,
                            f'//coef//date':
                            np.float64([
                                np.NaN,
                                np.datetime64(datetime.now()).astype(np.int64)
                            ])
                        })
            # h5savecoef(cfg['out']['path'], path=f'//{table}//coef//Vabs{i}', coef=coef_list)
            txt_results[names_get_txt_results[i]][table] = str(coef)

        # Zeroing matrix - calculated in Veusz by rotation on old0pitch old0roll
        Rcor = veusze.GetData(
            'Rcor'
        )[0]  # zeroing angles tuned by "USEcalibr0V_..." in Veusz Custom definitions

        if len(cfg['in']['channels']):
            l.info(
                'Applying zero calibration matrix of pitch = {} and roll = {} degrees'
                .format(np.rad2deg(veusze.GetData('old0pitch')[0][0]),
                        np.rad2deg(veusze.GetData('old0roll')[0][0])))
            with h5py.File(cfg['out']['path'], 'a') as h5:
                for channel in cfg['in']['channels']:
                    (col_str, coef_str) = channel_cols(channel)
                    # h5savecoef(cfg['out']['path'], path=f'//{table}//coef//Vabs{i}', coef=coef_list), dict_matrices={'//coef//' + coef_str + '//A': coefs[tbl][channel]['A'], '//coef//' + coef_str + '//C': coefs[tbl][channel]['b']})

                    # Currently used inclinometers have electronics rotated by 180 deg. Previously we inserted an additional
                    # rotation operation in Veusz by inverting A_old. Now we want to include this information in the database coef only.
                    try:  # Checking that A_old_inv exist
                        A_old_inv = veusze.GetData('Ag_old_inv')
                        is_old_used = True  # Rcor does not account for the rotated electronics.
                    except KeyError:
                        is_old_used = False  # Rcor accounts for the rotated electronics.

                    if is_old_used:  # The rotation is done in vsz (A_old in vsz is inverted) so we need to rotate it back to
                        # use it in vsz without such inversion

                        # Rotate on 180 deg (note: this is not inversion)
                        A_old_inv = h5[f'//{table}//coef//{coef_str}//A'][...]
                        A_old = np.dot(A_old_inv,
                                       [[1, 0, 0], [0, -1, 0], [0, 0, -1]
                                        ])  # adds 180 deg to roll
                    else:
                        A_old = h5[f'//{table}//coef//{coef_str}//A'][...]
                    # A_old now accounts for rotated electronic

                    A = np.dot(Rcor, A_old)
                    h5copy_coef(None,
                                h5,
                                table,
                                dict_matrices={f'//coef//{coef_str}//A': A})

        # veusze.Root['fitV(inclination)']['grid1']['graph2'][name_out].function.val
        print(vsz_data)
        veuszPropagate.export_images(
            veusze,
            cfg['out'],
            f"_{log['out_name']}",
            b_skip_if_exists=not cfg['out']['b_update_existed'])

        # vsz_data = veusz_data(veusze, cfg['in']['data_yield_prefix'])
        # # caller do some processing of data and gives new cfg:
        # cfgin_update = yield(vsz_data, log)  # to test run veusze.Save('-.vsz')
        # cfg['in'].update(cfgin_update)  # only update of cfg.in.add_custom_expressions is tested
        # if cfg['in']['add_custom']:
        #     for n, e in zip(cfg['in']['add_custom'], cfg['in']['add_custom_expressions']):
        #         veusze.AddCustom('definition', n, e, mode='replace')
        # #cor_savings.send((veusze, log))
        #
        #
        #
        #

    # veusze.Save(str(path_vsz_save), mode='hdf5')  # veusze.Save(str(path_vsz_save)) saves time with bad resolution
    print('Ok')
    print(txt_results)
    for n in names_get:
        pd.DataFrame.from_dict(
            dict(zip(list(txt_results['fit1result'].keys()),
                     vsz_data[n]))).to_csv(
                         Path(cfg['out']['path']).with_name(
                             f'average_for_fitting-{n}.txt'),
                         sep='\t',
                         header=list(txt_results['fit1result'].keys()),
                         mode='a')
    return {**vsz_data, 'veusze': veusze}
Example #11
def main(config: ConfigType) -> None:
    """
    ----------------------------
    Save data to CSV-like files
    from Pandas HDF5 store*.h5
    ----------------------------

    :param config: with fields:
    - in - mapping with fields:
      - tables_log: - log table name or pattern str for it: in pattern '{}' will be replaced by data table name
      - cols_good_data: -
      ['dt_from_utc', 'db', 'db_path', 'table_nav']
    - out - mapping with fields:
      - cols: can use i - data row number and i_log_row - log row number that is used to load data range
      - cols_log: can use i - log row number
      - text_date_format
      - file_name_fun, file_name_fun_log - {fun} part of "lambda rec_num, t_st, t_en: {fun}" string to compile function
      for name of data and log text files
      - sep

    """
    global cfg
    cfg = to_vaex_hdf5.cfg_dataclasses.main_init(config, cs_store_name)
    cfg = to_vaex_hdf5.cfg_dataclasses.main_init_input_file(cfg, cs_store_name)
    #h5init(cfg['in'], cfg['out'])
    #cfg['out']['dt_from_utc'] = 0

    qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
    # Prepare saving to csv:
    # compile file name functions for the data files and the log list
    for fun in ['file_name_fun', 'file_name_fun_log']:
        cfg['out'][fun] = (
            eval(
                compile(f"lambda i, t_st, t_en: {cfg['out'][fun]}", '',
                        'eval')) if fun in cfg['out'] else
            ((lambda rec_num, t_st, t_en: 'log.csv') if fun.endswith('log')
             else lambda rec_num, t_st, t_en: '.csv')  # f'_{i}.csv'
        )
    set_field_if_no(cfg['out'], 'text_path', cfg['in']['db_path'].parent)
    dir_create_if_need(cfg['out']['text_path'])

    ## Main loop ##############################################################
    i_log_row_st = 0
    for tbl, tbl_log, store in h5_tables_gen(cfg['in']['db_path'],
                                             cfg['in']['tables'],
                                             cfg['in']['tables_log']):
        # save log list
        if tbl_log:
            df_log = store.select(tbl_log, where=cfg['in']['query'])
            lf.info('Saving {} data files of ranges listed in {}',
                    df_log.shape[0], tbl_log)

            df_log_csv = order_cols(df_log, cfg['out']['cols_log'])

            # df_log_csv = interp_vals(
            #     df_log_csv,
            #     df_search=None,
            #     #cols_good_data = None,
            #     db = store,
            #     dt_search_nav_tolerance = timedelta(minutes=2)
            #     )

            df_log_csv.to_csv(
                cfg['out']['text_path'] / cfg['out']['file_name_fun_log'](
                    i_log_row_st, df_log.index[0], df_log.DateEnd.iloc[-1]),
                date_format=cfg['out']['text_date_format'],
                float_format=cfg['out']['text_float_format'],
                sep=cfg['out']['sep'])
        else:
            lf.info('{}: ', tbl)

        for i_log_row, log_row in enumerate(
                df_log.itertuples(), start=i_log_row_st
        ):  #  h5log_rows_gen(table_log=tbl_log, db=store, ):
            # Load data chunk that log_row describes
            print('.', end='')
            qstr = qstr_trange_pattern.format(log_row.Index, log_row.DateEnd)
            df_raw = store.select(tbl, qstr)
            df_raw['i_log_row'] = i_log_row
            df_csv = order_cols(df_raw, cfg['out']['cols'])
            # Save data
            df_csv.to_csv(cfg['out']['text_path'] /
                          cfg['out']['file_name_fun'](
                              i_log_row, df_raw.index[0], df_raw.index[-1]),
                          date_format=cfg['out']['text_date_format'],
                          float_format=cfg['out']['text_float_format'],
                          sep=cfg['out']['sep'])

        i_log_row_st += df_log.shape[0]

    print('Ok>', end=' ')
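The file_name_fun strings above are compiled into callables with eval(compile(...)), and each data range is selected with a query built from qstr_trange_pattern. A minimal sketch of both mechanisms, with a hypothetical expression string and timestamps (no HDF5 store is opened here):

from datetime import datetime

# Hypothetical user-supplied {fun} part; in main() the real value comes from cfg['out']
fun_str = "f'{t_st:%y%m%d_%H%M}-{t_en:%H%M}_{i:03d}.csv'"
file_name_fun = eval(compile(f"lambda i, t_st, t_en: {fun_str}", '', 'eval'))

t_st = datetime(2021, 6, 1, 12, 0)
t_en = datetime(2021, 6, 1, 18, 30)
print(file_name_fun(0, t_st, t_en))  # -> 210601_1200-1830_000.csv

# Range query built the same way as qstr_trange_pattern above
# (it would be passed to store.select(tbl, qstr))
qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
print(qstr_trange_pattern.format(t_st, t_en))
# -> index>=Timestamp('2021-06-01 12:00:00') & index<=Timestamp('2021-06-01 18:30:00')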
Example #12
0
def init_input_cols(*, header=None, dtype, converters=None, cols_load, max_text_width=2000, dt_from_utc=0, comments='"',
                    cols_loaded_save_b=None):
    """ Append/modify dictionary cfg_in for parameters of dask/pandas load_csv() function and of save to hdf5.
    :param header (required if no 'cols'): comma/space separated string, column names in source file data header. Used to find cfg_in['cols']
         if last is not cpecified. May have format cpecifiers: '(text)','(float)','(time)', and also not used cols
         cpecified by skipping name between commas like in 'col1,,,col4' as in Veusz standard input dialog.
    :param dtype: type of data in column (as in Numpy loadtxt)
    :param converters: dict (see "converters" in Numpy loadtxt) or function(cfg_in) to make dict here
    :param cols_load: list of used column names

    :return: modified cfg_in dictionary. Will have fields:
        cols - list constructed from header by spit and remove format cpecifiers: '(text)', '(float)', '(time)'
        cols_load - list[int], indexes of ``cols`` in needed to save order
        coltime/coldate - index of 'Time'/'Date' column
        dtype: numpy.dtype of data after using loading function but before filtering/calculating fields
            numpy.float64 - default and for '(float)' format specifier
            numpy string with length cfg_in['max_text_width'] - for '(text)'
            datetime64[ns] - for coldate column (or coltime if no coldate) and for '(time)'
        col_index_name - index name for saving Pandas frame. Will be set to name of cfg_in['coltime'] column if not exist already
        used in main() default time postload proc only (if no specific loader which calculates and returns time column for index)
        cols_loaded_save_b - columns mask of cols_load to save (some columns needed only before save to calculate
        of others). Default: exluded (text) columns and index and coldate
        (because index saved in other variable and coldate may only used to create it)

    Example
    -------
    header= u'`Ensemble #`,txtYY_M_D_h_m_s_f(text),,,Top,`Average Heading (degrees)`,`Average Pitch (degrees)`,stdPitch,`Average Roll (degrees)`,stdRoll,`Average Temp (degrees C)`,txtVe_none(text) txtVn_none(text) txtVup(text) txtErrVhor(text) txtInt1(text) txtInt2(text) txtInt3(text) txtInt4(text) txtCor1(text) txtCor2(text) txtCor3(text) txtCor4(text),,,SpeedE_BT SpeedN_BT SpeedUp ErrSpeed DepthReading `Bin Size (m)` `Bin 1 Distance(m;>0=up;<0=down)` absorption IntScale'.strip()
    """
    cfg_in = locals()   # must be the 1st statement in the function so it captures only the input args
    dtype_text_max = '|S{:.0f}'.format(max_text_width)  # np.str

    if header:  # if header specified
        re_sep = ' *(?:(?:,\n)|[\n,]) *'  # does not isolate "`" but handles ",," correctly
        cfg_in['cols'] = re.split(re_sep, header)
        # re_fast = re.compile(u"(?:[ \n,]+[ \n]*|^)(`[^`]+`|[^`,\n ]*)", re.VERBOSE)
        # cfg_in['cols']= re_fast.findall(cfg_in['header'])
    elif 'cols' not in cfg_in:  # cols is built from header, specified explicitly, or defaulted
        warnings.warn("default 'cols' is deprecated, use init_input_cols({header: "
                      "'stime, latitude, longitude'}) instead", DeprecationWarning, 2)
        cfg_in['cols'] = ('stime', 'latitude', 'longitude')

    # default parameters that depend on ['cols']
    cols_load_b = np.ones(len(cfg_in['cols']), np.bool8)

    # assign data type of input columns
    b_was_no_dtype = 'dtype' not in cfg_in
    if b_was_no_dtype:
        cfg_in['dtype'] = np.array([np.float64] * len(cfg_in['cols']))
        # float32 gets truncation errors after the 6th significant digit (=> shows long numbers after the dot)
    elif isinstance(cfg_in['dtype'], str):
        cfg_in['dtype'] = np.array([np.dtype(cfg_in['dtype'])] * len(cfg_in['cols']))
    elif isinstance(cfg_in['dtype'], list):
        # prevent numpy array(list) from guessing a minimal dtype, because otherwise dtype would take the maximum memory of length dtype_text_max
        numpy_cur_dtype = np.min_scalar_type(cfg_in['dtype'])
        numpy_cur_dtype_len = numpy_cur_dtype.itemsize / np.dtype((numpy_cur_dtype.kind, 1)).itemsize
        cfg_in['dtype'] = np.array(cfg_in['dtype'], '|S{:.0f}'.format(
            max(len(dtype_text_max), numpy_cur_dtype_len)))

    for sCol, sDefault in (['coltime', 'Time'], ['coldate', 'Date']):
        if (sCol not in cfg_in):
            # if cfg['col(time/date)'] is not provided, try to find the 'Time'/'Date' column name
            if not (sDefault in cfg_in['cols']):
                sDefault = sDefault + '(text)'
            if not (sDefault in cfg_in['cols']):
                continue
            cfg_in[sCol] = cfg_in['cols'].index(sDefault)  # assign 'Time'/'Date' column index to cfg['col(time/date)']
        elif isinstance(cfg_in[sCol], str):
            cfg_in[sCol] = cfg_in['cols'].index(cfg_in[sCol])

    if cfg_in['converters']:
        if not isinstance(cfg_in['converters'], dict):
            # deferred evaluation: converters was given as a function, call it to build the dict
            cfg_in['converters'] = cfg_in['converters'](cfg_in)
        if b_was_no_dtype:
            # converters produce datetime64[ns] for coldate column (or coltime if no coldate):
            cfg_in['dtype'][cfg_in.get('coldate', cfg_in['coltime'])] = 'datetime64[ns]'

    # process format specifiers '(text)', '(float)', '(time)' and remove them from ['cols'];
    # also find unused columns specified by skipping the name between commas, as in 'col1,,,col4'
    for i, s in enumerate(cfg_in['cols']):
        if len(s) == 0:
            cols_load_b[i] = 0
        else:
            b_i_not_in_converters = (not (i in cfg_in['converters'].keys())) \
                if cfg_in['converters'] else True
            i_suffix = s.rfind('(text)')
            if i_suffix > 0:  # text
                cfg_in['cols'][i] = s[:i_suffix]
                if (cfg_in['dtype'][
                        i] == np.float64) and b_i_not_in_converters:  # reassign from default float64 to text
                    cfg_in['dtype'][i] = dtype_text_max
            else:
                i_suffix = s.rfind('(float)')
                if i_suffix > 0:  # float
                    cfg_in['cols'][i] = s[:i_suffix]
                    if b_i_not_in_converters:
                        # assign to default. Already done?
                        assert cfg_in['dtype'][i] == np.float64
                else:
                    i_suffix = s.rfind('(time)')
                    if i_suffix > 0:
                        cfg_in['cols'][i] = s[:i_suffix]
                        if (cfg_in['dtype'][i] == np.float64) and b_i_not_in_converters:
                            cfg_in['dtype'][i] = 'datetime64[ns]'  # np.str

    if cfg_in.get('cols_load'):
        cols_load_b &= np.isin(cfg_in['cols'], cfg_in['cols_load'])
    else:
        cfg_in['cols_load'] = np.array(cfg_in['cols'])[cols_load_b]
    # apply settings that further narrow the used columns
    if 'cols_not_use' in cfg_in:
        cols_load_in_used_b = np.isin(cfg_in['cols_load'], cfg_in['cols_not_use'], invert=True)
        if not np.all(cols_load_in_used_b):
            cfg_in['cols_load'] = cfg_in['cols_load'][cols_load_in_used_b]
            cols_load_b = np.isin(cfg_in['cols'], cfg_in['cols_load'])

    col_names_out = cfg_in['cols_load'].copy()
    # Convert ``cols_load`` to index (to be compatible with numpy loadtxt()), names will be in cfg_in['dtype'].names
    cfg_in['cols_load'] = np.int32([cfg_in['cols'].index(c) for c in cfg_in['cols_load'] if c in cfg_in['cols']])
    # not_cols_load = np.array([n in cfg_in['cols_not_use'] for n in cfg_in['cols']], np.bool)
    # cfg_in['cols_load']= np.logical_and(~not_cols_load, cfg_in['cols_load'])
    # cfg_in['cols']= np.array(cfg_in['cols'])[cfg_in['cols_load']]
    # cfg_in['dtype']=  cfg_in['dtype'][cfg_in['cols_load']]
    # cfg_in['cols_load']= np.flatnonzero(cfg_in['cols_load'])
    # cfg_in['dtype']= np.dtype({'names': cfg_in['cols'].tolist(), 'formats': cfg_in['dtype'].tolist()})

    cfg_in['cols'] = np.array(cfg_in['cols'])
    cfg_in['dtype_raw'] = np.dtype({'names': cfg_in['cols'],
                                    'formats': cfg_in['dtype'].tolist()})
    cfg_in['dtype'] = np.dtype({'names': cfg_in['cols'][cfg_in['cols_load']],
                                'formats': cfg_in['dtype'][cfg_in['cols_load']].tolist()})

    # Get index name for saving Pandas frame
    b_index_exist = 'coltime' in cfg_in
    if b_index_exist:
        set_field_if_no(cfg_in, 'col_index_name', cfg_in['cols'][cfg_in['coltime']])


    # Output columns mask
    if cfg_in['cols_loaded_save_b'] is None:
        # Mask of only the needed output columns (text columns are no longer needed after load)
        cfg_in['cols_loaded_save_b'] = np.logical_not(np.array(
            [cfg_in['dtype'].fields[n][0].char == 'S' for n in
             cfg_in['dtype'].names]))  # a.dtype will = cfg_in['dtype']

        if 'coldate' in cfg_in:
            cfg_in['cols_loaded_save_b'][
                cfg_in['dtype'].names.index(
                    cfg_in['cols'][cfg_in['coldate']])] = False
    else:  # list to array
        cfg_in['cols_loaded_save_b'] = np.bool8(cfg_in['cols_loaded_save_b'])

    # Exclude index from cols_loaded_save_b
    if b_index_exist and cfg_in['col_index_name']:
        cfg_in['cols_loaded_save_b'][cfg_in['dtype'].names.index(
            cfg_in['col_index_name'])] = False  # (must index be used separately?)

    # Output columns dtype
    col_names_out = np.array(col_names_out)[cfg_in['cols_loaded_save_b']].tolist() + cfg_in.get('cols_use', [])  # extra (calculated) output columns, if any
    cfg_in['dtype_out'] = np.dtype({
        'formats': [cfg_in['dtype'].fields[n][0] if n in cfg_in['dtype'].names else
                    np.dtype(np.float64) for n in col_names_out],
        'names': col_names_out})

    return cfg_in
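A minimal sketch of the header parsing done by init_input_cols: splitting with the same re_sep, masking out columns skipped between commas, and stripping the '(text)'/'(float)'/'(time)' format specifiers. The header below is made up for illustration; the real function additionally builds structured dtypes, applies converters and prepares the output masks:

import re
import numpy as np

re_sep = ' *(?:(?:,\n)|[\n,]) *'                 # same separator as in init_input_cols
header = 'Time(text),Pres,Temp,,counts(float)'   # hypothetical header with one skipped column

cols = re.split(re_sep, header)
cols_load_b = np.ones(len(cols), bool)           # mask of columns to load
dtype = [np.float64] * len(cols)                 # default type, as in init_input_cols

for i, s in enumerate(cols):
    if not s:                                    # ',,' marks an unused column
        cols_load_b[i] = False
        continue
    for suffix, dt in (('(text)', '|S2000'), ('(float)', np.float64), ('(time)', 'datetime64[ns]')):
        i_suffix = s.rfind(suffix)
        if i_suffix > 0:                         # strip the specifier and set the column type
            cols[i] = s[:i_suffix]
            dtype[i] = dt
            break

print(cols)         # ['Time', 'Pres', 'Temp', '', 'counts']
print(cols_load_b)  # [ True  True  True False  True]
print(dtype[0])     # |S2000: 'Time' became a text column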