Example #1
def test_cache():
    c = cachey.Cache(10000)
    cc = Cache(c)

    with cc:
        assert get({"x": (inc, 1)}, "x") == 2

    assert flag == [1]
    assert c.data["x"] == 2

    assert not cc.starttimes
    assert not cc.durations

    while flag:
        flag.pop()
    dsk = {"x": (inc, 1), "y": (inc, 2), "z": (add, "x", "y")}
    with cc:
        assert get(dsk, "z") == 5

    assert flag == [2]  # no x present

    assert not Callback.active
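
The test above leans on helpers from dask's own test suite (`inc`, `add`, `flag` and a scheduler `get`). A minimal, self-contained sketch of the same pattern, assuming only that dask and cachey are installed, could look like this:

import cachey
from dask.cache import Cache
from dask.threaded import get


def inc(x):
    return x + 1


c = cachey.Cache(10000)   # raw cachey cache with a 10 kB budget
cc = Cache(c)             # dask callback that feeds task results into it

with cc:                  # opportunistic caching only inside this block
    assert get({"x": (inc, 1)}, "x") == 2

assert c.data["x"] == 2   # the intermediate result was kept in the cache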
Example #3
"""
Dynamically load irregularly shaped images of ants and bees
"""

import numpy as np
from dask_image.imread import imread
from dask.cache import Cache
from napari import Viewer, gui_qt

cache = Cache(2e9)  # Leverage two gigabytes of memory
cache.register()

base_name = 'data/kaggle-nuclei/fixes/stage1_train/*'

images = imread(base_name + '/images/image_gray.tif')
labels = imread(base_name + '/labels/label.tif')

print(images.shape)

with gui_qt():
    # create an empty viewer
    viewer = Viewer()

    # add the images
    image_layer = viewer.add_image(images, name='nuclei', colormap='gray')
    labels_layer = viewer.add_labels(labels, name='labels', opacity=0.5)
Example #4
# In[ ]:

na = distributed_array.persist()
na

# In[ ]:

dir(na)

# In[ ]:

na = None

# In[ ]:

#tag::cache[]
from dask.cache import Cache

c = Cache(1e9)  # 1GB cache
# A local cache for the part of our code where we need a cache
with c:
    distributed_array.compute()

# Or global for any calls we make
c.register()
#end::cache[]

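
One detail the notebook leaves implicit: `register()` installs the cache for every later dask computation in the process. A hedged sketch of turning it off again, using the `unregister()` method that dask's callback classes provide, with a throwaway dask.array computation as the workload:

import dask.array as da
from dask.cache import Cache

c = Cache(1e9)  # 1 GB opportunistic cache
c.register()    # cache every dask computation from here on
try:
    da.ones((1000, 1000), chunks=(100, 100)).sum().compute()
finally:
    c.unregister()  # remove the callback once global caching is no longer wanted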
Example #5
    cat_width = 1  # Size of fixed-width string for representing categories
    columns = None
    cachesize = 9e9

    @property
    def parq_opts(self):
        return dict(file_scheme='hive',
                    has_nulls=(False if self.dftype == 'pandas' else 0),
                    write_index=False)


p = Parameters()

from dask.cache import Cache

Cache(p.cachesize).register()

filetypes_storing_categories = {'parq', 'castra'}


class Kwargs(dict):
    """Used to distinguish between dictionary argument values, and
    keyword-arguments.
    """
    pass


def benchmark(fn, args):
    """Benchmark when "fn" function gets called on "args" tuple.
    "args" may have a Kwargs instance at the end.
    """
Example #6
import xarray as xr
import os
import glob
import imp
import sys
import numpy as np
import pandas as pd
import datetime
import json
import time
import utm
###
# Experimental cache option to speed up dask calls
import cachey
from dask.cache import Cache
cache = Cache(10e9)
cache.register()
###
start_time = time.time()
# Hack to force datetimes to display in GMT/UTC (numpy 1.11.1 has fixed this but other dependent modules (pynio) can't handle numpy 1.11.1)
os.environ['TZ'] = 'GMT'
time.tzset()

# Load in config file
#######  load user configurable parameters here    #######
# Check user-defined configuration file
if len(sys.argv) == 1:
    raise ValueError(
        'requires one argument [configuration file] (i.e. python GRIB2_to_CHM_forcing.py forcing_config.py)'
    )
Example #7
"""Dask cache utilities.
"""
import collections.abc
import contextlib
from typing import Callable, ContextManager, Optional, Tuple

import dask
import dask.array as da
from dask.cache import Cache

#: dask.cache.Cache, optional : A dask cache for opportunistic caching
#: use :func:`~.resize_dask_cache` to actually register and resize.
#: this is a global cache (all layers will use it), but individual layers
#: can opt out using Layer(..., cache=False)
_DASK_CACHE = Cache(1)
_DEFAULT_MEM_FRACTION = 0.25


def resize_dask_cache(nbytes: Optional[int] = None,
                      mem_fraction: Optional[float] = None) -> Cache:
    """Create or resize the dask cache used for opportunistic caching.

    The cache object is an instance of a :class:`Cache` (which
    wraps a :class:`cachey.Cache`).

    See `Dask opportunistic caching
    <https://docs.dask.org/en/latest/caching.html>`_

    Parameters
    ----------
    nbytes : int, optional
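
The docstring is cut off here, but the signature alone supports a short usage sketch. Assuming the function behaves as the module comment above says (it registers and resizes the global `_DASK_CACHE`), calling it from within the same package might look like:

# resize the global cache to an explicit byte budget
cache = resize_dask_cache(nbytes=int(2e9))

# or size it as a fraction of total system memory
cache = resize_dask_cache(mem_fraction=0.25)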
Example #8
from to_pandas_hdf5.csv2h5 import main as csv2h5
from to_pandas_hdf5.csv_specific_proc import correct_kondrashov_txt, rep_in_file, correct_baranov_txt
from to_pandas_hdf5.h5_dask_pandas import h5q_interval2coord
from inclinometer.h5inclinometer_coef import h5copy_coef
import inclinometer.incl_h5clc as incl_h5clc
import inclinometer.incl_h5spectrum as incl_h5spectrum
import veuszPropagate
from utils_time import pd_period_to_timedelta
from utils2init import path_on_drive_d, init_logging, open_csv_or_archive_of_them, st

# l = logging.getLogger(__name__)
l = init_logging(logging, None, None, 'INFO')

if True:  # False. Experimental speedup but takes memory
    from dask.cache import Cache
    cache = Cache(2e9)  # Leverage two gigabytes of memory
    cache.register()  # Turn cache on globally
if False:  #  True:  # False:  #
    l.warning('using "synchronous" scheduler for debugging')
    import dask
    dask.config.set(scheduler='synchronous')

# Directory where inclinometer data will be stored
path_cruise = path_on_drive_d(
    r'd:\WorkData\BalticSea\200628_Pregolya,Lagoon-inclinometer')
r"""
d:\WorkData\BalticSea\200630_AI55\inclinometer
d:\WorkData\_experiment\inclinometer\200610_tank_ex[4,5,7,9,10,11][3,12,13,14,15,16,19]
d:\WorkData\BalticSea\200514_Pregolya,Lagoon-inclinometer
d:\WorkData\BalticSea\200317_Pregolya,Lagoon-inclinometer
d:\WorkData\BalticSea\191210_Pregolya,Lagoon-inclinometer
Example #9
import napari
import dask.array as da

# https://docs.dask.org/en/latest/caching.html
# Using a fixed-size cache [needs the cachey module, so 'pip install cachey' first, 12-03-2019]
from dask.cache import Cache

cache = Cache(8e9)  # Leverage eight gigabytes of memory
cache.register()  # Turn cache on globally

# Read the zarr images
memimage = da.from_zarr(
    'W:/SV3/RC_15-06-11/Dme_E2_His2AvRFP_spiderGFP_12-03_20150611_155054.corrected/Results/zarr/membrane.zarr'
)
nucimage = da.from_zarr(
    'W:/SV3/RC_15-06-11/Dme_E2_His2AvRFP_spiderGFP_12-03_20150611_155054.corrected/Results/zarr/nuclei.zarr'
)
print(type(memimage), memimage.shape, memimage.dtype)

# create Qt GUI context
with napari.gui_qt():
    # create a Viewer and add the images as layers
    viewer = napari.Viewer(axis_labels=['view', 't', 'z', 'y', 'x'])
    viewer.add_image(memimage,
                     scale=[1, 1, 8, 1, 1],
                     colormap='inferno',
                     blending='additive',
                     name='membrane',
                     is_pyramid=False,
                     rgb=False,
                     contrast_limits=[10, 255])
Example #10
def main(argv):
    global DEBUG, DD_FORCE_LOAD, DASK_CLIENT

    parser = argparse.ArgumentParser(epilog=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('filepath')
    parser.add_argument('dftype')
    parser.add_argument('base')
    parser.add_argument('x')
    parser.add_argument('y')
    parser.add_argument('categories', nargs='+')
    parser.add_argument('--debug', action='store_true', help='Enable increased verbosity and DEBUG messages')
    parser.add_argument('--cache', choices=('persist', 'cachey'), default=None, help='Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'.format(int(p.cachesize)))
    parser.add_argument('--distributed', action='store_true', help='Enable the distributed scheduler instead of the threaded, which is the default.')
    parser.add_argument('--recalc-ranges', action='store_true', help='Tell datashader to recalculate the ranges on each aggregation, instead of caching them (by default).')
    args = parser.parse_args(argv[1:])

    if args.cache is None:
        if args.debug:
            print("DEBUG: Cache disabled", flush=True)
    else:
        if args.cache == 'cachey':
            from dask.cache import Cache
            cache = Cache(p.cachesize)
            cache.register()
        elif args.cache == 'persist':
            DD_FORCE_LOAD = True

        if args.debug:
            print('DEBUG: Cache "{}" mode enabled'.format(args.cache), flush=True)

    if args.dftype == 'dask' and args.distributed:
        local_cluster = distributed.LocalCluster(n_workers=p.n_workers, threads_per_worker=1)
        DASK_CLIENT = distributed.Client(local_cluster)
        if args.debug:
            print('DEBUG: "distributed" scheduler is enabled')
    else:
        if args.dftype != 'dask' and args.distributed:
            raise ValueError('--distributed argument is only available with the dask dataframe type (not pandas)')
        if args.debug:
            print('DEBUG: "threaded" scheduler is enabled')

    filepath = args.filepath
    basename, extension = os.path.splitext(filepath)
    p.dftype      = args.dftype
    p.base        = args.base
    p.x           = args.x
    p.y           = args.y
    p.categories  = args.categories
    DEBUG = args.debug

    if DEBUG:
        print('DEBUG: Memory usage (before read):\t{} MB'.format(get_proc_mem()), flush=True)
    df,loadtime = timed_read(filepath, p.dftype)

    if df is None:
        if loadtime == -1:
            print("{:28} {:6}  Operation not supported".format(filepath, p.dftype), flush=True)
        return 1

    if DEBUG:
        print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem()), flush=True)

    img,aggtime1 = timed_agg(df,filepath,5,5,cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        mem_usage = df.memory_usage(deep=True)
        if p.dftype == 'dask':
            mem_usage = mem_usage.compute()
        print('DEBUG:', mem_usage, flush=True)
        mem_usage_total = mem_usage.sum()
        print('DEBUG: DataFrame size:\t\t\t{} MB'.format(mem_usage_total / 1e6), flush=True)
        for colname in df.columns:
            print('DEBUG: column "{}" dtype: {}'.format(colname, df[colname].dtype))
        print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem()), flush=True)

    img,aggtime2 = timed_agg(df,filepath,cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem()), flush=True)
    
    in_size  = get_size(filepath)
    out_size = get_size(filepath+".png")
    
    global_end = time.time()
    print("{:28} {:6}  Aggregate1:{:06.2f} ({:06.2f}+{:06.2f})  Aggregate2:{:06.2f}  In:{:011d}  Out:{:011d}  Total:{:06.2f}"\
          .format(filepath, p.dftype, loadtime+aggtime1, loadtime, aggtime1, aggtime2, in_size, out_size, global_end-global_start), flush=True)

    return 0
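
Reading the parser definitions back, here is a hedged example of driving this `main()` directly; every file and column name below is a placeholder, and `dftype` is 'pandas' or 'dask' judging by the checks later in the function:

# hypothetical arguments: filepath, dftype, base, x, y, categories..., then options
main(['benchmark.py', 'data/points.parq', 'dask', 'data', 'x', 'y', 'category',
      '--cache', 'cachey', '--debug'])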
Example #11
            fig, ax = plt.subplots(1, 1, figsize=(12, 12))
            ax.imshow(result, cmap="gray")
            plt.show(block=False)
            plt.waitforbuttonpress()


if __name__ == "__main__":
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(
        level="DEBUG", fmt="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S"
    )

    client = Client("10.109.20.6:8786")
    print(client)
    print()

    cache = Cache(2e9)
    cache.register()

    src_dir = "Y:/charm/20191015_Clarity_brain_HD91_RCCS_1"
    ds = MicroManagerV2Dataset(src_dir)

    print("== dataset inventory ==")
    print(ds.inventory)

    images, links = load_valid_tiles(ds, save=True)
    # calculate_link_shifts(ds, links)

    client.close()
Example #12
#futures_total = 0
class _COUNTER:
    COUNT=0
    def inc(self, *args, **kwargs):
        self.COUNT+=1
    def __call__(self):
        return self.COUNT

futures_total = _COUNTER()
futures_cache = deque()#maxlen=MAX_FUTURE_NUM)
# allow for 100,000 sinks for these (so we don't lose them)
futures_cache_sinks = deque()#maxlen=100000)

# assume all functions are pure globally
try:
    from dask.cache import Cache
    cache = Cache(1e6)
    cache.register()
except ImportError:
    print("Error cachey not available. Will not be caching")
    pass

# make everything pure by default
set_options(delayed_pure=True)

# TAU STUFF Profiler dictionaries
profile_dict = dict()

last_run_time = None
Example #13
import io, os, os.path, sys, time, shutil
import pandas as pd
import dask.dataframe as dd
import numpy as np
import datashader as ds
import bcolz
import feather
import fastparquet as fp

from datashader.utils import export_image
from datashader import transfer_functions as tf
from castra import Castra
from collections import OrderedDict
from dask.cache import Cache
Cache(9e9).register()

base, x, y = 'data', 'x', 'y'
dftype = 'pandas'
categories = []

filetypes_storing_categories = {'parq', 'castra'}

read = OrderedDict(csv={}, h5={}, castra={}, bcolz={}, parq={}, feather={})

read["csv"]["pandas"] = lambda filepath: pd.read_csv(filepath)
read["csv"]["dask"] = lambda filepath: dd.read_csv(filepath)
read["h5"]["dask"] = lambda filepath: dd.read_hdf(filepath, base)
read["h5"]["pandas"] = lambda filepath: pd.read_hdf(filepath, base)
read["castra"]["dask"] = lambda filepath: dd.from_castra(filepath)
read["bcolz"]["dask"] = lambda filepath: dd.from_bcolz(filepath,
Example #14
def test_cache_with_number():
    c = Cache(10000, limit=1)
    assert isinstance(c.cache, cachey.Cache)
    assert c.cache.available_bytes == 10000
    assert c.cache.limit == 1
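
The test above shows that a numeric first argument makes `Cache` construct a `cachey.Cache` internally and forward any extra arguments to it. A hedged sketch of that shortcut in application code:

from dask.cache import Cache

cache = Cache(5e8, limit=1)  # 500 MB budget; `limit` is forwarded to the underlying cachey.Cache
cache.register()             # use it for all subsequent dask computations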
Example #15
File: incl_load.py  Project: And0k/h5toGrid
def main(new_arg=None, **kwargs):
    """

    :param new_arg: list of strings, command line arguments
    :kwargs: dicts of dicts (one per ini section): specified values overwrite ini values
    """

    # global l
    cfg = cfg_from_args(my_argparser(), new_arg, **kwargs)
    if not cfg['program']:
        return  # usually error of unrecognized arguments displayed
    cfg['in']['db_coefs'] = Path(cfg['in']['db_coefs'])
    for path_field in ['db_coefs', 'path_cruise']:
        if not cfg['in'][path_field].is_absolute():
            cfg['in'][path_field] = (
                cfg['in']['cfgFile'].parent / cfg['in'][path_field]
            ).resolve().absolute()  # cfg['in']['cfgFile'].parent /

    def constant_factory(val):
        def default_val():
            return val

        return default_val

    for lim in ('min_date', 'max_date'):
        # convert keys to int because they must be comparable to probes_int_list (for command line arguments keys are always strings; in yaml you can set string or int)
        _ = {int(k): v for k, v in cfg['filter'][lim].items()}
        cfg['filter'][lim] = defaultdict(constant_factory(_.get(0)), _)

    l = init_logging(logging, None, None, 'INFO')
    #l = init_logging(logging, None, cfg['program']['log'], cfg['program']['verbose'])

    if True:  # False. Experimental speedup but takes memory
        from dask.cache import Cache
        cache = Cache(2e9)  # Leverage two gigabytes of memory
        cache.register()  # Turn cache on globally

    #if __debug__:
    # # because there was errors on debug when default scheduler used
    # cfg['program']['dask_scheduler'] = 'synchronous'

    if cfg['program']['dask_scheduler']:
        if cfg['program']['dask_scheduler'] == 'distributed':
            from dask.distributed import Client
            # cluster = dask.distributed.LocalCluster(n_workers=2, threads_per_worker=1, memory_limit="5.5Gb")
            client = Client(processes=False)
            # navigate to http://localhost:8787/status to see the diagnostic dashboard if you have Bokeh installed
            # processes=False: avoid inter-worker communication; computations release the GIL (numpy, da.array)  # without it there is an error
        else:
            if cfg['program']['dask_scheduler'] == 'synchronous':
                l.warning('using "synchronous" scheduler for debugging')
            import dask
            dask.config.set(scheduler=cfg['program']['dask_scheduler'])

    # Run steps :
    st.start = cfg['program']['step_start']
    st.end = cfg['program']['step_end']
    st.go = True

    if not cfg['out'][
            'db_name']:  # set name from 'path_cruise' name or its parent if it has digits at start; priority for the name is "*inclinometer*"
        for p in (lambda p: [p, p.parent])(cfg['in']['path_cruise']):
            m = re.match(r'(^[\d_]*).*', p.name)
            if m:
                break
        cfg['out']['db_name'] = f"{m.group(1).strip('_')}incl.h5"

    dir_incl = next((d for d in cfg['in']['path_cruise'].glob('*inclinometer*')
                     if d.is_dir()), cfg['in']['path_cruise'])
    db_path = dir_incl / '_raw' / cfg['out']['db_name']

    # ---------------------------------------------------------------------------------------------
    # def fs(probe, name):
    #     if 'w' in name.lower():  # Baranov's wavegauge electronic
    #         return 10  # 5
    #     return 5
    # if probe < 20 or probe in [23, 29, 30, 32, 33]:  # 30 [4, 11, 5, 12] + [1, 7, 13, 30]
    #     return 5
    # if probe in [21, 25, 26] + list(range(28, 35)):
    #     return 8.2
    # return 4.8

    def datetime64_str(time_str: Optional[str] = None) -> np.ndarray:
        """
        Reformat time_str to ISO 8601 or to 'NaT'. Used here for input to functions that convert str to numpy.datetime64
        :param time_str: May be 'NaT'
        :return: ndarray of strings (tested for 1 element only) formatted by numpy.
        """
        return np.datetime_as_string(np.datetime64(time_str, 's'))

    probes = cfg['in']['probes'] or range(
        1, 41)  # sets default range, specify your values before line ---
    raw_root, probe_is_incl = re.subn('INCL_?', 'INKL_',
                                      cfg['in']['probes_prefix'].upper())

    # some parameters that depends of probe type (indicated by probes_prefix)
    p_type = defaultdict(
        # baranov's format
        constant_factory({
            'correct_fun':
            partial(correct_txt,
                    mod_file_name=mod_incl_name,
                    sub_str_list=[
                        b'^\r?(?P<use>20\d{2}(\t\d{1,2}){5}(\t\d{5}){8}).*',
                        b'^.+'
                    ]),
            'fs':
            10,
            'format':
            'Baranov',
        }),
        {
            (lambda x: x if x.startswith('incl') else 'incl')(cfg['in']['probes_prefix']):
            {
                'correct_fun':
                partial(
                    correct_txt,
                    mod_file_name=mod_incl_name,
                    sub_str_list=[
                        b'^(?P<use>20\d{2}(,\d{1,2}){5}(,\-?\d{1,6}){6}(,\d{1,2}\.\d{2})(,\-?\d{1,3}\.\d{2})).*',
                        b'^.+'
                    ]),
                'fs':
                5,
                'format':
                'Kondrashov',
            },
            'voln': {
                'correct_fun':
                partial(
                    correct_txt,
                    mod_file_name=mod_incl_name,
                    sub_str_list=[
                        b'^(?P<use>20\d{2}(,\d{1,2}){5}(,\-?\d{1,8})(,\-?\d{1,2}\.\d{2}){2}).*',
                        b'^.+'
                    ]),
                'fs':
                5,
                #'tbl_prefix': 'w',
                'format':
                'Kondrashov',
            }
        })

    if st(1, 'Save inclinometer or wavegage data from ASCII to HDF5'):
        # Note: cannot find additional uncorrected files for the same probe if corrected ones already exist in the search path (move them out if needed)

        i_proc_probe = 0  # counter of processed probes
        i_proc_file = 0  # counter of processed files
        # pattern to identify only _probe_'s raw data files that need correction: '*INKL*{:0>2}*.[tT][xX][tT]'

        raw_parent = dir_incl / '_raw'  # raw_parent /=
        if cfg['in']['raw_subdir'] is None:
            cfg['in']['raw_subdir'] = ''

        dir_out = raw_parent / re.sub(r'[.\\/ *?]', '_',
                                      cfg['in']['raw_subdir'])

        # re.sub collapses multilevel subdirs to the single level that correct_fun() can create

        def dt_from_utc_2000(probe):
            """ Correct time of probes started without time setting. Raw date must start from  2000-01-01T00:00"""
            return (
                datetime(year=2000, month=1, day=1) -
                cfg['in']['time_start_utc'][probe]
            ) if cfg['in']['time_start_utc'].get(probe) else timedelta(0)

        # convert cfg['in']['dt_from_utc'] keys to int

        cfg['in']['dt_from_utc'] = {
            int(p): v
            for p, v in cfg['in']['dt_from_utc'].items()
        }
        # convert cfg['in']['t_start_utc'] to cfg['in']['dt_from_utc'] and keys to int
        cfg['in']['dt_from_utc'].update(    # overwriting the 'time_start_utc' where already exist
            {int(p): dt_from_utc_2000(p) for p, v in cfg['in']['time_start_utc'].items()}
            )
        # make cfg['in']['dt_from_utc'][0] be default value
        cfg['in']['dt_from_utc'] = defaultdict(
            constant_factory(cfg['in']['dt_from_utc'].pop(0, timedelta(0))),
            cfg['in']['dt_from_utc'])

        for probe in probes:
            raw_found = []
            raw_pattern_file = str(
                Path(glob.escape(cfg['in']['raw_subdir'])) /
                cfg['in']['raw_pattern'].format(prefix=raw_root, number=probe))
            correct_fun = p_type[cfg['in']['probes_prefix']]['correct_fun']
            # if not archive:
            if (not re.match(r'.*(\.zip|\.rar)$', cfg['in']['raw_subdir'],
                             re.IGNORECASE)) and raw_parent.is_dir():
                raw_found = list(raw_parent.glob(raw_pattern_file))
            if not raw_found:
                # Check if already have corrected files for probe generated by correct_txt(). If so then just use them
                raw_found = list(
                    dir_out.glob(
                        f"{cfg['in']['probes_prefix']}{probe:0>2}.txt"))
                if raw_found:
                    print('corrected csv file', [r.name for r in raw_found],
                          'found')
                    correct_fun = lambda x, dir_out: x
                elif not cfg['in']['raw_subdir']:
                    continue

            for file_in in (raw_found or open_csv_or_archive_of_them(
                    raw_parent, binary_mode=False, pattern=raw_pattern_file)):
                file_in = correct_fun(file_in, dir_out=dir_out)
                if not file_in:
                    continue
                tbl = file_in.stem  # f"{cfg['in']['probes_prefix']}{probe:0>2}"
                # tbl = re.sub('^((?P<i>inkl)|w)_0', lambda m: 'incl' if m.group('i') else 'w',  # correct name
                #              re.sub('^[\d_]*|\*', '', file_in.stem).lower()),  # remove date-prefix if in name
                csv2h5(
                    [
                        str(
                            Path(__file__).parent / 'ini' /
                            f"csv_{'inclin' if probe_is_incl else 'wavegage'}_{p_type[cfg['in']['probes_prefix']]['format']}.ini"
                        ),
                        '--path',
                        str(file_in),
                        '--blocksize_int',
                        '50_000_000',  # 50Mbt
                        '--table',
                        tbl,
                        '--db_path',
                        str(db_path),
                        # '--log', str(scripts_path / 'log/csv2h5_inclin_Kondrashov.log'),
                        # '--b_raise_on_err', '0',  # ?
                        '--b_interact',
                        '0',
                        '--fs_float',
                        str(p_type[cfg['in']['probes_prefix']]
                            ['fs']),  #f'{fs(probe, file_in.stem)}',
                        '--dt_from_utc_seconds',
                        str(cfg['in']['dt_from_utc'][probe].total_seconds()),
                        '--b_del_temp_db',
                        '1',
                    ] +
                    (['--csv_specific_param_dict', 'invert_magnitometr: True']
                     if probe_is_incl else []),
                    **{
                        'filter': {
                            'min_date':
                            cfg['filter']['min_date'].get(
                                probe, np.datetime64(0, 'ns')),
                            'max_date':
                            cfg['filter']['max_date'].get(
                                probe, np.datetime64('now', 'ns')
                            ),  # simple 'now' works in synchronous mode
                        }
                    })

                # Get coefs:
                l.info(
                    f"Adding coefficients to {db_path}/{tbl} from {cfg['in']['db_coefs']}"
                )
                try:
                    h5copy_coef(cfg['in']['db_coefs'], db_path, tbl)
                except KeyError as e:  # Unable to open object (component not found)
                    l.warning(
                        'No coefs to copy?'
                    )  # write some dummy coefficients so that Veusz patterns can be loaded:
                    h5copy_coef(None,
                                db_path,
                                tbl,
                                dict_matrices=dict_matrices_for_h5(tbl=tbl))
                except OSError as e:
                    l.warning(
                        'Not found DB with coefs?'
                    )  # write some dummy coefficients so that Veusz patterns can be loaded:
                    h5copy_coef(None,
                                db_path,
                                tbl,
                                dict_matrices=dict_matrices_for_h5(tbl=tbl))
                i_proc_file += 1
            else:
                print('no', raw_pattern_file, end=', ')
            i_proc_probe += 1
        print('Ok:', i_proc_probe, 'probes,', i_proc_file, 'files processed.')

    if st(2, 'Calculate physical parameters and average'):
        kwarg = {
            'in': {
                'min_date': cfg['filter']['min_date'][0],
                'max_date': cfg['filter']['max_date'][0],
                'time_range_zeroing': cfg['in']['time_range_zeroing']
            },
            'proc': {}
        }
        # if aggregate_period_s is None then do not average and write to *_proc_noAvg.h5, else load from that h5 and write to _proc.h5
        if not cfg['out']['aggregate_period_s']:
            cfg['out']['aggregate_period_s'] = [
                None, 2, 600, 7200 if probe_is_incl else 3600
            ]

        if cfg['in']['azimuth_add']:
            if 'Lat' in cfg['in']['azimuth_add']:
                # add magnetic declination,° for used coordinates
                # todo: get time
                kwarg['proc']['azimuth_add'] = mag_dec(
                    cfg['in']['azimuth_add']['Lat'],
                    cfg['in']['azimuth_add']['Lon'],
                    datetime(2020, 9, 10),
                    depth=-1)
            else:
                kwarg['proc']['azimuth_add'] = 0
            if 'constant' in cfg['in']['azimuth_add']:
                # and add a constant. For example, subtract the declination at the calibration place if it was applied
                kwarg['proc']['azimuth_add'] += cfg['in']['azimuth_add'][
                    'constant']  # add -6.656 to account for calibration in Kaliningrad (mag deg = 6.656°)

        for aggregate_period_s in cfg['out']['aggregate_period_s']:
            if aggregate_period_s is None:
                db_path_in = db_path
                db_path_out = dir_incl / f'{db_path.stem}_proc_noAvg.h5'
            else:
                db_path_in = dir_incl / f'{db_path.stem}_proc_noAvg.h5'
                db_path_out = dir_incl / f'{db_path.stem}_proc.h5'  # or separately: '_proc{aggregate_period_s}.h5'

            # 'incl.*|w\d*'  inclinometers or wavegauges w\d\d # 'incl09':
            tables_list_regex = f"{cfg['in']['probes_prefix'].replace('voln', 'w')}.*"
            if cfg['in']['probes']:
                tables_list_regex += "(?:{})".format('|'.join(
                    '{:0>2}'.format(p) for p in cfg['in']['probes']))

            args = [
                '../../empty.yml',  # all settings are here, so to avoid printing 'using default configuration' we use some existing empty file
                '--db_path',
                str(db_path_in),
                '--tables_list',
                tables_list_regex,
                '--aggregate_period',
                f'{aggregate_period_s}S' if aggregate_period_s else '',
                '--out.db_path',
                str(db_path_out),
                '--table',
                f'V_incl_bin{aggregate_period_s}'
                if aggregate_period_s else 'V_incl',
                '--verbose',
                'INFO',  #'DEBUG' get many numba messages
                '--b_del_temp_db',
                '1',
                # '--calc_version', 'polynom(force)',  # deprecated
                # '--chunksize', '20000',
                # '--not_joined_h5_path', f'{db_path.stem}_proc.h5',
            ]

            if aggregate_period_s is None:  # proc. parameters (if we have saved proc. data then when aggregating we are not processing)
                # Note: for Baranov's prog 4096 is not suited:
                args += ([
                    '--max_dict',
                    'M[xyz]:4096',
                    # '--time_range_zeroing_dict', "incl19: '2019-11-10T13:00:00', '2019-11-10T14:00:00'\n,"  # not works - use kwarg
                    # '--time_range_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
                    '--split_period',
                    '1D'
                ] if probe_is_incl else [
                    '--bad_p_at_bursts_starts_peroiod',
                    '1H',
                ])
                # csv split by 1 day (default when not averaging), else csv is monolithic
            if aggregate_period_s not in cfg['out'][
                    'aggregate_period_s_not_to_text']:  # , 300, 600]:
                args += ['--text_path', str(dir_incl / 'text_output')]
            # If need all data to be combined one after one:
            # set_field_if_no(kwarg, 'in', {})
            # kwarg['in'].update({
            #
            #         'tables': [f'incl{i:0>2}' for i in min_date.keys() if i!=0],
            #         'dates_min': min_date.values(),  # in table list order
            #         'dates_max': max_date.values(),  #
            #         })
            # set_field_if_no(kwarg, 'out', {})
            # kwarg['out'].update({'b_all_to_one_col': 'True'})

            incl_h5clc.main(args, **kwarg)

    if st(3, 'Calculate spectrograms'):  # Can be done at any time after step 1
        min_Pressure = 7

        # add dict dates_min like {probe: parameter} of incl_clc to be able to specify a parameter per probe
        def raise_ni():
            raise NotImplementedError(
                'Can not proc probes having different fs in one run: you need to do it separately'
            )

        args = [
            Path(incl_h5clc.__file__).with_name(
                f'incl_h5spectrum{db_path.stem}.yaml'),
            # if no such file all settings are here
            '--db_path',
            str(dir_incl / f'{db_path.stem}_proc_noAvg.h5'),
            '--tables_list',
            f"{cfg['in']['probes_prefix']}.*",  # inclinometers or wavegauges w\d\d  ## 'w02', 'incl.*',
            # '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '',
            '--min_date',
            datetime64_str(cfg['filter']['min_date'][0]),
            '--max_date',
            datetime64_str(cfg['filter']['max_date']
                           [0]),  # '2019-09-09T16:31:00',  #17:00:00
            '--min_Pressure',
            f'{min_Pressure}',
            # '--max_dict', 'M[xyz]:4096',  # use if db_path is not ends with _proc_noAvg.h5 i.e. need calc velocity
            '--out.db_path',
            f"{db_path.stem.replace('incl', cfg['in']['probes_prefix'])}_proc_psd.h5",
            # '--table', f'psd{aggregate_period_s}' if aggregate_period_s else 'psd',
            '--fs_float',
            str(p_type[cfg['in']['probes_prefix']]
                ['fs']),  # f"{fs(probes[0], cfg['in']['probes_prefix'])}",
            # (lambda x: x == x[0])(np.vectorize(fs)(probes, prefix))).all() else raise_ni()
            #
            # '--time_range_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
            # '--verbose', 'DEBUG',
            # '--chunksize', '20000',
            '--b_interact',
            '0',
        ]
        if probe_is_incl:
            args += [
                '--split_period',
                '2H',
                '--fmin',
                '0.0004',  #0.0004
                '--fmax',
                '1.05'
            ]
        else:
            args += [
                '--split_period',
                '1H',
                '--dt_interval_minutes',
                '15',  # set this if burst mode to the burst interval
                '--fmin',
                '0.0001',
                '--fmax',
                '4',
                #'--min_Pressure', '-1e15',  # to not load NaNs
            ]

        incl_h5spectrum.main(args)

    if st(4, 'Draw in Veusz'):
        pattern_path = dir_incl / r'processed_h5,vsz/201202-210326incl_proc#28.vsz'
        # r'\201202_1445incl_proc#03_pattern.vsz'  #'
        # db_path.parent / r'vsz_5min\191119_0000_5m_incl19.vsz'  # r'vsz_5min\191126_0000_5m_w02.vsz'

        b_images_only = False
        # importing in vsz index slices replacing:
        pattern_str_slice_old = None

        # Length of not adjacent intervals, s (set None to not allow)
        # pandas interval in string or tuple representation '1D' of period between intervals and interval to draw
        period_str = '0s'  # '1D'  #  dt
        dt_str = '0s'  # '5m'
        file_intervals = None

        period = to_offset(period_str).delta
        dt = to_offset(dt_str).delta  # timedelta(0)  #  60 * 5

        if file_intervals and period and dt:

            # Load starts and assign ends
            t_intervals_start = pd.read_csv(
                cfg['in']['path_cruise'] /
                r'vsz+h5_proc\intervals_selected.txt',
                converters={
                    'time_start': lambda x: np.datetime64(x, 'ns')
                },
                index_col=0).index
            edges = (pd.DatetimeIndex(t_intervals_start),
                     pd.DatetimeIndex(t_intervals_start + dt_custom_s)
                     )  # np.zeros_like()
        elif period and dt:
            # Generate periodic intervals
            t_interval_start, t_intervals_end = intervals_from_period(
                datetime_range=np.array(
                    [
                        cfg['filter']['min_date']['0'],
                        cfg['filter']['max_date']['0']
                    ],
                    # ['2018-08-11T18:00:00', '2018-09-06T00:00:00'],
                    # ['2019-02-11T13:05:00', '2019-03-07T11:30:00'],
                    # ['2018-11-16T15:19', '2018-12-14T14:35'],
                    # ['2018-10-22T12:30', '2018-10-27T06:30:00'],
                    'datetime64[s]'),
                period=period)
            edges = (pd.DatetimeIndex([t_interval_start
                                       ]).append(t_intervals_end[:-1]),
                     pd.DatetimeIndex(t_intervals_end))
        else:  # [min, max] edges for each probe
            edges_dict = {
                pr:
                [cfg['filter']['min_date'][pr], cfg['filter']['max_date'][pr]]
                for pr in probes
            }

        cfg_vp = {'veusze': None}
        for i, probe in enumerate(probes):
            # cfg_vp = {'veusze': None}
            if edges_dict:  # custom edges for each probe
                edges = [pd.DatetimeIndex([t]) for t in edges_dict[probe]]

            # substring in file to replace probe_name_in_pattern (see below).
            probe_name = f"_{cfg['in']['probes_prefix'].replace('incl', 'i')}{probe:02}"
            tbl = None  # f"/{cfg['in']['probes_prefix']}{probe:02}"  # to check probe data exist in db else will not check
            l.info('Draw %s in Veusz: %d intervals...', probe_name,
                   edges[0].size)
            # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1):

            for i_interval, (t_interval_start,
                             t_interval_end) in enumerate(zip(*edges),
                                                          start=1):

                # if i_interval < 23: #<= 0:  # TEMPORARY Skip this number of intervals
                #     continue
                if period and period != dt:
                    t_interval_start = t_interval_end - pd.Timedelta(
                        dt_custom_s, 's')

                if tbl:
                    try:  # skipping absent probes
                        start_end = h5q_interval2coord(
                            db_path=str(db_path),
                            table=tbl,
                            t_interval=(t_interval_start, t_interval_end))
                        if not len(start_end):
                            break  # no data
                    except KeyError:
                        break  # device name not in specified range, go to next name

                pattern_path_new = pattern_path.with_name(''.join([
                    f'{t_interval_start:%y%m%d_%H%M}',
                    f'_{dt_str}' if dt else '', f'{probe_name}.vsz'
                ]))

                # Modify pattern file
                if not b_images_only:
                    pattern_type, pattern_number = re.match(
                        r'.*(incl|w)_proc?#?(\d*).*',
                        pattern_path.name).groups()
                    probe_name_in_pattern = f"_{pattern_type.replace('incl', 'i')}{pattern_number}"

                    def f_replace(line):
                        """
                        Replace in file
                        1. probe name
                        2. slice
                        """
                        # if i_interval == 1:
                        line, ok = re.subn(probe_name_in_pattern, probe_name,
                                           line)
                        if ok and pattern_str_slice_old:  # can be only in same line
                            str_slice = '(({:d}, {:d}, None),)'.format(
                                *(start_end +
                                  np.int32([-1, 1])))  # bytes(, 'ascii')
                            line = re.sub(pattern_str_slice_old, str_slice,
                                          line)
                        return line

                    if not rep_in_file(pattern_path,
                                       pattern_path_new,
                                       f_replace=f_replace,
                                       binary_mode=False):
                        l.warning('Veusz pattern not changed!'
                                  )  # may be ok if we need draw pattern
                        # break
                    elif cfg_vp['veusze']:
                        cfg_vp['veusze'].Load(str(pattern_path_new))
                elif cfg_vp['veusze']:
                    cfg_vp['veusze'].Load(str(pattern_path_new))

                txt_time_range = \
                    """
                    "[['{:%Y-%m-%dT%H:%M}', '{:%Y-%m-%dT%H:%M}']]" \
                    """.format(t_interval_start, t_interval_end)
                print(f'{i_interval}. {txt_time_range}', end=' ')

                cfg_vp = veuszPropagate.main(
                    [
                        Path(veuszPropagate.__file__).parent.with_name(
                            'veuszPropagate.ini'),
                        # '--data_yield_prefix', '-',

                        # '--path', str(db_path),  # if custom loading from db and some source is required
                        '--tables_list',
                        '',  # switches to search vsz-files only # f'/{probe_name}',  # 181022inclinometers/ \d*
                        '--pattern_path',
                        str(pattern_path_new),
                        # fr'd:\workData\BalticSea\190801inclinometer_Schuka\{probe_name}_190807_1D.vsz',
                        # str(dir_incl / f'{probe_name}_190211.vsz'), #warning: create file with small name
                        # '--before_next', 'restore_config',
                        # '--add_to_filename', f"_{t_interval_start:%y%m%d_%H%M}_{dt}",
                        '--filename_fun',
                        f'lambda tbl: "{pattern_path_new.name}"',
                        '--add_custom_list',
                        f'USEtime__',  # f'USEtime{probe_name}', nAveragePrefer',
                        '--add_custom_expressions_list',
                        txt_time_range,
                        # + """
                        # ", 5"
                        # """,
                        '--b_update_existed',
                        'True',
                        '--export_pages_int_list',
                        '0',  # 0 for all '6, 7, 8',  #'1, 2, 3'
                        # '--export_dpi_int', '200',
                        '--export_format',
                        'jpg',  #'emf',
                        '--b_interact',
                        '0',
                        '--b_images_only',
                        f'{b_images_only}',
                        '--return',
                        '<embedded_object>',  # reuse to not bloat memory
                        '--b_execute_vsz',
                        'True',
                        '--before_next',
                        'Close()'  # Close() need if b_execute_vsz many files
                    ],
                    veusze=cfg_vp['veusze'])

    if st(40, f'Draw in Veusz by loader-drawer.vsz method'):
        # save all vsz files that uses separate code

        from os import chdir as os_chdir
        dt_s = 300
        cfg['in'][
            'pattern_path'] = db_path.parent / f'vsz_{dt_s:d}s' / '~pattern~.vsz'

        time_starts = pd.read_csv(
            db_path.parent / r'processed_h5,vsz' / 'intervals_selected.txt',
            index_col=0,
            parse_dates=True,
            date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%S'
                                                 )).index

        pattern_code = cfg['in']['pattern_path'].read_bytes(
        )  # encoding='utf-8'
        path_vsz_all = []
        for i, probe in enumerate(probes):
            probe_name = f"{cfg['in']['probes_prefix']}{probe:02}"  # table name in db
            l.info('Draw %s in Veusz: %d intervals...', probe_name,
                   time_starts.size)
            for i_interval, time_start in enumerate(time_starts, start=1):
                path_vsz = cfg['in']['pattern_path'].with_name(
                    f"{time_start:%y%m%d_%H%M}_{probe_name.replace('incl','i')}.vsz"
                )
                # copy file to path_vsz
                path_vsz.write_bytes(pattern_code)  # replaces 1st row
                path_vsz_all.append(path_vsz)

        os_chdir(cfg['in']['pattern_path'].parent)
        veuszPropagate.main(
            [
                'ini/veuszPropagate.ini',
                '--path',
                str(cfg['in']['pattern_path'].with_name(
                    '??????_????_*.vsz')),  # db_path),
                '--pattern_path',
                f"{cfg['in']['pattern_path']}_",
                # here used to auto get export dir only. may not be _not existed file path_ if ['out']['paths'] is provided
                # '--table_log', f'/{device}/logRuns',
                # '--add_custom_list', f'{device_veusz_prefix}USE_time_search_runs',  # 'i3_USE_timeRange',
                # '--add_custom_expressions',
                # """'[["{log_row[Index]:%Y-%m-%dT%H:%M:%S}", "{log_row[DateEnd]:%Y-%m-%dT%H:%M:%S}"]]'""",
                # '--export_pages_int_list', '1', #'--b_images_only', 'True'
                '--b_interact',
                '0',
                '--b_update_existed',
                'True',  # todo: delete_overlapped
                '--b_images_only',
                'True',
                '--load_timeout_s_float',
                str(cfg['program']['load_timeout_s'])
                # '--min_time', '2020-07-08T03:35:00',
            ],
            **{'out': {
                'paths': path_vsz_all
            }})

    if st(50, 'Export from existed Veusz files in dir'):
        pattern_parent = db_path.parent  # r'vsz_5min\191126_0000_5m_w02.vsz''
        pattern_path = str(pattern_parent / r'processed_h5,vsz' /
                           '??????incl_proc#[1-9][0-9].vsz')  # [0-2,6-9]
        veuszPropagate.main([
            'ini/veuszPropagate.ini',
            '--path',
            pattern_path,
            '--pattern_path',
            pattern_path,
            # '--export_pages_int_list', '1', #'--b_images_only', 'True'
            '--b_interact',
            '0',
            '--b_update_existed',
            'True',  # todo: delete_overlapped
            '--b_images_only',
            'True',
            '--load_timeout_s_float',
            str(cfg['program']['load_timeout_s']),
            '--b_execute_vsz',
            'True',
            '--before_next',
            'Close()'  # Close() need if b_execute_vsz many files
        ])
Example #16
def main(new_arg=None, **kwargs):
    """

    :param new_arg: list of strings, command line arguments
    :kwargs: dicts of dicts (one per ini section): specified values overwrite ini values
    """

    # global l
    cfg = cfg_from_args(my_argparser(), new_arg, **kwargs)
    cfg['in']['db_coefs'] = Path(cfg['in']['db_coefs'])
    for path_field in ['db_coefs', 'path_cruise']:
        if not cfg['in'][path_field].is_absolute():
            cfg['in'][path_field] = (
                cfg['in']['cfgFile'].parent / cfg['in'][path_field]
            ).resolve().absolute()  # cfg['in']['cfgFile'].parent /

    def constant_factory(val):
        def default_val():
            return val

        return default_val

    for lim in ('min_date', 'max_date'):
        cfg['filter'][lim] = defaultdict(
            constant_factory(cfg['filter'][lim].get(
                '0', cfg['filter'][lim].get(0))), cfg['filter'][lim])

    l = init_logging(logging, None, None, 'INFO')
    #l = init_logging(logging, None, cfg['program']['log'], cfg['program']['verbose'])

    if True:  # False. Experimental speedup but takes memory
        from dask.cache import Cache
        cache = Cache(2e9)  # Leverage two gigabytes of memory
        cache.register()  # Turn cache on globally
    if cfg['program']['dask_scheduler']:
        if cfg['program']['dask_scheduler'] == 'distributed':
            from dask.distributed import Client
            client = Client(
                processes=False
            )  # navigate to http://localhost:8787/status to see the diagnostic dashboard if you have Bokeh installed
            # processes=False: avoid inter-worker communication; computations release the GIL (numpy, da.array)  # without it there is an error
        else:
            if cfg['program']['dask_scheduler'] == 'synchronous':
                l.warning('using "synchronous" scheduler for debugging')
            import dask
            dask.config.set(scheduler=cfg['program']['dask_scheduler'])

    # Run steps :
    st.start = cfg['program']['step_start']
    st.end = cfg['program']['step_end']
    st.go = True

    if not cfg['out'][
            'db_name']:  # set name from 'path_cruise' name or its parent if it has digits at start; priority for the name is "*inclinometer*"
        for p in (lambda p: [p, p.parent])(cfg['in']['path_cruise']):
            m = re.match(r'(^[\d_]*).*', p.name)
            if m:
                break
        cfg['out']['db_name'] = f"{m.group(1).strip('_')}incl.h5"
    cfg['in']['path_cruise'].glob('*inclinometer*')
    dir_incl = next((d for d in cfg['in']['path_cruise'].glob('*inclinometer*')
                     if d.is_dir()), cfg['in']['path_cruise'])
    db_path = dir_incl / cfg['out']['db_name']

    # ---------------------------------------------------------------------------------------------
    def fs(probe, name):
        return 5
        # if 'w' in name.lower():  # Baranov's wavegauge electronic
        #     return 5  # 10
        # if probe < 20 or probe in [23, 29, 30, 32, 33]:  # 30 [4, 11, 5, 12] + [1, 7, 13, 30]
        #     return 5
        # if probe in [21, 25, 26] + list(range(28, 35)):
        #     return 8.2
        # return 4.8

    def datetime64_str(time_str: Optional[str] = None) -> np.ndarray:
        """
        Reformat time_str to ISO 8601 or to 'NaT'. Used here for input to functions that convert str to numpy.datetime64
        :param time_str: May be 'NaT'
        :return: ndarray of strings (tested for 1 element only) formatted by numpy.
        """
        return np.datetime_as_string(np.datetime64(time_str, 's'))

    probes = cfg['in']['probes'] or range(
        1, 41)  # sets default range, specify your values before line ---
    raw_root, subs_made = re.subn('INCL_?', 'INKL_',
                                  cfg['in']['probes_prefix'].upper())
    if st(
            1
    ):  # cannot find additional uncorrected files for the same probe if corrected ones already exist in the search path (move them out if needed)
        i_proc_probe = 0  # counter of processed probes
        i_proc_file = 0  # counter of processed files
        # pattern to identify only _probe_'s raw data files that need correction: '*INKL*{:0>2}*.[tT][xX][tT]'

        raw_parent = dir_incl / '_raw'
        dir_out = raw_parent / re.sub(
            r'[.\\/ ]', '_', cfg['in']['raw_subdir']
        )  # re.sub collapses multilevel subdirs to the single level that correct_fun() can create
        raw_parent /= cfg['in']['raw_subdir']
        for probe in probes:
            raw_found = []
            raw_pattern_file = cfg['in']['raw_pattern'].format(prefix=raw_root,
                                                               number=probe)
            correct_fun = partial(
                correct_kondrashov_txt if subs_made else correct_baranov_txt,
                dir_out=dir_out)
            # if not archive:
            if (not '.zip' in cfg['in']['raw_subdir'].lower() and not '.rar'
                    in cfg['in']['raw_subdir'].lower()) or raw_parent.is_dir():
                raw_found = list(raw_parent.glob(raw_pattern_file))
            if not raw_found:
                # Check if already have corrected files for probe generated by correct_kondrashov_txt(). If so then just use them
                raw_found = list(
                    raw_parent.glob(
                        f"{cfg['in']['probes_prefix']}{probe:0>2}.txt"))
                if raw_found:
                    print('corrected csv file', [r.name for r in raw_found],
                          'found')
                    correct_fun = lambda x: x
                elif not cfg['in']['raw_subdir']:
                    continue

            for file_in in (raw_found or open_csv_or_archive_of_them(
                    raw_parent, binary_mode=False, pattern=raw_pattern_file)):
                file_in = correct_fun(file_in)
                if not file_in:
                    continue
                tbl = f"{cfg['in']['probes_prefix']}{probe:0>2}"
                # tbl = re.sub('^((?P<i>inkl)|w)_0', lambda m: 'incl' if m.group('i') else 'w',  # correct name
                #              re.sub('^[\d_]*|\*', '', file_in.stem).lower()),  # remove date-prefix if in name
                csv2h5(
                    [
                        str(
                            Path(__file__).parent / 'ini' /
                            f"csv_inclin_{'Kondrashov' if subs_made else 'Baranov'}.ini"
                        ),
                        '--path',
                        str(file_in),
                        '--blocksize_int',
                        '50_000_000',  # 50Mbt
                        '--table',
                        tbl,
                        '--db_path',
                        str(db_path),
                        # '--log', str(scripts_path / 'log/csv2h5_inclin_Kondrashov.log'),
                        # '--b_raise_on_err', '0',  # ?
                        '--b_interact',
                        '0',
                        '--fs_float',
                        f'{fs(probe, file_in.stem)}',
                        '--dt_from_utc_seconds',
                        str(cfg['in']['dt_from_utc'].total_seconds()),
                        '--b_del_temp_db',
                        '1',
                    ] +
                    (['--csv_specific_param_dict', 'invert_magnitometr: True']
                     if subs_made else
                     ['--cols_load_list', "yyyy,mm,dd,HH,MM,SS,P,U"]),
                    **{
                        'filter': {
                            'min_date': cfg['filter']['min_date'][probe],
                            'max_date': cfg['filter']['max_date'][probe],
                        }
                    })

                # Get coefs:
                l.info(
                    f"Adding coefficients to {db_path}/{tbl} from {cfg['in']['db_coefs']}"
                )
                try:
                    h5copy_coef(cfg['in']['db_coefs'], db_path, tbl)
                except KeyError as e:  # Unable to open object (component not found)
                    l.warning(
                        'No coefs to copy?'
                    )  # write some dummy coefficients so that Veusz patterns can be loaded:
                    h5copy_coef(None,
                                db_path,
                                tbl,
                                dict_matrices=dict_matrices_for_h5(tbl=tbl))
                except OSError as e:
                    l.warning(
                        'Not found DB with coefs?'
                    )  # write some dummy coefficients so that Veusz patterns can be loaded:
                    h5copy_coef(None,
                                db_path,
                                tbl,
                                dict_matrices=dict_matrices_for_h5(tbl=tbl))
                i_proc_file += 1
            else:
                print('no', raw_pattern_file, end=', ')
            i_proc_probe += 1
        print('Ok:', i_proc_probe, 'probes,', i_proc_file, 'files processed.')

    # Calculate velocity and average
    if st(2):
        # if aggregate_period_s is None then do not average and write to *_proc_noAvg.h5, else load from that h5 and write to _proc.h5
        if not cfg['out']['aggregate_period_s']:
            cfg['out']['aggregate_period_s'] = [
                None, 2, 600,
                3600 if 'w' in cfg['in']['probes_prefix'] else 7200
            ]

        if cfg['in']['azimuth_add']:
            if 'Lat' in cfg['in']['azimuth_add']:
                from datetime import datetime
                # add magnetic declination,° for used coordinates
                # todo: get time
                azimuth_add = mag_dec(cfg['in']['azimuth_add']['Lat'],
                                      cfg['in']['azimuth_add']['Lon'],
                                      datetime(2020, 9, 10),
                                      depth=-1)
            else:
                azimuth_add = 0
            if 'constant' in cfg['in']['azimuth_add']:
                # and add a constant. For example, subtract the declination at the calibration place if it was applied
                azimuth_add += cfg['in']['azimuth_add'][
                    'constant']  # add -6.65644183° to account for calibration in Kaliningrad
        for aggregate_period_s in cfg['out']['aggregate_period_s']:
            if aggregate_period_s is None:
                db_path_in = db_path
                db_path_out = db_path.with_name(
                    f'{db_path.stem}_proc_noAvg.h5')
            else:
                db_path_in = db_path.with_name(f'{db_path.stem}_proc_noAvg.h5')
                db_path_out = f'{db_path.stem}_proc.h5'  # or separately: '_proc{aggregate_period_s}.h5'

            args = [
                Path(incl_h5clc.__file__).with_name(
                    f'incl_h5clc_{db_path.stem}.yaml'),
                # if no such file all settings are here
                '--db_path',
                str(db_path_in),
                # !   'incl.*|w\d*' matches inclinometers or wavegauges w\d\d; e.g. 'incl09':
                '--tables_list',
                'incl.*' if not cfg['in']['probes'] else
                f"incl.*(?:{'|'.join('{:0>2}'.format(p) for p in cfg['in']['probes'])})",
                '--aggregate_period',
                f'{aggregate_period_s}S' if aggregate_period_s else '',
                '--out.db_path',
                str(db_path_out),
                '--table',
                f'V_incl_bin{aggregate_period_s}'
                if aggregate_period_s else 'V_incl',
                '--verbose',
                'INFO',  # 'DEBUG' produces many numba messages
                '--b_del_temp_db',
                '1',
                # '--calc_version', 'polynom(force)',  # deprecated
                # '--chunksize', '20000',
                # '--not_joined_h5_path', f'{db_path.stem}_proc.h5',
            ]
            # if aggregate_period_s <= 5:   # [s], no need to split csv for big averaging intervals
            #     args += (['--split_period', '1D'])
            if aggregate_period_s is None:  # processing parameters (aggregation of already processed data needs no reprocessing)
                args += ([
                    '--max_dict',
                    'M[xyz]:4096',
                    # Note: 4096 is not suitable for Baranov's program
                    # '--timerange_zeroing_dict', "incl19: '2019-11-10T13:00:00', '2019-11-10T14:00:00'\n,"  # does not work - use kwarg
                    # '--timerange_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
                    '--split_period',
                    '1D'
                ] if subs_made else [
                    '--bad_p_at_bursts_starts_peroiod',
                    '1H',
                ])
            # csv split by 1 day (default for no averaging) and one monolithic csv when aggregate_period_s == 600
            if aggregate_period_s not in cfg['out'][
                    'aggregate_period_s_not_to_text']:  # , 300, 600]:
                args += ['--text_path', str(db_path.parent / 'text_output')]
            kwarg = {
                'in': {
                    'min_date': cfg['filter']['min_date'][0],
                    'max_date': cfg['filter']['max_date'][0],
                    'timerange_zeroing': cfg['in']['timerange_zeroing'],
                    'azimuth_add': azimuth_add
                }
            }
            # If need all data to be combined one after one:
            # set_field_if_no(kwarg, 'in', {})
            # kwarg['in'].update({
            #
            #         'tables': [f'incl{i:0>2}' for i in min_date.keys() if i!=0],
            #         'dates_min': min_date.values(),  # in table list order
            #         'dates_max': max_date.values(),  #
            #         })
            # set_field_if_no(kwarg, 'out', {})
            # kwarg['out'].update({'b_all_to_one_col': 'True'})

            incl_h5clc.main(args, **kwarg)

    # Calculate spectrograms.
    if st(3):  # Can be done at any time after step 1

        def raise_ni():
            raise NotImplementedError(
                'Cannot process probes having different fs in one run: you need to do it separately'
            )

        args = [
            Path(incl_h5clc.__file__).with_name(
                f'incl_h5spectrum{db_path.stem}.yaml'),
            # if no such file all settings are here
            '--db_path',
            str(db_path.with_name(f'{db_path.stem}_proc_noAvg.h5')),
            '--tables_list',
            f"{cfg['in']['probes_prefix']}.*",  # inclinometers or wavegauges w\d\d  ## 'w02', 'incl.*',
            # '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '',
            '--min_date',
            datetime64_str(cfg['filter']['min_date'][0]),
            '--max_date',
            datetime64_str(cfg['filter']['max_date'][0]),  # '2019-09-09T16:31:00',  # 17:00:00
            # '--max_dict', 'M[xyz]:4096',  # use if db_path is not ends with _proc_noAvg.h5 i.e. need calc velocity
            '--out.db_path',
            f"{db_path.stem.replace('incl', cfg['in']['probes_prefix'])}_proc_psd.h5",
            # '--table', f'psd{aggregate_period_s}' if aggregate_period_s else 'psd',
            '--fs_float',
            f"{fs(probes[0], cfg['in']['probes_prefix'])}",
            # (lambda x: x == x[0])(np.vectorize(fs)(probes, prefix))).all() else raise_ni()
            #
            # '--timerange_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
            # '--verbose', 'DEBUG',
            # '--chunksize', '20000',
            '--b_interact',
            '0',
        ]
        if 'w' in cfg['in']['probes_prefix']:
            args += [
                '--split_period',
                '1H',
                '--dt_interval_minutes',
                '10',  # burst mode
                '--fmin',
                '0.0001',
                '--fmax',
                '4'
            ]
        else:
            args += [
                '--split_period',
                '2H',
                '--fmin',
                '0.0004',  #0.0004
                '--fmax',
                '1.05'
            ]

        incl_h5spectrum.main(args)

    # Draw in Veusz
    if st(4):
        b_images_only = True  # False
        pattern_path = db_path.parent / r'vsz_5min\191119_0000_5m_incl19.vsz'  # r'vsz_5min\191126_0000_5m_w02.vsz'
        if not b_images_only:
            pattern_bytes_slice_old = re.escape(b'((5828756, 5830223, None),)')

        # Length of non-adjacent intervals, s (set to None to disallow)
        period = '1D'
        length = '5m'  # period  # '1D'

        dt_custom_s = pd_period_to_timedelta(
            length) if length != period else None  # None  #  60 * 5

        if True:
            # Load starts and assign ends
            t_intervals_start = pd.read_csv(
                cfg['in']['path_cruise'] /
                r'vsz+h5_proc\intervals_selected.txt',
                converters={
                    'time_start': lambda x: np.datetime64(x, 'ns')
                },
                index_col=0).index
            edges = (pd.DatetimeIndex(t_intervals_start),
                     pd.DatetimeIndex(t_intervals_start + dt_custom_s)
                     )  # np.zeros_like()
        else:
            # Generate periodic intervals
            t_interval_start, t_intervals_end = intervals_from_period(
                datetime_range=np.array(
                    [
                        cfg['filter']['min_date']['0'],
                        cfg['filter']['max_date']['0']
                    ],
                    # ['2018-08-11T18:00:00', '2018-09-06T00:00:00'],
                    # ['2019-02-11T13:05:00', '2019-03-07T11:30:00'],
                    # ['2018-11-16T15:19', '2018-12-14T14:35'],
                    # ['2018-10-22T12:30', '2018-10-27T06:30:00'],
                    'datetime64[s]'),
                period=period)
            edges = (pd.DatetimeIndex([t_interval_start
                                       ]).append(t_intervals_end[:-1]),
                     pd.DatetimeIndex(t_intervals_end))

        for i, probe in enumerate(probes):
            probe_name = f"{cfg['in']['probes_prefix']}{probe:02}"  # table name in db
            l.info('Draw %s in Veusz: %d intervals...', probe_name,
                   edges[0].size)
            # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1):

            cfg_vp = {'veusze': None}
            for i_interval, (t_interval_start,
                             t_interval_end) in enumerate(zip(*edges),
                                                          start=1):

                # if i_interval < 23: #<= 0:  # TEMPORARY Skip this number of intervals
                #     continue
                if period != length:
                    t_interval_start = t_interval_end - pd.Timedelta(
                        dt_custom_s, 's')

                try:  # skipping absent probes
                    start_end = h5q_interval2coord(
                        db_path=str(db_path),
                        table=f'/{probe_name}',
                        t_interval=(t_interval_start, t_interval_end))
                    if not len(start_end):
                        break  # no data
                except KeyError:
                    break  # device name not in specified range, go to next name

                pattern_path_new = pattern_path.with_name(
                    f"{t_interval_start:%y%m%d_%H%M}_{length}_{probe_name}.vsz"
                )

                # Modify pattern file
                if not b_images_only:
                    probe_name_old = re.match('.*((?:incl|w)\d*).*',
                                              pattern_path.name).groups()[0]
                    bytes_slice = bytes(
                        '(({:d}, {:d}, None),)'.format(*(start_end +
                                                         np.int32([-1, 1]))),
                        'ascii')

                    def f_replace(line):
                        """
                        Replace in file
                        1. probe name
                        2. slice
                        """
                        # if i_interval == 1:
                        line, ok = re.subn(bytes(probe_name_old, 'ascii'),
                                           bytes(probe_name, 'ascii'), line)
                        if ok:  # can be only in same line
                            line = re.sub(pattern_bytes_slice_old, bytes_slice,
                                          line)
                        return line

                    if not rep_in_file(pattern_path,
                                       pattern_path_new,
                                       f_replace=f_replace):
                        l.warning('Veusz pattern not changed!')
                        # break
                    elif cfg_vp['veusze']:
                        cfg_vp['veusze'].Load(str(pattern_path_new))
                elif cfg_vp['veusze']:
                    cfg_vp['veusze'].Load(str(pattern_path_new))

                txt_time_range = \
                    """
                    "[['{:%Y-%m-%dT%H:%M}', '{:%Y-%m-%dT%H:%M}']]" \
                    """.format(t_interval_start, t_interval_end)
                print(f'{i_interval}. {txt_time_range}', end=' ')

                cfg_vp = veuszPropagate.main(
                    [
                        Path(veuszPropagate.__file__).parent.with_name(
                            'veuszPropagate.ini'),
                        # '--data_yield_prefix', '-',
                        '--path',
                        str(
                            db_path
                        ),  # used for custom loading from db; some data source is required
                        '--tables_list',
                        f'/{probe_name}',  # 181022inclinometers/ \d*
                        '--pattern_path',
                        str(pattern_path_new),
                        # fr'd:\workData\BalticSea\190801inclinometer_Schuka\{probe_name}_190807_1D.vsz',
                        # str(db_path.parent / dir_incl / f'{probe_name}_190211.vsz'), #warning: create file with small name
                        # '--before_next', 'restore_config',
                        # '--add_to_filename', f"_{t_interval_start:%y%m%d_%H%M}_{length}",
                        '--filename_fun',
                        f'lambda tbl: "{pattern_path_new.name}"',
                        '--add_custom_list',
                        'USEtime',  # nAveragePrefer',
                        '--add_custom_expressions_list',
                        txt_time_range,
                        # + """
                        # ", 5"
                        # """,
                        '--b_update_existed',
                        'True',
                        '--export_pages_int_list',
                        '1, 2',  # 0 for all '6, 7, 8',  #'1, 2, 3'
                        # '--export_dpi_int', '200',
                        '--export_format',
                        'emf',
                        '--b_interact',
                        '0',
                        '--b_images_only',
                        f'{b_images_only}',
                        '--return',
                        '<embedded_object>',  # reuse to avoid bloating memory
                    ],
                    veusze=cfg_vp['veusze'])
Example #17
0
import xarray as xr
import os
import glob
import imp
import sys
import numpy as np
import pandas as pd
import datetime
import json
import time
import utm
###
# Experimental cache option to speed up dask calls
import cachey
from dask.cache import Cache
cache = Cache(4e9)
cache.register()
###
start_time = time.time()
# Hack to force datetimes to display in GMT/UTC (numpy 1.11.1 has fixed this, but other dependent modules (pynio) can't handle numpy 1.11.1)
os.environ['TZ'] = 'GMT'
time.tzset()

# Load in config file
#######  load user configurable parameters here    #######
# Check the user-defined configuration file
if len(sys.argv) == 1:
    raise ValueError(
        'Netcdf_to_CHM_forcing.py requires one argument [configuration file] (e.g. python Netcdf_to_CHM_forcing.py forcing_config.py)'
    )
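
# A hedged sketch (assumption, not part of the original fragment): the script
# presumably loads the configuration module passed on the command line;
# 'forcing_config' is a hypothetical module name and none of its attributes
# are accessed here.
configfile = sys.argv[-1]
config = imp.load_source('forcing_config', configfile)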
Example #18
0
def main(argv):
    global DEBUG, DD_FORCE_LOAD, DASK_CLIENT

    parser = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('filepath')
    parser.add_argument('dftype')
    parser.add_argument('base')
    parser.add_argument('x')
    parser.add_argument('y')
    parser.add_argument('categories', nargs='+')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enable increased verbosity and DEBUG messages')
    parser.add_argument(
        '--cache',
        choices=('persist', 'cachey'),
        default=None,
        help=
        'Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'
        .format(int(p.cachesize)))
    parser.add_argument(
        '--distributed',
        action='store_true',
        help=
        'Enable the distributed scheduler instead of the default threaded scheduler.'
    )
    parser.add_argument(
        '--recalc-ranges',
        action='store_true',
        help=
        'Tell datashader to recalculate the ranges on each aggregation instead of caching them (the default).'
    )
    args = parser.parse_args(argv[1:])

    if args.cache is None:
        if args.debug:
            print("DEBUG: Cache disabled", flush=True)
    else:
        if args.cache == 'cachey':
            from dask.cache import Cache
            cache = Cache(p.cachesize)
            cache.register()
        elif args.cache == 'persist':
            DD_FORCE_LOAD = True

        if args.debug:
            print('DEBUG: Cache "{}" mode enabled'.format(args.cache),
                  flush=True)

    if args.dftype == 'dask' and args.distributed:
        local_cluster = distributed.LocalCluster(n_workers=p.n_workers,
                                                 threads_per_worker=1)
        DASK_CLIENT = distributed.Client(local_cluster)
        if args.debug:
            print('DEBUG: "distributed" scheduler is enabled')
    else:
        if args.dftype != 'dask' and args.distributed:
            raise ValueError(
                '--distributed argument is only available with the dask dataframe type (not pandas)'
            )
        if args.debug:
            print('DEBUG: "threaded" scheduler is enabled')

    filepath = args.filepath
    basename, extension = os.path.splitext(filepath)
    p.dftype = args.dftype
    p.base = args.base
    p.x = args.x
    p.y = args.y
    p.categories = args.categories
    DEBUG = args.debug

    if DEBUG:
        print('DEBUG: Memory usage (before read):\t{} MB'.format(
            get_proc_mem()), flush=True)
    df, loadtime = timed_read(filepath, p.dftype)

    if df is None:
        if loadtime == -1:
            print("{:28} {:6}  Operation not supported".format(
                filepath, p.dftype),
                  flush=True)
        return 1

    if DEBUG:
        print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem()),
              flush=True)

    img, aggtime1 = timed_agg(df,
                              filepath,
                              5,
                              5,
                              cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        mem_usage = df.memory_usage(deep=True)
        if p.dftype == 'dask':
            mem_usage = mem_usage.compute()
        print('DEBUG:', mem_usage, flush=True)
        mem_usage_total = mem_usage.sum()
        print('DEBUG: DataFrame size:\t\t\t{} MB'.format(mem_usage_total / 1e6),
              flush=True)
        for colname in df.columns:
            print('DEBUG: column "{}" dtype: {}'.format(
                colname, df[colname].dtype))
        print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem()),
              flush=True)

    img, aggtime2 = timed_agg(df,
                              filepath,
                              cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem()),
              flush=True)

    in_size = get_size(filepath)
    out_size = get_size(filepath + ".png")

    global_end = time.time()
    print("{:28} {:6}  Aggregate1:{:06.2f} ({:06.2f}+{:06.2f})  Aggregate2:{:06.2f}  In:{:011d}  Out:{:011d}  Total:{:06.2f}"\
          .format(filepath, p.dftype, loadtime+aggtime1, loadtime, aggtime1, aggtime2, in_size, out_size, global_end-global_start), flush=True)

    return 0
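
# Hedged entry-point sketch (assumption: this fragment is run as a script; the
# helpers used above -- timed_read, timed_agg, get_size, get_proc_mem, the
# Parameters instance `p`, and global_start -- are defined elsewhere in the module).
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))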
Example #19
0
def main(argv):
    global DEBUG, DD_FORCE_LOAD

    parser = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('filepath')
    parser.add_argument('dftype')
    parser.add_argument('base')
    parser.add_argument('x')
    parser.add_argument('y')
    parser.add_argument('categories', nargs='+')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enable increased verbosity and DEBUG messages')
    parser.add_argument(
        '--cache',
        choices=('persist', 'cachey'),
        default=None,
        help=
        'Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'
        .format(int(p.cachesize)))
    args = parser.parse_args(argv[1:])

    if args.cache is None:
        if args.debug:
            print("DEBUG: Cache disabled")
    else:
        if args.cache == 'cachey':
            from dask.cache import Cache
            Cache(p.cachesize).register()
        elif args.cache == 'persist':
            DD_FORCE_LOAD = True

        if args.debug:
            print('DEBUG: Cache "{}" mode enabled'.format(args.cache))

    filepath = args.filepath
    basename, extension = os.path.splitext(filepath)
    p.dftype = args.dftype
    p.base = args.base
    p.x = args.x
    p.y = args.y
    p.categories = args.categories
    DEBUG = args.debug

    df, loadtime = timed_read(filepath, p.dftype)

    if df is None:
        if loadtime == -1:
            print("{:28} {:6}  Operation not supported".format(
                filepath, p.dftype))
        elif loadtime == -2:
            print("{:28} {:6}  File does not exist".format(filepath, p.dftype))
        return 1

    img, aggtime1 = timed_agg(df, filepath, 5, 5)
    img, aggtime2 = timed_agg(df, filepath)

    in_size = get_size(filepath)
    out_size = get_size("{}.png".format(filepath))

    global_end = time.time()
    print("{:28} {:6}  Aggregate1:{:06.2f} ({:06.2f}+{:06.2f})  Aggregate2:{:06.2f}  In:{:011d}  Out:{:011d}  Total:{:06.2f}"\
          .format(filepath, p.dftype, loadtime+aggtime1, loadtime, aggtime1, aggtime2, in_size, out_size, global_end-global_start))

    return 0