Example #1
    def setup(self):
        keys = self.keys

        while not keys.issubset(self.scheduler.tasks):
            yield gen.sleep(0.05)

        tasks = [self.scheduler.tasks[k] for k in keys]

        self.keys = None

        self.scheduler.add_plugin(self)  # subtle race condition here
        self.all_keys, errors = dependent_keys(tasks, complete=self.complete)
        if not self.complete:
            self.keys = self.all_keys.copy()
        else:
            self.keys, _ = dependent_keys(tasks, complete=False)
        self.all_keys.update(keys)
        self.keys |= errors & self.all_keys

        if not self.keys:
            self.stop(exception=None, key=None)

        # Group keys by func name
        self.keys = valmap(set, groupby(self.func, self.keys))
        self.all_keys = valmap(set, groupby(self.func, self.all_keys))
        for k in self.all_keys:
            if k not in self.keys:
                self.keys[k] = set()

        for k in errors:
            self.transition(k, None, 'erred', exception=True)
        logger.debug("Set up Progress keys")
Example #2
 def function(scheduler, p):
     result = {'all': valmap(len, p.all_keys),
               'remaining': valmap(len, p.keys),
               'status': p.status}
     if p.status == 'error':
         result.update(p.extra)
     return result
Example #3
    def _scatter(self, data, workers=None, broadcast=False):
        """ Scatter data to local data dictionary

        Rather than send data out to the cluster we keep data local.  However
        we do report to the scheduler that the local worker has the scattered
        data.  This allows other workers to come by and steal this data if
        desired.

        Keywords like ``broadcast=`` do not work; however, operations like
        ``.replicate`` work fine after calling scatter and can fill in for
        this functionality.
        """
        with log_errors():
            if not (workers is None and broadcast is False):
                raise NotImplementedError("Scatter from worker doesn't support workers or broadcast keywords")

            if isinstance(data, dict) and not all(isinstance(k, (bytes, str))
                                                   for k in data):
                d = yield self._scatter(keymap(tokey, data), workers, broadcast)
                raise gen.Return({k: d[tokey(k)] for k in data})

            if isinstance(data, (list, tuple, set, frozenset)):
                keys = []
                for x in data:
                    try:
                        keys.append(tokenize(x))
                    except:
                        keys.append(str(uuid.uuid1()))
                data2 = dict(zip(keys, data))
            elif isinstance(data, dict):
                keys = set(data)
                data2 = data
            else:
                raise TypeError("Don't know how to scatter %s" % type(data))

            nbytes = valmap(sizeof, data2)

            # self.worker.data.update(data2)  # thread safety matters
            self.worker.loop.add_callback(self.worker.data.update, data2)

            yield self.scheduler.update_data(
                    who_has={key: [self.worker.address] for key in data2},
                    nbytes=valmap(sizeof, data2),
                    client=self.id)

            if isinstance(data, dict):
                out = {k: Future(k, self) for k in data}
            elif isinstance(data, (tuple, list, set, frozenset)):
                out = type(data)([Future(k, self) for k in keys])
            else:
                raise TypeError(
                        "Input to scatter must be a list or dict")

            for key in keys:
                self.futures[key]['status'] = 'finished'
                self.futures[key]['event'].set()

            raise gen.Return(out)
Example #4
    def setup(self, keys, complete):
        errors = Progress.setup(self, keys, complete)

        # Group keys by func name
        self.keys = valmap(set, groupby(self.func, self.keys))
        self.all_keys = valmap(set, groupby(self.func, self.all_keys))
        for k in self.all_keys:
            if k not in self.keys:
                self.keys[k] = set()

        logger.debug("Set up Progress keys")
        return errors
Example #5
def expect_dtypes(**named):
    """
    Preprocessing decorator that verifies inputs have expected numpy dtypes.

    Usage
    -----
    >>> from numpy import dtype, arange, int8, float64
    >>> @expect_dtypes(x=dtype(int8))
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(arange(3, dtype=int8), 'foo')
    (array([0, 1, 2], dtype=int8), 'foo')
    >>> foo(arange(3, dtype=float64), 'foo')  # doctest: +NORMALIZE_WHITESPACE
    ...                                       # doctest: +ELLIPSIS
    Traceback (most recent call last):
       ...
    TypeError: ...foo() expected a value with dtype 'int8' for argument 'x',
    but got 'float64' instead.
    """
    for name, type_ in iteritems(named):
        if not isinstance(type_, (dtype, tuple)):
            raise TypeError(
                "expect_dtypes() expected a numpy dtype or tuple of dtypes"
                " for argument {name!r}, but got {dtype} instead.".format(
                    name=name, dtype=type_,
                )
            )

    @preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_dtype(dtypes):
        """
        Factory for dtype-checking functions that work with the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # name.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.name
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a value with dtype {dtype_str} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=_qualified_name(func),
                dtype_str=' or '.join(repr(d.name) for d in dtypes),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            if getattr(argvalue, 'dtype', object()) not in dtypes:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_dtype, named))
Example #6
def expect_element(*_pos, **named):
    """
    Preprocessing decorator that verifies inputs are elements of some
    expected collection.

    Usage
    -----
    >>> @expect_element(x=('a', 'b'))
    ... def foo(x):
    ...    return x.upper()
    ...
    >>> foo('a')
    'A'
    >>> foo('b')
    'B'
    >>> foo('c')
    Traceback (most recent call last):
       ...
    ValueError: foo() expected a value in ('a', 'b') for argument 'x', but got 'c' instead.  # noqa

    Notes
    -----
    This uses the `in` operator (__contains__) to make the containment check.
    This allows us to use any custom container as long as the object supports
    the container protocol.
    """
    if _pos:
        raise TypeError("expect_element() only takes keyword arguments.")

    return preprocess(**valmap(_expect_element, named))
Example #7
def expect_dtypes(*_pos, **named):
    """
    Preprocessing decorator that verifies inputs have expected numpy dtypes.

    Usage
    -----
    >>> from numpy import dtype, arange
    >>> @expect_dtypes(x=dtype(int))
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(arange(3), 'foo')
    (array([0, 1, 2]), 'foo')
    >>> foo(arange(3, dtype=float), 'foo')
    Traceback (most recent call last):
       ...
    TypeError: foo() expected an argument with dtype 'int64' for argument 'x', but got dtype 'float64' instead.  # noqa
    """
    if _pos:
        raise TypeError("expect_dtypes() only takes keyword arguments.")

    for name, type_ in iteritems(named):
        if not isinstance(type_, (dtype, tuple)):
            raise TypeError(
                "expect_dtypes() expected a numpy dtype or tuple of dtypes"
                " for argument {name!r}, but got {dtype} instead.".format(
                    name=name, dtype=type_,
                )
            )
    return preprocess(**valmap(_expect_dtype, named))
Example #8
 def get_has_what(self, stream, keys=None):
     if keys:
         keys = [coerce_to_address(key) for key in keys]
     if keys is not None:
         return {k: list(self.has_what[k]) for k in keys}
     else:
         return valmap(list, self.has_what)
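
The fallback branch simply converts every set of keys into a list so the mapping is easy to serialize. A minimal sketch with hypothetical worker data:

from toolz import valmap

has_what = {'worker-a': {'x', 'y'}, 'worker-b': {'z'}}
print(valmap(list, has_what))  # e.g. {'worker-a': ['x', 'y'], 'worker-b': ['z']}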
Example #9
def merge_ownership_periods(mappings):
    """
    Given a dict of mappings where the values are lists of
    OwnershipPeriod objects, returns a dict with the same structure with
    new OwnershipPeriod objects adjusted so that the periods have no
    gaps.

    Orders the periods chronologically, and pushes forward the end date
    of each period to match the start date of the following period. The
    end date of the last period is pushed forward to the max Timestamp.
    """
    return valmap(
        lambda v: tuple(
            OwnershipPeriod(
                a.start,
                b.start,
                a.sid,
                a.value,
            ) for a, b in sliding_window(
                2,
                concatv(
                    sorted(v),
                    # concat with a fake ownership object to make the last
                    # end date be max timestamp
                    [OwnershipPeriod(
                        pd.Timestamp.max.tz_localize('utc'),
                        None,
                        None,
                        None,
                    )],
                ),
            )
        ),
        mappings,
    )
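
The core trick here is sliding_window(2, concatv(sorted(v), [sentinel])): appending one sentinel period and pairing consecutive elements lets each period borrow its successor's start date as its new end date. A small sketch of just that pairing, with plain (start, end) tuples standing in for OwnershipPeriod objects:

from toolz import concatv, sliding_window, valmap

periods = {1: [(30, 40), (10, 20)]}  # sid -> unsorted (start, end) pairs
closed = valmap(
    lambda v: tuple(
        (a[0], b[0])
        for a, b in sliding_window(2, concatv(sorted(v), [(999, None)]))
    ),
    periods,
)
print(closed)  # {1: ((10, 30), (30, 999))}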
Example #10
    def test_retrieve_specific_type(self, type_, lookup_name, failure_type):
        equities = make_simple_equity_info(
            range(5), start_date=pd.Timestamp("2014-01-01"), end_date=pd.Timestamp("2015-01-01")
        )
        max_equity = equities.index.max()
        futures = make_commodity_future_info(first_sid=max_equity + 1, root_symbols=["CL"], years=[2014])
        equity_sids = [0, 1]
        future_sids = [max_equity + 1, max_equity + 2, max_equity + 3]
        if type_ == Equity:
            success_sids = equity_sids
            fail_sids = future_sids
        else:
            fail_sids = equity_sids
            success_sids = future_sids

        with tmp_asset_finder(equities=equities, futures=futures) as finder:
            # Run twice to exercise caching.
            lookup = getattr(finder, lookup_name)
            for _ in range(2):
                results = lookup(success_sids)
                self.assertIsInstance(results, dict)
                self.assertEqual(set(results.keys()), set(success_sids))
                self.assertEqual(valmap(int, results), dict(zip(success_sids, success_sids)))
                self.assertEqual({type_}, {type(asset) for asset in itervalues(results)})
                with self.assertRaises(failure_type):
                    lookup(fail_sids)
                with self.assertRaises(failure_type):
                    # Should fail if **any** of the assets are bad.
                    lookup([success_sids[0], fail_sids[0]])
Example #11
def test_multiprogress(s, a, b):
    sched, report = Queue(), Queue()
    s.handle_queues(sched, report)
    s.update_graph(tasks=valmap(dumps_task, {'x-1': (inc, 1),
                                             'x-2': (inc, 'x-1'),
                                             'x-3': (inc, 'x-2'),
                                             'y-1': (dec, 'x-3'),
                                             'y-2': (dec, 'y-1')}),
                   keys=['y-2'],
                   dependencies={'x-2': ['x-1'], 'x-3': ['x-2'],
                                 'y-1': ['x-3'], 'y-2': ['y-1']})

    p = MultiProgress(['y-2'], scheduler=s, func=key_split)
    yield p.setup()

    assert p.keys == {'x': {'x-1', 'x-2', 'x-3'},
                      'y': {'y-1', 'y-2'}}

    while True:
        msg = yield report.get()
        if msg['op'] == 'key-in-memory' and msg['key'] == 'x-3':
            break

    assert p.keys == {'x': set(),
                      'y': {'y-1', 'y-2'}}

    while True:
        msg = yield report.get()
        if msg['op'] == 'key-in-memory' and msg['key'] == 'y-2':
            break

    assert p.keys == {'x': set(),
                      'y': set()}

    assert p.status == 'finished'
Example #12
File: util.py Project: vshesh/glue
def assemble_ast(tag:str, idsclasses: Mapping[str, str], attrs: Mapping[str, str], body: list):
  """
  Small helper function for the template_2_ast function that assembles the appropriate ast element
  given the tag name, a dictionary of ids/classes from the tag name, further attrs, and a list of children or the body.
  For most components, there won't be any children.
  
  :param tag:
  :param idsclasses:
  :param attrs:
  :param body:
  :return:
  """
  iscomponent = re.match(r'^[A-Z]', tag)
  attrs['id'] = (attrs.get('id', '') + ' ' + idsclasses.get('id', '')).strip()
  attrs['class'] = (attrs.get('class', '') + ' ' + idsclasses.get('class', '')).strip()
  # remove the empty attributes to avoid clutter and save bytes.
  attrs = dict(t.valfilter(lambda x: not (isinstance(x, str) and x.strip() == ''), attrs))
  # special handling for the "style" attribute, since that can be a dictionary
  attrs = t.valmap(lambda val:' '.join('{}: {};'.format(k,v) for k,v in val.items())
                   if isinstance(val, dict) else val,
                   attrs)
  
  if iscomponent:
    return {'name': tag, 'props': attrs, 'children': body}
  else:
    return {'tag': tag, 'attrs': attrs, 'body': body}
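
The t.valmap call above only rewrites values that are dicts (the "style" attribute), leaving ordinary string attributes untouched. A minimal sketch of just that step, with hypothetical attrs:

import toolz as t

attrs = {'class': 'card', 'style': {'color': 'red', 'margin': '0'}}
flat = t.valmap(lambda val: ' '.join('{}: {};'.format(k, v) for k, v in val.items())
                if isinstance(val, dict) else val,
                attrs)
print(flat)  # {'class': 'card', 'style': 'color: red; margin: 0;'}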
Example #13
    def symbol_ownership_map(self):
        rows = sa.select(self.equity_symbol_mappings.c).execute().fetchall()

        mappings = {}
        for row in rows:
            mappings.setdefault((row.company_symbol, row.share_class_symbol), []).append(
                SymbolOwnership(
                    pd.Timestamp(row.start_date, unit="ns", tz="utc"),
                    pd.Timestamp(row.end_date, unit="ns", tz="utc"),
                    row.sid,
                    row.symbol,
                )
            )

        return valmap(
            lambda v: tuple(
                SymbolOwnership(a.start, b.start, a.sid, a.symbol)
                for a, b in sliding_window(
                    2,
                    concatv(
                        sorted(v),
                        # concat with a fake ownership object to make the last
                        # end date be max timestamp
                        [SymbolOwnership(pd.Timestamp.max.tz_localize("utc"), None, None, None)],
                    ),
                )
            ),
            mappings,
            factory=lambda: mappings,
        )
Example #14
File: into.py Project: dalejung/blaze
def into(a, b, **kwargs):
    dialect = b.dialect.copy()
    del dialect['lineterminator']
    dates = [i for i, typ in enumerate(b.schema[0].types)
               if 'date' in str(typ)]
    schema = b.schema
    if '?' in str(schema):
        schema = dshape(str(schema).replace('?', ''))

    dtypes = valmap(to_numpy_dtype, schema[0].dict)

    datenames = [name for name in dtypes
                      if np.issubdtype(dtypes[name], np.datetime64)]

    dtypes = dict((k, v) for k, v in dtypes.items()
                         if not np.issubdtype(v, np.datetime64))

    if 'strict' in dialect:
        del dialect['strict']

    # Pass only keyword arguments appropriate for read_csv
    kws = keywords(pd.read_csv)
    options = toolz.merge(dialect, kwargs)
    options = toolz.keyfilter(lambda k: k in kws, options)

    if b.open == gzip.open:
        options['compression'] = 'gzip'

    return pd.read_csv(b.path,
                       skiprows=1 if b.header else 0,
                       dtype=dtypes,
                       parse_dates=datenames,
                       names=b.columns,
                       **options)
Example #15
File: ABuChecker.py Project: 3774257/abu
 def check_type(*ty_args, **ty_kwargs):
     """
     [Decorator]
     Check the types of the input arguments; raise CheckError on failure.
     :param ty_args: tuple of expected types
     :param ty_kwargs: dict of expected types
     :return:
     """
     # Check for any invalid positional (tuple) type arguments
     for ty in ty_args:
         if not isinstance(ty, (type, tuple)):
             raise TypeError(
                 "check_type() expected a type or tuple of types"
                 ", but got {type_} instead.".format(
                     type_=ty,
                 )
             )
     # Check for any invalid keyword (dict) type arguments
     for name, ty in six.iteritems(ty_kwargs):
         if not isinstance(ty, (type, tuple)):
             raise TypeError(
                 "check_type() expected a type or tuple of types for "
                 "argument '{name}', but got {type_} instead.".format(
                     name=name, type_=ty,
                 )
             )
     # Apply type_check to the function arguments
     return arg_process(*map(type_check, list(ty_args)), **valmap(type_check, ty_kwargs))
Example #16
def expect_types(*_pos, **named):
    """
    Preprocessing decorator that verifies inputs have expected types.

    Usage
    -----
    >>> @expect_types(x=int, y=str)
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(2, '3')
    (2, '3')
    >>> foo(2.0, '3')
    Traceback (most recent call last):
       ...
    TypeError: foo() expected an argument of type 'int' for argument 'x', but got float instead.  # noqa
    """
    if _pos:
        raise TypeError("expect_types() only takes keyword arguments.")

    for name, type_ in iteritems(named):
        if not isinstance(type_, (type, tuple)):
            raise TypeError(
                "expect_types() expected a type or tuple of types for "
                "argument '{name}', but got {type_} instead.".format(
                    name=name, type_=type_,
                )
            )

    return preprocess(**valmap(_expect_type, named))
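
These decorators all follow the same shape: valmap builds one processor per argument name, and preprocess(**processors) runs each processor (a callable taking (func, argname, argvalue), as seen in Example #5) on the matching argument before the wrapped function executes. A much simplified stand-in for that machinery, not zipline's actual preprocess, just to show how the pieces compose:

import inspect
from functools import wraps
from toolz import valmap

def simple_preprocess(**processors):
    """Apply processors[name](func, name, value) to each named argument."""
    def decorator(func):
        sig = inspect.signature(func)
        @wraps(func)
        def wrapper(*args, **kwargs):
            bound = sig.bind(*args, **kwargs)
            for name, proc in processors.items():
                if name in bound.arguments:
                    bound.arguments[name] = proc(func, name, bound.arguments[name])
            return func(*bound.args, **bound.kwargs)
        return wrapper
    return decorator

def _expect_type_sketch(type_):
    # one checker per argument, produced via valmap below
    def processor(func, argname, argvalue):
        if not isinstance(argvalue, type_):
            raise TypeError('%s() expected %r for argument %r, but got %r'
                            % (func.__name__, type_, argname, argvalue))
        return argvalue
    return processor

@simple_preprocess(**valmap(_expect_type_sketch, {'x': int, 'y': str}))
def foo(x, y):
    return x, y

print(foo(2, '3'))  # (2, '3'); foo(2.0, '3') would raise TypeError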
Example #17
def md(template, *args, **kwargs):
    """Wraps string.format with naive markdown escaping"""
    def escape(s):
        for char in ('*', '#', '_', '~', '`', '>'):
            s = s.replace(char, '\\' + char)
        return s
    return template.format(*map(escape, args), **toolz.valmap(escape, kwargs))
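
valmap(escape, kwargs) escapes every keyword value the same way map(escape, args) handles the positional ones, so both routes into str.format are covered. For example (a hypothetical call):

print(md('{0} and {key}', 'a*b', key='x_y'))  # a\*b and x\_y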
Example #18
File: case.py Project: llllllllll/adt
    def __init__(self, constructors, name, *args, **kwargs):
        args = tuple(map(self._unwrap_name, args))
        kwargs = valmap(self._unwrap_name, kwargs)
        already_bound = {}
        for n, arg in enumerate(args):
            if arg in already_bound:
                raise TypeError(
                    'argument %r at position %d is already bound to the'
                    ' positional argument at index %d' % (
                        arg,
                        n,
                        already_bound[arg],
                    ),
                )
            already_bound[arg] = n

        for k, arg in kwargs.items():
            if arg in already_bound:
                loc = already_bound[arg]
                raise TypeError(
                    'argument %r at keyword %s is already bound to the %s' % (
                        arg,
                        k,
                        ('positional argument at index %d' % loc)
                        if isinstance(loc, int) else
                        ('keyword argument %r' % loc),
                    ),
                )

        super().__init__(constructors, name, *args, **kwargs)
        del constructors[name]
        self._constructors = constructors
Example #19
def test_compression_binary(fmt):
    from dask.bytes.core import open_files
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_files('.test.accounts.*', compression=fmt)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
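
valmap(compression.compress[fmt], files) compresses every file body in the mapping while keeping the filenames as keys. A minimal sketch of the same idea using gzip directly (illustrative file contents):

import gzip
from toolz import valmap

files = {'.test.accounts.1.json': b'{"amount": 100}\n',
         '.test.accounts.2.json': b'{"amount": 200}\n'}
compressed = valmap(gzip.compress, files)
assert valmap(gzip.decompress, compressed) == files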
Example #20
def expect_kinds(**named):
    """
    Preprocessing decorator that verifies inputs have expected dtype kinds.

    Usage
    -----
    >>> from numpy import int64, int32, float32
    >>> @expect_kinds(x='i')
    ... def foo(x):
    ...    return x
    ...
    >>> foo(int64(2))
    2
    >>> foo(int32(2))
    2
    >>> foo(float32(2))
    Traceback (most recent call last):
       ...
    TypeError: foo() expected a numpy object of kind 'i' for argument 'x', but got 'f' instead.  # noqa
    """
    for name, kind in iteritems(named):
        if not isinstance(kind, (str, tuple)):
            raise TypeError(
                "expect_dtype_kinds() expected a string or tuple of strings"
                " for argument {name!r}, but got {kind} instead.".format(
                    name=name, kind=kind,
                )
            )

    @preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_kind(kinds):
        """
        Factory for kind-checking functions that work with the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # kind.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.kind
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a numpy object of kind {kinds} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=_qualified_name(func),
                kinds=' or '.join(map(repr, kinds)),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_kind, named))
Example #21
File: ABuChecker.py Project: 3774257/abu
 def check_subset(*ss_args, **ss_kwargs):
     """
     [Decorator]
     Check that the input arguments fall within a given collection (subset check); raise CheckError on failure.
     :param ss_args: tuple of allowed-value collections
     :param ss_kwargs: dict of allowed-value collections
     :return:
     """
     # Check for any invalid positional (tuple) collection arguments
     for ss in ss_args:
         if not isinstance(ss, (list, set, type(None))):
             raise TypeError(
                 "check_subset() expected a list or set or None of values"
                 ", but got {subset_} or tuple instead.".format(
                     subset_=str(type(ss)),
                 )
             )
     # Check for any invalid keyword (dict) collection arguments
     for name, ss in six.iteritems(ss_kwargs):
         if not isinstance(ss, (list, set, type(None))):
             raise TypeError(
                 "check_subset() expected a list or set of values for "
                 "argument '{name_}', but got {subset_} or tuple instead.".format(
                     name_=name, subset_=str(type(ss)),
                 )
             )
     # Apply subset_check to the function arguments
     return arg_process(*map(subset_check, list(ss_args)), **valmap(subset_check, ss_kwargs))
Example #22
 def __init__(self,
              data=None,
              formats=None,
              authorization=None,
              allow_profiler=False,
              profiler_output=None,
              profile_by_default=False,
              allow_add=False):
     if isinstance(data, collections.Mapping):
         data = valmap(lambda v: v.data if isinstance(v, _Data) else v,
                       data)
     elif isinstance(data, _Data):
         data = data._resources()
     app = self.app = Flask('blaze.server.server')
     if data is None:
         data = {}
     app.register_blueprint(api,
                            data=data,
                            formats=formats if formats is not None else (json,),
                            authorization=authorization,
                            allow_profiler=allow_profiler,
                            profiler_output=profiler_output,
                            profile_by_default=profile_by_default,
                            allow_add=allow_add)
     self.data = data
Example #23
def test_multi_progressbar_widget(s, a, b):
    s.update_graph(tasks=valmap(dumps_task, {'x-1': (inc, 1),
                                             'x-2': (inc, 'x-1'),
                                             'x-3': (inc, 'x-2'),
                                             'y-1': (dec, 'x-3'),
                                             'y-2': (dec, 'y-1'),
                                             'e': (throws, 'y-2'),
                                             'other': (inc, 123)}),
                   keys=['e'],
                   dependencies={'x-2': ['x-1'], 'x-3': ['x-2'],
                                 'y-1': ['x-3'], 'y-2': ['y-1'],
                                 'e': ['y-2']})

    p = MultiProgressWidget(['e'], scheduler=(s.ip, s.port))
    yield p.listen()

    assert p.bars['x'].value == 1.0
    assert p.bars['y'].value == 1.0
    assert p.bars['e'].value == 0.0
    assert '3 / 3' in p.bar_texts['x'].value
    assert '2 / 2' in p.bar_texts['y'].value
    assert '0 / 1' in p.bar_texts['e'].value

    assert p.bars['x'].bar_style == 'success'
    assert p.bars['y'].bar_style == 'success'
    # assert p.bars['e'].bar_style == 'danger'

    assert p.status == 'error'

    capacities = [int(re.search(r'\d+ / \d+', row.children[0].value)
                    .group().split(' / ')[1])
                  for row in p.bar_widgets.children]
    assert sorted(capacities, reverse=True) == capacities
Example #24
    def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
        flatkeys = list(flatten([keys]))
        futures = {key: Future(key, self) for key in flatkeys}

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dsk3 = {k: v for k, v in dsk2.items() if (k == v) is not True}

        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk3.items():
            dependencies[k] |= set(_deps(dsk, v))

        self._send_to_scheduler({'op': 'update-graph',
                                 'tasks': valmap(dumps_task, dsk3),
                                 'dependencies': dependencies,
                                 'keys': flatkeys,
                                 'restrictions': restrictions or {},
                                 'client': self.id})

        packed = pack_data(keys, futures)
        if raise_on_error:
            result = yield self._gather(packed)
        else:
            try:
                result = yield self._gather(packed)
                result = 'OK', result
            except Exception as e:
                result = 'error', e
        raise gen.Return(result)
Example #25
    def persist(self, collections):
        """ Persist dask collections on cluster

        Starts computation of the collection on the cluster in the background.
        Provides a new dask collection that is semantically identical to the
        previous one, but now based off of futures currently in execution.

        Parameters
        ----------
        collections: sequence or single dask object
            Collections like dask.array or dataframe or dask.value objects

        Returns
        -------
        List of collections, or single collection, depending on type of input.

        Examples
        --------
        >>> xx = executor.persist(x)  # doctest: +SKIP
        >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

        See Also
        --------
        Executor.compute
        """
        if isinstance(collections, (tuple, list, set, frozenset)):
            singleton = False
        else:
            singleton = True
            collections = [collections]

        assert all(isinstance(c, Base) for c in collections)

        groups = groupby(lambda x: x._optimize, collections)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val])
                    for opt, val in groups.items()])

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk2.items():
            dependencies[k] |= set(_deps(dsk, v))

        names = list({k for c in collections for k in flatten(c._keys())})

        self._send_to_scheduler({'op': 'update-graph',
                                 'tasks': valmap(dumps_task, dsk2),
                                 'dependencies': dependencies,
                                 'keys': names,
                                 'client': self.id})
        result = [redict_collection(c, {k: Future(k, self)
                                        for k in flatten(c._keys())})
                for c in collections]
        if singleton:
            return first(result)
        else:
            return result
Example #26
File: core.py Project: kastnerkyle/dask
def optimize(dsk, keys, **kwargs):
    if isinstance(keys, list):
        dsk2 = cull(dsk, list(core.flatten(keys)))
    else:
        dsk2 = cull(dsk, [keys])
    dsk3 = fuse(dsk2)
    dsk4 = valmap(rewrite_rules.rewrite, dsk3)
    return dsk4
Example #27
def test_compression(s3, fmt, blocksize):
    with s3_context('compress', valmap(compress[fmt], files)) as s3:
        sample, values = read_bytes('compress/test/accounts.*', s3=s3,
                                    compression=fmt, blocksize=blocksize)
        assert sample.startswith(files[sorted(files)[0]][:10])

        results = compute(*concat(values))
        assert b''.join(results) == b''.join([files[k] for k in sorted(files)])
Example #28
File: core.py Project: JDWarner/dask
def lazify(dsk):
    """
    Remove unnecessary calls to ``list`` in tasks

    See Also:
        ``dask.bag.core.lazify_task``
    """
    return valmap(lazify_task, dsk)
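
Here valmap applies lazify_task to every task in a dask graph dict, leaving the keys alone. The same key-preserving pattern, sketched with a trivial transform in place of lazify_task:

from toolz import valmap

def inc(x):
    return x + 1

dsk = {'a': (inc, 1), 'b': (inc, 'a')}  # toy graph
print(valmap(len, dsk))  # {'a': 2, 'b': 2}; values transformed, keys untouched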
Example #29
def test_delete_data(s, a, b):
    yield s.scatter(data=valmap(dumps, {'x': 1, 'y': 2, 'z': 3}))
    assert set(a.data) | set(b.data) == {'x', 'y', 'z'}
    assert merge(a.data, b.data) == {'x': 1, 'y': 2, 'z': 3}

    s.delete_data(keys=['x', 'y'])
    yield s.clear_data_from_workers()
    assert set(a.data) | set(b.data) == {'z'}
Example #30
def test_compression_text(fmt):
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_text_files('.test.accounts.*', compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k].decode() for k in sorted(files)]
Example #31
def test_many_Progresss(s, a, b):
    sched, report = Queue(), Queue()
    s.handle_queues(sched, report)
    s.update_graph(tasks=valmap(dumps_task, {
        'x': (inc, 1),
        'y': (inc, 'x'),
        'z': (inc, 'y')
    }),
                   keys=['z'],
                   dependencies={
                       'y': ['x'],
                       'z': ['y']
                   })

    bars = [Progress(keys=['z'], scheduler=s) for i in range(10)]
    yield [b.setup() for b in bars]

    while True:
        msg = yield report.get()
        if msg['op'] == 'key-in-memory' and msg['key'] == 'z':
            break

    assert all(b.status == 'finished' for b in bars)
Example #32
def test_add_worker(s, a, b):
    w = Worker(s.address, ncores=3)
    w.data["x-5"] = 6
    w.data["y"] = 1
    yield w

    dsk = {("x-%d" % i): (inc, i) for i in range(10)}
    s.update_graph(
        tasks=valmap(dumps_task, dsk),
        keys=list(dsk),
        client="client",
        dependencies={k: set() for k in dsk},
    )

    s.add_worker(
        address=w.address, keys=list(w.data), ncores=w.ncores, services=s.services
    )

    s.validate_state()

    assert w.ip in s.host_info
    assert s.host_info[w.ip]["addresses"] == {a.address, b.address, w.address}
    yield w.close()
Example #33
def test_add_worker(s, a, b):
    w = Worker(s.ip, s.port, ncores=3)
    w.data['x-5'] = 6
    w.data['y'] = 1
    yield w._start(0)

    dsk = {('x-%d' % i): (inc, i) for i in range(10)}
    s.update_graph(tasks=valmap(dumps_task, dsk),
                   keys=list(dsk),
                   client='client',
                   dependencies={k: set()
                                 for k in dsk})

    s.add_worker(address=w.address,
                 keys=list(w.data),
                 ncores=w.ncores,
                 services=s.services)

    s.validate_state()

    assert w.ip in s.host_info
    assert s.host_info[w.ip]['addresses'] == {a.address, b.address, w.address}
    yield w._close()
Example #34
def coerce_types(**kwargs):
    """
    Preprocessing decorator that applies type coercions.

    Parameters
    ----------
    **kwargs : dict[str -> (type, callable)]
         Keyword arguments mapping function parameter names to pairs of
         (from_type, to_type).

    Examples
    --------
    >>> @coerce_types(x=(float, int), y=(int, str))
    ... def func(x, y):
    ...     return (x, y)
    ...
    >>> func(1.0, 3)
    (1, '3')
    """
    def _coerce(types):
        return coerce(*types)

    return preprocess(**valmap(_coerce, kwargs))
Example #35
def test_compression(s3, fmt, blocksize):
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    if fmt not in compress:
        pytest.skip("compression function not provided")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                )
            return
        sample, values = read_bytes(
            "s3://compress/test/accounts.*", compression=fmt, blocksize=blocksize
        )
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
Example #36
def into(a, b, **kwargs):
    dialect = b.dialect.copy()
    del dialect['lineterminator']

    schema = b.schema
    if '?' in str(schema):
        schema = dshape(str(schema).replace('?', ''))

    dtypes = valmap(to_numpy_dtype, schema[0].dict)

    datenames = [
        name for name in dtypes if np.issubdtype(dtypes[name], np.datetime64)
    ]

    dtypes = dict((k, v) for k, v in dtypes.items()
                  if not np.issubdtype(v, np.datetime64))

    if 'strict' in dialect:
        del dialect['strict']

    # Pass only keyword arguments appropriate for read_csv
    kws = keywords(pd.read_csv)
    options = toolz.merge(dialect, kwargs)
    options = toolz.keyfilter(lambda k: k in kws, options)

    if b.open == gzip.open:
        options['compression'] = 'gzip'

    usecols = names = options.pop('names', b.columns)

    return pd.read_csv(b.path,
                       header=0 if b.header else None,
                       dtype=dtypes,
                       parse_dates=datenames,
                       names=names,
                       usecols=usecols,
                       **options)
Example #37
    def load_adjusted_array(self, columns, dates, assets, mask):
        raw = ffill_query_in_range(
            self._expr,
            dates[0],
            dates[-1],
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(sids[~sids.isin(assets)].index, inplace=True)

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
Example #38
def merge_ownership_periods(mappings):
    """
    Given a dict of mappings where the values are lists of
    OwnershipPeriod objects, returns a dict with the same structure with
    new OwnershipPeriod objects adjusted so that the periods have no
    gaps.

    Orders the periods chronologically, and pushes forward the end date
    of each period to match the start date of the following period. The
    end date of the last period is pushed forward to the max Timestamp.
    """
    return valmap(
        lambda v: tuple(
            OwnershipPeriod(
                a.start,
                b.start,
                a.sid,
                a.value,
            ) for a, b in sliding_window(
                2,
                concatv(
                    sorted(v),
                    # concat with a fake ownership object to make the last
                    # end date be max timestamp
                    [
                        OwnershipPeriod(
                            pd.Timestamp.max.tz_localize('utc'),
                            None,
                            None,
                            None,
                        )
                    ],
                ),
            )),
        mappings,
    )
Example #39
def test_add_worker(s, a, b):
    w = Worker(s.ip, s.port, ncores=3, ip='127.0.0.1')
    w.data['x-5'] = 6
    w.data['y'] = 1
    yield w._start(0)

    dsk = {('x-%d' % i).encode(): (inc, i) for i in range(10)}
    s.update_graph(tasks=valmap(dumps_task, dsk),
                   keys=list(dsk),
                   client='client',
                   dependencies={k: set()
                                 for k in dsk})

    s.add_worker(address=w.address,
                 keys=list(w.data),
                 ncores=w.ncores,
                 services=s.services,
                 coerce_address=False)

    s.validate_state()

    assert w.ip in s.host_info
    assert s.host_info[w.ip]['ports'] == set(map(str,
                                                 [a.port, b.port, w.port]))
Example #40
def test_nanny_process_failure(s):
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit), args=dumps((0, )), key='z')

    start = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 2

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()
Example #41
 def load_raw_arrays(self, dts, assets, fields=None):
     """股东持仓变动"""
     sids = [a.sid for a in assets]
     table = self.metadata.tables['holder']
     sql = select([
         table.c.sid, table.c.declared_date, table.c.股东, table.c.方式,
         cast(table.c.变动股本, Numeric(10, 2)),
         cast(table.c.总持仓, Integer),
         cast(table.c.占总股本比, Numeric(10, 5)),
         cast(table.c.总流通股, Integer),
         cast(table.c.占流通比, Numeric(10, 5))
     ]).where(table.c.declared_date.between(dts[0], dts[1]))
     frame = pd.DataFrame(self.engine.execute(sql).fetchall(),
                          columns=[
                              'sid', 'declared_date', '股东', '方式', '变动股本',
                              '总持仓', '占总股本比', '总流通股', '占流通比'
                          ])
     frame.set_index('sid', inplace=True)
     frame.drop_duplicates(inplace=True)
     frame_dct = unpack_df_to_component_dict(frame, 'declared_date')
     frame_dct = valmap(lambda x: x.loc[:, fields]
                        if fields else x, frame_dct)
     holder_frame = keyfilter(lambda x: x in sids, frame_dct)
     return holder_frame
Example #42
def test_multi_progressbar_widget_after_close(s, a, b):
    s.update_graph(tasks=valmap(
        dumps_task, {
            'x-1': (inc, 1),
            'x-2': (inc, 'x-1'),
            'x-3': (inc, 'x-2'),
            'y-1': (dec, 'x-3'),
            'y-2': (dec, 'y-1'),
            'e': (throws, 'y-2'),
            'other': (inc, 123)
        }),
                   keys=['e'],
                   dependencies={
                       'x-2': {'x-1'},
                       'x-3': {'x-2'},
                       'y-1': {'x-3'},
                       'y-2': {'y-1'},
                       'e': {'y-2'}
                   })

    p = MultiProgressWidget(['x-1', 'x-2', 'x-3'], scheduler=(s.ip, s.port))
    yield p.listen()

    assert 'x' in p.bars
Example #43
 def load_raw_arrays(self, dts, assets, fields=None):
     sids = [a.sid for a in assets]
     # Fetch the data
     table = self.metadata.tables['massive']
     sql = select([
         table.c.declared_date, table.c.sid,
         cast(table.c.bid_price, Numeric(10, 2)),
         cast(table.c.discount, Numeric(10, 5)),
         cast(table.c.bid_volume, Integer), table.c.buyer, table.c.seller,
         table.c.cjeltszb
     ]).where(table.c.declared_date.between(dts[0], dts[1]))
     frame = pd.DataFrame(self.engine.execute(sql).fetchall(),
                          columns=[
                              'declared_date', 'sid', 'bid_price',
                              'discount', 'bid_volume', 'buyer', 'seller',
                              'cjeltszb'
                          ])
     frame.set_index('sid', inplace=True)
     frame.drop_duplicates(inplace=True)
     frame_dct = unpack_df_to_component_dict(frame, 'declared_date')
     frame_dct = valmap(lambda x: x.loc[:, fields]
                        if fields else x, frame_dct)
     massive_frame = keyfilter(lambda x: x in sids, frame_dct)
     return massive_frame
Example #44
    def load_adjusted_array(self, columns, dates, assets, mask):
        expr = self._expr
        filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
        lower = odo(
            bz.by(
                filtered[SID_FIELD_NAME],
                timestamp=filtered[TS_FIELD_NAME].max(),
            ).timestamp.min(), pd.Timestamp, **self._odo_kwargs)
        if pd.isnull(lower):
            # If there is no lower date, just query for data in the date
            # range. It must all be null anyways.
            lower = dates[0]

        raw = odo(
            expr[(expr[TS_FIELD_NAME] >= lower)
                 & (expr[TS_FIELD_NAME] <= dates[-1])], pd.DataFrame,
            **self._odo_kwargs)

        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(sids[~(sids.isin(assets) | sids.notnull())].index,
                 inplace=True)

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
Example #45
def test_nanny_process_failure(c, s):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield n._start()
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    original_process = n.process
    ww = rpc(n.worker_address)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(CommClosedError):
        yield c._run(sys.exit, 0, workers=[n.worker_address])

    start = time()
    while n.process is original_process:  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while not isalive(n.process):  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - start < 5

    start = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    ww.close_rpc()
    s.stop()
Example #46
File: server.py Project: spyamine/blaze
 def __init__(self,
              data=None,
              formats=None,
              authorization=None,
              allow_profiler=False,
              profiler_output=None,
              profile_by_default=False):
     if isinstance(data, collections.Mapping):
         data = valmap(lambda v: v.data
                       if isinstance(v, _Data) else v, data)
     elif isinstance(data, _Data):
         data = data._resources()
     app = self.app = Flask('blaze.server.server')
     if data is None:
         data = {}
     app.register_blueprint(api,
                            data=data,
                            formats=formats if formats is not None else
                            (json, ),
                            authorization=authorization,
                            allow_profiler=allow_profiler,
                            profiler_output=profiler_output,
                            profile_by_default=profile_by_default)
     self.data = data
Example #47
def expect_element(__funcname=_qualified_name, **named):
    """
    Preprocessing decorator that verifies inputs are elements of some
    expected collection.
    Notes
    -----
    A special argument, __funcname, can be provided as a string to override the
    function name shown in error messages.  This is most often used on __init__
    or __new__ methods to make errors refer to the class name instead of the
    function name.
    This uses the `in` operator (__contains__) to make the containment check.
    This allows us to use any custom container as long as the object supports
    the container protocol.
    """

    def _expect_element(collection):
        if isinstance(collection, (set, frozenset)):
            # Special case the error message for set and frozen set to make it
            # less verbose.
            collection_for_error_message = tuple(sorted(collection))
        else:
            collection_for_error_message = collection

        template = (
            "%(funcname)s() expected a value in {collection} "
            "for argument '%(argname)s', but got %(actual)s instead."
        ).format(collection=collection_for_error_message)
        return make_check(
            ValueError,
            template,
            complement(op.contains(collection)),
            repr,
            funcname=__funcname,
        )

    return preprocess(**valmap(_expect_element, named))
Example #48
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, bokeh_prefix,
         tls_ca_file, tls_cert, tls_key):
    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address,
                                                        strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          contact_address=contact_address,
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n.start(addr) for n in nannies]
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #49
def call_neural_network(state):

    logger = logging.getLogger(__name__)

    # Pre-process the inputs
    # ----------------------
    kwargs = {}
    dt = state.pop('dt')
    for key, val in state.items():
        if isinstance(val, np.ndarray):
            kwargs[key] = val

    state['SOLIN'] = compute_insolation(state['lat'], state['day'])

    # Compute the output of all the models
    # ------------------------------------
    merged_outputs = {}
    for model in MODELS:
        logger.info(f"Calling NN")
        nz = 34  #len(model.heights)
        lower_atmos_kwargs = get_lower_atmosphere(kwargs, nz)

        # add a singleton dimension and convert to float32
        lower_atmos_kwargs = {
            key: val[np.newaxis].astype(np.float32)
            for key, val in lower_atmos_kwargs.items()
        }
        # call the neural network
        out = call_with_numpy_dict(model, lower_atmos_kwargs)
        # remove the singleton first dimension
        out = valmap(np.squeeze, out)
        out = expand_lower_atmosphere(state,
                                      out,
                                      n_in=nz,
                                      n_out=state['QT'].shape[0])

        renamed = {}
        for key in out:
            renamed['F' + key + 'NN'] = out[key]

        merged_outputs.update(renamed)

    # update the state
    state.update(merged_outputs)

    # Debugging info below here
    # -------------------------
    nstep = int(state['nstep'])
    output_this_step = OUTPUT_INTERVAL and (nstep - 1) % OUTPUT_INTERVAL == 0
    if DEBUG:
        save_debug({
            'args': (kwargs, dt),
            'out': merged_outputs,
        }, state)

    try:
        logger.info("Mean Precip: %f" % out['Prec'].mean())
    except KeyError:
        pass

    if output_this_step:
        zarr_logger.append_all(kwargs)
        zarr_logger.append_all(merged_outputs)
        zarr_logger.append('time', np.array([state['day']]))
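
The valmap(np.squeeze, out) step removes the singleton batch dimension that was added before calling the model, for every array in the output dict at once. A minimal sketch (hypothetical field names):

import numpy as np
from toolz import valmap

out = {'QT': np.zeros((1, 34)), 'SLI': np.zeros((1, 34))}
squeezed = valmap(np.squeeze, out)
print({k: v.shape for k, v in squeezed.items()})  # {'QT': (34,), 'SLI': (34,)}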
Example #50
                for base in ['A', 'C', 'G', 'T']:
                    new = cb[:i] + base + cb[i+1:]
                    if cb == new:
                        continue
                    if new in counter_some_error:
                        shadow_counts.append(counter_some_error[new])
            shadows[cb] = sum(shadow_counts)
            real[cb] = freq
            df.append({'n_shadows': shadows[cb], 'n_real': freq, 'cb': cb})
        df = pd.DataFrame(df)
        df['error_rate'] = df['n_shadows'] / df['n_real']
        # error_rates = [shadows[cb] / real[cb] for cb, freq in most_common_CBS]

        return df

    dfs = toolz.valmap(lambda x: estimate_error_rate_shadows(x[0], x[1]), res_dict)

    """
    the above its the per read error rate:  #correct reads / #wrong reads

    But how does that translate to a per-base error:
    - to get 16BP correct, we need (1-p_err)^16
    - a single error is 16 * err * (1-err)^15

    %correct = (1-p_err)^16

    c**(1/16) = 1-p
    p = 1-  c**(1/16)

    """
    right = U0 / (U1 + U0)
Example #51
def test_read_csv_compression(fmt, blocksize):
    files2 = valmap(compress[fmt], csv_files)
    with filetexts(files2, mode='b'):
        df = dd.read_csv('2014-01-*.csv', compression=fmt, blocksize=blocksize)
        assert_eq(df.compute(scheduler='sync').reset_index(drop=True),
                  expected.reset_index(drop=True), check_dtype=False)
Example #52
File: test_csv.py Project: jseabold/dask
def test_read_csv_compression(fmt, blocksize):
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode='b'):
        df = read_csv('2014-01-*.csv', compression=fmt, blocksize=blocksize)
        eq(df.compute(get=get_sync).reset_index(drop=True),
           expected.reset_index(drop=True), check_dtype=False)
Example #53
def main(
    scheduler,
    host,
    worker_port,
    listen_address,
    contact_address,
    nanny_port,
    nthreads,
    nprocs,
    nanny,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    protocol,
    death_timeout,
    preload,
    preload_argv,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    g0, g1, g2 = gc.get_threshold(
    )  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address,
                                                        strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {"worker_port": worker_port, "listen_address": listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if (not scheduler and not scheduler_file
            and dask.config.get("scheduler-address", None) is None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          preload_argv=preload_argv,
          security=sec,
          contact_address=contact_address,
          interface=interface,
          protocol=protocol,
          host=host,
          port=port,
          dashboard_address=dashboard_address if dashboard else None,
          service_kwargs={"bokhe": {
              "prefix": dashboard_prefix
          }},
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #54
0
def test_repeated_groupby():
    b = db.range(10, npartitions=4)
    c = b.groupby(lambda x: x % 3)
    assert valmap(len, dict(c)) == valmap(len, dict(c))
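
Example #54 compares valmap(len, ...) over the same dask.bag groupby result twice; the valmap part itself needs no dask. A plain-dict sketch of what it computes:

from toolz import groupby, valmap

grouped = groupby(lambda x: x % 3, range(10))
assert valmap(len, grouped) == {0: 4, 1: 3, 2: 3}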
Example #55
0
def expect_kinds(**named):
    """
    Preprocessing decorator that verifies inputs have expected dtype kinds.

    Examples
    --------
    >>> from numpy import int64, int32, float32
    >>> @expect_kinds(x='i')
    ... def foo(x):
    ...    return x
    ...
    >>> foo(int64(2))
    2
    >>> foo(int32(2))
    2
    >>> foo(float32(2))  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    Traceback (most recent call last):
       ...
    TypeError: ...foo() expected a numpy object of kind 'i' for argument 'x',
    but got 'f' instead.
    """
    for name, kind in iteritems(named):
        if not isinstance(kind, (str, tuple)):
            raise TypeError(
                "expect_kinds() expected a string or tuple of strings"
                " for argument {name!r}, but got {kind} instead.".format(
                    name=name, kind=kind,
                )
            )

    @preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_kind(kinds):
        """
        Factory for kind-checking functions that work the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # kind.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.kind
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a numpy object of kind {kinds} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=_qualified_name(func),
                kinds=' or '.join(map(repr, kinds)),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_kind, named))
Example #56
0
def expect_dtypes(__funcname=_qualified_name, **named):
    """
    Preprocessing decorator that verifies inputs have expected numpy dtypes.
    Usage
    -----
    >>> from numpy import dtype, arange, int8, float64
    >>> @expect_dtypes(x=dtype(int8))
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(arange(3, dtype=int8), 'foo')
    (array([0, 1, 2], dtype=int8), 'foo')
    >>> foo(arange(3, dtype=float64), 'foo')  # doctest: +NORMALIZE_WHITESPACE
    ...                                       # doctest: +ELLIPSIS
    Traceback (most recent call last):
       ...
    TypeError: ...foo() expected a value with dtype 'int8' for argument 'x',
    but got 'float64' instead.
    """
    for name, type_ in iteritems(named):
        if not isinstance(type_, (dtype, tuple)):
            raise TypeError(
                "expect_dtypes() expected a numpy dtype or tuple of dtypes"
                " for argument {name!r}, but got {dtype} instead.".format(
                    name=name, dtype=type_,
                )
            )

    if isinstance(__funcname, str):
        def get_funcname(_):
            return __funcname
    else:
        get_funcname = __funcname

    @preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_dtype(dtypes):
        """
        Factory for dtype-checking functions that work with the @preprocess
        decorator.
        """

        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # name.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.name
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a value with dtype {dtype_str} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=get_funcname(func),
                dtype_str=' or '.join(repr(d.name) for d in dtypes),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            if getattr(argvalue, 'dtype', object()) not in dtypes:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_dtype, named))
Example #57
0
File: client.py  Project: esloch/ibis
except AttributeError:
    infer_pandas_dtype = pd.lib.infer_dtype

_ibis_dtypes = toolz.valmap(
    np.dtype,
    {
        dt.Boolean: np.bool_,
        dt.Null: np.object_,
        dt.Array: np.object_,
        dt.String: np.object_,
        dt.Binary: np.object_,
        dt.Date: 'datetime64[ns]',
        dt.Time: 'timedelta64[ns]',
        dt.Timestamp: 'datetime64[ns]',
        dt.Int8: np.int8,
        dt.Int16: np.int16,
        dt.Int32: np.int32,
        dt.Int64: np.int64,
        dt.UInt8: np.uint8,
        dt.UInt16: np.uint16,
        dt.UInt32: np.uint32,
        dt.UInt64: np.uint64,
        dt.Float32: np.float32,
        dt.Float64: np.float64,
        dt.Decimal: np.object_,
        dt.Struct: np.object_,
    },
)

_numpy_dtypes = toolz.keymap(
    np.dtype,
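
The mapping above is built with valmap(np.dtype, ...), which normalizes every value, whether given as a numpy scalar type or a string, into a numpy.dtype object. A small self-contained illustration of that normalization, with a made-up input dict:

import numpy as np
from toolz import valmap

raw = {'flag': np.bool_, 'when': 'datetime64[ns]', 'count': np.int64}
normalized = valmap(np.dtype, raw)
assert normalized['when'] == np.dtype('datetime64[ns]')
assert all(isinstance(v, np.dtype) for v in normalized.values())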
Example #58
0
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #59
0
File: test_bag.py  Project: fortiema/dask
def test_groupby_with_indexer():
    b = db.from_sequence([[1, 2, 3], [1, 4, 9], [2, 3, 4]])
    result = dict(b.groupby(0))
    assert valmap(sorted, result) == {1: [[1, 2, 3], [1, 4, 9]],
                                      2: [[2, 3, 4]]}
Example #60
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, reconnect, resources, bokeh,
         bokeh_port, local_directory, scheduler_file, interface, death_timeout,
         preload, bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [
        t(scheduler,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        try:
            if nanny:
                yield [n._close(timeout=2) for n in nannies]
        finally:
            loop.stop()

    def handle_signal(signum, frame):
        logger.info("Exiting on signal %d", signum)
        if loop._running:
            loop.add_callback_from_signal(loop.stop)
        else:
            exit(0)

    # NOTE: We can't use the generic install_signal_handlers() function from
    # distributed.cli.utils because we're handling the signal differently.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    for n in nannies:
        n.start(addr)

    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")

    # Clean exit: unregister all workers from scheduler
    loop.run_sync(close_all)