Example #1
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(test_bucket_name+'/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n', s3=s3)
    _, values2 = read_bytes(test_bucket_name+'/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo', s3=s3)
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # delimiter not at the end
    d = b'}'
    _, values = read_bytes(test_bucket_name+'/test/accounts*',
                           blocksize=blocksize, delimiter=d, s3=s3)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
Example #2
File: sql.py Project: giangzuzana/blaze
def compute_up(expr, data, scope=None, **kwargs):
    data = lower_column(data)
    grouper = compute(
        expr.grouper,
        scope,
        post_compute=False,
        return_type='native',
        **kwargs
    )

    app = expr.apply
    reductions = [
        compute(
            val,
            data,
            post_compute=None,
            return_type='native',
        ).label(name)
        for val, name in zip(app.values, app.fields)
    ]

    froms = list(unique(chain(get_all_froms(grouper),
                              concat(map(get_all_froms, reductions)))))
    inner_cols = list(getattr(grouper, 'inner_columns', [grouper]))
    grouper_cols = inner_cols[:]
    inner_cols.extend(concat(
        getattr(getattr(r, 'element', None), 'inner_columns', [r])
        for r in reductions
    ))
    wheres = unify_wheres([grouper] + reductions)
    sel = unify_froms(sa.select(inner_cols, whereclause=wheres), froms)
    return sel.group_by(*grouper_cols)
Example #3
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
Example #4
  def diagnostic_yield(self, metric='completeness', cutoff=1,
                       superblock_ids=None, group_id=None, sample_ids=None):
    """Calculate diagnostic yield."""
    # extract column to filter on
    metric_column = getattr(BlockData, metric)

    # set up the base query for all blocks
    total_query = self.total_count(BlockData)

    if superblock_ids:
      # apply the superblock filter on the Block class level
      total_query = total_query.join(BlockData.parent)\
                               .filter(Block.superblock_id.in_(superblock_ids))

    # extend base query to include only passed blocks
    pass_query = total_query.filter(metric_column >= cutoff)

    # optionally limit query
    queries = [limit_query(query, group=group_id, samples=sample_ids)
               for query in (total_query, pass_query)]

    # group multiple queries by sample ID (first column)
    metrics = groupby(get(0), concat(queries))

    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    combined = (unique(concat(values)) for values in itervalues(metrics))

    # calculate diagnostic yield by simple division
    for sample_id, group_id, total, covered in combined:
      yield sample_id, group_id, (covered / total)
Example #5
File: trf1.py Project: jni/cafe
def scatter(kd, control, colors=['orange', 'blue'], **kwargs):
    """Show a jittered scatterplot of the measurements.

    Parameters
    ----------
    kd : list of list of float
        The list of `trf_quantify` results for all AUKB knockdown
        images in the dataset. (Each result is itself a list.)
    control : list of list of float
        The list of `trf_quantify` results for all control images in
        the dataset.
    colors : list of two matplotlib colorspecs, optional
        The colors corresponding to AUKB-KD (0) and control (1) data
        points on the scatterplot.
    
    Additional Parameters
    ---------------------
    **kwargs : keyword arguments
        Additional keyword arguments passed directly to
        ``plt.scatter``.

    Returns
    -------
    fig : matplotlib PathCollection
        The collection returned by the call to ``plt.scatter``.
    """
    xs = list(tz.concat([i + 0.2 * np.random.randn(n)
                         for i, n in enumerate(map(len, kd + control))]))
    color_vector = ([colors[0]] * sum(map(len, kd)) +
                    [colors[1]] * sum(map(len, control)))
    ys = list(tz.concat(kd + control))
    fig = plt.scatter(xs, ys, c=color_vector, **kwargs)
    plt.xlim(0, max(xs) + 1)
    plt.ylim(0, max(ys) + 1)
    return fig
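A small toy run of the jitter construction used above, with made-up kd/control lists; only numpy and toolz are assumed:

import numpy as np
import toolz as tz

kd = [[1.0, 2.0], [3.0]]        # made-up measurements for two knockdown images
control = [[4.0, 5.0, 6.0]]     # made-up measurements for one control image
xs = list(tz.concat([i + 0.2 * np.random.randn(n)
                     for i, n in enumerate(map(len, kd + control))]))
print(len(xs))  # 6: one jittered x position per measurement, centred on its image index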
Example #6
def test_modification_time_read_bytes():
    with s3_context('compress', files) as s3:
        _, a = read_bytes('compress/test/accounts.*', s3=s3)
        _, b = read_bytes('compress/test/accounts.*', s3=s3)

        assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context('compress', valmap(double, files)) as s3:
        _, c = read_bytes('compress/test/accounts.*', s3=s3)

    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]
Example #7
 def start(self):
     self.status = 'running'
     logger.debug("Start Progress Plugin")
     self._start()
     if not self.keys or not any(v for v in self.keys.values()):
         self.stop()
     elif all(k in self.scheduler.exceptions_blame for k in
             concat(self.keys.values())):
         key = next(k for k in concat(self.keys.values()) if k in
                 self.scheduler.exceptions_blame)
         self.stop(exception=True, key=key)
Example #8
File: sql.py Project: blaze/blaze
def compute_up(expr, args, **kwargs):
    from_objs = list(unique(concat(map(get_all_froms, args))))
    if len(from_objs) > 1:
        # TODO: how do you do this in sql? please send help
        raise ValueError('only columns from the same table can be merged')

    cols = list(unique(concat(map(get_unsafe_inner_columns, args, expr.args))))
    sel = sa.select(cols, from_obj=from_objs[0])
    where = unify_wheres(args)
    if where is not None:
        sel = sel.where(where)
    return sel
Example #9
def render_tabular(api, options=None):
  """Entry point for the tabular reporter interface."""
  # determine separator
  separator = options.get('report.separator', '\t')
  human = options.get('report.human')
  panel = options.get('report.panel')
  samples = options.get('report.samples')
  group = options.get('report.group')

  # read gene panel file if it has been set
  if panel:
    superblock_ids = [line.rstrip() for line in panel]
  else:
    superblock_ids = None

  # get sample ID, group and cutoff from metadata
  sample_query = limit_query(api.samples(), group=group, samples=samples)
  metadata = ((sample.id, sample.group_id, sample.cutoff)
              for sample in sample_query)

  # get the data
  base_query = limit_query(api.average_metrics(superblock_ids=superblock_ids),
                           group=group,
                           samples=samples)

  queries = [metadata,
             base_query,
             api.diagnostic_yield(superblock_ids=superblock_ids,
                                  group_id=group, sample_ids=samples),
             api.sex_checker(group_id=group, sample_ids=samples)]

  # group multiple queries by sample ID (first column)
  key_metrics = groupby(get(0), concat(queries))

  # get the column names dynamically from the query
  headers = concatv(['sample_id', 'group_id', 'cutoff'],
                    (column['name'] for column
                     in base_query.column_descriptions),
                    ['diagnostic yield', 'gender'])

  unique_headers = unique(headers)

  # iterate over all values, concat different query results, and keep
  # only the unique values (excluding second sample_id)
  data = (unique(concat(values)) for values in itervalues(key_metrics))

  if human:
    # export key_metrics in a more human friendly format
    return tabulate(data, unique_headers)

  # yield headers
  return '\n'.join(cons('#' + separator.join(unique_headers),
                        stringify_list(data, separator=separator)))
Example #10
def compile_components(summary, schema):
    """Given a ``Summary`` object and a table schema, return 5 sub-functions.

    Parameters
    ----------
    summary : Summary
        The expression describing the aggregations to be computed.

    Returns
    -------
    A tuple of the following functions:

    ``create(shape)``
        Takes the aggregate shape, and returns a tuple of initialized numpy
        arrays.

    ``info(df)``
        Takes a dataframe, and returns preprocessed 1D numpy arrays of the
        needed columns.

    ``append(i, x, y, *aggs_and_cols)``
        Appends the ``i``th row of the table to the ``(x, y)`` bin, given the
        base arrays and columns in ``aggs_and_cols``. This does the bulk of the
        work.

    ``combine(base_tuples)``
        Combine a list of base tuples into a single base tuple. This forms the
        reducing step in a reduction tree.

    ``finalize(aggs)``
        Given a tuple of base numpy arrays, returns the finalized
        ``dynd`` array.
    """
    paths, reds = zip(*preorder_traversal(summary))

    # List of base reductions (actually computed)
    bases = list(unique(concat(r._bases for r in reds)))
    dshapes = [b.out_dshape(schema) for b in bases]
    # List of tuples of (append, base, input columns, temps)
    calls = [_get_call_tuples(b, d) for (b, d) in zip(bases, dshapes)]
    # List of unique column names needed
    cols = list(unique(concat(pluck(2, calls))))
    # List of temps needed
    temps = list(pluck(3, calls))

    create = make_create(bases, dshapes)
    info = make_info(cols)
    append = make_append(bases, cols, calls)
    combine = make_combine(bases, dshapes, temps)
    finalize = make_finalize(bases, summary, schema)

    return create, info, append, combine, finalize
Example #11
def test_chunk_datetime():
    data = [[1, 'Alice', 100, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [2, 'Bob', 200, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [3, 'Alice', -300, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [4, 'Charlie', 400, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [5, 'Edith', 200, datetime.datetime(2014, 10, 1, 1, 1, 1)]]

    t = Symbol('t', 'var * {id: int, name: string, amount: int, when: datetime}')

    c = ChunkIterable(data, chunksize=2)
    assert list(concat(compute(t.when.day, c))) == [1] * 5
    assert list(concat(compute(t.when.date, c))) == \
            [datetime.date(2014, 10, 1)] * 5
Example #12
def test_deterministic_key_names(hdfs):
    data = b'abc\n' * int(1e3)
    fn = '%s/file' % basedir

    with hdfs.open(fn, 'wb', replication=1) as fil:
        fil.write(data)

    _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c', sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
Example #13
def test_join():
    cities = TableSymbol('cities', schema='{id: int, city: string}')
    j = join(t, cities, 'id')

    city_data = [[1, 'NYC'], [1, 'Chicago'], [5, 'Paris']]

    assert set(concat(compute(join(cities, t, 'id')[['name', 'city']],
                              {t: c, cities: city_data}))) == \
            set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))

    assert set(concat(compute(join(t, cities, 'id')[['name', 'city']],
                              {t: c, cities: city_data}))) == \
            set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))
Example #14
def test_join():
    cities = symbol('cities', dshape='var * {id: int, city: string}')
    j = join(t, cities, 'id')

    city_data = [[1, 'NYC'], [1, 'Chicago'], [5, 'Paris']]

    assert set(concat(compute(j[['name', 'city']],
                              {t: c, cities: city_data}))) == \
            set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))

    assert set(concat(compute(j[['name', 'city']],
                              {t: c, cities: city_data}))) == \
            set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))
Example #15
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'abc\n' * int(1e3)
        fn = '%s/file' % basedir

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c')

        assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
        assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
Example #16
File: numpy.py Project: CaptainAL/Spyder
def deserialize(bytes, dtype, copy=False):
    if dtype == 'O':
        try:
            l = list(concat(map(msgpack.unpackb, framesplit(bytes))))
        except:
            l = list(concat(map(pickle.loads, framesplit(bytes))))

        l = decode(l)

        return np.array(l, dtype='O')
    else:
        result = np.frombuffer(bytes, dtype)
        if copy:
            result = result.copy()
        return result
Example #17
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')

        results = compute(*concat(values))
        assert set(results) == set(files.values())
Example #18
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
Example #19
File: split.py Project: vitan/blaze
def aggregate_shape(leaf, expr, chunk, chunk_expr):
    """ The shape of the intermediate aggregate

    >>> leaf = Symbol('leaf', '10 * 10 * int')
    >>> expr = leaf.sum(axis=0)
    >>> chunk = Symbol('chunk', '3 * 3 * int') # 3 does not divide 10
    >>> chunk_expr = chunk.sum(axis=0, keepdims=1)

    >>> aggregate_shape(leaf, expr, chunk, chunk_expr)
    (4, 10)
    """
    if datashape.var in concat(map(shape, [leaf, expr, chunk, chunk_expr])):
        return (datashape.var, ) * leaf.ndim

    numblocks = [int(floor(l / c)) for l, c in zip(leaf.shape, chunk.shape)]
    last_chunk_shape = [l % c for l, c in zip(leaf.shape, chunk.shape)]

    if builtins.sum(last_chunk_shape) != 0:
        last_chunk = Symbol(chunk._name,
                            DataShape(*(last_chunk_shape + [chunk.dshape.measure])))
        last_chunk_expr = chunk_expr._subs({chunk: last_chunk})
        last_chunk_shape = shape(last_chunk_expr)


    return tuple(int(floor(l / c)) * ce + lce
            for l, c, ce, lce
            in zip(shape(leaf), shape(chunk), shape(chunk_expr), last_chunk_shape))
Example #20
    def f(c, a, b):
        keys = yield _scatter((c.ip, c.port), [1, 2, 3])

        assert merge(a.data, b.data) == \
                {k: i for k, i in zip(keys, [1, 2, 3])}

        assert set(c.who_has) == set(keys)
        assert all(len(v) == 1 for v in c.who_has.values())

        keys2, who_has, nbytes = yield scatter_to_workers([a.address, b.address],
                                                          [4, 5, 6])

        m = merge(a.data, b.data)

        for k, v in zip(keys2, [4, 5, 6]):
            assert m[k] == v

        assert isinstance(who_has, dict)
        assert set(concat(who_has.values())) == {a.address, b.address}
        assert len(who_has) == len(keys2)

        assert isinstance(nbytes, dict)
        assert set(nbytes) == set(who_has)
        assert all(isinstance(v, int) for v in nbytes.values())

        result = yield _gather((c.ip, c.port), keys2)
        assert result == [4, 5, 6]
Example #21
 def schema(self):
     for c in self.children:
         if not isinstance(c.schema[0], Record):
             raise TypeError("All schemas must have Record shape.  Got %s" %
                             c.schema[0])
     return dshape(Record(list(concat(c.schema[0].parameters[0] for c in
         self.children))))
Example #22
    def test_current_session(self):
        regular_minutes = self.trading_calendar.minutes_for_sessions_in_range(
            self.equity_minute_bar_days[0],
            self.equity_minute_bar_days[-1]
        )

        bts_minutes = days_at_time(
            self.equity_minute_bar_days,
            time(8, 45),
            "US/Eastern"
        )

        # some other non-market-minute
        three_oh_six_am_minutes = days_at_time(
            self.equity_minute_bar_days,
            time(3, 6),
            "US/Eastern"
        )

        all_minutes = [regular_minutes, bts_minutes, three_oh_six_am_minutes]
        for minute in list(concat(all_minutes)):
            bar_data = self.create_bardata(lambda: minute)

            self.assertEqual(
                self.trading_calendar.minute_to_session_label(minute),
                bar_data.current_session
            )
Example #23
def write_tables(fname, models, year):
    """
    Write all tables injected into `models` to a pandas.HDFStore file.
    If year is not None it will be used to prefix the table names so that
    multiple years can go in the same file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed
        at the end of this function.
    models : list of str
        Models from which to gather injected tables for saving.
    year : int or None
        If an integer, used as a prefix along with table names for
        labeling DataFrames in the HDFStore.

    """
    models = (get_model(m) for m in toolz.unique(models))
    table_names = toolz.unique(toolz.concat(m._tables_used() for m in models))
    tables = (get_table(t) for t in table_names)

    key_template = '{}/{{}}'.format(year) if year is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            store[key_template.format(t.name)] = t.to_frame()
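The deduplication idiom this function relies on is simply toolz.unique over toolz.concat; a tiny sketch with made-up table names:

import toolz

tables_per_model = [['households', 'buildings'], ['buildings', 'jobs']]  # made-up names
print(list(toolz.unique(toolz.concat(tables_per_model))))
# -> ['households', 'buildings', 'jobs']  (first-seen order, duplicates dropped)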
Example #24
def test_repeat():
    x = np.random.random((10, 11, 13))
    d = da.from_array(x, chunks=(4, 5, 3))

    repeats = [1, 2, 5]
    axes = [-3, -2, -1, 0, 1, 2]

    for r in repeats:
        for a in axes:
            assert_eq(x.repeat(r, axis=a), d.repeat(r, axis=a))

    assert_eq(d.repeat(2, 0), da.repeat(d, 2, 0))

    with pytest.raises(NotImplementedError):
        da.repeat(d, np.arange(10))

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2, None)

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2)

    for invalid_axis in [3, -4]:
        with pytest.raises(ValueError):
            da.repeat(d, 2, axis=invalid_axis)

    x = np.arange(5)
    d = da.arange(5, chunks=(2,))

    assert_eq(x.repeat(3), d.repeat(3))

    for r in [1, 2, 3, 4]:
        assert all(concat(d.repeat(r).chunks))
Example #25
File: sql.py Project: jcrist/odo
def batch(sel, chunksize=10000, bind=None):
    """Execute `sel`, streaming one row at a time while fetching from the database in
    batches of size `chunksize`.

    Parameters
    ----------
    sel : sa.sql.Selectable
        Selectable to execute
    chunksize : int, optional, default 10000
        Number of rows to fetch from the database
    """

    def rowterator(sel, chunksize=chunksize):
        with getbind(sel, bind).connect() as conn:
            result = conn.execute(sel)
            yield result.keys()

            for rows in iter_except(curry(result.fetchmany, size=chunksize), sa.exc.ResourceClosedError):
                if rows:
                    yield rows
                else:
                    return

    terator = rowterator(sel)
    return next(terator), concat(terator)
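A toy, database-free illustration of the final line's pattern: peel the header row off the generator with next, then lazily chain the remaining chunks with concat (the chunk contents below are invented):

from toolz import concat

def rowterator():
    yield ('id', 'name')          # first item: the column names
    yield [(1, 'a'), (2, 'b')]    # then batches of rows
    yield [(3, 'c')]

it = rowterator()
headers, rows = next(it), concat(it)
print(headers)     # ('id', 'name')
print(list(rows))  # [(1, 'a'), (2, 'b'), (3, 'c')]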
Example #26
File: anrat.py Project: rewonc/treeano
    def init_state(self, network):
        super(ANRATNode, self).init_state(network)

        inits = list(toolz.concat(network.find_hyperparameters(["inits"], [])))
        # setting initial lambda to 5 instead of 10, because 10 is too large
        # for the default parameters
        # TODO might also want to add clipping to cap the value of lambda
        initial_lambda = network.find_hyperparameter(["anrat_initial_lambda"], 5)
        if ANRAT_USE_LOG_LAMBDA:
            initial_lambda = np.log(initial_lambda)

        lambda_vw = network.create_vw(
            name="lambda",
            is_shared=True,
            shape=(),
            tags={"parameter"},
            inits=inits + [treeano.inits.ConstantInit(initial_lambda)],
        )
        p = network.find_hyperparameter(["nrae_p"], 2)
        q = network.find_hyperparameter(["nrae_q"], 2)
        r = network.find_hyperparameter(["anrat_r"], 1)
        alpha = network.find_hyperparameter(["anrat_alpha", "alpha"], 0.1)
        i32_target = network.find_hyperparameter(["i32_target"], False)
        lambda_var = lambda_vw.variable

        if ANRAT_USE_LOG_LAMBDA:
            lambda_var = T.exp(lambda_var)

        cost_function = functools.partial(_ANRAT, lambda_=lambda_var, p=p, q=q, r=r, alpha=alpha, i32_target=i32_target)
        network.set_hyperparameter(self.name + "_elementwise", "cost_function", cost_function)
Example #27
File: bcolz.py Project: leolujuyi/blaze
def compute_down(expr, data, chunksize=2**20, map=map, **kwargs):
    leaf = expr._leaves()[0]

    # If the bottom expression is a projection or field then want to do
    # compute_up first
    children = set(e for e in expr._traverse()
                   if isinstance(e, Expr)
                   and any(i is expr._leaves()[0] for i in e._inputs))
    if len(children) == 1 and isinstance(first(children), (Field, Projection)):
        raise NotImplementedError()


    chunk = symbol('chunk', chunksize * leaf.schema)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    data_parts = partitions(data, chunksize=(chunksize,))

    parts = list(map(curry(compute_chunk, data, chunk, chunk_expr),
                           data_parts))

    if isinstance(parts[0], np.ndarray):
        intermediate = np.concatenate(parts)
    elif isinstance(parts[0], pd.DataFrame):
        intermediate = pd.concat(parts)
    elif isinstance(parts[0], Iterable):
        intermediate = list(concat(parts))
    else:
        raise TypeError(
            "Don't know how to concatenate objects of type %s" % type(parts[0]))

    return compute(agg_expr, {agg: intermediate})
Example #28
File: core.py Project: MattMing/zipline
 def load_adjusted_array(self, columns, dates, assets, mask):
     return dict(
         concat(map(
             partial(self._load_dataset, dates, assets, mask),
             itervalues(groupby(getdataset, columns))
         ))
     )
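The dict(concat(...)) pattern above merges whatever (column, array) pairs each per-dataset loader presumably yields into one mapping; a minimal sketch with invented pairs:

from toolz import concat

pairs_per_dataset = [[('open', 1), ('close', 2)], [('volume', 3)]]  # invented (column, value) pairs
print(dict(concat(pairs_per_dataset)))  # -> {'open': 1, 'close': 2, 'volume': 3}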
Example #29
def scatter_to_workers(center, ncores, data, key=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based on
    how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    center = coerce_to_rpc(center)
    if key is None:
        key = str(uuid.uuid1())

    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        ncores = {worker: 1 for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = ("%s-%d" % (key, i) for i in count(0))

    L = list(zip(cycle(workers), names, data))
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v} for k, v in d.items()}

    yield [rpc(ip=w_ip, port=w_port).update_data(data=v, close=True) for (w_ip, w_port), v in d.items()]

    result = [RemoteData(b, center.ip, center.port, result=c) for a, b, c in L]

    raise Return(result)
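The round-robin worker list built by the concat line above can be checked in isolation (worker names and core counts here are made up):

from toolz import concat

ncores = {'worker-1': 2, 'worker-2': 1}
workers = list(concat([w] * nc for w, nc in ncores.items()))
print(workers)  # -> ['worker-1', 'worker-1', 'worker-2']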
Example #30
File: core.py Project: kastnerkyle/dask
 def compute(self, **kwargs):
     results = self.get(self.dask, self._keys(), **kwargs)
     if isinstance(results[0], Iterable):
         results = concat(results)
     if not isinstance(results, Iterator):
         results = iter(results)
     return results
Example #31
def aconcat(seqs):
    """Like `toolz.concat`, but it returns an array instead of an iterator."""
    return np.array(list(toolz.concat(seqs)))
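A minimal standalone check of aconcat, assuming only numpy and toolz are installed:

import numpy as np
import toolz

def aconcat(seqs):
    return np.array(list(toolz.concat(seqs)))

print(aconcat([[1, 2], [3, 4], [5]]))  # -> [1 2 3 4 5]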
Example #32
File: dask.py Project: nkhuyu/blaze
 def compute_broadcast(expr, *data, **kwargs):
     expr_inds = tuple(range(ndim(expr)))[::-1]
     func = get_numba_ufunc(expr)
     return atop(func,
                 expr_inds,
                 *concat((dat, tuple(range(ndim(dat))[::-1])) for dat in data))
Example #33
def get_all_froms(function):
    return list(unique(concat(map(get_all_froms, function.clauses.clauses))))
Example #34
def slice_with_bool_dask_array(x, index):
    """ Slice x with one or more dask arrays of bools

    This is a helper function of `Array.__getitem__`.

    Parameters
    ----------
    x: Array
    index: tuple with as many elements as x.ndim, among which there are
           one or more Arrays with dtype=bool

    Returns
    -------
    tuple of (sliced x, new index)

    where the new index is the same as the input, but with each boolean-array
    slicer replaced by slice(None) where a filter has been applied.

    Note: The sliced x will have nan chunks on the sliced axes.
    """
    from .core import Array, atop, elemwise

    out_index = [
        slice(None) if isinstance(ind, Array) and ind.dtype == bool else ind
        for ind in index
    ]

    if len(index) == 1 and index[0].ndim == x.ndim:
        y = elemwise(getitem, x, *index, dtype=x.dtype)
        name = 'getitem-' + tokenize(x, index)
        dsk = {(name, i): k
               for i, k in enumerate(core.flatten(y.__dask_keys__()))}
        chunks = ((np.nan, ) * y.npartitions, )
        return (Array(sharedict.merge(y.dask, (name, dsk)), name, chunks,
                      x.dtype), out_index)

    if any(
            isinstance(ind, Array) and ind.dtype == bool and ind.ndim != 1
            for ind in index):
        raise NotImplementedError(
            "Slicing with dask.array of bools only permitted when "
            "the indexer has only one dimension or when "
            "it has the same dimension as the sliced "
            "array")
    indexes = [
        ind if isinstance(ind, Array) and ind.dtype == bool else slice(None)
        for ind in index
    ]

    arginds = []
    i = 0
    for ind in indexes:
        if isinstance(ind, Array) and ind.dtype == bool:
            new = (ind, tuple(range(i, i + ind.ndim)))
            i += x.ndim
        else:
            new = (slice(None), None)
            i += 1
        arginds.append(new)

    arginds = list(concat(arginds))

    out = atop(getitem_variadic,
               tuple(range(x.ndim)),
               x,
               tuple(range(x.ndim)),
               *arginds,
               dtype=x.dtype)

    chunks = []
    for ind, chunk in zip(index, out.chunks):
        if isinstance(ind, Array) and ind.dtype == bool:
            chunks.append((np.nan, ) * len(chunk))
        else:
            chunks.append(chunk)
    out._chunks = tuple(chunks)
    return out, tuple(out_index)
Example #35
def assert_balanced(inp, expected, c, s, *workers):
    steal = s.extensions["stealing"]
    steal._pc.stop()

    counter = itertools.count()
    tasks = list(concat(inp))
    data_seq = itertools.count()

    futures = []
    for w, ts in zip(workers, inp):
        for t in sorted(ts, reverse=True):
            if t:
                [dat] = yield c.scatter([next(data_seq)], workers=w.address)
                ts = s.tasks[dat.key]
                # Ensure scheduler state stays consistent
                old_nbytes = ts.nbytes
                ts.nbytes = s.bandwidth * t
                for ws in ts.who_has:
                    ws.nbytes += ts.nbytes - old_nbytes
            else:
                dat = 123
            s.task_duration[str(int(t))] = 1
            i = next(counter)
            f = c.submit(
                func,
                dat,
                key="%d-%d" % (int(t), i),
                workers=w.address,
                allow_other_workers=True,
                pure=False,
                priority=-i,
            )
            futures.append(f)

    while len(s.rprocessing) < len(futures):
        yield gen.sleep(0.001)

    for i in range(10):
        steal.balance()

        while steal.in_flight:
            yield gen.sleep(0.001)

        result = [
            sorted([int(key_split(k)) for k in s.processing[w.address]],
                   reverse=True) for w in workers
        ]

        result2 = sorted(result, reverse=True)
        expected2 = sorted(expected, reverse=True)

        if config.get("pdb-on-err"):
            if result2 != expected2:
                import pdb

                pdb.set_trace()

        if result2 == expected2:
            return
    raise Exception("Expected: {}; got: {}".format(str(expected2),
                                                   str(result2)))
Example #36
def everything_but(k, d):
    """
    Return an iterator over all values in d except those stored under key k.
    """
    assert k in d
    return concat(keyfilter(lambda x: x != k, d).values())
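A quick, self-contained check of this helper with made-up dict contents (output order follows dict insertion order):

from toolz import concat, keyfilter

def everything_but(k, d):
    assert k in d
    return concat(keyfilter(lambda x: x != k, d).values())

print(list(everything_but('a', {'a': [1], 'b': [2, 3], 'c': [4]})))  # -> [2, 3, 4]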
Example #37
def read_text(urlpath, blocksize=None, compression='infer',
              encoding=system_encoding, errors='strict',
              linedelimiter=os.linesep, collection=True,
              storage_options=None):
    """ Read lines from text files

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), globstring, or a list of beforementioned strings.
    blocksize: None or int
        Size (in bytes) to cut up larger files.  Streams by default.
    compression: string
        Compression format like 'gzip' or 'xz'.  Defaults to 'infer'
    encoding: string
    errors: string
    linedelimiter: string
    collection: bool, optional
        Return dask.bag if True, or list of delayed values if false
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize=1e7)  # doctest: +SKIP

    Returns
    -------
    dask.bag.Bag if collection is True or list of Delayed lists otherwise

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if isinstance(urlpath, (tuple, list, set)):
        blocks = sum([read_text(fn, blocksize=blocksize,
                      compression=compression, encoding=encoding, errors=errors,
                      linedelimiter=linedelimiter, collection=False,
                      storage_options=storage_options)
                     for fn in urlpath], [])
    else:
        if blocksize is None:
            files = open_text_files(urlpath, encoding=encoding, errors=errors,
                                    compression=compression,
                                    **(storage_options or {}))
            blocks = [delayed(list, pure=True)(delayed(file_to_blocks)(file))
                      for file in files]

        else:
            _, blocks = read_bytes(urlpath, delimiter=linedelimiter.encode(),
                                   blocksize=blocksize, sample=False,
                                   compression=compression,
                                   **(storage_options or {}))
            if isinstance(blocks[0], (tuple, list)):
                blocks = list(concat(blocks))
            blocks = [delayed(decode)(b, encoding, errors)
                      for b in blocks]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if not collection:
        return blocks
    else:
        return from_delayed(blocks)
Example #38
def physical_tables_join(join):
    # Physical roots of Join nodes are the unique physical roots of their
    # left and right TableNodes.
    func = compose(physical_tables, methodcaller('op'))
    return list(unique(concat(map(func, (join.left, join.right)))))
Example #39
def everything_but(k, d):
    """
    Return an iterator over all values in d except those stored under key k.
    """
    assert k in d
    return concat(itervalues(keyfilter(ne(k), d)))
Example #40
File: selection.py Project: bochuxt/ibis
def physical_tables_node(node):
    # Iterative case. Any other Node's physical roots are the unique physical
    # roots of that Node's root tables.
    tables = toolz.concat(map(physical_tables, node.root_tables()))
    return list(toolz.unique(tables, key=id))
Example #41
File: core.py Project: zofuthan/zipline
def powerset(values):
    """
    Return the power set (i.e., the set of all subsets) of entries in `values`.
    """
    return concat(combinations(values, i) for i in range(len(values) + 1))
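A self-contained check of powerset, using only itertools and toolz:

from itertools import combinations
from toolz import concat

def powerset(values):
    return concat(combinations(values, i) for i in range(len(values) + 1))

print(list(powerset([1, 2])))  # -> [(), (1,), (2,), (1, 2)]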
Example #42
def distinct_roots(*expressions):
    # TODO: move to analysis
    roots = toolz.concat(expr.op().root_tables() for expr in expressions)
    return list(toolz.unique(roots))
Example #43
File: types.py Project: djv/ibis
def distinct_roots(*expressions):
    roots = toolz.concat(expression._root_tables()
                         for expression in expressions)
    return list(toolz.unique(roots, key=id))
Example #44
 def __iter__(self):
     return toolz.unique(toolz.concat(self.layers.values()))
Example #45
def lconcat(seqs):
    """Like `toolz.concat`, but it returns a list instead of an iterator."""
    return list(toolz.concat(seqs))
Example #46
File: dask.py Project: nkhuyu/blaze
def elemwise_array(expr, *data, **kwargs):
    leaves = expr._inputs
    expr_inds = tuple(range(ndim(expr)))[::-1]
    return atop(curry(compute_it, expr, leaves, **kwargs),
                expr_inds,
                *concat((dat, tuple(range(ndim(dat))[::-1])) for dat in data))
Example #47
    def morph_counts_fastest_version(self, words):
        # Word List to list of all morphisms

        word_counts = Counter(
            word for word in toolz.concat(map(self.word_tokenizer, words)))

        #print("words_counts: ")
        #print(word_counts)
        print("")
        print("Unique number words: " + str(len(set(words))))
        print("Total number of words: " + str(len(words)))
        print("")

        unique_words_set = set(words)
        unique_words = list(unique_words_set)

        frog = Frog(
            FrogOptions(tok=True,
                        lemma=True,
                        morph=True,
                        daringmorph=False,
                        mwu=False,
                        chunking=False,
                        ner=False,
                        parser=False))
        batch_size = 400
        morphisms = []
        print_batch_number = 1
        start_time = time.time()
        total_batch_number = math.ceil(len(unique_words) / batch_size)
        total_process_time = 0
        total_getting_morphisms_time = 0

        for i in range(0, len(unique_words), batch_size):
            t0 = time.time()
            words_batch = unique_words[i:i + batch_size]
            words_batch_string = ' '.join(words_batch)
            output = frog.process(words_batch_string)
            process_time = time.time() - t0
            t1 = time.time()

            for j in range(0, len(words_batch) - 1):
                current_word = output[j].get("text")
                morphisms_word = output[j].get("morph")
                morphisms_word_list = morphisms_word.replace('[',
                                                             '').split(']')
                current_word_count = word_counts[current_word]

                # Currently, whole words are NOT included in the count
                if len(morphisms_word_list) > 2:
                    morphisms += morphisms_word_list * current_word_count

                total_batch_length = len(words_batch)
            print("batch" + " (" + str(batch_size) + " words):    " +
                  str(print_batch_number) + " of " + str(total_batch_number))

            print_batch_number += 1
            getting_morphisms_time = time.time() - t1
            total_process_time += process_time
            total_getting_morphisms_time += getting_morphisms_time

        print("")
        print("Total number of words: ")
        print(len(words))
        print("")
        print("Unique number words: ")
        print(len(set(words)))
        print("")
        print("Total Process Time:")
        print(self.format_time(total_process_time))
        print("")
        print("Total Getting Morphisms Time: ")
        print(self.format_time(total_getting_morphisms_time))
        print("")
        print("Total Time:")
        print(self.format_time(time.time() - start_time))
        print("")

        # Remove the empty strings
        morphisms = list(filter(None, morphisms))
        #Make a counter of all morphisms
        morph_counts = Counter(morphisms)

        with open('Old/morph_counts.pickle', 'wb') as outputfile:
            pickle.dump(morph_counts, outputfile)

        return morph_counts
Example #48
 def test_default_calendars(self):
     # concat chains the iterables together
     for name in concat(
         [_default_calendar_factories, _default_calendar_aliases]):
         self.assertIsNotNone(get_calendar(name),
                              "get_calendar(%r) returned None" % name)
Example #49
 def get_assets(self):
     assets = [directory.walk() for directory in self._root_dirs]
     self.assets = sorted(toolz.unique(toolz.concat(assets)))
     return self
Example #50
File: top.py Project: carletes/dask
def atop(func, out_ind, *args, **kwargs):
    """ Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``atop`` function applies an in-memory
    function across multiple blocks of multiple inputs in a variety of ways.
    Many dask.array operations are special cases of atop including elementwise,
    broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> z = atop(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')  # z = x + y  # doctest: +SKIP

    Outer product multiplying x by y, two 1-d vectors

    >>> z = atop(operator.mul, 'ij', x, 'i', y, 'j', dtype='f8')  # doctest: +SKIP

    z = x.T

    >>> z = atop(np.transpose, 'ji', x, 'ij', dtype=x.dtype)  # doctest: +SKIP

    The transpose case above is illustrative because it does the same transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = atop(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')  # doctest: +SKIP

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the passed
    function should expect an iterable of blocks on any array that holds that
    index.  To receive arrays concatenated along contracted dimensions instead
    pass ``concatenate=True``.

    Inner product multiplying x by y, two 1-d vectors

    >>> def sequence_dot(x_blocks, y_blocks):
    ...     result = 0
    ...     for x, y in zip(x_blocks, y_blocks):
    ...         result += x.dot(y)
    ...     return result

    >>> z = atop(sequence_dot, '', x, 'i', y, 'i', dtype='f8')  # doctest: +SKIP

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(x):
    ...     return x[:, None] * np.ones((1, 5))

    >>> z = atop(f, 'az', x, 'a', new_axes={'z': 5}, dtype=x.dtype)  # doctest: +SKIP

    If the applied function changes the size of each chunk you can specify this
    with an ``adjust_chunks={...}`` dictionary holding a function for each index
    that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = atop(double, 'ij', x, 'ij',
    ...          adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)  # doctest: +SKIP

    Include literals by indexing with None

    >>> y = atop(add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)  # doctest: +SKIP

    See Also
    --------
    top - dict formulation of this function, contains most logic
    """
    out = kwargs.pop('name', None)  # May be None at this point
    token = kwargs.pop('token', None)
    dtype = kwargs.pop('dtype', None)
    adjust_chunks = kwargs.pop('adjust_chunks', None)
    new_axes = kwargs.get('new_axes', {})

    from .core import Array, unify_chunks, normalize_arg

    if dtype is None:
        raise ValueError("Must specify dtype of output array")

    chunkss, arrays = unify_chunks(*args)
    for k, v in new_axes.items():
        chunkss[k] = (v, )
    arginds = list(zip(arrays, args[1::2]))

    for arg, ind in arginds:
        if hasattr(arg, 'ndim') and hasattr(
                ind, '__len__') and arg.ndim != len(ind):
            raise ValueError(
                "Index string %s does not match array dimension %d" %
                (ind, arg.ndim))

    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}
    argindsstr = list(
        toolz.concat([(normalize_arg(a) if ind is None else a.name, ind)
                      for a, ind in arginds]))
    # Finish up the name
    if not out:
        out = '%s-%s' % (token or utils.funcname(func).strip('_'),
                         base.tokenize(func, out_ind, argindsstr, dtype, **
                                       kwargs))

    kwargs2 = {k: normalize_arg(v) for k, v in kwargs.items()}
    dsk = _top(func, out, out_ind, *argindsstr, numblocks=numblocks, **kwargs2)
    dsks = [a.dask for a, ind in arginds if ind is not None]

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    return Array(sharedict.merge(
        (out, dsk),
        *dsks,
        dependencies={out: {a.name
                            for a, ind in arginds if ind is not None}}),
                 out,
                 chunks,
                 dtype=dtype)
Example #51
 def _():
     return concat(
         convert(chunks(pd.DataFrame), csv, **kwargs) for csv in csvs)
Example #52
def get_unsafe_inner_columns(f):
    unique_columns = unique(concat(map(get_unsafe_inner_columns, f.clauses)))
    lowered = [x.label(getattr(x, 'name', None)) for x in unique_columns]
    return [getattr(sa.func, f.name)(*lowered)]
Example #53
File: test_s3.py Project: ThreeNG/dask
def test_registered(s3):
    sample, values = read_bytes('s3://%s/test/accounts.*.json' %
                                test_bucket_name)

    results = compute(*concat(values))
    assert set(results) == set(files.values())
Example #54
def get_all_froms(colelement):
    return list(unique(concat(map(get_all_froms, colelement.get_children()))))
Example #55
 def inputs(self):
     return tuple(unique(concat(v.inputs for v in self.values)))
Example #56
    def __init__(self,
                 host='127.0.0.1',
                 http_port=9786,
                 bokeh_port=8787,
                 scheduler_address='tcp://127.0.0.1:8786',
                 bokeh_whitelist=[],
                 log_level=logging_level,
                 show=False,
                 prefix=None,
                 use_xheaders=False,
                 quiet=True):
        self.port = bokeh_port
        ip = socket.gethostbyname(host)

        hosts = ['localhost', '127.0.0.1', ip, host]

        with ignoring(Exception):
            hosts.append(socket.gethostbyname(ip))
        with ignoring(Exception):
            hosts.append(socket.gethostbyname(socket.gethostname()))

        hosts = ['%s:%d' % (h, bokeh_port) for h in hosts]

        hosts.append("*")

        hosts.extend(map(str, bokeh_whitelist))

        args = ([sys.executable, '-m', 'bokeh', 'serve'] + paths + [
            '--check-unused-sessions=50', '--unused-session-lifetime=1',
            '--allow-websocket-origin=*', '--port',
            str(bokeh_port)
        ])
        if bokeh.__version__ <= '0.12.4':
            args += sum([['--host', h] for h in hosts], [])

        if prefix:
            args.extend(['--prefix', prefix])

        if show:
            args.append('--show')

        if use_xheaders:
            args.append('--use-xheaders')

        if log_level in ('debug', 'info', 'warning', 'error', 'critical'):
            args.extend(['--log-level', log_level])

        bokeh_options = {
            'host': host,
            'http-port': http_port,
            'scheduler-address': scheduler_address,
            'bokeh-port': bokeh_port
        }

        args.extend(['--args'] + list(map(str, concat(bokeh_options.items()))))

        import subprocess
        process = subprocess.Popen(args)
        self.process = process

        @atexit.register
        def cleanup_process():
            try:
                process.terminate()
            except OSError:
                pass

        if not quiet:
            logger.info("Web UI: http://%s:%d/status/" % (ip, bokeh_port))
Example #57
    plt.show()

    ###########################################################################
    # Calculating variances
    # array of features
    xa = np.vstack([list(toolz.pluck('x', ep)) for ep in episodes])
    # get array of returns
    ga = np.hstack([episode_return(ep) for ep in episodes])
    # returns multiplied by feature vector
    gx = xa.T * ga
    # per-feature variance (is this right?)
    va = np.var(gx, axis=1)


    # Delta squared for variance
    deltas = np.array(list(toolz.pluck('delta', toolz.concat(episodes))))
    dsqa = deltas**2
    # delta-squared return
    dsqret = calculate_return(dsqa, lpluck('gm', concat(episodes)))
    # multiplying by features
    dsqrx = xa.T * dsqret
    
    # averaging for per-feature delta-squared return (is this right?)
    dsvar_w = np.mean(dsqrx, axis=1)

    # least squares solution (this appears to have some sort of issue)
    # dsvar_w, res, *_ = np.linalg.lstsq(xa, dsqret)


    # make heatmap 
    traj = np.array(list(toolz.pluck('obs', episodes[-1])))    
Example #58
def sumDigits(ints: PVector[int]) -> int:
    return last(accumulate(add, concat(map(lambda c: toDigits(c), ints))))
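toDigits and PVector are not shown in this snippet; a runnable sketch with a hypothetical toDigits that splits an int into its decimal digits (a plain list stands in for PVector):

from operator import add
from toolz import accumulate, concat, last

def toDigits(n: int) -> list:
    return [int(ch) for ch in str(n)]   # hypothetical digit splitter

def sumDigits(ints) -> int:
    return last(accumulate(add, concat(map(toDigits, ints))))

print(sumDigits([12, 345]))  # 1 + 2 + 3 + 4 + 5 -> 15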
Example #59
def physical_tables_node(node):
    # Iterative case. Any other Node's physical roots are the unique physical
    # roots of that Node's root tables.
    return list(unique(concat(map(physical_tables, node.root_tables()))))
Example #60
File: core.py Project: haibocheng/blaze
def subterms(expr):
    return concat([[expr], concat(map(subterms, expr._inputs))])
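A toy stand-in for a blaze expression (not the real Expr API, only a ``_inputs`` attribute) shows the pre-order traversal that subterms performs:

from toolz import concat

class Node:                              # toy stand-in with _inputs like a blaze Expr
    def __init__(self, name, *inputs):
        self.name, self._inputs = name, inputs

def subterms(expr):
    return concat([[expr], concat(map(subterms, expr._inputs))])

tree = Node('sum', Node('x'), Node('y'))
print([t.name for t in subterms(tree)])  # -> ['sum', 'x', 'y']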