Example #1
def has_next_day(dates_dict, year, month, day):
    """Return next day found in nested dates_dict
    or None if can't find one."""
    # Check current month for next days
    days = sorted(dates_dict[year][month].keys())
    if day != last(days):
        di = days.index(day)
        next_day = days[di + 1]
        return {"year": year, "month": month, "day": next_day}

    # Check current year for next months
    months = sorted(dates_dict[year].keys())
    if month != last(months):
        mi = months.index(month)
        next_month = months[mi + 1]
        next_day = first(sorted(dates_dict[year][next_month].keys()))
        return {"year": year, "month": next_month, "day": next_day}

    # Check for next years
    years = sorted(dates_dict.keys())
    if year != last(years):
        yi = years.index(year)
        next_year = years[yi + 1]
        next_month = first(sorted(dates_dict[next_year].keys()))
        next_day = first(sorted(dates_dict[next_year][next_month].keys()))
        return {"year": next_year, "month": next_month, "day": next_day}
    return None
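
A minimal usage sketch (not from the original source): it assumes `first` and `last` come from toolz and that `dates_dict` is a nested mapping of year -> month -> day, since the snippet above shows neither its imports nor its input shape.

from toolz import first, last   # assumed imports; the snippet above omits them

dates_dict = {                  # hypothetical nested structure: year -> month -> day -> value
    2023: {12: {30: "a", 31: "b"}},
    2024: {1: {1: "c"}},
}

# Rolling over the year boundary picks the first month/day of the next year.
print(has_next_day(dates_dict, 2023, 12, 31))
# {'year': 2024, 'month': 1, 'day': 1}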
Example #2
File: multi.py Project: yliapis/dask
def single_partition_join(left, right, **kwargs):
    # if the merge is performed on_index, divisions can be kept, otherwise the
    # new index will not necessarily correspond to the current divisions

    meta = left._meta_nonempty.merge(right._meta_nonempty, **kwargs)
    kwargs['empty_index_dtype'] = meta.index.dtype
    name = 'merge-' + tokenize(left, right, **kwargs)
    if left.npartitions == 1 and kwargs['how'] in ('inner', 'right'):
        left_key = first(left.__dask_keys__())
        dsk = {(name, i): (apply, merge_chunk, [left_key, right_key], kwargs)
               for i, right_key in enumerate(right.__dask_keys__())}

        if kwargs.get('right_index') or right._contains_index_name(
                kwargs.get('right_on')):
            divisions = right.divisions
        else:
            divisions = [None for _ in right.divisions]

    elif right.npartitions == 1 and kwargs['how'] in ('inner', 'left'):
        right_key = first(right.__dask_keys__())
        dsk = {(name, i): (apply, merge_chunk, [left_key, right_key], kwargs)
               for i, left_key in enumerate(left.__dask_keys__())}

        if kwargs.get('left_index') or left._contains_index_name(
                kwargs.get('left_on')):
            divisions = left.divisions
        else:
            divisions = [None for _ in left.divisions]
    else:
        raise NotImplementedError("single_partition_join has no fallback for invalid calls")

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[left, right])
    return new_dd_object(graph, name, meta, divisions)
Example #3
def has_previous_day(dates_dict, year, month, day):
    """Return previous day found in nested dates_dict
    or None if can't find one."""
    days = sorted(dates_dict[year][month].keys())
    # Check current month
    if day != first(days):
        di = days.index(day)
        prev_day = days[di - 1]
        return {"year": year, "month": month, "day": prev_day}

    # Check current year
    months = sorted(dates_dict[year].keys())
    if month != first(months):
        mi = months.index(month)
        prev_month = months[mi - 1]
        last_day = last(sorted(dates_dict[year][prev_month].keys()))
        return {"year": year, "month": prev_month, "day": last_day}

    # Check other years
    years = sorted(dates_dict.keys())
    if year != first(years):
        yi = years.index(year)
        prev_year = years[yi - 1]
        prev_month = last(sorted(dates_dict[prev_year].keys()))
        last_day = last(sorted(dates_dict[prev_year][prev_month].keys()))
        return {"year": prev_year, "month": prev_month, "day": last_day}

    return None
Example #4
def _get_larger_chroms(ref_file):
    """Retrieve larger chromosomes, avoiding the smaller ones for plotting.
    """
    from scipy.cluster.vq import kmeans, vq
    all_sizes = []
    for c in ref.file_contigs(ref_file):
        all_sizes.append(float(c.size))
    all_sizes.sort()
    # separate out smaller chromosomes and haplotypes with kmeans
    centroids, _ = kmeans(np.array(all_sizes), 2)
    idx, _ = vq(np.array(all_sizes), centroids)
    little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes)))
    little_sizes = [x[1] for x in little_sizes]
    # cluster the smaller sizes once more to separate out the haplotypes
    centroids2, _ = kmeans(np.array(little_sizes), 2)
    idx2, _ = vq(np.array(little_sizes), centroids2)
    little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes)))
    little_sizes2 = [x[1] for x in little_sizes2]
    # get any chromosomes not in haplotype/random bin
    thresh = max(little_sizes2)
    larger_chroms = []
    for c in ref.file_contigs(ref_file):
        if c.size > thresh:
            larger_chroms.append(c.name)
    return larger_chroms
Example #5
File: sql.py Project: giangzuzana/blaze
def str_cat_sql(expr, lhs, rhs, **kwargs):
    left, right = first(lhs.inner_columns), first(rhs.inner_columns)
    if expr.sep:
        result = (left + expr.sep + right).label(expr.lhs._name)
    else:
        result = (left + right).label(expr.lhs._name)
    return reconstruct_select([result], lhs)
Example #6
File: multi.py Project: floriango/dask
def single_partition_join(left, right, **kwargs):
    # if the merge is performed on_index, divisions can be kept, otherwise the
    # new index will not necessarily correspond to the current divisions

    meta = pd.merge(left._meta_nonempty, right._meta_nonempty, **kwargs)
    name = 'merge-' + tokenize(left, right, **kwargs)
    if left.npartitions == 1:
        left_key = first(left.__dask_keys__())
        dsk = {(name, i): (apply, pd.merge, [left_key, right_key], kwargs)
               for i, right_key in enumerate(right.__dask_keys__())}

        if kwargs.get('right_index') or right._contains_index_name(
                kwargs.get('right_on')):
            divisions = right.divisions
        else:
            divisions = [None for _ in right.divisions]

    elif right.npartitions == 1:
        right_key = first(right.__dask_keys__())
        dsk = {(name, i): (apply, pd.merge, [left_key, right_key], kwargs)
               for i, left_key in enumerate(left.__dask_keys__())}

        if kwargs.get('left_index') or left._contains_index_name(
                kwargs.get('left_on')):
            divisions = left.divisions
        else:
            divisions = [None for _ in left.divisions]

    return new_dd_object(toolz.merge(dsk, left.dask, right.dask), name,
                         meta, divisions)
Example #7
    def test_live_migrate_anti_affinity(self):
        """
        Make sure that if we have an anti-affinity group set, and we try
        to live migrate to a host with the anti-affinity group, the
        migration will fail.
        """
        data = self.setup_affinities(self.sanity)

        # Make sure that the affinity and anti-aff instances are booted up
        aff_inst = data["aff_instance"]
        anti_inst = data["anti_instance"]
        smog.nova.poll_status(aff_inst, "ACTIVE")
        smog.nova.poll_status(anti_inst, "ACTIVE")

        # Now, perform a live migration for the anti_inst.  This should fail
        # Get what host the instance is currently on, and compare before/after
        discovered = self.sanity.discover()
        fltrfn = lambda x: x.instance.name == "aa-test"

        # In functional-speak, find the instance object in our discovered
        # Instance objects whose name is 'aa-test'.  There should only be
        # one of these, so take the first one.  Use toolz.first rather than
        # indexing ([0]).  In the general case this is better (for example,
        # what if we use a generator or iterator instead of a list or
        # tuple?).  Remember, functional programming rulez!
        before_inst = toolz.first(filter(fltrfn, [inst for inst in discovered]))
        before_host = before_inst.host
        anti_inst.live_migrate()
        discovered = self.sanity.discover()
        after_inst = toolz.first(filter(fltrfn, [inst for inst in discovered]))
        after_host = after_inst.host
        self.assertTrue(before_host.hostname == after_host.hostname)
Example #8
File: sql.py Project: jessezwd/blaze
def binop_sql(t, lhs, rhs, **kwargs):
    if isinstance(lhs, Select):
        assert len(lhs.c) == 1, "Select cannot have more than a single column when doing" " arithmetic, got %r" % lhs
        lhs = first(lhs.inner_columns)
    if isinstance(rhs, Select):
        assert len(rhs.c) == 1, "Select cannot have more than a single column when doing" " arithmetic, got %r" % rhs
        rhs = first(rhs.inner_columns)

    return t.op(lhs, rhs)
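
For context, a rough sketch of the `first(select.inner_columns)` pattern used here, written against SQLAlchemy 1.4+ with made-up table and column names: a one-column Select is unwrapped to its underlying column expression so it can take part in scalar arithmetic.

import sqlalchemy as sa
from toolz import first

t = sa.table("t", sa.column("x"))        # hypothetical table
sel = sa.select(t.c.x)                   # a Select with exactly one column
col = first(sel.inner_columns)           # unwrap the single column expression
expr = col + 1                           # now usable in scalar arithmetic
print(expr)                              # renders roughly as t.x + :x_1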
Example #9
File: sql.py Project: giangzuzana/blaze
def coalesce_sql_select(expr, lhs, rhs, **kwargs):
    if isinstance(lhs, Select):
        orig = lhs
        lhs = first(lhs.inner_columns)
    else:
        orig = rhs
        rhs = first(rhs.inner_columns)
    result = sa.sql.functions.coalesce(lhs, rhs).label(expr._name)
    return reconstruct_select([result], orig)
Example #10
File: sql.py Project: giangzuzana/blaze
def str_cat_sql(expr, lhs, rhs, **kwargs):
    if isinstance(lhs, Select):
        orig = lhs
        lhs = first(lhs.inner_columns)
    else:
        orig = rhs
        rhs = first(rhs.inner_columns)
    if expr.sep:
        result = (lhs + expr.sep + rhs).label(expr.lhs._name)
    else:
        result = (lhs + rhs).label(expr.lhs._name)
    return reconstruct_select([result], orig)
Example #11
File: sql.py Project: kwmsmith/blaze
def compute_up(expr, data, **kwargs):
    name = expr._name
    try:
        inner_columns = list(data.inner_columns)
        names = list(c.name for c in data.inner_columns)
        column = inner_columns[names.index(name)]
    except (KeyError, ValueError):
        single_column_select = compute(expr, first(data.inner_columns), post_compute=False, return_type="native")
        column = first(single_column_select.inner_columns)
        result = unify_froms(sa.select([column]), data.froms + single_column_select.froms)
        return result.where(unify_wheres([data, single_column_select]))
    else:
        return data.with_only_columns([column])
Example #12
File: sql.py Project: giangzuzana/blaze
    def binop_sql(t, lhs, rhs, **kwargs):
        if isinstance(lhs, Select):
            assert len(lhs.c) == 1, (
                'Select cannot have more than a single column when doing'
                ' arithmetic, got %r' % lhs
            )
            lhs = first(lhs.inner_columns)
        if isinstance(rhs, Select):
            assert len(rhs.c) == 1, (
                'Select cannot have more than a single column when doing'
                ' arithmetic, got %r' % rhs
            )
            rhs = first(rhs.inner_columns)

        return f(t, lhs, rhs)
Example #13
File: bcolz.py Project: leolujuyi/blaze
def compute_down(expr, data, chunksize=2**20, map=map, **kwargs):
    leaf = expr._leaves()[0]

    # If the bottom expression is a projection or field then we want to do
    # compute_up first
    children = set(e for e in expr._traverse()
                   if isinstance(e, Expr)
                   and any(i is expr._leaves()[0] for i in e._inputs))
    if len(children) == 1 and isinstance(first(children), (Field, Projection)):
        raise NotImplementedError()


    chunk = symbol('chunk', chunksize * leaf.schema)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    data_parts = partitions(data, chunksize=(chunksize,))

    parts = list(map(curry(compute_chunk, data, chunk, chunk_expr),
                           data_parts))

    if isinstance(parts[0], np.ndarray):
        intermediate = np.concatenate(parts)
    elif isinstance(parts[0], pd.DataFrame):
        intermediate = pd.concat(parts)
    elif isinstance(parts[0], Iterable):
        intermediate = list(concat(parts))
    else:
        raise TypeError(
            "Don't know how to concatenate objects of type %s" % type(parts[0]))

    return compute(agg_expr, {agg: intermediate})
Example #14
File: table.py Project: fandres70/blaze
 def dshape(self):
     if self.child.columns and len(self.child.columns) == 1:
         name = self.child.columns[0] + '_' + type(self).__name__
         dtype = self.dtype or first(self.child.schema[0].fields.values()[0])
         return DataShape(Record([[name, dtype]]))
     else:
         return DataShape(Record([[type(self).__name__, self.dtype]]))
Example #15
File: strings.py Project: cloudera/ibis
def execute_string_group_by_find_in_set(op, needle, haystack, **kwargs):
    # `haystack` could contain series, series groupbys, or scalars
    # mixing series and series groupbys is not allowed
    series_in_haystack = [
        type(piece)
        for piece in haystack
        if isinstance(piece, (pd.Series, SeriesGroupBy))
    ]

    if not series_in_haystack:
        return ibis.util.safe_index(haystack, needle)

    try:
        collection_type, = frozenset(map(type, series_in_haystack))
    except ValueError:
        raise ValueError('Mixing Series and SeriesGroupBy is not allowed')

    pieces = haystack_to_series_of_lists(
        [getattr(piece, 'obj', piece) for piece in haystack]
    )

    result = pieces.map(toolz.flip(ibis.util.safe_index)(needle))
    if issubclass(collection_type, pd.Series):
        return result

    assert issubclass(collection_type, SeriesGroupBy)

    return result.groupby(
        toolz.first(
            piece.grouper.groupings
            for piece in haystack
            if hasattr(piece, 'grouper')
        )
    )
Example #16
def discover_chunks(c, **kwargs):
    data = c.data
    if isinstance(data, Iterator):
        fst, c.data = peek(data)
    else:
        fst = first(c)
    return var * discover(fst).subshape[0]
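
A small illustration (assumed, not taken from the source project) of why the branch above uses `peek` for one-shot iterators: `first` would consume the element, while `peek` returns it together with an iterator that still yields it.

from toolz import first, peek

data = iter([10, 20, 30])
fst, data = peek(data)          # look at the first element without losing it
assert fst == 10
assert list(data) == [10, 20, 30]

seq = [10, 20, 30]
assert first(seq) == 10         # plain first() is fine for re-iterable containers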
Example #17
def test_pre_compute_with_projection_projects_on_data_frames():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s[['sepal_length', 'sepal_width']].distinct(),
                         csv, comfortable_memory=10)
    assert set(first(result).columns) == \
            set(['sepal_length', 'sepal_width'])
Example #18
def udf(func):
    llvm_module = first(func._compileinfos.values()).library._final_module
    engine = ee.EngineBuilder.new(llvm_module).create()
    functions = [
        func for func in llvm_module.functions
        if not func.name.startswith('_') and not func.is_declaration
    ]
    addr = engine.get_function_address(functions[1].name)
    assert addr > 0, 'addr == %d' % addr

    # Declare the ctypes function prototype
    # functype = cfunctype(c_double, c_double)

    path = os.path.expanduser(
        os.path.join('~', 'ibis-data', 'ibis-testing-data', 'ibis-testing.db')
    )
    con = sqlite3_connection(path.encode('utf8'))
    result = register(
        con,
        addr,
        func.__name__.encode('utf8'),
        len(func.nopython_signatures[0].args)
    )
    import ipdb; ipdb.set_trace()
    con.execute("select mysin(1.0230923)".encode('utf8'))
Example #19
 def _schema(self):
     schema = self._child.schema[0]
     if isinstance(schema, Record) and len(schema.types) == 1:
         result = toolz.first(schema.types)
     else:
         result = schema
     return DataShape(result)
Example #20
def test_pre_compute_calls_lean_projection():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s.sort('sepal_length').species,
                         csv, comfortable_memory=10)
    assert set(first(result).columns) == \
            set(['sepal_length', 'species'])
Example #21
File: sql.py Project: leolujuyi/blaze
def post_compute(expr, query, scope=None):
    """ Execute SQLAlchemy query against SQLAlchemy engines

    If the result of compute is a SQLAlchemy query then it is likely that the
    data elements are themselves SQL objects which contain SQLAlchemy engines.
    We find these engines and, if they are all the same, run the query against
    these engines and return the result.
    """
    if not all(isinstance(val, (MetaData, Engine, Table)) for val in scope.values()):
        return query

    engines = set(filter(None, map(engine_of, scope.values())))

    if not engines:
        return query

    if len(set(map(str, engines))) != 1:
        raise NotImplementedError("Expected single SQLAlchemy engine")

    engine = first(engines)

    with engine.connect() as conn:  # Perform query
        result = conn.execute(select(query)).fetchall()

    if isscalar(expr.dshape):
        return result[0][0]
    if isscalar(expr.dshape.measure):
        return [x[0] for x in result]
    return result
Example #22
 def port(self):
     if not self._port:
         try:
             self._port = first(self._sockets.values()).getsockname()[1]
         except StopIteration:
             raise OSError("Server has no port.  Please call .listen first")
     return self._port
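
The StopIteration branch exists because `first` on an empty collection raises it; a tiny sketch of that behaviour (hypothetical names):

from toolz import first

sockets = {}                    # no sockets bound yet
try:
    first(sockets.values())
except StopIteration:
    print("Server has no port yet")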
Example #23
    def test_context_manager(self, dt_tb, dt_data):
        """ check the context manager auto-closes the resources """

        with Data("{0}::dt".format(dt_tb)) as t:
            f = first(t._resources().values())
            assert f.isopen
        assert not f.isopen
Example #24
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    ready_region = (shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
                    if len(all_vrs) > 0 else chrom)
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        if ready_region == chrom:
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        else:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            with file_transaction(out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #25
File: numbers.py Project: ChrisBg/blaze
def scalar_coerce(rec, val):
    if len(rec.fields) == 1:
        return scalar_coerce(first(rec.types), val)
    else:
        raise TypeError("Trying to coerce complex datashape\n"
                "got dshape: %s\n"
                "scalar_coerce only intended for scalar values" % rec)
Example #26
def compute_up(t, data, **kwargs):
    assert len(data.c) == 1, "Select cannot have more than a single column when doing arithmetic"
    column = first(data.inner_columns)
    if isinstance(t.lhs, Expr):
        return sa.func.pow(column, t.rhs)
    else:
        return sa.func.pow(t.lhs, column)
Example #27
def test_basic():
    def test_g():
        time.sleep(0.01)

    def test_h():
        time.sleep(0.02)

    def test_f():
        for i in range(100):
            test_g()
            test_h()

    thread = threading.Thread(target=test_f)
    thread.daemon = True
    thread.start()

    state = create()

    for i in range(100):
        time.sleep(0.02)
        frame = sys._current_frames()[thread.ident]
        process(frame, None, state)

    assert state['count'] == 100
    d = state
    while len(d['children']) == 1:
        d = first(d['children'].values())

    assert d['count'] == 100
    assert 'test_f' in str(d['description'])
    g = [c for c in d['children'].values() if 'test_g' in str(c['description'])][0]
    h = [c for c in d['children'].values() if 'test_h' in str(c['description'])][0]

    assert g['count'] < h['count']
    assert 95 < g['count'] + h['count'] <= 100
Example #28
 def read_header(self, stream):
     stream.seek(0)
     if stream.read(len(self.MAGIC)) != self.MAGIC:
         raise FormatException('Not a Daybreak database')
     version = first(unpack('!H', stream.read(2)))
     if version != self.VERSION:
         raise FormatException("Expected database version {}, got {}".format(self.VERSION, version))
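
As a reminder of why `first` shows up here (illustration only): `struct.unpack` always returns a tuple, even for a single format code, so `first` extracts the lone value.

from struct import pack, unpack
from toolz import first

buf = pack('!H', 2)                 # a two-byte big-endian unsigned short
assert unpack('!H', buf) == (2,)    # unpack returns a 1-tuple
assert first(unpack('!H', buf)) == 2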
Example #29
    def persist(self, collections):
        """ Persist dask collections on cluster

        Starts computation of the collection on the cluster in the background.
        Provides a new dask collection that is semantically identical to the
        previous one, but now based off of futures currently in execution.

        Parameters
        ----------
        collections: sequence or single dask object
            Collections like dask.array or dataframe or dask.value objects

        Returns
        -------
        List of collections, or single collection, depending on type of input.

        Examples
        --------
        >>> xx = executor.persist(x)  # doctest: +SKIP
        >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

        See Also
        --------
        Executor.compute
        """
        if isinstance(collections, (tuple, list, set, frozenset)):
            singleton = False
        else:
            singleton = True
            collections = [collections]

        assert all(isinstance(c, Base) for c in collections)

        groups = groupby(lambda x: x._optimize, collections)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val])
                    for opt, val in groups.items()])

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk2.items():
            dependencies[k] |= set(_deps(dsk, v))

        names = list({k for c in collections for k in flatten(c._keys())})

        self._send_to_scheduler({'op': 'update-graph',
                                 'tasks': valmap(dumps_task, dsk2),
                                 'dependencies': dependencies,
                                 'keys': names,
                                 'client': self.id})
        result = [redict_collection(c, {k: Future(k, self)
                                        for k in flatten(c._keys())})
                for c in collections]
        if singleton:
            return first(result)
        else:
            return result
Example #30
def compute_up(t, data, **kwargs):
    assert len(data.c) == 1, "Select cannot have more than a single column when doing arithmetic"
    column = first(data.inner_columns)
    op = getattr(sa.func, type(t).__name__)
    if isinstance(t.lhs, Expr):
        return op(column, t.rhs)
    else:
        return op(t.lhs, column)
Example #31
def shuffle(
    df: DataFrame,
    column_names: List[str],
    npartitions: Optional[int] = None,
    ignore_index: bool = False,
) -> DataFrame:
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle using explicit-comms. It requires a full
    dataset read, serialization and shuffle. This is expensive. If possible
    you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is not
    deterministic if done in parallel.

    Requires an active client.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
        Dataframe to shuffle
    column_names: list of strings
        List of column names on which we want to split.
    npartitions: int or None
        The desired number of output partitions. If None, the number of output
        partitions equals `df.npartitions`
    ignore_index: bool
        Ignore index during shuffle.  If True, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    df: dask.dataframe.DataFrame
        Shuffled dataframe

    Developer Notes
    ---------------
    The implementation consists of three steps:
      (a) Extend the dask graph of `df` with a call to `shuffle_group()` for each
          dataframe partition and submit the graph.
      (b) Submit a task on each worker that shuffles (all-to-all communicates)
          the groups from (a) and returns a list of dataframe-partitions.
      (c) Submit a dask graph that extracts (using `getitem()`) individual
          dataframe-partitions from (b).
    """
    c = comms.default_comms()

    # As default we preserve number of partitions
    if npartitions is None:
        npartitions = df.npartitions

    # Step (a): partition/group each dataframe-partition
    name = ("explicit-comms-shuffle-group-"
            f"{tokenize(df, column_names, npartitions, ignore_index)}")
    df = df.persist()  # Make sure optimizations are applied to the existing graph
    dsk = dict(df.__dask_graph__())
    output_keys = []
    for input_key in df.__dask_keys__():
        output_key = (name, input_key[1])
        dsk[output_key] = (
            shuffle_group,
            input_key,
            column_names,
            0,
            npartitions,
            npartitions,
            ignore_index,
            npartitions,
        )
        output_keys.append(output_key)

    # Compute `df_groups`, which is a list of futures, one future per partition in `df`.
    # Each future points to a dict of length `df.npartitions` that maps each
    # partition-id to a DataFrame.
    df_groups = compute_as_if_collection(type(df),
                                         dsk,
                                         output_keys,
                                         sync=False)
    wait(df_groups)
    for f in df_groups:  # Check for errors
        if f.status == "error":
            f.result()  # raise exception

    # Step (b): find out which workers have which parts of `df_groups`,
    #           find the number of output partitions each worker should have,
    #           and submit `local_shuffle()` on each worker.
    key_to_part = {str(part.key): part for part in df_groups}
    in_parts = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in c.client.who_has(df_groups).items():
        # Note, if multiple workers have the part, we pick the first worker
        in_parts[first(workers)].append(key_to_part[key])

    # Let's create a dict that specifies the number of partitions each worker has
    in_nparts = {}
    workers = set()  # All ranks that have a partition of `df`
    for rank, worker in enumerate(c.worker_addresses):
        nparts = len(in_parts.get(worker, ()))
        if nparts > 0:
            in_nparts[rank] = nparts
            workers.add(rank)
    workers_sorted = sorted(workers)

    # Find the output partitions for each worker
    div = npartitions // len(workers)
    rank_to_out_part_ids = {}  # rank -> [list of partition id]
    for i, rank in enumerate(workers_sorted):
        rank_to_out_part_ids[rank] = list(range(div * i, div * (i + 1)))
    for rank, i in zip(workers_sorted, range(div * len(workers), npartitions)):
        rank_to_out_part_ids[rank].append(i)

    # Run `local_shuffle()` on each worker
    result_futures = {}
    for rank, worker in enumerate(c.worker_addresses):
        if rank in workers:
            result_futures[rank] = c.submit(
                worker,
                local_shuffle,
                in_nparts,
                in_parts[worker],
                rank_to_out_part_ids,
                ignore_index,
            )
    distributed.wait(list(result_futures.values()))
    del df_groups

    # Step (c): extract individual dataframe-partitions
    name = f"explicit-comms-shuffle-getitem-{tokenize(name)}"
    dsk = {}
    meta = None
    for rank, parts in rank_to_out_part_ids.items():
        for i, part_id in enumerate(parts):
            dsk[(name, part_id)] = (getitem, result_futures[rank], i)
            if meta is None:
                # Get the meta from the first output partition
                meta = delayed(make_meta)(delayed(getitem)(
                    result_futures[rank], i)).compute()
    assert meta is not None

    divs = [None] * (len(dsk) + 1)
    return new_dd_object(dsk, name, meta, divs).persist()
Example #32
def until_convergence(it: Iterator[Params],
                      eq: Callable = lambda x: x[0] != x[1]) -> Params:
    it2 = tz.drop(1, it)
    pairs = zip(it, it2)
    return tz.first(itertools.dropwhile(eq, pairs))[0]
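
A usage sketch under stated assumptions: `Params` is just a float here, and the snippet's own imports (`itertools`, `toolz as tz`) are in scope. Note that the function iterates `it` twice, directly and through `tz.drop`, so pass a re-iterable sequence, or `tee` a one-shot iterator first.

import itertools
import toolz as tz

xs = [4.0, 2.0, 1.5, 1.4142, 1.4142]        # a converging sequence of "Params"
assert until_convergence(xs) == 1.4142

one_shot = iter(xs)
a, b = itertools.tee(one_shot)              # safe pairing for a one-shot iterator
pairs = zip(a, tz.drop(1, b))
assert tz.first(itertools.dropwhile(lambda p: p[0] != p[1], pairs))[0] == 1.4142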
Example #33
state = i['state']
nsteps = snakemake.params.get('nsteps', 1)

files = [
    (i.tend, ('FQT', 'FSL')),
    (i.cent, ('QV', 'TABS', 'QN', 'QP', 'QRAD')),
    (i.stat, ('p', 'RHO')),
    (i['2d'], ('LHF', 'SHF', 'SOLIN')),
]

data = TrainingData.from_var_files(files)
nt, ny, nx, nz = data.FQT.shape

loader = data.get_loader(nt, batch_size=ny * nx * nz, shuffle=False)
input_data = first(loader)

model = ForcedStepper.from_file(state)
model.eval()

model.nsteps = 1
print("nsteps", nsteps)
with torch.no_grad():
    out = model(input_data)


def unstackdiag(x):
    shape = (nt - 1, ny, nx)
    return x.data.numpy().reshape(shape)

Example #34
    def _do_predict(self, X_df, coefs, loc_dict, intercept, dtype):
        client = default_client()

        part_size = ceil(X_df.shape[1] / X_df.npartitions)

        # We scatter delayed operations to gather columns on the workers
        scattered = []
        for i in range(X_df.npartitions):
            up_limit = min((i + 1) * part_size, X_df.shape[1])
            cols = X_df.columns.values[i * part_size:up_limit]
            loc_cudf = X_df[cols]
            yield wait(loc_cudf)
            scattered.append(
                client.submit(preprocess_predict,
                              loc_cudf,
                              workers=[loc_dict[i]]))
            yield wait(scattered)
            del (loc_cudf)

        # Break apart Dask.array/dataframe into chunks/parts
        data_parts = scattered
        coef_parts = coefs.to_delayed()

        # Arrange parts into pairs.  This enforces co-locality
        parts = list(map(delayed, zip(data_parts, coef_parts)))
        parts = client.compute(parts)  # Start computation in the background
        yield wait(parts)

        for part in parts:
            if part.status == 'error':
                yield part  # trigger error locally

        # A dict in the form of { part_key: part }
        key_to_part_dict = dict([(str(part.key), part) for part in parts])

        who_has = yield client.who_has(parts)

        worker_parts = {}
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            if worker not in worker_parts:
                worker_parts[worker] = []
            worker_parts[worker].append(key_to_part_dict[key])
        """
        Create IP Handles on each worker hosting input data
        """

        # Format of input_devarrays = ([(X, y)..], dev)
        input_devarrays = [(worker,
                            client.submit(predict_to_device_arrays,
                                          part,
                                          worker,
                                          loc_dict,
                                          X_df.npartitions,
                                          dtype=dtype,
                                          workers=[worker]))
                           for worker, part in worker_parts.items()]

        yield wait(input_devarrays)
        """
        Gather IPC handles for each worker and call _fit() on each worker
        containing data.
        """
        exec_node = loc_dict[X_df.npartitions - 1]

        # Need to fetch parts on worker
        on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
        not_on_worker = list(
            filter(lambda x: x[0] != exec_node, input_devarrays))

        ipc_handles = [
            client.submit(get_input_ipc_handles,
                          future,
                          unique=np.random.randint(0, 1e6),
                          workers=[a_worker])
            for a_worker, future in not_on_worker
        ]

        raw_arrays = [future for a_worker, future in on_worker]

        # IPC Handles are loaded in separate threads on worker so they can be
        # used to make calls through cython
        # Calls _predict_on_worker defined in the bottom
        ret = client.submit(_predict_on_worker, (ipc_handles, raw_arrays),
                            self.intercept,
                            self._build_params_map(),
                            workers=[exec_node])

        yield wait(ret)

        dfs = [
            client.submit(series_on_worker,
                          f,
                          worker,
                          loc_dict,
                          X_df.npartitions,
                          X_df,
                          workers=[worker]) for worker, f in input_devarrays
        ]

        return dfs
Example #35
def test():
    patients = _get_patients('Animal 2', 'Customer 2')
    if len(patients) == 1:
        patient = first(patients)
        print(patient)
Example #36
    def map(self, func, *iterables, **kwargs):
        """ Map a function on a sequence of arguments

        Arguments can be normal objects or Futures

        Parameters
        ----------
        func: callable
        iterables: Iterables, Iterators, or Queues
        pure: bool (defaults to True)
            Whether or not the function is pure.  Set ``pure=False`` for
            impure functions like ``np.random.random``.
        workers: set, iterable of sets
            A set of worker hostnames on which computations may be performed.
            Leave empty to default to all workers (common case)

        Examples
        --------
        >>> L = executor.map(func, sequence)  # doctest: +SKIP

        Returns
        -------
        List, iterator, or Queue of futures, depending on the type of the
        inputs.

        See also
        --------
        Executor.submit: Submit a single function
        """
        if not callable(func):
            raise TypeError("First input to map must be a callable function")

        if (all(map(isqueue, iterables))
                or all(isinstance(i, Iterator) for i in iterables)):
            q_out = pyQueue()
            t = Thread(target=self._threaded_map,
                       args=(q_out, func, iterables),
                       kwargs=kwargs)
            t.daemon = True
            t.start()
            if isqueue(iterables[0]):
                return q_out
            else:
                return queue_to_iterator(q_out)

        pure = kwargs.pop('pure', True)
        workers = kwargs.pop('workers', None)
        allow_other_workers = kwargs.pop('allow_other_workers', False)

        if allow_other_workers and workers is None:
            raise ValueError("Only use allow_other_workers= if using workers=")

        iterables = list(zip(*zip(*iterables)))
        if pure:
            keys = [
                funcname(func) + '-' + tokenize(func, kwargs, *args)
                for args in zip(*iterables)
            ]
        else:
            uid = str(uuid.uuid4())
            keys = [
                funcname(func) + '-' + uid + '-' + str(uuid.uuid4())
                for i in range(min(map(len, iterables)))
            ]

        if not kwargs:
            dsk = {
                key: (func, ) + args
                for key, args in zip(keys, zip(*iterables))
            }
        else:
            dsk = {
                key: (apply, func, (tuple, list(args)), kwargs)
                for key, args in zip(keys, zip(*iterables))
            }

        d = {key: unpack_remotedata(task) for key, task in dsk.items()}
        dsk = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        if isinstance(workers, str):
            workers = [workers]
        if isinstance(workers, (list, set)):
            if workers and isinstance(first(workers), (list, set)):
                if len(workers) != len(keys):
                    raise ValueError("You only provided %d worker restrictions"
                                     " for a sequence of length %d" %
                                     (len(workers), len(keys)))
                restrictions = dict(zip(keys, workers))
            else:
                restrictions = {key: workers for key in keys}
        elif workers is None:
            restrictions = {}
        else:
            raise TypeError("Workers must be a list or set of workers or None")
        if allow_other_workers not in (True, False, None):
            raise TypeError("allow_other_workers= must be True or False")
        if allow_other_workers is True:
            loose_restrictions = set(keys)
        else:
            loose_restrictions = set()

        logger.debug("map(%s, ...)", funcname(func))
        self._send_to_scheduler({
            'op': 'update-graph',
            'tasks': valmap(dumps_task, dsk),
            'dependencies': dependencies,
            'keys': keys,
            'restrictions': restrictions,
            'loose_restrictions': loose_restrictions,
            'client': self.id
        })

        return [Future(key, self) for key in keys]
Example #37
File: core.py Project: wuyu1998/zipline
    def _load_dataset(self, dates, data_query_cutoff_times, assets, mask,
                      columns):
        try:
            (expr_data, ) = {self._table_expressions[c] for c in columns}
        except ValueError:
            raise AssertionError(
                'all columns must share the same expression data', )

        expr, deltas, checkpoints, odo_kwargs = expr_data
        odo_kwargs = dict(odo_kwargs)

        have_sids = (first(columns).dataset.ndim == 2)
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME
                              } | ({SID_FIELD_NAME} if have_sids else set())
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        lower_dt, upper_dt = data_query_cutoff_times[[0, -1]]

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] < upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr_deferred = self.pool.apply_async(
            collect_expr,
            (expr, lower),
        )
        materialized_deltas = (self.pool.apply(collect_expr, (deltas, lower))
                               if deltas is not None else None)

        # If the rows that come back from the blaze backend are constructed
        # from LabelArrays with Nones in the categories, pandas
        # complains. Ignore those warnings for now until we have a story for
        # updating our categorical missing values to NaN.
        with ignore_pandas_nan_categorical_warning():
            all_rows = pd.concat(
                filter(
                    lambda df: df is not None,
                    (
                        materialized_checkpoints,
                        materialized_expr_deferred.get(),
                        materialized_deltas,
                    ),
                ),
                ignore_index=True,
                copy=False,
            )

        all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
            'datetime64[ns]', )
        all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

        if have_sids:
            return adjusted_arrays_from_rows_with_assets(
                dates,
                data_query_cutoff_times,
                assets,
                columns,
                all_rows,
            )
        else:
            return adjusted_arrays_from_rows_without_assets(
                dates,
                data_query_cutoff_times,
                columns,
                all_rows,
            )
Example #38
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (expr_data, ) = {self._table_expressions[c] for c in columns}
        except ValueError:
            raise AssertionError(
                'all columns must share the same expression data', )

        expr, deltas, checkpoints, odo_kwargs = expr_data

        have_sids = (first(columns).dataset.ndim == 2)
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME
                              } | ({SID_FIELD_NAME} if have_sids else set())
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] < upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr_deferred = self.pool.apply_async(
            collect_expr,
            (expr, lower),
        )
        materialized_deltas = (self.pool.apply(collect_expr, (deltas, lower))
                               if deltas is not None else None)

        all_rows = pd.concat(
            filter(
                lambda df: df is not None,
                (
                    materialized_checkpoints,
                    materialized_expr_deferred.get(),
                    materialized_deltas,
                ),
            ),
            ignore_index=True,
            copy=False,
        )

        all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
            'datetime64[ns]', )
        all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

        if have_sids:
            return adjusted_arrays_from_rows_with_assets(
                dates,
                data_query_time,
                data_query_tz,
                assets,
                columns,
                all_rows,
            )
        else:
            return adjusted_arrays_from_rows_without_assets(
                dates,
                data_query_time,
                data_query_tz,
                columns,
                all_rows,
            )
Example #39
def test_global_workers(s, a, b):
    n = len(Worker._instances)
    w = first(Worker._instances)
    assert w is a or w is b
Example #40
def _get_tenant(customer):
    tenant = frappe.get_all("Tenant Master",
                            fields=["*"],
                            filters={"customer": customer})
    return first(tenant) if tenant else None
Example #41
                                tags.label("Target identity:", fr=key)
                                with tags.select(
                                        cls="form-control target-identity",
                                        id=key,
                                        data_scenario="unseen"):
                                    for t, _ in col["audio-paths-ours"]:
                                        tags.option(
                                            t,
                                            data_target=t,
                                            data_speaker=col["speaker"],
                                            data_sample=col["sample-id"])
                            with tags.audio(controls=True,
                                            cls="embed-responsive",
                                            id=key + "-audio",
                                            data_scenario="unseen"):
                                _, p = first(col["audio-paths-ours"])
                                tags.source(src=p, type="audio/wav")

    tags.script(type="text/javascript", src="script.js")

    raw(r"""
    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-71565185-2"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());

      gtag('config', 'UA-71565185-2');
    </script>
    """)
Example #42
def test_first():
    for p in pairs:
        first(p)
Example #43
def find_id_in_single_index(ind: Index, id: int) -> str:
    try:
        return t.first(key for key, value in ind.items() if id in value)
    except StopIteration:
        return None
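
A tiny illustration with hypothetical data; it assumes `Index` is a mapping from key to a collection of ids and that `t` is toolz imported under that alias.

import toolz as t

ind = {"alpha": {1, 2}, "beta": {3}}
assert t.first(k for k, v in ind.items() if 3 in v) == "beta"
# With no matching key the generator is empty, t.first raises StopIteration,
# and the function above turns that into None.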
Example #44
def test_create_index_unique(sql):
    create_index(sql, 'y', name='y_idx', unique=True)
    assert len(sql.data.indexes) == 1
    idx = first(sql.data.indexes)
    assert idx.unique
    assert idx.columns.y == sql.data.c.y
Example #45
def compute_up(t, s, **kwargs):
    assert len(s.c) == 1, \
            'Select cannot have more than a single column when filtering with `like`'
    return compute_up(t, first(s.inner_columns), **kwargs)
Example #46
def compute_up(expr, data, **kwargs):
    column = first(data.inner_columns)
    cast = sa.cast(column, dshape_to_alchemy(expr.to)).label(expr._name)
    return reconstruct_select([cast], data)
Example #47
    def compute(self, args, sync=False):
        """ Compute dask collections on cluster

        Parameters
        ----------
        args: iterable of dask objects or single dask object
            Collections like dask.array or dataframe or dask.value objects
        sync: bool (optional)
            Returns Futures if False (default) or concrete values if True

        Returns
        -------
        List of Futures if input is a sequence, or a single future otherwise

        Examples
        --------
        >>> from dask import do, value
        >>> from operator import add
        >>> x = do(add)(1, 2)
        >>> y = do(add)(x, x)
        >>> xx, yy = executor.compute([x, y])  # doctest: +SKIP
        >>> xx  # doctest: +SKIP
        <Future: status: finished, key: add-8f6e709446674bad78ea8aeecfee188e>
        >>> xx.result()  # doctest: +SKIP
        3
        >>> yy.result()  # doctest: +SKIP
        6

        Also support single arguments

        >>> xx = executor.compute(x)  # doctest: +SKIP

        See Also
        --------
        Executor.get: Normal synchronous dask.get function
        """
        if isinstance(args, (list, tuple, set, frozenset)):
            singleton = False
        else:
            args = [args]
            singleton = True

        variables = [a for a in args if isinstance(a, Base)]

        groups = groupby(lambda x: x._optimize, variables)
        dsk = merge([
            opt(merge([v.dask for v in val]), [v._keys() for v in val])
            for opt, val in groups.items()
        ])
        names = ['finalize-%s' % tokenize(v) for v in variables]
        dsk2 = {
            name: (v._finalize, v._keys())
            for name, v in zip(names, variables)
        }

        d = {k: unpack_remotedata(v) for k, v in merge(dsk, dsk2).items()}
        dsk3 = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk3.items():
            dependencies[k] |= set(_deps(dsk, v))

        self._send_to_scheduler({
            'op': 'update-graph',
            'tasks': valmap(dumps_task, dsk3),
            'dependencies': dependencies,
            'keys': names,
            'client': self.id
        })

        i = 0
        futures = []
        for arg in args:
            if isinstance(arg, Base):
                futures.append(Future(names[i], self))
                i += 1
            else:
                futures.append(arg)

        if sync:
            result = self.gather(futures)
        else:
            result = futures

        if singleton:
            return first(result)
        else:
            return result
Example #48
def compute_up(expr, data, **kwargs):
    return data.with_only_columns(
        first(
            compute(expr._child[field], data,
                    post_compute=False).inner_columns)
        for field in expr.fields)
Example #49
    def persist(self, collections):
        """ Persist dask collections on cluster

        Starts computation of the collection on the cluster in the background.
        Provides a new dask collection that is semantically identical to the
        previous one, but now based off of futures currently in execution.

        Parameters
        ----------
        collections: sequence or single dask object
            Collections like dask.array or dataframe or dask.value objects

        Returns
        -------
        List of collections, or single collection, depending on type of input.

        Examples
        --------
        >>> xx = executor.persist(x)  # doctest: +SKIP
        >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

        See Also
        --------
        Executor.compute
        """
        if isinstance(collections, (tuple, list, set, frozenset)):
            singleton = False
        else:
            singleton = True
            collections = [collections]

        assert all(isinstance(c, Base) for c in collections)

        groups = groupby(lambda x: x._optimize, collections)
        dsk = merge([
            opt(merge([v.dask for v in val]), [v._keys() for v in val])
            for opt, val in groups.items()
        ])

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk2.items():
            dependencies[k] |= set(_deps(dsk, v))

        names = list({k for c in collections for k in flatten(c._keys())})

        self._send_to_scheduler({
            'op': 'update-graph',
            'tasks': valmap(dumps_task, dsk2),
            'dependencies': dependencies,
            'keys': names,
            'client': self.id
        })
        result = [
            redict_collection(c,
                              {k: Future(k, self)
                               for k in flatten(c._keys())})
            for c in collections
        ]
        if singleton:
            return first(result)
        else:
            return result
Example #50
def compute_up(t, s, **kwargs):
    assert len(s.foreign_keys) == 1, 'exactly one foreign key allowed'
    key_col = first(s.foreign_keys).column
    return sa.select([key_col.table.c[t._name]]).where(s == key_col)
Example #51
    def _do_fit(self, X_df, y_df, dtype):

        client = default_client()

        # Finding location of parts of y_df to distribute columns of X_df
        loc_dict = {}
        yield wait(y_df)
        tt = yield client.who_has(y_df)
        location = tuple(tt.values())
        for i in range(X_df.npartitions):
            part_number = eval(list(tt.keys())[i])[1]
            loc_dict[part_number] = parse_host_port(str(location[i])[:-3])

        # Let's divide the columns evenly, matching the order of the labels
        part_size = ceil(X_df.shape[1] / X_df.npartitions)

        # We scatter delayed operations to gather columns on the workers
        scattered = []
        coefs = []
        for i in range(X_df.npartitions):
            up_limit = min((i + 1) * part_size, X_df.shape[1])
            cols = X_df.columns.values[i * part_size:up_limit]
            loc_cudf = X_df[cols]
            yield wait(loc_cudf)
            scattered.append(
                client.submit(preprocess_on_worker,
                              loc_cudf,
                              workers=[loc_dict[i]]))
            yield wait(scattered)
            coefs.append(
                client.submit(dev_array_on_worker,
                              up_limit - i * part_size,
                              dtype=dtype,
                              unique=np.random.randint(0, 1e6),
                              workers=[loc_dict[i]]))
            yield wait(coefs)
            del (loc_cudf)

        # Break apart Dask.array/dataframe into chunks/parts
        # data_parts = map(delayed, scattered)
        data_parts = scattered
        label_parts = y_df.to_delayed()
        coef_parts = coefs

        # Arrange parts into pairs.  This enforces co-locality
        parts = list(map(delayed, zip(data_parts, label_parts, coef_parts)))
        parts = client.compute(parts)  # Start computation in the background
        yield wait(parts)

        for part in parts:
            if part.status == 'error':
                yield part  # trigger error locally

        # A dict in the form of { part_key: part }
        key_to_part_dict = dict([(str(part.key), part) for part in parts])

        who_has = yield client.who_has(parts)

        worker_parts = {}
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            if worker not in worker_parts:
                worker_parts[worker] = []
            worker_parts[worker].append(key_to_part_dict[key])
        """
        Create IP Handles on each worker hosting input data
        """

        # Format of input_devarrays = ([(X, y)..], dev)
        input_devarrays = [(worker,
                            client.submit(fit_to_device_arrays,
                                          part,
                                          workers=[worker]))
                           for worker, part in worker_parts.items()]

        yield wait(input_devarrays)
        """
        Gather IPC handles for each worker and call _fit() on each worker
        containing data.
        """

        # Last worker is the only one that can have fewer items.
        exec_node = loc_dict[X_df.npartitions - 1]

        # Need to fetch parts on worker
        on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
        not_on_worker = list(
            filter(lambda x: x[0] != exec_node, input_devarrays))

        ipc_handles = [
            client.submit(get_input_ipc_handles, future, workers=[a_worker])
            for a_worker, future in not_on_worker
        ]

        raw_arrays = [future for a_worker, future in on_worker]

        # IPC Handles are loaded in separate threads on worker so they can be
        # used to make calls through cython
        # Calls _fit_on_worker defined in the bottom
        intercept = client.submit(_fit_on_worker, (ipc_handles, raw_arrays),
                                  self._build_params_map(),
                                  workers=[exec_node])

        yield wait(intercept)

        coef_series = [
            client.submit(coef_on_worker,
                          coefs[i],
                          i,
                          X_df.shape[1],
                          X_df.npartitions,
                          loc_dict[i],
                          workers=[loc_dict[i]]) for i in range(len(loc_dict))
        ]

        # coef_on_worker(self, coef, locations, ncols, nparts, worker):

        raise gen.Return((coef_series, intercept, loc_dict))
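The who_has grouping used above boils down to a small helper. Below is a condensed, standalone sketch of that pattern (hypothetical names; parse_host_port is omitted), not part of the original example:

from toolz import first


def group_parts_by_worker(who_has, key_to_part_dict):
    """Group part futures by the first worker reported to hold each key."""
    worker_parts = {}
    for key, workers in who_has.items():
        worker = first(workers)  # primary holder of this key
        worker_parts.setdefault(worker, []).append(key_to_part_dict[key])
    return worker_parts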
Example #52
0
def binary_math_sql_select(t, lhs, rhs, **kwargs):
    left, right = first(lhs.inner_columns), first(rhs.inner_columns)
    result = getattr(sa.func, type(t).__name__)(left, right)
    assert lhs.table == rhs.table
    return reconstruct_select([result], lhs.table)
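As a rough, hypothetical illustration of the getattr(sa.func, ...) dispatch above (assumes SQLAlchemy is installed; the table and column names are made up):

import sqlalchemy as sa

t = sa.table('t', sa.column('a'), sa.column('b'))
left, right = t.c.a, t.c.b
expr = getattr(sa.func, 'pow')(left, right)  # equivalent to sa.func.pow(left, right)
print(expr)  # renders as: pow(t.a, t.b)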
Example #53
0
def _get_tenant(customer):
    tenant = frappe.get_all(
        "Tenant Master",
        filters={"customer": customer}
    )
    return first(tenant).get("name") if tenant else None
Example #54
0
def _key(self):
    return first(self._dasks[0])
Example #55
0
def test_callables():
    cl = CL(lambda: (list(range(3)) for i in range(3)))

    assert first(cl) == [0, 1, 2]
    assert first(cl) == [0, 1, 2]
Example #56
0
def read_parquet(path,
                 columns=None,
                 filters=None,
                 categories=None,
                 index=None,
                 **kwargs):
    """
    Read Dask DataFrame from ParquetFile

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : string
        Source directory for data.
        Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data.
    columns: list or None
        List of column names to load
    filters: list
        List of filters to apply, like ``[('x', '>', 0), ...]``
    index: string or None
        Name of index column to use if that column is sorted
    categories: list or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.

    Examples
    --------
    >>> df = read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """
    if fastparquet is False:
        raise ImportError("fastparquet not installed")
    if filters is None:
        filters = []
    myopen = OpenFileCreator(path, compression=None, text=False)

    try:
        pf = fastparquet.ParquetFile(path + myopen.fs.sep + '_metadata',
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    except Exception:
        # No _metadata file; infer metadata from the data files themselves
        pf = fastparquet.ParquetFile(path, open_with=myopen, sep=myopen.fs.sep)

    columns = columns or (pf.columns + list(pf.cats))
    rgs = [
        rg for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.helper))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    parts = [
        delayed(pf.read_row_group_file)(rg, columns, categories, **kwargs)
        for rg in rgs
    ]

    # TODO: if categories vary from one rg to next, need to cope
    dtypes = {
        k: ('category' if k in (categories or []) else v)
        for k, v in pf.dtypes.items() if k in columns
    }

    df = dd.from_delayed(parts, meta=dtypes)

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) > 1:
        if index:
            index_col = index
        else:
            raise ValueError("Multiple possible indexes exist: %s.  "
                             "Please select one with index='index-name'" %
                             sorted(minmax))
    elif len(minmax) == 1:
        index_col = first(minmax)
    else:
        index_col = None

    if index_col:
        divisions = list(
            minmax[index_col]['min']) + [minmax[index_col]['max'][-1]]
        df = df.set_index(index_col, sorted=True, divisions=divisions)

    return df
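A hypothetical usage sketch expanding on the docstring above (the bucket path, column names, and index are made up; requires fastparquet and a configured s3 backend):

df = read_parquet('s3://bucket/my-parquet-data',
                  columns=['x', 'y', 'timestamp'],
                  filters=[('x', '>', 0)],   # skip row-groups whose statistics rule out x > 0
                  index='timestamp')         # must be one of the sorted columns
result = df[df.x > 0].y.mean().compute()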
Example #57
0
def merge_tables(target, tables, columns=None):
    """
    Merge a number of tables onto a target table. Tables must have
    registered merge rules via the `broadcast` function.

    Parameters
    ----------
    target : str, DataFrameWrapper, or TableFuncWrapper
        Name of the table (or wrapped table) onto which tables will be merged.
    tables : list of `DataFrameWrapper`, `TableFuncWrapper`, or str
        All of the tables to merge. Should include the target table.
    columns : list of str, optional
        If given, columns will be mapped to `tables` and only those columns
        will be requested from each table. The final merged table will have
        only these columns. By default all columns are used from every
        table.

    Returns
    -------
    merged : pandas.DataFrame

    """
    # allow target to be string or table wrapper
    if isinstance(target, (DataFrameWrapper, TableFuncWrapper)):
        target = target.name

    # allow tables to be strings or table wrappers
    tables = [
        get_table(t)
        if not isinstance(t, (DataFrameWrapper, TableFuncWrapper)) else t
        for t in tables
    ]

    merges = {t.name: {} for t in tables}
    tables = {t.name: t for t in tables}
    casts = _get_broadcasts(tables.keys())
    logger.debug('attempting to merge tables {} to target table {}'.format(
        tables.keys(), target))

    # relate all the tables by registered broadcasts
    for table, onto in casts:
        merges[onto][table] = merges[table]
    merges = {target: merges[target]}

    # verify that all the tables can be merged to the target
    all_tables = set(_all_reachable_tables(merges))

    if all_tables != set(tables.keys()):
        raise RuntimeError(
            ('Not all tables can be merged to target "{}". Unlinked tables: {}'
             ).format(target, list(set(tables.keys()) - all_tables)))

    # add any columns necessary for indexing into other tables
    # during merges
    if columns:
        columns = list(columns)
        for c in casts.values():
            if c.onto_on:
                columns.append(c.onto_on)
            if c.cast_on:
                columns.append(c.cast_on)

    # get column map for which columns go with which table
    colmap = column_map(tables.values(), columns)

    # get frames
    frames = {
        name: t.to_frame(columns=colmap[name])
        for name, t in tables.items()
    }

    # perform merges until there's only one table left
    while merges[target]:
        nm = _next_merge(merges)
        onto = toolz.first(nm)
        onto_table = frames[onto]

        # loop over all the tables that can be broadcast onto
        # the onto_table and merge them all in.
        for cast in nm[onto]:
            cast_table = frames[cast]
            bc = casts[(cast, onto)]

            with log_start_finish('merge tables {} and {}'.format(onto, cast),
                                  logger):

                onto_table = pd.merge(onto_table,
                                      cast_table,
                                      left_on=bc.onto_on,
                                      right_on=bc.cast_on,
                                      left_index=bc.onto_index,
                                      right_index=bc.cast_index)

        # replace the existing table with the merged one
        frames[onto] = onto_table

        # free up space by dropping the cast table
        del frames[cast]

        # mark the onto table as having no more things to broadcast
        # onto it.
        _recursive_getitem(merges, onto)[onto] = {}

    logger.debug('finished merge')
    return frames[target]
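A minimal usage sketch, assuming orca-style registration helpers (add_table and broadcast) are available alongside merge_tables; the table names, columns, and join keys are hypothetical:

import pandas as pd

households = pd.DataFrame({'zone_id': [1, 1, 2], 'income': [40, 55, 70]})
zones = pd.DataFrame({'area': [9.5, 3.2]}, index=[1, 2])

add_table('households', households)           # register tables (assumed helper)
add_table('zones', zones)
broadcast(cast='zones', onto='households',    # zones.index joins households.zone_id
          cast_index=True, onto_on='zone_id')

merged = merge_tables(target='households', tables=['households', 'zones'])
# merged has income, zone_id, and the broadcast area column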
Example #58
0
import os
import pathlib
import sys
from importlib import metadata

import toml
import toolz
from appdirs import AppDirs
from kivy.factory import Factory

try:
    META = dict(metadata.metadata(__name__))
    __author__ = META["Author"]
    __version__ = META["Version"]
except metadata.PackageNotFoundError:
    pyproject_toml_path = toolz.first(
        pathlib.Path(__file__).parent.parent.glob("**/pyproject.toml"))
    with open(pyproject_toml_path) as file:
        pyproject_toml = toml.load(file)
    __author__ = pyproject_toml["tool"]["poetry"]["authors"][0]
    __version__ = pyproject_toml["tool"]["poetry"]["version"]

dirs = AppDirs(appname=__name__, appauthor=__author__, version=__version__)

sys.path.append(os.path.dirname(__file__))

CONFIG_DIR = pathlib.Path(dirs.user_config_dir)
CONFIG_PATH = CONFIG_DIR / "config.ini"
APP_DIR = pathlib.Path(dirs.user_data_dir)
HOME = pathlib.Path.home()
USER = HOME.stem
BASE_PATH = pathlib.Path(__file__).parent.absolute()
Example #59
0
def _data(self):
    return first(self._dasks[0].values())
Example #60
0
File: dag.py  Project: jakirkham/persist
def _finalize(self, args):
    if len(self._keys()) > 1:
        return args
    else:
        return first(args)