Example #1
def _encode_dask_array(values, encode=False):
    # type: (da.Array, bool) -> Any
    if encode:
        uniques, encoded = da.unique(values, return_inverse=True)
        return uniques, encoded
    else:
        return da.unique(values)
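A minimal usage sketch for the helper above; the sample array and chunking are illustrative, not taken from the original project:

import numpy as np
import dask.array as da

values = da.from_array(np.array([3, 1, 3, 2]), chunks=2)

# encode=True returns both the sorted unique labels and the integer codes
uniques, encoded = _encode_dask_array(values, encode=True)
# uniques.compute() -> array([1, 2, 3]); encoded.compute() -> array([2, 0, 2, 1])

# encode=False just forwards to da.unique
uniques_only = _encode_dask_array(values)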
Example #2
    def fit(self, X, y):
        """
        Fit a multi-node multi-GPU K-Nearest Neighbors Classifier index

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Index data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        y : array-like (device or host) shape = (n_samples, n_outputs)
            Index labels data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        Returns
        -------
        self : KNeighborsClassifier model
        """

        if not isinstance(X._meta, (np.ndarray, pd.DataFrame, cudf.DataFrame)):
            raise ValueError('This chunk type is not supported')

        self.data_handler = \
            DistributedDataHandler.create(data=[X, y],
                                          client=self.client)

        # uniq_labels: set of possible labels for each labels column
        # n_unique: number of possible labels for each labels column

        uniq_labels = []
        if self.data_handler.datatype == 'cupy':
            if y.ndim == 1:
                uniq_labels.append(da.unique(y))
            else:
                n_targets = y.shape[1]
                for i in range(n_targets):
                    uniq_labels.append(da.unique(y[:, i]))
        else:
            if isinstance(y, DaskSeries):
                uniq_labels.append(y.unique())
            else:
                n_targets = len(y.columns)
                for i in range(n_targets):
                    uniq_labels.append(y.iloc[:, i].unique())

        uniq_labels = da.compute(uniq_labels)[0]
        if hasattr(uniq_labels[0], 'values_host'):  # for cuDF Series
            uniq_labels = list(map(lambda x: x.values_host, uniq_labels))
        elif hasattr(uniq_labels[0], 'values'):  # for pandas Series
            uniq_labels = list(map(lambda x: x.values, uniq_labels))
        self.uniq_labels = np.array(uniq_labels)
        self.n_unique = list(map(lambda x: len(x), self.uniq_labels))

        return self
Example #3
    def fit(self, X, y):
        """
        Fit a multi-node multi-GPU K-Nearest Neighbors Classifier index

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Index data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        y : array-like (device or host) shape = (n_samples, n_outputs)
            Index labels data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        Returns
        -------
        self : KNeighborsClassifier model
        """
        self.data_handler = \
            DistributedDataHandler.create(data=[X, y],
                                          client=self.client)

        # Compute set of possible labels for each output column -> uniq_labels
        # Count possible labels for each columns -> n_unique

        uniq_labels = []
        if self.data_handler.datatype == 'cupy':
            if y.ndim == 1:
                uniq_labels.append(da.unique(y))
            else:
                n_targets = y.shape[1]
                for i in range(n_targets):
                    uniq_labels.append(da.unique(y[:, i]))
        else:
            if isinstance(y, DaskSeries):
                uniq_labels.append(y.unique())
            else:
                n_targets = len(y.columns)
                for i in range(n_targets):
                    uniq_labels.append(y.iloc[:, i].unique())

        uniq_labels = da.compute(uniq_labels)[0]
        if not isinstance(uniq_labels[0], np.ndarray):  # for cuDF Series
            uniq_labels = list(map(lambda x: x.values_host, uniq_labels))
        self.uniq_labels = np.array(uniq_labels)
        self.n_unique = list(map(lambda x: len(x), self.uniq_labels))

        return self
Example #4
    def compute_class_weight(class_weight, *, classes, y):
        if not DaskToolBox.is_dask_object(y):
            return sk_utils.class_weight.compute_class_weight(class_weight,
                                                              classes=classes,
                                                              y=y)

        y = DaskToolBox.make_chunk_size_known(y)
        if set(dask.compute(da.unique(y))[0]) - set(classes):
            raise ValueError(
                "classes should include all valid labels that can be in y")

        if class_weight == 'balanced':
            # Find the weight of each class as present in y.
            le = dm_pre.LabelEncoder()
            y_ind = le.fit_transform(y)
            # if not all(np.in1d(classes, le.classes_)):
            #     raise ValueError("classes should have valid labels that are in y")
            # recip_freq = len(y) / (len(le.classes_) *
            #                        np.bincount(y_ind).astype(np.float64))
            # weight = recip_freq[le.transform(classes)]
            y_shape, y_ind_bincount, le_classes_ = dask.compute(
                y.shape, da.bincount(y_ind), le.classes_)
            if not all(np.in1d(classes, le_classes_)):
                raise ValueError(
                    "classes should have valid labels that are in y")
            recip_freq = y_shape[0] / (len(le_classes_) *
                                       y_ind_bincount.astype(np.float64))
            weight = recip_freq[np.searchsorted(le_classes_, classes)]
        else:
            raise ValueError("Only class_weight == 'balanced' is supported.")

        return weight
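For reference, the 'balanced' branch above boils down to n_samples / (n_classes * bincount); a standalone sketch of that arithmetic, assuming the labels are already integer-encoded (the values below are illustrative):

import numpy as np
import dask
import dask.array as da

y = da.from_array(np.array([0, 0, 0, 1, 1, 2]), chunks=3)
classes = np.array([0, 1, 2])

# class counts and total length, computed in one pass over the dask graph
n_samples, bincount = dask.compute(y.shape[0], da.bincount(y, minlength=len(classes)))
weight = n_samples / (len(classes) * bincount.astype(np.float64))
print(weight)   # approximately [0.667, 1.0, 2.0]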
Example #5
File: io.py Project: ogrisel/dask
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #6
def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        y = da.ones(X.shape[0])
    
    len_ = X.shape[0]    
    X = prepare_dataset(X)
        
    if dummies:
        y = dd.get_dummies(y)
        
    shape_  = list(X.shape[1:])
    
    X_train, X_test, y_train, y_test  = train_test_split(X.flatten().reshape(len_,-1), y, test_size=test_size, random_state=4891)
    
    X_train = X_train.reshape([X_train.shape[0]]+shape_)
    X_test = X_test.reshape([X_test.shape[0]]+shape_)
     
    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)
    
    samples = list()
    for _ in range(10):
        for y_uniq in da.unique(train_dataset.labels):
            class_x = train_dataset.x[train_dataset.labels == y_uniq]
            samples.append(class_x[random.randint(0, len(class_x) - 1)])
    
    train_dataset.samples = da.array(samples)
    return train_dataset, test_dataset
Example #7
File: io.py Project: amatthies/dask
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #8
def test_unique_kwargs(return_index, return_inverse, return_counts):
    kwargs = dict(
        return_index=return_index,
        return_inverse=return_inverse,
        return_counts=return_counts
    )

    a = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(a, chunks=(3,))

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    if not any([return_index, return_inverse, return_counts]):
        assert isinstance(r_a, np.ndarray)
        assert isinstance(r_d, da.Array)

        r_a = (r_a,)
        r_d = (r_d,)

    assert len(r_a) == len(r_d)

    if return_inverse:
        i = 1 + int(return_index)
        assert (d.size,) == r_d[i].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
Example #9
def test_unique_kwargs(return_index, return_inverse, return_counts):
    kwargs = dict(return_index=return_index,
                  return_inverse=return_inverse,
                  return_counts=return_counts)

    a = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(a, chunks=(3, ))

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    if not any([return_index, return_inverse, return_counts]):
        assert isinstance(r_a, np.ndarray)
        assert isinstance(r_d, da.Array)

        r_a = (r_a, )
        r_d = (r_d, )

    assert len(r_a) == len(r_d)

    if return_inverse:
        i = 1 + int(return_index)
        assert (d.size, ) == r_d[i].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
Example #10
File: oracle.py Project: a24lorie/DPyACL
    def __init__(self, labels, indexes=None, cost=None):
        """
        :param labels:
        :param indexes:
        :param cost:
        """
        if not check_one_to_one_correspondence(labels, indexes, cost):
            raise ValueError(
                "Different length of parameters found. "
                "All parameters should be list type with the same length")

        labels = check_array(labels, ensure_2d=False, dtype=None)

        if isinstance(labels[0], np.generic):
            self._label_type = type(labels[0].item())
        else:
            self._label_type = type(labels[0])

        self._label_dim = labels.ndim
        self._label_unique = da.unique(labels)

        # check parameters
        self._cost_flag = True if cost is not None else False

        # several _indexes construct
        if self._cost_flag:
            self._ind2all = dict(
                zip(
                    indexes if indexes is not None else
                    [i for i in range(len(labels))], zip(labels, cost)))
        else:
            self._ind2all = dict(
                zip(
                    indexes if indexes is not None else
                    [i for i in range(len(labels))], labels))
Example #11
def test_unique_rand(seed, low, high, shape, chunks):
    cupy.random.seed(seed)

    a = cupy.random.randint(low, high, size=shape)
    d = da.from_array(a, chunks=chunks)

    r_a = np.unique(a)
    r_d = da.unique(d)
    assert_eq(r_d, r_a)
Example #12
def predict(args):
    # Convert source data into dask arrays
    sky_model = parse_sky_model(args.sky_model, args.model_chunks)

    # Get the support tables
    tables = support_tables(
        args, ["FIELD", "DATA_DESCRIPTION", "SPECTRAL_WINDOW", "POLARIZATION"])

    field_ds = tables["FIELD"]
    ddid_ds = tables["DATA_DESCRIPTION"]
    spw_ds = tables["SPECTRAL_WINDOW"]
    pol_ds = tables["POLARIZATION"]

    # List of write operations
    writes = []

    # Construct a graph for each DATA_DESC_ID
    for xds in xds_from_ms(args.ms,
                           columns=["UVW", "ANTENNA1", "ANTENNA2", "TIME"],
                           group_cols=["FIELD_ID", "DATA_DESC_ID"],
                           chunks={"row": args.row_chunks}):

        # Extract frequencies from the spectral window associated
        # with this data descriptor id
        field = field_ds[xds.attrs['FIELD_ID']]
        ddid = ddid_ds[xds.attrs['DATA_DESC_ID']]
        spw = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol = pol_ds[ddid.POLARIZATION_ID.data[0]]

        # Select single dataset row out
        corrs = pol.NUM_CORR.data[0]

        _, time_index = da.unique(xds.TIME.data, return_inverse=True)

        # Generate visibility expressions for each source type
        source_vis = [
            vis_factory(args, stype, sky_model, time_index, xds, field, spw,
                        pol) for stype in sky_model.keys()
        ]

        # Sum visibilities together
        vis = sum(source_vis)

        # Reshape (2, 2) correlation to shape (4,)
        if corrs == 4:
            vis = vis.reshape(vis.shape[:2] + (4, ))

        # Assign visibilities to MODEL_DATA array on the dataset
        xds = xds.assign(MODEL_DATA=(("row", "chan", "corr"), vis))
        # Create a write to the table
        write = xds_to_table(xds, args.ms, ['MODEL_DATA'])
        # Add to the list of writes
        writes.append(write)

    # Submit all graph computations in parallel
    with ProgressBar():
        dask.compute(writes)
Example #13
def _run_dask_numpy_quantile(data, k):
    w = 100.0 / k
    p = da.arange(w, 100 + w, w)

    if p[-1] > 100.0:
        p[-1] = 100.0

    q = da.percentile(data.flatten(), p)
    q = da.unique(q)
    return q
Example #14
 def unique(y):
     if isinstance(y, da.Array):
         uniques = da.unique(y).compute()
         uniques = set(uniques)
     elif isinstance(y, dd.Series):
         uniques = y.unique().compute()
         uniques = set(uniques)
     else:
         uniques = ToolBox.unique(y)
     return uniques
Example #15
def test_unique():
    a = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(a, chunks=(3, ))

    r_a = np.unique(a)
    r_d = da.unique(d)

    assert isinstance(r_d, da.Array)

    assert_eq(r_d, r_a)
Example #16
    def fit(self, y):
        y = self._check_array(y)

        if isinstance(y, da.Array):
            classes_ = da.unique(y)
            classes_ = classes_.compute()
        else:
            classes_ = np.unique(y)

        self.classes_ = classes_

        return self
Example #17
 def fetch(self, **params):
     return (self._data.filter(lambda p: p.name in params)
             .map(lambda p: p[
                 reduce(
                     lambda x, y: np.bitwise_and(x, (p[y[0]] == y[1])),
                     params[p.name].items(),
                     np.ones(p.shape, dtype=bool)
                 )
             ])
             .fold(lambda p, q: da.concatenate([p, q]))
             .apply(lambda p: da.unique(p.compute()))
             .compute())
Example #18
    def value_counts(ar):
        if isinstance(ar, da.Array):
            v_n = da.unique(ar, return_counts=True)
            v_n = dask.compute(*v_n)
            return {v: n for v, n in zip(*v_n)}
        elif isinstance(ar, dd.Series):
            s = ar
        elif isinstance(ar, dd.DataFrame):
            assert ar.shape[1] == 1
            s = ar.iloc[:, 0]
        else:
            return ToolBox.value_counts(ar)

        return s.value_counts().compute().to_dict()
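The dask.array branch above reduces to da.unique with return_counts; a small standalone sketch of just that path (array contents are illustrative):

import numpy as np
import dask
import dask.array as da

ar = da.from_array(np.array([1, 2, 2, 3, 3, 3]), chunks=3)
values, counts = dask.compute(*da.unique(ar, return_counts=True))
print({int(v): int(n) for v, n in zip(values, counts)})   # {1: 1, 2: 2, 3: 3}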
Example #19
    def agreement(self, estimators):
        """
        Implementation of Query By Committee strategy, variant: Vote entropy.

        The vote entropy approach is used for measuring the level of disagreement.

        I. Dagan and S. Engelson. Committee-based sampling for training probabilistic
        classifiers. In Proceedings of the International Conference on Machine
        Learning (ICML), pages 150–157. Morgan Kaufmann, 1995.

        :param estimators:
        :return:
        """
        score = []
        input_shape, committee_size = QueryByCommitteeStategy.check_committee_results(
            estimators)
        if len(input_shape) == 2:
            ele_uni = da.unique(estimators).compute()
            if not (len(ele_uni) == 2 and 0 in ele_uni and 1 in ele_uni):
                raise ValueError(
                    "The predicted label matrix must only contain 0 and 1")

            # calc each instance
            for i in range(input_shape[0]):
                instance_mat = da.from_array(
                    np.array([X[i, :] for X in estimators
                              if X is not None])).compute()
                voting = da.sum(instance_mat, axis=0)

                tmp = []
                for vote in voting:
                    if vote != 0:
                        tmp.append(
                            delayed(vote / len(estimators) *
                                    np.log(vote / len(estimators))))
                score.append(-delayed(sum)(tmp))
        else:
            input_mat = da.from_array(
                np.array([X for X in estimators if X is not None])).compute()
            # for each instance
            for i in range(input_shape[0]):
                count_dict = collections.Counter(input_mat[:, i])
                tmp = []
                for key in count_dict:
                    tmp.append(
                        delayed(count_dict[key] / committee_size *
                                np.log(count_dict[key] / committee_size)))
                score.append(-delayed(sum)(tmp))

        return compute(score)[0]
Example #20
def save_label_idx_map(trainFileName):
    # get labels
    labels = h5py.File(trainFileName, "r")["labels"]
    labels_da = da.from_array(labels, chunks=(4, 512, 512))

    label_idx_map = {}
    # count number of occurrences for each label
    for idx in range(1, NUMCLASSES):
        start = time.time()
        X, Y, Z = da.where(labels_da == idx)
        label_idx_map[idx] = da.unique(X).compute()
        print("Finished label {0} in {1:.3f} s".format(idx, time.time() - start))

    with h5py.File(trainFileName.replace(".h5", "_IDX_MAP.h5"), "w") as newFile:
        for idx in range(1, NUMCLASSES):
            newFile.create_dataset(str(idx), data=label_idx_map[idx], dtype=np.int16)
Example #21
def test_unique_rand(seed, low, high, shape, chunks):
    np.random.seed(seed)

    a = np.random.randint(low, high, size=shape)
    d = da.from_array(a, chunks=chunks)

    kwargs = dict(return_index=True, return_inverse=True, return_counts=True)

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    assert len(r_a) == len(r_d)

    assert (d.size, ) == r_d[2].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
Example #22
def unique(ar):
    r"""Find the unique elements of an array.

    It uses ``dask.array.unique`` if necessary.

    Args:
        ar (array_like): Input array.

    Returns:
        array_like: the sorted unique elements.
    """

    import dask.array as da

    if isinstance(ar, da.core.Array):
        return da.unique(ar)

    return _unique(ar)
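A hedged usage sketch for the wrapper above; `_unique` is assumed to be the module's eager fallback (roughly np.unique), so only the dask branch is shown computing here:

import numpy as np
import dask.array as da

d = da.from_array(np.array([3, 1, 3, 2]), chunks=2)
result = unique(d)        # lazy: a dask array of the sorted unique elements
print(result.compute())   # [1 2 3]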
Example #23
def compute_sample_weight(y):
    assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)

    if is_dask_dataframe_or_series(y):
        y = y.values

    unique = compute(da.unique(y))[0] if is_dask_object(y) else np.unique(y)
    cw = list(compute_class_weight('balanced', unique, y))

    if is_dask_object(y):
        sample_weight = y.map_blocks(_compute_chunk_sample_weight,
                                     unique,
                                     cw,
                                     dtype=np.float64)
    else:
        sample_weight = _compute_chunk_sample_weight(y, unique, cw)

    return sample_weight
Example #24
def cue_times(data: Dict[str, da.Array], message: int) -> da.Array:
    """
    Find the timestamps of all instances of a cue message in a Tristan data set.

    The found timestamps are de-duplicated.

    Args:
        data:     A LATRD data dictionary (a dictionary with data set names as keys
                  and Dask arrays as values).  Must contain one entry for cue id
                  messages and one for cue timestamps.  The two arrays are assumed
                  to have the same length.
        message:  The message code, as defined in the Tristan standard.

    Returns:
        The timestamps, measured in clock cycles from the global synchronisation
        signal, de-duplicated.
    """
    index = da.flatnonzero(data[cue_id_key] == message)
    return da.unique(data[cue_time_key][index])
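A minimal, hedged sketch of calling cue_times; the key names and message code below are placeholders for illustration, not values from the Tristan standard:

import numpy as np
import dask.array as da

cue_id_key, cue_time_key = "cue_id", "cue_timestamp"   # assumed module-level constants

data = {
    cue_id_key: da.from_array(np.array([0x800, 0x840, 0x800, 0x840]), chunks=2),
    cue_time_key: da.from_array(np.array([10, 11, 12, 11]), chunks=2),
}
print(cue_times(data, message=0x840).compute())   # de-duplicated timestamps: [11]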
Example #25
    def _test_unique_kwargs():
        r_a = np.unique(a, **kwargs)
        r_d = da.unique(d, **kwargs)

        if not any([return_index, return_inverse, return_counts]):
            assert isinstance(r_a, cupy.ndarray)
            assert isinstance(r_d, da.Array)

            r_a = (r_a, )
            r_d = (r_d, )

        assert len(r_a) == len(r_d)

        if return_inverse:
            i = 1 + int(return_index)
            assert (d.size, ) == r_d[i].shape

        for e_r_a, e_r_d in zip(r_a, r_d):
            assert_eq(e_r_d, e_r_a)
Example #26
    def compute_sample_weight(y):
        assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)

        if hasattr(y, 'values'):
            y = y.values

        unique = dask.compute(
            da.unique(y))[0] if DaskToolBox.is_dask_object(y) else np.unique(y)
        cw = list(
            DaskToolBox.compute_class_weight('balanced', classes=unique, y=y))

        if DaskToolBox.is_dask_object(y):
            sample_weight = y.map_blocks(_compute_chunk_sample_weight,
                                         unique,
                                         cw,
                                         dtype=np.float64)
        else:
            sample_weight = _compute_chunk_sample_weight(y, unique, cw)

        return sample_weight
Example #27
def plot_subfigure(X, Y, subplot, transform):
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError

    min_x = da.min(X[:, 0])
    max_x = da.max(X[:, 0])

    min_y = da.min(X[:, 1])
    max_y = da.max(X[:, 1])

    classif = OneVsRestClassifier(LogisticRegression())
    classif.fit(X, Y)
    y_pred = classif.predict(X)

    print('{} + OneVsRestClassifier + LogisticRegression accuracy_score {}'.
          format(transform, accuracy_score(Y, y_pred)))

    plt.subplot(1, 2, subplot)
    plt.scatter(X[:, 0], X[:, 1], s=15, c='gray', edgecolors=(0, 0, 0))

    for i in da.unique(Y.argmax(axis=1)):
        class_ = da.where(Y[:, i])
        plt.scatter(X[class_, 0],
                    X[class_, 1],
                    s=25,
                    linewidths=2,
                    label='Class {}'.format(str(i)))

    for i in range(len(classif.estimators_)):
        plot_hyperplane(classif.estimators_[i], min_x, max_x, 'k--',
                        'Boundary\nfor class {}'.format(str(i)))

    plt.xticks(())
    plt.yticks(())

    plt.xlim(min_x - .1 * max_x, max_x + .1 * max_x)
    plt.ylim(min_y - .1 * max_y, max_y + .1 * max_y)
Example #28
def unique_baselines(ant1, ant2):
    """
    Returns unique baseline pairs across all dask chunks as 64 bit ints

    The resulting computed numpy array should be recast and shaped
    as follows:

    .. code-block:: python

        ubl_dask = unique_baselines(ant1, ant2)

        ubl = dask.compute(ubl_dask)[0].view(np.int32).reshape(-1, 2)
    """
    if not (ant1.dtype == np.int32 and ant2.dtype == np.int32):
        raise TypeError("antenna1 '%s' and antenna2 '%s' dtypes "
                        "must both be np.int32" % (ant1.dtype, ant2.dtype))

    # Stack, create a 64 bit baseline values
    bl = da.stack([ant1, ant2], axis=1)
    bl = bl.rechunk(-1, 2).view(np.int64)
    return da.unique(bl)
Example #29
def test_unique_rand(seed, low, high, shape, chunks):
    np.random.seed(seed)

    a = np.random.randint(low, high, size=shape)
    d = da.from_array(a, chunks=chunks)

    kwargs = dict(
        return_index=True,
        return_inverse=True,
        return_counts=True
    )

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    assert len(r_a) == len(r_d)

    assert (d.size,) == r_d[2].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
Example #30
File: dask.py Project: vcarpani/xgboost
    async def _fit_async(self, X, y, sample_weight, base_margin, eval_set,
                         sample_weight_eval_set, early_stopping_rounds,
                         verbose):
        dtrain = await DaskDMatrix(client=self.client,
                                   data=X,
                                   label=y,
                                   weight=sample_weight,
                                   base_margin=base_margin,
                                   missing=self.missing)
        params = self.get_xgb_params()

        # pylint: disable=attribute-defined-outside-init
        if isinstance(y, (da.Array)):
            self.classes_ = await self.client.compute(da.unique(y))
        else:
            self.classes_ = await self.client.compute(y.drop_duplicates())
        self.n_classes_ = len(self.classes_)

        if self.n_classes_ > 2:
            params["objective"] = "multi:softprob"
            params['num_class'] = self.n_classes_
        else:
            params["objective"] = "binary:logistic"

        evals = await _evaluation_matrices(self.client, eval_set,
                                           sample_weight_eval_set,
                                           self.missing)
        results = await train(client=self.client,
                              params=params,
                              dtrain=dtrain,
                              num_boost_round=self.get_num_boosting_rounds(),
                              evals=evals,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose)
        self._Booster = results['booster']
        # pylint: disable=attribute-defined-outside-init
        self.evals_result_ = results['history']
        return self
Example #31
def test_unique():
    x = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.unique(d), np.unique(x))
Example #32
File: io.py Project: danielballan/dask
def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock,
               **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int, optional
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                np.issubdtype(x.dtype[name], np.unicode_) or
                np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #33
def test_unique():
    x = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(x, chunks=(3, ))
    assert_eq(da.unique(d), np.unique(x))
Example #34
def from_bcolz(x,
               chunksize=None,
               categorize=True,
               index=None,
               lock=lock,
               **kwargs):
    """ Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size(rows) of blocks to pull out from ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_)
                    or np.issubdtype(x.dtype[name], np.unicode_)
                    or np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names), ))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1, )
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i), (dataframe_from_ctable, x,
                                (slice(i * chunksize, (i + 1) * chunksize), ),
                                columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names), ))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #35
    def _test_basic(c, s, a, b):
        rng = da.random.RandomState(42)

        n, d = (50, 2)
        # create observations we know linear models can fit
        X = rng.normal(size=(n, d), chunks=n // 2)
        coef_star = rng.uniform(size=d, chunks=d)
        y = da.sign(X.dot(coef_star))

        if array_type == "numpy":
            X, y = yield c.compute((X, y))

        params = {
            "loss":
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            "average": [True, False],
            "learning_rate": ["constant", "invscaling", "optimal"],
            "eta0":
            np.logspace(-2, 0, num=1000),
        }
        model = SGDClassifier(tol=-np.inf,
                              penalty="elasticnet",
                              random_state=42,
                              eta0=0.1)
        if library == "dask-ml":
            model = Incremental(model)
            params = {"estimator__" + k: v for k, v in params.items()}
        elif library == "ConstantFunction":
            model = ConstantFunction()
            params = {"value": np.linspace(0, 1, num=1000)}

        search = HyperbandSearchCV(model,
                                   params,
                                   max_iter=max_iter,
                                   random_state=42)
        classes = c.compute(da.unique(y))
        yield search.fit(X, y, classes=classes)

        if library == "dask-ml":
            X, y = yield c.compute((X, y))
        score = search.best_estimator_.score(X, y)
        assert score == search.score(X, y)
        assert 0 <= score <= 1

        if library == "ConstantFunction":
            assert score == search.best_score_
        else:
            # These are not equal because IncrementalSearchCV uses a train/test
            # split and we're testing on the entire train dataset, not only the
            # validation/test set.
            assert abs(score - search.best_score_) < 0.1

        assert type(search.best_estimator_) == type(model)
        assert isinstance(search.best_params_, dict)

        num_fit_models = len(set(search.cv_results_["model_id"]))
        num_pf_calls = sum([
            v[-1]["partial_fit_calls"] for v in search.model_history_.values()
        ])
        models = {9: 17, 15: 17, 20: 17, 27: 49, 30: 49, 81: 143}
        pf_calls = {9: 69, 15: 101, 20: 144, 27: 357, 30: 379, 81: 1581}
        assert num_fit_models == models[max_iter]
        assert num_pf_calls == pf_calls[max_iter]

        best_idx = search.best_index_
        if isinstance(model, ConstantFunction):
            assert search.cv_results_["test_score"][best_idx] == max(
                search.cv_results_["test_score"])
        model_ids = {h["model_id"] for h in search.history_}

        if math.log(max_iter, 3) % 1.0 == 0:
            # log(max_iter, 3) % 1.0 == 0 is the good case when max_iter is a
            # power of search.aggressiveness
            # In this case, assert that more models are tried than max_iter
            assert len(model_ids) > max_iter
        else:
            # Otherwise, give some padding "almost as many estimators are tried
            # as max_iter". 3 is a fudge number chosen to be the minimum; when
            # max_iter=20, len(model_ids) == 17.
            assert len(model_ids) + 3 >= max_iter

        assert all("bracket" in id_ for id_ in model_ids)
Example #36
def test_unique():
    x = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(x, blockshape=(3,))
    assert eq(da.unique(d), np.unique(x))