def _encode_dask_array(values, encode=False):
    # type: (da.Array, bool) -> Any
    if encode:
        uniques, encoded = da.unique(values, return_inverse=True)
        return uniques, encoded
    else:
        return da.unique(values)
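A minimal usage sketch of the helper above (the input array here is made up), showing what the two `da.unique` branches return:

import numpy as np
import dask.array as da

values = da.from_array(np.array([3, 1, 3, 2, 1]), chunks=2)

# encode=False branch: just the sorted unique values
print(_encode_dask_array(values).compute())              # [1 2 3]

# encode=True branch: uniques plus inverse indices that map back onto `values`
uniques, encoded = _encode_dask_array(values, encode=True)
print(uniques.compute())                                 # [1 2 3]
print(encoded.compute())                                 # [2 0 2 1 0]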
def fit(self, X, y):
    """
    Fit a multi-node multi-GPU K-Nearest Neighbors Classifier index

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Index data.
        Acceptable formats: dask CuPy/NumPy/Numba Array
    y : array-like (device or host) shape = (n_samples, n_features)
        Index labels data.
        Acceptable formats: dask CuPy/NumPy/Numba Array

    Returns
    -------
    self : KNeighborsClassifier model
    """
    if not isinstance(X._meta, (np.ndarray, pd.DataFrame, cudf.DataFrame)):
        raise ValueError('This chunk type is not supported')

    self.data_handler = \
        DistributedDataHandler.create(data=[X, y], client=self.client)

    # uniq_labels: set of possible labels for each labels column
    # n_unique: number of possible labels for each labels column
    uniq_labels = []
    if self.data_handler.datatype == 'cupy':
        if y.ndim == 1:
            uniq_labels.append(da.unique(y))
        else:
            n_targets = y.shape[1]
            for i in range(n_targets):
                uniq_labels.append(da.unique(y[:, i]))
    else:
        if isinstance(y, DaskSeries):
            uniq_labels.append(y.unique())
        else:
            n_targets = len(y.columns)
            for i in range(n_targets):
                uniq_labels.append(y.iloc[:, i].unique())

    uniq_labels = da.compute(uniq_labels)[0]
    if hasattr(uniq_labels[0], 'values_host'):  # for cuDF Series
        uniq_labels = list(map(lambda x: x.values_host, uniq_labels))
    elif hasattr(uniq_labels[0], 'values'):  # for pandas Series
        uniq_labels = list(map(lambda x: x.values, uniq_labels))
    self.uniq_labels = np.array(uniq_labels)
    self.n_unique = list(map(lambda x: len(x), self.uniq_labels))

    return self
def fit(self, X, y):
    """
    Fit a multi-node multi-GPU K-Nearest Neighbors Classifier index

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Index data.
        Acceptable formats: dask CuPy/NumPy/Numba Array
    y : array-like (device or host) shape = (n_samples, n_features)
        Index labels data.
        Acceptable formats: dask CuPy/NumPy/Numba Array

    Returns
    -------
    self : KNeighborsClassifier model
    """
    self.data_handler = \
        DistributedDataHandler.create(data=[X, y], client=self.client)

    # Compute set of possible labels for each output column -> uniq_labels
    # Count possible labels for each column -> n_unique
    uniq_labels = []
    if self.data_handler.datatype == 'cupy':
        if y.ndim == 1:
            uniq_labels.append(da.unique(y))
        else:
            n_targets = y.shape[1]
            for i in range(n_targets):
                uniq_labels.append(da.unique(y[:, i]))
    else:
        if isinstance(y, DaskSeries):
            uniq_labels.append(y.unique())
        else:
            n_targets = len(y.columns)
            for i in range(n_targets):
                uniq_labels.append(y.iloc[:, i].unique())

    uniq_labels = da.compute(uniq_labels)[0]
    if not isinstance(uniq_labels[0], np.ndarray):  # for cuDF Series
        uniq_labels = list(map(lambda x: x.values_host, uniq_labels))
    self.uniq_labels = np.array(uniq_labels)
    self.n_unique = list(map(lambda x: len(x), self.uniq_labels))

    return self
def compute_class_weight(class_weight, *, classes, y):
    if not DaskToolBox.is_dask_object(y):
        return sk_utils.class_weight.compute_class_weight(class_weight,
                                                          classes=classes,
                                                          y=y)

    y = DaskToolBox.make_chunk_size_known(y)
    if set(dask.compute(da.unique(y))[0]) - set(classes):
        raise ValueError(
            "classes should include all valid labels that can be in y")
    if class_weight == 'balanced':
        # Find the weight of each class as present in y.
        le = dm_pre.LabelEncoder()
        y_ind = le.fit_transform(y)
        # if not all(np.in1d(classes, le.classes_)):
        #     raise ValueError("classes should have valid labels that are in y")
        # recip_freq = len(y) / (len(le.classes_) *
        #                        np.bincount(y_ind).astype(np.float64))
        # weight = recip_freq[le.transform(classes)]
        y_shape, y_ind_bincount, le_classes_ = dask.compute(
            y.shape, da.bincount(y_ind), le.classes_)
        if not all(np.in1d(classes, le_classes_)):
            raise ValueError(
                "classes should have valid labels that are in y")

        recip_freq = y_shape[0] / (len(le_classes_) *
                                   y_ind_bincount.astype(np.float64))
        weight = recip_freq[np.searchsorted(le_classes_, classes)]
    else:
        raise ValueError("Only class_weight == 'balanced' is supported.")

    return weight
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------
    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        y = da.ones(X.shape[0])
    len_ = X.shape[0]
    X = prepare_dataset(X)
    if dummies:
        y = dd.get_dummies(y)
    shape_ = list(X.shape[1:])
    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=4891)
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)
    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)
    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)
    samples = list()
    for _ in range(10):
        for y_uniq in da.unique(train_dataset.labels):
            samples.append(
                train_dataset.x[train_dataset.labels == y_uniq][
                    random.randint(0, len(train_dataset.x[train_dataset.labels == y_uniq]) - 1)])
    train_dataset.samples = da.array(samples)
    return train_dataset, test_dataset
def test_unique_kwargs(return_index, return_inverse, return_counts):
    kwargs = dict(
        return_index=return_index,
        return_inverse=return_inverse,
        return_counts=return_counts
    )

    a = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(a, chunks=(3,))

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    if not any([return_index, return_inverse, return_counts]):
        assert isinstance(r_a, np.ndarray)
        assert isinstance(r_d, da.Array)
        r_a = (r_a,)
        r_d = (r_d,)

    assert len(r_a) == len(r_d)

    if return_inverse:
        i = 1 + int(return_index)
        assert (d.size,) == r_d[i].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
def test_unique_kwargs(return_index, return_inverse, return_counts):
    kwargs = dict(return_index=return_index,
                  return_inverse=return_inverse,
                  return_counts=return_counts)

    a = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(a, chunks=(3, ))

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    if not any([return_index, return_inverse, return_counts]):
        assert isinstance(r_a, np.ndarray)
        assert isinstance(r_d, da.Array)
        r_a = (r_a, )
        r_d = (r_d, )

    assert len(r_a) == len(r_d)

    if return_inverse:
        i = 1 + int(return_index)
        assert (d.size, ) == r_d[i].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
def __init__(self, labels, indexes=None, cost=None):
    """
    :param labels:
    :param indexes:
    :param cost:
    """
    if not check_one_to_one_correspondence(labels, indexes, cost):
        raise ValueError(
            "Different length of parameters found. "
            "All parameters should be list type with the same length")

    labels = check_array(labels, ensure_2d=False, dtype=None)
    if isinstance(labels[0], np.generic):
        self._label_type = type(labels[0].item())
    else:
        self._label_type = type(labels[0])
    self._label_dim = labels.ndim
    self._label_unique = da.unique(labels)

    # check parameters
    self._cost_flag = True if cost is not None else False

    # several _indexes construct
    if self._cost_flag:
        self._ind2all = dict(
            zip(indexes if indexes is not None else
                [i for i in range(len(labels))],
                zip(labels, cost)))
    else:
        self._ind2all = dict(
            zip(indexes if indexes is not None else
                [i for i in range(len(labels))],
                labels))
def test_unique_rand(seed, low, high, shape, chunks):
    cupy.random.seed(seed)

    a = cupy.random.randint(low, high, size=shape)
    d = da.from_array(a, chunks=chunks)

    r_a = np.unique(a)
    r_d = da.unique(d)
    assert_eq(r_d, r_a)
def predict(args):
    # Convert source data into dask arrays
    sky_model = parse_sky_model(args.sky_model, args.model_chunks)

    # Get the support tables
    tables = support_tables(args, ["FIELD", "DATA_DESCRIPTION",
                                   "SPECTRAL_WINDOW", "POLARIZATION"])

    field_ds = tables["FIELD"]
    ddid_ds = tables["DATA_DESCRIPTION"]
    spw_ds = tables["SPECTRAL_WINDOW"]
    pol_ds = tables["POLARIZATION"]

    # List of write operations
    writes = []

    # Construct a graph for each DATA_DESC_ID
    for xds in xds_from_ms(args.ms,
                           columns=["UVW", "ANTENNA1", "ANTENNA2", "TIME"],
                           group_cols=["FIELD_ID", "DATA_DESC_ID"],
                           chunks={"row": args.row_chunks}):

        # Extract frequencies from the spectral window associated
        # with this data descriptor id
        field = field_ds[xds.attrs['FIELD_ID']]
        ddid = ddid_ds[xds.attrs['DATA_DESC_ID']]
        spw = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol = pol_ds[ddid.POLARIZATION_ID.data[0]]

        # Select single dataset row out
        corrs = pol.NUM_CORR.data[0]

        _, time_index = da.unique(xds.TIME.data, return_inverse=True)

        # Generate visibility expressions for each source type
        source_vis = [
            vis_factory(args, stype, sky_model, time_index,
                        xds, field, spw, pol)
            for stype in sky_model.keys()
        ]

        # Sum visibilities together
        vis = sum(source_vis)

        # Reshape (2, 2) correlation to shape (4,)
        if corrs == 4:
            vis = vis.reshape(vis.shape[:2] + (4, ))

        # Assign visibilities to MODEL_DATA array on the dataset
        xds = xds.assign(MODEL_DATA=(("row", "chan", "corr"), vis))
        # Create a write to the table
        write = xds_to_table(xds, args.ms, ['MODEL_DATA'])
        # Add to the list of writes
        writes.append(write)

    # Submit all graph computations in parallel
    with ProgressBar():
        dask.compute(writes)
def _run_dask_numpy_quantile(data, k):
    w = 100.0 / k
    p = da.arange(w, 100 + w, w)
    if p[-1] > 100.0:
        p[-1] = 100.0

    q = da.percentile(data.flatten(), p)
    q = da.unique(q)
    return q
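The same idea written out independently as a sketch (the array, k, and percentile grid below are made up); da.unique is what removes duplicate break values when neighbouring quantiles coincide:

import numpy as np
import dask.array as da

data = da.random.random((1000, 1000), chunks=250)
k = 5
p = np.arange(100.0 / k, 100.0 + 100.0 / k, 100.0 / k)   # [20. 40. 60. 80. 100.]

# da.percentile works on 1-D arrays and merges approximate per-chunk results
q = da.percentile(data.flatten(), p)
q = da.unique(q)     # deduplicate the quantile break values, as in the helper above
print(q.compute())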
def unique(y):
    if isinstance(y, da.Array):
        uniques = da.unique(y).compute()
        uniques = set(uniques)
    elif isinstance(y, dd.Series):
        uniques = y.unique().compute()
        uniques = set(uniques)
    else:
        uniques = ToolBox.unique(y)
    return uniques
def test_unique():
    a = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(a, chunks=(3, ))

    r_a = np.unique(a)
    r_d = da.unique(d)

    assert isinstance(r_d, da.Array)
    assert_eq(r_d, r_a)
def fit(self, y):
    y = self._check_array(y)

    if isinstance(y, da.Array):
        classes_ = da.unique(y)
        classes_ = classes_.compute()
    else:
        classes_ = np.unique(y)

    self.classes_ = classes_
    return self
def fetch(self, **params):
    return self._data.filter(lambda p: p.name in params) \
        .map(lambda p: p[
            reduce(
                lambda x, y: np.bitwise_and(x, (p[y[0]] == y[1])),
                params[p.name].items(),
                np.ones(p.shape, dtype=bool)
            )
        ]) \
        .fold(lambda p, q: da.concatenate([p, q])) \
        .apply(lambda p: da.unique(p.compute())) \
        .compute()
def value_counts(ar):
    if isinstance(ar, da.Array):
        v_n = da.unique(ar, return_counts=True)
        v_n = dask.compute(*v_n)
        return {v: n for v, n in zip(*v_n)}
    elif isinstance(ar, dd.Series):
        s = ar
    elif isinstance(ar, dd.DataFrame):
        assert ar.shape[1] == 1
        s = ar.iloc[:, 0]
    else:
        return ToolBox.value_counts(ar)

    return s.value_counts().compute().to_dict()
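A minimal sketch of the dask-array branch above (the input array here is made up):

import numpy as np
import dask
import dask.array as da

ar = da.from_array(np.array([1, 2, 2, 3, 3, 3]), chunks=2)

# da.unique returns (values, counts) as lazy arrays; compute both together
values, counts = dask.compute(*da.unique(ar, return_counts=True))
print({int(v): int(n) for v, n in zip(values, counts)})   # {1: 1, 2: 2, 3: 3}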
def agreement(self, estimators):
    """
    Implementation of Query By Committee strategy, variant: Vote entropy.

    The vote entropy approach is used for measuring the level of disagreement.

    I. Dagan and S. Engelson. Committee-based sampling for training
    probabilistic classifiers. In Proceedings of the International Conference
    on Machine Learning (ICML), pages 150–157. Morgan Kaufmann, 1995.

    :param estimators:
    :return:
    """
    score = []
    input_shape, committee_size = QueryByCommitteeStategy.check_committee_results(
        estimators)
    if len(input_shape) == 2:
        ele_uni = da.unique(estimators).compute()
        if not (len(ele_uni) == 2 and 0 in ele_uni and 1 in ele_uni):
            raise ValueError(
                "The predicted label matrix must only contain 0 and 1")
        # calc each instance
        for i in range(input_shape[0]):
            instance_mat = da.from_array(
                np.array([X[i, :] for X in estimators if X is not None])).compute()
            voting = da.sum(instance_mat, axis=0)
            tmp = []
            for vote in voting:
                if vote != 0:
                    tmp.append(
                        delayed(vote / len(estimators) *
                                np.log(vote / len(estimators))))
            score.append(-delayed(sum)(tmp))
    else:
        input_mat = da.from_array(
            np.array([X for X in estimators if X is not None])).compute()
        # for each instance
        for i in range(input_shape[0]):
            count_dict = collections.Counter(input_mat[:, i])
            tmp = []
            for key in count_dict:
                tmp.append(
                    delayed(count_dict[key] / committee_size *
                            np.log(count_dict[key] / committee_size)))
            score.append(-delayed(sum)(tmp))

    return compute(score)[0]
def save_label_idx_map(trainFileName):
    # get labels
    labels = h5py.File(trainFileName, "r")["labels"]
    labels_da = da.from_array(labels, chunks=(4, 512, 512))
    label_idx_map = {}

    # count number of occurrences for each label
    for idx in range(1, NUMCLASSES):
        start = time.time()
        X, Y, Z = da.where(labels_da == idx)
        label_idx_map[idx] = da.unique(X).compute()
        print("Finished label {0} in {1:.3f} s".format(idx, time.time() - start))

    with h5py.File(trainFileName.replace(".h5", "_IDX_MAP.h5"), "w") as newFile:
        for idx in range(1, NUMCLASSES):
            newFile.create_dataset(str(idx), data=label_idx_map[idx],
                                   dtype=np.int16)
def test_unique_rand(seed, low, high, shape, chunks):
    np.random.seed(seed)

    a = np.random.randint(low, high, size=shape)
    d = da.from_array(a, chunks=chunks)

    kwargs = dict(return_index=True,
                  return_inverse=True,
                  return_counts=True)

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    assert len(r_a) == len(r_d)
    assert (d.size, ) == r_d[2].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
def unique(ar):
    r"""Find the unique elements of an array.

    It uses ``dask.array.unique`` if necessary.

    Args:
        ar (array_like): Input array.

    Returns:
        array_like: the sorted unique elements.
    """
    import dask.array as da

    if isinstance(ar, da.core.Array):
        return da.unique(ar)
    return _unique(ar)
def compute_sample_weight(y):
    assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)

    if is_dask_dataframe_or_series(y):
        y = y.values

    unique = compute(da.unique(y))[0] if is_dask_object(y) else np.unique(y)
    cw = list(compute_class_weight('balanced', unique, y))
    if is_dask_object(y):
        sample_weight = y.map_blocks(_compute_chunk_sample_weight,
                                     unique, cw,
                                     dtype=np.float64)
    else:
        sample_weight = _compute_chunk_sample_weight(y, unique, cw)

    return sample_weight
def cue_times(data: Dict[str, da.Array], message: int) -> da.Array:
    """
    Find the timestamps of all instances of a cue message in a Tristan data set.

    The found timestamps are de-duplicated.

    Args:
        data:     A LATRD data dictionary (a dictionary with data set names as
                  keys and Dask arrays as values).  Must contain one entry for
                  cue id messages and one for cue timestamps.  The two arrays
                  are assumed to have the same length.
        message:  The message code, as defined in the Tristan standard.

    Returns:
        The timestamps, measured in clock cycles from the global
        synchronisation signal, de-duplicated.
    """
    index = da.flatnonzero(data[cue_id_key] == message)
    return da.unique(data[cue_time_key][index])
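A toy illustration of the function above; the key names and data below are invented stand-ins for the module-level `cue_id_key`/`cue_time_key` constants and real LATRD data:

import numpy as np
import dask.array as da

cue_id_key = "cue_id"            # hypothetical stand-in for the module constant
cue_time_key = "cue_timestamp"   # hypothetical stand-in for the module constant

data = {
    cue_id_key: da.from_array(np.array([0x800, 0x840, 0x800, 0x800]), chunks=2),
    cue_time_key: da.from_array(np.array([10, 12, 10, 30]), chunks=2),
}

# Timestamps of message 0x800, with the duplicate 10 removed by da.unique
print(cue_times(data, 0x800).compute())   # [10 30]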
def _test_unique_kwargs():
    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    if not any([return_index, return_inverse, return_counts]):
        assert isinstance(r_a, cupy.ndarray)
        assert isinstance(r_d, da.Array)
        r_a = (r_a, )
        r_d = (r_d, )

    assert len(r_a) == len(r_d)

    if return_inverse:
        i = 1 + int(return_index)
        assert (d.size, ) == r_d[i].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
def compute_sample_weight(y):
    assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)

    if hasattr(y, 'values'):
        y = y.values

    unique = dask.compute(
        da.unique(y))[0] if DaskToolBox.is_dask_object(y) else np.unique(y)
    cw = list(
        DaskToolBox.compute_class_weight('balanced', classes=unique, y=y))
    if DaskToolBox.is_dask_object(y):
        sample_weight = y.map_blocks(_compute_chunk_sample_weight,
                                     unique, cw,
                                     dtype=np.float64)
    else:
        sample_weight = _compute_chunk_sample_weight(y, unique, cw)

    return sample_weight
def plot_subfigure(X, Y, subplot, transform):
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError

    min_x = da.min(X[:, 0])
    max_x = da.max(X[:, 0])

    min_y = da.min(X[:, 1])
    max_y = da.max(X[:, 1])

    classif = OneVsRestClassifier(LogisticRegression())
    classif.fit(X, Y)
    y_pred = classif.predict(X)
    print('{} + OneVsRestClassifier + LogisticRegression accuracy_score {}'.format(
        transform, accuracy_score(Y, y_pred)))

    plt.subplot(1, 2, subplot)
    plt.scatter(X[:, 0], X[:, 1], s=15, c='gray', edgecolors=(0, 0, 0))

    for i in da.unique(Y.argmax(axis=1)):
        class_ = da.where(Y[:, i])
        plt.scatter(X[class_, 0], X[class_, 1], s=25, linewidths=2,
                    label='Class {}'.format(str(i)))

    for i in range(len(classif.estimators_)):
        plot_hyperplane(classif.estimators_[i], min_x, max_x, 'k--',
                        'Boundary\nfor class {}'.format(str(i)))

    plt.xticks(())
    plt.yticks(())

    plt.xlim(min_x - .1 * max_x, max_x + .1 * max_x)
    plt.ylim(min_y - .1 * max_y, max_y + .1 * max_y)
def unique_baselines(ant1, ant2):
    """ Returns unique baseline pairs across all dask chunks as 64 bit ints

    The resulting computed numpy array should be recast and shaped as follows:

    .. code-block:: python

        ubl_dask = unique_baselines(ant1, ant2)
        ubl = dask.compute(ubl_dask)[0].view(np.int32).reshape(-1, 2)
    """
    if not (ant1.dtype == np.int32 and ant2.dtype == np.int32):
        raise TypeError("antenna1 '%s' and antenna2 '%s' dtypes "
                        "must both be np.int32" % (ant1.dtype, ant2.dtype))

    # Stack the two int32 antenna columns and view each row as a
    # single 64 bit baseline value
    bl = da.stack([ant1, ant2], axis=1)
    bl = bl.rechunk(-1, 2).view(np.int64)
    return da.unique(bl)
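Following the recipe in the docstring, a small self-contained check (the antenna arrays here are invented) that recovers the unique (antenna1, antenna2) pairs:

import numpy as np
import dask
import dask.array as da

ant1 = da.from_array(np.array([0, 0, 1, 0], dtype=np.int32), chunks=2)
ant2 = da.from_array(np.array([1, 2, 2, 1], dtype=np.int32), chunks=2)

ubl_dask = unique_baselines(ant1, ant2)
ubl = dask.compute(ubl_dask)[0].view(np.int32).reshape(-1, 2)
print(ubl)   # [[0 1]
             #  [0 2]
             #  [1 2]]  (pair order follows the packed 64 bit sort order)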
def test_unique_rand(seed, low, high, shape, chunks):
    np.random.seed(seed)

    a = np.random.randint(low, high, size=shape)
    d = da.from_array(a, chunks=chunks)

    kwargs = dict(
        return_index=True,
        return_inverse=True,
        return_counts=True
    )

    r_a = np.unique(a, **kwargs)
    r_d = da.unique(d, **kwargs)

    assert len(r_a) == len(r_d)
    assert (d.size,) == r_d[2].shape

    for e_r_a, e_r_d in zip(r_a, r_d):
        assert_eq(e_r_d, e_r_a)
async def _fit_async(self, X, y, sample_weight, base_margin, eval_set,
                     sample_weight_eval_set, early_stopping_rounds, verbose):
    dtrain = await DaskDMatrix(client=self.client,
                               data=X, label=y, weight=sample_weight,
                               base_margin=base_margin,
                               missing=self.missing)
    params = self.get_xgb_params()

    # pylint: disable=attribute-defined-outside-init
    if isinstance(y, (da.Array)):
        self.classes_ = await self.client.compute(da.unique(y))
    else:
        self.classes_ = await self.client.compute(y.drop_duplicates())
    self.n_classes_ = len(self.classes_)

    if self.n_classes_ > 2:
        params["objective"] = "multi:softprob"
        params['num_class'] = self.n_classes_
    else:
        params["objective"] = "binary:logistic"

    evals = await _evaluation_matrices(self.client, eval_set,
                                       sample_weight_eval_set,
                                       self.missing)
    results = await train(client=self.client, params=params, dtrain=dtrain,
                          num_boost_round=self.get_num_boosting_rounds(),
                          evals=evals,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=verbose)
    self._Booster = results['booster']
    # pylint: disable=attribute-defined-outside-init
    self.evals_result_ = results['history']
    return self
def test_unique():
    x = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.unique(d), np.unique(x))
def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock,
               **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------
    x : bcolz.ctable
        Input data
    chunksize : int, optional
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
def test_unique():
    x = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(x, chunks=(3, ))
    assert_eq(da.unique(d), np.unique(x))
def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock,
               **kwargs):
    """ Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size (rows) of blocks to pull out from ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names), ))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1, )
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize), ),
                 columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names), ))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
def _test_basic(c, s, a, b):
    rng = da.random.RandomState(42)

    n, d = (50, 2)
    # create observations we know linear models can fit
    X = rng.normal(size=(n, d), chunks=n // 2)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))

    if array_type == "numpy":
        X, y = yield c.compute((X, y))

    params = {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge",
                 "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    model = SGDClassifier(tol=-np.inf, penalty="elasticnet",
                          random_state=42, eta0=0.1)
    if library == "dask-ml":
        model = Incremental(model)
        params = {"estimator__" + k: v for k, v in params.items()}
    elif library == "ConstantFunction":
        model = ConstantFunction()
        params = {"value": np.linspace(0, 1, num=1000)}

    search = HyperbandSearchCV(model, params, max_iter=max_iter,
                               random_state=42)
    classes = c.compute(da.unique(y))
    yield search.fit(X, y, classes=classes)

    if library == "dask-ml":
        X, y = yield c.compute((X, y))
    score = search.best_estimator_.score(X, y)
    assert score == search.score(X, y)
    assert 0 <= score <= 1

    if library == "ConstantFunction":
        assert score == search.best_score_
    else:
        # These are not equal because IncrementalSearchCV uses a train/test
        # split and we're testing on the entire train dataset, not only the
        # validation/test set.
        assert abs(score - search.best_score_) < 0.1

    assert type(search.best_estimator_) == type(model)
    assert isinstance(search.best_params_, dict)

    num_fit_models = len(set(search.cv_results_["model_id"]))
    num_pf_calls = sum([
        v[-1]["partial_fit_calls"] for v in search.model_history_.values()
    ])
    models = {9: 17, 15: 17, 20: 17, 27: 49, 30: 49, 81: 143}
    pf_calls = {9: 69, 15: 101, 20: 144, 27: 357, 30: 379, 81: 1581}
    assert num_fit_models == models[max_iter]
    assert num_pf_calls == pf_calls[max_iter]

    best_idx = search.best_index_
    if isinstance(model, ConstantFunction):
        assert search.cv_results_["test_score"][best_idx] == max(
            search.cv_results_["test_score"])

    model_ids = {h["model_id"] for h in search.history_}

    if math.log(max_iter, 3) % 1.0 == 0:
        # log(max_iter, 3) % 1.0 == 0 is the good case when max_iter is a
        # power of search.aggressiveness
        # In this case, assert that more models are tried than max_iter
        assert len(model_ids) > max_iter
    else:
        # Otherwise, give some padding: "almost as many estimators are tried
        # as max_iter". 3 is a fudge number chosen to be the minimum; when
        # max_iter=20, len(model_ids) == 17.
        assert len(model_ids) + 3 >= max_iter

    assert all("bracket" in id_ for id_ in model_ids)
def test_unique():
    x = np.array([1, 2, 4, 4, 5, 2])
    d = da.from_array(x, blockshape=(3,))
    assert eq(da.unique(d), np.unique(x))