Example #1
def cg_warmstart_reduces_iterations(A):
    options = dict(graph_iters=1, maxiter=1000)
    b = da.ones(A.shape[0], chunks=A.chunks[0])
    x, res, iters = cg.cg_graph(A, b, **options)

    # no iters if solved
    _, _, iters_ws = cg.cg_graph(A, b, x_init=x, **options)
    assert iters_ws == 0

    # iters deterministic
    x_partial, res_partial, iters_partial = cg.cg_graph(A, b, **options)
    _, _, iters_ws = cg.cg_graph(A, b, x_init=x_partial, **options)
    assert iters_partial + iters_ws == iters

    perturb_x = 0.1 * da.mean(x).compute() / (x.size**0.5)
    perturb_b = 0.1 * da.mean(b).compute() / (b.size**0.5)
    xp = x * (1 + da.random.normal(0, perturb_x, x.size, chunks=x.chunks))
    bp = b * (1 + da.random.normal(0, perturb_b, b.size, chunks=b.chunks))

    # nearby b
    _, _, iterp = cg.cg_graph(A, bp, x_init=x, **options)
    assert iters > iterp, '{} > {}'.format(iters, iterp)

    # nearby x0
    _, _, iters_perturb = cg.cg_graph(A, b, x_init=xp, **options)
    assert iters > iters_perturb, '{} > {}'.format(iters, iters_perturb)

    # nearby (b, x0)
    _, _, iters_perturb = cg.cg_graph(A, bp, x_init=xp, **options)
    assert iters > iters_perturb, '{} > {}'.format(iters, iters_perturb)
    return True
Example #2
def test_0d_array():
    x = da.mean(da.ones(4, chunks=4), axis=()).compute()
    x = da.mean(da.ones(4, chunks=4), axis=0).compute()
    y = np.mean(np.ones(4))
    assert type(x) == type(y)

    x = da.sum(da.zeros(4, chunks=1)).compute()
    y = np.sum(np.zeros(4))
    assert type(x) == type(y)
Example #3
def ttest_ind(a, b, axis=0, equal_var=True):
    v1 = da.var(a, axis, ddof=1)  # XXX: np -> da
    v2 = da.var(b, axis, ddof=1)  # XXX: np -> da
    n1 = a.shape[axis]
    n2 = b.shape[axis]

    if equal_var:
        df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
    else:
        df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)

    res = _ttest_ind_from_stats(da.mean(a, axis), da.mean(b, axis), denom, df)

    return delayed(Ttest_indResult, nout=2)(*res)
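A minimal usage sketch for this delayed t-test (assuming it is exposed as dask.array.stats.ttest_ind, as in the dask source, and that it returns a pair of delayed values, as the return statement above suggests):

import dask
import dask.array as da
from dask.array import stats as dask_stats

rng = da.random.RandomState(0)
a = rng.normal(0.0, 1.0, size=1000, chunks=250)
b = rng.normal(0.1, 1.0, size=1000, chunks=250)

# ttest_ind only builds a lazy graph; values appear on compute
statistic, pvalue = dask.compute(*dask_stats.ttest_ind(a, b))
print(statistic, pvalue)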
Example #4
File: stats.py Project: caseyclements/dask
def ttest_ind(a, b, axis=0, equal_var=True):
    v1 = da.var(a, axis, ddof=1)  # XXX: np -> da
    v2 = da.var(b, axis, ddof=1)  # XXX: np -> da
    n1 = a.shape[axis]
    n2 = b.shape[axis]

    if equal_var:
        df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
    else:
        df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)

    res = _ttest_ind_from_stats(da.mean(a, axis), da.mean(b, axis), denom, df)

    return delayed(Ttest_indResult, nout=2)(*res)
Example #5
    def fit(self, X, y=None):
        # CHECKING THE TYPES
        if isinstance(X, dask.array.Array):
            import dask.array as numerical_module

            from dask.array.linalg import cholesky, inv

        else:
            import numpy as numerical_module

            from scipy.linalg import cholesky, inv

        # 1. Computes the mean vector and the covariance matrix of the training set
        mu = numerical_module.mean(X, axis=0)
        cov = numerical_module.cov(X.T)

        # 2. Computes the inverse of the covariance matrix
        inv_cov = pinv(cov) if self.pinv else inv(cov)

        # 3. Computes the Cholesky decomposition of the inverse covariance matrix
        self.weights = cholesky(
            inv_cov, lower=True
        )  # Setting lower true to have the same implementation as in the previous code
        self.input_subtract = mu
        self.input_divide = 1.0

        return self
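The Cholesky step can be sanity-checked in isolation (a numpy/scipy-only sketch, independent of the class above): whitening with the lower Cholesky factor of the inverse covariance gives approximately unit covariance.

import numpy as np
from scipy.linalg import cholesky, inv

rng = np.random.default_rng(0)
# correlated synthetic data
X = rng.normal(size=(5000, 3)) @ np.array([[2.0, 0.0, 0.0],
                                           [0.5, 1.0, 0.0],
                                           [0.3, 0.2, 0.5]])
mu = X.mean(axis=0)
weights = cholesky(inv(np.cov(X.T)), lower=True)

X_white = (X - mu) @ weights
print(np.cov(X_white.T).round(2))  # approximately the identity matrix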
Example #6
def score_gene_sets(ds, gs, z_score_ds=True, use_dask=False):
    if use_dask:
        import dask.array as np
    else:
        import numpy as np
    # gene sets has genes on rows, sets on columns
    # ds has cells on rows, genes on columns
    gs_x = gs.x
    ds_x = ds.x
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
    gene_indices = (gs_x.sum(axis=1) > 0) & (
        ds_x.std(axis=0) > 0
    )  # keep genes that are in gene sets and have standard deviation > 0

    gs_x = gs_x[gene_indices]
    ds_x = ds_x[:, gene_indices]
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
        std = np.std(ds_x, axis=0)
        mean = np.mean(ds_x, axis=0)
        ds_x = (ds_x - mean) / std
        ds_x[ds_x < -5] = -5
        ds_x[ds_x > 5] = 5
        ds_x[np.isnan(ds_x)] = 0  # comparison with np.nan is always False; use isnan

    scores = ds_x.dot(gs_x)
    ngenes_in_set = gs_x.sum(axis=0)
    ngenes_in_set[ngenes_in_set == 0] = 1  # avoid divide by zero
    scores = scores / ngenes_in_set  # scores contains cells on rows, gene sets on columns
    return wot.Dataset(x=scores, row_meta=ds.row_meta, col_meta=gs.col_meta)
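The z-score-and-clip step on its own (a dask.array sketch assuming a dense matrix; da.clip has the same effect as the pair of masked assignments above):

import dask.array as da

ds_x = da.random.random((100, 20), chunks=(50, 20))
z = (ds_x - da.mean(ds_x, axis=0)) / da.std(ds_x, axis=0)
z = da.clip(z, -5, 5)  # same effect as z[z < -5] = -5; z[z > 5] = 5
print(z.mean().compute(), z.std().compute())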
Example #7
    def score(self, X, y):
        """
        Provide score by comparing predictions and ground truth.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        y : array-like (device or host) shape = (n_samples, n_features)
            Outputs test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        Returns
        -------
        score
        """
        y_pred = self.predict(X, convert_dtype=True)
        if not isinstance(y_pred, da.Array):
            y_pred = y_pred.to_dask_array(lengths=True)
        if not isinstance(y, da.Array):
            y = y.to_dask_array(lengths=True)
        y_true = y.squeeze()
        y_mean = y_true.mean(axis=0)
        residual_sss = ((y_true - y_pred)**2).sum(axis=0, dtype='float64')
        total_sss = ((y_true - y_mean)**2).sum(axis=0, dtype='float64')
        r2_score = da.mean(1 - (residual_sss / total_sss))
        return r2_score.compute()
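The R² formula used here can be reproduced with plain dask arrays (a sketch on synthetic data, independent of the estimator class):

import dask.array as da

y_true = da.random.random(10000, chunks=2500)
y_pred = y_true + 0.01 * da.random.random(10000, chunks=2500)

residual_sss = ((y_true - y_pred) ** 2).sum(axis=0, dtype='float64')
total_sss = ((y_true - y_true.mean(axis=0)) ** 2).sum(axis=0, dtype='float64')
r2_score = da.mean(1 - residual_sss / total_sss)
print(r2_score.compute())  # close to 1.0 for a near-perfect prediction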
Example #8
    def score(self, X, y):
        """
        Provide score by comparing predictions and ground truth.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        y : array-like (device or host) shape = (n_samples, n_features)
            Outputs test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        Returns
        -------
        score
        """
        labels, _, _ = self.predict(X, convert_dtype=True)
        diff = (labels == y)
        if self.data_handler.datatype == 'cupy':
            mean = da.mean(diff)
            return mean.compute()
        else:
            raise ValueError("Only Dask arrays are supported")
Example #9
    def score(self, X, y, convert_dtype=True):
        """
        Predict labels for a query from previously stored index
        and index labels.
        The process is done in a multi-node multi-GPU fashion.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        y : array-like (device or host) shape = (n_samples, n_features)
            Labels test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        Returns
        -------
        score
        """
        if self.data_handler.datatype == 'cupy':
            preds, _, _ = self.predict(X, convert_dtype=convert_dtype)
            diff = (preds == y)
            mean = da.mean(diff)
            return mean.compute()
        else:
            raise ValueError("Only Dask arrays are supported")
Example #10
    def normalize(self, groupname1, groupname2):
        # ## normalize y ## #
        with h5py.File(self.OUTPATH, mode='r+') as f:
            for atom in self.MAINCHAIN:
                # load
                train_y = da.from_array(
                    f[f'/{atom}/{groupname1}/{self.RESPONSE_NAME}'],
                    chunks=("auto", 3))
                val_y = da.from_array(
                    f[f'/{atom}/{groupname2}/{self.RESPONSE_NAME}'],
                    chunks=("auto", 3))

                total_y = da.concatenate([train_y, val_y], axis=0)
                y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
                y_std = da.std(total_y.reshape(-1), axis=0).compute()

                # normalize
                train_y = da.divide(da.subtract(train_y, y_mean), y_std)
                val_y = da.divide(da.subtract(val_y, y_mean), y_std)

                # save
                da.to_hdf5(self.OUTPATH,
                           f'/{atom}/{groupname1}/{self.RESPONSE_NAME}',
                           train_y)
                da.to_hdf5(self.OUTPATH,
                           f'/{atom}/{groupname2}/{self.RESPONSE_NAME}', val_y)

                f.create_dataset(name=f'/{atom}/normalization',
                                 data=np.array([y_mean, y_std]))

                print(f'[{atom}]\tmean: {y_mean:.3f}\tstd: {y_std:.3f}')
Example #11
        def load_data(statistic, axis):
            import dask.array as da
            import numpy as np
            from glue.utils import view_shape
            x = da.from_zarr('/mnt/cephfs/zarr_data_full')
            f = 1500
            scale = 2

            lh = []
            for k in range(scale):
                lc = []
                for i in range(scale):
                    lr = []
                    for j in range(scale):
                        lr.append(x[f % 3500])
                        f = f + 1
                    lc.append(da.concatenate(lr))
                lh.append(da.concatenate(lc, 1))
            z = da.concatenate(lh, 2)

            if statistic == 'minimum':
                return da.min(z, axis).compute()
            elif statistic == 'maximum':
                return da.max(z, axis).compute()
            elif statistic == 'mean' or statistic == 'median':
                return da.mean(z, axis).compute()
            elif statistic == 'percentile':
                return percentile / 100
            elif statistic == 'sum':
                return da.sum(z, axis).compute()
            return 0
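The nested-concatenate tiling can be illustrated on small in-memory blocks (a sketch that replaces the zarr store with synthetic arrays):

import dask.array as da

blocks = [da.ones((4, 5, 6), chunks=(2, 5, 6)) * i for i in range(8)]
scale, f = 2, 0

lh = []
for k in range(scale):
    lc = []
    for i in range(scale):
        lr = []
        for j in range(scale):
            lr.append(blocks[f % len(blocks)])
            f += 1
        lc.append(da.concatenate(lr))    # grow along axis 0
    lh.append(da.concatenate(lc, 1))     # then along axis 1
z = da.concatenate(lh, 2)                # then along axis 2

print(z.shape)                           # (8, 10, 12)
print(da.mean(z, 0).compute().shape)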
Example #12
def test_PowerMethod_project():
    N, P = 1000, 1000
    k = 10
    svd_array = da.random.random(size=(N, P)).persist()
    proj_array = da.random.random(size=(10, P)).persist()
    mu = da.mean(svd_array, axis=0).persist()
    std = da.diag(1 / da.std(svd_array, axis=0)).persist()

    for scale in [True, False]:
        for center in [True, False]:
            svd_array1 = svd_array
            proj_array1 = proj_array
            if center:
                svd_array1 = svd_array1 - mu
                proj_array1 = proj_array1 - mu
            if scale:
                svd_array1 = svd_array1.dot(std)
                proj_array1 = proj_array1.dot(std)

            U, S, V = da.linalg.svd(svd_array1)
            U_k, S_k, V_k = svd_to_trunc_svd(U, S, V, k=k)

            PM = PowerMethod(k=k,
                             scale=scale,
                             center=center,
                             factor=None,
                             tol=1e-12)
            U_PM, S_PM, V_PM = PM.svd(array=svd_array)

            np.testing.assert_array_almost_equal(
                PM.project(proj_array, onto=V_k.T), proj_array1.dot(V_k.T))
Example #13
    def score(self, X, y):
        """
        Provide score by comparing predictions and ground truth.

        Parameters
        ----------
        X : array-like (device or host) shape = (n_samples, n_features)
            Query test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        y : array-like (device or host) shape = (n_samples, n_features)
            Outputs test data.
            Acceptable formats: dask CuPy/NumPy/Numba Array

        Returns
        -------
        score
        """
        if self.data_handler.datatype == 'cupy':
            preds, _, _ = self.predict(X, convert_dtype=True)
            y_mean = y.mean(axis=0)
            residual_sss = ((y - preds)**2).sum(axis=0)
            total_sss = ((y - y_mean)**2).sum(axis=0)
            r2_score = da.mean(1 - (residual_sss / total_sss))
            return r2_score.compute()
        else:
            raise ValueError("Only Dask arrays are supported")
    def extract(self):

        df_path = pd.read_csv('path_to_file.csv', sep=';')

        df_path = df_path.rename(columns={'Unnamed: 0': 'id'})
        df_path = df_path.set_index('id')

        print(df_path)

        ds_batch = xr.open_mfdataset(df_path['path'],
                                     parallel=True)  #loading ncdf files

        print(ds_batch)

        print("--- Total size (GB):")
        print(ds_batch.nbytes * (2**-30))  # get size of the dataset in GB

        #getting average albedos over whole time period (used for maps and scatter plots)
        darr = ds_batch['QFLAG']  #getting data for specific band
        print(darr)

        #res = darr.mean(['lon','lat'])
        #res = da.count_nonzero( da.bitwise_and(darr//2**5, 1), ['lon','lat'])
        #res = (darr==32).sum(['lon','lat'])
        #res = xr.ufunc.bitwise_and(darr, 0b100000).sum(['lon','lat'])
        func = lambda x: np.bitwise_and(np.right_shift(x, 5), np.uint64(1))
        func = lambda x: np.bitwise_and(x, np.uint64(1))
        res = xr.apply_ufunc(func,
                             darr,
                             input_core_dims=[['lon', 'lat']],
                             dask='parallelized',
                             vectorize=True)
        #res = itwise_and(np.right_shift(darr, 5), 1).sum(['lon','lat])
        #res = (darr==32).max(['lon','lat'])
        print(np.array(res))

        sys.exit()

        # bit 5 of the quality flag, then its mean over time (only reached if the sys.exit() above is removed)
        da_count = ((darr >> 5) & 1)
        da_mean = da_count.mean('time')
        #da_mean_lowres = da_mean.sel(lat=slice(70, 30)).sel(lon=slice(-25, 70)) # this can be used to zoom in over Europe
        da_mean_lowres = da_mean.isel(lat=slice(None, None, 10)).isel(
            lon=slice(None, None, 10))  #downsampling for faster plotting

        #getting average, min and max albedos for each time step (used to plot timeline)
        da_timeline_mean = darr.mean(['lon', 'lat'])
        da_timeline_max = darr.max(['lon', 'lat'])
        da_timeline_min = darr.min(['lon', 'lat'])

        #closing the dataset to free memory
        ds_batch.close()

        return da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min

        da_mean_lowres.close()
        da_timeline_mean.close()
        da_timeline_max.close()
        da_timeline_min.close()
Example #15
def test_func(default_val, dataset_flat, shape, dataset):
    shift_up = array.hstack([
        array.zeros((shape[0], 1, shape[2])), dataset[:, :-1, :]
    ]).transpose([1, 2, 0]).reshape([shape[1] * shape[2], -1])

    shift_up_mult = dataset_flat * shift_up
    del shift_up
    return array.mean(shift_up_mult, axis=1)
Example #16
def test_make_snp_array_case_binom(shape, threshold):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    assume(da.mean(da.mean(da.isnan(arr), axis=0) < 1) == 1)
    # Asserts that every tested arr has at least 1 non-nan value in each column

    snp_array = utils.make_snp_array(arr,
                                     mean=True,
                                     std=True,
                                     std_method='binom',
                                     dtype='float')

    mean = snp_array.mean(axis=0)
    np.testing.assert_array_almost_equal(1 + mean, np.ones(shape[1]))
Example #17
def test_0d_array():
    x = da.mean(da.ones(4, chunks=4), axis=0).compute()
    y = np.mean(np.ones(4))
    assert type(x) == type(y)

    x = da.sum(da.zeros(4, chunks=1)).compute()
    y = np.sum(np.zeros(4))
    assert type(x) == type(y)
Example #18
def compute_importance_gbt(x, y, x_test, y_test):
    """Compute importance based on gradient boosted trees."""
    print("Computing importance based on gradient boosted trees ... ")
    num_factors = y.shape[1]
    #num_codes = x.shape[0]
    importance_matrix = list()
    train_loss = []
    test_loss = []
    for i in range(num_factors):
        model = GradientBoostingClassifier(verbose=1)
        model.fit(x, y[:, i])

        importance_matrix.append(np.abs(model.feature_importances_))
        train_loss.append(da.mean(model.predict(x) == y[:, i]))
        test_loss.append(da.mean(model.predict(x_test) == y_test[:, i]))

    return da.vstack(importance_matrix), np.mean(train_loss), np.mean(
        test_loss)
Example #19
def ttest_1samp(a, popmean, axis=0, nan_policy="propagate"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' have not been implemented.")
    n = a.shape[axis]
    df = n - 1

    d = da.mean(a, axis) - popmean
    v = da.var(a, axis, ddof=1)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_1sampResult, nout=2)(t, prob)
Example #20
File: stats.py Project: caseyclements/dask
def ttest_1samp(a, popmean, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")
    n = a.shape[axis]
    df = n - 1

    d = da.mean(a, axis) - popmean
    v = da.var(a, axis, ddof=1)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide='ignore', invalid='ignore'):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_1sampResult, nout=2)(t, prob)
Example #21
def scale_drop(Xmat, h5write):
    xmeans = da.mean(Xmat, axis=0)
    print("Centering X columns")
    Xmat = Xmat - xmeans
    xnorms = da.linalg.norm(Xmat, axis=0)
    xnorms, xmeans = da.compute(xnorms, xmeans)
    keepcols = np.arange(Xmat.shape[1])[np.nonzero(xnorms)]
    dropcols = np.arange(Xmat.shape[1])[xnorms == 0]
    print("Dropping column with norm zero:")
    xcolnames = []
    for colname in xinfo['xcolnames']:
        xcolnames.append(colname)
        for dropix in dropcols:
            if xinfo['xcolnames'][colname] == dropix:
                print(colname)
    Xmat = Xmat[:, keepcols]
    xnorms = xnorms[keepcols]
    xmeans = xmeans[keepcols]
    xcolnames_keep = np.array(xcolnames)[keepcols]
    print("Standardizing X columns")
    Xmat = Xmat / xnorms
    tol = 1e-8
    ## IF using scipy QR
    #Qx, Rx, Px = scipy.linalg.qr(Xmat, mode='economic', pivoting=True)
    #dropcols_qr = Px[np.nonzero(abs(np.diag(Rx))<tol)]
    #keepcols_qr = Px[np.nonzero(abs(np.diag(Rx))>=tol)]
    #rank = np.sum(abs(np.diag(Rx)) >= tol)
    ## USING BLOCKED QR
    Qx, Rx, PImat = bk.tsqr_pivot_seq(Xmat)
    #Rx = Rx[0:rank, 0:rank]
    #Qx = Qx[:, 0:rank]
    keepcols_qr = np.argmax(PImat, axis=0)
    dropmask = np.ones(Xmat.shape[1], dtype=bool)
    dropmask[keepcols_qr] = False
    dropcols_qr = np.arange(Xmat.shape[1])[dropmask]
    rank = keepcols_qr.shape[0]
    print("Dropping columns based on pivoted QR:")
    print("\t" + "\n\t".join(xcolnames_keep[dropcols_qr]))
    xnorms = xnorms[keepcols_qr]
    xmeans = xmeans[keepcols_qr]
    Xmat = Xmat[:, keepcols_qr]
    xcolnames_keep = xcolnames_keep[keepcols_qr]
    #keepcols_store = h5write.create_array(h5write.root, 'keepcols',
    #                                        keepcols)
    #cols_orig_store = h5write.create_array(h5write.root, 'xcolnames_all', xcolnames)
    #cols_keep_store = h5write.create_array(h5write.root, 'xcolnames_keep', xcolnames_keep)
    #da.store([xcolnames, xcolnames_keep], [cols_orig_store, cols_keep_store])
    return Xmat, Qx, Rx
Example #22
    def fit(self, X, y):

        # CHECKING THE TYPES
        if isinstance(X, dask.array.Array):
            import dask.array as numerical_module

            from dask.array.linalg import cholesky, inv
        else:
            import numpy as numerical_module

            from scipy.linalg import cholesky, inv

        possible_labels = set(y)
        y_ = numerical_module.array(y)

        n_classes = len(possible_labels)

        # 1. compute the means for each label
        mu_l = numerical_module.array(
            [
                numerical_module.mean(
                    X[numerical_module.where(y_ == label)[0]], axis=0
                )
                for label in possible_labels
            ]
        )

        # 2. Compute Sw
        Sw = numerical_module.zeros((X.shape[1], X.shape[1]), dtype=float)

        for label in possible_labels:
            indexes = numerical_module.where(y_ == label)[0]
            X_l_mu_l = X[indexes] - mu_l[label]

            Sw += X_l_mu_l.T @ X_l_mu_l

        # 3. Compute inv
        scaled_Sw = (1 / n_classes) * Sw
        inv_scaled_Sw = pinv(scaled_Sw) if self.pinv else inv(scaled_Sw)

        # 3. Computes the Cholesky decomposition
        self.weights = cholesky(
            inv_scaled_Sw, lower=True
        )  # Setting lower true to have the same implementation as in the previous code
        self.input_subtract = 0
        self.input_divide = 1.0

        return self
Example #23
File: stacked.py Project: TNonet/lmdec
    def mean(self,
             axis=None,
             dtype=None,
             keepdims=False,
             split_every=None,
             out=None) -> da.core.Array:
        if out is not None:
            raise NotImplementedError(
                f'`out` argument is not supported for {StackedArray.__name__}')
        means = (da.mean(array,
                         axis=axis,
                         dtype=dtype,
                         keepdims=keepdims,
                         split_every=split_every,
                         out=None) for array in expand_arrays(self.arrays))
        return self.reduce(means, da.add)
Example #24
def test_make_snp_array_case_normal(shape, threshold):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    # Asserts that every tested arr has a non-zero std for each column

    snp_array = utils.make_snp_array(arr,
                                     mean=True,
                                     std=True,
                                     std_method='norm',
                                     dtype='float')

    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
Example #25
def const_features_for_single_grid_single_file(grid_indx, wind_grid_indx, data):
    client = Client()
    dims = data['no2'].shape
    ntime = dims[0] - 1
    nvel = dims[2]
    data_dict = dict()
    data_hours = da.array(data['hour'][1:])
    data_dict['hour'] = da.repeat(data_hours[:, :], nvel, axis=1)
    data_dict['date'] = da.zeros((ntime, nvel)) + da.mean(data['date'][:])
    data_dict['date'] = data_dict['date']
    cum_ic_flash = da.array(data['IC_FLASHCOUNT'][:, grid_indx, :])
    cum_cg_flash = da.array(data['CG_FLASHCOUNT'][:, grid_indx, :])
    data_dict['IC_FLASHCOUNT'] = da.repeat(cum_ic_flash[1:, :] - cum_ic_flash[:-1, :], nvel, axis=1)
    data_dict['CG_FLASHCOUNT'] = da.repeat(cum_cg_flash[1:, :] - cum_cg_flash[:-1, :], nvel, axis=1)
    e_no_lower = da.array(data['E_NO'])[1:, grid_indx, :]
    e_no_upper = da.zeros((ntime, nvel - e_no_lower.shape[1]))
    data_dict['E_NO'] = da.concatenate([e_no_lower, e_no_upper], axis=1)
    data_dict['U'] = (data['U'][1:, wind_grid_indx[0][0], :] + data['U'][1:, wind_grid_indx[0][1], :])/2
    data_dict['V'] = (data['V'][1:, wind_grid_indx[1][0], :] + data['V'][1:, wind_grid_indx[1][1], :])/2

    match_vars = ['no2', 'pres', 'temp', 'CLDFRA']
    print('Variables read directly from wrf: {}'.format(match_vars[:]))
    for var in match_vars:
        data_dict[var] = da.array(data[var])[1:, grid_indx, :]

    reduce_dim_vars = ['elev', 'W']
    print('Variables average vertically: {}'.format(reduce_dim_vars[:]))
    for var in reduce_dim_vars:
        this_value = da.array(data[var])[1:, grid_indx, :]
        data_dict[var] = (this_value[:, 1:] + this_value[:, :-1]) / 2

    add_dim_vars = ['COSZEN', 'PBLH', 'LAI', 'HGT', 'SWDOWN', 'GLW']
    print('Variables add vertical layers: {}'.format(add_dim_vars[:]))

    for var in add_dim_vars:
        this_value = da.array(data[var])[1:, grid_indx, :]
        data_dict[var] = da.repeat(this_value, nvel, axis=1)

    print('Key of dict:{}'.format(data_dict.keys()))
    save_arr = []
    for var in data_dict.keys():
        data_dict[var] = data_dict[var].flatten()
        save_arr.append(data_dict[var])
    save_arr = da.array(save_arr).compute()
    return save_arr
Example #26
def ttest_rel(a, b, axis=0, nan_policy="propagate"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' have not been implemented.")

    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = da.var(d, axis, ddof=1)
    dm = da.mean(d, axis)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(dm, denom)
    t, prob = _ttest_finish(df, t)

    return delayed(Ttest_relResult, nout=2)(t, prob)
Example #27
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, blockshape=(2, ))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
Example #28
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
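The same checks can be written against dask's public test helper (a sketch assuming dask.array.utils.assert_eq, the modern counterpart of the eq helper used above):

import numpy as np
import dask.array as da
from dask.array.utils import assert_eq

x = np.arange(5).astype('f4')
a = da.from_array(x, chunks=(2,))

assert_eq(da.mean(a), np.mean(x))
assert_eq(da.nanstd(a), np.nanstd(x))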
Example #29
File: stats.py Project: caseyclements/dask
def ttest_rel(a, b, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")

    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = da.var(d, axis, ddof=1)
    dm = da.mean(d, axis)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide='ignore', invalid='ignore'):
        t = da.divide(dm, denom)
    t, prob = _ttest_finish(df, t)

    return delayed(Ttest_relResult, nout=2)(t, prob)
Example #30
def test_make_snp_array_case_normal(shape, max_value, mask_nans):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.randint(0, max_value, size=shape)
    if mask_nans:
        arr[arr == max_value - 1] = float('nan')

    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    # Asserts that every tested arr has a non-zero std for each column

    snp_array = utils.make_snp_array(arr,
                                     mean=True,
                                     std=True,
                                     std_method='norm',
                                     mask_nan=mask_nans,
                                     dtype='int8')

    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
Example #31
    def fit(
        self,
        X: ArrayLike,
        y: Optional[ArrayLike] = None,
    ) -> "PattersonScaler":
        """Fit scaler parameters

        Parameters
        ----------
        X : (samples, variants) array_like
            Alternate allele counts with missing values encoded as either nan
            or negative numbers.
        """
        X = da.ma.masked_array(X, mask=da.isnan(X) | (X < 0))
        self.mean_ = da.ma.filled(da.mean(X, axis=0), fill_value=np.nan)
        p = self.mean_ / self.ploidy
        self.scale_ = da.sqrt(p * (1 - p))
        self.n_features_in_ = X.shape[1]
        return self
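The masking logic can be exercised on a tiny array (a sketch that treats nan and negative values as missing, as in the docstring above; ploidy is assumed to be 2):

import numpy as np
import dask.array as da

X = da.from_array(np.array([[0., 1., -1.],
                            [2., np.nan, 1.],
                            [1., 1., 2.]]), chunks=(3, 3))

# mask missing entries, then take column means ignoring them
Xm = da.ma.masked_array(X, mask=da.isnan(X) | (X < 0))
mean_ = da.ma.filled(da.mean(Xm, axis=0), fill_value=np.nan)
p = mean_ / 2
scale_ = da.sqrt(p * (1 - p))
print(mean_.compute(), scale_.compute())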
Example #32
File: viirs.py Project: zhuwenjian/satpy
    def __call__(self, datasets, **info):
        """Create HNCC DNB composite."""
        if len(datasets) != 4:
            raise ValueError("Expected 4 datasets, got %d" % (len(datasets), ))

        dnb_data = datasets[0]
        sza_data = datasets[1]
        lza_data = datasets[2]
        # this algorithm assumes units of "W cm-2 sr-1" so if there are other
        # units we need to adjust for that
        if dnb_data.attrs.get("units", "W m-2 sr-1") == "W m-2 sr-1":
            unit_factor = 10000.
        else:
            unit_factor = 1.

        mda = dnb_data.attrs.copy()
        dnb_data = dnb_data.copy() / unit_factor

        # convert to decimal instead of %
        moon_illum_fraction = da.mean(datasets[3].data) * 0.01

        phi = da.rad2deg(da.arccos(2. * moon_illum_fraction - 1))

        vfl = 0.026 * phi + 4.0e-9 * (phi**4.)

        m_fullmoon = -12.74
        m_sun = -26.74
        m_moon = vfl + m_fullmoon

        gs_ = self.gain_factor(sza_data.data)

        r_sun_moon = 10.**((m_sun - m_moon) / -2.5)
        gl_ = r_sun_moon * self.gain_factor(lza_data.data)
        gtot = 1. / (1. / gs_ + 1. / gl_)

        dnb_data += 2.6e-10
        dnb_data *= gtot

        mda['name'] = self.attrs['name']
        mda['standard_name'] = 'ncc_radiance'
        dnb_data.attrs = mda
        return dnb_data
Example #33
File: pca.py Project: souravsingh/dask-ml
    def score(self, X, y=None):
        """Return the average log-likelihood of all samples.

        See. "Pattern Recognition and Machine Learning"
        by C. Bishop, 12.2.1 p. 574
        or http://www.miketipping.com/papers/met-mppca.pdf

        Parameters
        ----------
        X : array, shape(n_samples, n_features)
            The data.

        y : Ignored

        Returns
        -------
        ll : float
            Average log-likelihood of the samples under the current model
        """
        return da.mean(self.score_samples(X))
Example #34
def cluster_centroids(data, clusters, k=None):
    """Return centroids of clusters & clusters in data.

    data is an array of observations with shape (A, B, ...).

    clusters is an array of integers of shape (A,) giving the index
    (from 0 to k-1) of the cluster to which each observation belongs.
    The clusters must all be non-empty.

    k is the number of clusters. If omitted, it is deduced from the
    values in the clusters array.

    The result is an array of shape (k, B, ...) containing the
    centroid of each cluster.

    >>> data = np.array([[12, 10, 87],
    ...                  [ 2, 12, 33],
    ...                  [68, 31, 32],
    ...                  [88, 13, 66],
    ...                  [79, 40, 89],
    ...                  [ 1, 77, 12]])
    >>> cluster_centroids(data, np.array([1, 1, 2, 2, 0, 1]))
    array([[ 79.,  40.,  89.],
           [  5.,  33.,  44.],
           [ 78.,  22.,  49.]])

    """
    if k is None:
        k = (da.max(clusters)).compute() + 1

    result = []

    result = [
        da.mean(data[clusters.compute() == i], axis=0) for i in range(k)
    ]

    return da.reshape(da.concatenate(result, axis=0),
                      shape=(k, ) + data.shape[1:])
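A usage sketch with the docstring's data converted to dask arrays (assuming cluster_centroids above is in scope):

import numpy as np
import dask.array as da

data = da.from_array(np.array([[12, 10, 87],
                               [ 2, 12, 33],
                               [68, 31, 32],
                               [88, 13, 66],
                               [79, 40, 89],
                               [ 1, 77, 12]]), chunks=(3, 3))
clusters = da.from_array(np.array([1, 1, 2, 2, 0, 1]), chunks=3)

print(cluster_centroids(data, clusters, k=3).compute())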
Example #35
def test_ScaledArray_fromArrayMoment_array():
    N1, P = 7, 10
    N2 = 5
    array1 = da.random.random(size=(N1, P)).persist()
    mu = da.mean(array1, axis=0)
    std = da.diag(1/da.std(array1, axis=0))
    array2 = da.random.random(size=(N2, P)).persist()
    for scale in [True, False]:
        for center in [True, False]:
            for factor1 in [None, 'n', 'p']:
                sa1 = ScaledCenterArray(scale=scale, center=center, factor=factor1)
                sa1.fit(array1)

                for factor2, factor_value in zip([None, 'n', 'p'], [1, N2, P]):
                    sa2 = ScaledCenterArray.fromScaledArray(array=array2, scaled_array=sa1, factor=factor2)
                    sa2_array = array2

                    if center:
                        sa2_array = sa2_array - mu
                    if scale:
                        sa2_array = sa2_array.dot(std)

                    np.testing.assert_array_almost_equal(sa2.array, sa2_array)
Example #36
File: test_xarray.py Project: yliapis/dask
def test_xarray():
    y = da.mean(xr.DataArray([1, 2, 3.0]))

    assert_eq(y, y)