Exemplo n.º 1
0
def test_corrcoef():
    """Check that ``da.corrcoef`` agrees with ``np.corrcoef``.

    Exercises the single-argument form (default and ``rowvar=0``) and the
    two-argument form, pairing the 2-D input with itself and with a 1-D
    input in both argument orders.
    """
    matrix = np.arange(56).reshape((7, 8))
    vector = np.arange(8)
    lazy_matrix = da.from_array(matrix, chunks=(4, 4))
    lazy_vector = da.from_array(vector, chunks=(4,))

    # One input.
    assert_eq(da.corrcoef(lazy_matrix), np.corrcoef(matrix))
    assert_eq(da.corrcoef(lazy_matrix, rowvar=0),
              np.corrcoef(matrix, rowvar=0))

    # Two inputs of the same shape.
    assert_eq(da.corrcoef(lazy_matrix, lazy_matrix),
              np.corrcoef(matrix, matrix))

    # Two inputs of mixed dimensionality, in both orders.
    assert_eq(da.corrcoef(lazy_matrix, lazy_vector),
              np.corrcoef(matrix, vector))
    assert_eq(da.corrcoef(lazy_vector, lazy_matrix),
              np.corrcoef(vector, matrix))
def test_corrcoef():
    """Compare ``da.corrcoef`` with ``np.corrcoef`` over a table of calls.

    Each case lists the dask arguments, the matching NumPy arguments and the
    keyword arguments shared by both calls.
    """
    dense_2d = np.arange(56).reshape((7, 8))
    dense_1d = np.arange(8)
    chunked_2d = da.from_array(dense_2d, chunks=(4, 4))
    chunked_1d = da.from_array(dense_1d, chunks=(4, ))

    cases = [
        ((chunked_2d, ), (dense_2d, ), {}),
        ((chunked_2d, ), (dense_2d, ), {"rowvar": 0}),
        ((chunked_2d, chunked_2d), (dense_2d, dense_2d), {}),
        ((chunked_2d, chunked_1d), (dense_2d, dense_1d), {}),
        ((chunked_1d, chunked_2d), (dense_1d, dense_2d), {}),
    ]
    for dask_args, numpy_args, kwargs in cases:
        assert_eq(da.corrcoef(*dask_args, **kwargs),
                  np.corrcoef(*numpy_args, **kwargs))
Exemplo n.º 3
0
def pearson_1xn(
    x: da.Array,
    data: da.Array,
    value_range: Optional[Tuple[float, float]] = None,
    k: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Correlate ``x`` with every column of ``data``, ignoring NaN pairs.

    For each column, positions where either ``x`` or the column is NaN are
    dropped before the correlation coefficient is taken. The coefficients
    are then handed to ``corr_filter`` together with ``value_range`` and
    ``k``.

    Parameters
    ----------
    x : da.Array
        Values correlated against each column of ``data``.
        NOTE(review): presumably 1-D with the same length as a column of
        ``data`` -- confirm against callers.
    data : da.Array
        Two-dimensional array; one coefficient is produced per column.
    value_range : Optional[Tuple[float, float]] = None
        Passed through unchanged to ``corr_filter``.
    k : Optional[int] = None
        Passed through unchanged to ``corr_filter``.

    Returns
    -------
    The result of ``corr_filter(corrs, value_range, k)``, where ``corrs`` is
    a NumPy array holding one coefficient per column of ``data``.
    """
    _, ncols = data.shape

    # Both of these depend only on ``x``; build them once instead of once
    # per column.
    x_values = np.array(x)
    x_is_nan = da.isnan(x)

    corrs = []
    for j in range(ncols):
        column = data[:, j]
        mask = ~(x_is_nan | da.isnan(column))
        # The off-diagonal entry of the 2x2 result is the coefficient
        # between the two inputs.
        corr = da.corrcoef(x_values[mask], np.array(column)[mask])[1, 0]
        corrs.append(corr)

    # Evaluate all lazily-built coefficients in a single compute call.
    (corrs, ) = da.compute(corrs)
    corrs = np.asarray(corrs)

    return corr_filter(corrs, value_range, k)
Exemplo n.º 4
0
 def estimate_ld(hd5, filename, chromosome, threads, memory):
     """Write the squared correlation matrix for one chromosome to HDF5.

     Reads the dataset stored under ``/<chromosome>`` in ``hd5``, wraps it in
     a dask array chunked by ``estimate_chunks``, masks invalid values,
     correlates the transposed data and squares the result. The output goes
     to ``<filename>_ld.hdf5`` under the same dataset key.

     Returns a ``(chromosome, output_filename)`` tuple.
     """
     print("Estimating LD for Chromosome", chromosome)
     dataset_key = '/%s' % chromosome
     loaded = hd5[dataset_key][:]
     lazy = da.from_array(
         loaded, chunks=estimate_chunks(loaded.shape, threads, memory))
     # Drop the local reference to the in-memory copy before building the
     # correlation graph.
     del loaded
     gc.collect()
     masked_transposed = da.ma.masked_invalid(lazy).T
     squared_corr = da.corrcoef(masked_transposed) ** 2
     output_filename = '%s_ld.hdf5' % filename
     da.to_hdf5(output_filename, dataset_key, squared_corr)
     return chromosome, output_filename
Exemplo n.º 5
0
def missing_heatmap(df: EDAFrame) -> Optional[pd.DataFrame]:
    """Return the nullity correlation used for the missing-value heatmap.

    ``df.nulls`` is passed to ``da.corrcoef`` with ``rowvar=False``, so each
    column is treated as one variable.
    """
    null_indicators = df.nulls
    return da.corrcoef(null_indicators, rowvar=False)
Exemplo n.º 6
0
def pca(A, B, n_pc, estimator_matrix, out_dir, n_threads, block_size):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: first input raster data (first period)
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance"; the pairwise
        statistic used to build the band-by-band matrix
    :param out_dir: directory to save the outputs
    :param n_threads: number of threads for the dask thread pool
    :param block_size: forwarded to ``read_raster``
    :return: pca files list and statistics
    :raises ValueError: if ``estimator_matrix`` is not a supported name
    """
    # Reject unknown estimators up front: otherwise the matrix allocated with
    # ``np.empty`` below is never filled and the decomposition is meaningless.
    if estimator_matrix not in ("Correlation", "Covariance"):
        raise ValueError(
            'estimator_matrix must be "Correlation" or "Covariance", '
            'got {!r}'.format(estimator_matrix))
    pairwise_estimator = (da.corrcoef if estimator_matrix == "Correlation"
                          else da.cov)

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    def get_profile(path):
        """Get geospatial metadata profile such as projections, pixel sizes, etc"""
        with rasterio.open(path) as src:
            return src.profile.copy()

    if B:
        raw_image_a = read_raster(A, block_size=block_size)
        raw_image_b = read_raster(B, block_size=block_size)
        raw_image = da.vstack((raw_image_a, raw_image_b))
    else:
        raw_image = read_raster(A, block_size=block_size)

    # flat each dimension (bands)
    flat_dims = raw_image.reshape(
        (raw_image.shape[0], raw_image.shape[1] * raw_image.shape[2]))

    n_bands = raw_image.shape[0]

    ########
    # subtract the mean of column i from column i, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(da.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    # only the upper triangle is evaluated; each value is mirrored
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                pairwise_estimator(deviation_scores_band_i,
                                   deviation_scores_band_j)[0][1]

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    # output image profile
    prof = get_profile(A)
    prof.update(count=1, driver='GTiff', dtype=np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (raw_image[j] - band_mean[j])

    pca_files = []
    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        # save component as file
        tmp_pca_file = Path(out_dir, 'pc_{}.tif'.format(i + 1))
        write_raster(tmp_pca_file, pc.compute(), **prof)
        pca_files.append(tmp_pca_file)

    # compute the pyramids for each pc image
    @dask.delayed
    def pyramids(pca_file):
        # Argument list instead of a shell string, so paths containing
        # quotes or shell metacharacters are passed through verbatim.
        call(['gdaladdo', '--config', 'BIGTIFF_OVERVIEW', 'YES',
              str(pca_file)])

    dask.compute(*[pyramids(pca_file) for pca_file in pca_files],
                 num_workers=2)

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # Share of the total variance. The eigenvalues sum to the matrix trace,
    # which equals n_bands only for a correlation matrix, so normalise by
    # the actual sum to stay correct for the covariance estimator too.
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats
Exemplo n.º 7
0
def pca(a, b, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B, and write them to ``pca-stack.tif``

    :param a: first input raster data (first period)
    :param b: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "correlation" or "covariance"; the pairwise
        statistic used to build the band-by-band matrix
    :param out_dir: directory to save the outputs
    :param n_threads: number of threads for the dask thread pool
    :param block_size: the flattened bands are rechunked to
        ``block_size**2`` values per chunk
    :param nodata: value marking invalid pixels; when None it is read from
        the first band of ``a``
    :return: None; the components are written as the bands of
        ``pca-stack.tif`` inside ``out_dir``
    :raises ValueError: if ``estimator_matrix`` is not a supported name
    """
    # Reject unknown estimators up front: otherwise the matrix allocated with
    # ``np.empty`` below is never filled and the decomposition is meaningless.
    if estimator_matrix not in ("correlation", "covariance"):
        raise ValueError(
            'estimator_matrix must be "correlation" or "covariance", '
            'got {!r}'.format(estimator_matrix))
    pairwise_estimator = (da.corrcoef if estimator_matrix == "correlation"
                          else da.cov)

    A = a
    B = b
    # get/set the nodata
    if nodata is None:
        ds = gdal.Open(A, gdal.GA_ReadOnly)
        nodata = ds.GetRasterBand(1).GetNoDataValue()
        del ds

    print("\nPRINCIPAL COMPONENTS ANALYSIS")
    print("    Compute {} components for:".format(n_pc))
    print("    A: {}".format(A))
    if B is not None:
        print("    B: {}".format(B))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))
    # registered Dask progress bar
    # NOTE(review): the bar is registered here and never unregistered in
    # this function -- confirm that is intended for repeated calls.
    pbar = ProgressBar()
    pbar.register()

    print("\nRead and prepare data:")

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B is not None:
        src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band +
                                        1).ReadAsArray().flatten().astype(
                                            np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [b[~nodata_mask] for b in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(np.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    # only the upper triangle is evaluated; each value is mirrored
    print("\nComputing the estimator matrix:")
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                pairwise_estimator(deviation_scores_band_i,
                                   deviation_scores_band_j)[0][1]
    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        # Bands are re-read from disk (unmasked, full size) because the
        # in-memory copies were released above.
        src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(band - src_ds_A.RasterCount +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (get_raw_band_from_stack(j) - band_mean[j])

    print("\nComputing and saving the components in pca-stack.tif:")

    # save component as file
    tmp_pca_file = Path(out_dir) / 'pca-stack.tif'
    driver = gdal.GetDriverByName("GTiff")
    out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                           src_ds_A.RasterYSize, n_pc, gdal.GDT_Float32)

    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        pc = np.array(pc.compute())
        # nodata pixels are written as 0, which is also declared as the
        # output band's nodata value below
        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))

        pcband = out_pc.GetRasterBand(i + 1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        del pc, pcband
    # set projection and geotransform
    if src_ds_A.GetGeoTransform() is not None:
        out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
    if src_ds_A.GetProjection() is not None:
        out_pc.SetProjection(src_ds_A.GetProjection())
    out_pc.FlushCache()

    # free mem
    del src_ds_A, nodata_mask, out_pc

    print("\nDONE")
Exemplo n.º 8
0
 def fit(self):
     """Ensure ``self.corr_mat`` is populated and mark the object fitted.

     When no correlation matrix is stored yet, ``self.matrix`` is converted
     lazily through ``toarray`` and its correlation matrix is computed with
     dask. An existing ``corr_mat`` is left untouched.
     """
     needs_corr_mat = self.corr_mat is None
     if needs_corr_mat:
         source = self.matrix
         lazy_dense = da.from_delayed(
             delayed(toarray)(source),
             shape=source.shape,
             dtype=source.dtype,
         )
         self.corr_mat = da.corrcoef(lazy_dense).compute()
     self.fitted = True
Exemplo n.º 9
0
def pca(A,
        B,
        n_pc,
        estimator_matrix,
        out_dir,
        n_threads,
        block_size,
        nodata=None):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: first input raster data (first period)
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance"; the pairwise
        statistic used to build the band-by-band matrix
    :param out_dir: directory to save the outputs
    :param n_threads: number of threads for the dask thread pool
    :param block_size: the flattened bands are rechunked to
        ``block_size**2`` values per chunk
    :param nodata: value marking invalid pixels, or None to use every pixel
    :return: pca files list and statistics
    :raises ValueError: if ``estimator_matrix`` is not a supported name
    """
    # Reject unknown estimators up front: otherwise the matrix allocated with
    # ``np.empty`` below is never filled and the decomposition is meaningless.
    if estimator_matrix not in ("Correlation", "Covariance"):
        raise ValueError(
            'estimator_matrix must be "Correlation" or "Covariance", '
            'got {!r}'.format(estimator_matrix))
    pairwise_estimator = (da.corrcoef if estimator_matrix == "Correlation"
                          else da.cov)

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B:
        src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band +
                                        1).ReadAsArray().flatten().astype(
                                            np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [b[~nodata_mask] for b in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(np.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    # only the upper triangle is evaluated; each value is mirrored
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                pairwise_estimator(deviation_scores_band_i,
                                   deviation_scores_band_j)[0][1]
    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        # Bands are re-read from disk (unmasked, full size) because the
        # in-memory copies were released above.
        src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(band - src_ds_A.RasterCount +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)

    pca_files = []
    for i in range(n_pc):
        pc = 0
        for j in range(n_bands):
            pc = pc + eigenvectors[j, i] * (get_raw_band_from_stack(j) -
                                            band_mean[j])

        # nodata pixels are written as 0, which is also declared as the
        # output band's nodata value below
        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))
        # save component as file
        tmp_pca_file = Path(out_dir) / 'pc_{}.tif'.format(i + 1)
        driver = gdal.GetDriverByName("GTiff")
        out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                               src_ds_A.RasterYSize, 1, gdal.GDT_Float32)
        pcband = out_pc.GetRasterBand(1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        # set projection and geotransform
        if src_ds_A.GetGeoTransform() is not None:
            out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
        if src_ds_A.GetProjection() is not None:
            out_pc.SetProjection(src_ds_A.GetProjection())
        out_pc.FlushCache()
        del pc, pcband, out_pc

        pca_files.append(tmp_pca_file)

    # free mem
    del src_ds_A, nodata_mask

    # compute the pyramids for each pc image
    for pca_file in pca_files:
        # Argument list instead of a shell string, so paths containing
        # quotes or shell metacharacters are passed through verbatim.
        call(['gdaladdo', '-q', '--config', 'BIGTIFF_OVERVIEW', 'YES',
              str(pca_file)])

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # Share of the total variance. The eigenvalues sum to the matrix trace,
    # which equals n_bands only for a correlation matrix, so normalise by
    # the actual sum to stay correct for the covariance estimator too.
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats