def _spatial_interp(z_model: da.Array, x_model: da.Array, y_model: da.Array, x_sat: np.ndarray, y_sat: np.ndarray) -> np.ndarray:
    """Interpolate the model SSH field at the satellite track positions.

    Args:
        z_model (da.Array): model SSH values
        x_model (da.Array): model longitudes
        y_model (da.Array): model latitudes
        x_sat (np.ndarray): satellite longitudes
        y_sat (np.ndarray): satellite latitudes

    Returns:
        np.ndarray: SSH interpolated in space at the satellite
        positions, as float32.
    """
    # Build an R*Tree over the (lon, lat) model points.
    tree = pyinterp.RTree()
    model_points = np.vstack((x_model.compute(), y_model.compute())).T
    tree.packing(model_points, z_model.compute())

    # Thin-plate RBF interpolation using the 11 nearest neighbors only.
    sat_points = np.vstack((x_sat, y_sat)).T.astype("float32")
    interpolated, _ = tree.radial_basis_function(
        sat_points,
        within=True,
        k=11,
        rbf="thin_plate",
        num_threads=1,
    )
    return interpolated.astype("float32")
def spearman_1xn(
    x: da.Array,
    data: da.Array,
    value_range: Optional[Tuple[float, float]] = None,
    k: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Spearman correlation of column x against every column of data.

    Spearman is Pearson computed on ranks, so the columns are ranked
    here and the result is delegated to ``pearson_1xn``.

    Parameters
    ----------
    x : da.Array
        The reference column.
    data : da.Array
        2-d array whose columns are correlated against ``x``.
    value_range : Optional[Tuple[float, float]] = None
        Forwarded to ``pearson_1xn``.
    k : Optional[int] = None
        Forwarded to ``pearson_1xn``.
    """
    data = data.compute()  # TODO: How to compute rank distributedly?

    # Rank all columns at once. NOTE: the previous per-column loop filled
    # np.empty_like(data), which inherits data's dtype — for integer input
    # the fractional ranks of ties (e.g. 1.5) were silently truncated,
    # corrupting the correlation. DataFrame.rank always yields floats.
    ranks = da.from_array(pd.DataFrame(data).rank().to_numpy())

    xrank = pd.Series(x.compute()).rank()
    xrank = da.from_array(xrank)

    return pearson_1xn(xrank, ranks, value_range, k)
def _spatial_interp(
    z_model: da.Array,
    x_model: da.Array,
    y_model: da.Array,
    x_sat: np.ndarray,
    y_sat: np.ndarray,
) -> np.ndarray:
    """Spatial interpolation of SSH from NATL60 model.

    Args:
        z_model (da.Array): SSH model
        x_model (da.Array): longitude model
        y_model (da.Array): latitude model
        x_sat (np.ndarray): longitude satellite
        y_sat (np.ndarray): latitude satellite

    Returns:
        np.ndarray: SSH satellite
    """
    tree = pyinterp.RTree(dtype=np.dtype("float32"))

    # Load the model field and keep only the defined (non-NaN) pixels.
    t0 = time.time()
    ssh_values = z_model.compute()
    valid = ~np.isnan(ssh_values)
    ssh_values = ssh_values[valid]
    lon_values = x_model[valid].compute()
    lat_values = y_model[valid].compute()
    points = np.vstack((lon_values, lat_values)).T
    del lon_values, lat_values
    LOGGER.debug(
        "loaded %d MB in %.2fs",
        (points.nbytes + ssh_values.nbytes) // 1024**2,
        time.time() - t0,
    )

    # Build the R*Tree over the valid model points; the large temporaries
    # are released immediately to keep the peak memory footprint down.
    t0 = time.time()
    tree.packing(points, ssh_values)
    del points, ssh_values
    LOGGER.debug("mesh build in %.2fs", time.time() - t0)

    # Thin-plate RBF interpolation at the satellite track positions,
    # limited to 11 neighbors within an 8 km search radius.
    t0 = time.time()
    interpolated, _ = tree.radial_basis_function(
        np.vstack((x_sat, y_sat)).T.astype("float32"),
        within=True,
        k=11,
        radius=8000,
        rbf="thin_plate",
        num_threads=1,
    )
    LOGGER.debug("interpolation done in %.2fs", time.time() - t0)
    del tree
    return interpolated.astype("float32")
def spearman_nxn(data: da.Array) -> da.Array:
    """
    Spearman correlation calculation of a n x n correlation matrix for n columns

    Spearman is Pearson applied to ranks, so each column is ranked and
    the matrix computation is delegated to ``pearson_nxn``.
    """
    data = data.compute()  # TODO: How to compute rank distributedly?

    # Rank every column at once. NOTE: the previous per-column loop wrote
    # into np.empty_like(data), which keeps data's dtype — with integer
    # input the tied ranks (e.g. 1.5) were truncated to ints, corrupting
    # the correlation matrix. DataFrame.rank always produces floats.
    ranks = da.from_array(pd.DataFrame(data).rank().to_numpy())

    return pearson_nxn(ranks)
def calc_hist_kde(
    data: da.Array, bins: int, bandwidth: float
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Compute a density histogram of a numerical series together with a
    gaussian kernel density estimate of the same data.

    Parameters
    ----------
    data: da.Array
        one numerical column over which to compute the histogram and kde
    bins : int
        number of bins to use in the histogram
    bandwidth: float
        bandwidth for the kde

    Returns
    -------
    Tuple[pd.DataFrame, np.ndarray, np.ndarray]
        The histogram in a dataframe, range of points for the kde,
        and the kde calculated at the specified points
    """
    minv, maxv = dask.compute(data.min(), data.max())

    # Density histogram over the full observed range.
    freq, edges = da.histogram(data, range=[minv, maxv], bins=bins, density=True)
    freq = freq.compute()
    hist_df = pd.DataFrame(
        {
            "intervals": _format_bin_intervals(edges),
            "left": edges[:-1],
            "right": edges[1:],
            "freq": freq,
        }
    )

    # Evaluate the gaussian KDE on 1000 evenly spaced points.
    pts_rng = np.linspace(minv, maxv, 1000)
    pdf = gaussian_kde(data.compute(), bw_method=bandwidth)(pts_rng)
    return hist_df, pts_rng, pdf