def calculate(self, **kwargs):
    """Evaluate this metric over a dataset and return a summary Series.

    Expected kwargs: ``smiles_dataset``, ``fingerprint_dataset``,
    ``properties``, ``estimator``, ``param_dict``.

    Returns
    -------
    pd.Series with keys 'name', 'value', 'fingerprint_error',
    'embedding_error'.
    """
    smiles_dataset = kwargs['smiles_dataset']
    fingerprint_dataset = kwargs['fingerprint_dataset']
    properties = kwargs['properties']
    estimator = kwargs['estimator']
    param_dict = kwargs['param_dict']

    # Per-molecule embeddings: token-averaged, without zero padding.
    emb_matrix = self.sample_many(smiles_dataset,
                                  zero_padded_vals=False,
                                  average_tokens=True)
    emb_matrix = cupy.asarray(emb_matrix, dtype=cupy.float32)

    # Hand the fingerprint data to CuPy via DLPack, then force a
    # C-contiguous float32 layout for the metric computation.
    fp_matrix = cupy.fromDlpack(fingerprint_dataset.data.to_dlpack())
    fp_matrix = cupy.asarray(fp_matrix, order='C', dtype=cupy.float32)

    metric, fingerprint_errors, embedding_errors = self._calculate_metric(
        emb_matrix, fp_matrix, properties, estimator, param_dict)
    logger.info(
        f'{type(metric)} {type(fingerprint_errors)} {type(embedding_errors)}'
    )

    # Collapse the per-item arrays into scalar summaries, ignoring NaNs.
    metric = cupy.nanmean(metric)
    fingerprint_errors = cupy.nanmean(fingerprint_errors)
    embedding_errors = cupy.nanmean(embedding_errors)

    return pd.Series({
        'name': self.name,
        'value': metric,
        'fingerprint_error': fingerprint_errors,
        'embedding_error': embedding_errors
    })
def corr_pairwise(x, y, return_pearson=False): """Covariance and Pearson product-moment correlation coefficients on the GPU for paired data with tolerance of NaNs. Curently only supports rows as samples and columns as observations. Parameters ---------- x : array_like The baseline array of values. y : array_like The comparison array of values. Returns ------- corr : cupy ndarray Array of correlation values """ def _cov_pairwise(x1, x2, factor): return cupy.nansum(x1 * x2, axis=1, keepdims=True) * cupy.true_divide( 1, factor) # Coerce arrays into 2D format and set dtype dtype = cupy.result_type(x, y, cupy.float64) x = cupy.asarray(x, dtype=dtype) y = cupy.asarray(y, dtype=dtype) assert x.shape == y.shape if x.ndim < 2: x = x[None, :] y = y[None, :] n_samples, n_obs = x.shape # Calculate degrees of freedom for each sample pair ddof = 1 nan_count = (cupy.isnan(x) | cupy.isnan(y)).sum(axis=1, keepdims=True) fact = n_obs - nan_count - ddof # Mean normalize x -= cupy.nanmean(x, axis=1, keepdims=True) y -= cupy.nanmean(y, axis=1, keepdims=True) # Calculate covariance matrix corr = _cov_pairwise(x, y, fact) if return_pearson: x_corr = _cov_pairwise(x, x, fact) y_corr = _cov_pairwise(y, y, fact) auto_corr = cupy.sqrt(x_corr) * cupy.sqrt(y_corr) corr = corr / auto_corr corr = cupy.clip(corr.real, -1, 1, out=corr.real) return corr return corr.squeeze()
def _compute_spearman_rho(self, fp_sample, Xt_sample, top_k=100):
    """Mean Spearman rank correlation between fingerprint-space Tanimoto
    distances and embedding-space pairwise distances (NaNs ignored)."""
    # Accept either a dataframe-like (with .values) or a raw array.
    fp_values = fp_sample.values if hasattr(fp_sample, 'values') else fp_sample
    tanimoto_dists = tanimoto_calculate(fp_values, calc_distance=True)
    embedding_dists = pairwise_distances(Xt_sample)
    rho = spearmanr(tanimoto_dists, embedding_dists, top_k=top_k)
    return cupy.nanmean(rho)
def calculate(self, **kwargs):
    """Evaluate this metric over a dataset and return a summary Series.

    Expected kwargs: ``smiles_dataset``, ``fingerprint_dataset``, ``top_k``.

    Returns
    -------
    pd.Series with keys 'name', 'value', 'top_k'.
    """
    smiles_dataset = kwargs['smiles_dataset']
    fingerprint_dataset = kwargs['fingerprint_dataset']
    top_k = kwargs['top_k']

    # Per-molecule embeddings: zero-padded, tokens kept (not averaged).
    embeddings = self.sample_many(smiles_dataset,
                                  zero_padded_vals=True,
                                  average_tokens=False)

    # Calculate pairwise distances for fingerprints
    fp_matrix = cupy.fromDlpack(fingerprint_dataset.data.to_dlpack())
    fp_matrix = cupy.asarray(fp_matrix, order='C')

    metric = cupy.nanmean(self._calculate_metric(embeddings, fp_matrix, top_k))

    # NOTE(review): the metric above was computed with the original
    # top_k value; a falsy top_k is resolved here only for reporting —
    # presumably _calculate_metric treats falsy as "all neighbors".
    if not top_k:
        top_k = embeddings.shape[0] - 1
    return pd.Series({'name': self.name, 'value': metric, 'top_k': top_k})
def _hotspots_cupy(raster, kernel):
    """Identify statistically significant hot/cold spots on the GPU.

    Convolves the raster with a normalized kernel to get neighborhood
    means, converts them to z-scores against the global mean/std, and
    classifies each cell via ``_calc_hotspots_cupy``.

    Raises
    ------
    ValueError
        If the raster dtype is neither integer nor float.
    ZeroDivisionError
        If the raster values have zero standard deviation.
    """
    cell_type = raster.data.dtype.type
    if not issubclass(cell_type, (cupy.integer, cupy.floating)):
        raise ValueError("data type must be integer or float")

    # Neighborhood means via a sum-normalized convolution kernel.
    neighborhood_mean = convolve_2d(raster.data, kernel / kernel.sum())

    # Guard against a constant raster before dividing by the std.
    global_std = cupy.nanstd(raster.data)
    if global_std == 0:
        raise ZeroDivisionError(
            "Standard deviation of the input raster values is 0.")
    global_mean = cupy.nanmean(raster.data)

    zscores = (neighborhood_mean - global_mean) / global_std
    return _calc_hotspots_cupy(zscores)
def _hotspots_cupy(raster, kernel):
    """Identify statistically significant hot/cold spots on the GPU.

    Casts the raster to float32, convolves with a normalized kernel to
    get neighborhood means, converts them to z-scores against the global
    mean/std, and launches a CUDA kernel to classify each cell.

    Raises
    ------
    ValueError
        If the raster dtype is neither integer nor float.
    ZeroDivisionError
        If the raster values have zero standard deviation.
    """
    cell_type = raster.data.dtype.type
    if not issubclass(cell_type, (cupy.integer, cupy.floating)):
        raise ValueError("data type must be integer or float")

    # Work in float32 so integer rasters divide correctly below.
    values = raster.data.astype(cupy.float32)

    # Neighborhood means via a sum-normalized convolution kernel.
    neighborhood_mean = convolve_2d(values, kernel / kernel.sum())

    global_mean = cupy.nanmean(values)
    global_std = cupy.nanstd(values)
    # Guard against a constant raster before dividing by the std.
    if global_std == 0:
        raise ZeroDivisionError(
            "Standard deviation of the input raster values is 0.")
    zscores = (neighborhood_mean - global_mean) / global_std

    # Launch the classification kernel over the z-score grid.
    result = cupy.zeros_like(zscores, dtype=cupy.int8)
    grid, block = cuda_args(zscores.shape)
    _run_gpu_hotspots[grid, block](zscores, result)
    return result
def hl_ratio(x):
    """Ratio of the count of values at or above the NaN-ignoring mean to
    the count of values strictly below it.

    Note: NaN comparisons are False, so NaNs are excluded from both
    counts; an all-below-mean input would divide by zero (unchanged
    behavior).
    """
    # Hoist the mean: the original recomputed cp.nanmean(x) twice.
    mean = cp.nanmean(x)
    return cp.sum(x >= mean) / cp.sum(x < mean)
def mean_variance(x):
    """Coefficient of variation: NaN-ignoring std divided by NaN-ignoring
    mean (division by zero possible for zero-mean input, as before)."""
    spread = cp.nanstd(x)
    center = cp.nanmean(x)
    return spread / center