Example #1
0
    def calculate(self, **kwargs):
        """Run the metric over a SMILES/fingerprint dataset pair.

        Expects 'smiles_dataset', 'fingerprint_dataset', 'properties',
        'estimator' and 'param_dict' in ``kwargs``. Returns a pandas
        Series with the NaN-ignoring mean metric value plus the
        fingerprint- and embedding-based error means.
        """
        dataset = kwargs['smiles_dataset']
        fp_dataset = kwargs['fingerprint_dataset']
        props = kwargs['properties']
        est = kwargs['estimator']
        params = kwargs['param_dict']

        # Token-averaged, non-padded embeddings for every SMILES entry.
        emb = self.sample_many(dataset,
                               zero_padded_vals=False,
                               average_tokens=True)
        emb = cupy.asarray(emb, dtype=cupy.float32)

        # Move the fingerprints onto the GPU as a C-contiguous float array.
        fp = cupy.fromDlpack(fp_dataset.data.to_dlpack())
        fp = cupy.asarray(fp, order='C', dtype=cupy.float32)

        metric, fp_err, emb_err = self._calculate_metric(
            emb, fp, props, est, params)
        logger.info(f'{type(metric)}  {type(fp_err)} {type(emb_err)}')

        return pd.Series({
            'name': self.name,
            'value': cupy.nanmean(metric),
            'fingerprint_error': cupy.nanmean(fp_err),
            'embedding_error': cupy.nanmean(emb_err)
        })
Example #2
0
def corr_pairwise(x, y, return_pearson=False):
    """Covariance and Pearson product-moment correlation coefficients on the GPU for paired data with tolerance of NaNs.
       Curently only supports rows as samples and columns as observations.

    Parameters
    ----------
    x : array_like
        The baseline array of values.
    y : array_like
        The comparison array of values.

    Returns
    -------
    corr : cupy ndarray
         Array of correlation values
    """
    def _cov_pairwise(x1, x2, factor):
        return cupy.nansum(x1 * x2, axis=1, keepdims=True) * cupy.true_divide(
            1, factor)

    # Coerce arrays into 2D format and set dtype
    dtype = cupy.result_type(x, y, cupy.float64)
    x = cupy.asarray(x, dtype=dtype)
    y = cupy.asarray(y, dtype=dtype)

    assert x.shape == y.shape
    if x.ndim < 2:
        x = x[None, :]
        y = y[None, :]
    n_samples, n_obs = x.shape

    # Calculate degrees of freedom for each sample pair
    ddof = 1
    nan_count = (cupy.isnan(x) | cupy.isnan(y)).sum(axis=1, keepdims=True)
    fact = n_obs - nan_count - ddof

    # Mean normalize
    x -= cupy.nanmean(x, axis=1, keepdims=True)
    y -= cupy.nanmean(y, axis=1, keepdims=True)

    # Calculate covariance matrix
    corr = _cov_pairwise(x, y, fact)

    if return_pearson:
        x_corr = _cov_pairwise(x, x, fact)
        y_corr = _cov_pairwise(y, y, fact)
        auto_corr = cupy.sqrt(x_corr) * cupy.sqrt(y_corr)
        corr = corr / auto_corr
        corr = cupy.clip(corr.real, -1, 1, out=corr.real)
        return corr

    return corr.squeeze()
Example #3
0
 def _compute_spearman_rho(self, fp_sample, Xt_sample, top_k=100):
     """Mean Spearman rho between Tanimoto (fingerprint) and Euclidean
     (embedding) distance matrices, restricted to the top_k neighbours."""
     # Accept either a cuDF/pandas object or a raw array.
     if hasattr(fp_sample, 'values'):
         fp_sample = fp_sample.values
     tani_dist = tanimoto_calculate(fp_sample, calc_distance=True)
     eucl_dist = pairwise_distances(Xt_sample)
     rho = spearmanr(tani_dist, eucl_dist, top_k=top_k)
     return cupy.nanmean(rho)
Example #4
0
    def calculate(self, **kwargs):
        """Compute the nearest-neighbour metric for a dataset pair.

        Expects 'smiles_dataset', 'fingerprint_dataset' and 'top_k' in
        ``kwargs``; returns a pandas Series with the NaN-ignoring mean
        metric and the effective top_k.
        """
        dataset = kwargs['smiles_dataset']
        fp_dataset = kwargs['fingerprint_dataset']
        top_k = kwargs['top_k']

        # Zero-padded, per-token embeddings (no averaging).
        emb = self.sample_many(dataset,
                               zero_padded_vals=True,
                               average_tokens=False)

        # Calculate pairwise distances for fingerprints
        fp = cupy.fromDlpack(fp_dataset.data.to_dlpack())
        fp = cupy.asarray(fp, order='C')

        value = cupy.nanmean(self._calculate_metric(emb, fp, top_k))

        # Report the effective top_k: a falsy value means "all neighbours".
        if not top_k:
            top_k = emb.shape[0] - 1
        return pd.Series({'name': self.name, 'value': value, 'top_k': top_k})
Example #5
0
def _hotspots_cupy(raster, kernel):
    """Classify hot/cold spots of a raster on the GPU.

    Convolves the raster with the normalized kernel, converts the local
    means to z-scores against the global mean/std, and hands the z-array
    to the hotspot classifier.
    """
    dtype_type = raster.data.dtype.type
    if not issubclass(dtype_type, (cupy.integer, cupy.floating)):
        raise ValueError("data type must be integer or float")

    # Local neighbourhood means via a sum-normalized convolution kernel.
    local_mean = convolve_2d(raster.data, kernel / kernel.sum())

    # z-scores of the local means against the whole raster.
    mu = cupy.nanmean(raster.data)
    sigma = cupy.nanstd(raster.data)
    if sigma == 0:
        raise ZeroDivisionError(
            "Standard deviation of the input raster values is 0.")
    z = (local_mean - mu) / sigma

    return _calc_hotspots_cupy(z)
Example #6
0
def _hotspots_cupy(raster, kernel):
    """Classify hot/cold spots of a raster on the GPU.

    Casts the raster to float32, convolves with the normalized kernel,
    converts local means to z-scores against the global mean/std, and
    runs the CUDA hotspot kernel to produce an int8 classification map.
    """
    dtype_type = raster.data.dtype.type
    if not issubclass(dtype_type, (cupy.integer, cupy.floating)):
        raise ValueError("data type must be integer or float")

    values = raster.data.astype(cupy.float32)

    # Local neighbourhood means via a sum-normalized convolution kernel.
    local_mean = convolve_2d(values, kernel / kernel.sum())

    # z-scores of the local means against the whole raster.
    mu = cupy.nanmean(values)
    sigma = cupy.nanstd(values)
    if sigma == 0:
        raise ZeroDivisionError(
            "Standard deviation of the input raster values is 0.")
    z = (local_mean - mu) / sigma

    result = cupy.zeros_like(z, dtype=cupy.int8)
    grid, block = cuda_args(z.shape)
    _run_gpu_hotspots[grid, block](z, result)
    return result
Example #7
0
def hl_ratio(x):
    """Ratio of the count of values at/above the NaN-ignoring mean to the
    count of values strictly below it."""
    # Compute the mean once instead of twice per call.
    mean = cp.nanmean(x)
    return cp.sum(x >= mean) / cp.sum(x < mean)
Example #8
0
def mean_variance(x):
    """Coefficient of variation: NaN-ignoring std divided by NaN-ignoring mean."""
    std = cp.nanstd(x)
    mean = cp.nanmean(x)
    return std / mean