Example #1
def cluster(matrix, labels=None, threshold=None, maxsteps=25, destroy=False,
            normalized=False, nsamples=2500, maxsize=2500, logfile=None):
    """Iterative medoid cluster generator. Yields (medoid), set(labels) pairs.

    Inputs:
        matrix: A (obs x features) Numpy matrix of data type numpy.float32
        labels: None or Numpy array with labels for matrix rows [None = indices]
        threshold: Search for optimal medoid within this distance from medoid [None = auto]
        maxsteps: Stop searching for optimal medoid after N futile attempts [25]
        destroy: Destroy input matrix, saving memory. [False]
        normalized: Matrix is already zscore-normalized across axis 1 [False]
        nsamples: Estimate threshold from N samples [2500]
        maxsize: Discard sample if more than N contigs are within threshold [2500]
        logfile: Print threshold estimates and certainty to file [None]

    Output: Generator of (medoid, set(labels_in_cluster)) tuples.
    """

    if not destroy:
        matrix = _np.copy(matrix)

    if not normalized:
        _vambtools.zscore(matrix, axis=1, inplace=True)

    labels, threshold = _check_params(matrix, threshold, labels, nsamples, maxsize, maxsteps, logfile)

    return _cluster(matrix, labels, threshold, maxsteps)
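
A minimal usage sketch for the generator above, assuming the function is in scope (e.g. imported from vamb's cluster module); the latent matrix and contig names are made-up placeholders, and passing an explicit threshold skips the automatic estimation described in the docstring:

import numpy as np

# Placeholder latent encoding: 1000 contigs x 32 features, float32 as the docstring requires
latent = np.random.rand(1000, 32).astype(np.float32)
names = ['contig_{}'.format(i) for i in range(len(latent))]

# cluster() is lazy: each iteration yields one (medoid_label, set_of_member_labels) pair
clusters = dict(cluster(latent, labels=names, threshold=0.1))
print(len(clusters), 'clusters found')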
Example #2
def make_dataloader(rpkm, tnf, batchsize=256, destroy=False, cuda=False):
    """Create a DataLoader and a contig mask from RPKM and TNF.

    The dataloader is an object feeding minibatches of contigs to the VAE.
    The data are normalized versions of the input datasets, with zero-contigs,
    i.e. contigs whose row in either TNF or RPKM is all zeros, removed.
    The mask is a boolean mask designating which contigs have been kept.

    Inputs:
        rpkm: RPKM matrix (N_contigs x N_samples)
        tnf: TNF matrix (N_contigs x N_TNF)
        batchsize: Starting size of minibatches for dataloader
        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
        cuda: Pagelock memory of dataloader (use when using GPU acceleration)

    Outputs:
        DataLoader: An object feeding data to the VAE
        mask: A boolean mask of which contigs are kept
    """

    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
        raise ValueError('TNF and RPKM must be Numpy arrays')

    if batchsize < 1:
        raise ValueError('Minimum batchsize of 1, not {}'.format(batchsize))

    if len(rpkm) != len(tnf):
        raise ValueError('Lengths of RPKM and TNF must be the same')

    if not (rpkm.dtype == tnf.dtype == _np.float32):
        raise ValueError('TNF and RPKM must be Numpy arrays of dtype float32')

    mask = tnf.sum(axis=1) != 0

    # If there are multiple samples, also require nonzero depth for a sequence
    # to be accepted
    if rpkm.shape[1] > 1:
        depthssum = rpkm.sum(axis=1)
        mask &= depthssum != 0
        depthssum = depthssum[mask]

    if mask.sum() < batchsize:
        raise ValueError(
            'Fewer sequences left after filtering than the batch size.')

    if destroy:
        rpkm = _vambtools.numpy_inplace_maskarray(rpkm, mask)
        tnf = _vambtools.numpy_inplace_maskarray(tnf, mask)
    else:
        # The astype operation does not copy due to "copy=False", but the masking
        # operation does.
        rpkm = rpkm[mask].astype(_np.float32, copy=False)
        tnf = tnf[mask].astype(_np.float32, copy=False)

    # If multiple samples, normalize to sum to 1, else zscore normalize
    if rpkm.shape[1] > 1:
        rpkm /= depthssum.reshape((-1, 1))
    else:
        _vambtools.zscore(rpkm, axis=0, inplace=True)

    # Normalize TNF, then create the Tensors (the tensors share the underlying
    # memory of the Numpy arrays)
    _vambtools.zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create dataloader
    n_workers = 4 if cuda else 1
    dataset = _TensorDataset(depthstensor, tnftensor)
    dataloader = _DataLoader(dataset=dataset,
                             batch_size=batchsize,
                             drop_last=True,
                             shuffle=True,
                             num_workers=n_workers,
                             pin_memory=cuda)

    return dataloader, mask
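
A hedged usage sketch for the function above, with random placeholder arrays standing in for real RPKM and TNF matrices (the shapes and contig names are assumptions, not values from the source):

import numpy as np

n_contigs = 5000
rpkm = np.random.rand(n_contigs, 6).astype(np.float32)   # N_contigs x N_samples
tnf = np.random.rand(n_contigs, 103).astype(np.float32)  # N_contigs x N_TNF (width is a placeholder)

dataloader, mask = make_dataloader(rpkm, tnf, batchsize=256)

# mask marks which rows survived the zero-contig filtering; apply it to the
# contig names so they stay aligned with the rows fed to the VAE
names = np.array(['contig_{}'.format(i) for i in range(n_contigs)])
kept_names = names[mask]
print(len(kept_names), 'contigs kept of', n_contigs)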
Example #3
File: encode.py Project: czbiohub/vamb
def make_dataloader(rpkm, tnf, batchsize=64, destroy=False, cuda=False):
    """Create a DataLoader and a contig mask from RPKM and TNF.

    The dataloader is an object feeding minibatches of contigs to the VAE.
    The data are normalized versions of the input datasets, with zero-contigs,
    i.e. contigs whose row in either TNF or RPKM is all zeros, removed.
    The mask is a boolean mask designating which contigs have been kept.

    Inputs:
        rpkm: RPKM matrix (N_contigs x N_samples)
        tnf: TNF matrix (N_contigs x 136)
        batchsize: Starting size of minibatches for dataloader
        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
        cuda: Pagelock memory of dataloader (use when using GPU acceleration)

    Outputs:
        DataLoader: An object feeding data to the VAE
        mask: A boolean mask of which contigs are kept
    """

    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
        raise ValueError('TNF and RPKM must be Numpy arrays')

    if batchsize < 1:
        raise ValueError('Minimum batchsize of 1, not {}'.format(batchsize))

    if len(rpkm) != len(tnf):
        raise ValueError('Lengths of RPKM and TNF must be the same')

    if tnf.shape[1] != 136:
        raise ValueError('TNF must be 136 long along axis 1')

    tnfsum = tnf.sum(axis=1)
    mask = tnfsum != 0
    del tnfsum
    depthssum = rpkm.sum(axis=1)
    mask &= depthssum != 0

    if destroy:
        if not (rpkm.dtype == tnf.dtype == _np.float32):
            raise ValueError(
                'Arrays must be of data type np.float32 if destroy is True')

        rpkm = _vambtools.inplace_maskarray(rpkm, mask)
        tnf = _vambtools.inplace_maskarray(tnf, mask)
    else:
        rpkm = rpkm[mask].astype(_np.float32, copy=False)
        tnf = tnf[mask].astype(_np.float32, copy=False)

    depthssum = depthssum[mask]

    # Normalize arrays and create the Tensors
    rpkm /= depthssum.reshape((-1, 1))
    _vambtools.zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create dataloader
    dataset = _TensorDataset(depthstensor, tnftensor)
    dataloader = _DataLoader(dataset=dataset,
                             batch_size=batchsize,
                             drop_last=True,
                             shuffle=True,
                             num_workers=1,
                             pin_memory=cuda)

    return dataloader, mask
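
This older revision differs from Example #2 mainly in that it requires exactly 136 TNF features, always filters on zero depth, and only enforces float32 dtype when destroy=True. A small sketch of the shape check, using throwaway arrays:

import numpy as np

# TNF narrower than 136 columns is rejected before any dataloader is built
tnf_bad = np.random.rand(10, 103).astype(np.float32)
rpkm = np.random.rand(10, 2).astype(np.float32)

try:
    make_dataloader(rpkm, tnf_bad)
except ValueError as error:
    print(error)  # 'TNF must be 136 long along axis 1'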