def cluster(matrix, labels=None, threshold=None, maxsteps=25, destroy=False,
            normalized=False, nsamples=2500, maxsize=2500, logfile=None):
    """Iterative medoid cluster generator. Yields (medoid, set(labels)) pairs.

    Inputs:
        matrix: A (obs x features) Numpy matrix of data type numpy.float32
        labels: None or Numpy array with labels for matrix rows [None = indices]
        threshold: Optimal medoid search in this distance from medoid [None = auto]
        maxsteps: Stop searching for optimal medoid after N futile attempts [25]
        destroy: Destroy input matrix, saving memory. [False]
        normalized: Matrix is already zscore-normalized across axis 1 [False]
        nsamples: Estimate threshold from N samples [2500]
        maxsize: Discard sample if more than N contigs are within threshold [2500]
        logfile: Print threshold estimates and certainty to file [None]

    Output: Generator of (medoid, set(labels_in_cluster)) tuples.
    """
    # Work on a copy unless the caller explicitly allows mutation of the input
    if not destroy:
        matrix = _np.copy(matrix)

    # The medoid search assumes each row is zscore-normalized
    if not normalized:
        _vambtools.zscore(matrix, axis=1, inplace=True)

    labels, threshold = _check_params(matrix, threshold, labels, nsamples,
                                      maxsize, maxsteps, logfile)
    return _cluster(matrix, labels, threshold, maxsteps)
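# Usage sketch (not part of the library): a minimal example of consuming the
# generator returned by `cluster` above. The latent matrix here is random
# stand-in data; in practice it would be the VAE's float32 encoding of the
# contigs. The helper name `_example_cluster` is hypothetical.
def _example_cluster():
    import numpy as np

    latent = np.random.rand(1000, 32).astype(np.float32)  # stand-in latent matrix
    names = np.array(['contig_{}'.format(i) for i in range(len(latent))])

    # The generator is lazy; iterate to materialize clusters one at a time
    clusters = {}
    for medoid, members in cluster(latent, labels=names):
        clusters[medoid] = members
    return clusters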
def make_dataloader(rpkm, tnf, batchsize=256, destroy=False, cuda=False):
    """Create a DataLoader and a contig mask from RPKM and TNF.

    The dataloader is an object feeding minibatches of contigs to the VAE.
    The data are normalized versions of the input datasets, with zero-contigs,
    i.e. contigs where a row in either TNF or RPKM is all zeros, removed.
    The mask is a boolean mask designating which contigs have been kept.

    Inputs:
        rpkm: RPKM matrix (N_contigs x N_samples)
        tnf: TNF matrix (N_contigs x N_TNF)
        batchsize: Starting size of minibatches for dataloader
        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
        cuda: Pagelock memory of dataloader (use when using GPU acceleration)

    Outputs:
        DataLoader: An object feeding data to the VAE
        mask: A boolean mask of which contigs are kept
    """
    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
        raise ValueError('TNF and RPKM must be Numpy arrays')

    if batchsize < 1:
        raise ValueError('Minimum batchsize of 1, not {}'.format(batchsize))

    if len(rpkm) != len(tnf):
        raise ValueError('Lengths of RPKM and TNF must be the same')

    if not (rpkm.dtype == tnf.dtype == _np.float32):
        raise ValueError('TNF and RPKM must be Numpy arrays of dtype float32')

    mask = tnf.sum(axis=1) != 0

    # If multiple samples, also require nonzero depth for a sequence
    # to be accepted
    if rpkm.shape[1] > 1:
        depthssum = rpkm.sum(axis=1)
        mask &= depthssum != 0
        depthssum = depthssum[mask]

    if mask.sum() < batchsize:
        raise ValueError('Fewer sequences left after filtering than the batch size.')

    if destroy:
        rpkm = _vambtools.numpy_inplace_maskarray(rpkm, mask)
        tnf = _vambtools.numpy_inplace_maskarray(tnf, mask)
    else:
        # The astype operation does not copy due to "copy=False", but the
        # masking operation does.
        rpkm = rpkm[mask].astype(_np.float32, copy=False)
        tnf = tnf[mask].astype(_np.float32, copy=False)

    # If multiple samples, normalize depths to sum to 1, else zscore normalize
    if rpkm.shape[1] > 1:
        rpkm /= depthssum.reshape((-1, 1))
    else:
        _vambtools.zscore(rpkm, axis=0, inplace=True)

    # Normalize TNF and create the Tensors (the tensors share the underlying
    # memory of the Numpy arrays)
    _vambtools.zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create dataloader
    n_workers = 4 if cuda else 1
    dataset = _TensorDataset(depthstensor, tnftensor)
    dataloader = _DataLoader(dataset=dataset, batch_size=batchsize, drop_last=True,
                             shuffle=True, num_workers=n_workers, pin_memory=cuda)

    return dataloader, mask
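# Usage sketch (not part of the library): calling the make_dataloader variant
# directly above on synthetic arrays. Real inputs would come from mapped read
# depths (RPKM) and tetranucleotide frequencies (TNF); the shapes and the
# helper name `_example_make_dataloader` are illustrative assumptions.
def _example_make_dataloader():
    import numpy as np

    n_contigs, n_samples, n_tnf = 10000, 3, 103
    rpkm = np.random.rand(n_contigs, n_samples).astype(np.float32)
    tnf = np.random.rand(n_contigs, n_tnf).astype(np.float32)

    dataloader, mask = make_dataloader(rpkm, tnf, batchsize=256)

    # `mask` maps rows of the filtered dataset back to the original contigs
    kept_indices = np.flatnonzero(mask)
    return dataloader, kept_indices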
def make_dataloader(rpkm, tnf, batchsize=64, destroy=False, cuda=False):
    """Create a DataLoader and a contig mask from RPKM and TNF.

    The dataloader is an object feeding minibatches of contigs to the VAE.
    The data are normalized versions of the input datasets, with zero-contigs,
    i.e. contigs where a row in either TNF or RPKM is all zeros, removed.
    The mask is a boolean mask designating which contigs have been kept.

    Inputs:
        rpkm: RPKM matrix (N_contigs x N_samples)
        tnf: TNF matrix (N_contigs x 136)
        batchsize: Starting size of minibatches for dataloader
        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
        cuda: Pagelock memory of dataloader (use when using GPU acceleration)

    Outputs:
        DataLoader: An object feeding data to the VAE
        mask: A boolean mask of which contigs are kept
    """
    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
        raise ValueError('TNF and RPKM must be Numpy arrays')

    if batchsize < 1:
        raise ValueError('Minimum batchsize of 1, not {}'.format(batchsize))

    if len(rpkm) != len(tnf):
        raise ValueError('Lengths of RPKM and TNF must be the same')

    if tnf.shape[1] != 136:
        raise ValueError('TNF must be 136 long along axis 1')

    # Keep only contigs with nonzero TNF and nonzero depth
    tnfsum = tnf.sum(axis=1)
    mask = tnfsum != 0
    del tnfsum
    depthssum = rpkm.sum(axis=1)
    mask &= depthssum != 0

    if destroy:
        if not (rpkm.dtype == tnf.dtype == _np.float32):
            raise ValueError('Arrays must be of data type np.float32 if destroy is True')
        rpkm = _vambtools.inplace_maskarray(rpkm, mask)
        tnf = _vambtools.inplace_maskarray(tnf, mask)
    else:
        rpkm = rpkm[mask].astype(_np.float32, copy=False)
        tnf = tnf[mask].astype(_np.float32, copy=False)

    depthssum = depthssum[mask]

    # Normalize arrays and create the Tensors
    rpkm /= depthssum.reshape((-1, 1))
    _vambtools.zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create dataloader
    dataset = _TensorDataset(depthstensor, tnftensor)
    dataloader = _DataLoader(dataset=dataset, batch_size=batchsize, drop_last=True,
                             shuffle=True, num_workers=1, pin_memory=cuda)

    return dataloader, mask
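# Usage sketch (not part of the library): the destroy=True path of the variant
# above, which requires float32 inputs and exactly 136 TNF columns. The helper
# name `_example_make_dataloader_destroy` is hypothetical.
def _example_make_dataloader_destroy():
    import numpy as np

    rpkm = np.random.rand(5000, 4).astype(np.float32)
    tnf = np.random.rand(5000, 136).astype(np.float32)

    # In-place masking avoids copying the (potentially large) input arrays,
    # so `rpkm` and `tnf` must not be reused by the caller afterwards
    dataloader, mask = make_dataloader(rpkm, tnf, batchsize=64, destroy=True)
    return dataloader, mask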