Пример #1
0
def pca(data, dim=None, validation_split=None, batch_size=100, whiten=False):
    '''Reduce dimensionality via principal component analysis.

    The leading <dim> eigenvectors of the instantaneous covariance matrix
    define a rotation/projection of the data into a lower dimensional
    subspace.

    Arguments:
        data (numpy-ndarray of list thereof): the data to be transformed
        dim (int): the target dimensionality
        validation_split (float): fraction of the data reserved for validation
        batch_size (int): specify a batch size for the minibatch process
        whiten (boolean): set to True to whiten the transformed data

    Returns:
        (numpy.ndarray of list thereof): the transformed data
        (float): training loss
        (float): validation loss
    '''
    dataset = _create_dataset(data, lag=0)
    if validation_split is not None:
        held_out, kept = _random_split(dataset, f_active=validation_split)
        train_loader = _DataLoader(kept, batch_size=batch_size)
        test_loader = _DataLoader(held_out, batch_size=batch_size)
    else:
        train_loader = _DataLoader(dataset, batch_size=batch_size)
        test_loader = None
    model = _PCA()
    train_loss, test_loss = model.fit(
        train_loader, dim=dim, test_loader=test_loader)
    transformed = _transform(model, data, dataset, batch_size, whiten)
    return transformed, train_loss, test_loss
Пример #2
0
 def __call__(self):
     """Run the benchmark suite: a reference pass, PCA on lag-0 data, then
     TICA and AE runs for every configured transformation lag."""
     data, ref_data, dsc_data = self.wrapper(**self.wrapper_args)
     outputs = [self.reference(dsc_data)]
     dataset_0 = _create_dataset(data, lag=0)
     loader_0 = _DataLoader(dataset_0, batch_size=self.batch_size)
     test_0, train_0 = _stride_split(dataset_0, stride=3)
     train_loader_0 = _DataLoader(
         train_0, batch_size=self.batch_size, shuffle=True)
     test_loader_0 = _DataLoader(test_0, batch_size=self.batch_size)
     outputs.append(
         self.pca(train_loader_0, test_loader_0, loader_0, ref_data))
     for lag in self.trns_lags:
         lagged = _create_dataset(data, lag=lag)
         test_l, train_l = _stride_split(lagged, stride=3)
         train_loader = _DataLoader(
             train_l, batch_size=self.batch_size, shuffle=True)
         test_loader = _DataLoader(test_l, batch_size=self.batch_size)
         # TICA first, then AE, matching the reporting order downstream.
         for method in (self.tica, self.ae):
             outputs.append(
                 method(train_loader, test_loader, loader_0, ref_data, lag))
     return outputs
Пример #3
0
def vae(
    data, dim=None, lag=1, n_epochs=50, validation_split=None,
    batch_size=100, whiten=False, pin_memory=False, **kwargs):
    '''Reduce dimensionality with a time-lagged variational autoencoder.

    A deep (or shallow) time-lagged variational autoencoder network is
    trained and its first half (the encoder stage) is used to transform the
    supplied data.

    Arguments:
        data (numpy-ndarray of list thereof): the data to be transformed
        dim (int): the target dimensionality
        lag (int): specifies the lag in time steps
        n_epochs (int): number of training epochs
        validation_split (float): fraction of the data reserved for validation
        batch_size (int): specify a batch size for the minibatch process
        whiten (boolean): set to True to whiten the transformed data
        pin_memory (boolean): make DataLoaders return pinned memory

    Returns:
        (numpy.ndarray of list thereof): the transformed data
        (list of float): training loss
        (list of float): validation loss
    '''
    # Model hyperparameters; any entry can be overridden via **kwargs.
    hyper = {
        'hid_size': [100], 'beta': 1.0, 'dropout': 0.5, 'alpha': 0.01,
        'prelu': False, 'bias': True, 'lr': 0.001, 'cuda': False,
        'non_blocking': False}
    hyper.update(kwargs)
    try:
        n_features = data.shape[1]
    except AttributeError:
        # data is a list/tuple of arrays; use the first trajectory.
        n_features = data[0].shape[1]
    dataset_0 = _create_dataset(data, lag=0)
    dataset_lag = _create_dataset(data, lag=lag)
    if validation_split is not None:
        held_out, kept = _random_block_split(
            dataset_lag, lag, f_active=validation_split)
        train_loader = _DataLoader(
            kept, batch_size=batch_size, pin_memory=pin_memory)
        test_loader = _DataLoader(
            held_out, batch_size=batch_size, pin_memory=pin_memory)
    else:
        train_loader = _DataLoader(
            dataset_lag, batch_size=batch_size, pin_memory=pin_memory)
        test_loader = None
    model = _VAE(n_features, dim, **hyper)
    train_loss, test_loss = model.fit(
        train_loader, n_epochs, test_loader=test_loader)
    transformed = _transform(model, data, dataset_0, batch_size, whiten)
    return transformed, train_loss, test_loss
Пример #4
0
def vampnet(
    data, dim=None, lag=1, n_epochs=50, validation_split=None,
    batch_size=100, whiten=False, pin_memory=False, **kwargs):
    '''Use a vampnet model for dimensionality reduction and/or clustering.

    Arguments:
        data (numpy-ndarray of list thereof): the data to be transformed
        dim (int): the target dimensionality
        lag (int): specifies the lag in time steps
        n_epochs (int): number of training epochs
        validation_split (float): fraction of the data reserved for validation
        batch_size (int): specify a batch size for the minibatch process
        whiten (boolean): set to True to whiten the transformed data
        pin_memory (boolean): make DataLoaders return pinned memory

    Returns:
        (numpy.ndarray of list thereof): the transformed data
        (list of float): training score
        (list of float): validation score
    '''
    # Model hyperparameters; any entry can be overridden via **kwargs.
    hyper = {
        'hid_size': [100], 'dropout': 0.5, 'alpha': 0.01, 'prelu': False,
        'bias': True, 'lr': 0.001, 'cuda': False, 'non_blocking': False}
    hyper.update(kwargs)
    try:
        n_features = data.shape[1]
    except AttributeError:
        # data is a list/tuple of arrays; use the first trajectory.
        n_features = data[0].shape[1]
    dataset_0 = _create_dataset(data, lag=0)
    dataset_lag = _create_dataset(data, lag=lag)
    if validation_split is not None:
        held_out, kept = _random_block_split(
            dataset_lag, lag, f_active=validation_split)
        train_loader = _DataLoader(
            kept, batch_size=batch_size, pin_memory=pin_memory)
        test_loader = _DataLoader(
            held_out, batch_size=batch_size, pin_memory=pin_memory)
    else:
        train_loader = _DataLoader(
            dataset_lag, batch_size=batch_size, pin_memory=pin_memory)
        test_loader = None
    model = _VAMPNet(n_features, dim, **hyper)
    train_loss, test_loss = model.fit(
        train_loader, n_epochs, test_loader=test_loader)
    transformed = _transform(model, data, dataset_0, batch_size, whiten)
    # Convert the fitted losses into the scores promised by the docstring.
    train_score = [-loss for loss in train_loss]
    test_score = [-loss for loss in test_loss]
    return transformed, train_score, test_score
Пример #5
0
def make_dataloader(rpkm, tnf, batchsize=256, destroy=False, cuda=True):
    """Build a shuffled (depths, tnf) DataLoader from RPKM and TNF arrays.

    Rows whose TNF sums to zero (and, for multi-sample RPKM, whose total
    depth is zero) are dropped. Depths are normalized per row when there is
    more than one sample, otherwise z-scored; TNF is always z-scored.
    """
    keep = tnf.sum(axis=1) != 0

    multi_sample = rpkm.shape[1] > 1
    if multi_sample:
        depthssum = rpkm.sum(axis=1)
        keep &= depthssum != 0
        depthssum = depthssum[keep]

    if destroy:
        # Mask in place instead of copying the (potentially large) arrays.
        rpkm = numpy_inplace_maskarray(rpkm, keep)
        tnf = numpy_inplace_maskarray(tnf, keep)
    else:
        rpkm = rpkm[keep].astype(_np.float32, copy=False)
        tnf = tnf[keep].astype(_np.float32, copy=False)

    if multi_sample:
        rpkm /= depthssum.reshape((-1, 1))
    else:
        zscore(rpkm, axis=0, inplace=True)

    zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create the dataloader; use more workers when feeding a GPU.
    workers = 4 if cuda else 1
    dataset = _TensorDataset(depthstensor, tnftensor)
    return _DataLoader(dataset=dataset, batch_size=batchsize, drop_last=True,
                       shuffle=True, num_workers=workers, pin_memory=cuda)
Пример #6
0
    def __init__(self, dataset, batch_size: int, shuffle: bool = True, num_workers: int = 0, device="auto", collate_fn=_default_collate):
        """
        Wrap a dataset in a PyTorch DataLoader.

        :param dataset: The dataset to be wrapped. Only needs to implement list interface.
        :param batch_size: Number of samples per batch; incomplete final batches are dropped (drop_last=True below).
        :param shuffle: If the data should be shuffled. Forced off for iterable datasets.
        :param num_workers: The number of workers used for preloading.
        :param device: Stored on the instance; "auto" resolves to "cuda" when available, else "cpu".
        :param collate_fn: A function that converts numpy to tensor and batches inputs together.
        """
        self.dataset = dataset
        # Resolve the "auto" device request to a concrete device string.
        if device == "auto":
            if torch.cuda.is_available():
                device = "cuda"
            else:
                device = "cpu"
        self.device = device
        # Iterable datasets define their own iteration order; DataLoader
        # does not support shuffling them.
        if isinstance(dataset, _IterableDataset):
            shuffle = False
        self.native_dataloader = _DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            drop_last=True,
            pin_memory=True,
            collate_fn=collate_fn
        )
Пример #7
0
def tica(data,
         dim=None,
         lag=1,
         kinetic_map=True,
         symmetrize=False,
         validation_split=None,
         batch_size=100,
         whiten=False):
    '''Reduce dimensionality with time-lagged independent component analysis.

    A rank-d approximation to the Koopman operator is computed and used to
    rotate/project the data into a lower dimensional subspace.

    Arguments:
        data (numpy-ndarray of list thereof): the data to be transformed
        dim (int): the target dimensionality
        lag (int): specifies the lag in time steps
        kinetic_map (boolean): use the kinetic map variant of TICA
        symmetrize (boolean): enforce symmetry and reversibility
        validation_split (float): fraction of the data reserved for validation
        batch_size (int): specify a batch size for the minibatch process
        whiten (boolean): set to True to whiten the transformed data

    Returns:
        (numpy.ndarray of list thereof): the transformed data
        (float): training loss
        (float): validation loss
    '''
    dataset_0 = _create_dataset(data, lag=0)
    dataset_lag = _create_dataset(data, lag=lag)
    if validation_split is not None:
        held_out, kept = _random_block_split(
            dataset_lag, lag, f_active=validation_split)
        train_loader = _DataLoader(kept, batch_size=batch_size)
        test_loader = _DataLoader(held_out, batch_size=batch_size)
    else:
        train_loader = _DataLoader(dataset_lag, batch_size=batch_size)
        test_loader = None
    model = _TICA(kinetic_map=kinetic_map, symmetrize=symmetrize)
    train_loss, test_loss = model.fit(
        train_loader, dim=dim, test_loader=test_loader)
    transformed = _transform(model, data, dataset_0, batch_size, whiten)
    return transformed, train_loss, test_loss
Пример #8
0
def calculate_inception_score(path=None,
                              images=None,
                              dataset=None,
                              n_split=10,
                              eps=1E-16,
                              batch_size=2048,
                              device="GPU:0",
                              topK=None):
    """Compute the Inception Score (mean, std) over n_split image splits.

    Exactly one image source is used: if *images* is given it is used as-is;
    otherwise images are extracted from *dataset*, which is itself built from
    the image folder at *path* when not supplied.

    Arguments:
        path: image folder root for torchvision ImageFolder (only used when
            both images and dataset are None)
        images: pre-extracted images; when None they are loaded here and
            converted to a uint8 NHWC numpy array
        dataset: torch Dataset of images; wrapped in a DataLoader below
        n_split (int): number of equal splits to average the score over
        eps (float): numerical floor inside the logs to avoid log(0)
        batch_size (int): DataLoader batch size during image extraction
        device (str): TensorFlow device string used for model prediction
        topK: forwarded to _extract_from_dataset
            # NOTE(review): exact semantics defined by that helper - confirm

    Returns:
        (float, float): mean and standard deviation of per-split IS values
    """
    if images is None:
        if dataset is None:
            dataset = _datasets.ImageFolder(root=path,
                                            transform=_transforms.Compose([
                                                _transforms.Resize((299, 299)),
                                                _transforms.ToTensor(),
                                            ]))
        dataset = _DataLoader(dataset,
                              batch_size=batch_size,
                              drop_last=False,
                              num_workers=8)
        images = _extract_from_dataset(dataset, 0, topK=topK)
        # load inception v3 model
        # Convert CHW float tensors to NHWC uint8 numpy.
        # NOTE(review): the _tf.device("/CPU") context wraps torch/numpy ops,
        # which TensorFlow device placement does not affect - confirm intent.
        with _tf.device("/CPU"):
            images = images.permute(0, 2, 3,
                                    1).mul(255).data.numpy().astype(_np.uint8)
    model = _InceptionV3()
    # enumerate splits of images/predictions
    scores = list()
    n_part = images.shape[0] // n_split
    print("Starting to calculate inception score...", file=_sys.stderr)
    for i in range(n_split):
        # retrieve images
        ix_start, ix_end = i * n_part, (i + 1) * n_part
        subset = images[ix_start:ix_end]
        # convert from uint8 to float32
        subset = subset.astype('float32')
        # scale images to the required size
        # pre-process images, scale to [-1,1]
        with _tf.device(device):
            subset = _preprocess_input(subset)
            # predict p(y|x)
            p_yx = model.predict(subset)
        print(f"step[{i + 1}/{n_split}] 第 {1 + i} 轮计算完成", file=_sys.stderr)
        # calculate p(y)
        p_y = _np.expand_dims(p_yx.mean(axis=0), 0)
        # calculate KL divergence using log probabilities
        kl_d = p_yx * (_np.log(p_yx + eps) - _np.log(p_y + eps))
        # sum over classes
        sum_kl_d = kl_d.sum(axis=1)
        # average over images
        avg_kl_d = _np.mean(sum_kl_d)
        # undo the log
        is_score = _np.exp(avg_kl_d)
        # store
        scores.append(is_score)
    # average across images
    is_avg, is_std = _np.mean(scores), _np.std(scores)
    return is_avg, is_std
Пример #9
0
    def trainepoch(self, data_loader, epoch, optimizer, batchsteps, logfile):
        """Run one training epoch of the VAE.

        Arguments:
            data_loader: loader yielding (depths, tnf) minibatches
            epoch: zero-based epoch index (reported as epoch + 1)
            optimizer: optimizer updating this module's parameters
            batchsteps: collection of epoch indices at which the batch size
                is doubled (a fresh DataLoader over the same dataset)
            logfile: file-like object for progress output, or None to disable

        Returns:
            The (possibly rebuilt) data_loader so the caller can keep the
            doubled batch size for subsequent epochs.
        """
        self.train()

        epoch_loss = 0
        epoch_kldloss = 0
        epoch_sseloss = 0
        epoch_celoss = 0

        # Double the batch size at scheduled epochs by rebuilding the loader.
        if epoch in batchsteps:
            data_loader = _DataLoader(dataset=data_loader.dataset,
                                      batch_size=data_loader.batch_size * 2,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=data_loader.num_workers,
                                      pin_memory=data_loader.pin_memory)

        for depths_in, tnf_in in data_loader:
            depths_in.requires_grad = True
            tnf_in.requires_grad = True

            if self.usecuda:
                depths_in = depths_in.cuda()
                tnf_in = tnf_in.cuda()

            optimizer.zero_grad()

            depths_out, tnf_out, mu, logsigma = self(depths_in, tnf_in)

            loss, ce, sse, kld = self.calc_loss(depths_in, depths_out, tnf_in,
                                                tnf_out, mu, logsigma)

            loss.backward()
            optimizer.step()

            # Accumulate per-batch losses for epoch-level reporting below.
            epoch_loss += loss.data.item()
            epoch_kldloss += kld.data.item()
            epoch_sseloss += sse.data.item()
            epoch_celoss += ce.data.item()

        if logfile is not None:
            # Report per-batch averages (sums divided by number of batches).
            print(
                '\tEpoch: {}\tLoss: {:.6f}\tCE: {:.7f}\tSSE: {:.6f}\tKLD: {:.4f}\tBatchsize: {}'
                .format(
                    epoch + 1,
                    epoch_loss / len(data_loader),
                    epoch_celoss / len(data_loader),
                    epoch_sseloss / len(data_loader),
                    epoch_kldloss / len(data_loader),
                    data_loader.batch_size,
                ),
                file=logfile)

            logfile.flush()

        return data_loader
 def val(self):
     """Build an unshuffled DataLoader over the test clips, printing a
     short summary of the dataset first."""
     dataset = Test_Dataset(dic_test=self.dic_test_idx, opt=self.opt)
     if self.opt.Fusion:
         print('==> Testing data : %d clips,' % len(dataset),
               dataset[1][1][0].size(), dataset[1][1][1].size())
     else:
         print('==> Testing data : %d clips,' % len(dataset),
               dataset[1][1].size())
     return _DataLoader(dataset=dataset,
                        batch_size=self.BATCH_SIZE,
                        shuffle=False,
                        num_workers=self.num_workers)
Пример #11
0
    def get_loader(self, handle_key='', distributed=False, use_unpadded_sampler=False, **kw):
        """Build a DataLoader from layered configuration.

        Precedence (lowest to highest): self.args, per-handle overrides in
        self.dataloader_args[handle_key], then explicit **kw.

        Arguments:
            handle_key: key selecting an override dict in self.dataloader_args
            distributed: attach a distributed sampler when none was supplied
            use_unpadded_sampler: prefer UnPaddedDDPSampler over torch's
                DistributedSampler in the distributed case
            **kw: final overrides for any loader/sampler argument

        Returns:
            A DataLoader, or None when no dataset was configured.
        """
        # Merge configuration layers into a single args dict.
        args = {**self.args}
        args['distributed'] = distributed
        args['use_unpadded_sampler'] = use_unpadded_sampler
        args.update(self.dataloader_args.get(handle_key, {}))
        args.update(**kw)

        if args.get('dataset') is None:
            return None

        # Defaults mirroring torch DataLoader's signature; overridden below
        # by any matching key present in args.
        loader_args = {
            'dataset': None,
            'batch_size': 1,
            'sampler': None,
            'shuffle': False,
            'batch_sampler': None,
            'num_workers': 0,
            'pin_memory': False,
            'drop_last': False,
            'timeout': 0,
            'worker_init_fn': _multi.seed_worker if args.get('seed_all') else None
        }

        for k in loader_args.keys():
            loader_args[k] = args.get(k, loader_args.get(k))

        if loader_args['sampler'] is not None:
            """Sampler and shuffle are mutually exclusive"""
            loader_args['shuffle'] = False

        if args['distributed']:
            sampler_args = {
                'num_replicas': args.get('replicas'),
                'rank': args.get('rank'),
                'shuffle': args.get('shuffle'),
                'seed': args.get('seed')
            }

            # Only create a distributed sampler if the caller didn't pass one.
            if loader_args.get('sampler') is None:
                loader_args['shuffle'] = False  # Shuffle is mutually exclusive with sampler
                if args['use_unpadded_sampler']:
                    loader_args['sampler'] = UnPaddedDDPSampler(loader_args['dataset'], **sampler_args)
                else:
                    loader_args['sampler'] = _data.distributed.DistributedSampler(loader_args['dataset'],
                                                                                  **sampler_args)

            # Recompute workers/batch size for the distributed setting.
            loader_args['num_workers'] = _multi.num_workers(args, loader_args, True)
            loader_args['batch_size'] = _multi.batch_size(args, loader_args, True)

        return _DataLoader(collate_fn=_multi.safe_collate, **loader_args)
    def train(self):
        """Build a shuffled DataLoader over the training videos, printing a
        short summary of the dataset first."""
        dataset = Train_Dataset(dic_train=self.dic_video_train,
                                opt=self.opt)
        if self.opt.Fusion:
            print('==> Training data : %d videos,' % len(dataset),
                  dataset[1][1][0].size(), dataset[1][1][1].size())
        else:
            print('==> Training data : %d videos,' % len(dataset),
                  dataset[1][1].size())

        return _DataLoader(dataset=dataset,
                           batch_size=self.BATCH_SIZE,
                           shuffle=True,
                           num_workers=self.num_workers)
Пример #13
0
def _transform(model, data, data_0, batch_size, whiten, pin_memory=False):
    '''Project data_0 through the fitted model, optionally whiten, and
    re-split the result to mirror the structure of the original data.'''
    loader = _DataLoader(data_0, batch_size=batch_size, pin_memory=pin_memory)
    transformed = model.transform(loader)
    if whiten:
        transformed = _whiten_data(transformed)
    transformed = transformed.numpy()
    if not isinstance(data, (list, tuple)):
        return transformed
    # Split the flat array back into per-trajectory pieces.
    pieces = []
    start = 0
    for trajectory in data:
        stop = start + trajectory.shape[0]
        pieces.append(transformed[start:stop, :])
        start = stop
    return pieces
Пример #14
0
def cca(data_tensor_x, data_tensor_y, batch_size=100):
    '''Canonical correlation analysis of two data tensors.

    Arguments:
        data_tensor_x (Tensor): contains the first data tensor
        data_tensor_y (Tensor): contains the second data tensor
        batch_size (int): specify a batch size for the CCA calculation
    '''
    paired = _TensorDataset(data_tensor_x, data_tensor_y)
    loader = _DataLoader(paired, batch_size=batch_size)
    mean_x, mean_y = get_mean(loader)
    cxx, cxy, cyy = get_covariance(loader, mean_x, mean_y)
    left = get_sqrt_inverse(cxx)
    right = get_sqrt_inverse(cyy)
    return _torch.svd(left.mm(cxy).mm(right))
Пример #15
0
def whiten_data(data_tensor, batch_size=100):
    '''Whiten a Tensor in the PCA basis.

    Arguments:
        data_tensor (Tensor): contains the data you want to whiten
        batch_size (int): specify a batch size for the whitening process
    '''
    loader = _DataLoader(LaggedDataset(data_tensor, lag=0),
                         batch_size=batch_size)
    mean_x, mean_y = get_mean(loader)
    cxx, cxy, cyy = get_covariance(loader, mean_x, mean_y)
    rotation = get_sqrt_inverse(cxx)
    chunks = []
    # Center each batch in place, then rotate into the whitened basis.
    for batch, _ in loader:
        batch.sub_(mean_x[None, :])
        chunks.append(batch.mm(rotation))
    return _torch.cat(chunks)
Пример #16
0
    def trainepoch(self, data_loader, epoch, optimizer, batchsteps):
        """Run one training epoch of the VAE variant with explicit latent
        distributions (the model also returns q_z and p_z).

        Arguments:
            data_loader: loader yielding (depths, tnf) minibatches
            epoch: epoch index (printed as-is)
            optimizer: optimizer updating this module's parameters
            batchsteps: collection of epoch indices at which the batch size
                is doubled (a fresh DataLoader over the same dataset)

        Returns:
            The (possibly rebuilt) data_loader so the caller can keep the
            doubled batch size for subsequent epochs.
        """
        self.train()

        epoch_loss = 0
        epoch_kldloss = 0
        epoch_sseloss = 0
        epoch_celoss = 0

        # Double the batch size at scheduled epochs by rebuilding the loader.
        if epoch in batchsteps:
            data_loader = _DataLoader(dataset=data_loader.dataset,
                                      batch_size=data_loader.batch_size * 2,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=data_loader.num_workers,
                                      pin_memory=data_loader.pin_memory)

        for depths_in, tnf_in in data_loader:
            depths_in.requires_grad = True
            tnf_in.requires_grad = True

            if self.usecuda:
                depths_in = depths_in.cuda()
                tnf_in = tnf_in.cuda()

            optimizer.zero_grad()

            # mu and logsigma are returned but not passed to calc_loss here;
            # the loss uses the q_z/p_z distributions instead.
            depths_out, tnf_out, mu, logsigma, q_z, p_z = self(
                depths_in, tnf_in)

            loss, ce, sse, kld = self.calc_loss(depths_in, depths_out, tnf_in,
                                                tnf_out, q_z, p_z)

            loss.backward()
            optimizer.step()

            # Accumulate per-batch losses for the epoch summary below.
            epoch_loss += loss.data.item()
            epoch_kldloss += kld.data.item()
            epoch_sseloss += sse.data.item()
            epoch_celoss += ce.data.item()

        # Note: unlike the logfile variant, these are summed (not averaged)
        # per-epoch totals.
        print(
            "[Epoch %d] [epoch_loss: %f] [epoch_kldloss: %f] [epoch_sse: %f] [epoch_celoss: %f]"
            % (epoch, epoch_loss, epoch_kldloss, epoch_sseloss, epoch_celoss))

        return data_loader
Пример #17
0
    def encode(self, data_loader):
        """Encode a data loader to a latent representation with VAE

        Input: data_loader: As generated by train_vae

        Output: A (n_contigs x n_latent) Numpy array of latent repr.
        """

        self.eval()

        # Rebuild the loader without shuffling or batch dropping so output
        # rows line up with the dataset's original order and length.
        new_data_loader = _DataLoader(dataset=data_loader.dataset,
                                      batch_size=data_loader.batch_size,
                                      shuffle=False,
                                      drop_last=False,
                                      num_workers=1,
                                      pin_memory=data_loader.pin_memory)

        depths_array, tnf_array = data_loader.dataset.tensors
        length = len(depths_array)

        # We make a Numpy array instead of a Torch array because, if we create
        # a Torch array, then convert it to Numpy, Numpy will believe it doesn't
        # own the memory block, and array resizes will not be permitted.
        latent = _np.empty((length, self.nlatent), dtype=_np.float32)

        row = 0
        with _torch.no_grad():
            for depths, tnf in new_data_loader:
                # Move input to GPU if requested
                if self.usecuda:
                    depths = depths.cuda()
                    tnf = tnf.cuda()

                # Evaluate; only mu is kept (logsigma is unused here).
                out_depths, out_tnf, mu, logsigma = self(depths, tnf)

                if self.usecuda:
                    mu = mu.cpu()

                latent[row:row + len(mu)] = mu
                row += len(mu)

        assert row == length
        return latent
Пример #18
0
    def encode(self, data_loader):
        """Encode the contents of a data loader into the VAE latent space.

        Input: data_loader: the loader used during training
        Output: A (n_contigs x n_latent) Numpy array of latent vectors
        """

        self.eval()

        # Rebuild the loader without shuffling or batch dropping so output
        # rows line up with the dataset's original order and length.
        ordered_loader = _DataLoader(dataset=data_loader.dataset,
                                     batch_size=data_loader.batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=1,
                                     pin_memory=data_loader.pin_memory)

        depths_array, tnf_array = data_loader.dataset.tensors
        n_rows = len(depths_array)

        latent = _np.empty((n_rows, self.nlatent), dtype=_np.float32)

        row = 0
        with _torch.no_grad():
            for depths, tnf in ordered_loader:
                # Move input to GPU if requested
                if self.usecuda:
                    depths = depths.cuda()
                    tnf = tnf.cuda()

                # Evaluate; only mu is kept from the model outputs.
                out_depths, out_tnf, mu, logsigma, q_z, p_z = self(depths, tnf)

                if self.usecuda:
                    mu = mu.cpu()

                count = len(mu)
                latent[row:row + count] = mu
                row += count

        assert row == n_rows
        return latent
Пример #19
0
def make_dataloader(rpkm, tnf, batchsize=64, destroy=False, cuda=False):
    """Create a DataLoader and a contig mask from RPKM and TNF.

    The dataloader is an object feeding minibatches of contigs to the VAE.
    The data are normalized versions of the input datasets, with zero-contigs,
    i.e. contigs where a row in either TNF or RPKM are all zeros, removed.
    The mask is a boolean mask designating which contigs have been kept.

    Inputs:
        rpkm: RPKM matrix (N_contigs x N_samples)
        tnf: TNF matrix (N_contigs x 136)
        batchsize: Starting size of minibatches for dataloader
        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
        cuda: Pagelock memory of dataloader (use when using GPU acceleration)

    Outputs:
        DataLoader: An object feeding data to the VAE
        mask: A boolean mask of which contigs are kept

    Raises:
        ValueError: on malformed inputs, or when fewer sequences than
            batchsize survive filtering (with drop_last=True the loader
            would otherwise silently yield no batches at all).
    """

    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
        raise ValueError('TNF and RPKM must be Numpy arrays')

    if batchsize < 1:
        raise ValueError('Minimum batchsize of 1, not {}'.format(batchsize))

    if len(rpkm) != len(tnf):
        raise ValueError('Lengths of RPKM and TNF must be the same')

    if tnf.shape[1] != 136:
        raise ValueError('TNF must be 136 long along axis 1')

    # Keep only contigs with nonzero TNF and nonzero total depth.
    tnfsum = tnf.sum(axis=1)
    mask = tnfsum != 0
    del tnfsum
    depthssum = rpkm.sum(axis=1)
    mask &= depthssum != 0

    # With drop_last=True below, fewer kept sequences than one batch would
    # make the loader yield nothing - fail loudly instead (mirrors the
    # sibling make_dataloader implementation).
    if mask.sum() < batchsize:
        raise ValueError(
            'Fewer sequences left after filtering than the batch size.')

    if destroy:
        if not (rpkm.dtype == tnf.dtype == _np.float32):
            raise ValueError(
                'Arrays must be of data type np.float32 if destroy is True')

        rpkm = _vambtools.inplace_maskarray(rpkm, mask)
        tnf = _vambtools.inplace_maskarray(tnf, mask)
    else:
        rpkm = rpkm[mask].astype(_np.float32, copy=False)
        tnf = tnf[mask].astype(_np.float32, copy=False)

    depthssum = depthssum[mask]

    # Normalize arrays and create the Tensors
    rpkm /= depthssum.reshape((-1, 1))
    _vambtools.zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create dataloader
    dataset = _TensorDataset(depthstensor, tnftensor)
    dataloader = _DataLoader(dataset=dataset,
                             batch_size=batchsize,
                             drop_last=True,
                             shuffle=True,
                             num_workers=1,
                             pin_memory=cuda)

    return dataloader, mask
Пример #20
0
 def as_dataloader(self, **kwargs):
     """
     Return a PyTorch DataLoader view of this Dataset.

     All keyword arguments are forwarded to the DataLoader constructor.
     """
     return _DataLoader(dataset=self, **kwargs)
Пример #21
0
def make_dataloader(rpkm, tnf, batchsize=256, destroy=False, cuda=False):
    """Create a DataLoader and a contig mask from RPKM and TNF.

    The dataloader is an object feeding minibatches of contigs to the VAE.
    The data are normalized versions of the input datasets, with zero-contigs,
    i.e. contigs where a row in either TNF or RPKM are all zeros, removed.
    The mask is a boolean mask designating which contigs have been kept.

    Inputs:
        rpkm: RPKM matrix (N_contigs x N_samples)
        tnf: TNF matrix (N_contigs x N_TNF)
        batchsize: Starting size of minibatches for dataloader
        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
        cuda: Pagelock memory of dataloader (use when using GPU acceleration)

    Outputs:
        DataLoader: An object feeding data to the VAE
        mask: A boolean mask of which contigs are kept

    Raises:
        ValueError: on malformed inputs or when fewer sequences than
            batchsize survive filtering.
    """

    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
        raise ValueError('TNF and RPKM must be Numpy arrays')

    if batchsize < 1:
        raise ValueError('Minimum batchsize of 1, not {}'.format(batchsize))

    if len(rpkm) != len(tnf):
        raise ValueError('Lengths of RPKM and TNF must be the same')

    if not (rpkm.dtype == tnf.dtype == _np.float32):
        raise ValueError('TNF and RPKM must be Numpy arrays of dtype float32')

    mask = tnf.sum(axis=1) != 0

    # If multiple samples, also include nonzero depth as requirement for accept
    # of sequences
    if rpkm.shape[1] > 1:
        depthssum = rpkm.sum(axis=1)
        mask &= depthssum != 0
        depthssum = depthssum[mask]

    # drop_last=True below means a dataset smaller than one batch would
    # yield no batches at all.
    if mask.sum() < batchsize:
        raise ValueError(
            'Fewer sequences left after filtering than the batch size.')

    if destroy:
        rpkm = _vambtools.numpy_inplace_maskarray(rpkm, mask)
        tnf = _vambtools.numpy_inplace_maskarray(tnf, mask)
    else:
        # The astype operation does not copy due to "copy=False", but the masking
        # operation does.
        rpkm = rpkm[mask].astype(_np.float32, copy=False)
        tnf = tnf[mask].astype(_np.float32, copy=False)

    # If multiple samples, normalize to sum to 1, else zscore normalize
    if rpkm.shape[1] > 1:
        rpkm /= depthssum.reshape((-1, 1))
    else:
        _vambtools.zscore(rpkm, axis=0, inplace=True)

    # Normalize arrays and create the Tensors (the tensors share the underlying memory)
    # of the Numpy arrays
    _vambtools.zscore(tnf, axis=0, inplace=True)
    depthstensor = _torch.from_numpy(rpkm)
    tnftensor = _torch.from_numpy(tnf)

    # Create dataloader; use more workers when feeding a GPU.
    n_workers = 4 if cuda else 1
    dataset = _TensorDataset(depthstensor, tnftensor)
    dataloader = _DataLoader(dataset=dataset,
                             batch_size=batchsize,
                             drop_last=True,
                             shuffle=True,
                             num_workers=n_workers,
                             pin_memory=cuda)

    return dataloader, mask