Example #1
0
 def fit(self, X, y=None):
     """Pretrain and finetune the stacked denoising autoencoder on X.

     :param X: dense or scipy-sparse array of shape (n_samples, n_features)
     :param y: ignored, present for scikit-learn API compatibility
     :return: self
     """
     if issparse(X):
         X = X.todense()
     ds = TensorDataset(torch.from_numpy(X.astype(np.float32)))
     self.autoencoder = StackedDenoisingAutoEncoder(self.dimensions, final_activation=self.final_activation)
     if self.cuda:
         self.autoencoder.cuda()
     ae.pretrain(
         ds,
         self.autoencoder,
         cuda=self.cuda,
         epochs=self.pretrain_epochs,
         batch_size=self.batch_size,
         optimizer=self.optimiser_pretrain,
         scheduler=self.scheduler,
         corruption=self.corruption,  # bug fix: was hard-coded to 0.2, ignoring the configured value
         silent=True
     )
     ae_optimizer = self.optimiser_train(self.autoencoder)
     ae.train(
         ds,
         self.autoencoder,
         cuda=self.cuda,
         epochs=self.finetune_epochs,
         batch_size=self.batch_size,
         optimizer=ae_optimizer,
         # bug fix: guard the scheduler factory, which may legitimately be None
         scheduler=self.scheduler(ae_optimizer) if self.scheduler is not None else None,
         corruption=self.corruption,
         silent=True
     )
     return self
Example #2
0
def pretrain(
    dataset,
    autoencoder: StackedDenoisingAutoEncoder,
    epochs: int,
    batch_size: int,
    optimizer: Callable[[torch.nn.Module], torch.optim.Optimizer],
    scheduler: Optional[Callable[[torch.optim.Optimizer], Any]] = None,
    validation: Optional[torch.utils.data.Dataset] = None,
    corruption: Optional[float] = None,
    cuda: bool = True,
    sampler: Optional[torch.utils.data.sampler.Sampler] = None,
    silent: bool = False,
    update_freq: Optional[int] = 1,
    update_callback: Optional[Callable[[float, float], None]] = None,
    num_workers: Optional[int] = None,
    epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None,
) -> None:
    """
    Greedy layer-wise pretraining of a stacked denoising autoencoder.

    Each (encoder, decoder) pair of the stack is trained in turn as a small
    denoising autoencoder; once a stage finishes, its trained weights are
    copied back into the stack and the dataset is passed through the new
    encoder to produce the training input for the next stage. For simplicity
    the accuracy is reported on the training data only; if the dataset yields
    (feature, prediction) pairs, the prediction part is stripped away.

    :param dataset: Dataset instance used for training
    :param autoencoder: the stacked autoencoder whose layers are pretrained
    :param epochs: number of training epochs per subautoencoder
    :param batch_size: training batch size
    :param optimizer: callable mapping a model to its optimizer
    :param scheduler: callable mapping an optimizer to a scheduler, or None to disable
    :param validation: Dataset instance used for validation, or None
    :param corruption: masking-corruption proportion, or None to disable
    :param cuda: whether CUDA is used, defaults to True
    :param sampler: sampler to use in the DataLoader, defaults to None
    :param silent: set to True to prevent printing out summary statistics
    :param update_freq: batch frequency for counter updates, None disables
    :param update_callback: callback receiving (loss, validation loss)
    :param num_workers: optional number of workers for data loading
    :param epoch_callback: callback receiving (epoch, model)
    :return: None
    """
    stage_data = dataset
    stage_validation = validation
    stage_count = len(autoencoder.dimensions) - 1
    for stage in range(stage_count):
        encoder, decoder = autoencoder.get_stack(stage)
        is_final_stage = stage == (stage_count - 1)
        # the innermost subautoencoder is always trained without corruption
        if is_final_stage:
            corruption = None
        sub_autoencoder = DenoisingAutoencoder(
            embedding_dimension=autoencoder.dimensions[stage],
            hidden_dimension=autoencoder.dimensions[stage + 1],
            activation=None if is_final_stage else torch.nn.ReLU(),
            corruption=None if corruption is None else nn.Dropout(corruption),
        )
        if cuda:
            sub_autoencoder = sub_autoencoder.cuda()
        stage_optimizer = optimizer(sub_autoencoder)
        stage_scheduler = None if scheduler is None else scheduler(stage_optimizer)
        train(
            stage_data,
            sub_autoencoder,
            epochs,
            batch_size,
            stage_optimizer,
            validation=stage_validation,
            corruption=None,  # corruption is already applied by the Dropout module above
            scheduler=stage_scheduler,
            cuda=cuda,
            sampler=sampler,
            silent=silent,
            update_freq=update_freq,
            update_callback=update_callback,
            num_workers=num_workers,
            epoch_callback=epoch_callback,
        )
        # write the trained weights back into the full stack
        sub_autoencoder.copy_weights(encoder, decoder)
        if is_final_stage:
            # last stage: release the working datasets (minor optimisation)
            stage_data = None
            stage_validation = None
        else:
            # encode once so the next stage trains on this stage's features
            stage_data = TensorDataset(
                predict(
                    stage_data,
                    sub_autoencoder,
                    batch_size,
                    cuda=cuda,
                    silent=silent,
                )
            )
            if stage_validation is not None:
                stage_validation = TensorDataset(
                    predict(
                        stage_validation,
                        sub_autoencoder,
                        batch_size,
                        cuda=cuda,
                        silent=silent,
                    )
                )
Example #3
0
def main(data_dir, cuda, batch_size, pretrain_epochs, finetune_epochs,
         testing_mode):
    """Train an SDAE (pretrain + finetune) then DEC on MNIST and report accuracy.

    :param data_dir: directory containing the (cached) MNIST data
    :param cuda: passed through to the DEC train()/predict() stages
    :param batch_size: batch size for autoencoder training
    :param pretrain_epochs: epochs for layer-wise SDAE pretraining
    :param finetune_epochs: epochs for end-to-end SDAE finetuning
    :param testing_mode: when True, skips the confusion-matrix output
    """
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope

    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    # NOTE(review): the `cuda` parameter is ignored here -- the device is
    # auto-detected -- yet `cuda` is still passed to train()/predict() below.
    # Confirm the two mechanisms agree.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    ds_train = CachedMNIST(data_dir,
                           is_train=True,
                           device=device,
                           testing_mode=testing_mode)  # training dataset
    ds_val = CachedMNIST(data_dir,
                         is_train=False,
                         device=device,
                         testing_mode=testing_mode)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)

    autoencoder = autoencoder.to(device)
    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        device=device,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        silent=True,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 20000, gamma=0.1),
        corruption=0.2)
    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(ds_train,
             autoencoder,
             device=device,
             validation=ds_val,
             epochs=finetune_epochs,
             batch_size=batch_size,
             silent=True,
             optimizer=ae_optimizer,
             scheduler=StepLR(ae_optimizer, 20000, gamma=0.1),
             corruption=0.2,
             update_callback=training_callback)
    print('DEC stage.')
    model = DEC(cluster_number=10,
                embedding_dimension=28 * 28,
                hidden_dimension=10,
                encoder=autoencoder.encoder)

    model = model.to(device)
    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train(dataset=ds_train,
          model=model,
          epochs=20000,
          batch_size=256,
          silent=True,
          optimizer=dec_optimizer,
          stopping_delta=0.000001,
          cuda=cuda)
    predicted, actual = predict(ds_train,
                                model,
                                1024,
                                silent=True,
                                return_actual=True,
                                cuda=cuda)
    actual = actual.cpu().numpy()
    predicted = predicted.cpu().numpy()
    reassignment, accuracy = cluster_accuracy(actual, predicted)
    print('Final DEC accuracy: %s' % accuracy)
    if not testing_mode:
        predicted_reassigned = [reassignment[item]
                                for item in predicted]  # TODO numpify
        confusion = confusion_matrix(actual, predicted_reassigned)
        normalised_confusion = confusion.astype('float') / confusion.sum(
            axis=1)[:, np.newaxis]
        confusion_id = uuid.uuid4().hex
        sns.heatmap(normalised_confusion).get_figure().savefig(
            'confusion_%s.png' % confusion_id)
        print('Writing out confusion diagram with UUID: %s' % confusion_id)
    # bug fix: close the writer unconditionally -- it previously leaked in
    # testing mode because close() sat inside the `if not testing_mode` branch
    writer.close()
Example #4
0
 def setUpClass(cls):
     """Create a shared autoencoder and DEC model once for the whole test class."""
     # SDAE with layer sizes 100 -> 50 -> 5; its encoder feeds the DEC model.
     cls.ae = StackedDenoisingAutoEncoder([100, 50, 5])
     # Positional args presumably map to (cluster_number=2, embedding_dimension=100,
     # hidden_dimension=5, encoder) as in the keyword call sites -- TODO confirm.
     cls.dec = DEC(2, 100, 5, cls.ae.encoder)
Example #5
0
 def setUpClass(cls):
     """Build a shared autoencoder whose layer sizes run from 10 down to 5."""
     cls.dimensions = list(range(10, 4, -1))  # [10, 9, 8, 7, 6, 5]
     cls.ae = StackedDenoisingAutoEncoder(cls.dimensions)
Example #6
0
def main(cuda, batch_size, pretrain_epochs, finetune_epochs):
    """Train an SDAE on MNIST, then cluster the encoded features with k-Means.

    :param cuda: whether to run on CUDA
    :param batch_size: batch size for autoencoder training
    :param pretrain_epochs: epochs for layer-wise pretraining
    :param finetune_epochs: epochs for end-to-end finetuning
    """
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    ds_train = CachedMNIST(train=True, cuda=cuda)  # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)
    if cuda:
        autoencoder.cuda()
    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2)
    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(ds_train,
             autoencoder,
             cuda=cuda,
             validation=ds_val,
             epochs=finetune_epochs,
             batch_size=batch_size,
             optimizer=ae_optimizer,
             scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
             corruption=0.2,
             update_callback=training_callback)
    print('k-Means stage')
    dataloader = DataLoader(ds_train, batch_size=1024, shuffle=False)
    kmeans = KMeans(n_clusters=10, n_init=20)
    autoencoder.eval()
    features = []
    actual = []
    for index, batch in enumerate(dataloader):
        if (isinstance(batch, tuple)
                or isinstance(batch, list)) and len(batch) == 2:
            batch, value = batch  # if we have a prediction label, separate it to actual
            actual.append(value)
        if cuda:
            # bug fix: `async` became a reserved keyword in Python 3.7, so
            # batch.cuda(async=True) is a SyntaxError; non_blocking=True is
            # the replacement keyword argument.
            batch = batch.cuda(non_blocking=True)
        batch = batch.squeeze(1).view(batch.size(0), -1)
        features.append(autoencoder.encoder(batch).detach().cpu())
    actual = torch.cat(actual).long().cpu().numpy()
    predicted = kmeans.fit_predict(torch.cat(features).numpy())
    # NOTE(review): other call sites in this file use
    # cluster_accuracy(actual, predicted) -- confirm the intended order here.
    reassignment, accuracy = cluster_accuracy(predicted, actual)
    print('Final k-Means accuracy: %s' % accuracy)
    predicted_reassigned = [reassignment[item]
                            for item in predicted]  # TODO numpify
    confusion = confusion_matrix(actual, predicted_reassigned)
    normalised_confusion = confusion.astype('float') / confusion.sum(
        axis=1)[:, np.newaxis]
    confusion_id = uuid.uuid4().hex
    sns.heatmap(normalised_confusion).get_figure().savefig('confusion_%s.png' %
                                                           confusion_id)
    print('Writing out confusion diagram with UUID: %s' % confusion_id)
    writer.add_embedding(
        torch.cat(features),
        metadata=predicted,
        label_img=ds_train.ds.train_data.float().unsqueeze(1),  # TODO bit ugly
        tag='predicted')
    writer.close()
Example #7
0
class SDAETransformerBase(TransformerMixin, BaseEstimator):
    """Scikit-learn compatible wrapper around a stacked denoising autoencoder.

    fit() greedily pretrains each layer and then fine-tunes the whole
    autoencoder; score() reports the total reconstruction MSE over X.
    """

    def __init__(self,
                 dimensions: List[int],
                 cuda: Optional[bool] = None,
                 batch_size: int = 256,
                 pretrain_epochs: int = 200,
                 finetune_epochs: int = 500,
                 corruption: Optional[float] = 0.2,
                 optimiser_pretrain: Callable[[torch.nn.Module], torch.optim.Optimizer] = lambda x: SGD(x.parameters(), lr=0.1, momentum=0.9),
                 optimiser_train: Callable[[torch.nn.Module], torch.optim.Optimizer] = lambda x: SGD(x.parameters(), lr=0.1, momentum=0.9),
                 scheduler: Optional[Callable[[torch.optim.Optimizer], Any]] = lambda x: StepLR(x, 100, gamma=0.1),
                 final_activation: Optional[torch.nn.Module] = None) -> None:
        """
        :param dimensions: autoencoder layer sizes, input dimension first
        :param cuda: use CUDA; None auto-detects via torch.cuda.is_available()
        :param batch_size: batch size for training and scoring
        :param pretrain_epochs: epochs for layer-wise pretraining
        :param finetune_epochs: epochs for end-to-end finetuning
        :param corruption: masking-corruption proportion, None disables
        :param optimiser_pretrain: factory mapping a model to its pretraining optimizer
        :param optimiser_train: factory mapping a model to its finetuning optimizer
        :param scheduler: factory mapping an optimizer to a scheduler, or None
        :param final_activation: activation on the decoder output, or None
        """
        # auto-detect CUDA availability when the caller does not specify
        self.cuda = torch.cuda.is_available() if cuda is None else cuda
        self.batch_size = batch_size
        self.dimensions = dimensions
        self.pretrain_epochs = pretrain_epochs
        self.finetune_epochs = finetune_epochs
        self.optimiser_pretrain = optimiser_pretrain
        self.optimiser_train = optimiser_train
        self.scheduler = scheduler
        self.corruption = corruption
        self.autoencoder = None  # populated by fit()
        self.final_activation = final_activation

    def fit(self, X, y=None):
        """Pretrain and finetune the autoencoder on X; y is ignored.

        :param X: dense or scipy-sparse array of shape (n_samples, n_features)
        :param y: ignored, present for scikit-learn API compatibility
        :return: self
        """
        if issparse(X):
            X = X.todense()
        ds = TensorDataset(torch.from_numpy(X.astype(np.float32)))
        self.autoencoder = StackedDenoisingAutoEncoder(self.dimensions, final_activation=self.final_activation)
        if self.cuda:
            self.autoencoder.cuda()
        ae.pretrain(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            epochs=self.pretrain_epochs,
            batch_size=self.batch_size,
            optimizer=self.optimiser_pretrain,
            scheduler=self.scheduler,
            corruption=self.corruption,  # bug fix: was hard-coded to 0.2, ignoring the configured value
            silent=True
        )
        ae_optimizer = self.optimiser_train(self.autoencoder)
        ae.train(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            epochs=self.finetune_epochs,
            batch_size=self.batch_size,
            optimizer=ae_optimizer,
            # bug fix: guard the scheduler factory, which may legitimately be None
            scheduler=self.scheduler(ae_optimizer) if self.scheduler is not None else None,
            corruption=self.corruption,
            silent=True
        )
        return self

    def score(self, X, y=None, sample_weight=None) -> float:
        """Return the summed per-batch reconstruction MSE of X.

        NOTE(review): scikit-learn's score() convention is higher-is-better;
        this returns a loss (lower is better) -- confirm callers expect that.

        :param X: dense or scipy-sparse array of shape (n_samples, n_features)
        :param y: ignored
        :param sample_weight: ignored
        :raises NotFittedError: if fit() has not been called
        :return: sum of mean-squared reconstruction errors over all batches
        """
        loss_function = torch.nn.MSELoss()
        if self.autoencoder is None:
            raise NotFittedError
        if issparse(X):
            X = X.todense()
        self.autoencoder.eval()
        ds = TensorDataset(torch.from_numpy(X.astype(np.float32)))
        dataloader = DataLoader(
            ds,
            batch_size=self.batch_size,
            shuffle=False
        )
        loss = 0
        for index, batch in enumerate(dataloader):
            batch = batch[0]
            if self.cuda:
                batch = batch.cuda(non_blocking=True)
            output = self.autoencoder(batch)
            loss += float(loss_function(output, batch).item())
        return loss
Example #8
0
    print('got dataset', flush=True)
    ds_train.output = 2

    # pretrain
    pretrain_epochs = 300
    finetune_epochs = 500
    training_callback = None
    cuda = torch.cuda.is_available()
    ds_val = None
    embedded_dim = get_embedded_dim()

    try:
        autoencoder = pickle.load(open(autoencoder_path, 'rb'))
    except:
        autoencoder = StackedDenoisingAutoEncoder(
            dimensions=[embedded_dim, 500, 500, 2000, 10],
            final_activation=None,
        )
        if cuda:
            autoencoder.cuda()

        print('SDAE Pretraining stage.', flush=True)
        print(f'@ {time.time() - start_time}\n', flush=True)
        ae.pretrain(
            ds_train,
            autoencoder,
            cuda=cuda,
            validation=ds_val,
            epochs=pretrain_epochs,
            batch_size=batch_size,
            optimizer=lambda model: SGD(
                model.parameters(), lr=0.1, momentum=0.9),
    def get_opt(model, lr=args.pretrain_lr):
        """Build an SGD optimizer (momentum 0.9) for *model*.

        The default learning rate is captured from the enclosing `args`
        at definition time.
        """
        return torch.optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)

    def get_sched(opt):
        """Build a StepLR scheduler decaying *opt*'s learning rate every epoch.

        The decay factor comes from the enclosing `args.lr_step`.
        """
        return torch.optim.lr_scheduler.StepLR(optimizer=opt,
                                               step_size=1,
                                               gamma=args.lr_step,
                                               last_epoch=-1)

    print("Loading Data ...")
    sys.stdout.flush()
    dataset = get_dataset(args)

    validation = None

    ae = SDAE([dataset.dims] + args.layers)

    timestamp = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    print("Pretraining ...")
    sys.stdout.flush()
    # pretrain
    ptsdae.model.pretrain(dataset,
                          autoencoder=ae,
                          epochs=args.pretrain_epochs,
                          batch_size=args.batch_size,
                          optimizer=get_opt,
                          scheduler=get_sched,
                          validation=validation,
                          update_freq=args.pretrain_epochs // 50,
                          cuda=True,
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

from sc_dm.datasets import *

import torch

from ptsdae.sdae import StackedDenoisingAutoEncoder as SDAE

if __name__ == '__main__':
    # Load a precomputed PCA embedding, encode it with a trained SDAE,
    # cluster with DBSCAN and project to 2-D with t-SNE for plotting.
    # #############################################################################
    # NOTE(review): sys.argv[1] is read here as a dataset tag but reused below
    # as the model path -- looks like leftover from the commented-out lines;
    # confirm the intended CLI contract.
    dset = sys.argv[1]
    #raw_data = DuoBenchmark('data/datasets/'+dset+'.csv')
    raw_data = FromPickle('data/embeddings/mouse-pca-15000-log1p-True.pickle')
    model = SDAE([raw_data.dims, 7500, 500, 2000, 50])
    #model.load_state_dict(torch.load('data/models/'+dset+'.pt'))
    model.load_state_dict(torch.load(sys.argv[1]))
    # legacy torch 0.3.x requires wrapping tensors in Variable
    if int(torch.__version__.split('.')[1]) == 3:
        var = torch.autograd.variable.Variable(torch.Tensor(raw_data.data))
    else:
        var = torch.Tensor(raw_data.data)
    # encode the data into the SDAE's latent space
    embedding = model.encoder(var).data.numpy()

    labels = DBSCAN().fit(embedding).labels_

    tsne_embedding = TSNE(n_components=2).fit_transform(embedding)

    # #############################################################################

    plt_file = 'data/plots/mouse_SDAE.pdf'
Example #11
0
def main(cuda, batch_size, pretrain_epochs, finetune_epochs, testing_mode):
    """Train an SDAE (pretrain + finetune) then DEC on MNIST and report accuracy.

    :param cuda: whether to run on CUDA
    :param batch_size: batch size for autoencoder training
    :param pretrain_epochs: epochs for layer-wise SDAE pretraining
    :param finetune_epochs: epochs for end-to-end SDAE finetuning
    :param testing_mode: when True, skips the confusion-matrix output
    """
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope

    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "validation_loss": validation_loss,
            },
            epoch,
        )

    ds_train = CachedMNIST(train=True, cuda=cuda,
                           testing_mode=testing_mode)  # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda,
                         testing_mode=testing_mode)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)
    if cuda:
        autoencoder.cuda()
    print("Pretraining stage.")
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2,
    )
    print("Training stage.")
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=finetune_epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
        corruption=0.2,
        update_callback=training_callback,
    )
    print("DEC stage.")
    # NOTE(review): unlike the other example, no embedding_dimension is passed
    # here -- confirm this matches the DEC constructor in use.
    model = DEC(cluster_number=10,
                hidden_dimension=10,
                encoder=autoencoder.encoder)
    if cuda:
        model.cuda()
    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train(
        dataset=ds_train,
        model=model,
        epochs=100,
        batch_size=256,
        optimizer=dec_optimizer,
        stopping_delta=0.000001,
        cuda=cuda,
    )
    predicted, actual = predict(ds_train,
                                model,
                                1024,
                                silent=True,
                                return_actual=True,
                                cuda=cuda)
    actual = actual.cpu().numpy()
    predicted = predicted.cpu().numpy()
    reassignment, accuracy = cluster_accuracy(actual, predicted)
    print("Final DEC accuracy: %s" % accuracy)
    if not testing_mode:
        predicted_reassigned = [reassignment[item]
                                for item in predicted]  # TODO numpify
        confusion = confusion_matrix(actual, predicted_reassigned)
        normalised_confusion = (confusion.astype("float") /
                                confusion.sum(axis=1)[:, np.newaxis])
        confusion_id = uuid.uuid4().hex
        sns.heatmap(normalised_confusion).get_figure().savefig(
            "confusion_%s.png" % confusion_id)
        print("Writing out confusion diagram with UUID: %s" % confusion_id)
    # bug fix: close the writer unconditionally -- it previously leaked in
    # testing mode because close() sat inside the `if not testing_mode` branch
    writer.close()
Example #12
0
            # Load one benchmark dataset, then embed it with every matching
            # pretrained SDAE and pickle the resulting embeddings.
            ds_path = os.path.join('data/datasets', ds_name + '.csv')
            dataset = DuoBenchmark(ds_path, log1p=log, split_head=False)

            for scale in [True]:
                # Do scaling second as the function will
                # overwrite the existing data
                # yes - yes I know this is bad design but it's too late now
                mlist = model_dict[ds_name][log][scale]
                # Given all of the pre-existing conditions ...
                # cycle through each of the models that match this criteria
                for model in mlist:
                    # each entry is (filename, layers); `model` is rebound to
                    # the SDAE instance further down, shadowing the loop variable
                    filename = model[0]
                    print(filename)
                    if scale:
                        scale_dataset(dataset)
                    # get parameter information
                    model_path = os.path.join(model_dir, filename)
                    layers = model[1]
                    # prepare the model (weights loaded onto CPU)
                    model = SDAE([dataset.dims] + layers)
                    model.load_state_dict(
                        torch.load(model_path, map_location='cpu'))
                    # generate the embedding
                    inputs = torch.Tensor(dataset.data)
                    embedding = model.encoder(inputs).data.numpy()
                    # save the embedding
                    with open(
                            os.path.join('data/sdae_embeddings',
                                         filename + '.pickle'), 'wb') as fh:
                        pickle.dump(embedding, fh, protocol=4)