def pretrain(
    dataset,
    autoencoder: StackedDenoisingAutoEncoder,
    epochs: int,
    batch_size: int,
    optimizer: Callable[[torch.nn.Module], torch.optim.Optimizer],
    scheduler: Optional[Callable[[torch.optim.Optimizer], Any]] = None,
    validation: Optional[torch.utils.data.Dataset] = None,
    corruption: Optional[float] = None,
    cuda: bool = True,
    sampler: Optional[torch.utils.data.sampler.Sampler] = None,
    silent: bool = False,
    update_freq: Optional[int] = 1,
    update_callback: Optional[Callable[[float, float], None]] = None,
    num_workers: Optional[int] = None,
    epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None,
) -> None:
    """
    Given an autoencoder, train it using the data provided in the dataset; for simplicity
    the accuracy is reported only on the training dataset. If the training dataset is a
    2-tuple or list of (feature, prediction), then the prediction is stripped away.

    :param dataset: instance of Dataset to use for training
    :param autoencoder: instance of an autoencoder to train
    :param epochs: number of training epochs
    :param batch_size: batch size for training
    :param corruption: proportion of masking corruption to apply, set to None to disable, defaults to None
    :param optimizer: function taking model and returning optimizer
    :param scheduler: function taking optimizer and returning scheduler, or None to disable
    :param validation: instance of Dataset to use for validation
    :param cuda: whether CUDA is used, defaults to True
    :param sampler: sampler to use in the DataLoader, defaults to None
    :param silent: set to True to prevent printing out summary statistics, defaults to False
    :param update_freq: frequency of batches with which to update counter, None disables, default 1
    :param update_callback: function of loss and validation loss to update
    :param num_workers: optional number of workers to use for data loading
    :param epoch_callback: function of epoch and model
    :return: None
    """
    current_dataset = dataset
    current_validation = validation
    number_of_subautoencoders = len(autoencoder.dimensions) - 1
    for index in range(number_of_subautoencoders):
        encoder, decoder = autoencoder.get_stack(index)
        embedding_dimension = autoencoder.dimensions[index]
        hidden_dimension = autoencoder.dimensions[index + 1]
        # manual override to prevent corruption for the last subautoencoder
        if index == (number_of_subautoencoders - 1):
            corruption = None
        # initialise the subautoencoder
        sub_autoencoder = DenoisingAutoencoder(
            embedding_dimension=embedding_dimension,
            hidden_dimension=hidden_dimension,
            activation=torch.nn.ReLU()
            if index != (number_of_subautoencoders - 1)
            else None,
            corruption=nn.Dropout(corruption) if corruption is not None else None,
        )
        if cuda:
            sub_autoencoder = sub_autoencoder.cuda()
        ae_optimizer = optimizer(sub_autoencoder)
        ae_scheduler = scheduler(ae_optimizer) if scheduler is not None else scheduler
        train(
            current_dataset,
            sub_autoencoder,
            epochs,
            batch_size,
            ae_optimizer,
            validation=current_validation,
            corruption=None,  # already have dropout in the DAE
            scheduler=ae_scheduler,
            cuda=cuda,
            sampler=sampler,
            silent=silent,
            update_freq=update_freq,
            update_callback=update_callback,
            num_workers=num_workers,
            epoch_callback=epoch_callback,
        )
        # copy the weights
        sub_autoencoder.copy_weights(encoder, decoder)
        # pass the dataset through the encoder part of the subautoencoder
        if index != (number_of_subautoencoders - 1):
            current_dataset = TensorDataset(
                predict(
                    current_dataset,
                    sub_autoencoder,
                    batch_size,
                    cuda=cuda,
                    silent=silent,
                )
            )
            if current_validation is not None:
                current_validation = TensorDataset(
                    predict(
                        current_validation,
                        sub_autoencoder,
                        batch_size,
                        cuda=cuda,
                        silent=silent,
                    )
                )
        else:
            current_dataset = None  # minor optimisation on the last subautoencoder
            current_validation = None
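# A minimal usage sketch of the greedy layer-wise pretraining routine above, followed by
# end-to-end fine-tuning. It assumes the package layout used elsewhere in these snippets
# (`ptsdae.model` exposing `pretrain`/`train`, `ptsdae.sdae` exposing
# `StackedDenoisingAutoEncoder`); the random data, dimensions, and epoch counts are
# purely illustrative.
import numpy as np
import torch
from torch.optim import SGD
from torch.utils.data import TensorDataset

import ptsdae.model as ae
from ptsdae.sdae import StackedDenoisingAutoEncoder

# illustrative data: 1000 samples with 100 features
X = np.random.rand(1000, 100).astype(np.float32)
ds = TensorDataset(torch.from_numpy(X))

autoencoder = StackedDenoisingAutoEncoder([100, 50, 10], final_activation=None)

# greedy layer-wise pretraining of each sub-autoencoder
ae.pretrain(
    ds,
    autoencoder,
    epochs=10,
    batch_size=64,
    optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
    corruption=0.2,
    cuda=False,
)

# end-to-end fine-tuning of the full stack
ae_optimizer = SGD(autoencoder.parameters(), lr=0.1, momentum=0.9)
ae.train(
    ds,
    autoencoder,
    epochs=10,
    batch_size=64,
    optimizer=ae_optimizer,
    corruption=0.2,
    cuda=False,
)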
def main(data_dir, cuda, batch_size, pretrain_epochs, finetune_epochs, testing_mode):
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    ds_train = CachedMNIST(data_dir, is_train=True, device=device, testing_mode=testing_mode)  # training dataset
    ds_val = CachedMNIST(data_dir, is_train=False, device=device, testing_mode=testing_mode)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10], final_activation=None)
    autoencoder = autoencoder.to(device)

    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        device=device,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        silent=True,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 20000, gamma=0.1),
        corruption=0.2)

    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(
        ds_train,
        autoencoder,
        device=device,
        validation=ds_val,
        epochs=finetune_epochs,
        batch_size=batch_size,
        silent=True,
        optimizer=ae_optimizer,
        scheduler=StepLR(ae_optimizer, 20000, gamma=0.1),
        corruption=0.2,
        update_callback=training_callback)

    print('DEC stage.')
    model = DEC(cluster_number=10,
                embedding_dimension=28 * 28,
                hidden_dimension=10,
                encoder=autoencoder.encoder)
    model = model.to(device)
    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train(dataset=ds_train,
          model=model,
          epochs=20000,
          batch_size=256,
          silent=True,
          optimizer=dec_optimizer,
          stopping_delta=0.000001,
          cuda=cuda)

    predicted, actual = predict(ds_train, model, 1024, silent=True, return_actual=True, cuda=cuda)
    actual = actual.cpu().numpy()
    predicted = predicted.cpu().numpy()
    reassignment, accuracy = cluster_accuracy(actual, predicted)
    print('Final DEC accuracy: %s' % accuracy)

    if not testing_mode:
        predicted_reassigned = [reassignment[item] for item in predicted]  # TODO numpify
        confusion = confusion_matrix(actual, predicted_reassigned)
        normalised_confusion = confusion.astype('float') / confusion.sum(axis=1)[:, np.newaxis]
        confusion_id = uuid.uuid4().hex
        sns.heatmap(normalised_confusion).get_figure().savefig('confusion_%s.png' % confusion_id)
        print('Writing out confusion diagram with UUID: %s' % confusion_id)
    writer.close()
def setUpClass(cls):
    cls.ae = StackedDenoisingAutoEncoder([100, 50, 5])
    cls.dec = DEC(2, 100, 5, cls.ae.encoder)
def setUpClass(cls):
    cls.dimensions = list(reversed(range(5, 11)))
    cls.ae = StackedDenoisingAutoEncoder(cls.dimensions)
def main(cuda, batch_size, pretrain_epochs, finetune_epochs):
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    ds_train = CachedMNIST(train=True, cuda=cuda)  # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10], final_activation=None)
    if cuda:
        autoencoder.cuda()

    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2)

    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=finetune_epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
        corruption=0.2,
        update_callback=training_callback)

    print('k-Means stage')
    dataloader = DataLoader(ds_train, batch_size=1024, shuffle=False)
    kmeans = KMeans(n_clusters=10, n_init=20)
    autoencoder.eval()
    features = []
    actual = []
    for index, batch in enumerate(dataloader):
        if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2:
            batch, value = batch  # if we have a prediction label, separate it to actual
            actual.append(value)
        if cuda:
            batch = batch.cuda(non_blocking=True)  # `async` was renamed to `non_blocking` in PyTorch 0.4
        batch = batch.squeeze(1).view(batch.size(0), -1)
        features.append(autoencoder.encoder(batch).detach().cpu())
    actual = torch.cat(actual).long().cpu().numpy()
    predicted = kmeans.fit_predict(torch.cat(features).numpy())
    reassignment, accuracy = cluster_accuracy(predicted, actual)
    print('Final k-Means accuracy: %s' % accuracy)

    predicted_reassigned = [reassignment[item] for item in predicted]  # TODO numpify
    confusion = confusion_matrix(actual, predicted_reassigned)
    normalised_confusion = confusion.astype('float') / confusion.sum(axis=1)[:, np.newaxis]
    confusion_id = uuid.uuid4().hex
    sns.heatmap(normalised_confusion).get_figure().savefig('confusion_%s.png' % confusion_id)
    print('Writing out confusion diagram with UUID: %s' % confusion_id)
    writer.add_embedding(
        torch.cat(features),
        metadata=predicted,
        label_img=ds_train.ds.train_data.float().unsqueeze(1),  # TODO bit ugly
        tag='predicted')
    writer.close()
class SDAETransformerBase(TransformerMixin, BaseEstimator):
    def __init__(self,
                 dimensions: List[int],
                 cuda: Optional[bool] = None,
                 batch_size: int = 256,
                 pretrain_epochs: int = 200,
                 finetune_epochs: int = 500,
                 corruption: Optional[float] = 0.2,
                 optimiser_pretrain: Callable[[torch.nn.Module], torch.optim.Optimizer] =
                 lambda x: SGD(x.parameters(), lr=0.1, momentum=0.9),
                 optimiser_train: Callable[[torch.nn.Module], torch.optim.Optimizer] =
                 lambda x: SGD(x.parameters(), lr=0.1, momentum=0.9),
                 scheduler: Optional[Callable[[torch.optim.Optimizer], Any]] =
                 lambda x: StepLR(x, 100, gamma=0.1),
                 final_activation: Optional[torch.nn.Module] = None) -> None:
        self.cuda = torch.cuda.is_available() if cuda is None else cuda
        self.batch_size = batch_size
        self.dimensions = dimensions
        self.pretrain_epochs = pretrain_epochs
        self.finetune_epochs = finetune_epochs
        self.optimiser_pretrain = optimiser_pretrain
        self.optimiser_train = optimiser_train
        self.scheduler = scheduler
        self.corruption = corruption
        self.autoencoder = None
        self.final_activation = final_activation

    def fit(self, X, y=None):
        if issparse(X):
            X = X.todense()
        ds = TensorDataset(torch.from_numpy(X.astype(np.float32)))
        self.autoencoder = StackedDenoisingAutoEncoder(self.dimensions, final_activation=self.final_activation)
        if self.cuda:
            self.autoencoder.cuda()
        ae.pretrain(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            epochs=self.pretrain_epochs,
            batch_size=self.batch_size,
            optimizer=self.optimiser_pretrain,
            scheduler=self.scheduler,
            corruption=0.2,
            silent=True
        )
        ae_optimizer = self.optimiser_train(self.autoencoder)
        ae.train(
            ds,
            self.autoencoder,
            cuda=self.cuda,
            epochs=self.finetune_epochs,
            batch_size=self.batch_size,
            optimizer=ae_optimizer,
            scheduler=self.scheduler(ae_optimizer),
            corruption=self.corruption,
            silent=True
        )
        return self

    def score(self, X, y=None, sample_weight=None) -> float:
        loss_function = torch.nn.MSELoss()
        if self.autoencoder is None:
            raise NotFittedError
        if issparse(X):
            X = X.todense()
        self.autoencoder.eval()
        ds = TensorDataset(torch.from_numpy(X.astype(np.float32)))
        dataloader = DataLoader(
            ds,
            batch_size=self.batch_size,
            shuffle=False
        )
        loss = 0
        for index, batch in enumerate(dataloader):
            batch = batch[0]
            if self.cuda:
                batch = batch.cuda(non_blocking=True)
            output = self.autoencoder(batch)
            loss += float(loss_function(output, batch).item())
        return loss
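# A minimal sketch (not one of the library's own examples) of driving the scikit-learn-style
# wrapper above. The class defines only `fit` and `score`; a `transform` method would come
# from a subclass. The toy data, dimensions, and epoch counts are purely illustrative.
import numpy as np

# toy data: 500 samples with 64 features, values in [0, 1]
X = np.random.rand(500, 64)

sdae = SDAETransformerBase(
    dimensions=[64, 32, 8],
    cuda=False,
    batch_size=64,
    pretrain_epochs=5,
    finetune_epochs=5,
)
sdae.fit(X)

# score() returns the reconstruction (MSE) loss summed over the batches of X
print('reconstruction loss:', sdae.score(X))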
print('got dataset', flush=True)
ds_train.output = 2

# pretrain
pretrain_epochs = 300
finetune_epochs = 500
training_callback = None
cuda = torch.cuda.is_available()
ds_val = None
embedded_dim = get_embedded_dim()
try:
    autoencoder = pickle.load(open(autoencoder_path, 'rb'))
except:
    autoencoder = StackedDenoisingAutoEncoder(
        dimensions=[embedded_dim, 500, 500, 2000, 10],
        final_activation=None,
    )
if cuda:
    autoencoder.cuda()
print('SDAE Pretraining stage.', flush=True)
print(f'@ {time.time() - start_time}\n', flush=True)
ae.pretrain(
    ds_train,
    autoencoder,
    cuda=cuda,
    validation=ds_val,
    epochs=pretrain_epochs,
    batch_size=batch_size,
    optimizer=lambda model: SGD(
        model.parameters(), lr=0.1, momentum=0.9),
def get_opt(model, lr=args.pretrain_lr):
    return torch.optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)


def get_sched(opt):
    return torch.optim.lr_scheduler.StepLR(optimizer=opt, step_size=1, gamma=args.lr_step, last_epoch=-1)


print("Loading Data ...")
sys.stdout.flush()
dataset = get_dataset(args)
validation = None
ae = SDAE([dataset.dims] + args.layers)
timestamp = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

print("Pretraining ...")
sys.stdout.flush()
# pretrain
ptsdae.model.pretrain(dataset,
                      autoencoder=ae,
                      epochs=args.pretrain_epochs,
                      batch_size=args.batch_size,
                      optimizer=get_opt,
                      scheduler=get_sched,
                      validation=validation,
                      update_freq=args.pretrain_epochs // 50,
                      cuda=True,
import sys

from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from sc_dm.datasets import *
import torch

from ptsdae.sdae import StackedDenoisingAutoEncoder as SDAE

if __name__ == '__main__':
    # #############################################################################
    dset = sys.argv[1]
    # raw_data = DuoBenchmark('data/datasets/'+dset+'.csv')
    raw_data = FromPickle('data/embeddings/mouse-pca-15000-log1p-True.pickle')
    model = SDAE([raw_data.dims, 7500, 500, 2000, 50])
    # model.load_state_dict(torch.load('data/models/'+dset+'.pt'))
    model.load_state_dict(torch.load(sys.argv[1]))
    if int(torch.__version__.split('.')[1]) == 3:
        var = torch.autograd.variable.Variable(torch.Tensor(raw_data.data))
    else:
        var = torch.Tensor(raw_data.data)
    embedding = model.encoder(var).data.numpy()
    labels = DBSCAN().fit(embedding).labels_
    tsne_embedding = TSNE(n_components=2).fit_transform(embedding)
    # #############################################################################
    plt_file = 'data/plots/mouse_SDAE.pdf'
def main(cuda, batch_size, pretrain_epochs, finetune_epochs, testing_mode):
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "validation_loss": validation_loss,
            },
            epoch,
        )

    ds_train = CachedMNIST(train=True, cuda=cuda, testing_mode=testing_mode)  # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda, testing_mode=testing_mode)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10], final_activation=None)
    if cuda:
        autoencoder.cuda()

    print("Pretraining stage.")
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2,
    )

    print("Training stage.")
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=finetune_epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
        corruption=0.2,
        update_callback=training_callback,
    )

    print("DEC stage.")
    model = DEC(cluster_number=10, hidden_dimension=10, encoder=autoencoder.encoder)
    if cuda:
        model.cuda()
    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train(
        dataset=ds_train,
        model=model,
        epochs=100,
        batch_size=256,
        optimizer=dec_optimizer,
        stopping_delta=0.000001,
        cuda=cuda,
    )

    predicted, actual = predict(ds_train, model, 1024, silent=True, return_actual=True, cuda=cuda)
    actual = actual.cpu().numpy()
    predicted = predicted.cpu().numpy()
    reassignment, accuracy = cluster_accuracy(actual, predicted)
    print("Final DEC accuracy: %s" % accuracy)

    if not testing_mode:
        predicted_reassigned = [reassignment[item] for item in predicted]  # TODO numpify
        confusion = confusion_matrix(actual, predicted_reassigned)
        normalised_confusion = (
            confusion.astype("float") / confusion.sum(axis=1)[:, np.newaxis]
        )
        confusion_id = uuid.uuid4().hex
        sns.heatmap(normalised_confusion).get_figure().savefig(
            "confusion_%s.png" % confusion_id
        )
        print("Writing out confusion diagram with UUID: %s" % confusion_id)
    writer.close()
ds_path = os.path.join('data/datasets', ds_name + '.csv')
dataset = DuoBenchmark(ds_path, log1p=log, split_head=False)
for scale in [True]:
    # Do scaling second as the function will
    # overwrite the existing data
    # yes - yes I know this is bad design but it's too late now
    mlist = model_dict[ds_name][log][scale]
    # Given all of the pre-existing conditions ...
    # cycle through each of the models that match this criteria
    for model in mlist:
        filename = model[0]
        print(filename)
        if scale:
            scale_dataset(dataset)
        # get parameter information
        model_path = os.path.join(model_dir, filename)
        layers = model[1]
        # prepare the model
        model = SDAE([dataset.dims] + layers)
        model.load_state_dict(
            torch.load(model_path, map_location='cpu'))
        # generate the embedding
        inputs = torch.Tensor(dataset.data)
        embedding = model.encoder(inputs).data.numpy()
        # save the embedding
        with open(
                os.path.join('data/sdae_embeddings', filename + '.pickle'),
                'wb') as fh:
            pickle.dump(embedding, fh, protocol=4)