from ptsdae.sdae import StackedDenoisingAutoEncoder as SDAE

if __name__ == '__main__':
    # #########################################################################
    # Encode a precomputed mouse PCA embedding with a trained SDAE, cluster
    # the encoding with DBSCAN, and visualise it as a 2-D t-SNE scatter plot.
    #
    # sys.argv[1]: path to the trained SDAE state dict (.pt file).
    raw_data = FromPickle('data/embeddings/mouse-pca-15000-log1p-True.pickle')

    model = SDAE([raw_data.dims, 7500, 500, 2000, 50])
    model.load_state_dict(torch.load(sys.argv[1]))
    model.eval()  # inference only: disable dropout / batch-norm updates

    # torch 0.3 requires inputs wrapped in autograd Variables; 0.4+ accepts
    # plain tensors.  Compare the full (major, minor) pair — the original
    # test looked only at the minor component, so torch 1.3 was mistaken
    # for 0.3.
    major, minor = torch.__version__.split('.')[:2]
    if (int(major), int(minor)) == (0, 3):
        var = torch.autograd.variable.Variable(torch.Tensor(raw_data.data))
    else:
        var = torch.Tensor(raw_data.data)

    embedding = model.encoder(var).data.numpy()
    labels = DBSCAN().fit(embedding).labels_
    tsne_embedding = TSNE(n_components=2).fit_transform(embedding)

    # #########################################################################
    plt_file = 'data/plots/mouse_SDAE.pdf'
    plt.scatter(tsne_embedding[:, 0], tsne_embedding[:, 1],
                c=labels, s=1, marker=',')
def main(cuda, batch_size, pretrain_epochs, finetune_epochs):
    """Train an SDAE on MNIST, then cluster its features with k-means.

    Runs greedy layer-wise pretraining, end-to-end finetuning (both logged
    to TensorBoard), then encodes the training set, fits k-means on the
    codes, and reports the cluster accuracy plus a confusion-matrix heatmap.

    Parameters
    ----------
    cuda : bool
        Move the model and data to the GPU when True.
    batch_size : int
        Mini-batch size for pretraining and finetuning.
    pretrain_epochs : int
        Epochs for the layer-wise pretraining stage.
    finetune_epochs : int
        Epochs for the end-to-end finetuning stage.
    """
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    ds_train = CachedMNIST(train=True, cuda=cuda)   # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda)    # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)
    if cuda:
        autoencoder.cuda()

    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2)

    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=finetune_epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
        corruption=0.2,
        update_callback=training_callback)

    print('k-Means stage')
    dataloader = DataLoader(ds_train, batch_size=1024, shuffle=False)
    kmeans = KMeans(n_clusters=10, n_init=20)
    autoencoder.eval()  # inference: disable dropout for feature extraction
    features = []
    actual = []
    for index, batch in enumerate(dataloader):
        if isinstance(batch, (tuple, list)) and len(batch) == 2:
            # if we have a prediction label, separate it to actual
            batch, value = batch
            actual.append(value)
        if cuda:
            # BUG FIX: `async` became a reserved keyword in Python 3.7
            # (SyntaxError); torch >= 0.4 names this argument
            # `non_blocking`, with identical semantics.
            batch = batch.cuda(non_blocking=True)
        batch = batch.squeeze(1).view(batch.size(0), -1)
        features.append(autoencoder.encoder(batch).detach().cpu())
    actual = torch.cat(actual).long().cpu().numpy()
    predicted = kmeans.fit_predict(torch.cat(features).numpy())
    reassignment, accuracy = cluster_accuracy(predicted, actual)
    print('Final k-Means accuracy: %s' % accuracy)

    # Map raw k-means cluster ids onto the best-matching true labels.
    predicted_reassigned = [reassignment[item] for item in predicted]  # TODO numpify
    confusion = confusion_matrix(actual, predicted_reassigned)
    # Row-normalise so each true class sums to 1 in the heatmap.
    normalised_confusion = (confusion.astype('float') /
                            confusion.sum(axis=1)[:, np.newaxis])
    confusion_id = uuid.uuid4().hex
    sns.heatmap(normalised_confusion).get_figure().savefig(
        'confusion_%s.png' % confusion_id)
    print('Writing out confusion diagram with UUID: %s' % confusion_id)
    writer.add_embedding(
        torch.cat(features),
        metadata=predicted,
        label_img=ds_train.ds.train_data.float().unsqueeze(1),  # TODO bit ugly
        tag='predicted')
    writer.close()
ds_path = os.path.join('data/datasets', ds_name + '.csv')
dataset = DuoBenchmark(ds_path, log1p=log, split_head=False)
# Do scaling second as the function will overwrite the existing data
# yes - yes I know this is bad design but it's too late now
for scale in [True]:
    # Given all of the pre-existing conditions, cycle through each of the
    # models that match this criteria.
    for model_spec in model_dict[ds_name][log][scale]:
        # BUG FIX: the original named this loop variable `model` and then
        # rebound `model = SDAE(...)` mid-iteration — it only worked
        # because `layers` was read before the rebinding.
        filename = model_spec[0]
        print(filename)
        if scale:
            # NOTE(review): this runs once per model and mutates `dataset`
            # in place each time — if scale_dataset is not idempotent the
            # data gets re-scaled repeatedly; confirm intent.
            scale_dataset(dataset)
        # get parameter information
        model_path = os.path.join(model_dir, filename)
        layers = model_spec[1]
        # prepare the model
        sdae = SDAE([dataset.dims] + layers)
        sdae.load_state_dict(torch.load(model_path, map_location='cpu'))
        # generate the embedding
        inputs = torch.Tensor(dataset.data)
        embedding = sdae.encoder(inputs).data.numpy()
        # save the embedding (protocol 4 supports large objects)
        out_path = os.path.join('data/sdae_embeddings', filename + '.pickle')
        with open(out_path, 'wb') as fh:
            pickle.dump(embedding, fh, protocol=4)