Example #1
def test_basic(self):
    """
    Basic test to check that the calculation is sensible.
    """
    true_value1 = np.array([1, 2, 1, 2, 0, 0], dtype=np.int64)
    pred_value1 = np.array([2, 1, 2, 1, 0, 0], dtype=np.int64)
    self.assertAlmostEqual(
        cluster_accuracy(true_value1, pred_value1)[1], 1.0)
    self.assertAlmostEqual(
        cluster_accuracy(true_value1, pred_value1, 3)[1], 1.0)
    self.assertDictEqual(
        cluster_accuracy(true_value1, pred_value1)[0], {
            0: 0,
            1: 2,
            2: 1
        })
    true_value2 = np.array([1, 1, 1, 1, 1, 1], dtype=np.int64)
    pred_value2 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int64)
    self.assertAlmostEqual(
        cluster_accuracy(true_value2, pred_value2)[1], 1.0 / 6.0)
    self.assertAlmostEqual(
        cluster_accuracy(true_value2, pred_value2, 6)[1], 1.0 / 6.0)
    true_value3 = np.array([1, 3, 1, 3, 0, 2], dtype=np.int64)
    pred_value3 = np.array([2, 1, 2, 1, 3, 0], dtype=np.int64)
    self.assertDictEqual(
        cluster_accuracy(true_value3, pred_value3)[0], {
            2: 1,
            1: 3,
            3: 0,
            0: 2
        })
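
The assertions above pin down the contract of cluster_accuracy: it returns a (reassignment, accuracy) pair, where reassignment maps each predicted cluster index to its best-matching true label. A minimal sketch of such a function, assuming the optimal matching is computed with scipy.optimize.linear_sum_assignment (the repository's actual implementation may differ), could look like this:

import numpy as np
from scipy.optimize import linear_sum_assignment


def cluster_accuracy(y_true, y_predicted, cluster_number=None):
    # illustrative sketch, not necessarily the repository's implementation
    if cluster_number is None:
        # assume cluster indices and labels both run from 0 to k - 1
        cluster_number = max(y_predicted.max(), y_true.max()) + 1
    count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64)
    for predicted, true in zip(y_predicted, y_true):
        count_matrix[predicted, true] += 1
    # Hungarian algorithm: maximise the number of correctly matched samples
    row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix)
    reassignment = dict(zip(row_ind, col_ind))
    accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size
    return reassignment, accuracy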
Example #2
def main(data_dir, cuda, batch_size, pretrain_epochs, finetune_epochs,
         testing_mode):
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope

    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    ds_train = CachedMNIST(data_dir,
                           is_train=True,
                           device=device,
                           testing_mode=testing_mode)  # training dataset
    ds_val = CachedMNIST(data_dir,
                         is_train=False,
                         device=device,
                         testing_mode=testing_mode)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)

    autoencoder = autoencoder.to(device)
    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        device=device,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        silent=True,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 20000, gamma=0.1),
        corruption=0.2)
    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(ds_train,
             autoencoder,
             device=device,
             validation=ds_val,
             epochs=finetune_epochs,
             batch_size=batch_size,
             silent=True,
             optimizer=ae_optimizer,
             scheduler=StepLR(ae_optimizer, 20000, gamma=0.1),
             corruption=0.2,
             update_callback=training_callback)
    print('DEC stage.')
    model = DEC(cluster_number=10,
                embedding_dimension=28 * 28,
                hidden_dimension=10,
                encoder=autoencoder.encoder)

    model = model.to(device)
    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train(dataset=ds_train,
          model=model,
          epochs=20000,
          batch_size=256,
          silent=True,
          optimizer=dec_optimizer,
          stopping_delta=0.000001,
          cuda=cuda)
    predicted, actual = predict(ds_train,
                                model,
                                1024,
                                silent=True,
                                return_actual=True,
                                cuda=cuda)
    actual = actual.cpu().numpy()
    predicted = predicted.cpu().numpy()
    reassignment, accuracy = cluster_accuracy(actual, predicted)
    print('Final DEC accuracy: %s' % accuracy)
    if not testing_mode:
        predicted_reassigned = [reassignment[item]
                                for item in predicted]  # TODO numpify
        confusion = confusion_matrix(actual, predicted_reassigned)
        normalised_confusion = confusion.astype('float') / confusion.sum(
            axis=1)[:, np.newaxis]
        confusion_id = uuid.uuid4().hex
        sns.heatmap(normalised_confusion).get_figure().savefig(
            'confusion_%s.png' % confusion_id)
        print('Writing out confusion diagram with UUID: %s' % confusion_id)
        writer.close()
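
main() here is a plain function; a hypothetical invocation (the argument values below are illustrative, and the repository may instead wire them up through a command-line parser) might be:

if __name__ == '__main__':
    # hypothetical entry point; adjust paths and epoch counts as needed
    main(data_dir='./data',
         cuda=torch.cuda.is_available(),
         batch_size=256,
         pretrain_epochs=300,
         finetune_epochs=500,
         testing_mode=False)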
Example #3
def train(
        dataset: torch.utils.data.Dataset,
        model: torch.nn.Module,
        epochs: int,
        batch_size: int,
        optimizer: torch.optim.Optimizer,
        stopping_delta: Optional[float] = None,
        cuda: bool = True,
        sampler: Optional[torch.utils.data.sampler.Sampler] = None,
        silent: bool = True,
        update_freq: int = 10,
        evaluate_batch_size: int = 1024,
        update_callback: Optional[Callable[[float, float, float], None]] = None,
        epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None) -> None:
    """
    Train the DEC model given a dataset, a model instance and various configuration parameters.

    :param dataset: instance of Dataset to use for training
    :param model: instance of DEC model to train
    :param epochs: number of training epochs
    :param batch_size: size of the batch to train with
    :param optimizer: instance of optimizer to use
    :param stopping_delta: label delta as a proportion to use for stopping, None to disable, default None
    :param cuda: whether to use CUDA, defaults to True
    :param sampler: optional sampler to use in the DataLoader, defaults to None
    :param silent: set to True to prevent printing out summary statistics, defaults to True
    :param update_freq: frequency of batches with which to update counter, None disables, default 10
    :param evaluate_batch_size: batch size for evaluation stage, default 1024
    :param update_callback: optional function of accuracy, loss and label delta to call on update, default None
    :param epoch_callback: optional function of epoch and model, default None
    :return: None
    """
    static_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=False,
        sampler=sampler,
        shuffle=False
    )
    train_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        shuffle=True
    )
    data_iterator = tqdm(
        static_dataloader,
        leave=True,
        unit='batch',
        postfix={
            'epo': -1,
            'acc': '%.4f' % 0.0,
            'lss': '%.8f' % 0.0,
            'dlb': '%.4f' % -1,
        },
        disable=silent
    )
    kmeans = KMeans(n_clusters=model.cluster_number, n_init=20)
    model.train()
    features = []
    actual = []
    # form initial cluster centres
    for index, batch in enumerate(data_iterator):
        if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2:
            batch, value = batch  # if we have a prediction label, separate it to actual
            actual.append(value)
        if cuda:
            batch = batch.cuda(non_blocking=True)
        features.append(model.encoder(batch).detach().cpu())
        # features.append(model.encoder(batch))
    actual = torch.cat(actual).long()
    predicted = kmeans.fit_predict(torch.cat(features).numpy())
    predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long)
    _, accuracy = cluster_accuracy(predicted, actual.cpu().numpy())
    cluster_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float)
    if cuda:
        cluster_centers = cluster_centers.cuda(non_blocking=True)
    model.assignment.cluster_centers = torch.nn.Parameter(cluster_centers)
    loss_function = nn.KLDivLoss(reduction='sum')  # size_average=False is deprecated; reduction='sum' is equivalent
    delta_label = None
    for epoch in range(epochs):
        features = []
        data_iterator = tqdm(
            train_dataloader,
            leave=True,
            unit='batch',
            postfix={
                'epo': epoch,
                'acc': '%.4f' % (accuracy or 0.0),
                'lss': '%.8f' % 0.0,
                'dlb': '%.4f' % (delta_label or 0.0),
            },
            disable=silent,
        )
        model.train()
        for index, batch in enumerate(data_iterator):
            if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2:
                batch, _ = batch  # if we have a prediction label, strip it away
            if cuda:
                batch = batch.cuda(non_blocking=True)
            output = model(batch)
            target = target_distribution(output).detach()
            loss = loss_function(output.log(), target) / output.shape[0]
            data_iterator.set_postfix(
                epo=epoch,
                acc='%.4f' % (accuracy or 0.0),
                lss='%.8f' % float(loss.item()),
                dlb='%.4f' % (delta_label or 0.0),
            )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step(closure=None)
            features.append(model.encoder(batch).detach().cpu())
            # features.append(model.encoder(batch))
            if update_freq is not None and index % update_freq == 0:
                loss_value = float(loss.item())
                data_iterator.set_postfix(
                    epo=epoch,
                    acc='%.4f' % (accuracy or 0.0),
                    lss='%.8f' % loss_value,
                    dlb='%.4f' % (delta_label or 0.0),
                )
                if update_callback is not None:
                    update_callback(accuracy, loss_value, delta_label)
        predicted, actual = predict(dataset, model, evaluate_batch_size, silent=True, return_actual=True, cuda=cuda)
        delta_label = float((predicted != predicted_previous).float().sum().item()) / predicted_previous.shape[0]
        # if stopping_delta is not None and delta_label < stopping_delta:
        #     print('Early stopping as label delta "%1.5f" less than "%1.5f".' % (delta_label, stopping_delta))
        #     break
        predicted_previous = predicted
        _, accuracy = cluster_accuracy(predicted.cpu().numpy(), actual.cpu().numpy())
        data_iterator.set_postfix(
            epo=epoch,
            acc='%.4f' % (accuracy or 0.0),
            lss='%.8f' % 0.0,
            dlb='%.4f' % (delta_label or 0.0),
        )
        if epoch_callback is not None:
            epoch_callback(epoch, model)
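
The training loop above calls target_distribution(output) to build the auxiliary target P from the soft assignments Q. Following the DEC paper, p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j'), with cluster frequencies f_j = sum_i q_ij. A minimal sketch consistent with that formula (the repository's own helper may differ in details):

def target_distribution(batch: torch.Tensor) -> torch.Tensor:
    # batch: soft cluster assignments q_ij with shape (batch_size, cluster_number)
    # square the assignments, divide by the per-cluster frequency f_j = sum_i q_ij,
    # then renormalise each row so it sums to one
    weight = (batch ** 2) / torch.sum(batch, dim=0)
    return (weight.t() / torch.sum(weight, dim=1)).t()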
Example #4
def train(dataset: torch.utils.data.Dataset,
          wdec: torch.nn.Module,
          epochs: int,
          batch_size: int,
          optimizer: torch.optim.Optimizer,
          reinitKMeans: bool = True,
          scheduler = None, ###
          positive_ratio: float = 0.6, ###
          stopping_delta: Optional[float] = None,
          collate_fn = default_collate,
          cuda: bool = True,
          sampler: Optional[torch.utils.data.sampler.Sampler] = None,
          silent: bool = False,
          update_freq: int = 10,
          evaluate_batch_size: int = 1024,
          update_callback: Optional[Callable[[float, float, float], None]] = None,
          epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None,
          start_time: Optional[float] = None,           
          ) -> None:
    """
    Train the DEC model given a dataset, a model instance and various configuration parameters.

    :param dataset: instance of Dataset to use for training
    :param wdec: instance of the (weighted) DEC model to train
    :param epochs: number of training epochs
    :param batch_size: size of the batch to train with
    :param reinitKMeans: if True, the cluster centres are (re)initialised with KMeans before training
    :param optimizer: instance of optimizer to use
    :param scheduler: instance of lr_scheduler to use
    :param stopping_delta: label delta as a proportion to use for stopping, None to disable, default None
    :param collate_fn: function to merge a list of samples into mini-batch
    :param cuda: whether to use CUDA, defaults to True
    :param sampler: optional sampler to use in the DataLoader, defaults to None
    :param silent: set to True to prevent printing out summary statistics, defaults to False
    :param update_freq: frequency of batches with which to update counter, None disables, default 10
    :param evaluate_batch_size: batch size for evaluation stage, default 1024
    :param update_callback: optional function of accuracy, loss and label delta to call on update, default None
    :param epoch_callback: optional function of epoch and model, default None
    :param start_time: optional starting time of training process, default None
    :return: None
    """
    static_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        pin_memory=False,
        sampler=sampler,
        shuffle=False
    )
    train_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        sampler=sampler,
        shuffle=True
    )
    data_iterator = tqdm(
        static_dataloader,
        leave=True,
        unit='batch',
        postfix={
            'epo': -1,
            'acc': '%.4f' % 0.0,
            'lss': '%.8f' % 0.0,
            'dlb': '%.4f' % -1,
        },
        disable=silent
    )
    wdec.train()
    
    test_dataset(dataset)
    
    
    if reinitKMeans:
        # get all data needed for KMeans.
        if start_time is not None:
            print('\nLinearizing data')
            print(f'@ {time.time() - start_time}\n')
        features, actual, idxs, boxs, videos, frames = DataSetExtract(dataset, wdec)
               
        # KMeans.
        if start_time is not None:
            print('\nPerforming KMeans')
            print(f'@ {time.time() - start_time}\n')
        predicted, kmeans = SSKMeans(
            wdec, features, actual, idxs, boxs, videos, frames
        )
        # Computing the positive ratio scores and the positive ratio clusters
        cpr = PositiveRatioClusters(
            predicted, actual, wdec.assignment.cluster_number,
        )
        predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long)
        _, accuracy        = cluster_accuracy(predicted, actual.cpu().numpy())
        cluster_centers    = torch.tensor(
            kmeans.cluster_centers_,
            dtype=torch.float, requires_grad=True
        )
        predicted_idxed    = torch.cat(
            [idxs.reshape(-1,1), torch.tensor(predicted).reshape(-1,1).long()],
            dim = -1
        )
        del features, actual, idxs, boxs, videos, frames
        if cuda:
            wdec.cuda()
            cluster_centers = cluster_centers.cuda(non_blocking=True)
        with torch.no_grad():
            # initialise the cluster centers
            wdec.state_dict()['assignment.cluster_centers'].copy_(cluster_centers)
            # wdec.state_dict()['assignment.cluster_predicted'].copy_(predicted_idxed)
            # wdec.state_dict()['assignment.cluster_positive_ratio'].copy_(cpr)
            wdec.assignment.cluster_predicted = predicted_idxed.clone()
            wdec.assignment.cluster_positive_ratio = cpr.clone()
    else:
        predicted, actual = predict(
              dataset,
              wdec,
              batch_size=evaluate_batch_size,
              collate_fn=collate_fn,
              silent=True,
              return_actual=True,
              cuda=cuda
        )
        predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long)
        _, accuracy = cluster_accuracy(predicted.cpu().numpy(), actual.cpu().numpy())
        
    if start_time is not None:
        print('\ntraining DEC')
        print(f'@ {time.time() - start_time}\n')

    loss_function = nn.KLDivLoss(reduction='sum')  # size_average=False is deprecated; reduction='sum' is equivalent
    delta_label = None
    for epoch in range(epochs):
        # features = [] ### I see no use for this
        data_iterator = tqdm(
            train_dataloader,
            leave=True,
            unit='batch',
            postfix={
                'epo': epoch,
                'acc': '%.4f' % (accuracy or 0.0),
                'lss': '%.8f' % 0.0,
                'dlb': '%.4f' % (delta_label or 0.0),
            },
            disable=silent,
        )
        wdec.train()
        for index, batch in enumerate(data_iterator):
            if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 6:
                batch, actual, idxs, _, _, _ = batch  # if we have a prediction label, strip it away
            if cuda:
                batch  = batch.cuda(non_blocking=True)
                actual = actual.cuda()
                idxs   = idxs.cuda()
            output = wdec(batch, actual, idxs,)
            target = target_distribution(output).detach()
            loss   = loss_function(output.log(), target) / output.shape[0]
            data_iterator.set_postfix(
                epo = epoch,
                acc = '%.4f' % (accuracy or 0.0),
                lss = '%.8f' % float(loss.item()),
                dlb = '%.4f' % (delta_label or 0.0),
            )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step(closure=None)
            if scheduler is not None: scheduler.step()
            # features.append(model.encoder(batch).detach().cpu()) ### I see no use for this
            if update_freq is not None and index % update_freq == 0:
                loss_value = float(loss.item())
                data_iterator.set_postfix(
                    epo=epoch,
                    acc='%.4f' % (accuracy or 0.0),
                    lss='%.8f' % loss_value,
                    dlb='%.4f' % (delta_label or 0.0),
                )
                if update_callback is not None:
                    update_callback(accuracy, loss_value, delta_label)
        predicted, actual = predict(
            dataset,
            wdec,
            batch_size=evaluate_batch_size,
            collate_fn=collate_fn,
            silent=True,
            return_actual=True,
            cuda=cuda
        )
        delta_label = float((predicted != predicted_previous).float().sum().item()) / predicted_previous.shape[0]
        if stopping_delta is not None and delta_label < stopping_delta:
            print('Early stopping as label delta "%1.5f" less than "%1.5f".' % (delta_label, stopping_delta))
            break
        predicted_previous = predicted
        _, accuracy = cluster_accuracy(predicted.cpu().numpy(), actual.cpu().numpy())
        data_iterator.set_postfix(
            epo=epoch,
            acc='%.4f' % (accuracy or 0.0),
            lss='%.8f' % 0.0,
            dlb='%.4f' % (delta_label or 0.0),
        )
        if epoch_callback is not None:
            epoch_callback(epoch, wdec)
    wdec.cpu()
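
A hypothetical call of this variant (wdec_model and my_dataset are placeholder names, and the helpers used inside, such as DataSetExtract, SSKMeans and PositiveRatioClusters, are assumed to be defined elsewhere in the project) might look like:

# illustrative only; wdec_model and my_dataset are placeholders
dec_optimizer = SGD(wdec_model.parameters(), lr=0.01, momentum=0.9)
dec_scheduler = StepLR(dec_optimizer, step_size=100, gamma=0.1)
train(dataset=my_dataset,
      wdec=wdec_model,
      epochs=100,
      batch_size=256,
      optimizer=dec_optimizer,
      scheduler=dec_scheduler,
      reinitKMeans=True,
      stopping_delta=0.000001,
      cuda=torch.cuda.is_available(),
      start_time=time.time())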
Example #5
def main(cuda, batch_size, pretrain_epochs, finetune_epochs, testing_mode):
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope

    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "validation_loss": validation_loss,
            },
            epoch,
        )

    ds_train = CachedMNIST(train=True, cuda=cuda,
                           testing_mode=testing_mode)  # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda,
                         testing_mode=testing_mode)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)
    if cuda:
        autoencoder.cuda()
    print("Pretraining stage.")
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2,
    )
    print("Training stage.")
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=finetune_epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
        corruption=0.2,
        update_callback=training_callback,
    )
    print("DEC stage.")
    model = DEC(cluster_number=10,
                hidden_dimension=10,
                encoder=autoencoder.encoder)
    if cuda:
        model.cuda()
    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train(
        dataset=ds_train,
        model=model,
        epochs=100,
        batch_size=256,
        optimizer=dec_optimizer,
        stopping_delta=0.000001,
        cuda=cuda,
    )
    predicted, actual = predict(ds_train,
                                model,
                                1024,
                                silent=True,
                                return_actual=True,
                                cuda=cuda)
    actual = actual.cpu().numpy()
    predicted = predicted.cpu().numpy()
    reassignment, accuracy = cluster_accuracy(actual, predicted)
    print("Final DEC accuracy: %s" % accuracy)
    if not testing_mode:
        predicted_reassigned = [reassignment[item]
                                for item in predicted]  # TODO numpify
        confusion = confusion_matrix(actual, predicted_reassigned)
        normalised_confusion = (confusion.astype("float") /
                                confusion.sum(axis=1)[:, np.newaxis])
        confusion_id = uuid.uuid4().hex
        sns.heatmap(normalised_confusion).get_figure().savefig(
            "confusion_%s.png" % confusion_id)
        print("Writing out confusion diagram with UUID: %s" % confusion_id)
        writer.close()
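
predict() is used above (and in the other examples) to obtain hard cluster assignments from the trained model; a minimal sketch, assuming it simply takes the argmax of the model's soft assignments over the dataset (the library's actual function may expose more options), could be:

def predict(dataset, model, batch_size=1024, cuda=True, return_actual=False, silent=True):
    # illustrative sketch, not necessarily the library's implementation;
    # silent is accepted for interface compatibility, this sketch shows no progress bar
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch in dataloader:
            if isinstance(batch, (tuple, list)) and len(batch) == 2:
                batch, actual = batch  # keep the ground-truth label if the dataset provides one
                actuals.append(actual)
            if cuda:
                batch = batch.cuda(non_blocking=True)
            # hard assignment = argmax over the soft cluster assignment
            predictions.append(model(batch).argmax(dim=1).cpu())
    predicted = torch.cat(predictions)
    if return_actual:
        return predicted, torch.cat(actuals).long()
    return predicted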