Example #1
 def getLabeledMassif(self, pattern=None):
     if self._labeled_massif is None:
         with self._sem_label:
             if self._labeled_massif is None:
                 if pattern is None:
                     pattern = [[1] * 3] * 3  # all-ones 3x3 pattern (8-connectivity); a 4-connected alternative is [[0, 1, 0], [1, 1, 1], [0, 1, 0]]
                 logger.debug(
                     "Labeling all massifs. This takes some time !!!")
                 labeled_massif, self._number_massif = label(
                     (self.getBinnedData() > self.getBluredData()), pattern)
                 logger.info("Labeling found %s massifs." %
                             self._number_massif)
                 if logger.getEffectiveLevel() == logging.DEBUG:
                     fabio.edfimage.edfimage(data=labeled_massif).write(
                         "labeled_massif_small.edf")
                 relabeled = utils.relabel(labeled_massif,
                                           self.getBinnedData(),
                                           self.getBluredData())
                 if logger.getEffectiveLevel() == logging.DEBUG:
                     fabio.edfimage.edfimage(
                         data=relabeled).write("relabeled_massif_small.edf")
                 self._labeled_massif = unBinning(relabeled, self.binning,
                                                  False)
                 if logger.getEffectiveLevel() == logging.DEBUG:
                     fabio.edfimage.edfimage(data=self._labeled_massif
                                             ).write("labeled_massif.edf")
                 logger.info("Labeling found %s massifs." %
                             self._number_massif)
     return self._labeled_massif
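
A standalone sketch of the labeling step, assuming label here is scipy.ndimage.label: the all-ones 3x3 pattern gives 8-connectivity, so diagonally touching pixels fall into the same massif, while the default cross-shaped structure keeps them separate.

import numpy as np
from scipy.ndimage import label

mask = np.array([[1, 0, 0],
                 [0, 1, 0],
                 [0, 0, 1]], dtype=bool)

labeled_4, n_4 = label(mask)                   # default cross structure: 3 separate components
labeled_8, n_8 = label(mask, np.ones((3, 3)))  # all-ones 3x3 pattern: 1 component
print(n_4, n_8)  # 3 1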
Example #2
    def generate(self, N=None, batch_size=1):

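        # resample the CRP partition until it has at most 10 clusters,
        # since each cluster is mapped to a distinct MNIST digit class below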
        K = 11
        while K > 10:
            clusters, N, K = generate_CRP(self.params, N=N)

        data = torch.zeros([batch_size, N, 28, 28])

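        # clusters[0] is expected to be 0, so cumsum[k] is the start offset of cluster k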
        cumsum = np.cumsum(clusters)

        for i in range(batch_size):
            labels = np.random.choice(
                10, size=K, replace=False
            )  # this is a sample from the 'base measure' for each cluster
            for k in range(K):
                l = labels[k]
                nk = clusters[k + 1]
                inds = np.random.choice(self.label_data[l].shape[0],
                                        size=nk,
                                        replace=False)
                data[i, cumsum[k]:cumsum[k + 1], :, :] = self.label_data[l][
                    inds, :, :]

        cs = np.empty(N, dtype=np.int32)
        for k in range(K):
            cs[cumsum[k]:cumsum[k + 1]] = k

        arr = np.arange(N)
        np.random.shuffle(arr)
        cs = cs[arr]
        data = data[:, arr, :, :]
        cs = relabel(cs)

        return data, cs, clusters, K
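
The relabel helper applied to cs in these examples renumbers the cluster assignments so that labels appear in increasing order of first occurrence (as the comments in the later examples state). A minimal sketch of that behavior, as an assumption rather than the original implementation:

import numpy as np

def relabel_sketch(cs):
    # Renumber labels in order of first appearance, e.g. [2, 2, 0, 1, 0] -> [0, 0, 1, 2, 1].
    first_seen = {}
    out = np.empty(len(cs), dtype=np.int32)
    for i, c in enumerate(cs):
        if c not in first_seen:
            first_seen[c] = len(first_seen)
        out[i] = first_seen[c]
    return out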
Example #3
def watershed_center(image, center):
    distance = ndi.distance_transform_edt(image)
    markers, nr_blobs = ndi.label(center)
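    # note: in recent scikit-image releases, watershed lives in skimage.segmentation rather than skimage.morphology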
    labeled = morph.watershed(-distance, markers, mask=image)

    dropped, _ = ndi.label(image - (labeled > 0))
    dropped = np.where(dropped > 0, dropped + nr_blobs, 0)
    correct_labeled = dropped + labeled
    return relabel(correct_labeled)
Example #4
def drop_small(img, min_size):
    freqs = itemfreq(img)
    small_blob_id = freqs[freqs[:, 1] < min_size, 0]

    h, w = img.shape
    for i, j in product(range(h), range(w)):
        if img[i, j] in small_blob_id:
            img[i, j] = 0

    return relabel(img)
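
scipy.stats.itemfreq, used above to count label frequencies, was deprecated and has been removed from recent SciPy releases. A sketch of an equivalent, vectorized version based on np.unique (assuming the same relabel helper as above):

import numpy as np

def drop_small_fast(img, min_size):
    # Find label values whose pixel count is below min_size and zero them out.
    values, counts = np.unique(img, return_counts=True)
    small_blob_id = values[counts < min_size]
    img[np.isin(img, small_blob_id)] = 0
    return relabel(img)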
Example #5
    def generate(self, N=None, batch_size=1):

        lamb = self.params['lambda']
        sigma = self.params['sigma']
        x_dim = self.params['x_dim']

        clusters, N, num_clusters = generate_CRP(self.params, N=N)

        cumsum = np.cumsum(clusters)
        data = np.empty([batch_size, N, x_dim])
        cs = np.empty(N, dtype=np.int32)

        for i in range(num_clusters):
            mu = np.random.normal(0, lamb, size=[x_dim * batch_size, 1])
            samples = np.random.normal(
                mu, sigma, size=[x_dim * batch_size, clusters[i + 1]])

            samples = np.swapaxes(
                samples.reshape([batch_size, x_dim, clusters[i + 1]]), 1, 2)
            data[:, cumsum[i]:cumsum[i + 1], :] = samples
            cs[cumsum[i]:cumsum[i + 1]] = i + 1

        # shuffle the assignment order
        arr = np.arange(N)
        np.random.shuffle(arr)
        cs = cs[arr]

        data = data[:, arr, :]

        # relabel cluster numbers so that they appear in order
        cs = relabel(cs)

        # normalize data (only median-centering is applied below)
        #means = np.expand_dims(data.mean(axis=1),1 )
        medians = np.expand_dims(np.median(data, axis=1), 1)

        data = data - medians
        #data = 2*data/(maxs-mins)-1        #data point are now in [-1,1]

        return data, cs, clusters, num_clusters
Example #6
def main(args):

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    model = args.model
    params = get_parameters(model)
    params['device'] = torch.device("cuda:0" if args.cuda else "cpu")

    print(params['device'])

    dpmm = NeuralClustering(params).to(params['device'])
    data_generator = get_generator(params)

    #define containers to collect statistics
    losses = []  # NLLs
    accs = []  # Accuracy of the classification prediction
    perm_vars = []  # permutation variance

    it = 0  # iteration counter
    learning_rate = 1e-4
    weight_decay = 0.01
    optimizer = torch.optim.Adam(dpmm.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)

    perms = 6  # Number of permutations for each mini-batch.
    # In each permutation, the order of the datapoints is shuffled.

    batch_size = args.batch_size
    max_it = args.iterations

    if params['model'] == 'Gauss2D':
        if not os.path.isdir('saved_models/Gauss2D'):
            os.makedirs('saved_models/Gauss2D')
        if not os.path.isdir('figures/Gauss2D'):
            os.makedirs('figures/Gauss2D')

    elif params['model'] == 'MNIST':
        if not os.path.isdir('saved_models/MNIST'):
            os.makedirs('saved_models/MNIST')
        if not os.path.isdir('figures/MNIST'):
            os.makedirs('figures/MNIST')

    end_name = params['model']
    learning_rates = {1200: 5e-5, 2200: 1e-5}

    t_start = time.time()
    itt = it
    while True:

        it += 1

        if it == max_it:
            break

        dpmm.train()

        if it % args.plot_interval == 0:

            torch.cuda.empty_cache()
            plot_avgs(losses,
                      accs,
                      perm_vars,
                      50,
                      save_name='./figures/train_avgs_' + end_name + '.pdf')

            if params['model'] == 'Gauss2D':
                fig_name = './figures/Gauss2D/samples_2D_' + str(it) + '.pdf'
                print('\nCreating plot at ' + fig_name + '\n')
                plot_samples_2d(dpmm,
                                data_generator,
                                N=100,
                                seed=it,
                                save_name=fig_name)

            elif params['model'] == 'MNIST':
                fig_name = './figures/MNIST/samples_MNIST_' + str(it) + '.pdf'
                print('\nCreating plot at ' + fig_name + '\n')
                plot_samples_MNIST(dpmm,
                                   data_generator,
                                   N=20,
                                   seed=it,
                                   save_name=fig_name)

        if it % 100 == 0:
            if 'fname' in vars():
                os.remove(fname)
            dpmm.params['it'] = it
            fname = 'saved_models/' + end_name + '/' + end_name + '_' + str(
                it) + '.pt'
            torch.save(dpmm, fname)

        if it in learning_rates:
            optimizer = torch.optim.Adam(dpmm.parameters(),
                                         lr=learning_rates[it],
                                         weight_decay=weight_decay)

        data, cs, clusters, K = data_generator.generate(None, batch_size)
        N = data.shape[1]

        loss_values = np.zeros(perms)
        accuracies = np.zeros([N - 1, perms])

        # The memory requirements change in each iteration according to the random values of N and K.
        # If both N and K are big, an out of memory RuntimeError exception might be raised.
        # When this happens, we capture the exception, reduce the batch_size to 3/4 of its value, and try again.

        while True:
            try:

                loss = 0

                for perm in range(perms):
                    arr = np.arange(N)
                    np.random.shuffle(
                        arr
                    )  # permute the order in which the points are queried
                    cs = cs[arr]
                    data = data[:, arr, :]

                    cs = relabel(
                        cs
                    )  # this makes cluster labels appear in cs[] in increasing order

                    this_loss = 0
                    dpmm.previous_n = 0

                    for n in range(1, N):
                        # points up to (n-1) are already assigned, the point n is to be assigned

                        logprobs = dpmm(data, cs, n)
                        c = cs[n]
                        accuracies[n - 1, perm] = np.sum(
                            np.argmax(logprobs.detach().to('cpu').numpy(),
                                      axis=1) == c) / logprobs.shape[0]

                        this_loss -= logprobs[:, c].mean()

                    this_loss.backward(
                    )  # this accumulates the gradients for each permutation
                    loss_values[perm] = this_loss.item() / N
                    loss += this_loss

                perm_vars.append(loss_values.var())
                losses.append(loss.item() / N)
                accs.append(accuracies.mean())

                optimizer.step(
                )  # the gradients used in this step are the sum of the gradients for each permutation
                optimizer.zero_grad()


                print('{0:4d}  N:{1:2d}  K:{2}  Mean NLL:{3:.3f}   Mean Acc:{4:.3f}   Mean Permutation Variance: {5:.7f}  Mean Time/Iteration: {6:.1f}'\
                      .format(it, N, K , np.mean(losses[-50:]), np.mean(accs[-50:]), np.mean(perm_vars[-50:]), (time.time()-t_start)/(it - itt)    ))

                break

            except RuntimeError:
                bsize = int(.75 * data.shape[0])
                if bsize > 2:
                    print('RuntimeError handled  ', 'N:', N, ' K:', K,
                          'Trying batch size:', bsize)
                    data = data[:bsize, :, :]
                else:
                    break
Example #7
def drop_small(img, min_size):
    img = morph.remove_small_objects(img, min_size=min_size)
    return relabel(img)
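
A self-contained sketch of the remove_small_objects call on a labeled image (assuming morph is skimage.morphology): blobs smaller than min_size are zeroed out, after which relabel renumbers the survivors.

import numpy as np
from scipy import ndimage as ndi
from skimage import morphology as morph

mask = np.zeros((16, 16), dtype=bool)
mask[2:10, 2:10] = True    # 64-pixel blob, kept
mask[12:14, 12:14] = True  # 4-pixel blob, dropped
labels, _ = ndi.label(mask)
cleaned = morph.remove_small_objects(labels, min_size=10)
print(np.unique(cleaned))  # [0 1]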
Example #8
def main(opts, start_time=time.time()):

    # taxonomy (note: newer NumPy versions require allow_pickle=True for this np.load call)
    T = np.load(
        'taxonomy/{dataset}/taxonomy.npy'.format(dataset=opts.dataset)).item()
    utils.update_taxonomy(opts.method, T, opts.radius, start_time)

    # model
    data_dim = 2048  # feature dimension before softmax

    # top-down
    if opts.method == 'TD':
        model = models.TDModel(data_dim,
                               T['num_children'],
                               ns=opts.novel_score)

    # combined
    elif 'TD+' in opts.method:
        TDModel = models.TDModel(data_dim,
                                 T['num_children'],
                                 ns=opts.novel_score,
                                 relu=opts.test_relu,
                                 softmax=opts.softmax)
        FLModel = models.FLModel(sum(T['num_children']), len(T['wnids']))
        model = nn.Sequential(TDModel, FLModel)

    # flatten
    else:
        if opts.method == 'LOO' and opts.loo == 0.:
            model = models.FLModel(data_dim, len(T['wnids_leaf']))
        else:
            model = models.FLModel(data_dim, len(T['wnids']))

        # deep flatten
        if opts.num_layers > 0:
            model = nn.Sequential(
                models.DeepLinearReLU([data_dim] * (opts.num_layers + 1),
                                      no_last_relu=opts.no_last_relu), model)

    if opts.gpu: model = model.cuda()

    torch.backends.cudnn.benchmark = True

    # optimizer and scheduler
    model_parameters = FLModel.parameters(
    ) if 'TD+' in opts.method else model.parameters()
    if opts.batch_size > 0:
        optimizer = SGD(model_parameters,
                        lr=opts.lr,
                        weight_decay=opts.wd,
                        momentum=0.9)
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      factor=opts.lr_decay,
                                      patience=0,
                                      verbose=False,
                                      threshold=2e-2,
                                      threshold_mode='rel',
                                      cooldown=0,
                                      min_lr=0,
                                      eps=1e-8)
    else:  # full-batch
        optimizer = Adam(model_parameters, lr=opts.lr, weight_decay=opts.wd)
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      factor=opts.lr_decay,
                                      patience=10,
                                      verbose=False,
                                      threshold=1e-4,
                                      threshold_mode='rel',
                                      cooldown=0,
                                      min_lr=0,
                                      eps=1e-8)
        scheduler.cooldown_counter = opts.num_epochs // 10  # loss would increase in the first few epochs

    # loss
    loss_fn = models.TDLoss(
        T, opts) if opts.method == 'TD' else models.LOOLoss(T, opts)
    if opts.gpu: loss_fn = loss_fn.cuda()

    # save path
    save_path = utils.get_path(opts)
    print(save_path)

    # load the recent model
    epoch = utils.load_model(model, optimizer, scheduler, save_path,
                             opts.num_epochs, start_time)
    if ('TD+' in opts.method) and epoch == 0:
        td_path = 'train/{dataset}/{cnn}/{method}/{td_name}' \
                  .format(dataset=opts.dataset, cnn=opts.cnn, method='TD', td_name=opts.td_name)
        utils.load_model(TDModel, None, None, td_path, opts.num_epochs,
                         start_time)
    prev_epoch = 0 if opts.keep else epoch
    saved = True

    # data loader
    if opts.test: dtypes = ['train', 'val', 'known', 'novel']
    else: dtypes = ['train']
    if 'data_loader' in opts:
        print('data_loader exists; {time:8.3f} s'.format(time=time.time() -
                                                         start_time))
        data_loader = opts.data_loader
    else:
        data_loader = utils.get_feature_loader(dtypes, opts, start_time)
        opts.data_loader = data_loader

    # recover labels if relabeled previously
    if 'train' in data_loader and hasattr(data_loader['train'].dataset,
                                          'target_tensor_bak'):
        data_loader['train'].dataset.target_tensor = data_loader[
            'train'].dataset.target_tensor_bak
        del data_loader['train'].dataset.target_tensor_bak
        print('labels recovered')

    # relabel
    if 'RLB' in opts.method:
        dataset_path = 'datasets/{dataset}'.format(dataset=opts.dataset)
        relabels = utils.relabel(opts.relabel,
                                 data_loader['train'].dataset.target_tensor, T,
                                 opts.num_epochs, dataset_path, start_time)
        data_loader['train'].dataset.target_tensor_bak = data_loader[
            'train'].dataset.target_tensor

    # min lr ('ee' is a small constant defined elsewhere in the original module, not shown in this snippet)
    min_lr = opts.lr * (opts.lr_decay**opts.num_lr_decay) - ee

    print('{epoch:4d}/{num_epochs:4d} e; '.format(epoch=epoch,
                                                  num_epochs=opts.num_epochs),
          end='')
    print('start training; ', end='')
    print('{time:8.3f} s'.format(time=time.time() - start_time))
    for epoch in range(epoch + 1, opts.num_epochs + 1):

        # stopping criterion
        if optimizer.param_groups[0]['lr'] == 0.: break

        # train
        if 'RLB' in opts.method:
            data_loader['train'].dataset.target_tensor = relabels[epoch - 1]

        loss_val = train(data_loader['train'], model, loss_fn, optimizer,
                         epoch, T, opts, start_time)

        # lr decay
        if scheduler is not None:
            scheduler.step(loss_val)
            if optimizer.param_groups[0]['lr'] < min_lr:
                optimizer.param_groups[0]['lr'] = 0.

        # save model
        saved = False
        if opts.batch_size > 0 or epoch % opts.save_freq == 0:
            utils.save_model(model, optimizer, scheduler, save_path, epoch,
                             opts.num_epochs, prev_epoch, start_time)
            if not opts.keep: prev_epoch = epoch
            saved = True

    if not saved:
        utils.save_model(model, optimizer, scheduler, save_path, epoch,
                         opts.num_epochs, prev_epoch, start_time)

    print('{epoch:4d}/{num_epochs:4d} e; '.format(epoch=epoch,
                                                  num_epochs=opts.num_epochs),
          end='')
    print('training done; ', end='')
    print('{time:8.3f} s'.format(time=time.time() - start_time))

    # eval
    if opts.test:
        if opts.test_data_norm:
            save_path += '_tdn'
        if opts.method == 'TD':
            ths_opt = test.val_td(data_loader, model, T, opts, save_path,
                                  start_time)
            test.test_td(ths_opt['local'], data_loader, model, T, opts,
                         save_path, start_time)
        else:
            test.test('val', data_loader, model, T, opts, save_path,
                      start_time)
            test.test('test', data_loader, model, T, opts, save_path,
                      start_time)
Example #9
    def forward(self, data, cs, n):

        # n = 1, 2, ..., N-1
        # elements with index <= n-1 are already assigned
        # the element with index n is the one to be assigned now
        # elements with index > n are not yet assigned

        assert(n == self.previous_n+1)
        self.previous_n = self.previous_n + 1 

        K = len(set(cs[:n]))  # num of already created clusters

        if n==1:
            
            self.batch_size = data.shape[0]
            self.N = data.shape[1]
            assert (cs==relabel(cs)).all()

            
            
            if self.params['model'] == 'Gauss2D':
                # The data comes as a numpy vector
                data = torch.tensor(data).float().to(self.device)                    
                data = data.view([self.batch_size*self.N, self.params['x_dim']])

            elif self.params['model'] == 'MNIST':
                # The data comes as a torch tensor, we just move it to the device 
                data = data.to(self.device)    
                data = data.view([self.batch_size*self.N, 28,28])
                                
            
            self.hs = self.h(data).view([self.batch_size,self.N, self.h_dim])            
            self.Hs = torch.zeros([self.batch_size, 1, self.h_dim]).to(self.device)
            self.Hs[:,0,:] = self.hs[:,0,:]
            
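            # qs encode each point individually; Q aggregates the encodings of the
            # points that are not yet assigned (indices n+1 .. N-1 at each step)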
            self.qs = self.q(data).view([self.batch_size,self.N, self.h_dim])            
            self.Q = self.qs[:,2:,].sum(dim=1)     #[batch_size,h_dim]
            
            
            
        else:            
            if K == self.previous_K:            
                self.Hs[:, cs[n-1], :] += self.hs[:,n-1,:]
            else:
                self.Hs = torch.cat((self.Hs,self.hs[:,n-1,:].unsqueeze(1)), dim=1)


            if n==self.N-1:
                self.Q = torch.zeros([self.batch_size,self.h_dim]).to(self.device)    #[batch_size,h_dim]
                self.previous_n = 0
                
            else:
                self.Q -= self.qs[:,n,]
                
            
        self.previous_K = K
        
        assert self.Hs.shape[1] == K
        
        logprobs = torch.zeros([self.batch_size, K+1]).to(self.device)
            
        # loop over the K existing clusters for datapoint n to join
        for k in range(K):
            Hs2 = self.Hs.clone()
            Hs2[:,k,:] += self.hs[:,n,:]
            
            
            Hs2 = Hs2.view([self.batch_size*K, self.h_dim])                
            gs  = self.g(Hs2).view([self.batch_size, K, self.g_dim])
            Gk = gs.sum(dim=1)   #[batch_size,g_dim]

            uu = torch.cat((Gk,self.Q), dim=1)  #prepare argument for the call to f()
            logprobs[:,k] = torch.squeeze(self.f(uu))    
            
        
        # consider datapoint n creating a new cluster
        Hs2 = torch.cat((self.Hs,self.hs[:,n,:].unsqueeze(1)), dim=1)    
        Hs2 = Hs2.view([self.batch_size*(K+1), self.h_dim])                
    
        gs  = self.g(Hs2).view([self.batch_size, K+1, self.g_dim])
    
        Gk = gs.sum(dim=1)
    
        uu = torch.cat((Gk,self.Q), dim=1)   #prepare argument for the call to f()
        logprobs[:,K] = torch.squeeze(self.f(uu))    


        # Normalize
        m,_ = torch.max(logprobs,1, keepdim=True)        #[batch_size,1]
        logprobs = logprobs - m - torch.log( torch.exp(logprobs-m).sum(dim=1, keepdim=True))

        return logprobs
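
The final normalization is a numerically stable log-softmax over the K+1 candidate assignments. A small standalone sketch of the equivalence (using PyTorch):

import torch

logits = torch.randn(4, 7)
m, _ = torch.max(logits, 1, keepdim=True)
manual = logits - m - torch.log(torch.exp(logits - m).sum(dim=1, keepdim=True))
assert torch.allclose(manual, torch.log_softmax(logits, dim=1), atol=1e-6)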