def simple_train(model, dl, dl2, lamb=0.001, pre_out=None):
    print(lamb)
    total_samples = len(dl.dataset)
    total_samples2 = len(dl2.dataset)
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters())
    sizes = []
    losses = []
    taccuracies = []
    accuracies = []
    best = (-np.inf, -np.inf)
    bm = None
    patience = 1

    def go(images):
        output = model(images)
        if pre_out is not None:
            output += Variable(pre_out(images).data, requires_grad=False)
        return output

    while True:
        print('epoch')
        los = 0
        accs = 0
        taccs = 0
        for i, (images, labels) in enumerate(dl):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            output = go(images)
            optimizer.zero_grad()
            l = criterion(output, labels)
            l2 = l + float(lamb) * model.loss()
            l2.backward()
            accs += tn((output.max(1)[1] == labels).float().sum().data)
            los += tn(l.data)  # Save the loss without the penalty
            optimizer.step()
        for i, (images, labels) in enumerate(dl2):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            output = go(images)
            taccs += tn((output.max(1)[1] == labels).float().sum().data)
        losses.append(los)
        accuracies.append(accs)
        taccuracies.append(taccs / total_samples2)
        sizes.append(tn(model.l0_loss().data))
        next_score = (-sizes[-1], taccuracies[-1])
        if best < next_score:
            best = next_score
            bm = copy.deepcopy(model)
            patience = 1
        else:
            patience += 1
            if patience >= 3:
                break
    return (bm, best[1], np.stack(sizes), np.stack(losses) / total_samples,
            np.stack(accuracies) / total_samples, np.stack(taccuracies))
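The helpers `wrap` and `tn` used throughout these examples are project-specific and not shown here; a plausible minimal sketch of what they presumably do (an assumption, the real versions may differ):

import torch

def wrap(variable):
    # Presumably moves a tensor/Variable onto the GPU when one is available.
    return variable.cuda() if torch.cuda.is_available() else variable

def tn(tensor):
    # Presumably "to number": pull a Python scalar out of a one-element tensor.
    return tensor.item() if hasattr(tensor, 'item') else float(tensor)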
Example #2
def forward(model, dataloader, config, class_weights, optimizer=None):
    accs = []
    losses = []
    is_classification = config['mode'] == 'classification'
    if is_classification:
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = torch.nn.MSELoss()
    lamb = config['params']['lambda']

    for i, (inputs, labels) in enumerate(dataloader):
        inputs = Variable(inputs.cuda(async=True),
                          volatile=(optimizer is None))
        labels = Variable(labels.cuda(async=True),
                          volatile=(optimizer is None))
        prediction = model(inputs)
        loss = criterion(prediction, labels)
        losses.append(tn(loss.data))
        penalty = float(lamb) * shrinknet_penalty(model)
        loss = loss + penalty
        if is_classification:
            discrete_prediction = prediction.max(1)[1]
            accuracy = (discrete_prediction == labels).float().data.mean()
            accs.append(accuracy)
        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            update_statistics(model)
    losses = np.array(losses).mean()
    if accs:
        return losses, np.array(accs).mean()
    return losses, losses
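This snippet uses pre-0.4 PyTorch idioms (`Variable`, `volatile=...`, `.cuda(async=True)`, the latter a syntax error on Python 3.7+). A rough modern restatement of the same transfer / no-grad pattern, offered as a standalone sketch rather than project code:

import torch
from torch import nn

def forward_batch(model, inputs, labels, criterion, optimizer=None):
    if torch.cuda.is_available():
        model.cuda()                              # modules are moved in place
        inputs = inputs.cuda(non_blocking=True)   # non_blocking replaces async
        labels = labels.cuda(non_blocking=True)
    if optimizer is None:
        with torch.no_grad():  # replaces volatile=True for evaluation passes
            return criterion(model(inputs), labels)
    loss = criterion(model(inputs), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

# Toy invocation (CPU-safe):
net = nn.Linear(4, 2)
x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))
print(forward_batch(net, x, y, nn.CrossEntropyLoss()).item())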
Example #3
def forward(model, dl, lamb=0, optimizer=None, mode='classification', stats=None, weight=None):
    if mode == 'classification':
        criterion = CrossEntropyLoss(weight=weight, size_average=True)
    else:
        criterion = MSELoss(size_average=False)
    acc_sum = 0
    tot = 0
    for i, (images, labels) in enumerate(dl):
        if stats is not None:
            stats.next_batch()
        images = wrap(Variable(images, requires_grad=False))
        labels = wrap(Variable(labels, requires_grad=False))
        output = model(images)
        if optimizer is not None:
            original_loss = criterion(output, labels)
            penalized_loss = original_loss
            penalty = get_l1_loss(model)
            if penalty is not None:
                penalized_loss += float(lamb) * penalty
            # print(tn(original_loss.data), tn(penalized_loss.data), tn(penalty.data))
            optimizer.zero_grad()
            penalized_loss.backward()
            optimizer.step()
            if stats is not None:
                capacities = get_capacities(model)
                for layer, c in enumerate(capacities):
                    stats.log('capacity_l%s' % layer, c)
                stats.log('capacity', sum(capacities))
                stats.log('batch_original_loss', tn(original_loss.data))
                stats.log('batch_l1_penalty', tn(penalty.data))
                stats.log('batch_loss', tn(penalized_loss.data))
        if mode == 'classification':
            acc = (output.max(1)[1] == labels).float().sum()
        else:
            acc = -torch.nn.functional.mse_loss(output, labels, size_average=False)
        if stats is not None and optimizer is not None:
            stats.log('batch_acc', tn(acc.data) / images.size(0))
        tot += len(labels)
        acc_sum += acc

    try:
        acc = tn(acc_sum.data / tot)
    except AttributeError:
        # acc_sum never became a tensor (e.g. the loader was empty)
        acc = acc_sum / tot
    return acc
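`get_l1_loss` and `get_capacities` are also project helpers that do not appear in these excerpts. A speculative, self-contained sketch of what they might look like, assuming the model carries one scaling weight per neuron (the `Scaler` module below is hypothetical and illustrative only):

import torch
from torch import nn

class Scaler(nn.Module):
    """Hypothetical per-neuron scaling layer; one weight per unit."""
    def __init__(self, n):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(n))

    def forward(self, x):
        return x * self.weight

def get_l1_loss(model):
    # L1 penalty over all scaling weights; None if the model has no Scaler.
    terms = [m.weight.abs().sum() for m in model.modules() if isinstance(m, Scaler)]
    return sum(terms) if terms else None

def get_capacities(model):
    # Per-layer capacity: how many scaling weights are still non-zero.
    return [int((m.weight != 0).sum()) for m in model.modules() if isinstance(m, Scaler)]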
Example #4
def train(models, dl, lamb=0.001, epochs=EPOCHS, l2_penalty=0.01):
    criterion = nn.CrossEntropyLoss()
    optimizers = []
    for model in models:
        normal_params = set(model.parameters())
        normal_params.remove(model.x_0)

        optimizer = Adam([{
            'params': normal_params,
            'weight_decay': l2_penalty,
        }, {
            'params': [model.x_0],
            'lr': 1,
        }])
        optimizers.append(optimizer)
    gradients = []
    sizes = []
    losses = []
    for e in range(0, epochs):
        print("Epoch %s" % e)
        gradient = np.zeros(len(models))
        los = np.zeros(len(models))
        for i, (images, labels) in enumerate(dl):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            for mid, (model, optimizer) in enumerate(zip(models, optimizers)):
                output = model(images)
                optimizer.zero_grad()
                l = (criterion(output, labels) + lamb * model.loss())
                l.backward()
                acc = (output.max(1)[1] == labels).float().mean()
                # a = tn(model.x_0.grad.data)
                # if a != a:
                #     return images, labels
                gradient[mid] += tn(model.x_0.grad.data)
                los[mid] += tn(l.data)
                # print(tn(acc.data), tn(model.x_0.data), tn(model.x_0.grad.data))
                optimizer.step()
                if isinstance(model, MNIST_1h_flexible_scaled):
                    model.reorder()
        gradients.append(gradient)
        losses.append(los)
        sizes.append([tn(m.x_0.data) for m in models])
    total_samples = len(dl.dataset)
    return np.stack(sizes), -np.stack(gradients) / total_samples, np.stack(losses) / total_samples
Example #5
def eval_loss():
    # Nested helper: relies on `model` and `dl` from the enclosing scope.
    total_loss = 0
    criterion = CrossEntropyLoss()
    for images, labels in dl:
        images = wrap(Variable(images, requires_grad=False))
        labels = wrap(Variable(labels, requires_grad=False))
        output = model(images)
        total_loss += tn(criterion(output, labels).data)
    return total_loss
Example #6
def train_algo(model_gen, ds, l=1, size=50, f=10):
    models = []
    dl1 = get_dl(ds, True)
    dl2 = get_dl(ds, False)
    gbs = 0
    l *= f

    def preout(x):
        values = [m(x) for m in models]
        return sum(values[1:], values[0])

    while l > 1e-9:
        l /= f
        model = model_gen()
        pr = preout if len(models) > 0 else None
        bm, bs, sizes, losses, accs, taccs = simple_train(model,
                                                          dl1,
                                                          dl2,
                                                          lamb=l,
                                                          pre_out=pr)
        if sizes[-1] == 0 or bs < gbs:
            continue
        else:
            print('temp - best score', bs)
            while True:
                l *= f
                cm, cs, ss, ll, aa, taa = simple_train(bm,
                                                       dl1,
                                                       dl2,
                                                       lamb=l,
                                                       pre_out=pr)
                if cs < bs:
                    break
                else:
                    bm = cm
                    bs = cs
                    print('temp - best score', bs)
            print('block score')
            if bs > gbs:
                models.append(bm)
                print('current size',
                      sum([tn(m.l0_loss().data) for m in models]))
                gbs = bs
            else:
                return models
    return models
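A tiny standalone illustration (not project code) of the lambda schedule the outer loop of train_algo sweeps: l is pre-scaled by f, then divided by f before each block, so the first block sees the original value:

l, f = 1.0, 10.0
l *= f
schedule = []
while l > 1e-9:
    l /= f
    schedule.append(l)
print(schedule)  # roughly [1.0, 0.1, 0.01, ..., 1e-9]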
Example #7
def forward(model, dl, lamb=0, optimizer=None):
    criterion = CrossEntropyLoss()
    acc_sum = 0
    tot = 0
    for i, (images, labels) in enumerate(dl):
        print(i)
        images = wrap(Variable(images, requires_grad=False))
        labels = wrap(Variable(labels, requires_grad=False))
        output = model(images)
        if optimizer is not None:
            original_loss = criterion(output, labels)
            penalized_loss = original_loss + float(lamb) * model.loss()
            optimizer.zero_grad()
            penalized_loss.backward()
            optimizer.step()

        acc = (output.max(1)[1] == labels).float().sum()
        tot += len(labels)
        acc_sum += acc

    acc = tn(acc_sum.data / tot)
    return acc
Example #8
def train(models,
          dl,
          dl2,
          lamb=0.001,
          epochs=EPOCHS,
          l2_penalty=0.01,
          pre_out=None):
    try:
        lamb[len(models) - 1]  # lamb already given as one value per model
    except TypeError:
        lamb = [lamb] * len(models)  # broadcast a scalar lamb to every model
    criterion = CrossEntropyLoss()
    optimizers = []
    for model in models:
        optimizer = Adam([{
            'params': model.parameters(),
            'weight_decay': l2_penalty,
        }])
        optimizers.append(optimizer)
    sizes = []
    losses = []
    taccuracies = []
    accuracies = []
    stopped = [False] * len(models)
    best = [np.inf] * len(models)
    for e in range(0, epochs):
        print("Epoch %s" % e)
        gradient = np.zeros(len(models))
        los = np.zeros(len(models))
        accs = np.zeros(len(models))
        taccs = np.zeros(len(models))
        for i, (images, labels) in enumerate(dl):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            for mid, (model, optimizer) in enumerate(zip(models, optimizers)):
                output = model(images)
                if pre_out is not None:
                    output += Variable(pre_out(images).data,
                                       requires_grad=False)
                optimizer.zero_grad()
                l = criterion(output, labels)
                l2 = l + float(lamb[mid]) * model.loss()
                l2.backward()
                acc = (output.max(1)[1] == labels).float().sum()
                los[mid] += tn(l.data)  # Save the loss without the penalty
                accs[mid] += tn(acc.data)
                optimizer.step()
        for i, (images, labels) in enumerate(dl2):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            for mid, (model, optimizer) in enumerate(zip(models, optimizers)):
                output = model(images)
                if pre_out is not None:
                    output += Variable(pre_out(images).data,
                                       requires_grad=False)
                acc = (output.max(1)[1] == labels).float().sum()
                taccs[mid] += tn(acc.data)
        losses.append(los)
        accuracies.append(accs)
        taccuracies.append(taccs)
        sizes.append([tn(m.l0_loss().data) for m in models])
    total_samples = len(dl.dataset)
    total_samples2 = len(dl2.dataset)
    return (np.stack(sizes), np.stack(losses) / total_samples,
            np.stack(accuracies) / total_samples,
            np.stack(taccuracies) / total_samples2)
Example #9
def train(gen_block,
          base_model,
          train_dl,
          val_dl,
          test_dl,
          start_lamb=10,
          lamb_decay=10,
          max_patience=4,
          default_block_size=25,
          min_lambda=1e-7,
          max_block_size=100):
    block_size = default_block_size
    lamb = start_lamb
    val_accuracies = []
    test_accuracies = []
    global_best_acc = 0
    while block_size <= max_block_size:  # Add blocks
        current_model = base_model.next_block(gen_block(block_size))
        best_model = None
        while True:  # Find Grip
            try:
                current_accuracy, current_model = train_until_convergence(
                    current_model,
                    train_dl,
                    val_dl,
                    lamb,
                    patience=max_patience)
                break
            except StopIteration:
                lamb /= lamb_decay
                if lamb < min_lambda:
                    return val_accuracies, test_accuracies, base_model
                current_model = base_model.next_block(gen_block(block_size))
        best_acc = current_accuracy
        best_model = deepcopy(current_model)
        while False:  # Optimize (note: this refinement loop never runs as written)
            best_acc = current_accuracy
            if best_acc < global_best_acc:
                break
            best_model = deepcopy(current_model)
            lamb *= lamb_decay
            try:
                current_accuracy, current_model = train_until_convergence(
                    current_model,
                    train_dl,
                    val_dl,
                    lamb,
                    patience=max_patience)
            except StopIteration:
                current_accuracy = 0
            if current_accuracy < best_acc:
                break
            else:
                best_acc = current_accuracy
        if best_acc > global_best_acc:  # We did improve the model
            print('did improve')
            base_model = best_model  # We build on top of this block
            # Grow the next block up to 3x the largest capacity in use, capped at max_block_size.
            largest_capacity = tn(base_model.training_component().get_capacities().max().data)
            block_size = max(block_size, int(min(max_block_size, 3 * largest_capacity)))
            global_best_acc = best_acc
            val_accuracies.append(best_acc)
            test_acc = forward(base_model, test_dl)
            print('TEST ACC:', test_acc)
            test_accuracies.append(test_acc)
        else:
            print('did not improve')
            lamb /= lamb_decay
            if lamb <= min_lambda:  # We can only increase the size
                break
    return val_accuracies, test_accuracies, base_model
Example #10
def has_collapsed(model):
    l1 = get_l1_loss(model)
    if l1 is None:
        return False
    return tn(l1.data) == 0
Example #11
    def has_collapsed(self):
        return tn(self.get_capacities().min().data) == 0
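Both has_collapsed variants appear to encode the same check: the network counts as collapsed once some layer's capacity (presumably its number of live units) reaches zero. A toy illustration, not project code:

import torch

per_layer_capacities = torch.tensor([512.0, 0.0, 10.0])
print(per_layer_capacities.min().item() == 0)  # True -> some layer is empty, model collapsed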