import copy
from copy import deepcopy

import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, MSELoss
from torch.optim import Adam

# Project helpers such as get_dl, get_l1_loss, get_capacities, shrinknet_penalty,
# update_statistics, train_until_convergence, EPOCHS and the model classes are
# assumed to be importable from the rest of the project.


def simple_train(model, dl, dl2, lamb=0.001, pre_out=None):
    print(lamb)
    total_samples = len(dl.dataset)
    total_samples2 = len(dl2.dataset)
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters())
    sizes = []
    losses = []
    taccuracies = []
    accuracies = []
    best = (-np.inf, -np.inf)
    bm = None  # best model so far (was `bn`, which the rest of the function never used)
    patience = 1

    def go(images):
        output = model(images)
        if pre_out is not None:
            output += Variable(pre_out(images).data, requires_grad=False)
        return output

    while True:
        print('epoch')
        los = 0
        accs = 0
        taccs = 0
        for i, (images, labels) in enumerate(dl):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            output = go(images)
            optimizer.zero_grad()
            l = criterion(output, labels)
            l2 = l + float(lamb) * model.loss()
            l2.backward()
            accs += tn((output.max(1)[1] == labels).float().sum().data)
            los += tn(l.data)  # Save the loss without the penalty
            optimizer.step()
        for i, (images, labels) in enumerate(dl2):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            output = go(images)
            taccs += tn((output.max(1)[1] == labels).float().sum().data)
        losses.append(los)
        accuracies.append(accs)
        taccuracies.append(taccs / total_samples2)
        sizes.append(tn(model.l0_loss().data))
        next_score = (-sizes[-1], taccuracies[-1])
        if best < next_score:
            best = next_score
            bm = copy.deepcopy(model)
            patience = 1
        else:
            patience += 1
            if patience >= 3:
                break
    return (bm, best[1], np.stack(sizes), np.stack(losses) / total_samples,
            np.stack(accuracies) / total_samples, np.stack(taccuracies))
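# `wrap` and `tn` are small project helpers used throughout this file. Their
# real implementations are not shown here; the versions below are only a
# minimal sketch of the assumed behavior: `wrap` moves data to the GPU when one
# is available, and `tn` ("to number") extracts a plain Python scalar.
def wrap(variable):
    # Assumption: transparently use CUDA when available, otherwise stay on CPU.
    return variable.cuda() if torch.cuda.is_available() else variable


def tn(tensor):
    # Assumption: the argument holds a single value (a scalar loss, count, ...).
    return float(tensor.view(-1)[0])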
def forward(model, dataloader, config, class_weights, optimizer=None):
    accs = []
    losses = []
    is_classification = config['mode'] == 'classification'
    if is_classification:
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = torch.nn.MSELoss()
    lamb = config['params']['lambda']
    for i, (inputs, labels) in enumerate(dataloader):
        # Legacy PyTorch (<= 0.3) API: `async=` transfers and `volatile=` inference mode.
        inputs = Variable(inputs.cuda(async=True), volatile=(optimizer is None))
        labels = Variable(labels.cuda(async=True), volatile=(optimizer is None))
        prediction = model(inputs)
        loss = criterion(prediction, labels)
        losses.append(tn(loss.data))
        penalty = float(lamb) * shrinknet_penalty(model)
        loss = loss + penalty
        if is_classification:
            discrete_prediction = prediction.max(1)[1]
            accuracy = (discrete_prediction == labels).float().data.mean()
            accs.append(accuracy)
        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            update_statistics(model)
    losses = np.array(losses).mean()
    if accs:
        return losses, np.array(accs).mean()
    # Regression has no accuracy; report the mean loss in both slots.
    return losses, losses
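# The `forward` above only reads two entries from `config`. A hypothetical
# example of the expected structure (only these two keys are actually used here):
example_config = {
    'mode': 'classification',   # anything else falls back to MSE / regression
    'params': {
        'lambda': 1e-3,         # weight of the shrinknet_penalty term
    },
}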
def forward(model, dl, lamb=0, optimizer=None, mode='classification', stats=None, weight=None):
    if mode == 'classification':
        criterion = CrossEntropyLoss(weight=weight, size_average=True)
    else:
        criterion = MSELoss(size_average=False)
    acc_sum = 0
    tot = 0
    for i, (images, labels) in enumerate(dl):
        if stats is not None:
            stats.next_batch()
        images = wrap(Variable(images, requires_grad=False))
        labels = wrap(Variable(labels, requires_grad=False))
        output = model(images)
        if optimizer is not None:
            original_loss = criterion(output, labels)
            penalized_loss = original_loss
            penalty = get_l1_loss(model)
            if penalty is not None:
                # Out-of-place add so original_loss keeps the unpenalized value for logging.
                penalized_loss = penalized_loss + float(lamb) * penalty
                # print(tn(original_loss.data), tn(penalized_loss.data), tn(penalty.data))
            optimizer.zero_grad()
            penalized_loss.backward()
            optimizer.step()
            if stats is not None:
                capacities = get_capacities(model)
                for layer, c in enumerate(capacities):
                    stats.log('capacity_l%s' % layer, c)
                stats.log('capacity', sum(capacities))
                stats.log('batch_original_loss', tn(original_loss.data))
                if penalty is not None:
                    stats.log('batch_l1_penalty', tn(penalty.data))
                stats.log('batch_loss', tn(penalized_loss.data))
        if mode == 'classification':
            acc = (output.max(1)[1] == labels).float().sum()
        else:
            acc = -torch.nn.functional.mse_loss(output, labels, size_average=False)
        if stats is not None and optimizer is not None:
            stats.log('batch_acc', tn(acc.data) / images.size(0))
        tot += len(labels)
        acc_sum += acc
    try:
        acc = tn(acc_sum.data / tot)
    except AttributeError:
        # acc_sum is a plain number rather than a Variable
        acc = acc_sum / tot
    return acc
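# The optional `stats` argument above only needs two methods: `next_batch()`
# and `log(name, value)`. A minimal stand-in that satisfies this interface
# (a sketch for illustration, not the project's real statistics class):
class SimpleStats(object):
    def __init__(self):
        self.batch = 0
        self.history = []  # (batch, name, value) tuples

    def next_batch(self):
        self.batch += 1

    def log(self, name, value):
        self.history.append((self.batch, name, value))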
def train(models, dl, lamb=0.001, epochs=EPOCHS, l2_penalty=0.01):
    criterion = nn.CrossEntropyLoss()
    optimizers = []
    for model in models:
        normal_params = set(model.parameters())
        normal_params.remove(model.x_0)
        optimizer = Adam([{
            'params': normal_params,
            'weight_decay': l2_penalty,
        }, {
            'params': [model.x_0],
            'lr': 1,
        }])
        optimizers.append(optimizer)
    gradients = []
    sizes = []
    losses = []
    for e in range(0, epochs):
        print("Epoch %s" % e)
        gradient = np.zeros(len(models))
        los = np.zeros(len(models))
        for i, (images, labels) in enumerate(dl):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            for mid, (model, optimizer) in enumerate(zip(models, optimizers)):
                output = model(images)
                optimizer.zero_grad()
                l = (criterion(output, labels) + lamb * model.loss())
                l.backward()
                acc = (output.max(1)[1] == labels).float().mean()
                # a = tn(model.x_0.grad.data)
                # if a != a:
                #     return images, labels
                gradient[mid] += tn(model.x_0.grad.data)
                los[mid] += tn(l.data)
                # print(tn(acc.data), tn(model.x_0.data), tn(model.x_0.grad.data))
                optimizer.step()
                if isinstance(model, MNIST_1h_flexible_scaled):
                    model.reorder()
        gradients.append(gradient)
        losses.append(los)
        sizes.append([tn(m.x_0.data) for m in models])
    total_samples = len(dl.dataset)
    return (np.stack(sizes), -np.stack(gradients) / total_samples,
            np.stack(losses) / total_samples)
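# The optimizer above keeps the learned-width parameter `x_0` in its own group
# (lr=1, no weight decay) while all other weights get the L2 penalty. A
# self-contained illustration of that parameter-group pattern with a toy module
# (not the project's model):
def _param_group_demo(l2_penalty=0.01):
    toy = torch.nn.Linear(4, 3)
    x_0 = torch.nn.Parameter(torch.zeros(1))
    return Adam([
        {'params': toy.parameters(), 'weight_decay': l2_penalty},
        {'params': [x_0], 'lr': 1},
    ])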
def eval_loss():
    # Relies on `model`, `dl`, `wrap` and `tn` from the enclosing scope.
    total_loss = 0
    criterion = CrossEntropyLoss()
    for images, labels in dl:
        images = wrap(Variable(images, requires_grad=False))
        labels = wrap(Variable(labels, requires_grad=False))
        output = model(images)
        total_loss += tn(criterion(output, labels).data)
    return total_loss
def train_algo(model_gen, ds, l=1, size=50, f=10):
    models = []
    dl1 = get_dl(ds, True)
    dl2 = get_dl(ds, False)
    gbs = 0  # global best score
    l *= f  # compensates for the division at the top of the loop

    def preout(x):
        values = [m(x) for m in models]
        return sum(values[1:], values[0])

    while l > 1e-9:
        l /= f
        model = model_gen()
        pr = preout if len(models) > 0 else None
        bm, bs, sizes, losses, accs, taccs = simple_train(
            model, dl1, dl2, lamb=l, pre_out=pr)
        if sizes[-1] == 0 or bs < gbs:
            continue
        else:
            print('temp - best score', bs)
            while True:
                l *= f
                cm, cs, ss, ll, aa, taa = simple_train(
                    bm, dl1, dl2, lamb=l, pre_out=pr)
                if cs < bs:
                    break
                else:
                    bm = cm
                    bs = cs
                    print('temp - best score', bs)
        print('block score')
        if bs > gbs:
            models.append(bm)
            print('current size', sum([tn(m.l0_loss().data) for m in models]))
            gbs = bs
        else:
            return models
    return models
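# The `preout` closure above implements an additive (boosting-style) ensemble:
# each new block is trained on top of the frozen, summed outputs of the blocks
# accepted so far. A self-contained illustration of that composition with toy
# linear models (not the project's blocks):
def _preout_composition_demo():
    toy_models = [torch.nn.Linear(4, 3), torch.nn.Linear(4, 3)]
    x = Variable(torch.randn(2, 4))
    partial_outputs = [m(x) for m in toy_models]
    # Same reduction as in preout: start from the first output, add the rest.
    return sum(partial_outputs[1:], partial_outputs[0])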
def forward(model, dl, lamb=0, optimizer=None):
    criterion = CrossEntropyLoss()
    acc_sum = 0
    tot = 0
    for i, (images, labels) in enumerate(dl):
        print(i)
        images = wrap(Variable(images, requires_grad=False))
        labels = wrap(Variable(labels, requires_grad=False))
        output = model(images)
        if optimizer is not None:
            original_loss = criterion(output, labels)
            penalized_loss = original_loss + float(lamb) * model.loss()
            optimizer.zero_grad()
            penalized_loss.backward()
            optimizer.step()
        acc = (output.max(1)[1] == labels).float().sum()
        tot += len(labels)
        acc_sum += acc
    acc = tn(acc_sum.data / tot)
    return acc
def train(models, dl, dl2, lamb=0.001, epochs=EPOCHS, l2_penalty=0.01, pre_out=None):
    try:
        lamb[len(models) - 1]
    except TypeError:
        lamb = [lamb] * len(models)
    criterion = CrossEntropyLoss()
    optimizers = []
    for model in models:
        optimizer = Adam([{
            'params': model.parameters(),
            'weight_decay': l2_penalty,
        }])
        optimizers.append(optimizer)
    sizes = []
    losses = []
    taccuracies = []
    accuracies = []
    stopped = [False] * len(models)
    best = [np.inf] * len(models)
    for e in range(0, epochs):
        print("Epoch %s" % e)
        gradient = np.zeros(len(models))
        los = np.zeros(len(models))
        accs = np.zeros(len(models))
        taccs = np.zeros(len(models))
        for i, (images, labels) in enumerate(dl):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            for mid, (model, optimizer) in enumerate(zip(models, optimizers)):
                output = model(images)
                if pre_out is not None:
                    output += Variable(pre_out(images).data, requires_grad=False)
                optimizer.zero_grad()
                l = criterion(output, labels)
                l2 = l + float(lamb[mid]) * model.loss()
                l2.backward()
                acc = (output.max(1)[1] == labels).float().sum()
                los[mid] += tn(l.data)  # Save the loss without the penalty
                accs[mid] += tn(acc.data)
                optimizer.step()
        for i, (images, labels) in enumerate(dl2):
            images = wrap(Variable(images, requires_grad=False))
            labels = wrap(Variable(labels, requires_grad=False))
            for mid, (model, optimizer) in enumerate(zip(models, optimizers)):
                output = model(images)
                if pre_out is not None:
                    output += Variable(pre_out(images).data, requires_grad=False)
                acc = (output.max(1)[1] == labels).float().sum()
                taccs[mid] += tn(acc.data)
        losses.append(los)
        accuracies.append(accs)
        taccuracies.append(taccs)
        sizes.append([tn(m.l0_loss().data) for m in models])
    total_samples = len(dl.dataset)
    total_samples2 = len(dl2.dataset)
    return (np.stack(sizes), np.stack(losses) / total_samples,
            np.stack(accuracies) / total_samples,
            np.stack(taccuracies) / total_samples2)
def train(gen_block, base_model, train_dl, val_dl, test_dl, start_lamb=10,
          lamb_decay=10, max_patience=4, default_block_size=25,
          min_lambda=1e-7, max_block_size=100):
    block_size = default_block_size
    lamb = start_lamb
    val_accuracies = []
    test_accuracies = []
    global_best_acc = 0  # lowercase so the comparisons below actually see it (was `Global_best_acc`)
    while block_size <= max_block_size:
        # Add blocks
        current_model = base_model.next_block(gen_block(block_size))
        best_model = None
        while True:  # Find Grip
            try:
                current_accuracy, current_model = train_until_convergence(
                    current_model, train_dl, val_dl, lamb, patience=max_patience)
                break
            except StopIteration:
                lamb /= lamb_decay
                if lamb < min_lambda:
                    return val_accuracies, test_accuracies, base_model
                current_model = base_model.next_block(gen_block(block_size))
        best_acc = current_accuracy
        best_model = deepcopy(current_model)
        while False:  # Optimize (this refinement loop is currently disabled)
            best_acc = current_accuracy
            if best_acc < global_best_acc:
                break
            best_model = deepcopy(current_model)
            lamb *= lamb_decay
            try:
                current_accuracy, current_model = train_until_convergence(
                    current_model, train_dl, val_dl, lamb, patience=max_patience)
            except StopIteration:
                current_accuracy = 0
            if current_accuracy < best_acc:
                break
            else:
                best_acc = current_accuracy
        if best_acc > global_best_acc:
            # We did improve the model
            print('did improve')
            base_model = best_model  # We build on top of this block
            block_size = max(
                block_size,
                int(min(
                    max_block_size,
                    3 * tn(base_model.training_component().get_capacities().max().data))))
            global_best_acc = best_acc
            val_accuracies.append(best_acc)
            test_acc = forward(base_model, test_dl)
            print('TEST ACC:', test_acc)
            test_accuracies.append(test_acc)
        else:
            print('did not improve')
            lamb /= lamb_decay
            if lamb <= min_lambda:
                # We can only increase the size
                break
    return val_accuracies, test_accuracies, base_model
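# `train_until_convergence` is referenced above but not defined in this file.
# From its usage, it is assumed to train one block with early stopping on the
# validation accuracy, return (best_val_accuracy, trained_model), and raise
# StopIteration when the penalty collapses the block. A sketch of that assumed
# contract, reusing `forward` and `has_collapsed` from this file:
def train_until_convergence_sketch(model, train_dl, val_dl, lamb, patience=4):
    optimizer = Adam(model.parameters())
    best_acc, best_model, waited = -np.inf, deepcopy(model), 0
    while waited < patience:
        forward(model, train_dl, lamb=lamb, optimizer=optimizer)  # one training epoch
        if has_collapsed(model):
            raise StopIteration  # the whole block was pruned away
        val_acc = forward(model, val_dl)
        if val_acc > best_acc:
            best_acc, best_model, waited = val_acc, deepcopy(model), 0
        else:
            waited += 1
    return best_acc, best_model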
def has_collapsed(model):
    l1 = get_l1_loss(model)
    if l1 is None:
        return False
    return tn(l1.data) == 0
def has_collapsed(self):
    return tn(self.get_capacities().min().data) == 0