def InsertConvolution(predecessor_id, successor_id, old_model, kernel_size, batch):
    """
    Function to insert a Conv-BatchNorm-Relu block

    The new convolution is loaded with the kernel returned by
    ``conv2d_identity`` and a zero bias. The batch-norm layer is then
    initialised from statistics gathered by running ``batch`` through the
    new model.

    Args:
        predecessor_id: previous layer (its output feeds the new block)
        successor_id: next layer (its input is rewired to the new block)
        old_model: model before mutation; dict with at least the keys
            'model_descriptor' and 'pytorch_model'
        kernel_size: kernel size of the new block (used for 'ks1' and 'ks2')
        batch: first batch of the train loader

    Returns:
        Returns mutated model as a dict with the keys 'pytorch_model',
        'model_descriptor' and 'topo_ordering'

    Raises:
        IndexError: if no layer in the descriptor has id ``successor_id``
    """
    new_model_descriptor = copy.deepcopy(old_model['model_descriptor'])
    old_pytorch_model = old_model['pytorch_model']

    successor = [
        layer for layer in new_model_descriptor['layers']
        if str(layer['id']) == str(successor_id)
    ][0]

    # The block uses three consecutive ids starting at the first unused one.
    new_id_conv = utils.GetUnusedID(new_model_descriptor)
    new_id_bn = new_id_conv + 1
    new_id_acti = new_id_bn + 1

    # layerdic is read right after the forward pass; presumably forward()
    # stores every layer's output there -- TODO confirm against ConvNet.
    old_pytorch_model.forward(batch)
    channels = old_pytorch_model.layerdic[str(predecessor_id)].size()[1]

    new_layer_conv = {
        'type': 'conv',
        'params': {
            'channels': channels,
            'ks1': kernel_size,
            'ks2': kernel_size,
            "in_channels": channels
        },
        'input': [predecessor_id],
        'id': new_id_conv
    }
    new_layer_bn = {
        'type': 'batchnorm',
        'params': {
            "in_channels": channels
        },
        'input': [new_id_conv],
        'id': new_id_bn
    }
    new_layer_acti = {
        'type': 'activation',
        'params': {},
        'input': [new_id_bn],
        'id': new_id_acti
    }

    # Rewire the successor so it reads from the end of the new block.
    utils.ReplaceInput([successor], predecessor_id, new_id_acti)
    new_model_descriptor['layers'].append(new_layer_conv)
    new_model_descriptor['layers'].append(new_layer_bn)
    new_model_descriptor['layers'].append(new_layer_acti)

    new_pytorch_model = ConvNet(new_model_descriptor)
    new_pytorch_model.cuda()
    # NOTE(review): momentum and eps are overwritten here and never restored,
    # so the returned model keeps momentum = 1.0 and eps = 0.0 on this layer
    # -- confirm this is intended.
    new_pytorch_model._modules[str(new_id_bn)].momentum = 1.0
    new_pytorch_model._modules[str(new_id_bn)].eps = 0.0
    new_pytorch_model.forward(batch)
    new_pytorch_model = utils.InheritWeights(old_pytorch_model, new_pytorch_model)

    # Modules are looked up only after InheritWeights, because that call
    # rebinds new_pytorch_model.
    IDConv = conv2d_identity(channels, kernel_size)
    conv_module = new_pytorch_model._modules[str(new_id_conv)]
    # One bias entry per output channel, i.e. dim 0 of the weight. Indexing
    # weight[1] (the second filter) fails for a single-channel convolution.
    bias_shape = conv_module.weight.size()[0]
    state_dict = {
        "weight": torch.from_numpy(IDConv).cuda(),
        "bias": torch.from_numpy(np.zeros(shape=bias_shape)).cuda()
    }
    conv_module.load_state_dict(state_dict)

    # Batch Normalization layer's weight inheritance
    new_pytorch_model.forward(batch)
    # NOTE(review): index [0] keeps only the first sample of the batch, so
    # the mean/var below are per-channel statistics of that one sample --
    # confirm this is intended rather than statistics over the whole batch.
    predecessor_output_batch = new_pytorch_model.layerdic[str(new_id_conv)][0]
    predecessor_output_batch_cpu = predecessor_output_batch.cpu()
    predecessor_output_batch_data = predecessor_output_batch_cpu.data.numpy()
    batch_mean = np.mean(predecessor_output_batch_data, axis=(1, 2))
    batch_var = np.var(predecessor_output_batch_data, axis=(1, 2))

    bn_module = new_pytorch_model._modules[str(new_id_bn)]
    eps = bn_module.eps
    beta_ini = batch_mean
    rm_copy = copy.deepcopy(bn_module.running_mean)
    # torch.sqrt keeps the computation on the tensor's own device; NumPy's
    # sqrt would first have to convert the (possibly CUDA) tensor to an array.
    rv_copy = torch.sqrt(bn_module.running_var + eps)

    # Scale/shift are set from the running statistics gathered by the forward
    # passes above; the running buffers are replaced by the sample statistics.
    state_dict = {
        "weight": nn.Parameter(rv_copy.cuda()),
        "bias": nn.Parameter(rm_copy.cuda()),
        "running_var": torch.from_numpy(batch_var).cuda(),
        "running_mean": torch.from_numpy(beta_ini).cuda()
    }
    bn_module.load_state_dict(state_dict)

    new_model = {
        'pytorch_model': new_pytorch_model,
        'model_descriptor': new_model_descriptor,
        'topo_ordering': new_pytorch_model.topo_ordering
    }
    return new_model
def MergeLayersConcatWithDownSampling(layer1_id, layer2_id, downsampling_factor, old_model, batch):
    """
    Function to merge layers by concatenation, from layer 1 to layer 2

    A 'merge' layer of type 'concat' is inserted in front of the first layer
    that follows ``layer2_id``. That layer is given a new id, its
    'in_channels' is enlarged to the sum of both parents' channels, and its
    weights are initialised with the old weights on the first input channels
    and zeros on the added ones.

    Args:
        layer1_id: layer 1
        layer2_id: layer 2
        downsampling_factor: If downsampling_factor is not 1, a max-pooling
            layer with this pool size is applied to whichever of the two
            layers has the larger size along dimension 2 of its output
        old_model: model before mutation; dict with at least the keys
            'model_descriptor' and 'pytorch_model'
        batch: first batch of the train loader

    Returns:
        Returns mutated model as a dict with the keys 'pytorch_model',
        'model_descriptor' and 'topo_ordering'. If the forward pass of the
        mutated network raises, ``old_model`` is returned unchanged.

    Raises:
        IndexError: if ``layer2_id`` has no subsequent layer
    """
    new_model_descriptor = copy.deepcopy(old_model['model_descriptor'])
    old_pytorch_model = old_model['pytorch_model']

    [subsequentlayers, _, _] = utils.GetSubsequentLayers(int(layer2_id), new_model_descriptor)

    new_id = utils.GetUnusedID(new_model_descriptor)
    new_id_subseq = new_id + 1

    # If the downsampling factor is not 1, we need a pooling layer for dimension matching
    if downsampling_factor != 1:
        new_id_pool = new_id_subseq + 1

        # Check which layer is smaller: the larger one gets pooled
        old_pytorch_model.forward(batch)
        layer1_size = old_pytorch_model.layerdic[str(layer1_id)].size()[2]
        layer2_size = old_pytorch_model.layerdic[str(layer2_id)].size()[2]
        if layer1_size > layer2_size:
            pooled_id, direct_id = layer1_id, layer2_id
        else:
            pooled_id, direct_id = layer2_id, layer1_id

        pool_layer = {
            'type': 'pool',
            'params': {
                'poolsize': downsampling_factor,
                'pooltype': 'max'
            },
            'id': new_id_pool,
            'input': [int(pooled_id)]
        }
        new_model_descriptor['layers'].append(pool_layer)

        merge_inputs = [int(direct_id), int(new_id_pool)]
    else:
        # Downsampling factor is 1
        merge_inputs = [int(layer2_id), int(layer1_id)]

    merge_layer = {
        'type': 'merge',
        'params': {
            'mergetype': 'concat'
        },
        'id': new_id,
        'input': merge_inputs
    }
    new_model_descriptor['layers'].append(merge_layer)

    # The first subsequent layer gets a fresh id because its shape changes.
    old_id_subseq = subsequentlayers[0]['id']
    subsequentlayers[0]['id'] = new_id_subseq
    utils.ReplaceInput(subsequentlayers, int(layer2_id), new_id)

    # Update input for subsequent layers of subsequent conv layer
    subsubsequentlayers, _, _ = utils.GetSubsequentLayers(old_id_subseq, new_model_descriptor)

    # Change the next layers input with the new layer
    utils.ReplaceInput(subsubsequentlayers, old_id_subseq, new_id_subseq)

    # Replace in_channels for subsequent layer
    # We need the number of channels from layer2_id and layer1_id
    # We can use forward of old model to calculate the shape
    # Next layer's input and parameters need to be reshaped
    old_pytorch_model.forward(batch)
    parent_1_channels = old_pytorch_model.layerdic[str(layer1_id)].shape[1]
    parent_2_channels = old_pytorch_model.layerdic[str(layer2_id)].shape[1]
    subsequentlayers[0]['params']['in_channels'] = int(parent_1_channels) + int(parent_2_channels)

    new_pytorch_model = ConvNet(new_model_descriptor)
    new_pytorch_model.cuda()
    new_pytorch_model = utils.InheritWeights(old_pytorch_model, new_pytorch_model)

    # Best effort: a network whose shapes do not fit is discarded. Only
    # Exception is caught so that KeyboardInterrupt/SystemExit still propagate.
    try:
        new_pytorch_model.forward(batch)
    except Exception:
        print("Problem with sizes MergeLayersConcatWithDownSampling")
        return old_model

    new_subseq_module = new_pytorch_model._modules[str(new_id_subseq)]
    old_subseq_module = old_pytorch_model._modules[str(old_id_subseq)]

    # Work on detached copies: slice assignment is an in-place write, which
    # autograd does not allow on a leaf tensor that requires grad (and a
    # deep-copied nn.Parameter is such a leaf).
    new_weights = new_subseq_module.weight.detach().clone()
    old_weights = old_subseq_module.weight.detach()
    old_bias = old_subseq_module.bias.detach().clone()

    # Old weights on the first input channels, zeros on the added ones.
    n_old_in_channels = old_weights.shape[1]
    new_weights[:, 0:n_old_in_channels, :, :] = old_weights
    new_weights[:, n_old_in_channels:, :, :] = 0

    state_dict = {
        "weight": nn.Parameter(new_weights.cuda()),
        "bias": nn.Parameter(old_bias.cuda())
    }
    new_subseq_module.load_state_dict(state_dict)

    new_model = {
        'pytorch_model': new_pytorch_model,
        'model_descriptor': new_model_descriptor,
        'topo_ordering': new_pytorch_model.topo_ordering
    }
    return new_model
def _count_trainable_params(pytorch_model):
    """Return the number of elements over all parameters with requires_grad set."""
    return sum(p.numel() for p in pytorch_model.parameters() if p.requires_grad)


def _load_child_model(descriptor, weights_path):
    """Build a ConvNet from descriptor, move it to the GPU and load the weights stored at weights_path."""
    pytorch_model = ConvNet(descriptor)
    pytorch_model.cuda()
    pytorch_model.load_state_dict(torch.load(weights_path), strict=False)
    return {'pytorch_model': pytorch_model,
            'model_descriptor': copy.deepcopy(descriptor),
            'topo_ordering': pytorch_model.topo_ordering}


def _append_performance_line(folder_out, child_idx, child_performance, n_params):
    """Append one child's performance and parameter count to performance.txt in folder_out."""
    with open(folder_out + "performance.txt", "a+") as f_out:
        f_out.write('child ' + str(child_idx) + ' performance ' + str(child_performance)
                    + ' num params ' + str(n_params) + '\n')


def SpecialChild(n_models, n_mutations, n_epochs_total, initial_model, savepath, folder_out):
    """
    generate and train children, update best model

    Every child starts from the weights of ``initial_model``, is mutated
    (except the first child) and trained for ``n_epochs_total`` epochs.
    Afterwards the better half of the remaining children is repeatedly
    trained with a doubled epoch budget until a single child is left.

    The function reads ``batch``, ``max_n_params``, ``trainloader`` and
    ``validloader`` from the enclosing scope.

    Args:
        n_models: number of child models
        n_mutations: number of mutations/network operators to be applied per
            model_descriptor
        n_epochs_total: number of epochs for the initial training of each child
        initial_model: current best model; dict with at least the keys
            'model_descriptor' and 'pytorch_model'
        savepath: prefix of the paths where weights, timings and descriptors
            of the children are saved
        folder_out: prefix of the path of the general performance.txt file
            for one run

    Returns:
        Tuple of the winning model (dict with the keys 'pytorch_model',
        'model_descriptor' and 'topo_ordering') and its entry in the
        performance array

    Raises:
        IndexError: if ``n_models`` is 0, since there is no child to return
    """
    n_epochs_each = int(n_epochs_total)

    print('Train all models for', int(n_epochs_each), 'epochs.')

    init_weights_path = savepath + 'ini_weights'
    torch.save(initial_model['pytorch_model'].state_dict(), init_weights_path)

    performance = np.zeros(shape=(n_models,))
    descriptors = []

    # every mutation type is weighted equally
    mutations_probs = np.array([1, 1, 1, 1, 1, 1])

    for model_idx in range(0, n_models):
        print('\nmodel idx ' + str(model_idx))

        # save some data
        time_overall_s = time.time()

        model = _load_child_model(initial_model['model_descriptor'], init_weights_path)
        descriptors.append(model['model_descriptor'])

        # overall , mutations, training
        times = [0, 0, 0]

        # apply operators
        for _mutation_idx in range(0, n_mutations):
            time_mut_s = time.time()

            # we don't mutate the first child!
            if model_idx != 0:
                model, _, _ = network_operators.MutateNetwork(model, batch,
                                                              mutation_probs=mutations_probs)

                time_mut_e = time.time()
                times[1] = times[1] + (time_mut_e - time_mut_s)

                # stop mutating once the child exceeds the parameter budget
                if _count_trainable_params(model['pytorch_model']) > max_n_params:
                    break

        # train
        time_train_s = time.time()

        # initial short training of the children
        model['pytorch_model'].fit(trainloader, epochs=n_epochs_each)

        time_train_e = time.time()
        times[2] = times[2] + (time_train_e - time_train_s)

        performance[model_idx] = model['pytorch_model'].evaluate(validloader)
        pytorch_total_params_child = _count_trainable_params(model['pytorch_model'])

        torch.save(model['pytorch_model'].state_dict(), savepath + 'model_' + str(model_idx))
        _append_performance_line(folder_out, model_idx, performance[model_idx],
                                 pytorch_total_params_child)

        # keep the mutated descriptor so the child can be rebuilt later
        descriptors[model_idx] = copy.deepcopy(model['model_descriptor'])

        time_overall_e = time.time()
        times[0] = times[0] + (time_overall_e - time_overall_s)

        np.savetxt(savepath + 'model_' + str(model_idx) + '_times', times)

        # 'with' closes the file even if writing a layer fails
        descriptor_path = savepath + 'model_' + str(model_idx) + '_model_descriptor.txt'
        with open(descriptor_path, 'w') as descriptor_file:
            for layer in model['model_descriptor']['layers']:
                descriptor_file.write(str(layer) + "\n")

        # delete the model (attempt to clean the memory)
        del model['pytorch_model']
        del model
        torch.cuda.empty_cache()

    # continue SH steps
    # argsort is ascending, so the upper half holds the highest performance values
    sorted_children = np.argsort(performance)
    n_children = len(sorted_children)
    n_epochs_train_children = n_epochs_each

    while n_children > 1:
        # pick the best half of the children
        best_children = sorted_children[(n_children // 2):]
        # increase the training budget for them
        n_epochs_train_children = n_epochs_train_children * 2

        print("\nbest_children", best_children)
        print("n_epochs_train_children", n_epochs_train_children)

        for child in best_children:
            print("child ", child)

            # load the child parameters
            child_weights_path = savepath + 'model_' + str(child)
            model = _load_child_model(descriptors[child], child_weights_path)

            # train a child
            model['pytorch_model'].fit(trainloader, epochs=n_epochs_train_children)

            # evaluate a child
            performance[child] = model['pytorch_model'].evaluate(validloader)
            pytorch_total_params_child = _count_trainable_params(model['pytorch_model'])

            _append_performance_line(folder_out, child, performance[child],
                                     pytorch_total_params_child)

            # update a child model
            torch.save(model['pytorch_model'].state_dict(), child_weights_path)

            # delete the model (attempt to clean the memory)
            del model['pytorch_model']
            del model
            torch.cuda.empty_cache()

        print("\nperformance ", performance)

        # re-rank the survivors of this round by their updated performance
        temp_children_array = np.argsort(performance)
        sorted_children = [t for t in temp_children_array if t in best_children]

        print("sorted_children ", sorted_children)
        n_children = len(sorted_children)

    print("it should be the winner", sorted_children[0])
    print("it should be the best performance", performance[sorted_children[0]])

    # load the best child
    the_best_child = sorted_children[0]
    model = _load_child_model(descriptors[the_best_child],
                              savepath + 'model_' + str(the_best_child))

    with open(folder_out + "performance.txt", "a+") as f_out:
        f_out.write("****************************\n")

    return model, performance[sorted_children[0]]
def _load_split_conv_weights(old_module, first_module, second_module, split_factor):
    """Load old_module's weight and bias into two modules, scaled by split_factor and by 1 - split_factor."""
    old_weights = copy.deepcopy(old_module.weight)
    old_bias = copy.deepcopy(old_module.bias)
    for module, factor in ((first_module, split_factor), (second_module, 1 - split_factor)):
        state_dict = {
            "weight": nn.Parameter((factor * old_weights).cuda()),
            "bias": nn.Parameter((factor * old_bias).cuda())
        }
        module.load_state_dict(state_dict)


def SplitConnection(layer2split_id, old_model, batch, split_factor):
    """
    Function to split up Conv-BatchNorm-Relu block

    The block is duplicated into two parallel branches whose outputs are
    summed by a new 'merge' layer of type 'add'. The first branch receives
    the old parameters scaled by ``split_factor``, the second branch the old
    parameters scaled by ``1 - split_factor``.

    Args:
        layer2split_id: id of layer to be split into two; has to be of type
            'conv' or 'sep' and be followed by a batch-norm and an
            activation layer
        old_model: model before mutation; dict with at least the keys
            'model_descriptor' and 'pytorch_model'
        batch: first batch of the train loader
        split_factor: splitting factor

    Returns:
        Returns mutated model as a dict with the keys 'pytorch_model',
        'model_descriptor' and 'topo_ordering'

    Raises:
        IndexError: if one of the layers of the block cannot be found
        AssertionError: if the layer to split is neither 'conv' nor 'sep'
    """
    new_model_descriptor = copy.deepcopy(old_model['model_descriptor'])
    old_pytorch_model = old_model['pytorch_model']
    layers = new_model_descriptor['layers']

    # Get BN and activation layer belonging to conv layer
    layer2split_bn = [
        layer for layer in layers if layer['input'] == [layer2split_id]
    ][0]
    layer2split_acti = [
        layer for layer in layers if layer['input'] == [layer2split_bn['id']]
    ][0]
    subsequentlayers = [
        layer for layer in layers if layer2split_acti['id'] in layer['input']
    ]
    layer2split = [
        layer for layer in layers if layer['id'] == layer2split_id
    ][0]

    old_id_conv = layer2split_id
    old_id_bn = layer2split_bn['id']
    old_id_acti = layer2split_acti['id']

    old_bn_layer = [
        layer for layer in layers if layer['id'] == old_id_bn
    ][0]

    # Raised explicitly (same exception type and message as the former
    # assert) so the check is not stripped when Python runs with -O.
    layer_type = layer2split['type']
    if layer_type not in ('conv', 'sep'):
        raise AssertionError('Error: Layer hast to be conv or sep layer.')

    # 1st branch
    new_id_conv1 = utils.GetUnusedID(new_model_descriptor)
    new_id_bn1 = new_id_conv1 + 1
    new_id_acti1 = new_id_conv1 + 2

    # 2nd branch
    new_id_conv2 = new_id_conv1 + 3
    new_id_bn2 = new_id_conv1 + 4
    new_id_acti2 = new_id_conv1 + 5

    # sum up split
    new_id_add = new_id_conv1 + 6

    # The existing block is re-used as the first branch under new ids.
    layer2split['id'] = new_id_conv1
    layer2split_bn['id'] = new_id_bn1
    layer2split_acti['id'] = new_id_acti1

    layer2split_bn['input'] = [new_id_conv1]
    layer2split_acti['input'] = [new_id_bn1]

    new_conv_layer = {
        'type': layer2split['type'],
        'params': copy.deepcopy(layer2split['params']),
        'id': new_id_conv2,
        'input': copy.deepcopy(layer2split['input'])
    }
    new_bn_layer = {
        'type': 'batchnorm',
        'params': copy.deepcopy(old_bn_layer['params']),
        'id': new_id_bn2,
        'input': [new_id_conv2]
    }
    new_acti_layer = {
        'type': 'activation',
        'params': {},
        'id': new_id_acti2,
        'input': [new_id_bn2]
    }
    new_merge_layer = {
        'type': 'merge',
        'params': {
            'mergetype': 'add'
        },
        'id': new_id_add,
        'input': [int(new_id_acti1), int(new_id_acti2)]
    }

    # Everything that consumed the old activation now consumes the sum.
    utils.ReplaceInput(subsequentlayers, old_id_acti, new_id_add)

    layers.append(new_conv_layer)
    layers.append(new_bn_layer)
    layers.append(new_acti_layer)
    layers.append(new_merge_layer)

    new_pytorch_model = ConvNet(new_model_descriptor)
    new_pytorch_model.cuda()
    new_pytorch_model = utils.InheritWeights(old_model['pytorch_model'], new_pytorch_model)
    new_pytorch_model.forward(batch)

    old_conv_module = old_pytorch_model._modules[str(old_id_conv)]
    first_conv_module = new_pytorch_model._modules[str(new_id_conv1)]
    second_conv_module = new_pytorch_model._modules[str(new_id_conv2)]

    if layer_type == 'conv':
        _load_split_conv_weights(old_conv_module, first_conv_module,
                                 second_conv_module, split_factor)
    elif layer_type == 'sep':
        # NOTE(review): both sub-convolutions are scaled, so if pointwise is
        # applied to the depthwise output the block's linear response is
        # scaled by the factor squared rather than by the factor -- confirm
        # against the 'sep' module before relying on exact scaling.
        # Depthwise
        _load_split_conv_weights(old_conv_module.depthwise, first_conv_module.depthwise,
                                 second_conv_module.depthwise, split_factor)
        # Pointwise
        _load_split_conv_weights(old_conv_module.pointwise, first_conv_module.pointwise,
                                 second_conv_module.pointwise, split_factor)

    # Old_id_bn
    old_bn_module = old_pytorch_model._modules[str(old_id_bn)]
    old_weights_bn = copy.deepcopy(old_bn_module.weight)
    old_bias_bn = copy.deepcopy(old_bn_module.bias)
    old_mean_bn = copy.deepcopy(old_bn_module.running_mean)
    old_var_bn = copy.deepcopy(old_bn_module.running_var)

    # New_id_bn1 and New_id_bn2
    for bn_id, factor in ((new_id_bn1, split_factor), (new_id_bn2, 1 - split_factor)):
        # Scaling the conv weight and bias by `factor` scales the conv output
        # by `factor`; its mean scales with `factor` and its variance with
        # `factor ** 2` (which is also never negative).
        state_dict = {
            "weight": nn.Parameter((factor * old_weights_bn).cuda()),
            "bias": nn.Parameter((factor * old_bias_bn).cuda()),
            "running_var": ((factor ** 2) * old_var_bn).cuda(),
            "running_mean": (factor * old_mean_bn).cuda()
        }
        new_pytorch_model._modules[str(bn_id)].load_state_dict(state_dict)

    new_model = {
        'pytorch_model': new_pytorch_model,
        'model_descriptor': new_model_descriptor,
        'topo_ordering': new_pytorch_model.topo_ordering
    }
    return new_model
opt_algo = {'name': optim.SGD, 'lr': lr_vanilla, 'momentum': 0.9, 'weight_decay': 0.0005, 'alpha': 1.0} sch_algo = {'name': optim.lr_scheduler.CosineAnnealingLR, 'T_max': 5, 'eta_min': 0, 'last_epoch': -1} comp = {'optimizer': opt_algo, 'scheduler': sch_algo, 'loss': nn.CrossEntropyLoss, 'metrics': ['accuracy']} model_descriptor = {} model_descriptor['layers'] = [layer0, layer1, layer1_1, layer1_2, layer4, layer5, layer5_1, layer5_2, layer8, layer9, layer9_1, layer9_2, layer11] model_descriptor['compile']= comp # create a new basic model mod = ConvNet(model_descriptor) mod.cuda() vanilla_model = {'pytorch_model': mod, 'model_descriptor': model_descriptor, 'topo_ordering': mod.topo_ordering} # train initially our vanilla model and save vanilla_model['pytorch_model'].fit_vanilla(trainloader, epochs=20) # save vanilla model weights torch.save(vanilla_model['pytorch_model'].state_dict(), expfolder + "vanilla_model") def SpecialChild(n_models, n_mutations, n_epochs_total, initial_model, savepath, folder_out): """