def fed_train_step(self):
    '''
    Run a single communication round (upload weights for non-delayed nodes).

    Resets ``self.weight_list`` and ``self.nb_list``. When ``self.bad_node``
    is truthy, ``self.bad_node_nb`` freshly constructed ``ANN_model``
    instances contribute their initial weights first. Afterwards every path
    in ``self.nodes_p`` is trained through ``node_training_process`` and its
    weights, plus the matching ``get_nb_matrix`` result, are collected.
    Finally ``self.epo`` is incremented.

    Returns:
        Always ``True``.
    '''
    self.weight_list, self.nb_list = [], []

    if self.bad_node:
        print('bad node !!!!! number{}, size{}!!!!!!'.format(
            self.bad_node_nb, self.bad_node_size))
        for _ in range(self.bad_node_nb):
            # A model that was never trained stands in for the bad node.
            untrained_weights = ANN_model().get_weights()
            untrained_nb = get_nb_matrix(untrained_weights,
                                         self.bad_node_size)
            self.weight_list.append(untrained_weights)
            self.nb_list.append(untrained_nb)

    # Run each node and collect its weights.
    for node_index_path in self.nodes_p:
        node_weights, node_size = node_training_process(
            node_index_path,
            self.shared_index,
            self.central_p,
            self.local_epoch,
            self.batch_size,
            self.augument,
            self.local_iid,
            self.node_evl,
        )
        node_nb = get_nb_matrix(node_weights, node_size)
        self.weight_list.append(node_weights)
        self.nb_list.append(node_nb)
        # to solve memory leak 2/2
        tf.keras.backend.clear_session()

    self.epo += 1
    # memory(self.epo)  ## Testing memory usage
    return True
import glob
from models import ANN_model
from fedml import Fed_Training
import datetime

# Limit TensorFlow to 3000 MB on the first GPU. The guard keeps the script
# usable on hosts where no GPU is listed (indexing gpus[0] would otherwise
# raise IndexError before anything else runs).
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3000)])

# data
index_collection = glob.glob('worker_nodes/*/index.npy')
central_weight_path = os.path.join('central_node', 'ANN_model.h5')
(_, _), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# build central folder & start_up model
model = ANN_model()
shutil.rmtree('central_node', ignore_errors=True)
# exist_ok: rmtree ignores errors, so the folder may still be there.
os.makedirs('central_node', exist_ok=True)
model.save_weights(central_weight_path)

###################################################################################################
# Experiment settings.
# NOTE(review): presumably consumed by the Fed_Training set-up further down
# the script, which is outside this view — confirm.
my_EPO = 5000
loc_EPO = 1
early_STOP = 500
usual_node_NUMBER = 10
delayed_node_NUMBER = 0
shared_node_NUMBER = 0
delayed_SPEED = 1
my_AUGUMENT = False
# NOTE(review): the next three statements read `i`, so they look like the
# tail of a loop whose header lies outside the visible chunk — confirm the
# intended indentation against the full file.
index1 = index_list[i]
print(len(y_train[index1]))
print(np.unique(y_train[index1], return_counts=True))

##******************************************************************************
base_path = 'worker_nodes'
# save like 'worker_nodes/model1/index.npy'
workers = ['model' + str(int(i + 1)) for i in range(worker_nb)]

# Start from a clean tree, then write one index file per worker.
shutil.rmtree(base_path, ignore_errors=True)
for i, worker in enumerate(workers):
    worker_dir = os.path.join(base_path, worker)
    # exist_ok tolerates a directory that survived rmtree, while real
    # failures (e.g. permissions) are no longer silently swallowed.
    os.makedirs(worker_dir, exist_ok=True)
    file_path = os.path.join(worker_dir, 'index.npy')
    ## TODO: copy the training code of the worker to this dir
    np.save(file_path, index_list[i])

index_collection = glob.glob('worker_nodes/*/index.npy')
central_weight_path = os.path.join('central_node', 'ANN_model.h5')

# Build the central folder and store the start-up model weights in it.
from models import ANN_model
model = ANN_model()
shutil.rmtree('central_node', ignore_errors=True)
os.makedirs('central_node', exist_ok=True)
model.save_weights(central_weight_path)
def node_training_process(index_path,
                          shared_index,
                          central_weight_path,
                          local_epoch,
                          batch_size=50,
                          augment=False,
                          local_iid=False,
                          node_evl=False):
    '''
    Train one worker node for a round and hand its weights back.

    1. Get index and initial_weights from central,
    2. Load & prepare the dataset accordingly,
    3. Training,
    4. Return weights to central

    * In reality, it doesn't need index from central, it can read all local
      data like glob.glob('data_dir')
    * Saving node weights locally can be a safe way if node have many data,
      but here we just neglect this

    Args:
        index_path: Path of the node's index file (loaded with ``np.load``).
            Its directory also receives the evaluation log files.
        shared_index: Currently unused; the code that consumed it is
            commented out below.
        central_weight_path: Weights file loaded into the model before
            evaluation and training.
        local_epoch: Number of epochs passed to ``model.fit``.
        batch_size: Batch size of the training pipeline.
        augment: When truthy, ``img_augument`` is mapped over the samples.
        local_iid: When truthy, the node data is passed through ``set_iid``
            and the result is used for training.
        node_evl: When truthy, the loaded model is evaluated per class on the
            samples selected by ``evl_index.npy`` (next to the index file)
            and the accuracies are appended to
            ``node_EVAL_before_training.txt``.

    Returns:
        Tuple ``(model_weights, total_traning)``: the result of
        ``model.get_weights()`` after training and the number of training
        samples used.
    '''
    # THIS LINE SHOULD BE THE FIRST: every per-node file lives next to the
    # index file. dirname() replaces slicing off len('index.npy') characters.
    save_dir = os.path.dirname(index_path)

    g = tf.Graph()
    with g.as_default():  # tf.graph to solve memory leak
        # load & processing data
        (x_train, y_train), (x_test, y_test) = \
            tf.keras.datasets.cifar10.load_data()
        autotune = tf.data.experimental.AUTOTUNE

        # load index
        index1 = np.load(index_path)

        # assign node_evl_set (1/2): one batch-size-1 dataset per class
        if node_evl:
            evl_index = np.load(os.path.join(save_dir, 'evl_index.npy'))
            x_test_n = x_test[evl_index]
            y_test_n = y_test[evl_index]
            node_evl_list = []
            total_node_evl_list = []
            for class_id in range(10):
                # Row positions of the samples labelled class_id.
                class_rows = np.where(y_test_n == class_id)[0]
                x_evl = tf.data.Dataset.from_tensor_slices(
                    x_test_n[class_rows])
                y_evl = tf.data.Dataset.from_tensor_slices(
                    y_test_n[class_rows])
                node_evl_set = tf.data.Dataset.zip((x_evl, y_evl))
                node_evl_set = node_evl_set.repeat().batch(1).prefetch(
                    buffer_size=autotune)
                node_evl_list.append(node_evl_set)
                total_node_evl_list.append(len(class_rows))

        # if shared_index!=[]:
        #     shared_test_index = np.array([0])
        #     for x in shared_index:
        #         b=np.load(x)
        #         index1 = np.concatenate((index1, b))
        #         shared_test_index = np.concatenate((shared_test_index, b))
        #     shared_test_index = shared_test_index[1:]
        #     x_test_shared=x_train[shared_test_index]
        #     y_test_shared=y_train[shared_test_index]
        #     x_shared_evl=tf.data.Dataset.from_tensor_slices(x_test_shared)
        #     y_shared_evl=tf.data.Dataset.from_tensor_slices(y_test_shared)
        #     shared_evl_set = tf.data.Dataset.zip((x_shared_evl, y_shared_evl))
        #     shared_evl_set = shared_evl_set.repeat().batch(batch_size).prefetch(buffer_size=autotune)
        #     total_shared_evl = shared_test_index.shape[0]

        x_train_i = x_train[index1]
        y_train_i = y_train[index1]
        print(np.unique(y_train_i, return_counts=True))

        if -1 in index1:
            # NOTE(review): images and labels are re-drawn with two
            # independent random index lists, so they are no longer paired —
            # presumably a deliberate corrupted-node simulation; confirm.
            iii = [random.randint(0, 40000) for _ in range(len(index1))]
            x_train_i = x_train[iii]
            iii = [random.randint(0, 40000) for _ in range(len(index1))]
            y_train_i = y_train[iii]
            print(np.unique(y_train_i, return_counts=True))

        buffer_size = x_train_i.shape[0]
        # total_traning=index1.shape[0]
        x_tr = tf.data.Dataset.from_tensor_slices(x_train_i)
        y_tr = tf.data.Dataset.from_tensor_slices(y_train_i)
        total_traning = len(x_train_i)

        if local_iid:
            y_train_i2, x_train_i2 = set_iid(y_train_i, x_train_i)
            print(np.unique(y_train_i2, return_counts=True))
            # The sample count follows the data actually trained on.
            total_traning = len(x_train_i2)
            x_tr = tf.data.Dataset.from_tensor_slices(x_train_i2)
            y_tr = tf.data.Dataset.from_tensor_slices(y_train_i2)
            print(np.unique(y_train_i, return_counts=True))

        train_set = tf.data.Dataset.zip((x_tr, y_tr))
        if augment:
            train_set = train_set.map(img_augument)
        train_set = train_set.shuffle(
            buffer_size, reshuffle_each_iteration=True).repeat().batch(
                batch_size).prefetch(buffer_size=autotune)

        # The dataset repeats forever, so a step count is mandatory; keep it
        # at least 1 when the node holds fewer samples than one batch.
        steps = max(1, total_traning // batch_size)

        # Training & save
        model = ANN_model()
        model.load_weights(central_weight_path)

        # node_evl before training (2/2)
        if node_evl:
            filename = os.path.join(save_dir,
                                    'node_EVAL_before_training.txt')
            with open(filename, 'a') as file_handle:
                for class_set, class_total in zip(node_evl_list,
                                                  total_node_evl_list):
                    if class_total == 0:
                        # '200' is the value logged for a class without any
                        # evaluation sample on this node.
                        file_handle.write('200 ')
                    else:
                        # batch(1) above: one step per sample.
                        _, acc = model.evaluate(class_set,
                                                steps=class_total,
                                                verbose=0)
                        file_handle.write(str(acc) + ' ')
                file_handle.write('\n')

        # # see if overtrained over the shared index
        # if shared_index!=[]:
        #     [loss, acc]=model.evaluate(shared_evl_set,steps=total_shared_evl//batch_size,verbose=0)
        #     filename = os.path.join(save_dir,'shared_EVAL.txt')
        #     with open(filename,'a') as file_handle:
        #         file_handle.write(str(loss))
        #         file_handle.write(' ')
        #         file_handle.write(str(acc))
        #         file_handle.write('\n')

        # test the loaded model to see if it's overtrainned? mention it's
        # last epo's acc
        self_loss, self_acc = model.evaluate(train_set,
                                             steps=steps,
                                             verbose=0)
        filename = os.path.join(save_dir, 'self_EVAL.txt')
        with open(filename, 'a') as file_handle:
            file_handle.write(str(self_loss) + ' ' + str(self_acc) + '\n')

        model.fit(train_set,
                  epochs=local_epoch,
                  steps_per_epoch=steps,
                  verbose=0)

        # return model_weight
        model_weights = model.get_weights()
        del model

        # TODO: Change/add validation based on the split of data on the
        # worker node -----------> save locally in worker nodes.
        # And compare this weighted average to current one (a centralized
        # testing set)
        return model_weights, total_traning
def main():
    """Parse the command line, build data loaders and model, then train.

    Runs ``train`` followed by ``test`` once per epoch and logs through a
    ``SummaryWriter`` that is closed even when an epoch raises.
    """
    # Training settings. The help texts interpolate the real default via
    # %(default)s so they cannot drift from the values configured here.
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=48, metavar='N',
                        help='input batch size for training '
                             '(default: %(default)s)')
    parser.add_argument('--test-batch-size', type=int, default=48,
                        metavar='N',
                        help='input batch size for testing '
                             '(default: %(default)s)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train '
                             '(default: %(default)s)')
    parser.add_argument('--lr', type=float, default=1e-4, metavar='LR',
                        help='learning rate (default: %(default)s)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: %(default)s)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: %(default)s)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging '
                             'training status')
    # NOTE(review): store_true with default=True means this flag can never
    # be switched off from the command line — confirm that is intended.
    parser.add_argument('--save-net', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    writer = SummaryWriter('./summaries/cifar10')
    try:
        data_generation_parameters = \
            load_configuration_parameters.load_data_generation_config_paras()
        data_path = data_generation_parameters['output_directory']

        train_dataset = STORM_DVS(train=True, win=100, path=data_path,
                                  net_model='ANN')
        test_dataset = STORM_DVS(train=False, win=100, path=data_path,
                                 net_model='ANN')
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=True,
            **kwargs)
        test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=args.test_batch_size, shuffle=True,
            **kwargs)

        # NOTE(review): CUDA_VISIBLE_DEVICES is assigned after
        # torch.cuda.is_available() has already been called, and "cuda:6"
        # lies outside a two-device mask — confirm this matches the target
        # machine.
        os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
        device = torch.device("cuda:6" if use_cuda else "cpu")

        net = ANN_model.Unet_4x_ANN()
        # net = nn.DataParallel(net, device_ids=[2, 3])
        net = net.to(device)
        net.apply(init_weights)

        optimizer = optim.Adam(net.parameters(), lr=args.lr)

        # The first criterion is replaced immediately; only
        # MSE_and_L1_loss() reaches train/test.
        criterion = psf_loss(device)
        # criterion = psf_weighted_loss()
        criterion = MSE_and_L1_loss()
        # criterion = nn.CrossEntropyLoss()

        save_id = uuid.uuid4()
        for epoch in range(1, args.epochs + 1):
            train(args, net, device, train_loader, optimizer, epoch, writer,
                  criterion, save_id)
            test(args, net, device, test_loader, epoch, writer, criterion)
    finally:
        # Close the writer even if an epoch fails.
        writer.close()
def node_training_process(index_path,
                          shared_index,
                          central_weight_path,
                          local_epoch,
                          batch_size=50,
                          augment=False,
                          local_iid=False):
    '''
    Train one worker node for a round and hand its weights back.

    1. Get index and initial_weights from central,
    2. Load & prepare the dataset accordingly,
    3. Training,
    4. Return weights to central

    * In reality, it doesn't need index from central, it can read all local
      data like glob.glob('data_dir')
    * Saving node weights locally can be a safe way if node have many data,
      but here we just neglect this

    Args:
        index_path: Path of the node's index file (loaded with ``np.load``).
            ``self_EVAL.txt`` is appended in the same directory.
        shared_index: Iterable of further index files whose contents are
            appended to the node's own index; may be empty or ``None``.
        central_weight_path: Weights file loaded into the model before
            evaluation and training.
        local_epoch: Number of epochs passed to ``model.fit``.
        batch_size: Batch size of the training pipeline.
        augment: When truthy, ``img_augument`` is mapped over the samples.
        local_iid: When truthy, the node data is passed through ``set_iid``
            and the result is used for training.

    Returns:
        Tuple ``(model_weights, total_traning)``: the result of
        ``model.get_weights()`` after training and the number of training
        samples used.
    '''
    g = tf.Graph()
    with g.as_default():  # tf.graph to solve memory leak
        # load index (own samples first, then every shared index file)
        index_parts = [np.load(index_path)]
        index_parts.extend(np.load(x) for x in (shared_index or ()))
        index1 = np.concatenate(index_parts)

        # load & processing data
        (x_train, y_train), (_, _) = tf.keras.datasets.cifar10.load_data()
        x_train_i = x_train[index1]
        y_train_i = y_train[index1]

        autotune = tf.data.experimental.AUTOTUNE
        buffer_size = x_train_i.shape[0]
        total_traning = index1.shape[0]
        x_tr = tf.data.Dataset.from_tensor_slices(x_train_i)
        y_tr = tf.data.Dataset.from_tensor_slices(y_train_i)

        if local_iid:
            y_train_i2, x_train_i2 = set_iid(y_train_i, x_train_i)
            # Keep the step count and the returned sample count in line with
            # the data actually trained on.
            total_traning = len(x_train_i2)
            x_tr = tf.data.Dataset.from_tensor_slices(x_train_i2)
            y_tr = tf.data.Dataset.from_tensor_slices(y_train_i2)

        train_set = tf.data.Dataset.zip((x_tr, y_tr))
        if augment:
            train_set = train_set.map(img_augument)
        train_set = train_set.shuffle(
            buffer_size, reshuffle_each_iteration=True).repeat().batch(
                batch_size).prefetch(buffer_size=autotune)

        # The dataset repeats forever, so a step count is mandatory; keep it
        # at least 1 when the node holds fewer samples than one batch.
        steps = max(1, total_traning // batch_size)

        # Training & save
        # dirname() replaces slicing off len('index.npy') characters.
        save_dir = os.path.dirname(index_path)
        model = ANN_model()
        model.load_weights(central_weight_path)

        # test the loaded model to see if it's overtrainned? mention it's
        # last epo's acc
        self_loss, self_acc = model.evaluate(train_set,
                                             steps=steps,
                                             verbose=0)
        filename = os.path.join(save_dir, 'self_EVAL.txt')
        with open(filename, 'a') as file_handle:
            file_handle.write(str(self_loss) + ' ' + str(self_acc) + '\n')

        model.fit(train_set,
                  epochs=local_epoch,
                  steps_per_epoch=steps,
                  verbose=0)

        # return model_weight
        model_weights = model.get_weights()
        del model

        # TODO: Change/add validation based on the split of data on the
        # worker node -----------> save locally in worker nodes.
        # And compare this weighted average to current one (a centralized
        # testing set)
        return model_weights, total_traning  # index_path
def main():
    """Load a saved network and plot input, output and target for two batches.

    Parses the command line, builds the data loaders, restores the
    checkpoint into ``ANN_model.Unet_4x_ANN`` and plots the first two test
    batches through ``common_utils.plot_single_tensor_image``.
    """
    # Training settings. The help texts interpolate the real default via
    # %(default)s so they cannot drift from the values configured here.
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=1, metavar='N',
                        help='input batch size for training '
                             '(default: %(default)s)')
    parser.add_argument('--test-batch-size', type=int, default=1,
                        metavar='N',
                        help='input batch size for testing '
                             '(default: %(default)s)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train '
                             '(default: %(default)s)')
    parser.add_argument('--lr', type=float, default=1e-4, metavar='LR',
                        help='learning rate (default: %(default)s)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: %(default)s)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: %(default)s)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging '
                             'training status')
    # NOTE(review): store_true with default=True means this flag can never
    # be switched off from the command line — confirm that is intended.
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda:0" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # NOTE(review): the writer and the training loader are created but not
    # used anywhere in this function — kept to preserve side effects.
    writer = SummaryWriter('./summaries/cifar10')

    data_generation_parameters = \
        load_configuration_parameters.load_data_generation_config_paras()
    data_path = data_generation_parameters['output_directory']

    train_dataset = STORM_DVS(train=True, win=100, path=data_path,
                              net_model='ANN', Normalize='True')
    test_dataset = STORM_DVS(train=False, win=100, path=data_path,
                             net_model='ANN', Normalize='True')
    # train_dataset = STORM_DVS(train=True, win=100, path=data_path, net_model='SNN', Normalize = 'True')
    # test_dataset = STORM_DVS(train=False, win=100, path=data_path, net_model='SNN', Normalize = 'True')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs)

    # data_path = "E:\PHD\DVS_STORM_SOFI\DVS\DVS_data/"
    # net = model.enconder_decoder_4x_SNN().to(device)
    # net = ANN_model.resnet18().to(device)
    net = ANN_model.Unet_4x_ANN().to(device)
    # net = SNN_model.Unet_8x_SNN().to(device)

    # map_location lets a checkpoint saved on a GPU load on whatever device
    # was selected above (including CPU-only hosts).
    state = torch.load('./checkpoint/6c18065b-f314-4a09-b809-5a0bb14be388' +
                       'Decoder_4x_SNN' + '.t7',
                       map_location=device)
    net.load_state_dict(state['net'])

    # from collections import OrderedDict
    # def multi_GPU_net_load(model, check):
    #     new_state = OrderedDict()
    #     for layer_multi_GPU, name in state['net'].items():
    #         layer_single_gpu = layer_multi_GPU[7:]
    #         new_state[layer_single_gpu] = name
    #     model.load_state_dict(new_state)
    #     return model
    #
    # net = multi_GPU_net_load(net,state)

    # Inference only: switch layers such as dropout / batch norm to
    # evaluation behaviour and skip autograd bookkeeping.
    net.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            data = data.float()
            target = target.float()
            output = net(data)

            common_utils.plot_single_tensor_image(
                data[0, :, :, :].squeeze())  # for ANN
            # common_utils.plot_single_tensor_image(data[:, :, :, :, 0].squeeze())  # for SNN
            common_utils.plot_single_tensor_image(output.squeeze())
            common_utils.plot_single_tensor_image(target.squeeze())

            # Stop once batches 0 and 1 have been plotted.
            if batch_idx > 0:
                break