Example #1
    def fed_train_step(self):
        '''
        A single communication round (Upload weights for non-delayed nodes)
        '''
        self.weight_list = []
        self.nb_list = []

        if self.bad_node:
            print('Bad node(s): count={}, size={}'.format(
                self.bad_node_nb, self.bad_node_size))
            for i in range(self.bad_node_nb):
                bad_model = ANN_model()
                bad_weight = bad_model.get_weights()
                bad_model_w = get_nb_matrix(bad_weight, self.bad_node_size)
                self.weight_list.append(bad_weight)
                self.nb_list.append(bad_model_w)

        for index_path in self.nodes_p:
            # Run each node and collect its weights and sample count
            model_weights, nb = node_training_process(
                index_path, self.shared_index, self.central_p,
                self.local_epoch, self.batch_size, self.augument,
                self.local_iid, self.node_evl)
            model_w = get_nb_matrix(model_weights, nb)
            self.weight_list.append(model_weights)
            self.nb_list.append(model_w)
            # clear the Keras session to avoid a memory leak (2/2)
            tf.keras.backend.clear_session()

        self.epo += 1

        # memory(self.epo)  ## Testing memory usage

        return True
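The round above only collects self.weight_list and self.nb_list; the aggregation that turns them into new central weights is not shown in this excerpt. Below is a minimal sketch of a sample-size-weighted average (FedAvg-style), assuming each entry of the weight list is the plain list of NumPy arrays returned by model.get_weights() and that a per-node sample count (like the nb returned by node_training_process) is available; the helper name fed_average is hypothetical, not the repository's actual aggregation code.

# Hypothetical FedAvg-style aggregation (not the repository's own code):
# average each layer across nodes, weighted by the node's sample count.
def fed_average(weight_lists, sample_counts):
    total = float(sum(sample_counts))
    averaged = []
    for layer_weights in zip(*weight_lists):  # one tuple of arrays per layer
        averaged.append(sum(w * (n / total)
                            for w, n in zip(layer_weights, sample_counts)))
    return averaged

# usage sketch: new_central_weights = fed_average(self.weight_list, per_node_counts)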
Example #2
import os
import glob
import shutil
import datetime

import tensorflow as tf

from models import ANN_model
from fedml import Fed_Training


# cap TensorFlow's GPU memory usage at ~3 GB on the first GPU
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3000)])

# data
index_collection = glob.glob('worker_nodes/*/index.npy')
central_weight_path = os.path.join('central_node', 'ANN_model.h5')
(_, _), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# build central folder & start-up model
model = ANN_model()
shutil.rmtree('central_node', ignore_errors=True)
os.makedirs('central_node')
model.save_weights(central_weight_path)


###################################################################################################
my_EPO = 5000
loc_EPO = 1
early_STOP = 500
usual_node_NUMBER = 10
delayed_node_NUMBER = 0
shared_node_NUMBER = 0
delayed_SPEED = 1

my_AUGUMENT = False
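For orientation, a driver loop built on these constants might look like the sketch below. It assumes a trainer object exposing the fed_train_step() method shown in Example #1 plus a central_evaluate() helper; both names and signatures are assumptions, not the actual Fed_Training API.

# Hypothetical round loop (API names are assumptions): run up to my_EPO
# communication rounds and stop once test accuracy has not improved for
# early_STOP consecutive rounds.
best_acc, rounds_since_best = 0.0, 0
for rnd in range(my_EPO):
    trainer.fed_train_step()                         # one round, as in Example #1
    acc = trainer.central_evaluate(x_test, y_test)   # assumed evaluation helper
    if acc > best_acc:
        best_acc, rounds_since_best = acc, 0
    else:
        rounds_since_best += 1
    if rounds_since_best >= early_STOP:
        print('early stop at round', rnd)
        break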
Example #3
    index1 = index_list[i]
    print(len(y_train[index1]))
    print(np.unique(y_train[index1], return_counts=True))

##******************************************************************************

base_path = 'worker_nodes'
# each worker's index is saved like 'worker_nodes/model1/index.npy'
workers = ['model' + str(i + 1) for i in range(worker_nb)]
shutil.rmtree(base_path, ignore_errors=True)
for i in range(worker_nb):
    worker = workers[i]
    worker_dir = os.path.join(base_path, worker)
    os.makedirs(worker_dir, exist_ok=True)
    # TODO: copy the training code of the worker to this dir
    file_path = os.path.join(worker_dir, 'index.npy')
    np.save(file_path, index_list[i])

index_collection = glob.glob('worker_nodes/*/index.npy')
central_weight_path = os.path.join('central_node', 'ANN_model.h5')

from models import ANN_model

model = ANN_model()
shutil.rmtree('central_node', ignore_errors=True)
os.makedirs('central_node')
model.save_weights(central_weight_path)
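This excerpt consumes index_list without showing how it was built. One common way to produce such a per-node split from the CIFAR-10 labels is sketched below; the sorted-shard scheme and the helper name make_index_list are assumptions for illustration, not necessarily the split used in this project.

# Hypothetical non-IID split: sort indices by label and deal out equal-sized
# contiguous shards, so each worker sees only a few classes.
import numpy as np

def make_index_list(y_train, worker_nb, samples_per_worker):
    order = np.argsort(y_train.flatten())  # indices grouped by class
    return [order[i * samples_per_worker:(i + 1) * samples_per_worker]
            for i in range(worker_nb)]

# usage sketch: index_list = make_index_list(y_train, worker_nb, 5000)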
Example #4
def node_training_process(index_path,
                          shared_index,
                          central_weight_path,
                          local_epoch,
                          batch_size=50,
                          augment=False,
                          local_iid=False,
                          node_evl=False):
    '''
    1. Get the data index and the initial weights from the central node,
    2. Load & prepare the dataset accordingly,
    3. Train locally,
    4. Return the updated weights to the central node.
    * In reality a node would not need an index from the central node; it could
      simply read all of its local data, e.g. with glob.glob('data_dir').
    * Saving node weights locally would be safer when a node holds a lot of
      data, but that is skipped here.
    '''
    g = tf.Graph()
    with g.as_default():  # use a dedicated tf.Graph to avoid a memory leak

        # load & preprocess the data
        (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

        autotune = tf.data.experimental.AUTOTUNE

        # load index
        index1 = np.load(index_path)
        ori_traning = index1.shape[0]  # node's own data size

        # build per-class evaluation sets for this node (1/2)
        if node_evl:
            evl_p = index_path[:-9] + 'evl_index.npy'
            evl_index = np.load(evl_p)
            x_test_n = x_test[evl_index]
            y_test_n = y_test[evl_index]
            node_evl_list = []
            total_node_evl_list = []
            for i in range(10):
                index0 = np.where(y_test_n == i)
                index = index0[0]
                x_evl = tf.data.Dataset.from_tensor_slices(x_test_n[index])
                y_evl = tf.data.Dataset.from_tensor_slices(y_test_n[index])
                node_evl_set = tf.data.Dataset.zip((x_evl, y_evl))
                node_evl_set = node_evl_set.repeat().batch(1).prefetch(
                    buffer_size=autotune)
                total_node_evl = len(index)
                node_evl_list.append(node_evl_set)
                total_node_evl_list.append(total_node_evl)

        # if shared_index!=[]:
        #     shared_test_index = np.array([0])
        #     for x in shared_index:
        #         b=np.load(x)
        #         index1 = np.concatenate((index1, b))
        #         shared_test_index = np.concatenate((shared_test_index, b))
        #     shared_test_index = shared_test_index[1:]
        #     x_test_shared=x_train[shared_test_index]
        #     y_test_shared=y_train[shared_test_index]
        #     x_shared_evl=tf.data.Dataset.from_tensor_slices(x_test_shared)
        #     y_shared_evl=tf.data.Dataset.from_tensor_slices(y_test_shared)
        #     shared_evl_set = tf.data.Dataset.zip((x_shared_evl, y_shared_evl))
        #     shared_evl_set = shared_evl_set.repeat().batch(batch_size).prefetch(buffer_size=autotune)
        #     total_shared_evl = shared_test_index.shape[0] ###################################

        x_train_i = x_train[index1]
        y_train_i = y_train[index1]

        print(np.unique(y_train_i, return_counts=True))  # debug: class distribution

        # if the index contains -1, replace this node's data with randomly drawn
        # samples; note that images and labels are drawn with independent random
        # indices here
        if -1 in index1:
            iii = [random.randint(0, 40000) for i in range(len(index1))]
            x_train_i = x_train[iii]
            iii = [random.randint(0, 40000) for i in range(len(index1))]
            y_train_i = y_train[iii]
            print(np.unique(y_train_i, return_counts=True))  # debug: class distribution

        buffer_size = x_train_i.shape[0]
        # total_traning=index1.shape[0]

        x_tr = tf.data.Dataset.from_tensor_slices(x_train_i)
        y_tr = tf.data.Dataset.from_tensor_slices(y_train_i)
        total_traning = len(x_train_i)

        if local_iid:
            y_train_i2, x_train_i2 = set_iid(y_train_i, x_train_i)
            print(np.unique(y_train_i2, return_counts=True))  # debug: class distribution
            total_traning = len(x_train_i2)
            x_tr = tf.data.Dataset.from_tensor_slices(x_train_i2)
            y_tr = tf.data.Dataset.from_tensor_slices(y_train_i2)

        print(np.unique(y_train_i, return_counts=True))  # debug: class distribution
        train_set = tf.data.Dataset.zip((x_tr, y_tr))
        if augment:
            train_set = train_set.map(img_augument).shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)
        else:
            train_set = train_set.shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)

        # Training & save -- save_dir must be set before anything below uses it
        save_dir = index_path[:-9]  # strip 'index.npy' to get the node's folder

        model = ANN_model()
        model.load_weights(central_weight_path)

        # node_evl before training (2/2)
        if node_evl:
            filename = os.path.join(save_dir, 'node_EVAL_before_training.txt')
            with open(filename, 'a') as file_handle:
                for i in range(10):
                    if total_node_evl_list[i] == 0:
                        # sentinel value when this node has no test samples of class i
                        file_handle.write('200 ')
                    else:
                        loss, acc = model.evaluate(
                            node_evl_list[i],
                            steps=total_node_evl_list[i],
                            verbose=0)
                        file_handle.write(str(acc) + ' ')
                file_handle.write('\n')

        # # see if overtrained over the shared index
        # if shared_index!=[]:
        #     [loss, acc]=model.evaluate(shared_evl_set,steps=total_shared_evl//batch_size,verbose=0)
        #     filename = os.path.join(save_dir,'shared_EVAL.txt')
        #     with open(filename,'a') as file_handle:
        #             file_handle.write(str(loss))
        #             file_handle.write(' ')
        #             file_handle.write(str(acc))
        #             file_handle.write('\n')

        # evaluate the incoming central weights on this node's own training data
        # (this reflects the accuracy from the previous communication round)
        self_loss, self_acc = model.evaluate(train_set,
                                             steps=total_traning // batch_size,
                                             verbose=0)
        filename = os.path.join(save_dir, 'self_EVAL.txt')
        with open(filename, 'a') as file_handle:
            file_handle.write(str(self_loss))
            file_handle.write(' ')
            file_handle.write(str(self_acc))
            file_handle.write('\n')

        history = model.fit(train_set,
                            epochs=local_epoch,
                            steps_per_epoch=total_traning // batch_size,
                            verbose=0)

        # return model_weight
        model_weights = model.get_weights()

    del model

    # TODO: Change/add validation based on the split of data on the worker node -----------> save locally in worker nodes.
    #       And compare this weighted average to current one (a centralized testing set)

    return model_weights, total_traning
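img_augument is referenced in the tf.data pipeline above but not defined in this excerpt. A plausible implementation for CIFAR-10, mapped over each (image, label) element of the zipped dataset, is sketched below; the specific augmentations are an assumption.

# Hypothetical img_augument: light CIFAR-10 augmentation (random horizontal
# flip plus pad-and-crop), applied per (image, label) element.
def img_augument(image, label):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.resize_with_crop_or_pad(image, 36, 36)  # pad 32x32 -> 36x36
    image = tf.image.random_crop(image, size=[32, 32, 3])
    return image, label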
Example #5
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch STORM_DVS training example')
    parser.add_argument('--batch-size', type=int, default=48, metavar='N',
                        help='input batch size for training (default: 48)')
    parser.add_argument('--test-batch-size', type=int, default=48, metavar='N',
                        help='input batch size for testing (default: 48)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr', type=float, default=1e-4, metavar='LR',
                        help='learning rate (default: 1e-4)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-net', action='store_true', default=True,
                        help='save the current model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)


    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    writer = SummaryWriter('./summaries/cifar10')

    data_generation_parameters = load_configuration_parameters.load_data_generation_config_paras()
    data_path = data_generation_parameters['output_directory']

    train_dataset = STORM_DVS(train=True,  win=100, path=data_path, net_model='ANN')
    test_dataset = STORM_DVS(train=False,  win=100, path=data_path, net_model='ANN')

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    # with CUDA_VISIBLE_DEVICES='2,3' the visible GPUs re-index as cuda:0/cuda:1,
    # so select cuda:0 here (ideally set this env var before CUDA is first queried)
    os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
    device = torch.device("cuda:0" if use_cuda else "cpu")
    net = ANN_model.Unet_4x_ANN()
    # net = nn.DataParallel(net, device_ids=[2, 3])
    net = net.to(device)
    net.apply(init_weights)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    # criterion = psf_loss(device)
    # criterion = psf_weighted_loss()
    criterion = MSE_and_L1_loss()
    # criterion = nn.CrossEntropyLoss()
    save_id = uuid.uuid4()
    for epoch in range(1, args.epochs + 1):
        train(args, net, device, train_loader, optimizer, epoch, writer, criterion, save_id)
        test(args, net, device, test_loader, epoch, writer, criterion)

    writer.close()
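init_weights is applied through net.apply(init_weights) above but not shown. A typical implementation, given here as an assumption rather than the project's actual code, initialises convolution and linear layers explicitly:

# Hypothetical init_weights: Kaiming init for conv layers, Xavier for linear
# layers; net.apply() calls this once for every submodule.
import torch.nn as nn

def init_weights(m):
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)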
Example #6
def node_training_process(index_path,
                          shared_index,
                          central_weight_path,
                          local_epoch,
                          batch_size=50,
                          augment=False,
                          local_iid=False):
    '''
    1. Get the data index and the initial weights from the central node,
    2. Load & prepare the dataset accordingly,
    3. Train locally,
    4. Return the updated weights to the central node.
    * In reality a node would not need an index from the central node; it could
      simply read all of its local data, e.g. with glob.glob('data_dir').
    * Saving node weights locally would be safer when a node holds a lot of
      data, but that is skipped here.
    '''
    g = tf.Graph()
    with g.as_default():  # use a dedicated tf.Graph to avoid a memory leak

        # load index
        index1 = np.load(index_path)

        if shared_index:  # append shared data indices to this node's own index
            for x in shared_index:
                b = np.load(x)
                index1 = np.concatenate((index1, b))

        # load & preprocess the data
        (x_train, y_train), (_, _) = tf.keras.datasets.cifar10.load_data()

        x_train_i = x_train[index1]
        y_train_i = y_train[index1]

        autotune = tf.data.experimental.AUTOTUNE
        buffer_size = x_train_i.shape[0]
        total_traning = index1.shape[0]

        x_tr = tf.data.Dataset.from_tensor_slices(x_train_i)
        y_tr = tf.data.Dataset.from_tensor_slices(y_train_i)

        if local_iid:
            y_train_i2, x_train_i2 = set_iid(y_train_i, x_train_i)
            total_traning = len(x_train_i2)  # keep step count consistent with the IID subset
            x_tr = tf.data.Dataset.from_tensor_slices(x_train_i2)
            y_tr = tf.data.Dataset.from_tensor_slices(y_train_i2)

        train_set = tf.data.Dataset.zip((x_tr, y_tr))
        if augment:
            train_set = train_set.map(img_augument).shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)
        else:
            train_set = train_set.shuffle(
                buffer_size, reshuffle_each_iteration=True).repeat().batch(
                    batch_size).prefetch(buffer_size=autotune)

        # Training & save
        save_dir = index_path[:-9]  # strip 'index.npy' to get the node's folder

        model = ANN_model()
        model.load_weights(central_weight_path)

        # evaluate the incoming central weights on this node's own training data
        # (this reflects the accuracy from the previous communication round)
        self_loss, self_acc = model.evaluate(train_set,
                                             steps=total_traning // batch_size,
                                             verbose=0)
        filename = os.path.join(save_dir, 'self_EVAL.txt')
        with open(filename, 'a') as file_handle:
            file_handle.write(str(self_loss))
            file_handle.write(' ')
            file_handle.write(str(self_acc))
            file_handle.write('\n')

        history = model.fit(train_set,
                            epochs=local_epoch,
                            steps_per_epoch=total_traning // batch_size,
                            verbose=0)

        # return model_weight
        model_weights = model.get_weights()

    del model

    # TODO: Change/add validation based on the split of data on the worker node -----------> save locally in worker nodes.
    #       And compare this weighted average to current one (a centralized testing set)

    return model_weights, total_traning
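set_iid is called in both node_training_process variants but never defined in these excerpts. One plausible reading, sketched below as an assumption, is that it rebalances the node's local data to a uniform class distribution by down-sampling every class to the size of the rarest one.

# Hypothetical set_iid: keep min_count samples per class so the node's local
# set has a uniform (IID-like) label distribution; returns (labels, images).
import numpy as np

def set_iid(y_train_i, x_train_i):
    labels = y_train_i.flatten()
    classes, counts = np.unique(labels, return_counts=True)
    min_count = counts.min()
    keep = np.concatenate(
        [np.where(labels == c)[0][:min_count] for c in classes])
    np.random.shuffle(keep)
    return y_train_i[keep], x_train_i[keep]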


# index_path
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch STORM_DVS inference example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=1,
                        metavar='N',
                        help='input batch size for training (default: 1)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1,
                        metavar='N',
                        help='input batch size for testing (default: 1)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-4,
                        metavar='LR',
                        help='learning rate (default: 1e-4)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=True,
                        help='save the current model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda:0" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    writer = SummaryWriter('./summaries/cifar10')

    data_generation_parameters = load_configuration_parameters.load_data_generation_config_paras(
    )
    data_path = data_generation_parameters['output_directory']

    train_dataset = STORM_DVS(train=True,
                              win=100,
                              path=data_path,
                              net_model='ANN',
                              Normalize='True')
    test_dataset = STORM_DVS(train=False,
                             win=100,
                             path=data_path,
                             net_model='ANN',
                             Normalize='True')

    # train_dataset = STORM_DVS(train=True,  win=100, path=data_path, net_model='SNN', Normalize = 'True')
    # test_dataset = STORM_DVS(train=False,  win=100, path=data_path, net_model='SNN', Normalize = 'True')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    # data_path = "E:\PHD\DVS_STORM_SOFI\DVS\DVS_data/"
    # net = model.enconder_decoder_4x_SNN().to(device)
    # net = ANN_model.resnet18().to(device)
    net = ANN_model.Unet_4x_ANN().to(device)
    # net = SNN_model.Unet_8x_SNN().to(device)

    state = torch.load('./checkpoint/6c18065b-f314-4a09-b809-5a0bb14be388' +
                       'Decoder_4x_SNN' + '.t7')
    net.load_state_dict(state['net'])

    # from collections import OrderedDict
    # def multi_GPU_net_load(model, check):
    #     new_state = OrderedDict()
    #     for layer_multi_GPU, name in state['net'].items():
    #         layer_single_gpu = layer_multi_GPU[7:]
    #         new_state[layer_single_gpu] = name
    #     model.load_state_dict(new_state)
    #     return model
    #
    # net = multi_GPU_net_load(net,state)

    # run inference on a couple of batches and visualise input / output / target
    net.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            data = data.float()
            target = target.float()
            output = net(data)
            common_utils.plot_single_tensor_image(
                data[0, :, :, :].squeeze())  # for ANN
            # common_utils.plot_single_tensor_image(data[:, :, :, :, 0].squeeze())  # for SNN
            common_utils.plot_single_tensor_image(output.squeeze())
            common_utils.plot_single_tensor_image(target.squeeze())

            if batch_idx > 0:
                break