def processData(device_index, start_samples, samples, federated,
                full_data_size, number_of_batches, parameter_server,
                sample_distribution):
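    """Per-device training loop.

    Trains a local model on radar data and, depending on the flags, merges it
    with neighbor models via consensus (CFA) or with a parameter-server global
    model. State is checkpointed under 'results/' so a restarted process can
    resume from the last saved epoch.
    """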
    pause(5)  # PS server (if any) starts first
    checkpointpath1 = 'results/model{}.h5'.format(device_index)
    outfile = 'results/dump_train_variables{}.npz'.format(device_index)
    outfile_models = 'results/dump_train_model{}.npy'.format(device_index)
    global_model = 'results/model_global.npy'
    global_epoch = 'results/epoch_global.npy'

    np.random.seed(1)
    tf.random.set_seed(1)  # common initialization

    learning_rate = args.mu
    learning_rate_local = learning_rate

    B = np.ones((devices, devices)) - np.eye(devices)  # zero the diagonal
    Probabilities = B[device_index, :] / (devices - 1)
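    # Probabilities is a uniform distribution over all other devices,
    # intended for sampling consensus neighbors.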
    training_signal = False

    # check for backup variables on start
    if os.path.isfile(checkpointpath1):
        train_start = False

        # backup the model and the model target
        model = models.load_model(checkpointpath1)
        data_history = []
        label_history = []
        local_model_parameters = np.load(outfile_models, allow_pickle=True)
        model.set_weights(local_model_parameters.tolist())

        dump_vars = np.load(outfile, allow_pickle=True)
        frame_count = dump_vars['frame_count']
        epoch_loss_history = dump_vars['epoch_loss_history'].tolist()
        running_loss = np.mean(epoch_loss_history[-5:])
        epoch_count = dump_vars['epoch_count']
    else:
        train_start = True
        model = create_q_model()
        data_history = []
        label_history = []
        frame_count = 0
        # Experience replay buffers
        epoch_loss_history = []
        epoch_count = 0
        running_loss = math.inf

    if parameter_server:
        epoch_global = 0

    training_end = False

    # set an arbitrary optimizer, here Adam is used
    optimizer = keras.optimizers.Adam(learning_rate=args.mu, clipnorm=1.0)
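    # clipnorm=1.0 caps each gradient's norm to keep the updates stable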
    # create a data object (here radar data)
    start = time.time()
    data_handle = RadarData(filepath, device_index, start_samples, samples,
                            full_data_size, args.random_data_distribution)
    end = time.time()
    time_count = (end - start)
    print('Data handle created in {:.2f} s'.format(time_count))
    # create a consensus object
    cfa_consensus = CFA_process(devices, device_index, args.N)
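    # CFA_process implements the consensus-based federated averaging (CFA)
    # step used in the federated branch of the training loop below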

    while True:  # Run until solved
        # collect 1 batch
        frame_count += 1
        obs, labels = data_handle.getTrainingData(batch_size)
        data_batch = preprocess_observation(obs, batch_size)

        # Save data and labels in the current learning session
        data_history.append(data_batch)
        label_history.append(labels)
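        # a local update runs only once 'number_of_batches' batches have been
        # buffered (checked below via frame_count)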

        if frame_count % number_of_batches == 0:
            if not parameter_server:
                epoch_count += 1
            # check scheduling for federated
            if federated:
                if epoch_count == 1 or scheduling_tx[device_index,
                                                     epoch_count] == 1:
                    training_signal = False
                else:
                    # stop all computing, just save the previous model
                    training_signal = True
                    model_weights = np.asarray(model.get_weights())
                    model.save(checkpointpath1,
                               include_optimizer=True,
                               save_format='h5')
                    np.savez(outfile,
                             frame_count=frame_count,
                             epoch_loss_history=epoch_loss_history,
                             training_end=training_end,
                             epoch_count=epoch_count,
                             loss=running_loss)
                    np.save(outfile_models, model_weights)
            # check scheduling for parameter server
            if parameter_server:
                while not os.path.isfile(global_epoch):
                    # wait for the PS to publish the global epoch counter
                    print("waiting")
                    pause(1)
                try:
                    epoch_global = np.load(global_epoch, allow_pickle=True)
                except Exception:
                    # the file may be mid-write by the PS process; retry once
                    pause(5)
                    print("retrying opening global epoch counter")
                    try:
                        epoch_global = np.load(global_epoch, allow_pickle=True)
                    except Exception:
                        print("failed reading global epoch")

                if epoch_global == 0:
                    training_signal = False

                elif scheduling_tx[device_index, epoch_global] == 1:
                    if epoch_global > epoch_count:
                        epoch_count = epoch_global
                        training_signal = False
                    else:
                        training_signal = True
                else:
                    # stop all computing, just save the previous model
                    training_signal = True

                # always refresh the local model using the PS one
                stop_aggregation = False
                while not os.path.isfile(global_model):
                    # wait for the PS to publish the global model
                    print("waiting")
                    pause(1)
                try:
                    model_global = np.load(global_model, allow_pickle=True)
                except Exception:
                    # the file may be mid-write by the PS process; retry once
                    pause(5)
                    print("retrying opening global model")
                    try:
                        model_global = np.load(global_model, allow_pickle=True)
                    except Exception:
                        print("halting aggregation")
                        stop_aggregation = True

                if not stop_aggregation:
                    model.set_weights(model_global.tolist())

                if training_signal:
                    model_weights = np.asarray(model.get_weights())
                    model.save(checkpointpath1,
                               include_optimizer=True,
                               save_format='h5')
                    np.savez(outfile,
                             frame_count=frame_count,
                             epoch_loss_history=epoch_loss_history,
                             training_end=training_end,
                             epoch_count=epoch_count,
                             loss=running_loss)
                    np.save(outfile_models, model_weights)

        # Local learning update every "number of batches" batches
        time_count = 0
        if frame_count % number_of_batches == 0 and not training_signal:
            # run local batches
            for i in range(number_of_batches):
                start = time.time()
                data_sample = np.array(data_history[i])
                label_sample = np.array(label_history[i])

                # Create a mask to calculate loss
                masks = tf.one_hot(label_sample, n_outputs)
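                # the one-hot masks pick out, per sample, the model's score
                # for the true class; summing over axis 1 extracts that value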

                with tf.GradientTape() as tape:
                    # Train the model on data samples
                    classes = model(data_sample)
                    # Apply the masks
                    class_v = tf.reduce_sum(tf.multiply(classes, masks),
                                            axis=1)
                    # Calculate loss
                    loss = loss_function(label_sample, class_v)

                # Backpropagation
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))
                end = time.time()
                time_count = time_count + (end - start) / number_of_batches
            if not parameter_server and not federated:
                print('Average batch training time {:.2f}'.format(time_count))
            del data_history
            del label_history
            data_history = []
            label_history = []

            model_weights = np.asarray(model.get_weights())
            model.save(checkpointpath1,
                       include_optimizer=True,
                       save_format='h5')
            np.savez(outfile,
                     frame_count=frame_count,
                     epoch_loss_history=epoch_loss_history,
                     training_end=training_end,
                     epoch_count=epoch_count,
                     loss=running_loss)
            np.save(outfile_models, model_weights)

            #  Consensus round
            # update local model
            cfa_consensus.update_local_model(model_weights)
            # neighbor = cfa_consensus.get_connectivity(device_index, args.N, devices) # fixed neighbor

            if not train_start:
                if federated and not training_signal:
                    eps_c = 1 / (args.N + 1)
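                    # eps_c = 1/(N+1) gives equal mixing weight to the local
                    # model and each of its args.N neighbors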
                    # apply consensus for model parameter
                    # neighbor = np.random.choice(np.arange(devices), args.N, p=Probabilities, replace=False) # choose neighbor
                    neighbor = np.random.choice(
                        indexes_tx[:, epoch_count - 1], args.N,
                        replace=False)  # choose neighbor(s)
                    while device_index in neighbor:
                        # redraw if this device sampled itself
                        neighbor = np.random.choice(
                            indexes_tx[:, epoch_count - 1],
                            args.N,
                            replace=False)
                    print(
                        "Consensus from neighbor {} for device {}, local loss {:.2f}"
                        .format(neighbor, device_index, loss.numpy()))

                    model.set_weights(
                        cfa_consensus.federated_weights_computing(
                            neighbor, args.N, epoch_count, eps_c, max_lag))
                    if cfa_consensus.getTrainingStatusFromNeightbor():
                        # a neighbor reached the target loss, so transfer
                        # learning applies: reuse its model and stop local
                        # learning, keeping validation only
                        training_signal = True
            else:
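                # first round after a cold start: skip consensus so that every
                # device shares an already-trained model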
                print("Consensus warm up")
                train_start = False

            del model_weights

        # validation tool for device 'device_index'
        if epoch_count > validation_start and frame_count % number_of_batches == 0:
            avg_cost = 0.
            for i in range(number_of_batches_for_validation):
                obs_valid, labels_valid = data_handle.getTestData(
                    batch_size, i)
                # obs_valid, labels_valid = data_handle.getRandomTestData(batch_size)
                data_valid = preprocess_observation(np.squeeze(obs_valid),
                                                    batch_size)
                data_sample = np.array(data_valid)
                label_sample = np.array(labels_valid)
                # Create a mask to calculate loss
                masks = tf.one_hot(label_sample, n_outputs)
                classes = model(data_sample)
                # Apply the masks
                class_v = tf.reduce_sum(tf.multiply(classes, masks), axis=1)
                # Calculate loss
                loss = loss_function(label_sample, class_v)
                avg_cost += loss / number_of_batches_for_validation  # validation loss
            epoch_loss_history.append(avg_cost)
            print("Device {} epoch count {}, validation loss {:.2f}".format(
                device_index, epoch_count, avg_cost))
            # mean loss over the last 5 epochs (as in the resume logic above)
            running_loss = np.mean(epoch_loss_history[-5:])

        if running_loss < target_loss:  # Condition to consider the task solved
            print(
                "Solved for device {} at epoch {} with average loss {:.2f} !".
                format(device_index, epoch_count, running_loss))
            training_end = True
            model_weights = np.asarray(model.get_weights())
            model.save(checkpointpath1,
                       include_optimizer=True,
                       save_format='h5')
            # model_target.save(checkpointpath2, include_optimizer=True, save_format='h5')
            np.savez(outfile,
                     frame_count=frame_count,
                     epoch_loss_history=epoch_loss_history,
                     training_end=training_end,
                     epoch_count=epoch_count,
                     loss=running_loss)
            np.save(outfile_models, model_weights)

            if federated:
                dict_1 = {
                    "epoch_loss_history": epoch_loss_history,
                    "federated": federated,
                    "parameter_server": parameter_server,
                    "devices": devices,
                    "neighbors": args.N,
                    "active_devices": args.Ka_consensus,
                    "batches": number_of_batches,
                    "batch_size": batch_size,
                    "samples": samples,
                    "noniid": args.noniid_assignment,
                    "data_distribution": args.random_data_distribution
                }
            elif parameter_server:
                dict_1 = {
                    "epoch_loss_history": epoch_loss_history,
                    "federated": federated,
                    "parameter_server": parameter_server,
                    "devices": devices,
                    "active_devices": active_devices_per_round,
                    "batches": number_of_batches,
                    "batch_size": batch_size,
                    "samples": samples,
                    "noniid": args.noniid_assignment,
                    "data_distribution": args.random_data_distribution
                }
            else:
                dict_1 = {
                    "epoch_loss_history": epoch_loss_history,
                    "federated": federated,
                    "parameter_server": parameter_server,
                    "devices": devices,
                    "batches": number_of_batches,
                    "batch_size": batch_size,
                    "samples": samples,
                    "noniid": args.noniid_assignment,
                    "data_distribution": args.random_data_distribution
                }

            if federated:
                sio.savemat(
                    "results/matlab/CFA_device_{}_samples_{}_devices_{}_active_{}_neighbors_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                    .format(device_index, samples, devices, args.Ka_consensus,
                            args.N, number_of_batches, batch_size,
                            args.noniid_assignment, args.run,
                            args.random_data_distribution), dict_1)
                sio.savemat(
                    "CFA_device_{}_samples_{}_devices_{}_neighbors_{}_batches_{}_size{}.mat"
                    .format(device_index, samples, devices, args.N,
                            number_of_batches, batch_size), dict_1)
            elif parameter_server:
                sio.savemat(
                    "results/matlab/FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                    .format(device_index, samples, devices,
                            active_devices_per_round, number_of_batches,
                            batch_size, args.noniid_assignment, args.run,
                            args.random_data_distribution), dict_1)
                sio.savemat(
                    "FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}.mat"
                    .format(device_index, samples, devices,
                            active_devices_per_round, number_of_batches,
                            batch_size), dict_1)
            else:  # CL
                sio.savemat(
                    "results/matlab/CL_samples_{}_devices_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                    .format(samples, devices, number_of_batches, batch_size,
                            args.noniid_assignment, args.run,
                            args.random_data_distribution), dict_1)
            break

        if epoch_count > max_epochs:  # stop simulation
            print("Unsolved for device {} at epoch {}!".format(
                device_index, epoch_count))
            training_end = True
            model_weights = np.asarray(model.get_weights())
            model.save(checkpointpath1,
                       include_optimizer=True,
                       save_format='h5')
            # model_target.save(checkpointpath2, include_optimizer=True, save_format='h5')
            np.savez(outfile,
                     frame_count=frame_count,
                     epoch_loss_history=epoch_loss_history,
                     training_end=training_end,
                     epoch_count=epoch_count,
                     loss=running_loss)
            np.save(outfile_models, model_weights)

            if federated:
                dict_1 = {
                    "epoch_loss_history": epoch_loss_history,
                    "federated": federated,
                    "parameter_server": parameter_server,
                    "devices": devices,
                    "neighbors": args.N,
                    "active_devices": args.Ka_consensus,
                    "batches": number_of_batches,
                    "batch_size": batch_size,
                    "samples": samples,
                    "noniid": args.noniid_assignment,
                    "data_distribution": args.random_data_distribution
                }
            elif parameter_server:
                dict_1 = {
                    "epoch_loss_history": epoch_loss_history,
                    "federated": federated,
                    "parameter_server": parameter_server,
                    "devices": devices,
                    "active_devices": active_devices_per_round,
                    "batches": number_of_batches,
                    "batch_size": batch_size,
                    "samples": samples,
                    "noniid": args.noniid_assignment,
                    "data_distribution": args.random_data_distribution
                }
            else:
                dict_1 = {
                    "epoch_loss_history": epoch_loss_history,
                    "federated": federated,
                    "parameter_server": parameter_server,
                    "devices": devices,
                    "batches": number_of_batches,
                    "batch_size": batch_size,
                    "samples": samples,
                    "noniid": args.noniid_assignment,
                    "data_distribution": args.random_data_distribution
                }

            if federated:
                sio.savemat(
                    "results/matlab/CFA_device_{}_samples_{}_devices_{}_active_{}_neighbors_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                    .format(device_index, samples, devices, args.Ka_consensus,
                            args.N, number_of_batches, batch_size,
                            args.noniid_assignment, args.run,
                            args.random_data_distribution), dict_1)
                sio.savemat(
                    "CFA_device_{}_samples_{}_devices_{}_neighbors_{}_batches_{}_size{}.mat"
                    .format(device_index, samples, devices, args.N,
                            number_of_batches, batch_size), dict_1)
            elif parameter_server:
                sio.savemat(
                    "results/matlab/FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                    .format(device_index, samples, devices,
                            active_devices_per_round, number_of_batches,
                            batch_size, args.noniid_assignment, args.run,
                            args.random_data_distribution), dict_1)
                sio.savemat(
                    "FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}.mat"
                    .format(device_index, samples, devices,
                            active_devices_per_round, number_of_batches,
                            batch_size), dict_1)
            else:  # CL
                sio.savemat(
                    "results/matlab/CL_samples_{}_devices_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                    .format(samples, devices, number_of_batches, batch_size,
                            args.noniid_assignment, args.run,
                            args.random_data_distribution), dict_1)
            break
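
# Usage sketch (illustrative, not part of the original listing): each device
# would run processData in its own process, with the parameter-server process
# (if any) started first; processData itself pauses 5 s for that reason. The
# module-level globals (devices, start_samples, ...) are assumed to exist.
#
#   from multiprocessing import Process
#   workers = [Process(target=processData,
#                      args=(k, start_samples, samples, federated,
#                            full_data_size, number_of_batches,
#                            parameter_server, sample_distribution))
#              for k in range(devices)]
#   for w in workers:
#       w.start()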
Example #2
        # Experience replay buffers
        epoch_loss_history = []
        epoch_count = 0
        running_loss = math.inf

    training_end = False
    # set an arbitrary optimizer, here Adam is used
    optimizer = keras.optimizers.Adam(learning_rate=args.mu, clipnorm=1.0)
    # create a data object (here radar data)
    # data_handle = MnistData(device_index, start_index, training_set_per_device, validation_train, 0)
    if args.noniid_assignment == 1:
        data_handle = RadarData_tasks(filepath, device_index, start_index,
                                      training_set_per_device,
                                      validation_train, 0)
    else:
        data_handle = RadarData(filepath, device_index, start_index,
                                training_set_per_device, validation_train, 0)
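    # with noniid_assignment == 1, RadarData_tasks presumably draws a
    # task-specific (non-IID) partition; otherwise sampling is uniform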
    # create a consensus object (only for consensus)
    # cfa_consensus = CFA_process(device_index, args.N)

    model_weights = np.asarray(model.get_weights())
    layers = model_weights.size
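    # model_weights is an object array of per-layer arrays, so .size gives
    # the number of layers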

    del model_weights

    while epoch_count < local_rounds:
        frame_count += 1
        obs, labels = data_handle.getTrainingData(batch_size)
        data_batch = preprocess_observation(obs, batch_size)
        # Save data and labels in the current learning session
        data_history.append(data_batch)
        label_history.append(labels)