Example #1
def main(process_id, worker_number, args):
    """Run one MPI process of distributed FedAvg for text classification.

    Args:
        process_id: Rank of this process (0 is the central server).
        worker_number: Total number of processes (server + workers).
        args: Parsed experiment configuration (model/data settings,
            training hyper-parameters, GPU-mapping file and key, ...).

    Side effects:
        Configures logging, loads the dataset, builds a Transformer
        classification model, and enters FedML_FedAvg_distributed.
    """
    # Map this process to a GPU device; check "gpu_mapping.yaml" to see
    # how to define the topology.
    device = mapping_processes_to_gpu_device_from_yaml_file(
        process_id, worker_number, args.gpu_mapping_file,
        args.gpu_mapping_key)
    logging.info("process_id = %d, size = %d, device=%s" %
                 (process_id, worker_number, str(device)))

    # Silence the verbose HuggingFace "transformers" logger.
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Loading full data (for centralized learning) plus the per-client
    # partitions used by the federated algorithm.
    (train_data_num, test_data_num, train_data_global, test_data_global,
     train_data_local_num_dict, train_data_local_dict, test_data_local_dict,
     data_attr) = load_data(args, args.dataset)

    labels_map = data_attr["target_vocab"]
    num_labels = len(labels_map)

    logging.info(
        "num_clients = %d, train_data_num = %d, test_data_num = %d, num_labels = %d"
        % (data_attr["n_clients"], train_data_num, test_data_num, num_labels))

    # Create a ClassificationModel wrapping the pretrained Transformer.
    transformer_model = ClassificationModel(
        args.model_type,
        args.model_name,
        num_labels=num_labels,
        labels_map=labels_map,
        args={
            "epochs": args.epochs,
            "learning_rate": args.learning_rate,
            "gradient_accumulation_steps": args.gradient_accumulation_steps,
            "do_lower_case": args.do_lower_case,
            "manual_seed": args.manual_seed,
            "reprocess_input_data": True,
            "overwrite_output_dir": True,
            "max_seq_length": args.max_seq_length,
            "train_batch_size": args.train_batch_size,
            "eval_batch_size": args.eval_batch_size,
            "dataloader_num_workers": 1,
            "thread_count": 1,
            "use_multiprocessing": False,
            "fp16": args.fp16,
            "n_gpu": args.n_gpu,
            "output_dir": args.output_dir,
            #   "wandb_project": "fednlp",
        })

    model_trainer = TransformerTrainer(transformer_model=transformer_model,
                                       task_formulation="classification")

    # Start the FedAvg algorithm; for the distributed algorithm,
    # train_data_global and test_data_global are required.
    # NOTE(review): `comm` is not defined in this function — presumably an
    # MPI communicator from the enclosing module; confirm before refactoring.
    FedML_FedAvg_distributed(process_id, worker_number, device, comm,
                             transformer_model, train_data_num,
                             train_data_global, test_data_global,
                             train_data_local_num_dict, train_data_local_dict,
                             test_data_local_dict, args, model_trainer)
Example #2
    # NOTE(review): this is the body of a function whose `def` line is not
    # visible in this chunk; process_id, worker_number, args, and comm are
    # presumably its parameters / outer-scope names — confirm in the full file.
    #
    # Workers are placed round-robin over the machine list, e.g.:
    # machine 1: worker0, worker4, worker8;
    # machine 2: worker1, worker5;
    # machine 3: worker2, worker6;
    # machine 4: worker3, worker7;
    # Therefore, we can see that workers are assigned according to the order of machine list.
    logging.info("process_id = %d, size = %d" % (process_id, worker_number))
    # worker_number - 1: presumably excludes the rank-0 server from the
    # GPU-worker count — TODO confirm against init_training_device.
    device = init_training_device(process_id, worker_number - 1,
                                  args.gpu_num_per_server)

    # Load data: an 8-element sequence unpacked below.
    dataset = load_data(args, args.dataset)
    [
        train_data_num, test_data_num, train_data_global, test_data_global,
        train_data_local_num_dict, train_data_local_dict, test_data_local_dict,
        class_num
    ] = dataset

    # Create model.
    # Note if the model is DNN (e.g., ResNet), the training will be very slow.
    # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg)
    # dataset[7] is class_num — the last element unpacked above.
    model = create_model(args, model_name=args.model, output_dim=dataset[7])

    # Define a custom trainer for the classification task.
    model_trainer = ClassificationTrainer(model)

    # Start "federated averaging (FedAvg)".
    FedML_FedAvg_distributed(process_id, worker_number, device, comm, model,
                             train_data_num, train_data_global,
                             test_data_global, train_data_local_num_dict,
                             train_data_local_dict, test_data_local_dict, args,
                             model_trainer)
Example #3
def main(process_id, worker_number, args):
    """Run one MPI process of distributed FedAvg for sequence tagging (NER).

    Args:
        process_id: Rank of this process (0 is the central server).
        worker_number: Total number of processes (server + workers).
        args: Parsed experiment configuration (model/data settings,
            training hyper-parameters, GPU-mapping file and key, ...).

    Side effects:
        Configures logging, loads the dataset, builds an NER Transformer
        model, and enters FedML_FedAvg_distributed.
    """
    # GPU arrangement: Please customize this function according your own topology.
    # The GPU server list is configured at "mpi_host_file".
    # If we have 4 machines and each has two GPUs, and your FL network has
    # 8 workers and a central worker, the 4 machines will be assigned as:
    # machine 1: worker0, worker4, worker8;
    # machine 2: worker1, worker5;
    # machine 3: worker2, worker6;
    # machine 4: worker3, worker7;
    # Therefore, we can see that workers are assigned according to the order of machine list.
    device = mapping_processes_to_gpu_device_from_yaml_file(
        process_id, worker_number, args.gpu_mapping_file,
        args.gpu_mapping_key)
    logging.info("process_id = %d, size = %d, device=%s" %
                 (process_id, worker_number, str(device)))

    # Silence the verbose HuggingFace "transformers" logger.
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Loading full data (for centralized learning) plus the per-client
    # partitions used by the federated algorithm.
    (train_data_num, test_data_num, train_data_global, test_data_global,
     train_data_local_num_dict, train_data_local_dict, test_data_local_dict,
     data_attr) = load_data(args, args.dataset)

    logging.info("num_clients = %d, train_data_num = %d, test_data_num = %d" %
                 (data_attr["n_clients"], train_data_num, test_data_num))

    # Create an NERModel (the original comment claimed a question-answering
    # model, which was wrong — this is a sequence-tagging model).
    transformer_model = NERModel(
        args.model_type,
        args.model_name,
        args={
            "epochs": args.epochs,
            "learning_rate": args.learning_rate,
            "gradient_accumulation_steps": args.gradient_accumulation_steps,
            "do_lower_case": args.do_lower_case,
            "manual_seed": args.manual_seed,
            "reprocess_input_data": True,
            "overwrite_output_dir": True,
            "max_seq_length": args.max_seq_length,
            "train_batch_size": args.train_batch_size,
            "eval_batch_size": args.eval_batch_size,
            "dataloader_num_workers": 1,
            "thread_count": 1,
            "use_multiprocessing": False,
            "fp16": args.fp16,
            "n_gpu": args.n_gpu,
            "output_dir": args.output_dir,
            #   "wandb_project": "fednlp",
        })

    model_trainer = TransformerTrainer(transformer_model=transformer_model,
                                       task_formulation="sequence_tagging")

    # Start the FedAvg algorithm; for the distributed algorithm,
    # train_data_global and test_data_global are required.
    # NOTE(review): `comm` is not defined in this function — presumably an
    # MPI communicator from the enclosing module; confirm before refactoring.
    FedML_FedAvg_distributed(process_id, worker_number, device, comm,
                             transformer_model, train_data_num,
                             train_data_global, test_data_global,
                             train_data_local_num_dict, train_data_local_dict,
                             test_data_local_dict, args, model_trainer)
Example #4
    # NOTE(review): this is the body of a function whose `def` line is not
    # visible in this chunk; epochs, opt, hyp, optimizer, ema, wandb, comm,
    # model and the data variables presumably come from its parameters or
    # earlier lines — confirm in the full file.
    #
    # Total optimization epochs across the whole federated run:
    # local epochs per round times the number of communication rounds.
    total_epochs = epochs * opt.comm_round

    # Cosine-annealing LR factor from 1.0 down to hyp['lrf'] over total_epochs.
    lf = lambda x: ((1 + math.cos(x * math.pi / total_epochs)) / 2) * (1 - hyp[
        'lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    # Stash the training state on the opt namespace so downstream code
    # receiving `opt` can reach it.
    opt.scheduler = scheduler
    opt.optimizer = optimizer
    opt.ema = ema

    opt.hyp = hyp  # add hyperparameters

    opt.wandb = wandb
    device = init_training_device(process_id, worker_number - 1,
                                  opt.gpu_num_per_server)
    # start "federated averaging (FedAvg)"
    # NOTE(review): duplicated progress message — the same string is
    # printed again inside the try block just below.
    print("start distributed")

    try:
        # start "federated averaging (FedAvg)"
        print("start distributed")
        FedML_FedAvg_distributed(process_id, worker_number, device, comm,
                                 model, train_data_num, train_data_global,
                                 test_data_global, train_data_local_num_dict,
                                 train_data_local_dict, test_data_local_dict,
                                 opt, None, True, hyp)
    except Exception as e:
        # Log the full traceback, then abort the whole MPI job —
        # presumably so the other ranks do not hang waiting on this one.
        print(e)
        logging.info('traceback.format_exc():\n%s' % traceback.format_exc())
        MPI.COMM_WORLD.Abort()