def generate_test_sets(trials, N_test, Delta, d, generator=MaxCallStochasticModel):
    """Draw `trials` independent test sets via the train-set utility.

    Each trial calls ``generate_train_set(N_test, Delta, d, generator)`` and the
    per-trial arrays are stacked along a new leading axis.

    :return: tuple ``(X_tests, y_tests)`` — numpy arrays whose first axis has
        length ``trials`` (per-trial shapes are whatever ``generate_train_set``
        produces, i.e. (N_test, Delta*d) and (N_test, 1) respectively).
    """
    # One (X, y) pair per trial; transpose the pair-list into two tuples,
    # then stack each tuple into a single array with a new leading axis.
    draws = [generate_train_set(N_test, Delta, d, generator) for _ in range(trials)]
    xs, ys = zip(*draws)
    return np.stack(xs, axis=0), np.stack(ys, axis=0)
## create logger logger = generate_logger_MPI(LOGFILE, LOGLEVEL, rank) logger.info(f"node with rank {rank} started") if rank == 0: """ executed by main MPI process mpiexec -n <num_nodes> python -m mpi4py.futures mpi\mpi_bagging.py will create 1 dispatcher node with rank 0 and num_node-1 workers for the pool """ ## generate Training set, Test set & V_0s X_train, y_train = generate_train_set(Config.N_train, Config.Delta, Config.d) X_test, y_test = generate_test_set(Config.N_test, Config.Delta, Config.d) V_0_train = generate_V_0(Config.N_train, Config.Delta, Config.d) V_0_test = generate_V_0(Config.N_test, Config.Delta, Config.d) logger.info(f"V_0_test = {V_0_test}") reference = create_GPR(Config.N_train) reference.fit(X_train, y_train) f_X = reference.predict(X_test) reference_error = normalized_error_VT(f_X, y_test, V_0_test) logger.info(f"reference error : {reference_error}") ## MPI execute results = []
    # NOTE(review): tail of a function whose `def` starts before this chunk —
    # evaluates one model against the shared test sets held in DataContainer.
    return train_and_evaluate(model, X_train, y_train, DataContainer.X_test_list)


## init
# Module-level MPI setup executed by every rank.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
logger = generate_logger_MPI(LOGFILE, LOGLEVEL, rank)
logger.info(f"node with rank {rank}/{size} started")

## let the main task create the train & testsets
# Only rank 0 generates data; all other ranks receive it via the broadcasts below
# (their DataContainer buffers are presumably preallocated elsewhere, as Ibcast
# needs a receive buffer on non-root ranks — confirm).
if rank == 0:
    logger.info(f"creating train & testsets")
    DataContainer.X_train, DataContainer.y_train = generate_train_set(
        Config.N_train, Config.Delta, Config.d)
    DataContainer.X_test_list, DataContainer.y_test_list = generate_test_sets(
        Config.trials, Config.N_test, Config.Delta, Config.d)

## broadcast the required data to all nodes
# broadcast the numpy arrays separately for efficiency gains
# make broadcasts non blocking since the worker nodes are spawned at different times
# https://github.com/mpi4py/mpi4py/blob/70333ef76db05f643347b9880a05967891fb1eed/src/mpi4py/MPI/Comm.pyx#L750
# this feature is not documented in the documentation, but the source code clearly indicates it is present
xtrain_req = comm.Ibcast(DataContainer.X_train, root=0)
ytrain_req = comm.Ibcast(DataContainer.y_train, root=0)
# NOTE(review): the name says "ytest" but X_test_list is what gets broadcast,
# and y_test_list is never broadcast in the visible chunk — verify whether the
# workers genuinely only need X_test_list or whether this is a copy-paste slip.
ytest_req = comm.Ibcast(DataContainer.X_test_list, root=0)

if rank > 0:
    # want the broadcast to be blocking since we need the data before continuing
    # (the Wait calls on the requests above presumably follow past this chunk).
    logger.debug(f"waiting for broadcasts")