def _init_mpi(): """provides a way to manually set the thread init mode for MPI if necessary. Needs to happen as early as possible, otherwise mpi4py might auto-init somewhere else. """ try: import mpi4py except ImportError: return # only change finalize setting if unset finalize = (mpi4py.rc.finalize is None) or mpi4py.rc.finalize mpi4py.rc(initialize=False, finalize=finalize) from mpi4py import MPI if not MPI.Is_initialized(): required_level = int( os.environ.get('PYMOR_MPI_INIT_THREAD', MPI.THREAD_MULTIPLE)) supported_lvl = MPI.Init_thread(required_level) if supported_lvl < required_level: print( f'MPI does support threading level {required_level}, running with {supported_lvl} instead', flush=True) try: # this solves sporadic mpi calls happening after finalize import petsc4py petsc4py.init() except ImportError: return
def start_mpi(block_nonroot_stdout=True):
    """
    Initialise MPI if necessary and set up the module-level communicators.

    If MPI has already been initialized, only the communicators, ``Npus``
    and ``rank`` module globals are (re)assigned. Otherwise MPI is started
    with ``MPI.THREAD_MULTIPLE`` and ``MPI.Finalize`` is registered to run
    at interpreter exit.

    Parameters
    ----------
    block_nonroot_stdout : bool (True)
        Redirect stdout on nonzero ranks to the null device, for cleaner
        output.
    """
    global world_comm, node_comm, rank_comm, rank, Npus

    if not MPI.Is_initialized():
        MPI.Init_thread(MPI.THREAD_MULTIPLE)
        atexit.register(MPI.Finalize)

    world_comm = MPI.COMM_WORLD
    # Communicator grouping the ranks that share memory with this one.
    node_comm = world_comm.Split_type(MPI.COMM_TYPE_SHARED)
    # Communicator grouping ranks that have the same rank within node_comm.
    rank_comm = world_comm.Split(color=node_comm.rank)
    Npus = world_comm.Get_size()
    rank = world_comm.Get_rank()
    set_mpi_excepthook(world_comm)

    world_comm.Barrier()

    if rank != 0 and block_nonroot_stdout:  # pragma: no cover
        # For non-root ranks, do not print to stdout.
        # (Uncovered until we have multi-rank tests)
        import os

        # os.devnull rather than a literal '/dev/null' so this also works
        # on platforms whose null device has a different name.
        sys.stdout = open(os.devnull, 'w')
def main(self):
    """Start MPI with full thread support, then configure and run the solver.

    ``self.terminate()`` is called after printing an error if the MPI
    library does not provide ``MPI.THREAD_MULTIPLE``. The remaining set-up
    steps then run in order; the solver is initialised and started only
    when both ``read_controlfile()`` and ``load_solver()`` return a truthy
    value.
    """
    # Initialising MPI twice is an error, so skip it when something else
    # (e.g. mpi4py's import-time auto-init) has already done it.
    if not MPI.Is_initialized():
        MPI.Init_thread(MPI.THREAD_MULTIPLE)

    if MPI.Query_thread() != MPI.THREAD_MULTIPLE:
        # Single-argument print call: identical output on Python 2 and 3.
        print('ERROR: make sure MPI is configured with thread support')
        self.terminate()

    self.read_commandline()
    self.init_logger()
    self.show_banner()
    self.list_solvers()
    self.init_defaults()

    if self.read_controlfile() and self.load_solver():
        self.initialise_solver()
        self.start_solver()
def set_level(cls, level):
    """Set the parallelism level once; for ``LEVEL_2`` also start MPI.

    Parameters
    ----------
    level
        One of the values in ``cls.__LEVELS``.

    Raises
    ------
    ParallelismAlreadySet
        If a level has already been set.
    ValueError
        If ``level`` is not one of the recognised levels.
    """
    if cls.__LEVEL is not None:
        raise ParallelismAlreadySet(
            'Can not reset the parallelism level when it has already been set.'
        )
    if level not in cls.__LEVELS:
        raise ValueError(
            f'Unrecognized parallelism option! Valid choices are {cls.__LEVELS}'
        )

    cls.__LEVEL = level

    # Only start MPI (and take responsibility for finalising it) when nobody
    # else has initialised it; initialising MPI twice is an error.
    if level == LEVEL_2 and not MPI.Is_initialized():
        MPI.Init_thread()
        atexit.register(MPI.Finalize)
def __init__(
    self,
    run_function,
    num_workers: int = None,
    callbacks=None,
    run_function_kwargs=None,
    comm=None,
):
    """Set up the evaluator on top of an ``MPIPoolExecutor``.

    ``run_function``, ``num_workers``, ``callbacks`` and
    ``run_function_kwargs`` are forwarded unchanged to the parent
    initialiser. ``self.num_workers`` is then overwritten with the
    communicator size minus one, because one rank acts as the master.

    ``comm`` is the communicator to use; ``MPI.COMM_WORLD`` is used when it
    is falsy. MPI is initialised here if that has not happened yet.
    """
    super().__init__(run_function, num_workers, callbacks, run_function_kwargs)

    mpi_running = MPI.Is_initialized()
    if not mpi_running:
        MPI.Init_thread()

    self.comm = comm or MPI.COMM_WORLD

    # 1 rank is the master, the rest are workers.
    world_size = self.comm.Get_size()
    self.num_workers = world_size - 1
    self.sem = asyncio.Semaphore(self.num_workers)

    logging.info(
        f"Creating MPIPoolExecutor with {self.num_workers} max_workers...")
    self.executor = MPIPoolExecutor(max_workers=self.num_workers)
    logging.info("Creation of MPIPoolExecutor done")
# Script: drive one MPI initialise/finalise cycle by hand and check the
# state flags reported by mpi4py at each step.
import sys

from mpi4py import rc

# Has to be set before ``mpi4py.MPI`` is imported, otherwise the import
# itself would initialise MPI.
rc.initialize = False
from mpi4py import MPI

# Before Init: neither initialised nor finalised.
assert not (MPI.Is_initialized() or MPI.Is_finalized())

MPI.Init_thread()

# After Init: initialised, not yet finalised.
assert MPI.Is_initialized() and not MPI.Is_finalized()

# MPICH (and MPICH2 outside Windows) is expected to grant THREAD_MULTIPLE.
name, _ = MPI.get_vendor()
if name == 'MPICH' or (name == 'MPICH2'
                       and not sys.platform.startswith('win')):
    assert MPI.Query_thread() == MPI.THREAD_MULTIPLE

MPI.Finalize()

# After Finalize: both flags are set.
assert MPI.Is_initialized() and MPI.Is_finalized()
def __init__(self):
    """Attach to ``MPI.COMM_WORLD``, initialising MPI first if required.

    Stores the communicator as ``self.comm`` and caches its size and this
    process's rank as ``self.size`` and ``self.rank``.
    """
    mpi_running = MPI.Is_initialized()
    if not mpi_running:
        MPI.Init_thread()

    world = MPI.COMM_WORLD
    self.comm = world
    self.size = world.size
    self.rank = world.rank
import sys
import copy
import time
import os

import libessc as essc
#import testlib as essc
import numpy as np
from mpi4py import rc

# Disable mpi4py's automatic MPI initialisation on import so that the
# thread level can be requested explicitly below.
rc.initialize = False
from mpi4py import MPI

# Sanity checks around the manual initialisation.
assert not MPI.Is_initialized()
assert not MPI.Is_finalized()
MPI.Init_thread(MPI.THREAD_MULTIPLE)
assert MPI.Is_initialized()
assert not MPI.Is_finalized()

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
world_Size = comm.Get_size()
thisHost = os.getenv('HOSTNAME')  # None when HOSTNAME is not exported

# Only the root rank prints the banner. The single-argument print calls
# produce the same output on Python 2 and Python 3.
if rank == 0:
    print(Version())
    print(Description)
    essc.printInfo()

# Variables
seqFiles = []
ABseqFiles = []
def __init__(
    self,
    problem,
    run_function,
    random_state: int = None,
    log_dir: str = ".",
    verbose: int = 0,
    comm=None,
    run_function_kwargs: dict = None,
    n_jobs: int = 1,
    surrogate_model: str = "RF",
    surrogate_model_kwargs: dict = None,
    n_initial_points: int = 10,
    lazy_socket_allocation: bool = False,
    communication_batch_size=2048,
    sync_communication: bool = False,
    sync_communication_freq: int = 10,
    checkpoint_file: str = "results.csv",
    checkpoint_freq: int = 1,
    acq_func: str = "UCB",
    acq_optimizer: str = "auto",
    kappa: float = 1.96,
    xi: float = 0.001,
    sample_max_size: int = -1,
    sample_strategy: str = "quantile",
):
    """Set up per-rank search state, MPI communication and optimizer options.

    Args:
        problem: Object whose ``space`` attribute is the search space. A
            ``CS.ConfigurationSpace`` without forbidden clauses or
            conditions is converted with ``convert_to_skopt_space``; any
            other space is used as is.
        run_function: Stored as ``self._run_function``.
        random_state: An ``int`` seed, an existing
            ``np.random.RandomState``, or anything else to get a freshly
            seeded ``RandomState``.
        log_dir: Logging directory. It is created if missing, but its
            parent is not (``parents=False``).
        verbose: Stored as ``self._verbose``.
        comm: Communicator to use; ``MPI.COMM_WORLD`` when falsy.
        run_function_kwargs: Stored, with ``None`` replaced by ``{}``.
        n_jobs: Passed to ``_get_surrogate_model`` and to the acquisition
            optimizer options.
        surrogate_model: Passed to ``_get_surrogate_model``.
        surrogate_model_kwargs: Passed to ``_get_surrogate_model``.
        n_initial_points: Forwarded in ``self._opt_kwargs``.
        lazy_socket_allocation: When ``False``, a dummy message is exchanged
            with every other rank during construction.
        communication_batch_size: Stored as
            ``self._communication_batch_size``.
        sync_communication: Stored as ``self._sync_communication``.
        sync_communication_freq: Stored as
            ``self._sync_communication_freq``.
        checkpoint_file: Stored as ``self._checkpoint_file``.
        checkpoint_freq: Stored as ``self._checkpoint_freq``.
        acq_func: Acquisition function name, translated through
            ``MAP_acq_func`` when it has an entry. ``"qUCB"`` is replaced
            by ``"UCB"`` with a per-rank ``kappa``.
        acq_optimizer: ``"auto"`` resolves to ``"sampling"`` for ``"qUCB"``
            and to ``"boltzmann_sampling"`` otherwise.
        kappa: Forwarded in ``acq_func_kwargs``. For ``"qUCB"`` it is the
            scale of the exponential draw that yields this rank's kappa.
        xi: Forwarded in ``acq_func_kwargs``.
        sample_max_size: Forwarded in ``self._opt_kwargs``.
        sample_strategy: Forwarded in ``self._opt_kwargs``.
    """
    # get the __init__ parameters
    # (must stay the first statement so that only ``self`` and the
    # arguments are captured, not locals created further down)
    self._init_params = locals()

    self._call_args = []

    self._problem = problem

    self._run_function = run_function
    self._run_function_kwargs = ({} if run_function_kwargs is None else
                                 run_function_kwargs)

    # NOTE: ``self._seed`` is only assigned in the plain-``int`` branch.
    if type(random_state) is int:
        self._seed = random_state
        self._random_state = np.random.RandomState(random_state)
    elif isinstance(random_state, np.random.RandomState):
        self._random_state = random_state
    else:
        self._random_state = np.random.RandomState()

    # Create logging directory if does not exist
    self._log_dir = os.path.abspath(log_dir)
    pathlib.Path(log_dir).mkdir(parents=False, exist_ok=True)

    self._verbose = verbose

    # mpi
    if not MPI.Is_initialized():
        MPI.Init_thread()
    self._comm = comm if comm else MPI.COMM_WORLD
    self._rank = self._comm.Get_rank()
    self._size = self._comm.Get_size()
    self._communication_batch_size = communication_batch_size
    logging.info(f"DMBSMPI has {self._size} worker(s)")

    # force socket allocation with dummy message to reduce overhead
    if not lazy_socket_allocation:
        logging.info("Initializing communication...")
        ti = time.time()
        logging.info("Sending to all...")
        t1 = time.time()
        # One non-blocking send of ``None`` to every other rank.
        req_send = [
            self._comm.isend(None, dest=i, tag=TAG_INIT)
            for i in range(self._size)
            if i != self._rank
        ]
        # NOTE(review): all sends are waited on before any receive is
        # posted; this presumably relies on the MPI library buffering
        # these tiny messages -- confirm.
        MPI.Request.waitall(req_send)
        logging.info(f"Sending to all done in {time.time() - t1:.4f} sec.")

        logging.info("Receiving from all...")
        t1 = time.time()
        # Matching non-blocking receive from every other rank.
        req_recv = [
            self._comm.irecv(source=i, tag=TAG_INIT)
            for i in range(self._size)
            if i != self._rank
        ]
        MPI.Request.waitall(req_recv)
        logging.info(
            f"Receiving from all done in {time.time() - t1:.4f} sec.")
        logging.info(
            f"Initializing communications done in {time.time() - ti:.4f} sec."
        )

    # sync communication management
    self._sync_communication = sync_communication
    self._sync_communication_freq = sync_communication_freq

    # checkpointing
    self._checkpoint_size = 0
    self._checkpoint_file = checkpoint_file
    self._checkpoint_freq = checkpoint_freq

    # set random state for given rank
    # (draw one seed per rank and keep the entry for this rank)
    self._rank_seed = self._random_state.randint(
        low=0, high=2**32, size=self._size)[self._rank]

    self._timestamp = time.time()

    self._history = History()

    # Resolve the "auto" optimizer from the requested acquisition function.
    if acq_optimizer == "auto":
        if acq_func == "qUCB":
            acq_optimizer = "sampling"
        else:
            acq_optimizer = "boltzmann_sampling"

    # "qUCB" is run as plain "UCB" with a kappa drawn per rank from an
    # exponential distribution whose scale is the given ``kappa``.
    if acq_func == "qUCB":
        kappa = self._random_state.exponential(kappa,
                                               size=self._size)[self._rank]
        acq_func = "UCB"

    # check if it is possible to convert the ConfigSpace to standard skopt Space
    if (isinstance(self._problem.space, CS.ConfigurationSpace)
            and len(self._problem.space.get_forbiddens()) == 0
            and len(self._problem.space.get_conditions()) == 0):
        self._opt_space = convert_to_skopt_space(self._problem.space)
    else:
        self._opt_space = self._problem.space

    # The optimizer itself is not built here; only its keyword arguments
    # are prepared.
    self._opt = None
    self._opt_kwargs = dict(
        dimensions=self._opt_space,
        base_estimator=self._get_surrogate_model(
            surrogate_model,
            surrogate_model_kwargs,
            n_jobs,
        ),
        acq_func=MAP_acq_func.get(acq_func, acq_func),
        acq_func_kwargs={
            "xi": xi,
            "kappa": kappa
        },
        acq_optimizer=acq_optimizer,
        acq_optimizer_kwargs={
            "n_points": 10000,
            "boltzmann_gamma": 1,
            # "boltzmann_psucc": 1/self._size,
            "n_jobs": n_jobs,
        },
        n_initial_points=n_initial_points,
        random_state=self._rank_seed,
        sample_max_size=sample_max_size,
        sample_strategy=sample_strategy,
    )
def _setup_distributed():
    """Start MPI and the NCCL process group; return ``(comm, rank, size)``."""
    import mpi4py

    # Keep mpi4py from initialising MPI as a side effect of the import.
    mpi4py.rc.initialize = False
    from mpi4py import MPI  # noqa: E402

    # MPI may already be running if mpi4py.MPI was imported elsewhere
    # first; initialising it a second time is an error.
    if not MPI.Is_initialized():
        MPI.Init_thread()

    # get communicator: duplicate from comm world
    comm = MPI.COMM_WORLD.Dup()

    # now match ranks between the mpi comm and the nccl comm
    os.environ["WORLD_SIZE"] = str(comm.Get_size())
    os.environ["RANK"] = str(comm.Get_rank())

    # init pytorch
    dist.init_process_group(backend="nccl", init_method="env://")
    return comm, dist.get_rank(), dist.get_world_size()


def _make_split_loader(cfg, h5_file, split, shard_id, num_shards):
    """Build the dataset shard and its ``DataLoader`` for one data split."""
    dataset = get_dataset(
        cfg.dataset_location,
        h5_file,
        cfg.dataset_name,
        cfg.rmsd_name,
        cfg.fnc_name,
        cfg.num_points,
        cfg.num_features,
        split=split,
        shard_id=shard_id,
        num_shards=num_shards,
        normalize="box",
        cms_transform=False,
    )
    loader = DataLoader(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        drop_last=True,
        pin_memory=True,
        num_workers=cfg.num_data_workers,
    )
    return dataset, loader


def main(
    cfg: AAEModelConfig,
    encoder_gpu: int,
    generator_gpu: int,
    discriminator_gpu: int,
    distributed: bool,
):
    """Train the AAE model described by ``cfg`` and save the results.

    Args:
        cfg: Model, optimizer, dataset and output configuration.
        encoder_gpu: CUDA device index for the encoder. It is also made the
            process-wide default CUDA device.
        generator_gpu: CUDA device index for the generator.
        discriminator_gpu: CUDA device index for the discriminator.
        distributed: Request distributed training. It is only enabled when
            ``torch.distributed`` is available; the model is wrapped in DDP
            when the resulting world size is greater than one.
    """
    # Do some scaffolding for DDP
    comm_rank = 0
    comm_size = 1
    comm = None

    if distributed and dist.is_available():
        comm, comm_rank, comm_size = _setup_distributed()

    model_hparams = AAE3dHyperparams(
        num_features=cfg.num_features,
        encoder_filters=cfg.encoder_filters,
        encoder_kernel_sizes=cfg.encoder_kernel_sizes,
        generator_filters=cfg.generator_filters,
        discriminator_filters=cfg.discriminator_filters,
        latent_dim=cfg.latent_dim,
        encoder_relu_slope=cfg.encoder_relu_slope,
        generator_relu_slope=cfg.generator_relu_slope,
        discriminator_relu_slope=cfg.discriminator_relu_slope,
        use_encoder_bias=cfg.use_encoder_bias,
        use_generator_bias=cfg.use_generator_bias,
        use_discriminator_bias=cfg.use_discriminator_bias,
        noise_mu=cfg.noise_mu,
        noise_std=cfg.noise_std,
        lambda_rec=cfg.lambda_rec,
        lambda_gp=cfg.lambda_gp,
    )

    # optimizers
    optimizer_hparams = OptimizerHyperparams(name=cfg.optimizer_name,
                                             hparams={"lr": cfg.optimizer_lr})

    # Save hparams to disk and load initial weights and create virtual h5 file
    if comm_rank == 0:
        cfg.output_path.mkdir(exist_ok=True)
        model_hparams.save(cfg.output_path.joinpath("model-hparams.json"))
        optimizer_hparams.save(
            cfg.output_path.joinpath("optimizer-hparams.json"))
        init_weights = get_init_weights(cfg)
        h5_file, h5_files = get_h5_training_file(cfg)
        with open(cfg.output_path.joinpath("virtual-h5-metadata.json"),
                  "w") as f:
            json.dump(h5_files, f)
    else:
        init_weights, h5_file = None, None

    # Share what rank 0 prepared with the other ranks.
    if comm_size > 1:
        init_weights = comm.bcast(init_weights, 0)
        h5_file = comm.bcast(h5_file, 0)

    # construct model
    aae = AAE3d(
        cfg.num_points,
        cfg.num_features,
        cfg.batch_size,
        model_hparams,
        optimizer_hparams,
        gpu=(encoder_gpu, generator_gpu, discriminator_gpu),
        init_weights=init_weights,
    )

    enc_device = torch.device(f"cuda:{encoder_gpu}")
    if comm_size > 1:
        if (encoder_gpu == generator_gpu) and (encoder_gpu
                                               == discriminator_gpu):
            # All three sub-networks live on the same device.
            aae.model = DDP(aae.model,
                            device_ids=[enc_device],
                            output_device=enc_device)
        else:
            # Sub-networks are spread over several devices.
            aae.model = DDP(aae.model, device_ids=None, output_device=None)

    # set global default device
    torch.cuda.set_device(enc_device.index)

    if comm_rank == 0:
        # Display model
        print(aae)

    assert isinstance(h5_file, Path)

    # set up dataloaders
    train_dataset, train_loader = _make_split_loader(
        cfg, h5_file, "train", comm_rank, comm_size)
    valid_dataset, valid_loader = _make_split_loader(
        cfg, h5_file, "valid", comm_rank, comm_size)

    print(
        f"Having {len(train_dataset)} training and {len(valid_dataset)} validation samples."
    )

    wandb_config = setup_wandb(cfg, aae.model, comm_rank)

    # Optional callbacks
    loss_callback = LossCallback(cfg.output_path.joinpath("loss.json"),
                                 wandb_config=wandb_config,
                                 mpi_comm=comm)

    checkpoint_callback = CheckpointCallback(
        out_dir=cfg.output_path.joinpath("checkpoint"), mpi_comm=comm)

    save_callback = SaveEmbeddingsCallback(
        out_dir=cfg.output_path.joinpath("embeddings"),
        interval=cfg.embed_interval,
        sample_interval=cfg.sample_interval,
        mpi_comm=comm,
    )

    # TSNEPlotCallback requires SaveEmbeddingsCallback to run first
    tsne_callback = TSNEPlotCallback(
        out_dir=cfg.output_path.joinpath("embeddings"),
        projection_type="3d",
        target_perplexity=100,
        interval=cfg.tsne_interval,
        tsne_is_blocking=True,
        wandb_config=wandb_config,
        mpi_comm=comm,
    )

    # Train model with callbacks
    callbacks = [
        loss_callback,
        checkpoint_callback,
        save_callback,
        tsne_callback,
    ]

    # Optionally train for a different number of
    # epochs on the first DDMD iterations
    epochs = cfg.initial_epochs if cfg.stage_idx == 0 else cfg.epochs

    aae.train(train_loader, valid_loader, epochs, callbacks=callbacks)

    # Save loss history to disk.
    if comm_rank == 0:
        loss_callback.save(cfg.output_path.joinpath("loss.json"))

        # Save final model weights to disk
        # NOTE(review): written by rank 0 only, alongside the loss history;
        # confirm this is the intended scope of the rank guard.
        aae.save_weights(
            cfg.output_path.joinpath("encoder-weights.pt"),
            cfg.output_path.joinpath("generator-weights.pt"),
            cfg.output_path.joinpath("discriminator-weights.pt"),
        )