def run(self):
    """Run the machine master for the subdomains handled by this object.

    Logs startup details (PID, start time, command line and, when it can be
    determined, the git revision of the source tree), instantiates the
    simulation class, maps subdomains to GPUs, sets up connectors and
    output, selects the first available compute backend and then runs the
    subdomain runner(s) -- in-process when ``config.debug_single_process``
    is set, through ``self._run_subprocesses`` otherwise.

    :returns: ``self._quit_event.is_set()`` after the runners have
        finished; None if no compute backend could be initialized
    """
    logger = self.config.logger
    logger.info('Machine master starting with PID {0} at {1}'.format(
        os.getpid(), time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())))
    logger.info('Simulation started with: {0}'.format(self.config.cmdline))

    # Log Sailfish version if running from a git repository.
    sailfish_root_dir = os.path.join(
        os.path.realpath(os.path.dirname(__file__)), '..')
    try:
        # Run git directly in the target directory instead of going through
        # a shell string; a failed 'cd' can then no longer make git report
        # the revision of whatever repository the process was started in.
        git_hash = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'], cwd=sailfish_root_dir,
            universal_newlines=True).rstrip()
    except (subprocess.CalledProcessError, OSError):
        # Best effort only: not a git checkout, or git is not available.
        pass
    else:
        logger.info('Sailfish version: {0}'.format(git_hash))

    logger.info('Handling subdomains: {0}'.format(
        [b.id for b in self.subdomain_specs]))

    self.sim = self.lb_class(self.config)
    subdomain2gpu = self._assign_subdomains_to_gpus()
    logger.info('Subdomain -> GPU map: {0}'.format(subdomain2gpu))

    ipc_files = self._init_connectors()
    output_initializer = self._init_visualization_and_io(self.sim)

    try:
        # The next() builtin works on both Python 2.6+ and Python 3.
        backend_cls = next(
            util.get_backends(self.config.backends.split(',')))
    except StopIteration:
        logger.error('Failed to initialize compute backend.'
                     ' Make sure pycuda/pyopencl is installed.')
        # NOTE(review): this early return skips _finish_visualization()
        # and the removal of ipc_files done at the end of this method.
        return

    logger.info('Selected backend: {0}'.format(backend_cls.name))

    if self.config.debug_single_process:
        assert len(self.subdomain_specs) == 1, (
            'Only a single subdomain can be '
            'simulated in the single process mode.')

        subdomain = self.subdomain_specs[0]
        output = output_initializer(subdomain)
        self.runner = _start_subdomain_runner(
            subdomain, self.config, self.sim,
            len(self.subdomain_specs), backend_cls,
            subdomain2gpu[subdomain.id], output, self._quit_event,
            None, self._channel is not None)
        logger.debug('Finished single process subdomain runner.')
    else:
        self._run_subprocesses(output_initializer, backend_cls,
                               subdomain2gpu)

    self._finish_visualization()

    for ipcfile in ipc_files:
        os.unlink(ipcfile)

    return self._quit_event.is_set()
def run(self):
    """Run the machine master for the subdomains handled by this object.

    Logs startup details (PID, start time, command line and, when it can be
    determined, the git revision of the source tree), instantiates the
    simulation class, maps subdomains to GPUs, sets up connectors and
    output, selects the first available compute backend and then runs the
    subdomain runner(s) -- in-process when ``config.debug_single_process``
    is set, through ``self._run_subprocesses`` otherwise.

    :returns: ``self._quit_event.is_set()`` after the runners have
        finished; None if no compute backend could be initialized
    """
    log = self.config.logger
    log.info('Machine master starting with PID {0} at {1}'.format(
        os.getpid(), time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())))
    log.info('Simulation started with: {0}'.format(self.config.cmdline))

    # Log Sailfish version if running from a git repository.
    sailfish_root_dir = os.path.join(
        os.path.realpath(os.path.dirname(__file__)), '..')
    try:
        # Invoke git with an argument list and an explicit working
        # directory rather than a '%'-formatted shell command: the path is
        # never parsed by a shell, and a missing directory cannot silently
        # fall back to the current working directory.
        git_hash = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'], cwd=sailfish_root_dir,
            universal_newlines=True).rstrip()
    except (subprocess.CalledProcessError, OSError):
        # Best effort only: not a git checkout, or git is not available.
        pass
    else:
        log.info('Sailfish version: {0}'.format(git_hash))

    log.info('Handling subdomains: {0}'.format(
        [b.id for b in self.subdomain_specs]))

    self.sim = self.lb_class(self.config)
    subdomain2gpu = self._assign_subdomains_to_gpus()
    log.info('Subdomain -> GPU map: {0}'.format(subdomain2gpu))

    ipc_files = self._init_connectors()
    output_initializer = self._init_visualization_and_io(self.sim)

    try:
        backend_cls = next(
            util.get_backends(self.config.backends.split(',')))
    except StopIteration:
        log.error('Failed to initialize compute backend.'
                  ' Make sure pycuda/pyopencl is installed.')
        # NOTE(review): this early return skips _finish_visualization()
        # and the removal of ipc_files done at the end of this method.
        return

    log.info('Selected backend: {0}'.format(backend_cls.name))

    if self.config.debug_single_process:
        assert len(self.subdomain_specs) == 1, (
            'Only a single subdomain can be '
            'simulated in the single process mode.')

        subdomain = self.subdomain_specs[0]
        output = output_initializer(subdomain)
        self.runner = _start_subdomain_runner(
            subdomain, self.config, self.sim,
            len(self.subdomain_specs), backend_cls,
            subdomain2gpu[subdomain.id], output, self._quit_event,
            None, self._channel is not None)
        log.debug('Finished single process subdomain runner.')
    else:
        self._run_subprocesses(output_initializer, backend_cls,
                               subdomain2gpu)

    self._finish_visualization()

    for ipcfile in ipc_files:
        os.unlink(ipcfile)

    return self._quit_event.is_set()
def run(self):
    """Run the machine master for the subdomains handled by this object.

    Logs the PID and the handled subdomain IDs, instantiates the simulation
    class, maps subdomains to GPUs, sets up connectors and output, selects
    the first available compute backend and then runs the subdomain
    runner(s) -- in-process when ``config.debug_single_process`` is set
    (the runner's return value is kept in ``self.runner``), through
    ``self._run_subprocesses`` otherwise.

    :returns: ``self._quit_event.is_set()`` after the runners have
        finished; None if no compute backend could be initialized
    """
    self.config.logger.info(
        'Machine master starting with PID {0}'.format(os.getpid()))
    self.config.logger.info('Handling subdomains: {0}'.format(
        [b.id for b in self.subdomains]))

    self.sim = self.lb_class(self.config)
    subdomain2gpu = self._assign_subdomains_to_gpus()
    self.config.logger.info(
        'Subdomain -> GPU map: {0}'.format(subdomain2gpu))

    ipc_files = self._init_connectors()
    output_initializer = self._init_visualization_and_io(self.sim)

    try:
        # Use the next() builtin rather than the Python 2-only .next()
        # method so the lookup works on both Python 2.6+ and Python 3.
        backend_cls = next(
            util.get_backends(self.config.backends.split(',')))
    except StopIteration:
        self.config.logger.error(
            'Failed to initialize compute backend.'
            ' Make sure pycuda/pyopencl is installed.')
        # NOTE(review): this early return skips _finish_visualization()
        # and the removal of ipc_files done at the end of this method.
        return

    if self.config.debug_single_process:
        assert len(self.subdomains) == 1, (
            'Only a single subdomain can be '
            'simulated in the single process mode.')

        subdomain = self.subdomains[0]
        output = output_initializer(subdomain)
        self.runner = _start_subdomain_runner(
            subdomain, self.config, self.sim, len(self.subdomains),
            backend_cls, subdomain2gpu[subdomain.id], output,
            self._quit_event, None, self._channel is not None)
        self.config.logger.debug(
            'Finished single process subdomain runner.')
    else:
        self._run_subprocesses(output_initializer, backend_cls,
                               subdomain2gpu)

    self._finish_visualization()

    for ipcfile in ipc_files:
        os.unlink(ipcfile)

    return self._quit_event.is_set()
def run(self):
    """Run the machine master for the subdomains handled by this object.

    Logs the PID and the handled subdomain IDs, instantiates the simulation
    class, maps subdomains to GPUs, sets up connectors and output, selects
    the first available compute backend and then runs the subdomain
    runner(s) -- in-process when ``config.debug_single_process`` is set,
    through ``self._run_subprocesses`` otherwise.

    :returns: ``self._quit_event.is_set()`` after the runners have
        finished; None if no compute backend could be initialized
    """
    logger = self.config.logger
    logger.info('Machine master starting with PID {0}'.format(os.getpid()))
    logger.info('Handling subdomains: {0}'.format(
        [b.id for b in self.subdomains]))

    self.sim = self.lb_class(self.config)
    subdomain2gpu = self._assign_subdomains_to_gpus()
    logger.info('Subdomain -> GPU map: {0}'.format(subdomain2gpu))

    ipc_files = self._init_connectors()
    output_initializer = self._init_visualization_and_io(self.sim)

    try:
        # Use the next() builtin rather than the Python 2-only .next()
        # method so the lookup works on both Python 2.6+ and Python 3.
        backend_cls = next(
            util.get_backends(self.config.backends.split(',')))
    except StopIteration:
        logger.error('Failed to initialize compute backend.'
                     ' Make sure pycuda/pyopencl is installed.')
        # NOTE(review): this early return skips _finish_visualization()
        # and the removal of ipc_files done at the end of this method.
        return

    if self.config.debug_single_process:
        assert len(self.subdomains) == 1, (
            'Only a single subdomain can be '
            'simulated in the single process mode.')

        subdomain = self.subdomains[0]
        output = output_initializer(subdomain)
        # The runner's return value is intentionally not stored here.
        _start_subdomain_runner(
            subdomain, self.config, self.sim, len(self.subdomains),
            backend_cls, subdomain2gpu[subdomain.id], output,
            self._quit_event, None, self._channel is not None)
        logger.debug('Finished single process subdomain runner.')
    else:
        self._run_subprocesses(output_initializer, backend_cls,
                               subdomain2gpu)

    self._finish_visualization()

    for ipcfile in ipc_files:
        os.unlink(ipcfile)

    return self._quit_event.is_set()
def __init__(self, lb_class, lb_geo=None, default_config=None):
    """Set up the configuration parser and register all command line options.

    Creates the config parser, picks a default global geometry class when
    none is given, creates a temporary directory, and registers the option
    groups: runtime mode, checkpointing, benchmarking, simulation-specific,
    geometry, code generator, per-backend and (unless a non-visualization
    mode is preselected via ``default_config``) per-visualization-engine
    options. Finally applies the defaults provided by ``lb_class`` and then
    those from ``default_config``, so the latter take precedence.

    :param lb_class: class describing the simulation, derived from LBSim
    :param lb_geo: class describing the global geometry in terms of
            SubdomainSpec, derived from LBGeometry
    :param default_config: dictionary mapping command line option names
            to their new default values
    """
    self._config_parser = config.LBConfigParser()
    self._lb_class = lb_class

    # Use a default global geometry if one has not been
    # specified explicitly.
    if lb_geo is None:
        if self.dim == 2:
            lb_geo = LBGeometry2D
        else:
            lb_geo = LBGeometry3D

    self._lb_geo = lb_geo
    self._tmpdir = tempfile.mkdtemp()

    group = self._config_parser.add_group('Runtime mode settings')
    group.add_argument('--mode', help='runtime mode', type=str,
        choices=['batch', 'visualization', 'benchmark'], default='batch')
    group.add_argument('--every',
        help='save/visualize simulation results every N iterations ',
        metavar='N', type=int, default=100)
    group.add_argument('--from', dest='from_',
        help='save/visualize simulation results from N iterations ',
        metavar='N', type=int, default=0)
    group.add_argument('--perf_stats_every',
        help='how often to display performance stats',
        metavar='N', type=int, default=1000)
    group.add_argument('--max_iters',
        help='number of iterations to run; use 0 to run indefinitely',
        type=int, default=0)
    group.add_argument('--init_iters',
        help='number of iterations to use for the '
        'initialization phase. If <= 0, no initialization '
        'phase is run.', type=int, default=0)
    group.add_argument('--output',
        help='save simulation results to FILE', metavar='FILE',
        type=str, default='')
    group.add_argument('--output_format',
        help='output format', type=str,
        choices=list(io.format_name_to_cls.keys()), default='npy')
    # The first help fragment ends with a space so that the concatenated
    # text does not read "filesif".
    group.add_argument('--nooutput_compress', dest='output_compress',
        action='store_false', default=True,
        help='stores the output in compressed files '
        'if the selected format supports it')
    group.add_argument('--backends', type=str, default='cuda,opencl',
        help='computational backends to use; multiple backends '
        'can be separated by a comma')
    group.add_argument('--vis_engine', type=str, default='pygame',
        help='visualization engine to use')
    group.add_argument('--gpus', nargs='+', default=0, type=int,
        help='which GPUs to use')
    group.add_argument('--debug_dump_dists', action='store_true',
        default=False, help='dump the contents of the distribution '
        'arrays to files')
    group.add_argument('--debug_single_process', action='store_true',
        default=False,
        help='If True, will run the controller, master and subdomain '
        'runner in a single process, which can be helpful for '
        'debugging.')
    group.add_argument('--debug_dump_node_type_map', action='store_true',
        default=False, help='Dump the contents of the node type map '
        'into a file')
    group.add_argument('--base_name', type=str, default='',
        help='Specifies the base file name that will be used for '
        'logging, checkpoint and data output. This makes it '
        'possible to avoid specifying --log, --output, and '
        '--checkpoint_file separately. Whenever some of '
        'these options are specified, their value takes '
        'precedence over the one automatically generated '
        'using --base_name.')
    group.add_argument('--log', type=str, default='',
        help='name of the file to which data is to be logged')
    group.add_argument('--loglevel', type=int, default=logging.INFO,
        help='minimum log level for the file logger')
    group.add_argument('--nobulk_boundary_split',
        dest='bulk_boundary_split', action='store_false', default=True,
        help='Disable separate handling of bulk and '
        'boundary nodes (increases parallelism)')
    group.add_argument('--cluster_spec', type=str, default='',
        help='path of a Python module with the cluster specification')
    group.add_argument('--cluster_sync', type=str, default='',
        help='local_path:dest_path; if specified, will send the '
        'contents of "local_path" to "dest_path" on all cluster '
        'machines before starting the simulation.')
    group.add_argument('--nocluster_pbs', action='store_false',
        default=True, dest='cluster_pbs',
        help='If True, standard PBS variables will be used to run '
        'the job in a cluster.')
    group.add_argument('--cluster_pbs_initscript', type=str,
        default='sailfish-init.sh', help='Script to execute on remote '
        'nodes in order to set the environment prior to starting '
        'a machine master.')
    group.add_argument('--cluster_interface', type=str,
        dest='cluster_interface', default='',
        help='Network interface to use on PBS/LSF nodes for '
        'internode communication.')
    group.add_argument('--nocluster_lsf', action='store_false',
        default=True, dest='cluster_lsf', help='If True, '
        'standard LSF variables will be used to run the job '
        'in a cluster.')
    group.add_argument('--nofdust', action='store_false',
        default=True, dest='fdust', help='If True, will use '
        'logical GPUs 0..n via libfairydust (if present).')
    group.add_argument('--nocheck_invalid_results_host',
        action='store_false', dest='check_invalid_results_host',
        default=True, help='If True, will terminate the simulation if '
        'the results obtained on the host contain invalid values '
        '(inf, nan).')
    group.add_argument('--nocheck_invalid_results_gpu',
        action='store_false', dest='check_invalid_results_gpu',
        default=True, help='If True, will terminate the simulation '
        'when invalid values (inf, nan) are detected in the domain '
        'during the simulation.')
    group.add_argument('--compress_intersubdomain_data',
        action='store_true', default=False, help='Uses blosc to '
        'compress data exchanged between subdomains. Can improve '
        'performance in distributed simulations limited by bandwidth '
        'available between computational nodes.')
    # The default seed is evaluated once, when this parser is built.
    group.add_argument('--seed', type=int, default=int(time.time()),
        help='PRNG seed value')

    group = self._config_parser.add_group('Checkpointing')
    group.add_argument('--checkpoint_file', type=str, help='Location of '
        'the checkpoint file.', metavar='PATH', default='')
    group.add_argument('--single_checkpoint', action='store_true',
        default=False,
        help='If True, only a single checkpoint file will be '
        'generated. If multiple checkpoints are requested, only the '
        'last one will be retained. This is useful together with '
        '--check_invalid_results_gpu')
    group.add_argument('--restore_from', type=str, metavar='PATH',
        help='Location of a checkpoint file from which to start the '
        'simulation.', default='')
    group.add_argument('--norestore_time', action='store_false',
        dest='restore_time', default=True,
        help='If True, and simulation time'
        ' will be restored when reading a checkpoint.')
    group.add_argument('--final_checkpoint', action='store_true',
        default=False, help='Generates a checkpoint after the simulation '
        'is completed.')
    group.add_argument('--checkpoint_every', type=int, default=0,
        metavar='N', help='Generates a checkpoint every N steps.')
    group.add_argument('--checkpoint_from', type=int, default=0,
        metavar='N', help='Starts generating checkpoints after N '
        'steps of the simulation have been completed.')

    group = self._config_parser.add_group('Benchmarking')
    group.add_argument('--benchmark_sample_from', type=int,
        default=1000, metavar='N', help='Start sampling performance '
        'data at N-th iteration.')
    group.add_argument('--benchmark_minibatch', type=int, default=50,
        help='Number of simulation steps used for batching '
        'for purposes of standard deviation calculation.')

    # Every class in the MRO that defines its own add_options() gets to
    # register options; checking __dict__ avoids calling an inherited
    # implementation more than once.
    group = self._config_parser.add_group('Simulation-specific settings')
    for base in lb_class.mro():
        if 'add_options' in base.__dict__:
            base.add_options(group, self.dim)

    group = self._config_parser.add_group('Geometry settings')
    lb_geo.add_options(group)

    group = self._config_parser.add_group('Code generator options')
    codegen.BlockCodeGenerator.add_options(group)

    # Backend options
    for backend in util.get_backends():
        group = self._config_parser.add_group(
            "'{0}' backend options".format(backend.name))
        backend.add_options(group)

    # Do not try to import visualization engine modules if we already
    # know that the simulation will be running in batch mode.
    if (default_config is None or 'mode' not in default_config or
            default_config['mode'] == 'visualization'):
        for engine in util.get_visualization_engines():
            group = self._config_parser.add_group(
                "'{0}' visualization engine".format(engine.name))
            engine.add_options(group)

    # Set default values defined by the simulation-specific class.
    defaults = {}
    lb_class.update_defaults(defaults)
    self._config_parser.set_defaults(defaults)

    # Caller-supplied defaults are applied last so they take precedence.
    if default_config is not None:
        self._config_parser.set_defaults(default_config)