예제 #1
0
파일: master.py 프로젝트: PokerN/sailfish
    def run(self):
        """Run the machine master: set up the backend and subdomain runners.

        Returns:
            True if the simulation was stopped via the quit event, False if it
            completed normally, or None if no compute backend could be loaded.
        """
        self.config.logger.info('Machine master starting with PID {0} at {1}'.format(
            os.getpid(), time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())))
        self.config.logger.info('Simulation started with: {0}'.format(self.config.cmdline))

        # Log Sailfish version if running from a git repository.
        sailfish_root_dir = os.path.join(os.path.realpath(
            os.path.dirname(__file__)), '..')
        try:
            git_hash = subprocess.check_output('cd %s ; git rev-parse HEAD' %
                                               sailfish_root_dir,
                                               shell=True).rstrip()
            self.config.logger.info('Sailfish version: {0}'.format(git_hash))
        except subprocess.CalledProcessError:
            # Not a git checkout -- version information is unavailable.
            pass

        self.config.logger.info('Handling subdomains: {0}'.format([b.id for b in
            self.subdomain_specs]))

        self.sim = self.lb_class(self.config)
        subdomain2gpu = self._assign_subdomains_to_gpus()
        self.config.logger.info('Subdomain -> GPU map: {0}'.format(subdomain2gpu))

        ipc_files = self._init_connectors()
        output_initializer = self._init_visualization_and_io(self.sim)
        try:
            # Use the builtin next() instead of the Python 2-only .next()
            # generator method so this also works on Python 3.
            backend_cls = next(util.get_backends(self.config.backends.split(',')))
        except StopIteration:
            self.config.logger.error('Failed to initialize compute backend.'
                    ' Make sure pycuda/pyopencl is installed.')
            return

        self.config.logger.info('Selected backend: {0}'.format(backend_cls.name))

        if self.config.debug_single_process:
            # Fixed missing space: the message previously rendered as
            # 'can besimulated'.
            assert len(self.subdomain_specs) == 1, ('Only a single subdomain can be '
                    'simulated in the single process mode.')

            subdomain = self.subdomain_specs[0]
            output = output_initializer(subdomain)
            self.runner = _start_subdomain_runner(subdomain, self.config, self.sim,
                    len(self.subdomain_specs), backend_cls,
                    subdomain2gpu[subdomain.id], output, self._quit_event,
                    None, self._channel is not None)
            self.config.logger.debug('Finished single process subdomain runner.')
        else:
            self._run_subprocesses(output_initializer, backend_cls,
                    subdomain2gpu)

        self._finish_visualization()

        # Clean up IPC files used for inter-runner communication.
        for ipcfile in ipc_files:
            os.unlink(ipcfile)

        return self._quit_event.is_set()
예제 #2
0
    def run(self):
        """Run the machine master: set up the backend and subdomain runners.

        Returns:
            True if the simulation was stopped via the quit event, False if it
            completed normally, or None if no compute backend could be loaded.
        """
        self.config.logger.info('Machine master starting with PID {0} at {1}'.format(
            os.getpid(), time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime())))
        self.config.logger.info('Simulation started with: {0}'.format(self.config.cmdline))

        # Log Sailfish version if running from a git repository.
        sailfish_root_dir = os.path.join(os.path.realpath(
            os.path.dirname(__file__)), '..')
        try:
            git_hash = subprocess.check_output('cd %s ; git rev-parse HEAD' %
                                               sailfish_root_dir,
                                               shell=True).rstrip()
            self.config.logger.info('Sailfish version: {0}'.format(git_hash))
        except subprocess.CalledProcessError:
            # Not a git checkout -- version information is unavailable.
            pass

        self.config.logger.info('Handling subdomains: {0}'.format([b.id for b in
            self.subdomain_specs]))

        self.sim = self.lb_class(self.config)
        subdomain2gpu = self._assign_subdomains_to_gpus()
        self.config.logger.info('Subdomain -> GPU map: {0}'.format(subdomain2gpu))

        ipc_files = self._init_connectors()
        output_initializer = self._init_visualization_and_io(self.sim)
        try:
            backend_cls = next(util.get_backends(self.config.backends.split(',')))
        except StopIteration:
            self.config.logger.error('Failed to initialize compute backend.'
                    ' Make sure pycuda/pyopencl is installed.')
            return

        self.config.logger.info('Selected backend: {0}'.format(backend_cls.name))

        if self.config.debug_single_process:
            # Fixed missing space: the message previously rendered as
            # 'can besimulated'.
            assert len(self.subdomain_specs) == 1, ('Only a single subdomain can be '
                    'simulated in the single process mode.')

            subdomain = self.subdomain_specs[0]
            output = output_initializer(subdomain)
            self.runner = _start_subdomain_runner(subdomain, self.config, self.sim,
                    len(self.subdomain_specs), backend_cls,
                    subdomain2gpu[subdomain.id], output, self._quit_event,
                    None, self._channel is not None)
            self.config.logger.debug('Finished single process subdomain runner.')
        else:
            self._run_subprocesses(output_initializer, backend_cls,
                    subdomain2gpu)

        self._finish_visualization()

        # Clean up IPC files used for inter-runner communication.
        for ipcfile in ipc_files:
            os.unlink(ipcfile)

        return self._quit_event.is_set()
예제 #3
0
파일: master.py 프로젝트: nwoznica/sailfish
    def run(self):
        """Run the machine master: set up the backend and subdomain runners.

        Returns:
            True if the simulation was stopped via the quit event, False if it
            completed normally, or None if no compute backend could be loaded.
        """
        self.config.logger.info('Machine master starting with PID {0}'.format(
            os.getpid()))
        self.config.logger.info('Handling subdomains: {0}'.format(
            [b.id for b in self.subdomains]))

        self.sim = self.lb_class(self.config)
        subdomain2gpu = self._assign_subdomains_to_gpus()

        self.config.logger.info(
            'Subdomain -> GPU map: {0}'.format(subdomain2gpu))

        ipc_files = self._init_connectors()
        output_initializer = self._init_visualization_and_io(self.sim)
        try:
            # Use the builtin next() instead of the Python 2-only .next()
            # generator method so this also works on Python 3.
            backend_cls = next(util.get_backends(
                self.config.backends.split(',')))
        except StopIteration:
            self.config.logger.error(
                'Failed to initialize compute backend.'
                ' Make sure pycuda/pyopencl is installed.')
            return

        if self.config.debug_single_process:
            # Fixed missing space: the message previously rendered as
            # 'can besimulated'.
            assert len(self.subdomains) == 1, (
                'Only a single subdomain can be '
                'simulated in the single process mode.')

            subdomain = self.subdomains[0]
            output = output_initializer(subdomain)
            self.runner = _start_subdomain_runner(
                subdomain, self.config, self.sim, len(self.subdomains),
                backend_cls, subdomain2gpu[subdomain.id], output,
                self._quit_event, None, self._channel is not None)
            self.config.logger.debug(
                'Finished single process subdomain runner.')
        else:
            self._run_subprocesses(output_initializer, backend_cls,
                                   subdomain2gpu)

        self._finish_visualization()

        # Clean up IPC files used for inter-runner communication.
        for ipcfile in ipc_files:
            os.unlink(ipcfile)

        return self._quit_event.is_set()
예제 #4
0
파일: master.py 프로젝트: vikeu/sailfish
    def run(self):
        """Run the machine master: set up the backend and subdomain runners.

        Returns:
            True if the simulation was stopped via the quit event, False if it
            completed normally, or None if no compute backend could be loaded.
        """
        self.config.logger.info('Machine master starting with PID {0}'.format(os.getpid()))
        self.config.logger.info('Handling subdomains: {0}'.format([b.id for b in
            self.subdomains]))

        self.sim = self.lb_class(self.config)
        subdomain2gpu = self._assign_subdomains_to_gpus()

        self.config.logger.info('Subdomain -> GPU map: {0}'.format(subdomain2gpu))

        ipc_files = self._init_connectors()
        output_initializer = self._init_visualization_and_io(self.sim)
        try:
            # Use the builtin next() instead of the Python 2-only .next()
            # generator method so this also works on Python 3.
            backend_cls = next(util.get_backends(self.config.backends.split(',')))
        except StopIteration:
            self.config.logger.error('Failed to initialize compute backend.'
                    ' Make sure pycuda/pyopencl is installed.')
            return

        if self.config.debug_single_process:
            # Fixed missing space: the message previously rendered as
            # 'can besimulated'.
            assert len(self.subdomains) == 1, ('Only a single subdomain can be '
                    'simulated in the single process mode.')

            subdomain = self.subdomains[0]
            output = output_initializer(subdomain)
            # Keep a handle on the runner (consistent with the other master
            # variants); previously the return value was discarded.
            self.runner = _start_subdomain_runner(subdomain, self.config, self.sim,
                    len(self.subdomains), backend_cls,
                    subdomain2gpu[subdomain.id], output, self._quit_event,
                    None, self._channel is not None)
            self.config.logger.debug('Finished single process subdomain runner.')
        else:
            self._run_subprocesses(output_initializer, backend_cls,
                    subdomain2gpu)

        self._finish_visualization()

        # Clean up IPC files used for inter-runner communication.
        for ipcfile in ipc_files:
            os.unlink(ipcfile)

        return self._quit_event.is_set()
예제 #5
0
    def __init__(self, lb_class, lb_geo=None, default_config=None):
        """
        :param lb_class: class describing the simulation, derived from LBSim
        :param lb_geo: class describing the global geometry in terms of
                SubdomainSpec, derived from LBGeometry
        :param default_config: dictionary mapping command line option names
                to their new default values
        """
        self._config_parser = config.LBConfigParser()
        self._lb_class = lb_class

        # Use a default global geometry if one has not been
        # specified explicitly.
        if lb_geo is None:
            if self.dim == 2:
                lb_geo = LBGeometry2D
            else:
                lb_geo = LBGeometry3D

        self._lb_geo = lb_geo
        self._tmpdir = tempfile.mkdtemp()

        group = self._config_parser.add_group('Runtime mode settings')
        group.add_argument('--mode', help='runtime mode', type=str,
            choices=['batch', 'visualization', 'benchmark'], default='batch')
        group.add_argument('--every',
            help='save/visualize simulation results every N iterations ',
            metavar='N', type=int, default=100)
        group.add_argument('--from', dest='from_',
            help='save/visualize simulation results from N iterations ', metavar='N',
            type=int, default=0)
        group.add_argument('--perf_stats_every',
                           help='how often to display performance stats',
                           metavar='N', type=int, default=1000)
        group.add_argument('--max_iters',
            help='number of iterations to run; use 0 to run indefinitely',
            type=int, default=0)
        group.add_argument('--init_iters',
                           help='number of iterations to use for the '
                           'initialization phase. If <= 0, no initialization '
                           'phase is run.', type=int, default=0)
        group.add_argument('--output',
            help='save simulation results to FILE', metavar='FILE',
            type=str, default='')
        group.add_argument('--output_format',
            help='output format', type=str,
            choices=list(io.format_name_to_cls.keys()), default='npy')
        group.add_argument('--nooutput_compress', dest='output_compress',
                           action='store_false', default=True,
                           help='stores the output in compressed files '
                           'if the selected format supports it')
        group.add_argument('--backends',
            type=str, default='cuda,opencl',
            help='computational backends to use; multiple backends '
                 'can be separated by a comma')
        group.add_argument('--vis_engine',
            type=str, default='pygame',
            help='visualization engine to use')
        # NOTE(review): nargs='+' normally yields a list, but the default here
        # is the scalar 0 -- downstream code presumably handles both; confirm
        # before changing.
        group.add_argument('--gpus', nargs='+', default=0, type=int,
            help='which GPUs to use')
        group.add_argument('--debug_dump_dists', action='store_true',
                default=False, help='dump the contents of the distribution '
                'arrays to files')
        group.add_argument('--debug_single_process', action='store_true',
                default=False,
                help='If True, will run the controller, master and subdomain '
                'runner in a single process, which can be helpful for '
                'debugging.')
        group.add_argument('--debug_dump_node_type_map', action='store_true',
                default=False, help='Dump the contents of the node type map '
                'into a file')
        group.add_argument('--base_name', type=str, default='',
                           help='Specifies the base file name that will be used for '
                           'logging, checkpoint and data output. This makes it '
                           'possible to avoid specifying --log, --output, and '
                           '--checkpoint_file separately. Whenever some of '
                           'these options are specified, their value takes '
                           'precedence over the one automatically generated '
                           'using --base_name.')
        group.add_argument('--log', type=str, default='',
                help='name of the file to which data is to be logged')
        group.add_argument('--loglevel', type=int, default=logging.INFO,
                help='minimum log level for the file logger')
        group.add_argument('--nobulk_boundary_split',
                           dest='bulk_boundary_split', action='store_false',
                           default=True,
                           help='Disable separate handling of bulk and '
                           'boundary nodes (increases parallelism)')
        group.add_argument('--cluster_spec', type=str, default='',
                help='path of a Python module with the cluster specification')
        group.add_argument('--cluster_sync', type=str, default='',
                help='local_path:dest_path; if specified, will send the '
                'contents of "local_path" to "dest_path" on all cluster '
                'machines before starting the simulation.')
        group.add_argument('--nocluster_pbs', action='store_false', default=True,
                dest='cluster_pbs',
                help='If True, standard PBS variables will be used to run '
                'the job in a cluster.')
        group.add_argument('--cluster_pbs_initscript', type=str,
                default='sailfish-init.sh', help='Script to execute on remote '
                'nodes in order to set the environment prior to starting '
                'a machine master.')
        group.add_argument('--cluster_interface', type=str,
                           dest='cluster_interface',
                           default='',
                           help='Network interface to use on PBS/LSF nodes for '
                           'internode communication.')
        group.add_argument('--nocluster_lsf', action='store_false',
                           default=True, dest='cluster_lsf', help='If True, '
                           'standard LSF variables will be used to run the job '
                           'in a cluster.')
        group.add_argument('--nofdust', action='store_false',
                           default=True, dest='fdust', help='If True, will use '
                           'logical GPUs 0..n via libfairydust (if present).')
        group.add_argument('--nocheck_invalid_results_host', action='store_false',
                dest='check_invalid_results_host',
                default=True, help='If True, will terminate the simulation if '
                'the results obtained on the host contain invalid values '
                '(inf, nan).')
        group.add_argument('--nocheck_invalid_results_gpu',
                action='store_false', dest='check_invalid_results_gpu',
                default=True, help='If True, will terminate the simulation '
                'when invalid values (inf, nan) are detected in the domain '
                'during the simulation.')
        group.add_argument('--compress_intersubdomain_data',
                action='store_true', default=False, help='Uses blosc to '
                'compress data exchanged between subdomains. Can improve '
                'performance in distributed simulations limited by bandwidth '
                'available between computational nodes.')
        group.add_argument('--seed', type=int, default=int(time.time()),
                help='PRNG seed value')

        group = self._config_parser.add_group('Checkpointing')
        group.add_argument('--checkpoint_file', type=str, help='Location of '
                'the checkpoint file.', metavar='PATH', default='')
        group.add_argument('--single_checkpoint', action='store_true',
                default=False,
                help='If True, only a single checkpoint file will be '
                'generated. If multiple checkpoints are requested, only the '
                'last one will be retained. This is useful together with '
                '--check_invalid_results_gpu')
        group.add_argument('--restore_from', type=str, metavar='PATH',
                help='Location of a checkpoint file from which to start the '
                'simulation.', default='')
        group.add_argument('--norestore_time', action='store_false',
                           dest='restore_time',
                           default=True, help='If True, the simulation time'
                ' will be restored when reading a checkpoint.')
        group.add_argument('--final_checkpoint', action='store_true',
                default=False, help='Generates a checkpoint after the simulation '
                'is completed.')
        group.add_argument('--checkpoint_every', type=int, default=0,
                metavar='N', help='Generates a checkpoint every N steps.')
        group.add_argument('--checkpoint_from', type=int, default=0,
                metavar='N', help='Starts generating checkpoints after N '
                'steps of the simulation have been completed.')

        group = self._config_parser.add_group('Benchmarking')
        group.add_argument('--benchmark_sample_from', type=int, default=1000,
                           metavar='N', help='Start sampling performance '
                           'data at N-th iteration.')
        group.add_argument('--benchmark_minibatch', type=int, default=50,
                           help='Number of simulation steps used for batching '
                           'for purposes of standard deviation calculation.')
        group = self._config_parser.add_group('Simulation-specific settings')

        # Walk the MRO so that every class in the simulation hierarchy gets a
        # chance to register its own options exactly once.
        for base in lb_class.mro():
            if 'add_options' in base.__dict__:
                base.add_options(group, self.dim)

        group = self._config_parser.add_group('Geometry settings')
        lb_geo.add_options(group)

        group = self._config_parser.add_group('Code generator options')
        codegen.BlockCodeGenerator.add_options(group)

        # Backend options
        for backend in util.get_backends():
            group = self._config_parser.add_group(
                    "'{0}' backend options".format(backend.name))
            backend.add_options(group)

        # Do not try to import visualization engine modules if we already
        # know that the simulation will be running in batch mode.
        if (default_config is None or 'mode' not in default_config or
            default_config['mode'] == 'visualization'):
            for engine in util.get_visualization_engines():
                group = self._config_parser.add_group(
                        "'{0}' visualization engine".format(engine.name))
                engine.add_options(group)

        # Set default values defined by the simulation-specific class.
        defaults = {}
        lb_class.update_defaults(defaults)
        self._config_parser.set_defaults(defaults)

        # Caller-supplied defaults take precedence over class-level ones.
        if default_config is not None:
            self._config_parser.set_defaults(default_config)
예제 #6
0
    def __init__(self, lb_class, lb_geo=None, default_config=None):
        """
        :param lb_class: class describing the simulation, derived from LBSim
        :param lb_geo: class describing the global geometry in terms of
                SubdomainSpec, derived from LBGeometry
        :param default_config: dictionary mapping command line option names
                to their new default values
        """
        self._config_parser = config.LBConfigParser()
        self._lb_class = lb_class

        # Use a default global geometry if one has not been
        # specified explicitly.
        if lb_geo is None:
            if self.dim == 2:
                lb_geo = LBGeometry2D
            else:
                lb_geo = LBGeometry3D

        self._lb_geo = lb_geo
        self._tmpdir = tempfile.mkdtemp()

        group = self._config_parser.add_group('Runtime mode settings')
        group.add_argument('--mode', help='runtime mode', type=str,
            choices=['batch', 'visualization', 'benchmark'], default='batch')
        group.add_argument('--every',
            help='save/visualize simulation results every N iterations ',
            metavar='N', type=int, default=100)
        group.add_argument('--from', dest='from_',
            help='save/visualize simulation results from N iterations ', metavar='N',
            type=int, default=0)
        group.add_argument('--perf_stats_every',
                           help='how often to display performance stats',
                           metavar='N', type=int, default=1000)
        group.add_argument('--max_iters',
            help='number of iterations to run; use 0 to run indefinitely',
            type=int, default=0)
        group.add_argument('--init_iters',
                           help='number of iterations to use for the '
                           'initialization phase. If <= 0, no initialization '
                           'phase is run.', type=int, default=0)
        group.add_argument('--output',
            help='save simulation results to FILE', metavar='FILE',
            type=str, default='')
        group.add_argument('--output_format',
            help='output format', type=str,
            choices=list(io.format_name_to_cls.keys()), default='npy')
        group.add_argument('--nooutput_compress', dest='output_compress',
                           action='store_false', default=True,
                           help='stores the output in compressed files '
                           'if the selected format supports it')
        group.add_argument('--backends',
            type=str, default='cuda,opencl',
            help='computational backends to use; multiple backends '
                 'can be separated by a comma')
        group.add_argument('--vis_engine',
            type=str, default='pygame',
            help='visualization engine to use')
        # NOTE(review): nargs='+' normally yields a list, but the default here
        # is the scalar 0 -- downstream code presumably handles both; confirm
        # before changing.
        group.add_argument('--gpus', nargs='+', default=0, type=int,
            help='which GPUs to use')
        group.add_argument('--debug_dump_dists', action='store_true',
                default=False, help='dump the contents of the distribution '
                'arrays to files')
        group.add_argument('--debug_single_process', action='store_true',
                default=False,
                help='If True, will run the controller, master and subdomain '
                'runner in a single process, which can be helpful for '
                'debugging.')
        group.add_argument('--debug_dump_node_type_map', action='store_true',
                default=False, help='Dump the contents of the node type map '
                'into a file')
        group.add_argument('--base_name', type=str, default='',
                           help='Specifies the base file name that will be used for '
                           'logging, checkpoint and data output. This makes it '
                           'possible to avoid specifying --log, --output, and '
                           '--checkpoint_file separately. Whenever some of '
                           'these options are specified, their value takes '
                           'precedence over the one automatically generated '
                           'using --base_name.')
        group.add_argument('--log', type=str, default='',
                help='name of the file to which data is to be logged')
        group.add_argument('--loglevel', type=int, default=logging.INFO,
                help='minimum log level for the file logger')
        group.add_argument('--nobulk_boundary_split',
                           dest='bulk_boundary_split', action='store_false',
                           default=True,
                           help='Disable separate handling of bulk and '
                           'boundary nodes (increases parallelism)')
        group.add_argument('--cluster_spec', type=str, default='',
                help='path of a Python module with the cluster specification')
        group.add_argument('--cluster_sync', type=str, default='',
                help='local_path:dest_path; if specified, will send the '
                'contents of "local_path" to "dest_path" on all cluster '
                'machines before starting the simulation.')
        group.add_argument('--nocluster_pbs', action='store_false', default=True,
                dest='cluster_pbs',
                help='If True, standard PBS variables will be used to run '
                'the job in a cluster.')
        group.add_argument('--cluster_pbs_initscript', type=str,
                default='sailfish-init.sh', help='Script to execute on remote '
                'nodes in order to set the environment prior to starting '
                'a machine master.')
        group.add_argument('--cluster_interface', type=str,
                           dest='cluster_interface',
                           default='',
                           help='Network interface to use on PBS/LSF nodes for '
                           'internode communication.')
        group.add_argument('--nocluster_lsf', action='store_false',
                           default=True, dest='cluster_lsf', help='If True, '
                           'standard LSF variables will be used to run the job '
                           'in a cluster.')
        group.add_argument('--nofdust', action='store_false',
                           default=True, dest='fdust', help='If True, will use '
                           'logical GPUs 0..n via libfairydust (if present).')
        group.add_argument('--nocheck_invalid_results_host', action='store_false',
                dest='check_invalid_results_host',
                default=True, help='If True, will terminate the simulation if '
                'the results obtained on the host contain invalid values '
                '(inf, nan).')
        group.add_argument('--nocheck_invalid_results_gpu',
                action='store_false', dest='check_invalid_results_gpu',
                default=True, help='If True, will terminate the simulation '
                'when invalid values (inf, nan) are detected in the domain '
                'during the simulation.')
        group.add_argument('--compress_intersubdomain_data',
                action='store_true', default=False, help='Uses blosc to '
                'compress data exchanged between subdomains. Can improve '
                'performance in distributed simulations limited by bandwidth '
                'available between computational nodes.')
        group.add_argument('--seed', type=int, default=int(time.time()),
                help='PRNG seed value')

        group = self._config_parser.add_group('Checkpointing')
        group.add_argument('--checkpoint_file', type=str, help='Location of '
                'the checkpoint file.', metavar='PATH', default='')
        group.add_argument('--single_checkpoint', action='store_true',
                default=False,
                help='If True, only a single checkpoint file will be '
                'generated. If multiple checkpoints are requested, only the '
                'last one will be retained. This is useful together with '
                '--check_invalid_results_gpu')
        group.add_argument('--restore_from', type=str, metavar='PATH',
                help='Location of a checkpoint file from which to start the '
                'simulation.', default='')
        group.add_argument('--norestore_time', action='store_false',
                           dest='restore_time',
                           default=True, help='If True, the simulation time'
                ' will be restored when reading a checkpoint.')
        group.add_argument('--final_checkpoint', action='store_true',
                default=False, help='Generates a checkpoint after the simulation '
                'is completed.')
        group.add_argument('--checkpoint_every', type=int, default=0,
                metavar='N', help='Generates a checkpoint every N steps.')
        group.add_argument('--checkpoint_from', type=int, default=0,
                metavar='N', help='Starts generating checkpoints after N '
                'steps of the simulation have been completed.')

        group = self._config_parser.add_group('Benchmarking')
        group.add_argument('--benchmark_sample_from', type=int, default=1000,
                           metavar='N', help='Start sampling performance '
                           'data at N-th iteration.')
        group.add_argument('--benchmark_minibatch', type=int, default=50,
                           help='Number of simulation steps used for batching '
                           'for purposes of standard deviation calculation.')
        group = self._config_parser.add_group('Simulation-specific settings')

        # Walk the MRO so that every class in the simulation hierarchy gets a
        # chance to register its own options exactly once.
        for base in lb_class.mro():
            if 'add_options' in base.__dict__:
                base.add_options(group, self.dim)

        group = self._config_parser.add_group('Geometry settings')
        lb_geo.add_options(group)

        group = self._config_parser.add_group('Code generator options')
        codegen.BlockCodeGenerator.add_options(group)

        # Backend options
        for backend in util.get_backends():
            group = self._config_parser.add_group(
                    "'{0}' backend options".format(backend.name))
            backend.add_options(group)

        # Do not try to import visualization engine modules if we already
        # know that the simulation will be running in batch mode.
        if (default_config is None or 'mode' not in default_config or
            default_config['mode'] == 'visualization'):
            for engine in util.get_visualization_engines():
                group = self._config_parser.add_group(
                        "'{0}' visualization engine".format(engine.name))
                engine.add_options(group)

        # Set default values defined by the simulation-specific class.
        defaults = {}
        lb_class.update_defaults(defaults)
        self._config_parser.set_defaults(defaults)

        # Caller-supplied defaults take precedence over class-level ones.
        if default_config is not None:
            self._config_parser.set_defaults(default_config)