def connect(self, *args, **kwargs):
    """
    Open a client for the redis database holding the statistics.

    Host, port and database number are read from the `kvs` section of
    the configuration. An empty port setting falls back to 6379 and an
    empty `stats_db` setting falls back to 15. The positional and keyword
    arguments received by this method are not used.

    :returns: a `redis.Redis` instance
    """
    host = config.get("kvs", "host")

    raw_port = config.get("kvs", "port")
    if raw_port:
        port = int(raw_port)
    else:
        port = 6379

    raw_db = config.get("kvs", "stats_db")
    if raw_db:
        db_number = int(raw_db)
    else:
        db_number = 15

    return redis.Redis(host=host, port=port, db=db_number)
def _redis():
    """
    Return a connection to the redis store.

    The connection parameters come from the `kvs` configuration section;
    the port defaults to 6379 and the database to `DEFAULT_STATS_DB` when
    the respective setting is empty.
    """
    settings = {"host": config.get("kvs", "host")}

    port_setting = config.get("kvs", "port")
    settings["port"] = int(port_setting) if port_setting else 6379

    db_setting = config.get("kvs", "stats_db")
    settings["db"] = int(db_setting) if db_setting else DEFAULT_STATS_DB

    return redis.Redis(**settings)
def __init__(self, job, monitor=None):
    """
    Store the job, its parameters and the tuning values from openquake.cfg.

    :param job: the job to work on; it must provide `get_oqparam()` and `id`
    :param monitor: an optional monitor; when it is missing (or falsy) an
        `EnginePerformanceMonitor` bound to `job.id` is created instead
    """
    self.job = job
    self.oqparam = self.job.get_oqparam()
    if not monitor:
        monitor = EnginePerformanceMonitor('', job.id)
    self.monitor = monitor
    self.num_tasks = None
    self._task_args = []

    # parameters from openquake.cfg
    self.concurrent_tasks = int(config.get('celery', 'concurrent_tasks'))
    for weight_name in ('max_input_weight', 'max_output_weight'):
        setattr(self, weight_name, float(config.get('hazard', weight_name)))

    # class-level setting, shared by every TrtModel
    TrtModel.POINT_SOURCE_WEIGHT = float(
        config.get('hazard', 'point_source_weight'))
def __init__(self, job):
    """
    Initialize the base class with `job`, then read the tuning
    parameters from openquake.cfg and prepare the result containers.

    :param job: the job passed on to the parent constructor
    """
    super(BaseHazardCalculator, self).__init__(job)

    # crucial parameters from the `hazard` section of openquake.cfg
    max_weight = config.get('hazard', 'source_max_weight')
    self.source_max_weight = int(max_weight)
    tasks = config.get('hazard', 'concurrent_tasks')
    self.concurrent_tasks = int(tasks)

    # trt_model_id -> num_ruptures, starting from zero for unseen ids
    self.num_ruptures = collections.defaultdict(int)
    # (trt_model_id, gsim) -> poes
    self.curves = {}
def __init__(self, job):
    """
    Store the job and read the tuning parameters from openquake.cfg.

    As a side effect the class attribute
    `SourceCollector.POINT_SOURCE_WEIGHT` is set from the configuration.

    :param job: the job to work on
    """
    self.job = job
    self.num_tasks = None
    self._task_args = []

    # parameters from openquake.cfg
    hazard_float = lambda key: float(config.get('hazard', key))
    self.concurrent_tasks = int(config.get('celery', 'concurrent_tasks'))
    self.max_input_weight = hazard_float('max_input_weight')
    self.max_output_weight = hazard_float('max_output_weight')
    SourceCollector.POINT_SOURCE_WEIGHT = hazard_float('point_source_weight')
def __init__(self, job, monitor=None):
    """
    Store the job, its parameters and the weights read from openquake.cfg.

    The number of concurrent tasks is taken from the job parameters
    (`oqparam.concurrent_tasks`), not from the configuration file.

    :param job: the job to work on; it must provide `get_oqparam()` and `id`
    :param monitor: an optional monitor; a falsy value is replaced by a
        new `EnginePerformanceMonitor` bound to `job.id`
    """
    self.job = job
    self.oqparam = self.job.get_oqparam()
    self.monitor = (
        monitor if monitor else EnginePerformanceMonitor('', job.id))
    self.num_tasks = None
    self._task_args = []
    self.concurrent_tasks = self.oqparam.concurrent_tasks

    # parameters from the `hazard` section of openquake.cfg
    input_weight = config.get('hazard', 'max_input_weight')
    self.max_input_weight = float(input_weight)
    output_weight = config.get('hazard', 'max_output_weight')
    self.max_output_weight = float(output_weight)
    point_weight = config.get('hazard', 'point_source_weight')
    TrtModel.POINT_SOURCE_WEIGHT = float(point_weight)
def test_get_with_unknown_key(self):
    """config.get() returns `None` if the `key` is not known."""
    target = 'openquake.engine.utils.config.get_section'
    with patch(target) as get_section:
        # the section exists but does not contain the requested key
        get_section.return_value = {'b': 1}
        value = config.get("arghh", "c")
        self.assertTrue(value is None)
        # the section must have been looked up exactly once, by name
        self.assertEqual(1, get_section.call_count)
        expected_call = [("arghh", ), {}]
        self.assertEqual(expected_call, get_section.call_args)
def record_init_stats(self):
    """
    Record some basic job stats, including the number of sites,
    realizations (end branches), and total number of tasks for the job.

    This should be run between the `pre-execute` and `execute` phases, once
    the job has been fully initialized.

    The number of tasks is the sum, over the realizations, of the number
    of source blocks of size `block_size` (from the `hazard` section of
    the configuration) needed to cover the sources of each realization.

    :raises ValueError:
        if there is not exactly one `JobStats` record for the job
    """
    # Record num sites, num realizations, and num tasks.
    num_sites = len(self.computation_mesh)

    realizations = models.LtRealization.objects.filter(
        hazard_calculation=self.hc.id)
    num_rlzs = realizations.count()

    # Compute the number of tasks.
    block_size = int(config.get("hazard", "block_size"))
    num_tasks = 0
    for lt_rlz in realizations:
        # Each realization has the potential to choose a random source
        # model, and thus there may be a variable number of tasks for each
        # realization (depending on the number of the sources in the model
        # which was chosen for the realization).
        num_sources = models.SourceProgress.objects.filter(
            lt_realization=lt_rlz).count()
        # math.ceil returns a float on Python 2: convert it, so that the
        # stored counter is an integer
        num_tasks += int(math.ceil(float(num_sources) / block_size))

    # exactly one JobStats record is expected; the unpacking enforces it
    [job_stats] = models.JobStats.objects.filter(oq_job=self.job.id)
    job_stats.num_sites = num_sites
    job_stats.num_tasks = num_tasks
    job_stats.num_realizations = num_rlzs
    job_stats.save()
def initialize_sources(self):
    """
    Parse source models and validate source logic trees. It also
    filters the sources far away and applies uncertainties to the
    relevant ones.

    As a side effect it populates the instance dictionary
    `.source_blocks_per_ltpath`, keyed by the pair (source model logic
    tree path, tectonic region type), with the blocks of sources returned
    by `split_blocks`. For each source model an `LtSourceModel` record is
    created, together with one `LtModelInfo` record per tectonic region
    type.

    :returns: the list of the `LtSourceModel` records created
    :raises RuntimeError:
        if the collector of a source model has no source weights
    """
    logs.LOG.progress("initializing sources")
    smlt_file = self.hc.inputs['source_model_logic_tree']
    # use a context manager, so that the file is closed after reading
    with open(smlt_file) as smlt:
        smlt_content = smlt.read()
    self.smlt = logictree.SourceModelLogicTree(
        smlt_content, self.hc.base_path, smlt_file)
    sm_paths = list(self.smlt.get_sm_paths())
    # NOTE(review): the configuration value is passed to `ceil` without
    # an explicit int() conversion, unlike other callers of config.get;
    # confirm that `ceil` accepts it
    nblocks = ceil(config.get('hazard', 'concurrent_tasks'), len(sm_paths))

    # here we are doing a full enumeration of the source model logic tree;
    # this is not bad since for very large source models there are
    # typically very few realizations; moreover, the filtering will remove
    # most of the sources, so the memory occupation is typically low
    lt_models = []
    for i, (sm, path) in enumerate(sm_paths):
        smpath = tuple(path)
        fname = os.path.join(self.hc.base_path, sm)
        source_collector = source.parse_source_model_smart(
            fname,
            self.hc.sites_affected_by,
            self.smlt.make_apply_uncertainties(path),
            self.hc)
        if not source_collector.source_weights:
            raise RuntimeError(
                'Could not find sources close to the sites in %s '
                '(maximum_distance=%s km)' %
                (fname, self.hc.maximum_distance))

        lt_model = models.LtSourceModel.objects.create(
            hazard_calculation=self.hc, ordinal=i, sm_lt_path=smpath)
        lt_models.append(lt_model)

        for trt, blocks in source_collector.split_blocks(nblocks):
            self.source_blocks_per_ltpath[smpath, trt] = blocks
            n = sum(len(block) for block in blocks)
            logs.LOG.info('Found %d relevant source(s) for %s %s, TRT=%s',
                          n, sm, path, trt)
            logs.LOG.info('Splitting in %d blocks', len(blocks))
            # a dedicated counter, to avoid rebinding the ordinal `i`
            for block_no, block in enumerate(blocks, 1):
                logs.LOG.debug('%s, block %d: %d source(s), weight %s',
                               trt, block_no, len(block), block.weight)

        # save LtModelInfo objects for each tectonic region type
        for trt in source_collector.sorted_trts():
            models.LtModelInfo.objects.create(
                lt_model=lt_model,
                tectonic_region_type=trt,
                num_sources=len(source_collector.source_weights[trt]),
                num_ruptures=source_collector.num_ruptures[trt],
                min_mag=source_collector.min_mag[trt],
                max_mag=source_collector.max_mag[trt])
    return lt_models
def test_get_with_empty_section_data(self):
    # config.get() returns `None` if the section data dict is empty
    patcher = patch('openquake.engine.utils.config.get_section')
    with patcher as get_section:
        get_section.return_value = {}
        looked_up = config.get("whatever", "key")
        self.assertTrue(looked_up is None)
        # a single lookup of the section, by its name only
        self.assertEqual(1, get_section.call_count)
        self.assertEqual([("whatever",), {}], get_section.call_args)
def initialize_sources(self):
    """
    Build the composite source model and mirror it in the database.

    The composite model is read with `readinput.get_composite_source_model`;
    source splitting is distributed only if the configuration flag
    `parallel_source_splitting` of the `hazard` section is true (a missing
    value counts as 'false'). An LtSourceModel record is saved for each
    source model and a TrtModel record for each of its TRT models.
    """
    logs.LOG.progress("initializing sources")
    splitting_flag = (
        config.get('hazard', 'parallel_source_splitting') or 'false')
    parallel_source_splitting = valid.boolean(splitting_flag)
    self.composite_model = readinput.get_composite_source_model(
        self.oqparam, self.site_collection,
        no_distribute=not parallel_source_splitting)

    for sm in self.composite_model:
        # one LtSourceModel for each distinct source model
        lt_model = models.LtSourceModel.objects.create(
            hazard_calculation=self.job,
            sm_lt_path=self.tilepath + sm.path,
            ordinal=sm.ordinal,
            sm_name=sm.name,
            weight=sm.weight,
            samples=sm.samples)
        self._source_models.append(lt_model)

        gsims_by_trt = sm.gsim_lt.values
        # one TrtModel for each tectonic region type; the id assigned
        # by the database is stored back in the in-memory model
        for trt_mod in sm.trt_models:
            saved = models.TrtModel.objects.create(
                lt_model=lt_model,
                tectonic_region_type=trt_mod.trt,
                num_sources=len(trt_mod),
                num_ruptures=trt_mod.num_ruptures,
                min_mag=trt_mod.min_mag,
                max_mag=trt_mod.max_mag,
                gsims=gsims_by_trt[trt_mod.trt])
            trt_mod.id = saved.id

    # rebuild the info object with the trt_ids coming from the db
    composite = self.composite_model
    composite.info = source.CompositionInfo(
        composite.source_model_lt, composite.source_models)
def test_get_with_empty_section_data(self):
    """config.get() returns `None` if the section data dict is empty."""
    section_name = "whatever"
    with patch('openquake.engine.utils.config.get_section') as fake:
        # the section is known but holds no data at all
        fake.return_value = {}
        self.assertTrue(config.get(section_name, "key") is None)
        self.assertEqual(1, fake.call_count)
        self.assertEqual([(section_name, ), {}], fake.call_args)
def test_get_with_unknown_key(self):
    """config.get() returns `None` if the `key` is not known."""
    section_name = "arghh"
    with patch('openquake.engine.utils.config.get_section') as fake:
        # only the key "b" is available in the section
        fake.return_value = dict([("b", 1)])
        self.assertTrue(config.get(section_name, "c") is None)
        self.assertEqual(1, fake.call_count)
        self.assertEqual([(section_name,), {}], fake.call_args)
def test_get_with_nonempty_section_data_and_known_key(self):
    # config.get() correctly returns the configuration datum for known
    # sections/keys
    target = 'openquake.engine.utils.config.get_section'
    with patch(target) as get_section:
        get_section.return_value = {'a': 11}
        datum = config.get("hmmm", "a")
        self.assertEqual(11, datum)
        # the section is fetched once, passing only its name
        self.assertEqual(1, get_section.call_count)
        expected_call = [("hmmm", ), {}]
        self.assertEqual(expected_call, get_section.call_args)
def test_get_with_nonempty_section_data_and_known_key(self):
    # config.get() correctly returns the configuration datum for known
    # sections/keys
    section_name, key, datum = "hmmm", "a", 11
    with patch('openquake.engine.utils.config.get_section') as fake:
        fake.return_value = {key: datum}
        self.assertEqual(datum, config.get(section_name, key))
        self.assertEqual(1, fake.call_count)
        self.assertEqual([(section_name,), {}], fake.call_args)
def pre_execute(self): """ Do pre-execution work. At the moment, this work entails: parsing and initializing sources, parsing and initializing the site model (if there is one), and generating logic tree realizations. (The latter piece basically defines the work to be done in the `execute` phase.) """ # Parse logic trees and create source Inputs. self.initialize_sources() # Deal with the site model and compute site data for the calculation # (if a site model was specified, that is). self.initialize_site_model() # Once the site model is init'd, create and cache the site collection; self.hc.init_site_collection() # Now bootstrap the logic tree realizations and related data. # This defines for us the "work" that needs to be done when we reach # the `execute` phase. # This will also stub out hazard curve result records. Workers will # update these periodically with partial results (partial meaning, # result curves for just a subset of the overall sources) when some # work is complete. self.initialize_realizations( rlz_callbacks=[self.initialize_hazard_curve_progress]) self.record_init_stats() # Set the progress counters: num_sources = models.SourceProgress.objects.filter( is_complete=False, lt_realization__hazard_calculation=self.hc).count() self.progress['total'] += num_sources self.progress['hc_total'] = num_sources realizations = models.LtRealization.objects.filter( hazard_calculation=self.hc, is_complete=False) num_rlzs = realizations.count() num_points = len(self.hc.points_to_compute()) self.progress['total'] += num_rlzs * num_points # Update stats to consider the disagg tasks as well: [job_stats] = models.JobStats.objects.filter(oq_job=self.job.id) block_size = int(config.get('hazard', 'block_size')) job_stats.num_tasks += int( math.ceil(float(num_points) * num_rlzs / block_size) ) job_stats.save() # Update the progress info on the realizations, to include the disagg # phase: for rlz in realizations: rlz.total_items += num_points rlz.save() 
self.initialize_pr_data()
def oqtask(task_func):
    """
    Task function decorator which sets up logging and catches any error
    which occurs inside the task. Also checks to make sure the job is
    actually still running. If it is not running, the task doesn't get
    executed, so we don't do useless computation.

    :param task_func: the function to decorate; its first positional
        argument must be the job id
    :returns: the task built by `task` around the wrapped function, on the
        queue read from the `amqp` section of the configuration; the
        original function is available as the `.task_func` attribute
    """

    @wraps(task_func)
    def wrapped(*args):
        """
        Initialize logs, make sure the job is still running, and run
        the task code surrounded by a try-except.

        :returns:
            `None` if the job is not running; otherwise a pair: on
            success `(result, None)`, on failure
            `(error message and traceback as a string, exception type)`.
            Errors are returned to the caller, not raised.
        """
        # job_id is always assumed to be the first argument
        job_id = args[0]
        job = models.OqJob.objects.get(id=job_id)
        if job.is_running is False:
            # the job was killed, it is useless to run the task
            # NOTE(review): this path returns None, not a pair like the
            # other ones; callers must be prepared for it
            return

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        EnginePerformanceMonitor.store_task_id(job_id, tsk)

        with EnginePerformanceMonitor(
                'total ' + task_func.__name__, job_id, tsk, flush=True):

            with EnginePerformanceMonitor(
                    'loading calculation object', job_id, tsk, flush=True):
                calculation = job.calculation

            # tasks write on the celery log file
            logs.init_logs(
                level=job.log_level,
                calc_domain='hazard' if isinstance(
                    calculation, models.HazardCalculation) else 'risk',
                calc_id=calculation.id)
            try:
                return task_func(*args), None
            except:
                # deliberately catch everything: the error is sent back
                # to the caller as (message with traceback, type)
                etype, exc, tb = sys.exc_info()
                tb_str = ''.join(traceback.format_tb(tb))
                return '%s\n%s' % (exc, tb_str), etype
            finally:
                # runs both on success and on failure
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                models.Performance.objects.filter(
                    oq_job=job,
                    operation='storing task id',
                    task_id=tsk.request.id).delete()

    celery_queue = config.get('amqp', 'celery_queue')
    # `tsk` is referenced inside `wrapped`: it is looked up only when the
    # task runs, i.e. after this assignment
    tsk = task(wrapped, queue=celery_queue)
    tsk.task_func = task_func
    return tsk
def oqtask(task_func):
    """
    Decorator turning a function into a task sent to the queue configured
    in the `amqp` section of the configuration.

    Before running the function the wrapper checks that the job is still
    running, stores the task id and warns about the memory usage; when
    the function ends, in any way, the pending database writes are flushed
    and the 'storing task id' row is removed.

    :param task_func: the function to decorate; its last positional
        argument must be a monitor with a `job_id` attribute
    """
    def wrapped(*args):
        """
        Run `task_func(*args)` inside the log handling and monitoring
        context of the job.

        :raises JobNotRunning: if the job is flagged as not running
        """
        # the last argument is assumed to be a monitor
        monitor = args[-1]
        job = models.OqJob.objects.get(id=monitor.job_id)
        if job.is_running is False:
            # the job was killed, it is useless to run the task
            raise JobNotRunning(monitor.job_id)

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        with monitor("storing task id", task=tsk, autoflush=True):
            pass

        def forget_task_id():
            # the task finished, we can remove from the performance
            # table the associated row 'storing task id'
            stored = models.Performance.objects.filter(
                oq_job=job,
                operation="storing task id",
                task_id=tsk.request.id)
            stored.delete()

        with logs.handle(job):
            check_mem_usage()  # warn if too much memory is used
            # run the task
            try:
                total = "total " + task_func.__name__
                with monitor(total, task=tsk), \
                        GroundShakingIntensityModel.forbid_instantiation():
                    return task_func(*args)
            finally:
                # save on the db
                CacheInserter.flushall()
                forget_task_id()

    celery_queue = config.get("amqp", "celery_queue")
    safe = lambda *args: safely_call(wrapped, args, pickle=True)
    safe.__name__ = task_func.__name__
    safe.__module__ = task_func.__module__
    tsk = task(safe, queue=celery_queue)
    tsk.__func__ = tsk
    tsk.task_func = task_func
    return tsk
def oqtask(task_func):
    """
    Decorator turning a function into a task sent to the queue configured
    in the `amqp` section of the configuration.

    The task is not executed if the job is not running anymore. The time
    spent in the function is monitored and, when it ends, the pending
    database writes are flushed and the 'storing task id' row is removed.

    :param task_func: the function to decorate; its last positional
        argument must be a monitor with a `job_id` attribute
    """
    def wrapped(*args):
        """
        Run `task_func(*args)` inside the log handling context of the job.

        :raises JobNotRunning: if the job is flagged as not running
        """
        # the last argument is assumed to be a monitor
        mon = args[-1]
        running_job = models.OqJob.objects.get(id=mon.job_id)
        if running_job.is_running is False:
            # the job was killed, it is useless to run the task
            raise JobNotRunning(mon.job_id)

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        with mon('storing task id', task=tsk, autoflush=True):
            pass

        with logs.handle(running_job):
            check_mem_usage()  # warn if too much memory is used
            # run the task
            try:
                operation = 'total ' + task_func.__name__
                with mon(operation, task=tsk, autoflush=True):
                    result = task_func(*args)
                return result
            finally:
                # save on the db
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                stored = models.Performance.objects.filter(
                    oq_job=running_job,
                    operation='storing task id',
                    task_id=tsk.request.id)
                stored.delete()

    celery_queue = config.get('amqp', 'celery_queue')
    call_safely = lambda *args: safely_call(wrapped, args, pickle=True)
    call_safely.__name__ = task_func.__name__
    call_safely.__module__ = task_func.__module__
    tsk = task(call_safely, queue=celery_queue)
    tsk.__func__ = tsk
    tsk.task_func = task_func
    return tsk
def pre_execute(self):
    """
    Prepare the calculation, in three steps:

    1. get the exposure (parsing and saving it, if it is not already
       available) and count the assets of each taxonomy
    2. parse the risk models and save the (IMT, taxonomy) associations
    3. run the validators on the calculator

    :raises ValueError: if one of the validators reports an error
    """
    with self.monitor('get exposure'):
        if self.rc.exposure_model is None:
            parser = parsers.ExposureModelParser(self.rc.inputs['exposure'])
            ExposureDBWriter(self.job).serialize(parser)
        # the exposure model is read again here, after the import
        self.taxonomies_asset_count = \
            self.rc.exposure_model.taxonomies_in(self.rc.region_constraint)

    with self.monitor('parse risk models'):
        self.risk_models = self.get_risk_models()

        # populate ImtTaxonomy
        imt_taxonomy_pairs = set()
        for risk_model in self.risk_models.itervalues():
            self.loss_types.update(risk_model.loss_types)
            for imt in risk_model.imts:
                imt_taxonomy_pairs.add((imt, risk_model.taxonomy))
                # insert the IMT in the db, if not already there
                models.Imt.save_new([from_string(imt)])
        for imt, taxonomy in imt_taxonomy_pairs:
            models.ImtTaxonomy.objects.create(
                job=self.job, imt=models.Imt.get(imt), taxonomy=taxonomy)

    # consider only the taxonomies in the risk models if
    # taxonomies_from_model has been set to True in the
    # job.ini
    if self.rc.taxonomies_from_model:
        kept = {}
        for taxonomy, count in self.taxonomies_asset_count.items():
            if taxonomy in self.risk_models:
                kept[taxonomy] = count
        self.taxonomies_asset_count = kept

    for validator_class in self.validators:
        error = validator_class(self).get_error()
        if error:
            raise ValueError("""Problems in calculator configuration: %s""" % error)

    logs.LOG.info('Considering %d assets of %d distinct taxonomies',
                  sum(self.taxonomies_asset_count.itervalues()),
                  len(self.taxonomies_asset_count))
    self.eps_sampling = int(config.get('risk', 'epsilon_sampling'))
def oqtask(task_func):
    """
    Decorator turning a function into a task sent to the queue configured
    in the `amqp` section of the configuration.

    The task is not executed if the job is not running anymore. The total
    time spent is monitored and, when the function ends, the pending
    database writes are flushed and the 'storing task id' row is removed.

    :param task_func: the function to decorate; its first positional
        argument must be the job id
    """
    def wrapped(*args):
        """
        Run `task_func(*args)` with the log level of the job, after a
        check on the memory usage.

        :raises JobNotRunning: if the job is flagged as not running
        """
        # job_id is always assumed to be the first argument
        job_id = args[0]
        oq_job = models.OqJob.objects.get(id=job_id)
        if oq_job.is_running is False:
            # the job was killed, it is useless to run the task
            raise JobNotRunning(job_id)

        # it is important to save the task id soon, so that
        # the revoke functionality can work
        EnginePerformanceMonitor.store_task_id(job_id, tsk)

        total_monitor = EnginePerformanceMonitor(
            'total ' + task_func.__name__, job_id, tsk, flush=True)
        with total_monitor:
            # tasks write on the celery log file
            logs.set_level(oq_job.log_level)
            try:
                # log a warning if too much memory is used
                check_mem_usage(SOFT_MEM_LIMIT, HARD_MEM_LIMIT)
                # run the task
                return task_func(*args)
            finally:
                # save on the db
                CacheInserter.flushall()
                # the task finished, we can remove from the performance
                # table the associated row 'storing task id'
                stored = models.Performance.objects.filter(
                    oq_job=oq_job,
                    operation='storing task id',
                    task_id=tsk.request.id)
                stored.delete()

    celery_queue = config.get('amqp', 'celery_queue')
    call_safely = lambda *args: safely_call(wrapped, args, pickle=True)
    call_safely.__name__ = task_func.__name__
    tsk = task(call_safely, queue=celery_queue)
    tsk.task_func = task_func
    return tsk
def check_nodes(self):
    """
    Check that the expected celery nodes are all up, for as long as
    `job_is_running` keeps returning a true value.

    When some of the expected nodes do not answer to the ping, a critical
    message is logged; moreover, if the configuration flag
    `terminate_job_when_celery_is_down` is true, the current process
    receives a SIGABRT.
    """
    while self.job_is_running(sleep=self.interval):
        live_nodes = self.ping(timeout=self.interval)
        if not live_nodes < self.live_nodes:
            # no expected node is missing
            continue

        dead_nodes = list(self.live_nodes - live_nodes)
        logs.LOG.critical('Cluster nodes not accessible: %s', dead_nodes)
        # the flag is read at each check
        flag = config.get('celery', 'terminate_job_when_celery_is_down')
        if boolean(flag):
            os.kill(os.getpid(), signal.SIGABRT)  # commit suicide
def task_arg_gen(self):
    """
    Generator function for creating the arguments for each task. It
    implements the distribution strategy: the exposure is divided into
    chunks of assets with the same taxonomy, of at most `block_size`
    assets each, where `block_size` comes from the `risk` section of
    the configuration.

    :returns:
        An iterator over lists of arguments. Each list contains:

        1. the job id
        2. the calculation units, one per loss type
        3. the output containers to be populated
        4. the specific calculator parameter set
    :raises RuntimeError:
        if the number of generated tasks differs from the expected one
    """
    block_size = int(config.get('risk', 'block_size'))

    builders = [builder(self) for builder in self.output_builders]
    output_containers = writers.combine_builders(builders)

    num_tasks = 0
    for taxonomy, assets_nr in self.taxonomies_asset_count.items():
        for offset in range(0, assets_nr, block_size):
            with logs.tracing("getting assets"):
                assets = models.ExposureData.objects.get_asset_chunk(
                    self.rc, taxonomy, offset, block_size)

            calculation_units = []
            for loss_type in models.loss_types(self.risk_models):
                calculation_units.append(
                    self.calculation_unit(loss_type, assets))

            num_tasks += 1
            yield [self.job.id,
                   calculation_units,
                   output_containers,
                   self.calculator_parameters]

    # sanity check to protect against future changes of the distribution
    # logic
    expected_tasks = self.expected_tasks(block_size)
    if num_tasks != expected_tasks:
        raise RuntimeError('Expected %d tasks, generated %d!' % (
            expected_tasks, num_tasks))
def get_client(**kwargs):
    """
    Return a redis kvs client connection for general OpenQuake engine
    calculation usage.

    The client uses a module-level connection pool, created at the first
    call with the host and port of the `kvs` configuration section and
    with the database number given by the `redis_db` setting.

    NOTE(review): a 'db' value passed in ``kwargs`` is forwarded as it is
    to `redis.Redis`, together with the pool; presumably the pool setting
    wins - confirm against the redis client documentation.
    """
    global __KVS_CONN_POOL

    if __KVS_CONN_POOL is None:
        section = config.get_section("kvs")
        # get the default db from the openquake.cfg:
        default_db = int(config.get('kvs', 'redis_db'))
        pool_settings = dict(max_connections=1)
        pool_settings["host"] = section["host"]
        pool_settings["port"] = int(section["port"])
        pool_settings["db"] = default_db
        __KVS_CONN_POOL = redis.ConnectionPool(**pool_settings)

    kwargs["connection_pool"] = __KVS_CONN_POOL
    return redis.Redis(**kwargs)
def initialize_sources(self):
    """
    Parse source models and validate source logic trees. It also
    filters the sources far away and applies uncertainties to the
    relevant ones. As a side effect it populates the instance dictionary
    `.source_blocks_per_ltpath`, keyed by the source model logic tree
    path, with the blocks returned by `split_on_max_weight`.

    :returns: a list with the number of sources for each source model
    """
    logs.LOG.progress("initializing sources")
    smlt_file = self.hc.inputs['source_model_logic_tree']
    # use a context manager, so that the file is closed after reading
    with open(smlt_file) as smlt:
        smlt_content = smlt.read()
    self.smlt = logictree.SourceModelLogicTree(
        smlt_content, self.hc.base_path, smlt_file)
    sm_paths = list(self.smlt.get_sm_paths())
    # NOTE(review): the configuration value is passed to `ceil` without
    # an explicit int() conversion, unlike other callers of config.get;
    # confirm that `ceil` accepts it
    nblocks = ceil(config.get('hazard', 'concurrent_tasks'), len(sm_paths))
    bs = SequenceSplitter(nblocks)

    # here we are doing a full enumeration of the source model logic tree;
    # this is not bad because for very large source models there are
    # typically very few realizations; moreover, the filtering will remove
    # most of the sources, so the memory occupation is typically low
    num_sources = []  # the number of sources per sm_lt_path
    for sm, path in sm_paths:
        smpath = tuple(path)
        source_weight_pairs = source.parse_source_model_smart(
            os.path.join(self.hc.base_path, sm),
            self.hc.sites_affected_by,
            self.smlt.make_apply_uncertainties(path),
            self.hc)
        blocks = bs.split_on_max_weight(list(source_weight_pairs))
        self.source_blocks_per_ltpath[smpath] = blocks
        n = sum(len(block) for block in blocks)
        logs.LOG.info('Found %d relevant source(s) for %s %s', n, sm, path)
        logs.LOG.info('Splitting in blocks with at maximum %d ruptures',
                      bs.max_weight)
        for i, block in enumerate(blocks, 1):
            logs.LOG.info('Block %d: %d sources, %d ruptures',
                          i, len(block), block.weight)
        num_sources.append(n)
    return num_sources
def do_hazard_map_post_process(job):
    """
    Process the hazard curves of a job into hazard maps, one block of
    curves at a time. The size of a block is the `concurrent_tasks`
    setting of the `hazard` configuration section.

    If the distribution is disabled the plain task function is called for
    each curve; otherwise a set of subtasks is submitted for each block
    and its results are checked for exceptions.

    :param job:
        A :class:`openquake.engine.db.models.OqJob` which has some hazard
        curves associated with it.
    """
    logs.LOG.debug('> Post-processing - Hazard Maps')
    block_size = int(config.get('hazard', 'concurrent_tasks'))
    poes = job.hazard_calculation.poes_hazard_maps

    # Stats for debug logging:
    hazard_curve_ids = models.HazardCurve.objects.filter(
        output__oq_job=job).values_list('id', flat=True)
    logs.LOG.debug('num haz curves: %s' % len(hazard_curve_ids))

    # Limit the number of concurrent tasks to the configured concurrency
    # level:
    block_gen = block_splitter(hazard_curve_ids, block_size)
    total_blocks = int(math.ceil(len(hazard_curve_ids) / float(block_size)))

    for block_no, block in enumerate(block_gen, 1):
        logs.LOG.debug('> Hazard post-processing block, %s of %s'
                       % (block_no, total_blocks))

        if openquake.engine.no_distribute():
            # just execute the post-processing using the plain function
            # form of the task
            for curve_id in block:
                hazard_curves_to_hazard_map_task(job.id, curve_id, poes)
        else:
            subtasks = [
                hazard_curves_to_hazard_map_task.subtask(
                    (job.id, curve_id, poes))
                for curve_id in block]
            results = TaskSet(tasks=subtasks).apply_async()
            utils_tasks._check_exception(results)

        logs.LOG.debug('< Done Hazard Map post-processing block, %s of %s'
                       % (block_no, total_blocks))
    logs.LOG.debug('< Done post-processing - Hazard Maps')
def get_client(**kwargs):
    """
    Return a redis kvs client connection for general OpenQuake engine
    calculation usage.

    All the clients share one connection pool, built lazily from the
    `kvs` section of openquake.cfg (host, port and the `redis_db`
    database number) and kept in the module-level `__KVS_CONN_POOL`.

    NOTE(review): a 'db' value passed in ``kwargs`` is not removed before
    calling `redis.Redis`; presumably it is ignored when a connection pool
    is supplied - confirm against the redis client documentation.
    """
    global __KVS_CONN_POOL

    pool_is_missing = __KVS_CONN_POOL is None
    if pool_is_missing:
        cfg = config.get_section("kvs")
        # get the default db from the openquake.cfg:
        db = int(config.get('kvs', 'redis_db'))
        host = cfg["host"]
        port = int(cfg["port"])
        __KVS_CONN_POOL = redis.ConnectionPool(
            max_connections=1, host=host, port=port, db=db)

    kwargs.update(connection_pool=__KVS_CONN_POOL)
    return redis.Redis(**kwargs)
def job_from_files(cfg_files, username, log_level='info', exports='',
                   **extras):
    """
    Create a job from the given configuration files and save its
    parameters.

    :param cfg_files:
        The job.ini files, as accepted by `readinput.get_params`.
    :param str username:
        The user who will own this job profile and all results.
    :param str log_level:
        Desired log level.
    :param exports:
        Comma-separated string of desired export types (not used here).
    :params extras:
        Extra parameters (used only in the tests to override the params)

    :returns:
        :class:`openquake.engine.db.models.OqJob` object
    :raises:
        `RuntimeError` if the input job configuration is not valid
    """
    from openquake.commonlib.calculators import base

    # create the current job, with its statistics record
    job = create_job(user_name=username, log_level=log_level)
    models.JobStats.objects.create(oq_job=job)

    with logs.handle(job, log_level):
        # read calculation params and create the calculation profile
        params = readinput.get_params(cfg_files)
        for key in ('hazard_output_id', 'hazard_calculation_id'):
            params[key] = None
        params.update(extras)

        # build and validate an OqParam object
        oqparam = readinput.get_oqparam(params, calculators=base.calculators)
        tasks = config.get('celery', 'concurrent_tasks')
        oqparam.concurrent_tasks = int(tasks)

        job.save_params(vars(oqparam))
        job.save()
    return job
def pre_execute(self):
    """
    Prepare the calculation, in three steps:

    1. get the exposure (the preloaded one, if any, otherwise the one
       parsed from the input file) and count the assets per taxonomy
    2. parse the risk models and collect their loss types
    3. run the validators on the calculator

    :raises ValueError: if one of the validators reports an error
    """
    with self.monitor('get exposure'):
        exposure = self.rc.preloaded_exposure_model
        if not exposure:
            parser = parsers.ExposureModelParser(self.rc.inputs['exposure'])
            exposure = ExposureDBWriter(self.job).serialize(parser)
        self.taxonomies_asset_count = exposure.taxonomies_in(
            self.rc.region_constraint)

    with self.monitor('parse risk models'):
        self.risk_models = self.get_risk_models()
        for risk_model in self.risk_models.itervalues():
            self.loss_types.update(risk_model.loss_types)

    # consider only the taxonomies in the risk models if
    # taxonomies_from_model has been set to True in the
    # job.ini
    if self.rc.taxonomies_from_model:
        kept = {}
        for taxonomy, count in self.taxonomies_asset_count.items():
            if taxonomy in self.risk_models:
                kept[taxonomy] = count
        self.taxonomies_asset_count = kept

    for validator_class in self.validators:
        error = validator_class(self).get_error()
        if error:
            raise ValueError("""Problems in calculator configuration: %s""" % error)

    logs.LOG.info('Considering %d assets of %d distinct taxonomies',
                  sum(self.taxonomies_asset_count.itervalues()),
                  len(self.taxonomies_asset_count))
    self.eps_sampling = int(config.get('risk', 'epsilon_sampling'))
def do_post_process(job):
    """
    Run the GMF to hazard curve post-processing tasks for the given
    ``job``, one block of tasks at a time. The size of a block is the
    `concurrent_tasks` setting of the `hazard` configuration section.

    :param job:
        A :class:`openquake.engine.db.models.OqJob` instance.
    """
    logs.LOG.debug('> Post-processing - GMFs to Hazard Curves')
    block_size = int(config.get('hazard', 'concurrent_tasks'))
    block_gen = block_splitter(gmf_post_process_arg_gen(job), block_size)

    hc = job.hazard_calculation

    # Stats for debug logging:
    n_imts = len(hc.intensity_measure_types_and_levels)
    n_sites = len(hc.points_to_compute())
    n_rlzs = models.LtRealization.objects.filter(
        hazard_calculation=hc).count()
    n_items = n_imts * n_sites * n_rlzs
    total_blocks = int(math.ceil(n_items / float(block_size)))

    for block_no, block in enumerate(block_gen, 1):
        logs.LOG.debug('> GMF post-processing block, %s of %s'
                       % (block_no, total_blocks))

        # Run the tasks in blocks, to avoid overqueueing:
        subtasks = [gmf_to_hazard_curve_task.subtask(the_args)
                    for the_args in block]
        results = TaskSet(tasks=subtasks).apply_async()

        # Check for Exceptions in the results and raise
        utils_tasks._check_exception(results)

        logs.LOG.debug('< Done GMF post-processing block, %s of %s'
                       % (block_no, total_blocks))
    logs.LOG.debug('< Done post-processing - GMFs to Hazard Curves')
class SupervisorLogMessageConsumer(logs.AMQPLogSource):
    """
    Supervise an OpenQuake job by:

       - handling its "critical" and "error" messages
       - periodically checking that the job process is still running
    """
    # Failure counter check delay, translates to 60 seconds with the current
    # settings.
    FCC_DELAY = 60

    # Flag read from the configuration when the class body is executed;
    # it is passed to cleanup_after_job() when the job is stopped
    terminate = general.str2bool(
        config.get('celery', 'terminate_workers_on_revoke'))

    def __init__(self, job_id, job_pid, timeout=1):
        """
        :param job_id: the id of the `OqJob` record of the supervised job
        :param job_pid: the pid of the process running the job
        :param timeout: passed to the parent class together with the
            routing key `oq.<calc domain>.<calc id>.#`
        """
        self.job_id = job_id
        job = OqJob.objects.get(id=job_id)
        self.calc_id = job.calculation.id
        # a job with a hazard calculation is a hazard job, otherwise risk
        if job.hazard_calculation is not None:
            self.calc_domain = 'hazard'
        else:
            self.calc_domain = 'risk'

        # logger for the messages of the supervisor itself
        self.selflogger = logging.getLogger('oq.%s.%s.supervisor' %
                                            (self.calc_domain, self.calc_id))
        self.selflogger.debug('Entering supervisor for %s calc %s'
                              % (self.calc_domain, self.calc_id))
        logger_name = 'oq.%s.%s' % (self.calc_domain, self.calc_id)
        key = '%s.#' % logger_name
        super(SupervisorLogMessageConsumer, self).__init__(timeout=timeout,
                                                           routing_key=key)
        self.job_pid = job_pid
        # the records of the job logger with level ERROR or higher are
        # routed to log_callback() through a dedicated handler
        self.joblogger = logging.getLogger(logger_name)
        self.jobhandler = logging.Handler(logging.ERROR)
        self.jobhandler.emit = self.log_callback
        self.joblogger.addHandler(self.jobhandler)
        # Failure counter check delay value
        self.fcc_delay_value = 0

    def run(self):
        """
        Wrap superclass' method just to add cleanup: the elapsed time is
        logged and the handler added to the job logger is removed.
        """
        started = datetime.utcnow()
        super(SupervisorLogMessageConsumer, self).run()
        stopped = datetime.utcnow()
        self.selflogger.info(
            '%s calc %s finished in %s'
            % (self.calc_domain, self.calc_id, stopped - started))
        self.joblogger.removeHandler(self.jobhandler)
        self.selflogger.debug('Exiting supervisor for %s calc %s'
                              % (self.calc_domain, self.calc_id))

    def log_callback(self, record):
        """
        Handles messages of severe level from the supervised job: the job
        process is terminated, its status and stop time are updated, the
        cleanup is performed and the supervisor is stopped.

        :param record: the log record received by the job handler
        """
        if record.name == self.selflogger.name:
            # ignore error log messages sent by selflogger.
            # this way we don't try to kill the job if its
            # process has crashed (or has been stopped).
            # we emit selflogger's error messages from
            # timeout_callback().
            return

        terminate_job(self.job_pid)

        update_job_status(self.job_id)

        record_job_stop_time(self.job_id)

        cleanup_after_job(self.job_id, self.terminate)

        self.stop()

    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.

        :raises StopIteration:
            when the job process has stopped or has been terminated
            because of failures, after recording the stop time and
            performing the cleanup
        """
        def failure_counters_need_check():
            """
            Return `True` if failure counters should be checked, i.e. once
            every `FCC_DELAY` calls; the internal counter is then reset.
            """
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            failed_nodes = None
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                # Don't check for failed nodes if distribution is disabled.
                # In this case, we don't expect any nodes to be present, and
                # thus, there are none that can fail.
                if not openquake.engine.no_distribute():
                    failed_nodes = abort_due_to_failed_nodes(self.job_id)
                    if failed_nodes:
                        message = ("job terminated due to %s failed nodes"
                                   % failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'complete':
                # the process ended normally: replace the message set above
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
            elif not job_status == 'complete':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status(self.job_id)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id, self.terminate)
            # NOTE(review): StopIteration presumably tells the parent
            # class to leave its loop - confirm in logs.AMQPLogSource
            raise StopIteration()
def concurrent_tasks(self):
    """
    Return how many tasks should be in the queue at any given time.

    For hazard calculators the figure is taken from the
    `concurrent_tasks` option of the `hazard` section of the
    configuration file and converted to an integer.
    """
    configured = config.get('hazard', 'concurrent_tasks')
    return int(configured)
def point_source_block_size(self):
    """
    Return the block size used when grouping point sources.

    This is the counterpart of :meth:`block_size` for point sources; the
    value is the `point_source_block_size` option of the `hazard`
    section of the configuration file, converted to an integer.
    """
    section, option = 'hazard', 'point_source_block_size'
    return int(config.get(section, option))
def block_size(self):
    """
    Return the number of work items assigned to each task.

    For hazard calculators the figure is the `block_size` option of the
    `hazard` section of the configuration file, converted to an integer.
    """
    items_per_task = config.get("hazard", "block_size")
    return int(items_per_task)
def block_size(self):
    """
    Return the number of assets handled per task.

    The figure is the `block_size` option of the `risk` section of the
    configuration file, converted to an integer.
    """
    assets_per_task = config.get('risk', 'block_size')
    return int(assets_per_task)
def open(cls):
    """
    Initialize the test store, creating its redis connection only once.

    Nothing happens when `TestStore._conn` is already set. Otherwise a
    `redis.Redis` client is created for the database whose number is
    given by the `test_db` option of the `kvs` configuration section, and
    stored in `TestStore._conn`.
    """
    if TestStore._conn is None:
        test_db = int(config.get("kvs", "test_db"))
        TestStore._conn = redis.Redis(db=test_db)
# Set up logging via amqp. if isinstance(calculation, models.HazardCalculation): logs.init_logs_amqp_send(level=job.log_level, calc_domain='hazard', calc_id=calculation.id) else: logs.init_logs_amqp_send(level=job.log_level, calc_domain='risk', calc_id=calculation.id) try: res = task_func(*args, **kwargs) except Exception, err: logs.LOG.critical('Error occurred in task: %s', err) logs.LOG.exception(err) raise else: return res finally: CacheInserter.flushall() # the task finished, we can remove from the performance # table the associated row 'storing task id', then the # supervisor will not try revoke it without need models.Performance.objects.filter( oq_job=job, operation='storing task id', task_id=tsk.request.id).delete() celery_queue = config.get('amqp', 'celery_queue') tsk = task(wrapped, ignore_result=True, queue=celery_queue) return tsk
# You should have received a copy of the GNU Affero General Public License # along with OpenQuake. If not, see <http://www.gnu.org/licenses/>. """Utility functions related to splitting work into tasks.""" from celery.result import ResultSet from celery.app import current_app from celery.task import task from openquake.commonlib.parallel import \ TaskManager, safely_call, check_mem_usage from openquake.engine import logs from openquake.engine.db import models from openquake.engine.utils import config from openquake.engine.writer import CacheInserter CONCURRENT_TASKS = int(config.get('celery', 'concurrent_tasks')) SOFT_MEM_LIMIT = int(config.get('memory', 'soft_mem_limit')) HARD_MEM_LIMIT = int(config.get('memory', 'hard_mem_limit')) check_mem_usage.__defaults__ = (SOFT_MEM_LIMIT, HARD_MEM_LIMIT) class JobNotRunning(Exception): pass class OqTaskManager(TaskManager): """ A celery-based task manager. The usage is:: oqm = OqTaskManager(do_something, logs.LOG.progress)
def concurrent_tasks(self):
    """
    Return how many tasks should be in the queue at any given time.

    The figure is the `concurrent_tasks` option of the `risk` section of
    the configuration file, converted to an integer.
    """
    queued_tasks = config.get('risk', 'concurrent_tasks')
    return int(queued_tasks)
logs.init_logs_amqp_send(level=job.log_level, calc_domain='hazard', calc_id=calculation.id) else: logs.init_logs_amqp_send(level=job.log_level, calc_domain='risk', calc_id=calculation.id) try: # Tasks can be used in the `execute` or `post-process` phase if job.is_running is False: raise JobCompletedError('Job %d was killed' % job_id) elif job.status not in ('executing', 'post_processing'): raise JobCompletedError( 'The status of job %d is %s, should be executing or ' 'post_processing' % (job_id, job.status)) # else continue with task execution res = task_func(*args, **kwargs) # TODO: should we do something different with JobCompletedError? except Exception, err: logs.LOG.critical('Error occurred in task: %s', err) logs.LOG.exception(err) raise else: return res finally: CacheInserter.flushall() celery_queue = config.get('amqp', 'celery_queue') tsk = task(wrapped, ignore_result=True, queue=celery_queue) return tsk
"""Engine: A collection of fundamental functions for initializing and running calculations.""" import sys import traceback from openquake.baselib.performance import Monitor from openquake.commonlib import valid from openquake.commonlib.oqvalidation import OqParam from openquake.calculators import base from openquake.engine import logs from openquake.engine.utils import config, tasks TERMINATE = valid.boolean( config.get('celery', 'terminate_workers_on_revoke') or 'false') USE_CELERY = valid.boolean(config.get('celery', 'use_celery') or 'false') if USE_CELERY: import celery.task.control def set_concurrent_tasks_default(): """ Set the default for concurrent_tasks to twice the number of workers. Returns the number of live celery nodes (i.e. the number of machines). """ stats = celery.task.control.inspect(timeout=1).stats() if not stats: sys.exit("No live compute nodes, aborting calculation") num_cores = sum(stats[k]['pool']['max-concurrency'] for k in stats)
from openquake.engine.performance import EnginePerformanceMonitor
from openquake.engine.writer import CacheInserter
from openquake.engine.settings import DATABASES
from openquake.engine.db.models import Performance
from openquake.engine.db.schema.upgrades import upgrader
from openquake import hazardlib, risklib, commonlib
from openquake.commonlib import readinput, valid

# The keys of the (key, value) pairs listed in models.INPUT_TYPE_CHOICES.
INPUT_TYPES = set(dict(models.INPUT_TYPE_CHOICES))

# Message templates; the single %s placeholder is filled in by the caller.
UNABLE_TO_DEL_HC_FMT = 'Unable to delete hazard calculation: %s'
UNABLE_TO_DEL_RC_FMT = 'Unable to delete risk calculation: %s'

# Boolean flag read from the `terminate_workers_on_revoke` option of the
# `celery` configuration section. The `or 'false'` fallback keeps the
# module importable when the option is absent or empty, so the flag then
# defaults to False instead of handing a non-string to valid.boolean.
# NOTE(review): assumes config.get yields a falsy value (None or '') for a
# missing option -- confirm against openquake.engine.utils.config.
TERMINATE = valid.boolean(
    config.get('celery', 'terminate_workers_on_revoke') or 'false')


class InvalidHazardCalculationID(Exception):
    """Exception type named after an invalid hazard calculation ID."""
    pass


# NOTE(review): presumably maps each risk calculation mode to the hazard
# calculation modes it can be run on top of -- confirm against the callers.
RISK_HAZARD_MAP = dict(
    scenario_risk=['scenario'],
    scenario_damage=['scenario'],
    classical_risk=['classical'],
    classical_bcr=['classical'],
    classical_damage=['classical'],
    event_based_risk=['event_based'],
    event_based_bcr=['event_based'])
def block_size(self):
    """
    Return the number of work items assigned to each task.

    For hazard calculators the figure is the `block_size` option of the
    `hazard` section of the configuration file, converted to an integer.
    """
    section, option = 'hazard', 'block_size'
    return int(config.get(section, option))
def test_task(func, *args, **kwargs):
    """
    Wrap `func` with `task`, forcing the configured celery queue.

    The `queue` keyword is always set to the `celery_queue` option of the
    `amqp` configuration section, overriding any `queue` value passed by
    the caller; all other positional and keyword arguments are forwarded
    to `task` unchanged.

    :returns: whatever `task(func, *args, **kwargs)` returns
    """
    queue_name = config.get('amqp', 'celery_queue')
    task_options = dict(kwargs, queue=queue_name)
    return task(func, *args, **task_options)
# but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with OpenQuake. If not, see <http://www.gnu.org/licenses/>. """Utility functions related to splitting work into tasks.""" import types from openquake.baselib.performance import Monitor from openquake.commonlib import parallel, valid from openquake.engine import logs from openquake.engine.utils import config litetask = parallel.litetask celery_queue = config.get('amqp', 'celery_queue') SOFT_MEM_LIMIT = int(config.get('memory', 'soft_mem_limit')) HARD_MEM_LIMIT = int(config.get('memory', 'hard_mem_limit')) USE_CELERY = valid.boolean(config.get('celery', 'use_celery') or 'false') parallel.check_mem_usage.__defaults__ = (Monitor(), SOFT_MEM_LIMIT, HARD_MEM_LIMIT) if USE_CELERY: from celery.result import ResultSet from celery.app import current_app from celery.task import task class OqTaskManager(parallel.TaskManager): """ A celery-based task manager. The usage is::
from openquake import risklib from openquake import nrmllib from openquake.commonlib import readini, valid INPUT_TYPES = set(dict(models.INPUT_TYPE_CHOICES)) UNABLE_TO_DEL_HC_FMT = "Unable to delete hazard calculation: %s" UNABLE_TO_DEL_RC_FMT = "Unable to delete risk calculation: %s" LOG_FORMAT = ( "[%(asctime)s %(job_type)s job #%(job_id)s %(hostname)s " "%(levelname)s %(processName)s/%(process)s] %(message)s" ) TERMINATE = valid.boolean(config.get("celery", "terminate_workers_on_revoke")) def cleanup_after_job(job, terminate): """ Release the resources used by an openquake job. In particular revoke the running tasks (if any). :param int job_id: the job id :param bool terminate: the celery revoke command terminate flag """ # Using the celery API, terminate and revoke and terminate any running # tasks associated with the current job. task_ids = Performance.objects.filter(oq_job=job, operation="storing task id", task_id__isnull=False).values_list( "task_id", flat=True )
from openquake.engine.calculators import base from openquake.engine.calculators.risk import \ writers, validation, hazard_getters from openquake.engine.utils import config, tasks from openquake.engine.performance import EnginePerformanceMonitor from openquake.engine.input.exposure import ExposureDBWriter MEMORY_ERROR = '''Running the calculation will require approximately %dM, i.e. more than the memory which is available right now (%dM). Please increase the free memory or apply a stringent region constraint to reduce the number of assets. Alternatively you can set epsilon_sampling in openquake.cfg. It the correlation is nonzero, consider setting asset_correlation=0 to avoid building the correlation matrix.''' eps_sampling = int(config.get('risk', 'epsilon_sampling')) @tasks.oqtask def prepare_risk(counts_taxonomy, calc, monitor): """ Associates the assets to the closest hazard sites and populate the table asset_site. For some calculators also initializes the epsilon matrices and save them on the database. :param counts_taxonomy: a sorted list of pairs (counts, taxonomy) for each bunch of assets :param calc: the current risk calculator :param monitor: monitor of the current risk job