def __iter__(self):
    """
    Iterate over the futures, yielding each task's value as it arrives.

    Keeps track of the amount of (pickled) data received per task, logs
    progress and, at the end, stores a sent/received summary via the
    monitor of the last task.

    :raises RuntimeError: if a task raised; re-raised here with its message
    """
    self.received = []
    for fut in self.futures:
        check_mem_usage()  # log a warning if too much memory is used
        # futures may be real Future objects or plain results
        if hasattr(fut, 'result'):
            result = fut.result()
        else:
            result = fut
        if isinstance(result, BaseException):
            # this happens for instance with WorkerLostError with celery
            raise result
        elif hasattr(result, 'unpickle'):
            # a Pickled wrapper: record its size, then unpickle it
            self.received.append(len(result))
            val, etype, mon = result.unpickle()
        else:
            # already unpickled: re-pickle just to measure the size
            val, etype, mon = result
            self.received.append(len(Pickled(result)))
        if etype:
            raise RuntimeError(val)
        if self.num_tasks:
            next(self.log_percent)
        if not self.name.startswith('_'):  # no info for private tasks
            self.save_task_data(mon)
        yield val
    if self.received:
        tot = sum(self.received)
        max_per_task = max(self.received)
        self.progress('Received %s of data, maximum per task %s',
                      humansize(tot), humansize(max_per_task))
        received = {'max_per_task': max_per_task, 'tot': tot}
        tname = self.name
        dic = {tname: {'sent': self.sent, 'received': received}}
        # NOTE(review): `mon` here is the monitor of the *last* task
        mon.save_info(dic)
def reduce(self, agg=operator.add, acc=None):
    """
    Loop on a set of results and update the accumulator
    by using the aggregation function.

    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :returns: the final value of the accumulator
    """
    if acc is None:
        acc = AccumDict()
    percent = log_percent_gen(self.name, len(self.results), self.progress)
    next(percent)

    def _agg_step(accumulator, val_exc):
        # each result is a (value, exception-flag) pair
        val, exc = val_exc
        if exc:
            raise RuntimeError(val)
        new_acc = agg(accumulator, val)
        next(percent)
        return new_acc

    if self.no_distribute:
        final = reduce(_agg_step, self.results, acc)
    else:
        self.progress('Sent %s of data', humansize(self.sent))
        final = self.aggregate_result_set(_agg_step, acc)
        self.progress('Received %s of data', humansize(self.received))
    self.results = []
    return final
def ebr_data_transfer(token, dstore):
    """
    Display the data transferred in an event based risk calculation
    """
    attrs = dstore['losses_by_event'].attrs
    return 'Event Based Risk: sent %s, received %s' % (
        humansize(attrs['sent']), humansize(attrs['tot_received']))
def view_contents(token, dstore):
    """
    Returns the size of the contents of the datastore and its total size
    """
    oq = dstore['oqparam']
    sizes = sorted((dstore.getsize(k), k) for k in dstore)
    rows = [(k, humansize(nb)) for nb, k in sizes]
    path = dstore.hdf5path
    footer = '\n%s : %s' % (path, humansize(os.path.getsize(path)))
    return rst_table(rows, header=(oq.description, '')) + footer
def source_data_transfer(token, dstore):
    """
    Determine the amount of data transferred from the controller node
    to the workers and back in a classical calculation.
    """
    block_info, to_send_forward, to_send_back = get_data_transfer(dstore)
    rows = [('Number of tasks to generate', len(block_info)),
            ('Estimated sources to send', humansize(to_send_forward)),
            ('Estimated hazard curves to receive', humansize(to_send_back))]
    return rst_table(rows)
def view_contents(token, dstore):
    """
    Returns the size of the contents of the datastore and its total size
    """
    try:
        desc = dstore['oqparam'].description
    except KeyError:
        desc = ''
    pairs = sorted((dstore.getsize(k), k) for k in dstore)
    rows = [(k, humansize(nb)) for nb, k in pairs]
    fname = dstore.filename
    footer = '\n%s : %s' % (fname, humansize(os.path.getsize(fname)))
    return rst_table(rows, header=(desc, '')) + footer
def export_gmf(ekey, dstore):
    """
    Export the ground motion fields, one file per realization.

    :param ekey: export key, i.e. a pair (datastore key, fmt)
    :param dstore: datastore object
    """
    sitecol = dstore['sitecol']
    rlzs_assoc = dstore['rlzs_assoc']
    # merge the rupture dictionaries of all SES collections into one
    rupture_by_tag = sum(dstore['sescollection'], AccumDict())
    all_tags = dstore['tags'].value
    oq = dstore['oqparam']
    investigation_time = (None if oq.calculation_mode == 'scenario'
                          else oq.investigation_time)
    samples = oq.number_of_logic_tree_samples
    fmt = ekey[-1]
    gmfs = dstore[ekey[0]]
    nbytes = gmfs.attrs['nbytes']
    logging.info('Internal size of the GMFs: %s', humansize(nbytes))
    if nbytes > GMF_MAX_SIZE:
        logging.warn(GMF_WARNING, dstore.hdf5path)
    fnames = []
    # NB: combine_gmfs(gmfs) is evaluated once here, so rebinding `gmfs`
    # inside the loop below does not affect the iteration
    for rlz, gmf_by_idx in zip(rlzs_assoc.realizations,
                               rlzs_assoc.combine_gmfs(gmfs)):
        tags = all_tags[list(gmf_by_idx)]
        gmfs = list(gmf_by_idx.values())  # rebinds the name `gmfs`
        if not gmfs:
            continue
        ruptures = [rupture_by_tag[tag] for tag in tags]
        fname = build_name(rlz, 'gmf', fmt, samples)
        fnames.append(os.path.join(dstore.export_dir, fname))
        # dispatch to the format-specific writer, e.g. export_gmf_xml
        globals()['export_gmf_%s' % fmt](
            ('gmf', fmt), dstore.export_dir, fname, sitecol,
            ruptures, gmfs, rlz, investigation_time)
    return fnames
def run(job_ini, concurrent_tasks=None, loglevel='info', hc=None, exports=''):
    """
    Run a calculation. Optionally, set the number of concurrent_tasks
    (0 to disable the parallelization).

    :param job_ini: path(s) to job.ini file(s); two comma-separated paths
        mean hazard + risk
    :param concurrent_tasks: number of concurrent tasks (None = default)
    :param loglevel: logging level name, e.g. 'info'
    :param hc: hazard calculation ID (negative values count from the end)
    :param exports: comma-separated export formats
    :returns: the calculator object
    """
    logging.basicConfig(level=getattr(logging, loglevel.upper()))
    job_inis = job_ini.split(',')
    assert len(job_inis) in (1, 2), job_inis
    monitor = performance.Monitor('total', measuremem=True)
    if len(job_inis) == 1:  # run hazard or risk
        oqparam = readinput.get_oqparam(job_inis[0], hc_id=hc)
        if hc and hc < 0:  # interpret negative calculation ids
            calc_ids = datastore.get_calc_ids()
            try:
                hc = calc_ids[hc]
            except IndexError:
                raise SystemExit('There are %d old calculations, cannot '
                                 'retrieve the %s' % (len(calc_ids), hc))
        calc = base.calculators(oqparam, monitor)
        monitor.monitor_dir = calc.datastore.calc_dir
        with monitor:
            calc.run(concurrent_tasks=concurrent_tasks, exports=exports,
                     hazard_calculation_id=hc)
    else:  # run hazard + risk
        calc = run2(
            job_inis[0], job_inis[1], concurrent_tasks, exports, monitor)
    logging.info('Total time spent: %s s', monitor.duration)
    logging.info('Memory allocated: %s', general.humansize(monitor.mem))
    monitor.flush()
    print('See the output with hdfview %s/output.hdf5' %
          calc.datastore.calc_dir)
    return calc
def export_gmf(ekey, dstore):
    """
    Export the ground motion fields, one file per realization.

    :param ekey: export key, i.e. a pair (datastore key, fmt)
    :param dstore: datastore object
    """
    sitecol = dstore['sitecol']
    rlzs_assoc = dstore['rlzs_assoc']
    oq = dstore['oqparam']
    investigation_time = (None if oq.calculation_mode == 'scenario'
                          else oq.investigation_time)
    samples = oq.number_of_logic_tree_samples
    fmt = ekey[-1]
    sid_data = dstore['sid_data']
    gmf_data = dstore['gmf_data']
    nbytes = gmf_data.attrs['nbytes']
    logging.info('Internal size of the GMFs: %s', humansize(nbytes))
    if nbytes > GMF_MAX_SIZE:
        logging.warn(GMF_WARNING, dstore.hdf5path)
    fnames = []
    for rlz, rup_by_etag in zip(rlzs_assoc.realizations,
                                rlzs_assoc.combine_gmfs(gmf_data, sid_data)):
        # sort by event tag for a reproducible export order
        ruptures = [rup_by_etag[etag] for etag in sorted(rup_by_etag)]
        fname = build_name(dstore, rlz, 'gmf', fmt, samples)
        fnames.append(fname)
        # dispatch to the format-specific writer, e.g. export_gmf_xml
        globals()['export_gmf_%s' % fmt](
            ('gmf', fmt), fname, sitecol, ruptures, rlz, investigation_time)
    return fnames
def __repr__(self):
    """String representation with duration and, if measured, memory."""
    clsname = self.__class__.__name__
    if not self.measuremem:
        return '<%s %s, duration=%ss>' % (clsname, self.operation,
                                          self.duration)
    return '<%s %s, duration=%ss, memory=%s>' % (
        clsname, self.operation, self.duration, humansize(self.mem))
def post_execute(self, result):
    """
    Save the event loss table in the datastore.

    :param result:
        a numpy array of shape (O, L, R) containing lists of arrays
    """
    nses = self.oqparam.ses_per_logic_tree_path
    saved = {out: 0 for out in self.outs}  # bytes written per output
    N = len(self.assetcol)
    with self.monitor('saving loss table', autoflush=True, measuremem=True):
        for (o, l, r), data in numpy.ndenumerate(result):
            if not data:  # empty list
                continue
            if o in (ELT, ILT):  # loss tables, data is a list of arrays
                losses = numpy.concatenate(data)
                self.datasets[o, l, r].extend(losses)
                saved[self.outs[o]] += losses.nbytes
            else:  # risk curves, data is a list of counts dictionaries
                cb = self.riskmodel.curve_builders[l]
                counts_matrix = cb.get_counts(N, data)
                curves = cb.build_rcurves(
                    counts_matrix, nses, self.assetcol)
                self.datasets[o, l, r].dset[:] = curves
                saved[self.outs[o]] += curves.nbytes
        self.datastore.hdf5.flush()
    for out in self.outs:
        nbytes = saved[out]
        if nbytes:
            self.datastore[out].attrs['nbytes'] = nbytes
            logging.info('Saved %s in %s', humansize(nbytes), out)
        else:  # remove empty outputs
            del self.datastore[out]
def export_gmf(ekey, dstore):
    """
    Export the ground motion fields, one file per realization.

    :param ekey: export key, i.e. a pair (datastore key, fmt)
    :param dstore: datastore object
    """
    sitecol = dstore['sitecol']
    rlzs_assoc = dstore['csm_info'].get_rlzs_assoc()
    oq = dstore['oqparam']
    investigation_time = (None if oq.calculation_mode == 'scenario'
                          else oq.investigation_time)
    samples = oq.number_of_logic_tree_samples
    fmt = ekey[-1]
    etags = dstore['etags'].value
    gmf_data = dstore['gmf_data']
    nbytes = gmf_data.attrs['nbytes']
    logging.info('Internal size of the GMFs: %s', humansize(nbytes))
    if nbytes > GMF_MAX_SIZE:
        logging.warn(GMF_WARNING, dstore.hdf5path)
    fnames = []
    for rlz in rlzs_assoc.realizations:
        gmf_arr = gmf_data['%04d' % rlz.ordinal].value
        ruptures = []
        # group the GMF records by event id, one fake Rupture per event
        for eid, gmfa in group_array(gmf_arr, 'eid').items():
            rup = util.Rupture(etags[eid], sorted(set(gmfa['sid'])))
            rup.gmfa = gmfa
            ruptures.append(rup)
        ruptures.sort(key=operator.attrgetter('etag'))
        fname = build_name(dstore, rlz, 'gmf', fmt, samples)
        fnames.append(fname)
        # dispatch to the format-specific writer, e.g. export_gmf_xml
        globals()['export_gmf_%s' % fmt](
            ('gmf', fmt), fname, sitecol, oq.imtls, ruptures, rlz,
            investigation_time)
    return fnames
def export_gmf(ekey, dstore):
    """
    :param ekey: export key, i.e. a pair (datastore key, fmt)
    :param dstore: datastore object
    """
    oq = dstore['oqparam']
    if not oq.calculation_mode.startswith('scenario'):
        return []
    sitecol = dstore['sitecol']
    investigation_time = (None if oq.calculation_mode == 'scenario'
                          else oq.investigation_time)
    fmt = ekey[-1]
    gmf_data = dstore['gmf_data']
    nbytes = gmf_data.attrs['nbytes']
    logging.info('Internal size of the GMFs: %s', humansize(nbytes))
    if nbytes > GMF_MAX_SIZE:
        logging.warning(GMF_WARNING, dstore.filename)
    data = gmf_data['data'].value
    ses_idx = 1  # for scenario only
    # one Event per event id, carrying the site ids and the GMF records
    events = [Event(eid, ses_idx, sorted(set(gmfa['sid'])), gmfa)
              for eid, gmfa in group_array(data, 'eid').items()]
    fname = dstore.build_fname('gmf', 'scenario', fmt)
    writer = hazard_writers.EventBasedGMFXMLWriter(
        fname, sm_lt_path='', gsim_lt_path='')
    writer.serialize(
        GmfCollection(sitecol, oq.imtls, events, investigation_time))
    return [fname]
def post_execute(self, num_events):
    """
    Save an array of losses by taxonomy of shape (T, L, R).

    :param num_events: the total number of generated events
    :raises RuntimeError: if no GMFs were generated
    """
    # delegate to the rupture calculator's post_execute first
    event_based.EventBasedRuptureCalculator.__dict__['post_execute'](
        self, num_events)
    if self.gmfbytes == 0:
        raise RuntimeError('No GMFs were generated, perhaps they were '
                           'all below the minimum_intensity threshold')
    logging.info('Generated %s of GMFs', humansize(self.gmfbytes))
    self.datastore.save('job_info', {'gmfbytes': self.gmfbytes})
    A, E = len(self.assetcol), num_events
    if 'all_loss_ratios' in self.datastore:
        for rlzname in self.datastore['all_loss_ratios']:
            self.datastore.set_nbytes('all_loss_ratios/' + rlzname)
        self.datastore.set_nbytes('all_loss_ratios')
        asslt = self.datastore['all_loss_ratios']
        for rlz, dset in asslt.items():
            for ds in dset.values():
                # fraction of nonzero records over assets x events
                ds.attrs['nonzero_fraction'] = len(ds) / (A * E)
    if 'agg_loss_table' not in self.datastore:
        logging.warning(
            'No losses were generated: most likely there is an error in y'
            'our input files or the GMFs were below the minimum intensity')
    else:
        for rlzname in self.datastore['agg_loss_table']:
            self.datastore.set_nbytes('agg_loss_table/' + rlzname)
        self.datastore.set_nbytes('agg_loss_table')
        agglt = self.datastore['agg_loss_table']
        for rlz, dset in agglt.items():
            for ds in dset.values():
                # fraction of nonzero records over the events
                ds.attrs['nonzero_fraction'] = len(ds) / E
def submit_all(self):
    """
    Submit all the tasks built from self.task_args.

    :returns: an IterResult object
    """
    try:
        nargs = len(self.task_args)
    except TypeError:  # generators have no len
        nargs = ''
    if nargs == 1:
        # single task: run it in-process, no parallelization overhead
        [args] = self.task_args
        self.progress('Executing a single task in process')
        fut = mkfuture(safely_call(self.task_func, args))
        return IterResult([fut], self.name)
    task_no = 0
    for args in self.task_args:
        task_no += 1
        if task_no == 1:  # first time
            self.progress('Submitting %s "%s" tasks', nargs, self.name)
        if isinstance(args[-1], Monitor):
            # add incremental task number and optional weight to the monitor
            args[-1].task_no = task_no
            weight = getattr(args[0], 'weight', None)
            if weight:
                args[-1].weight = weight
        self.submit(*args)
    if not task_no:
        self.progress('No %s tasks were submitted', self.name)
    # NB: keep self._iterfutures() an iterator, especially with celery!
    ir = IterResult(self._iterfutures(), self.name, task_no, self.progress)
    ir.sent = self.sent  # for information purposes
    if self.sent:
        self.progress('Sent %s of data in %d task(s)',
                      humansize(sum(self.sent.values())), ir.num_tasks)
    return ir
def view_job_info(token, dstore):
    """
    Determine the amount of data transferred from the controller node
    to the workers and back in a classical calculation.
    """
    rows = [['task', 'sent', 'received']]
    for task in dstore['task_info']:
        dset = dstore['task_info/' + task]
        if 'argnames' not in dset.attrs:
            continue
        argnames = dset.attrs['argnames'].split()
        totsent = dset.attrs['sent']
        # biggest contributions first
        pairs = sorted(zip(totsent, argnames), reverse=True)
        sent = ' '.join('%s=%s' % (a, humansize(s)) for s, a in pairs)
        rows.append((task, sent, humansize(dset['received'].sum())))
    return rst_table(rows)
def _humansize(literal):
    """
    Convert a literal (usually a dict of sizes) into a human readable
    string, with the biggest entries first.
    """
    parsed = ast.literal_eval(decode(literal))
    if not isinstance(parsed, dict):
        return str(parsed)
    pairs = sorted(parsed.items(), key=operator.itemgetter(1), reverse=True)
    return ', '.join('%s %s' % (k, humansize(v)) for k, v in pairs)
def __repr__(self):
    """String representation with calc id, duration and memory if any."""
    calc_id = ' #%s ' % self.calc_id if self.calc_id else ' '
    msg = '%s%s%s' % (self.__class__.__name__, calc_id, self.operation)
    if self.measuremem:
        return '<%s, duration=%ss, memory=%s>' % (
            msg, self.duration, humansize(self.mem))
    if self.duration:
        return '<%s, duration=%ss>' % (msg, self.duration)
    return '<%s>' % msg
def show(calc_id, key=None, rlzs=None):
    """
    Show the content of a datastore.

    :param calc_id: numeric calculation ID; if 0, show all calculations
    :param key: key of the datastore
    :param rlzs: flag; if given, print out the realizations in order
    """
    if not calc_id:
        if not os.path.exists(datastore.DATADIR):
            return
        rows = []
        for calc_id in datastore.get_calc_ids(datastore.DATADIR):
            try:
                oq = OqParam.from_(datastore.DataStore(calc_id).attrs)
                cmode, descr = oq.calculation_mode, oq.description
            except Exception:
                # invalid datastore directory; a bare `except:` would also
                # swallow KeyboardInterrupt/SystemExit before the rmtree
                logging.warn('Removed invalid calculation %d', calc_id)
                shutil.rmtree(os.path.join(
                    datastore.DATADIR, 'calc_%s' % calc_id))
            else:
                rows.append((calc_id, cmode, descr))
        for row in sorted(rows, key=lambda row: row[0]):  # by calc_id
            print('#%d %s: %s' % row)
        return
    ds = datastore.DataStore(calc_id)
    if key:
        if key in datastore.view:
            print(datastore.view(key, ds))
            return
        obj = ds[key]
        if hasattr(obj, 'value'):  # an array
            print(write_csv(io.StringIO(), obj.value))
        else:
            print(obj)
        return
    # print all keys
    oq = OqParam.from_(ds.attrs)
    print(oq.calculation_mode, 'calculation (%r) saved in %s contains:' %
          (oq.description, ds.hdf5path))
    for key in ds:
        print(key, humansize(ds.getsize(key)))
    # this part is experimental and not tested on purpose
    if rlzs and 'curves_by_trt_gsim' in ds:
        min_value = 0.01  # used in rmsep
        curves_by_rlz, mean_curves = combined_curves(ds)
        dists = []
        for rlz in sorted(curves_by_rlz):
            curves = curves_by_rlz[rlz]
            dist = sum(rmsep(mean_curves[imt], curves[imt], min_value)
                       for imt in mean_curves.dtype.fields)
            dists.append((dist, rlz))
        for dist, rlz in sorted(dists):
            print('rlz=%s, rmsep=%s' % (rlz, dist))
def reduce(self, agg=operator.add, acc=None, posthook=None):
    """
    Loop on a set of results and update the accumulator
    by using the aggregation function.

    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :param posthook: optional callable invoked on self after aggregating
    :returns: the final value of the accumulator
    """
    if acc is None:
        acc = AccumDict()
    num_tasks = len(self.results)
    if num_tasks == 0:
        logging.warn('No tasks were submitted')
        return acc
    log_percent = log_percent_gen(self.name, num_tasks, self.progress)
    next(log_percent)

    def agg_and_percent(acc, triple):
        # triple is (value, exception-flag, monitor)
        (val, exc, mon) = triple
        if exc:
            raise RuntimeError(val)
        res = agg(acc, val)
        next(log_percent)
        mon.flush()
        return res

    if self.no_distribute:
        agg_result = reduce(agg_and_percent, self.results, acc)
    else:
        self.progress('Sent %s of data in %d task(s)',
                      humansize(sum(self.sent.values())), num_tasks)
        agg_result = self.aggregate_result_set(agg_and_percent, acc)
        self.progress('Received %s of data, maximum per task %s',
                      humansize(sum(self.received)),
                      humansize(max(self.received)))
    if posthook:
        posthook(self)
    self.results = []
    return agg_result
def view_gmfs_total_size(name, dstore):
    """
    :returns:
        the total size of the GMFs as human readable string; it assumes
        4 bytes for the rupture index, 4 bytes for the realization index
        and 8 bytes for each float (there are num_imts floats per gmf)
    """
    num_imts = len(dstore['oqparam'].imtls)
    nbytes = sum(8 * counts['gmf'] * (num_imts + 1)
                 for counts in dstore['counts_per_rlz'])
    return humansize(nbytes)
def post_execute(self, num_events):
    """
    Save an array of losses by taxonomy of shape (T, L, R).

    :param num_events: the total number of generated events
    :raises RuntimeError: if no GMFs were generated
    """
    if self.gmfbytes == 0:
        raise RuntimeError('No GMFs were generated, perhaps they were '
                           'all below the minimum_intensity threshold')
    logging.info('Generated %s of GMFs', humansize(self.gmfbytes))
    self.datastore.save('job_info', {'gmfbytes': self.gmfbytes})
    # NOTE(review): the %s argument here is the (T, L, R) shape tuple,
    # not a number of losses
    logging.info('Saved %s losses by taxonomy', (self.T, self.L, self.R))
    logging.info('Saved %d event losses', num_events)
    self.datastore.set_nbytes('agg_loss_table')
    self.datastore.set_nbytes('events')
def avglosses_data_transfer(token, dstore):
    """
    Determine the amount of average losses transferred from the workers to
    the controller node in a risk calculation.
    """
    oq = OqParam.from_(dstore.attrs)
    n_assets = len(dstore['assetcol'])
    n_rlzs = len(dstore['rlzs_assoc'].realizations)
    n_loss_types = len(dstore['riskmodel'].loss_types)
    ntasks = oq.concurrent_tasks
    # two 8 byte floats per record: loss and ins_loss
    nbytes = n_assets * n_rlzs * n_loss_types * 2 * 8 * ntasks
    return ('%d asset(s) x %d realization(s) x %d loss type(s) x 2 losses x '
            '8 bytes x %d tasks = %s' % (n_assets, n_rlzs, n_loss_types,
                                         ntasks, humansize(nbytes)))
def zip_all(directory):
    """
    Zip source models and exposures recursively
    """
    archives = []
    for cwd, dirs, files in os.walk(directory):
        if 'ssmLT.xml' in files:
            archives.append(zip_source_model(os.path.join(cwd, 'ssmLT.xml')))
        archives.extend(zip_exposure(os.path.join(cwd, f)) for f in files
                        if f.endswith('.xml') and 'exposure' in f.lower())
    total = sum(os.path.getsize(z) for z in archives)
    logging.info('Generated %s of zipped data', general.humansize(total))
def show(calc_id, key=None, rlzs=None):
    """
    Show the content of a datastore.

    :param calc_id: numeric calculation ID; if 0, show all calculations
    :param key: key of the datastore
    :param rlzs: flag; if given, print out the realizations in order
    """
    if not calc_id:
        if not os.path.exists(datastore.DATADIR):
            return
        rows = []
        for name in sorted(os.listdir(datastore.DATADIR)):
            # raw string avoids the invalid escape sequence \d
            mo = re.match(r'calc_(\d+)', name)
            if mo:
                calc_id = int(mo.group(1))
                try:
                    oq = datastore.DataStore(calc_id)['oqparam']
                except Exception:
                    # invalid datastore directory; a bare `except:` would
                    # also swallow KeyboardInterrupt/SystemExit
                    shutil.rmtree(os.path.join(
                        datastore.DATADIR, 'calc_%s' % calc_id))
                else:
                    rows.append(
                        (calc_id, oq.calculation_mode, oq.description))
        for row in sorted(rows, key=lambda row: row[0]):  # by calc_id
            print('#%d %s: %s' % row)
        return
    ds = datastore.DataStore(calc_id)
    if key:
        obj = ds[key]
        if key.startswith('/') and hasattr(obj, 'value'):
            print(obj.value)
        else:
            print(obj)
        return
    # print all keys
    oq = ds['oqparam']
    print(oq.calculation_mode, 'calculation (%r) saved in %s contains:' %
          (oq.description, ds.calc_dir))
    for key in ds:
        print(key, humansize(ds.getsize(key)))
    if rlzs and 'curves_by_trt_gsim' in ds:
        min_value = 0.01  # used in rmsep
        curves_by_rlz, mean_curves = combined_curves(ds)
        dists = []
        for rlz in sorted(curves_by_rlz):
            curves = curves_by_rlz[rlz]
            dist = sum(rmsep(mean_curves[imt], curves[imt], min_value)
                       for imt in mean_curves.dtype.fields)
            dists.append((dist, rlz))
        for dist, rlz in sorted(dists):
            print('rlz=%s, rmsep=%s' % (rlz, dist))
def __iter__(self):
    """
    Iterate over the futures, yielding each task's value as it arrives;
    keep track of the received data sizes and log progress.

    :raises RuntimeError: if a task raised; re-raised here with its message
    """
    self.received = []
    for fut in self.futures:
        check_mem_usage()  # log a warning if too much memory is used
        # futures may be real Future objects or plain results
        if hasattr(fut, 'result'):
            result = fut.result()
        else:
            result = fut
        if hasattr(result, 'unpickle'):
            # a Pickled wrapper: record its size, then unpickle it
            self.received.append(len(result))
            val, etype, mon = result.unpickle()
        else:
            val, etype, mon = result
        if etype:
            raise RuntimeError(val)
        if self.num_tasks:
            next(self.log_percent)
        self.save_task_data(mon)
        yield val
    if self.received:
        self.progress('Received %s of data, maximum per task %s',
                      humansize(sum(self.received)),
                      humansize(max(self.received)))
def export(calc_id, datastore_key, format='csv', export_dir='.'):
    """
    Export an output from the datastore.
    """
    logging.basicConfig(level=logging.INFO)
    dstore = datastore.DataStore(calc_id)
    dstore.export_dir = export_dir
    with performance.PerformanceMonitor('export', measuremem=True) as mon:
        for fmt in format.split(','):
            fnames = export_((datastore_key, fmt), dstore)
            nbytes = sum(map(os.path.getsize, fnames))
            print('Exported %s in %s' % (general.humansize(nbytes), fnames))
    if mon.duration > 1:
        print(mon)
def avglosses_data_transfer(token, dstore):
    """
    Determine the amount of average losses transferred from the workers to
    the controller node in a risk calculation.
    """
    oq = dstore['oqparam']
    n_assets = len(dstore['assetcol'])
    n_rlzs = dstore['csm_info'].get_num_rlzs()
    n_loss_types = len(dstore.get_attr('risk_model', 'loss_types'))
    ntasks = oq.concurrent_tasks
    nbytes = n_assets * n_rlzs * n_loss_types * 8 * ntasks  # 8 byte floats
    return ('%d asset(s) x %d realization(s) x %d loss type(s) losses x '
            '8 bytes x %d tasks = %s' % (n_assets, n_rlzs, n_loss_types,
                                         ntasks, humansize(nbytes)))
def info(name, filtersources=False, weightsources=False, datatransfer=False):
    """
    Give information. You can pass the name of an available calculator,
    a job.ini file, or a zip archive with the input files.

    :param name: calculator name, job.ini path or zip archive path
    :param filtersources: flag passed through to _info
    :param weightsources: flag passed through to _info
    :param datatransfer: if set, run pre_execute and estimate the
        amount of data to be transferred
    """
    logging.basicConfig(level=logging.INFO)
    with Monitor('info', measuremem=True) as mon:
        if datatransfer:
            # full pre_execute is needed to estimate the data transfer
            oqparam = readinput.get_oqparam(name)
            calc = base.calculators(oqparam)
            calc.pre_execute()
            n_tasks, to_send_forward, to_send_back = data_transfer(calc)
            _print_info(calc.rlzs_assoc, oqparam,
                        calc.composite_source_model, calc.sitecol,
                        weightsources=True)
            print('Number of tasks to be generated: %d' % n_tasks)
            print('Estimated data to be sent forward: %s' %
                  humansize(to_send_forward))
            print('Estimated data to be sent back: %s' %
                  humansize(to_send_back))
        else:
            _info(name, filtersources, weightsources)
    if mon.duration > 1:
        print(mon)
def __init__(self, dstore):
    """
    Build the report header from the given datastore.

    :param dstore: a DataStore instance
    """
    self.dstore = dstore
    self.oq = oq = dstore['oqparam']
    # rst-style title: the description underlined with '='
    self.text = (oq.description.encode('utf8') + '\n' +
                 '=' * len(oq.description))
    info = dstore['job_info']
    dpath = dstore.hdf5path
    mtime = os.path.getmtime(dpath)
    self.text += '\n\n%s:%s updated %s' % (info.hostname, dpath,
                                           time.ctime(mtime))
    # NB: in the future, the sitecol could be transferred as
    # an array by leveraging the HDF5 serialization protocol in
    # litetask decorator; for the moment however the size of the
    # data to transfer is given by the usual pickle
    sitecol_size = humansize(len(parallel.Pickled(dstore['sitecol'])))
    self.text += '\n\nnum_sites = %d, sitecol = %s' % (
        len(dstore['sitemesh']), sitecol_size)
def __init__(self, iresults, taskname, argnames, num_tasks,
             sent, progress=logging.info, hdf5=None):
    """
    Store the task results iterator and log the submission summary.
    """
    self.iresults = iresults
    self.name = taskname
    self.argnames = ' '.join(argnames)
    self.num_tasks = num_tasks
    self.sent = sent
    self.progress = progress
    self.hdf5 = hdf5
    self.received = []
    if not self.num_tasks:
        self.progress('No %s tasks were submitted', self.name)
    else:
        self.log_percent = self._log_percent()
        next(self.log_percent)
    self.progress('Sent %s of data in %s task(s)',
                  humansize(sent.sum()), num_tasks)
def export(datastore_key, export_dir='.', calc_id=-1, exports='csv'):
    """
    Export an output from the datastore.
    """
    logging.basicConfig(level=logging.INFO)
    dstore = datastore.read(calc_id)
    parent_id = dstore['oqparam'].hazard_calculation_id
    if parent_id:
        dstore.set_parent(datastore.read(parent_id))
    dstore.export_dir = export_dir
    with performance.Monitor('export', measuremem=True) as mon:
        for fmt in exports.split(','):
            fnames = export_((datastore_key, fmt), dstore)
            nbytes = sum(map(os.path.getsize, fnames))
            print('Exported %s in %s' % (general.humansize(nbytes), fnames))
    if mon.duration > 1:
        print(mon)
def export(datastore_key, calc_id=-1, exports='csv', export_dir='.'):
    """
    Export an output from the datastore.
    """
    logging.basicConfig(level=logging.INFO)
    dstore = datastore.read(calc_id)
    parent_id = dstore['oqparam'].hazard_calculation_id
    if parent_id:
        dstore.parent = datastore.read(parent_id)
    dstore.export_dir = export_dir
    with performance.Monitor('export', measuremem=True) as mon:
        for fmt in exports.split(','):
            fnames = export_((datastore_key, fmt), dstore)
            nbytes = sum(map(os.path.getsize, fnames))
            print('Exported %s in %s' % (general.humansize(nbytes), fnames))
    if mon.duration > 1:
        print(mon)
def _run(job_inis, concurrent_tasks, calc_id, pdb, loglevel, hc, exports,
         params):
    """
    Run one or two calculations (hazard, or hazard + risk).

    :param job_inis: list of 1 or 2 job.ini paths
    :param hc: hazard calculation spec; first element is the calc id
        (negative values count from the end), the rest are rlz ids
    :returns: the calculator object
    """
    global calc_path
    assert len(job_inis) in (1, 2), job_inis
    # set the logs first of all
    calc_id = logs.init(calc_id, getattr(logging, loglevel.upper()))
    # disable gzip_input
    base.BaseCalculator.gzip_inputs = lambda self: None
    with performance.Monitor('total runtime', measuremem=True) as monitor:
        if os.environ.get('OQ_DISTRIBUTE') not in ('no', 'processpool'):
            os.environ['OQ_DISTRIBUTE'] = 'processpool'
        if len(job_inis) == 1:  # run hazard or risk
            if hc:
                hc_id = hc[0]
                rlz_ids = hc[1:]
            else:
                hc_id = None
                rlz_ids = ()
            oqparam = readinput.get_oqparam(job_inis[0], hc_id=hc_id)
            if not oqparam.cachedir:  # enable caching
                oqparam.cachedir = datastore.get_datadir()
            if hc_id and hc_id < 0:  # interpret negative calculation ids
                calc_ids = datastore.get_calc_ids()
                try:
                    hc_id = calc_ids[hc_id]
                except IndexError:
                    raise SystemExit(
                        'There are %d old calculations, cannot '
                        'retrieve the %s' % (len(calc_ids), hc_id))
            calc = base.calculators(oqparam, calc_id)
            calc.run(concurrent_tasks=concurrent_tasks, pdb=pdb,
                     exports=exports, hazard_calculation_id=hc_id,
                     rlz_ids=rlz_ids, **params)
        else:  # run hazard + risk
            calc = run2(job_inis[0], job_inis[1], calc_id, concurrent_tasks,
                        pdb, loglevel, exports, params)
    logging.info('Total time spent: %s s', monitor.duration)
    logging.info('Memory allocated: %s', general.humansize(monitor.mem))
    print('See the output with silx view %s' % calc.datastore.filename)
    calc_path, _ = os.path.splitext(calc.datastore.filename)  # used below
    return calc
def main(datastore_key, calc_id: int = -1, *, exports='csv', export_dir='.'):
    """
    Export an output from the datastore. To see the available datastore
    keys, use the command `oq info exports`.
    """
    dstore = util.read(calc_id)
    parent_id = dstore['oqparam'].hazard_calculation_id
    if parent_id:
        dstore.parent = util.read(parent_id)
    dstore.export_dir = export_dir
    with performance.Monitor('export', measuremem=True) as mon:
        for fmt in exports.split(','):
            fnames = export_((datastore_key, fmt), dstore)
            nbytes = sum(map(os.path.getsize, fnames))
            print('Exported %s in %s' % (general.humansize(nbytes), fnames))
    if mon.duration > 1:
        print(mon)
    dstore.close()
def get(self, what):
    """
    :param what: what to extract
    :returns: an ArrayWrapper instance
    :raises WebAPIError: if the server does not answer with status 200
    """
    url = '%s/v1/calc/%d/extract/%s' % (self.server, self.calc_id, what)
    logging.info('GET %s', url)
    resp = self.sess.get(url)
    if resp.status_code != 200:
        raise WebAPIError(resp.text)
    # use lazy %-args, consistently with the other logging calls: the
    # string is interpolated only if the record is actually emitted
    logging.info('Read %s of data', general.humansize(len(resp.content)))
    npz = numpy.load(io.BytesIO(resp.content))
    # everything except 'array' is treated as metadata attributes
    attrs = {k: npz[k] for k in npz if k != 'array'}
    try:
        arr = npz['array']
    except KeyError:
        arr = ()
    return ArrayWrapper(arr, attrs)
def zip_all_jobs(directory):
    """
    Zip job.ini files recursively
    """
    zips = []
    for cwd, dirs, files in os.walk(directory):
        job_inis = [os.path.join(cwd, f) for f in sorted(files)
                    if f.endswith('.ini')]
        if not job_inis:
            continue
        elif len(job_inis) == 2:
            # hazard + risk pair (alphabetically sorted: hazard first)
            job_ini, risk_ini = job_inis
        else:
            # single .ini expected here;
            # NOTE(review): 3+ inis make this unpacking raise ValueError
            [job_ini], risk_ini = job_inis, ''
        archive_zip = job_ini[:-4].replace('_hazard', '') + '.zip'
        zips.append(zip_job(job_ini, archive_zip, risk_ini))
    total = sum(os.path.getsize(z) for z in zips)
    logging.info('Generated %s of zipped data', general.humansize(total))
def run_calc(log):
    """
    Run a calculation.

    :param log: LogContext of the current job
    :returns: the calculator object
    """
    register_signals()
    setproctitle('oq-job-%d' % log.calc_id)
    with log:
        oqparam = log.get_oqparam()
        calc = base.calculators(oqparam, log.calc_id)
        logging.info('%s running %s [--hc=%s]',
                     getpass.getuser(),
                     calc.oqparam.inputs['job_ini'],
                     calc.oqparam.hazard_calculation_id)
        logging.info('Using engine version %s', __version__)
        msg = check_obsolete_version(oqparam.calculation_mode)
        # NB: disabling the warning should be done only for users with
        # an updated LTS version, but we are doing it for all users
        # if msg:
        #     logging.warning(msg)
        calc.from_engine = True
        if config.zworkers['host_cores']:
            set_concurrent_tasks_default(calc)
        else:
            logging.warning('Assuming %d %s workers',
                            parallel.Starmap.num_cores, OQ_DISTRIBUTE)
        t0 = time.time()
        calc.run()
        logging.info('Exposing the outputs to the database')
        expose_outputs(calc.datastore)
        path = calc.datastore.filename
        size = general.humansize(getsize(path))
        logging.info('Stored %s on %s in %d seconds',
                     size, path, time.time() - t0)
        calc.datastore.close()
        for line in logs.dbcmd('list_outputs', log.calc_id, False):
            general.safeprint(line)
        # sanity check to make sure that the logging on file is working
        if (log.log_file and log.log_file != os.devnull and
                getsize(log.log_file) == 0):
            logging.warning('The log file %s is empty!?' % log.log_file)
    return calc
def export_gmf(ekey, dstore):
    """
    Export the ground motion fields, one file per realization.

    :param ekey: export key, i.e. a pair (datastore key, fmt)
    :param dstore: datastore object
    """
    sitecol = dstore['sitecol']
    rlzs_assoc = dstore['csm_info'].get_rlzs_assoc()
    oq = dstore['oqparam']
    investigation_time = (None if oq.calculation_mode == 'scenario'
                          else oq.investigation_time)
    fmt = ekey[-1]
    gmf_data = dstore['gmf_data']
    nbytes = gmf_data.attrs['nbytes']
    logging.info('Internal size of the GMFs: %s', humansize(nbytes))
    if nbytes > GMF_MAX_SIZE:
        logging.warn(GMF_WARNING, dstore.hdf5path)
    fnames = []
    ruptures_by_rlz = collections.defaultdict(list)
    # first pass: collect the Rup objects per realization
    for grp_id, gsim in rlzs_assoc:
        key = 'grp-%02d' % grp_id
        try:
            events = dstore['events/' + key]
        except KeyError:  # source model producing zero ruptures
            continue
        eventdict = dict(zip(events['eid'], events))
        try:
            data = gmf_data['%s/%s' % (key, gsim)].value
        except KeyError:  # no GMFs for the given realization
            continue
        for rlzi, rlz in enumerate(rlzs_assoc[grp_id, gsim]):
            ruptures = ruptures_by_rlz[rlz]
            gmf_arr = get_array(data, rlzi=rlzi)
            # one Rup per event id, carrying site ids and GMF records
            for eid, gmfa in group_array(gmf_arr, 'eid').items():
                ses_idx = eventdict[eid]['ses']
                rup = Rup(eid, ses_idx, sorted(set(gmfa['sid'])), gmfa)
                ruptures.append(rup)
    # second pass: export one file per realization, events sorted by eid
    for rlz in sorted(ruptures_by_rlz):
        ruptures_by_rlz[rlz].sort(key=operator.attrgetter('eid'))
        fname = dstore.build_fname('gmf', rlz, fmt)
        fnames.append(fname)
        globals()['export_gmf_%s' % fmt](
            ('gmf', fmt), fname, sitecol, oq.imtls, ruptures_by_rlz[rlz],
            rlz, investigation_time)
    return fnames
def execute(self):
    """
    Submit one task per block of ruptures and reduce the partial results.

    :returns: the accumulated result array of length self.N
    """
    oq = self.oqparam
    self.set_param(
        num_taxonomies=self.assetcol.num_taxonomies_by_site(),
        maxweight=oq.ebrisk_maxweight / (oq.concurrent_tasks or 1),
        epspath=cache_epsilons(
            self.datastore, oq, self.assetcol, self.riskmodel, self.E))
    parent = self.datastore.parent
    if parent:
        # read the ruptures from the parent (hazard) calculation
        hdf5path = parent.filename
        grp_indices = parent['ruptures'].attrs['grp_indices']
        nruptures = len(parent['ruptures'])
    else:
        # no parent: copy the rupture datasets into an HDF5 cache file
        hdf5path = self.datastore.hdf5cache()
        grp_indices = self.datastore['ruptures'].attrs['grp_indices']
        nruptures = len(self.datastore['ruptures'])
        with hdf5.File(hdf5path, 'r+') as cache:
            self.datastore.hdf5.copy('weights', cache)
            self.datastore.hdf5.copy('ruptures', cache)
            self.datastore.hdf5.copy('rupgeoms', cache)
    self.init_logic_tree(self.csm_info)
    smap = parallel.Starmap(
        self.core_task.__func__, monitor=self.monitor())
    trt_by_grp = self.csm_info.grp_by("trt")
    samples = self.csm_info.get_samples_by_grp()
    rlzs_by_gsim_grp = self.csm_info.get_rlzs_by_gsim_grp()
    # aim for about concurrent_tasks blocks of ruptures
    ruptures_per_block = numpy.ceil(nruptures / (oq.concurrent_tasks or 1))
    first_event = 0
    for grp_id, rlzs_by_gsim in rlzs_by_gsim_grp.items():
        start, stop = grp_indices[grp_id]
        for indices in general.block_splitter(
                range(start, stop), ruptures_per_block):
            rgetter = getters.RuptureGetter(
                hdf5path, list(indices), grp_id, trt_by_grp[grp_id],
                samples[grp_id], rlzs_by_gsim, first_event)
            first_event += rgetter.num_events
            smap.submit(rgetter, self.src_filter, self.param)
    self.events_per_sid = []
    self.gmf_nbytes = 0
    res = smap.reduce(self.agg_dicts, numpy.zeros(self.N))
    logging.info('Produced %s of GMFs', general.humansize(self.gmf_nbytes))
    return res
def save_gmdata(calc, n_rlzs):
    """
    Save a composite array `gmdata` in the datastore.

    :param calc: a calculator with a dictionary .gmdata {rlz: data}
    :param n_rlzs: the total number of realizations
    """
    num_sites = len(calc.sitecol)
    # one float field per IMT, plus the event and byte counters
    dtypes = [(imt, F32) for imt in calc.oqparam.imtls]
    dtypes.extend([('events', U32), ('nbytes', U32)])
    out = numpy.zeros(n_rlzs, dtypes)
    for rlzi, data in sorted(calc.gmdata.items()):
        # data layout: per-IMT sums, then the number of events,
        # then the number of bytes
        n_events = data[-2]
        n_bytes = data[-1]
        mean_gmv = data[:-2] / n_events / num_sites
        out[rlzi] = tuple(mean_gmv) + (n_events, n_bytes)
    calc.datastore['gmdata'] = out
    logging.info('Generated %s of GMFs', humansize(out['nbytes'].sum()))
def post_execute(self, dummy):
    """
    Build the aggregated curves from the `risk_by_event` table and
    store them in the `aggcurves` dataset.

    :param dummy: unused (signature imposed by the calculator API)
    """
    oq = self.oqparam
    L, Dc = self.dmgcsq.shape[1:]
    # NB: removed a dead commented-out block (a stray triple-quoted
    # string expression) that sketched a 'dmgcsq' DataFrame export and
    # referenced `dic` before its definition
    size = self.datastore.getsize('risk_by_event')
    logging.info('Building aggregated curves from %s of risk_by_event',
                 general.humansize(size))
    builder = get_loss_builder(self.datastore)
    alt_df = self.datastore.read_df('risk_by_event')
    del alt_df['event_id']  # not needed for the aggregation
    dic = general.AccumDict(accum=[])
    columns = sorted(
        set(alt_df.columns) - {'agg_id', 'loss_id', 'variance'})
    # return_period 0 is used to store the totals
    periods = [0] + list(builder.return_periods)
    for (agg_id, loss_id), df in alt_df.groupby(
            [alt_df.agg_id, alt_df.loss_id]):
        tots = [df[col].sum() * oq.time_ratio for col in columns]
        curves = [builder.build_curve(df[col].to_numpy())
                  for col in columns]
        for p, period in enumerate(periods):
            dic['agg_id'].append(agg_id)
            dic['loss_id'].append(loss_id)
            dic['return_period'].append(period)
            if p == 0:  # totals
                for col, tot in zip(columns, tots):
                    dic[col].append(tot)
            else:  # p-th point of each curve
                for col, curve in zip(columns, curves):
                    dic[col].append(curve[p - 1])
    fix_dtype(dic, U16, ['agg_id'])
    fix_dtype(dic, U8, ['loss_id'])
    fix_dtype(dic, U32, ['return_period'])
    fix_dtype(dic, F32, columns)
    ls = ' '.join(self.crmodel.damage_states[1:])
    self.datastore.create_df('aggcurves', dic.items(), limit_states=ls)
def post_execute(self, dummy):
    """
    Store the damage distributions per asset and build the aggregated
    curves from the `risk_by_event` table.

    :param dummy: unused (signature imposed by the calculator API)
    """
    oq = self.oqparam
    A, L, Dc = self.dmgcsq.shape
    # scale the cumulated damages by the time ratio
    dmgcsq = self.dmgcsq * oq.time_ratio
    # reshape to (assets, 1 realization, loss types, damage states)
    self.datastore['damages-rlzs'] = dmgcsq.reshape((A, 1, L, Dc))
    set_rlzs_stats(self.datastore, 'damages',
                   asset_id=self.assetcol['id'],
                   loss_type=oq.loss_names,
                   dmg_state=['no_damage'] + self.crmodel.get_dmg_csq())
    size = self.datastore.getsize('risk_by_event')
    logging.info('Building aggregated curves from %s of risk_by_event',
                 general.humansize(size))
    alt_df = self.datastore.read_df('risk_by_event')
    del alt_df['event_id']  # not needed for the aggregation
    dic = general.AccumDict(accum=[])
    columns = sorted(
        set(alt_df.columns) - {'agg_id', 'loss_id', 'variance'})
    # return_period 0 is used to store the totals
    periods = [0] + list(self.builder.return_periods)
    for (agg_id, loss_id), df in alt_df.groupby(
            [alt_df.agg_id, alt_df.loss_id]):
        tots = [df[col].sum() * oq.time_ratio for col in columns]
        curves = [self.builder.build_curve(df[col].to_numpy())
                  for col in columns]
        for p, period in enumerate(periods):
            dic['agg_id'].append(agg_id)
            dic['loss_id'].append(loss_id)
            dic['return_period'].append(period)
            if p == 0:  # totals
                for col, tot in zip(columns, tots):
                    dic[col].append(tot)
            else:  # p-th point of each curve
                for col, curve in zip(columns, curves):
                    dic[col].append(curve[p - 1])
    fix_dtype(dic, U16, ['agg_id'])
    fix_dtype(dic, U8, ['loss_id'])
    fix_dtype(dic, U32, ['return_period'])
    fix_dtype(dic, F32, columns)
    ls = ' '.join(self.crmodel.damage_states[1:])
    self.datastore.create_df('aggcurves', dic.items(), limit_states=ls)
    # consistency check between the stored damages and the curves
    self.sanity_check(dmgcsq)
def __init__(self, iresults, taskname, num_tasks,
             progress=logging.info, sent=0):
    """
    :param iresults: iterable over the task results
    :param taskname: the name of the tasks
    :param num_tasks: the number of submitted tasks
    :param progress: logging function (logging.info by default)
    :param sent: dict of sent data, or 0 when nothing was sent
    """
    self.iresults = iresults
    self.name = taskname
    self.num_tasks = num_tasks
    self.progress = progress
    self.sent = sent
    self.received = []
    if not self.num_tasks:
        self.progress('No %s tasks were submitted', self.name)
    else:
        # prime the percent logger so the first result logs correctly
        self.log_percent = self._log_percent()
        next(self.log_percent)
    if sent:
        # NOTE(review): when truthy, `sent` looks like a dict of byte
        # counts — confirm against the callers
        total_bytes = sum(sent.values())
        self.progress('Sent %s of data in %s task(s)',
                      humansize(total_bytes), num_tasks)
def pre_execute(self):
    """
    Read the asset collection, compute the aggregation keys and
    detect if a reaggregation is needed (i.e. the parent calculation
    used a different `aggregate_by`).
    """
    oq = self.oqparam
    ds = self.datastore
    self.reaggreate = False  # NB: (sic) attribute name kept as-is
    if oq.hazard_calculation_id and not ds.parent:
        ds.parent = datastore.read(oq.hazard_calculation_id)
        assetcol = ds['assetcol']
        self.aggkey = base.save_agg_values(
            ds, assetcol, oq.loss_names, oq.aggregate_by)
        aggby = ds.parent['oqparam'].aggregate_by
        # reaggregate only when the parent aggregated differently
        self.reaggreate = aggby and oq.aggregate_by != aggby
        if self.reaggreate:
            self.num_tags = dict(
                zip(aggby, assetcol.tagcol.agg_shape(aggby)))
    else:
        assetcol = ds['assetcol']
        self.aggkey = assetcol.tagcol.get_aggkey(oq.aggregate_by)
    self.L = len(oq.loss_names)
    size = general.humansize(ds.getsize('agg_loss_table'))
    logging.info('Stored %s in the agg_loss_table', size)
def get_max_gmf_size(dstore):
    """
    Upper limit for the size of the GMFs
    """
    oq = dstore['oqparam']
    n_imts = len(oq.imtls)
    rlzs_by_trt_id = dstore['csm_info'].get_rlzs_assoc().get_rlzs_by_trt_id()
    rupture_counter = collections.Counter()  # trt_id -> #ruptures
    byte_counter = collections.Counter()  # trt_id -> estimated bytes
    for serial in dstore['sescollection']:
        ebr = dstore['sescollection/' + serial]
        trt = ebr.trt_id
        rupture_counter[trt] += 1
        # 4 bytes for each float32 ground motion value
        n_rlzs = len(rlzs_by_trt_id[trt])
        byte_counter[trt] += (
            len(ebr.indices) * ebr.multiplicity * n_rlzs * n_imts) * 4
    # keep only the tectonic region type with the biggest estimate
    [(biggest_trt, maxsize)] = byte_counter.most_common(1)
    return dict(n_imts=n_imts,
                size=maxsize,
                n_ruptures=rupture_counter[biggest_trt],
                n_rlzs=len(rlzs_by_trt_id[biggest_trt]),
                trt_id=biggest_trt,
                humansize=humansize(maxsize))
def run(job_ini, concurrent_tasks=None, pdb=None, loglevel='info', hc=None,
        exports=''):
    """
    Run a calculation. Optionally, set the number of concurrent_tasks
    (0 to disable the parallelization).
    """
    logging.basicConfig(level=getattr(logging, loglevel.upper()))
    job_inis = job_ini.split(',')
    assert len(job_inis) in (1, 2), job_inis
    monitor = performance.PerformanceMonitor('total', measuremem=True)
    if len(job_inis) == 1:  # run hazard or risk
        oqparam = readinput.get_oqparam(job_inis[0], hc_id=hc)
        if hc and hc < 0:  # interpret negative calculation ids
            calc_ids = datastore.get_calc_ids()
            try:
                hc = calc_ids[hc]
            except IndexError:
                raise SystemExit('There are %d old calculations, cannot '
                                 'retrieve the %s' % (len(calc_ids), hc))
        calc = base.calculators(oqparam, monitor)
        monitor.monitor_dir = calc.datastore.calc_dir
        with monitor:
            calc.run(concurrent_tasks=concurrent_tasks, pdb=pdb,
                     exports=exports, hazard_calculation_id=hc)
    else:  # run hazard + risk
        calc = run2(job_inis[0], job_inis[1], concurrent_tasks, pdb,
                    exports, monitor)
    logging.info('Total time spent: %s s', monitor.duration)
    logging.info('Memory allocated: %s', general.humansize(monitor.mem))
    monitor.flush()
    print('See the output with hdfview %s/output.hdf5' %
          calc.datastore.calc_dir)
    return calc
def submit_all(self):
    """
    Submit all the task arguments and return an object to iterate
    over the results.

    :returns: an IterResult object
    """
    try:
        nargs = len(self.task_args)
    except TypeError:  # generators have no len
        nargs = ''
    if nargs == 1:
        # a single task: run it in-process, avoiding any
        # distribution overhead
        [args] = self.task_args
        self.progress('Executing "%s" in process', self.name)
        fut = mkfuture(safely_call(self.task_func, args))
        return IterResult([fut], self.name, nargs)
    if self.distribute == 'qsub':
        logging.warn('EXPERIMENTAL: sending tasks to the grid engine')
        allargs = list(self.task_args)
        return IterResult(qsub(self.task_func, allargs), self.name,
                          len(allargs), self.progress)
    task_no = 0
    for args in self.task_args:
        task_no += 1
        if task_no == 1:  # first time
            self.progress('Submitting %s "%s" tasks', nargs, self.name)
        if isinstance(args[-1], Monitor):
            # add incremental task number and task weight
            args[-1].task_no = task_no
            args[-1].weight = getattr(args[0], 'weight', 1.)
        self.submit(*args)
    if not task_no:
        self.progress('No %s tasks were submitted', self.name)
    # NB: keep self._iterfutures() an iterator, especially with celery!
    ir = IterResult(self._iterfutures(), self.name, task_no,
                    self.progress)
    ir.sent = self.sent  # for information purposes
    if self.sent:
        self.progress('Sent %s of data in %d task(s)',
                      humansize(sum(self.sent.values())),
                      ir.num_tasks)
    return ir
def __init__(self, futures, taskname, num_tasks,
             progress=logging.info, sent=0):
    """
    :param futures: futures producing the task results
    :param taskname: the name of the tasks
    :param num_tasks: the number of submitted tasks
    :param progress: logging function (logging.info by default)
    :param sent: dict of sent data, or 0 when nothing was sent
    """
    self.futures = futures
    self.name = taskname
    self.num_tasks = num_tasks
    # private tasks (leading underscore) are logged only at debug level
    is_private = self.name.startswith("_")
    self.progress = logging.debug if is_private else progress
    self.sent = sent
    self.received = []
    if self.num_tasks:
        # prime the percent logger so the first result logs correctly
        self.log_percent = self._log_percent()
        next(self.log_percent)
    if sent:
        total_bytes = sum(sent.values())
        self.progress('Sent %s of data in %s task(s)',
                      humansize(total_bytes), num_tasks)
def _run(job_ini, concurrent_tasks, pdb, loglevel, hc, exports, params):
    """
    Run a hazard and/or risk calculation and return the calculator.
    Also sets the global `calc_path`, used for the .pstat filename.
    """
    global calc_path
    logging.basicConfig(level=getattr(logging, loglevel.upper()))
    job_inis = job_ini.split(',')
    assert len(job_inis) in (1, 2), job_inis
    monitor = performance.Monitor(
        'total runtime', measuremem=True)
    if len(job_inis) == 1:  # run hazard or risk
        if hc:
            # hc is a list: first the calculation id, then the rlz ids
            hc_id = hc[0]
            rlz_ids = hc[1:]
        else:
            hc_id = None
            rlz_ids = ()
        oqparam = readinput.get_oqparam(job_inis[0], hc_id=hc_id)
        if hc_id and hc_id < 0:  # interpret negative calculation ids
            calc_ids = datastore.get_calc_ids()
            try:
                hc_id = calc_ids[hc_id]
            except IndexError:
                raise SystemExit(
                    'There are %d old calculations, cannot '
                    'retrieve the %s' % (len(calc_ids), hc_id))
        calc = base.calculators(oqparam, monitor)
        with calc.monitor:
            calc.run(concurrent_tasks=concurrent_tasks, pdb=pdb,
                     exports=exports, hazard_calculation_id=hc_id,
                     rlz_ids=rlz_ids, **params)
    else:  # run hazard + risk
        calc = run2(
            job_inis[0], job_inis[1], concurrent_tasks, pdb,
            exports, params, monitor)
    logging.info('Total time spent: %s s', monitor.duration)
    logging.info('Memory allocated: %s', general.humansize(monitor.mem))
    monitor.flush()
    print('See the output with hdfview %s' % calc.datastore.hdf5path)
    calc_path = calc.datastore.calc_dir  # used for the .pstat filename
    return calc
def _print_info(dstore, filtersources=True, weightsources=True):
    """
    Print summary information about the composite source model and
    the effective realizations stored in the datastore.
    """
    assoc = dstore['rlzs_assoc']
    oqparam = OqParam.from_(dstore.attrs)
    csm = dstore['composite_source_model']
    sitecol = dstore['sitecol']
    print(csm.get_info())
    print('See https://github.com/gem/oq-risklib/blob/master/doc/'
          'effective-realizations.rst for an explanation')
    print(assoc)
    if filtersources or weightsources:
        [info] = readinput.get_job_info(oqparam, csm, sitecol)
        info['n_sources'] = csm.get_num_sources()
        # 8 bytes per float64 value in the curve matrix
        curve_matrix_size = (info['n_sites'] * info['n_levels'] *
                             info['n_imts'] * len(assoc) * 8)
        for k in info.dtype.fields:
            if k == 'input_weight' and not weightsources:
                pass  # skip the weight when not requested
            else:
                print(k, info[k])
        print('curve_matrix_size', humansize(curve_matrix_size))
    if 'num_ruptures' in dstore:
        print(datastore.view('rupture_collections', dstore))
def _run(job_inis, concurrent_tasks, pdb, loglevel, hc, exports, params):
    """
    Run a hazard and/or risk calculation and return the calculator.
    Also sets the global `calc_path`, used below.
    """
    global calc_path
    assert len(job_inis) in (1, 2), job_inis
    # set the logs first of all
    calc_id = logs.init(level=getattr(logging, loglevel.upper()))
    with performance.Monitor('total runtime', measuremem=True) as monitor:
        if len(job_inis) == 1:  # run hazard or risk
            if hc:
                # hc is a list: first the calculation id, then rlz ids
                hc_id = hc[0]
                rlz_ids = hc[1:]
            else:
                hc_id = None
                rlz_ids = ()
            oqparam = readinput.get_oqparam(job_inis[0], hc_id=hc_id)
            # override the parameters read from the job.ini
            vars(oqparam).update(params)
            if hc_id and hc_id < 0:  # interpret negative calculation ids
                calc_ids = datastore.get_calc_ids()
                try:
                    hc_id = calc_ids[hc_id]
                except IndexError:
                    raise SystemExit('There are %d old calculations, cannot '
                                     'retrieve the %s' %
                                     (len(calc_ids), hc_id))
            calc = base.calculators(oqparam, calc_id)
            calc.run(concurrent_tasks=concurrent_tasks, pdb=pdb,
                     exports=exports, hazard_calculation_id=hc_id,
                     rlz_ids=rlz_ids)
        else:  # run hazard + risk
            calc = run2(
                job_inis[0], job_inis[1], calc_id, concurrent_tasks, pdb,
                loglevel, exports, params)
    logging.info('Total time spent: %s s', monitor.duration)
    logging.info('Memory allocated: %s', general.humansize(monitor.mem))
    print('See the output with hdfview %s' % calc.datastore.hdf5path)
    calc_path, _ = os.path.splitext(calc.datastore.hdf5path)  # used below
    return calc
def export_hcurves_rlzs(ekey, dstore):
    """
    Export all hazard curves in a single .hdf5 file. This is not
    recommended, even if this exporter is parallel and very efficient.
    I was able to export 6 GB of curves per minute. However for large
    calculations it is then impossible to view the .hdf5 file with the
    hdfviewer because you will run out of memory. Also, compression is
    not enabled, otherwise all the time will be spent in the
    compression phase in the controller node with the workers doing
    nothing. The recommended way to postprocess large computations is
    to instantiate the PmapGetter and to work one block of sites at
    the time, discarding what it is not needed. The exporter here is
    meant for small/medium calculation and as an example of what you
    should implement yourself if you need to postprocess the hazard
    curves.
    """
    oq = dstore['oqparam']
    imtls = oq.imtls
    rlzs_assoc = dstore['csm_info'].get_rlzs_assoc()
    sitecol = dstore['sitecol']
    pgetter = calc.PmapGetter(dstore, rlzs_assoc)
    N = len(sitecol)
    R = len(rlzs_assoc.realizations)
    fname = dstore.export_path('%s.%s' % ekey)
    monitor = performance.Monitor(ekey[0], fname)
    size = humansize(dstore.get_attr('poes', 'nbytes'))
    logging.info('Reading %s of probability maps', size)
    # one tile of sites per realization; each task gets its own getter
    allargs = [(pgetter.new(tile.sids), imtls, monitor)
               for tile in sitecol.split_in_tiles(R)]
    with hdf5.File(fname, 'w') as f:
        f['imtls'] = imtls
        dset = f.create_dataset('hcurves-rlzs', (N, R), imtls.dt)
        dset.attrs['investigation_time'] = oq.investigation_time
        logging.info('Building the hazard curves for %d sites, %d rlzs',
                     N, R)
        # fill the dataset as the workers produce the curves
        for sids, allcurves in parallel.Processmap(build_hcurves, allargs):
            for sid, curves in zip(sids, allcurves):
                dset[sid] = curves
    return [fname]
def export_gmf(ekey, dstore):
    """
    Export the ground motion fields of a scenario calculation,
    one file per realization.

    :param ekey: export key, i.e. a pair (datastore key, fmt)
    :param dstore: datastore object
    :returns: the list of exported file names (empty for non-scenario)
    """
    oq = dstore['oqparam']
    if not oq.calculation_mode.startswith('scenario'):
        # NB: logging.warn is a deprecated alias of logging.warning
        logging.warning('The GMF exporter in .xml format has been removed, '
                        'use the one in .csv format')
        return []
    sitecol = dstore['sitecol']
    investigation_time = (None if oq.calculation_mode == 'scenario'
                          else oq.investigation_time)
    fmt = ekey[-1]
    gmf_data = dstore['gmf_data']
    nbytes = gmf_data.attrs['nbytes']
    logging.info('Internal size of the GMFs: %s', humansize(nbytes))
    if nbytes > GMF_MAX_SIZE:
        logging.warning(GMF_WARNING, dstore.hdf5path)
    fnames = []
    events_by_rlz = collections.defaultdict(list)
    data = gmf_data['data'].value
    ses_idx = 1  # for scenario only
    for rlzi, gmf_arr in group_array(data, 'rlzi').items():
        events = events_by_rlz[rlzi]
        for eid, gmfa in group_array(gmf_arr, 'eid').items():
            rup = Event(eid, ses_idx, sorted(set(gmfa['sid'])), gmfa)
            events.append(rup)
    rlzs = dstore['csm_info'].get_rlzs_assoc().realizations
    for rlzi in sorted(events_by_rlz):
        # the exporters expect the events ordered by event id
        events_by_rlz[rlzi].sort(key=operator.attrgetter('eid'))
        fname = dstore.build_fname('gmf', rlzi, fmt)
        fnames.append(fname)
        # dispatch to the format-specific exporter (export_gmf_xml, ...)
        globals()['export_gmf_%s' % fmt](
            ('gmf', fmt), fname, sitecol, oq.imtls, events_by_rlz[rlzi],
            rlzs[rlzi], investigation_time)
    return fnames
def execute(self): self.datastore.flush() # just to be sure oq = self.oqparam self.set_param(hdf5path=self.datastore.filename, tempname=cache_epsilons(self.datastore, oq, self.assetcol, self.crmodel, self.E)) srcfilter = self.src_filter() logging.info('Sending {:_d} ruptures'.format( len(self.datastore['ruptures']))) self.events_per_sid = [] self.datastore.swmr_on() self.avg_gmf = general.AccumDict(accum=numpy.zeros(self.N, F32)) # imt -> gmvs smap = parallel.Starmap(start_ebrisk, h5=self.datastore.hdf5) smap.monitor.save('srcfilter', srcfilter) smap.monitor.save('crmodel', self.crmodel) for rg in getters.gen_rupture_getters(self.datastore, oq.concurrent_tasks): smap.submit((rg, self.param)) smap.reduce(self.agg_dicts) gmf_bytes = self.datastore['gmf_info']['gmfbytes'].sum() logging.info('Produced %s of GMFs', general.humansize(gmf_bytes)) return 1
def pre_execute(self):
    """
    Set the risk parameters, estimate the size of the event loss
    table and create the output datasets.

    :raises RuntimeError: if the event loss table per task would
        exceed 4 GB (the transfer limit)
    """
    oq = self.oqparam
    # the GMFs are recomputed on the fly, not stored
    oq.ground_motion_fields = False
    super().pre_execute()
    self.param['lba'] = lba = (
        LossesByAsset(self.assetcol, oq.loss_names,
                      self.policy_name, self.policy_dict))
    self.param['ses_ratio'] = oq.ses_ratio
    self.param['aggregate_by'] = oq.aggregate_by
    self.param.pop('oqparam', None)  # unneeded
    self.L = L = len(lba.loss_names)
    A = len(self.assetcol)
    self.datastore.create_dset('avg_losses-stats', F32, (A, 1, L))  # mean
    shp = self.assetcol.tagcol.agg_shape((L, ), oq.aggregate_by)
    elt_dt = [('event_id', U32), ('rlzi', U16), ('loss', (F32, shp))]
    # 4 bytes per float32 loss value
    elt_nbytes = 4 * self.E * numpy.prod(shp)
    logging.info('Approx size of the event loss table: %s',
                 general.humansize(elt_nbytes))
    if elt_nbytes / (oq.concurrent_tasks or 1) > TWO32:
        raise RuntimeError('The event loss table is too big to be transfer'
                           'red with %d tasks' % oq.concurrent_tasks)
    self.datastore.create_dset('losses_by_event', elt_dt)
    self.zerolosses = numpy.zeros(shp, F32)  # to get the multi-index
    self.datastore.create_dset('gmf_info', gmf_info_dt)
def __init__(self, dstore):
    """
    Build the header of the report from the datastore metadata.

    :param dstore: a datastore instance
    """
    self.dstore = dstore
    self.oq = oq = dstore['oqparam']
    # RST title: the description underlined with '=' characters
    self.text = (decode(oq.description) + '\n' +
                 '=' * len(oq.description))
    try:
        info = {decode(k): ast.literal_eval(decode(v))
                for k, v in dict(dstore['job_info']).items()}
    except KeyError:  # job_info not in the datastore (scenario hazard)
        info = dict(hostname='localhost')
    dpath = dstore.hdf5path
    mtime = os.path.getmtime(dpath)
    host = '%s:%s' % (info['hostname'], decode(dpath))
    updated = str(time.ctime(mtime))
    versions = sorted(dstore['/'].attrs.items())
    self.text += '\n\n' + views.rst_table([[host, updated]] + versions)
    # NB: in the future, the sitecol could be transferred as
    # an array by leveraging the HDF5 serialization protocol;
    # for the moment however the size of the
    # data to transfer is given by the usual pickle
    sitecol_size = humansize(len(parallel.Pickled(dstore['sitecol'])))
    self.text += '\n\nnum_sites = %d, sitecol = %s' % (
        len(dstore['sitecol']), sitecol_size)
def post_execute(self, result):
    """
    Save the SES collection, set the event years, optionally compute
    the statistical hazard curves and compare them with the classical
    ones.

    :param result: dict of ProbabilityMaps produced by the tasks
    :raises RuntimeError: if no seismic events were generated
    """
    oq = self.oqparam
    N = len(self.sitecol.complete)
    L = len(oq.imtls.array)
    if oq.hazard_calculation_id is None:
        self.rupser.close()
        num_events = sum(set_counts(self.datastore, 'events').values())
        if num_events == 0:
            raise RuntimeError(
                'No seismic events! Perhaps the investigation time is too '
                'small or the maximum_distance is too small')
        if oq.save_ruptures:
            logging.info('Setting %d event years on %d ruptures',
                         num_events, self.rupser.nruptures)
            with self.monitor('setting event years', measuremem=True,
                              autoflush=True):
                # seeded for reproducibility
                numpy.random.seed(self.oqparam.ses_seed)
                set_random_years(self.datastore, 'events',
                                 int(self.oqparam.investigation_time))
    if self.gmf_size:
        self.datastore.set_attrs('events', max_gmf_size=self.gmf_size)
        msg = 'less than ' if self.get_min_iml(self.oqparam).sum() else ''
        logging.info('Generating %s%s of GMFs', msg,
                     humansize(self.gmf_size))
    if oq.hazard_curves_from_gmfs:
        rlzs = self.csm_info.rlzs_assoc.realizations
        # compute and save statistics; this is done in process and can
        # be very slow if there are thousands of realizations
        weights = [rlz.weight for rlz in rlzs]
        hstats = self.oqparam.hazard_stats()
        if len(hstats):
            logging.info('Computing statistical hazard curves')
            for kind, stat in hstats:
                pmap = compute_pmap_stats(result.values(), [stat], weights)
                arr = numpy.zeros((N, L), F32)
                for sid in pmap:
                    arr[sid] = pmap[sid].array[:, 0]
                self.datastore['hcurves/' + kind] = arr
        self.save_hmaps()
    if self.datastore.parent:
        self.datastore.parent.open('r')
    if 'gmf_data' in self.datastore:
        self.save_gmf_bytes()
    if oq.compare_with_classical:  # compute classical curves
        export_dir = os.path.join(oq.export_dir, 'cl')
        if not os.path.exists(export_dir):
            os.makedirs(export_dir)
        oq.export_dir = export_dir
        # one could also set oq.number_of_logic_tree_samples = 0
        self.cl = ClassicalCalculator(oq)
        # TODO: perhaps it is possible to avoid reprocessing the source
        # model, however usually this is quite fast and do not dominate
        # the computation
        self.cl.run(close=False)
        cl_mean_curves = get_mean_curves(self.cl.datastore)
        eb_mean_curves = get_mean_curves(self.datastore)
        rdiff, index = util.max_rel_diff_index(
            cl_mean_curves, eb_mean_curves)
        # NB: logging.warn is a deprecated alias of logging.warning
        logging.warning(
            'Relative difference with the classical '
            'mean curves: %d%% at site index %d', rdiff * 100, index)
def execute(self):
    """
    Build one task per block of ruptures (weighted by the number of
    occurrences) and reduce the results into an array of shape (N,).
    """
    self.datastore.flush()  # just to be sure
    oq = self.oqparam
    parent = self.datastore.parent
    if parent:
        # read the ruptures from the parent calculation
        grp_indices = parent['ruptures'].attrs['grp_indices']
        n_occ = parent['ruptures']['n_occ']
        dstore = parent
        csm_info = parent['csm_info']
    else:
        grp_indices = self.datastore['ruptures'].attrs['grp_indices']
        n_occ = self.datastore['ruptures']['n_occ']
        dstore = self.datastore
        csm_info = self.csm_info
    # aim at roughly one block of occurrences per task
    per_block = numpy.ceil(n_occ.sum() / (oq.concurrent_tasks or 1))
    self.set_param(
        hdf5path=self.datastore.filename,
        task_duration=oq.task_duration or 1200,  # 20min
        tempname=cache_epsilons(
            self.datastore, oq, self.assetcol, self.crmodel, self.E))
    self.init_logic_tree(csm_info)
    trt_by_grp = csm_info.grp_by("trt")
    samples = csm_info.get_samples_by_grp()
    rlzs_by_gsim_grp = csm_info.get_rlzs_by_gsim_grp()
    ngroups = 0
    fe = 0  # running index over the event slices
    eslices = self.datastore['eslices']
    allargs = []
    allpairs = list(enumerate(n_occ))  # (rupture index, n_occ) pairs
    srcfilter = self.src_filter(self.datastore.tempname)
    for grp_id, rlzs_by_gsim in rlzs_by_gsim_grp.items():
        start, stop = grp_indices[grp_id]
        if start == stop:  # no ruptures for the given grp_id
            continue
        ngroups += 1
        for pairs in general.block_splitter(
                allpairs[start:stop], per_block, weight=get_n_occ):
            indices = [i for i, n in pairs]
            rup_array = dstore['ruptures'][indices]
            rgetter = getters.RuptureGetter(
                rup_array, dstore.filename, grp_id, trt_by_grp[grp_id],
                samples[grp_id], rlzs_by_gsim,
                eslices[fe:fe + len(indices), 0])
            allargs.append((rgetter, srcfilter, self.param))
            fe += len(indices)
    logging.info('Found %d/%d source groups with ruptures',
                 ngroups, len(rlzs_by_gsim_grp))
    self.events_per_sid = []
    self.lossbytes = 0  # updated by agg_dicts while reducing
    # enable single-writer/multiple-reader mode before spawning tasks
    self.datastore.swmr_on()
    smap = parallel.Starmap(
        self.core_task.__func__, allargs, h5=self.datastore.hdf5)
    res = smap.reduce(self.agg_dicts, numpy.zeros(self.N))
    gmf_bytes = self.datastore['gmf_info']['gmfbytes'].sum()
    logging.info('Produced %s of GMFs', general.humansize(gmf_bytes))
    logging.info('Produced %s of losses',
                 general.humansize(self.lossbytes))
    return res
def acc0(self):
    """
    Initial accumulator, a dict grp_id -> ProbabilityMap(L, G)
    """
    zd = AccumDict()
    num_levels = len(self.oqparam.imtls.array)
    # base rupture parameters; distance parameters get a '_' suffix
    rparams = {'grp_id', 'occurrence_rate', 'weight', 'probs_occur',
               'lon_', 'lat_', 'rrup_'
               #} ,'source_id'}
               }
    gsims_by_trt = self.full_lt.get_gsims_by_trt()
    n = len(self.full_lt.sm_rlzs)
    trts = list(self.full_lt.gsim_lt.values)
    for sm in self.full_lt.sm_rlzs:
        for grp_id in self.full_lt.grp_ids(sm.ordinal):
            trt = trts[grp_id // n]
            gsims = gsims_by_trt[trt]
            cm = ContextMaker(trt, gsims)
            # collect the parameters required by the GSIMs
            rparams.update(cm.REQUIRES_RUPTURE_PARAMETERS)
            for dparam in cm.REQUIRES_DISTANCES:
                rparams.add(dparam + '_')
            zd[grp_id] = ProbabilityMap(num_levels, len(gsims))
    zd.eff_ruptures = AccumDict(accum=0)  # trt -> eff_ruptures
    #if self.few_sites:
    self.rparams = sorted(rparams)
    # create one 'rup/<param>' dataset per rupture parameter
    for k in self.rparams:
        # variable length arrays
        if k == 'grp_id':
            #print(k)
            self.datastore.create_dset('rup/' + k, U16)
        #############################################
        elif k == 'source_id':
            print(k)
            self.datastore.create_dset('rup/' + k, hdf5.vstr)
        #################################################
        elif k == 'probs_occur':  # vlen
            #print(k)
            self.datastore.create_dset('rup/' + k, hdf5.vfloat32)
        elif k.endswith('_'):  # array of shape (U, N)
            #print(k)
            self.datastore.create_dset(
                'rup/' + k, F32, shape=(None, self.N), compression='gzip')
        else:
            self.datastore.create_dset('rup/' + k, F32)
    #else:
    #    self.rparams = {}
    self.by_task = {}  # task_no => src_ids
    self.totrups = 0  # total number of ruptures before collapsing
    self.maxradius = 0
    self.gidx = {tuple(grp_ids): i
                 for i, grp_ids in enumerate(self.datastore['grp_ids'])}
    # estimate max memory per core: 8 bytes per poe
    max_num_gsims = max(len(gsims) for gsims in gsims_by_trt.values())
    max_num_grp_ids = max(len(grp_ids) for grp_ids in self.gidx)
    pmapbytes = self.N * num_levels * max_num_gsims * max_num_grp_ids * 8
    if pmapbytes > TWO32:  # more than 4 GB per core
        logging.warning(
            TOOBIG % (self.N, num_levels, max_num_gsims,
                      max_num_grp_ids, humansize(pmapbytes)))
    logging.info(MAXMEMORY %
                 (self.N, num_levels, max_num_gsims, max_num_grp_ids,
                  humansize(pmapbytes)))
    return zd