# Module-level imports implied by the code below; project-specific helpers
# (JobState, WRFCloner, WPSDomainConf, Postprocessor, Metgrid, Real, WRF, Ungrib,
# send_email, utc_to_esmf, etc.) are assumed to come from the surrounding package.
import glob
import json
import logging
import os
import os.path as osp
import re
import sys
import time
import traceback
from multiprocessing import Process, Queue
from shutil import move

import f90nml


def execute(args):
    """
    Executes a weather/fire simulation.

    :param args: a dictionary with the following keys:
    :param grid_code: the (unique) code of the grid that is used
    :param sys_install_path: system installation directory
    :param start_utc: start time of simulation in UTC
    :param end_utc: end time of simulation in UTC
    :param workspace_path: workspace directory
    :param wps_install_path: installation directory of WPS that will be used
    :param wrf_install_path: installation directory of WRF that will be used
    :param grib_source: a string identifying a valid GRIB2 source
    :param wps_namelist_path: the path to the namelist.wps file that will be used as template
    :param wrf_namelist_path: the path to the namelist.input file that will be used as template
    :param fire_namelist_path: the path to the namelist.fire file that will be used as template
    :param wps_geog_path: the path to the geogrid data directory providing terrain/fuel data
    :param email_notification: dictionary containing keys address and events indicating when a mail should be fired off
    """
    logging.basicConfig(level=logging.INFO)

    # initialize the job state from the arguments
    js = JobState(args)

    logging.info("job %s starting [%d hours to forecast]." % (js.job_id, js.fc_hrs))
    send_email(js, 'start', 'Job %s started.' % js.job_id)

    # read in all namelists
    js.wps_nml = f90nml.read(args['wps_namelist_path'])
    js.wrf_nml = f90nml.read(args['wrf_namelist_path'])
    js.fire_nml = f90nml.read(args['fire_namelist_path'])
    js.ems_nml = None
    if 'emissions_namelist_path' in args:
        js.ems_nml = f90nml.read(args['emissions_namelist_path'])

    # parse and set up the domain configuration
    js.domain_conf = WPSDomainConf(js.domains)

    num_doms = len(js.domain_conf)
    js.wps_nml['share']['start_date'] = [utc_to_esmf(js.start_utc)] * num_doms
    js.wps_nml['share']['end_date'] = [utc_to_esmf(js.end_utc)] * num_doms
    js.wps_nml['share']['interval_seconds'] = 3600

    logging.info("number of domains defined is %d." % num_doms)

    # build directories in workspace
    js.wps_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wps'))
    js.wrf_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wrf'))

    # step 1: clone WPS and WRF directories
    logging.info("cloning WPS into %s" % js.wps_dir)
    cln = WRFCloner(args)
    cln.clone_wps(js.wps_dir, js.grib_source.vtables(), [])

    # step 2: process domain information and patch namelist for geogrid
    js.wps_nml['geogrid']['geog_data_path'] = args['wps_geog_path']
    js.domain_conf.prepare_for_geogrid(js.wps_nml, js.wrf_nml, js.wrfxpy_dir, js.wps_dir)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    # do steps 2 & 3 & 4 in parallel (two execution streams)
    #  -> GEOGRID ->
    #  -> GRIB2 download -> UNGRIB ->
    proc_q = Queue()
    geogrid_proc = Process(target=run_geogrid, args=(js, proc_q))
    grib_proc = Process(target=retrieve_gribs_and_run_ungrib, args=(js, proc_q))

    geogrid_proc.start()
    grib_proc.start()

    # wait until both tasks are done
    geogrid_proc.join()
    grib_proc.join()

    if proc_q.get() != 'SUCCESS':
        return

    if proc_q.get() != 'SUCCESS':
        return

    proc_q.close()

    # step 5: execute metgrid after ensuring all grids will be processed
    js.domain_conf.prepare_for_metgrid(js.wps_nml)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    logging.info("running METGRID")
    Metgrid(js.wps_dir).execute().check_output()

    send_email(js, 'metgrid', 'Job %s - metgrid complete.' % js.job_id)
    logging.info("cloning WRF into %s" % js.wrf_dir)

    # step 6: clone wrf directory, symlink all met_em* files
    cln.clone_wrf(js.wrf_dir, [])
    symlink_matching_files(js.wrf_dir, js.wps_dir, "met_em*")

    logging.info("running REAL")

    # step 7: patch input namelist, fire namelist, emissions namelist (if required)
    # and execute real.exe
    time_ctrl = update_time_control(js.start_utc, js.end_utc, num_doms)
    js.wrf_nml['time_control'].update(time_ctrl)
    update_namelist(js.wrf_nml, js.grib_source.namelist_keys())
    if 'ignitions' in args:
        update_namelist(js.wrf_nml, render_ignitions(js, num_doms))

    # if we have an emissions namelist, automatically turn on the tracers
    if js.ems_nml is not None:
        f90nml.write(js.ems_nml, osp.join(js.wrf_dir, 'namelist.fire_emissions'), force=True)
        js.wrf_nml['dynamics']['tracer_opt'] = [2] * num_doms

    f90nml.write(js.wrf_nml, osp.join(js.wrf_dir, 'namelist.input'), force=True)
    f90nml.write(js.fire_nml, osp.join(js.wrf_dir, 'namelist.fire'), force=True)

    # try to run Real twice as it sometimes fails the first time
    # it's not clear why this error happens
    try:
        Real(js.wrf_dir).execute().check_output()
    except Exception as e:
        logging.error('Real step failed with exception %s, retrying ...' % str(e))
        Real(js.wrf_dir).execute().check_output()

    # step 7b: if requested, do fuel moisture DA
    if js.fmda is not None:
        logging.info('running fuel moisture data assimilation')
        for dom in js.fmda.domains:
            assimilate_fm10_observations(osp.join(js.wrf_dir, 'wrfinput_d%02d' % dom), None, js.fmda.token)

    # step 8: execute wrf.exe on parallel backend
    logging.info('submitting WRF job')
    send_email(js, 'wrf_submit', 'Job %s - wrf job submitted.' % js.job_id)

    js.task_id = "sim-" + js.grid_code + "-" + utc_to_esmf(js.start_utc)[:10]
    WRF(js.wrf_dir, js.qsys).submit(js.task_id, js.num_nodes, js.ppn, js.wall_time_hrs)

    send_email(js, 'wrf_exec', 'Job %s - wrf job starting now with id %s.' % (js.job_id, js.task_id))
    logging.info("WRF job submitted with id %s, waiting for rsl.error.0000" % js.task_id)

    # step 9: wait for appearance of rsl.error.0000 and open it
    wrf_out = None
    while wrf_out is None:
        try:
            wrf_out = open(osp.join(js.wrf_dir, 'rsl.error.0000'))
            break
        except IOError:
            logging.info('forecast: waiting 5 seconds for rsl.error.0000 file')
            time.sleep(5)

    logging.info('Detected rsl.error.0000')

    # step 10: track log output and check for history writes from WRF
    pp = None
    already_sent_files, max_pp_dom = [], -1
    if js.postproc is not None:
        js.pp_dir = osp.join(js.workspace_path, js.job_id, "products")
        make_dir(js.pp_dir)
        pp = Postprocessor(js.pp_dir, 'wfc-' + js.grid_code)
        max_pp_dom = max([int(x) for x in filter(lambda x: len(x) == 1, js.postproc)])

    while True:
        line = wrf_out.readline().strip()
        if not line:
            time.sleep(0.2)
            continue

        if "SUCCESS COMPLETE WRF" in line:
            send_email(js, 'complete', 'Job %s - wrf job complete SUCCESS.' % js.job_id)
            logging.info("WRF completion detected.")
            break

        if "Timing for Writing wrfout" in line:
            esmf_time, domain_str = re.match(r'.*wrfout_d.._([0-9_\-:]{19}) for domain\ +(\d+):', line).groups()
            dom_id = int(domain_str)
            logging.info("Detected history write for domain %d for time %s." % (dom_id, esmf_time))
            if js.postproc is not None and str(dom_id) in js.postproc:
                var_list = [str(x) for x in js.postproc[str(dom_id)]]
                logging.info("Executing postproc instructions for vars %s for domain %d." % (str(var_list), dom_id))
                wrfout_path = find_fresh_wrfout(js.wrf_dir, dom_id)
                try:
                    pp.process_vars(wrfout_path, dom_id, esmf_time, var_list)
                except Exception as e:
                    logging.warning('Failed to postprocess for time %s with error %s.' % (esmf_time, str(e)))

            # if this is the last processed domain for this timestamp in incremental mode, upload to server
            if dom_id == max_pp_dom and js.postproc.get('shuttle', None) == 'incremental':
                desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
                sent_files_1 = send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc, already_sent_files)
                logging.info('sent %d files to visualization server.' % len(sent_files_1))
                already_sent_files = list(filter(lambda x: not x.endswith('json'), already_sent_files + sent_files_1))

    # if we are to send out the postprocessed files after completion, this is the time
    if js.postproc is not None and js.postproc.get('shuttle', None) == 'on_completion':
        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
        send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc)
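
# --- Illustration (not part of the pipeline) ---------------------------------
# A minimal, runnable sketch of the history-write detection in step 10 above:
# the regex pulls the ESMF timestamp and the domain id out of a WRF
# "Timing for Writing wrfout" line in rsl.error.0000. The sample line below is
# fabricated for illustration, not captured from a real run.
sample = "Timing for Writing wrfout_d02_2013-06-20_14:00:00 for domain  2:    0.17 elapsed seconds"
demo_time, demo_dom = re.match(r'.*wrfout_d.._([0-9_\-:]{19}) for domain\ +(\d+):', sample).groups()
assert demo_time == '2013-06-20_14:00:00' and int(demo_dom) == 2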
def execute(args, job_args):
    """
    Executes a weather/fire simulation.

    :param args: a dictionary with all parameters needed to start the simulation
    :param job_args: the original json request for the forecast

    Keys in args:
    :param grid_code: the (unique) code of the grid that is used
    :param sys_install_path: system installation directory
    :param start_utc: start time of simulation in UTC
    :param end_utc: end time of simulation in UTC
    :param workspace_path: workspace directory
    :param wps_install_path: installation directory of WPS that will be used
    :param wrf_install_path: installation directory of WRF that will be used
    :param grib_source: a string identifying a valid GRIB2 source
    :param wps_namelist_path: the path to the namelist.wps file that will be used as template
    :param wrf_namelist_path: the path to the namelist.input file that will be used as template
    :param fire_namelist_path: the path to the namelist.fire file that will be used as template
    :param wps_geog_path: the path to the geogrid data directory providing terrain/fuel data
    :param email_notification: dictionary containing keys address and events indicating when a mail should be fired off
    """
    logging.info('step 0: initialize the job state from the arguments')
    js = JobState(args)

    jobdir = osp.abspath(osp.join(js.workspace_path, js.job_id))
    make_clean_dir(jobdir)

    json.dump(job_args, open(osp.join(jobdir, 'input.json'), 'w'), indent=4, separators=(',', ': '))
    jsub = make_job_file(js)
    json.dump(jsub, open(jsub.jobfile, 'w'), indent=4, separators=(',', ': '))

    logging.info("job %s starting [%d hours to forecast]." % (js.job_id, js.fc_hrs))
    sys.stdout.flush()
    send_email(js, 'start', 'Job %s started.' % js.job_id)

    # read in all namelists
    js.wps_nml = read_namelist(js.args['wps_namelist_path'])
    js.wrf_nml = read_namelist(js.args['wrf_namelist_path'])
    js.fire_nml = read_namelist(js.args['fire_namelist_path'])
    js.ems_nml = None
    if 'emissions_namelist_path' in js.args:
        js.ems_nml = read_namelist(js.args['emissions_namelist_path'])

    # parse and set up the domain configuration
    js.domain_conf = WPSDomainConf(js.domains)
    js.num_doms = len(js.domain_conf)

    js.wps_nml['share']['interval_seconds'] = js.grib_source[0].interval_seconds

    logging.info("number of domains defined is %d." % js.num_doms)

    # build directories in workspace
    js.wps_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wps'))
    js.wrf_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wrf'))

    #check_obj(args,'args')
    #check_obj(js,'Initial job state')

    logging.info("step 1: clone WPS and WRF directories")
    logging.info("cloning WPS into %s" % js.wps_dir)
    cln = WRFCloner(js.args)
    cln.clone_wps(js.wps_dir, [])
    js.grib_source[0].clone_vtables(js.wps_dir)

    logging.info("step 2: process domain information and patch namelist for geogrid")
    js.wps_nml['share']['start_date'] = [utc_to_esmf(js.start_utc)] * js.num_doms
    js.wps_nml['share']['end_date'] = [utc_to_esmf(js.end_utc)] * js.num_doms
    js.wps_nml['geogrid']['geog_data_path'] = js.args['wps_geog_path']
    js.domain_conf.prepare_for_geogrid(js.wps_nml, js.wrf_nml, js.wrfxpy_dir, js.wps_dir)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    # do steps 2 & 3 & 4 in parallel (two execution streams)
    #  -> GEOGRID ->
    #  -> GRIB2 download -> UNGRIB ->
    proc_q = Queue()
    geogrid_proc = Process(target=run_geogrid, args=(js, proc_q))
    grib_proc = {}
    for grib_source in js.grib_source:
        grib_proc[grib_source.id] = Process(target=retrieve_gribs_and_run_ungrib,
                                            args=(js, grib_source, proc_q))

    logging.info('starting GEOGRID and GRIB2/UNGRIB')
    if js.ungrib_only:
        logging.info('ungrib_only set, skipping GEOGRID, will exit after UNGRIB')
    else:
        geogrid_proc.start()
    for grib_source in js.grib_source:
        grib_proc[grib_source.id].start()

    # wait until all tasks are done
    logging.info('waiting until all tasks are done')
    for grib_source in js.grib_source:
        grib_proc[grib_source.id].join()

    if js.ungrib_only:
        return
    else:
        geogrid_proc.join()

    # one result per GRIB source, plus one for GEOGRID
    for grib_source in js.grib_source:
        if proc_q.get() != 'SUCCESS':
            return
    if proc_q.get() != 'SUCCESS':
        return

    proc_q.close()

    logging.info("step 5: execute metgrid after ensuring all grids will be processed")
    update_namelist(js.wps_nml, js.grib_source[0].namelist_wps_keys())
    js.domain_conf.prepare_for_metgrid(js.wps_nml)
    logging.info("namelist.wps for METGRID: %s" % json.dumps(js.wps_nml, indent=4, separators=(',', ': ')))
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    logging.info("running METGRID")
    Metgrid(js.wps_dir).execute().check_output()

    send_email(js, 'metgrid', 'Job %s - metgrid complete.' % js.job_id)
    logging.info("METGRID complete")

    logging.info("cloning WRF into %s" % js.wrf_dir)
    logging.info("step 6: clone wrf directory, symlink all met_em* files, make namelists")
    cln.clone_wrf(js.wrf_dir, [])
    symlink_matching_files(js.wrf_dir, js.wps_dir, "met_em*")

    time_ctrl = update_time_control(js.start_utc, js.end_utc, js.num_doms)
    js.wrf_nml['time_control'].update(time_ctrl)
    js.wrf_nml['time_control']['interval_seconds'] = js.grib_source[0].interval_seconds
    update_namelist(js.wrf_nml, js.grib_source[0].namelist_keys())
    if 'ignitions' in js.args:
        update_namelist(js.wrf_nml, render_ignitions(js, js.num_doms))

    # if we have an emissions namelist, automatically turn on the tracers
    if js.ems_nml is not None:
        logging.debug('namelist.fire_emissions given, turning on tracers')
        f90nml.write(js.ems_nml, osp.join(js.wrf_dir, 'namelist.fire_emissions'), force=True)
        js.wrf_nml['dynamics']['tracer_opt'] = [2] * js.num_doms

    f90nml.write(js.wrf_nml, osp.join(js.wrf_dir, 'namelist.input'), force=True)
    f90nml.write(js.fire_nml, osp.join(js.wrf_dir, 'namelist.fire'), force=True)

    # step 7: execute real.exe
    logging.info("running REAL")

    # try to run Real twice as it sometimes fails the first time
    # it's not clear why this error happens
    try:
        Real(js.wrf_dir).execute().check_output()
    except Exception as e:
        logging.error('Real step failed with exception %s, retrying ...' % str(e))
        Real(js.wrf_dir).execute().check_output()

    logging.info('step 7b: if requested, do fuel moisture DA')
    logging.info('fmda = %s' % js.fmda)
    if js.fmda is not None:
        logging.info('running fuel moisture data assimilation')
        for dom in js.fmda.domains:
            logging.info('assimilate_fm10_observations for domain %s' % dom)
            assimilate_fm10_observations(osp.join(js.wrf_dir, 'wrfinput_d%02d' % int(dom)), None, js.fmda.token)

    # step 8: execute wrf.exe on parallel backend
    logging.info('submitting WRF job')
    send_email(js, 'wrf_submit', 'Job %s - wrf job submitted.' % js.job_id)

    js.task_id = "sim-" + js.grid_code + "-" + utc_to_esmf(js.start_utc)[:10]
    jsub.job_num = WRF(js.wrf_dir, js.qsys).submit(js.task_id, js.num_nodes, js.ppn, js.wall_time_hrs)

    send_email(js, 'wrf_exec', 'Job %s - wrf job starting now with id %s.' % (js.job_id, js.task_id))
    logging.info("WRF job %s submitted with id %s, waiting for rsl.error.0000" % (jsub.job_num, js.task_id))

    jobfile = osp.abspath(osp.join(js.workspace_path, js.job_id, 'job.json'))
    json.dump(jsub, open(jobfile, 'w'), indent=4, separators=(',', ': '))

    process_output(js.job_id)
def execute(args, job_args):
    """
    Executes a weather/fire simulation.

    :param args: a dictionary with all parameters needed to start the simulation
    :param job_args: the original json request for the forecast

    Keys in args:
    :param grid_code: the (unique) code of the grid that is used
    :param sys_install_path: system installation directory
    :param start_utc: start time of simulation in UTC
    :param end_utc: end time of simulation in UTC
    :param workspace_path: workspace directory
    :param wps_install_path: installation directory of WPS that will be used
    :param wrf_install_path: installation directory of WRF that will be used
    :param grib_source: a string identifying a valid GRIB2 source
    :param wps_namelist_path: the path to the namelist.wps file that will be used as template
    :param wrf_namelist_path: the path to the namelist.input file that will be used as template
    :param fire_namelist_path: the path to the namelist.fire file that will be used as template
    :param wps_geog_path: the path to the geogrid data directory providing terrain/fuel data
    :param email_notification: dictionary containing keys address and events indicating when a mail should be fired off
    """
    # step 0: initialize the job state from the arguments
    js = JobState(args)

    jobdir = osp.abspath(osp.join(js.workspace_path, js.job_id))
    make_clean_dir(jobdir)

    json.dump(job_args, open(osp.join(jobdir, 'input.json'), 'w'), indent=4, separators=(',', ': '))
    jsub = make_job_file(js)
    json.dump(jsub, open(jsub.jobfile, 'w'), indent=4, separators=(',', ': '))

    logging.info("job %s starting [%d hours to forecast]." % (js.job_id, js.fc_hrs))
    sys.stdout.flush()
    send_email(js, 'start', 'Job %s started.' % js.job_id)

    # read in all namelists
    js.wps_nml = f90nml.read(js.args['wps_namelist_path'])
    js.wrf_nml = f90nml.read(js.args['wrf_namelist_path'])
    js.fire_nml = f90nml.read(js.args['fire_namelist_path'])
    js.ems_nml = None
    if 'emissions_namelist_path' in js.args:
        js.ems_nml = f90nml.read(js.args['emissions_namelist_path'])

    # parse and set up the domain configuration
    js.domain_conf = WPSDomainConf(js.domains)

    num_doms = len(js.domain_conf)
    js.wps_nml['share']['start_date'] = [utc_to_esmf(js.start_utc)] * num_doms
    js.wps_nml['share']['end_date'] = [utc_to_esmf(js.end_utc)] * num_doms
    js.wps_nml['share']['interval_seconds'] = 3600

    logging.info("number of domains defined is %d." % num_doms)

    # build directories in workspace
    js.wps_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wps'))
    js.wrf_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wrf'))

    #check_obj(args,'args')
    #check_obj(js,'Initial job state')

    # step 1: clone WPS and WRF directories
    logging.info("cloning WPS into %s" % js.wps_dir)
    cln = WRFCloner(js.args)
    cln.clone_wps(js.wps_dir, js.grib_source.vtables(), [])

    # step 2: process domain information and patch namelist for geogrid
    js.wps_nml['geogrid']['geog_data_path'] = js.args['wps_geog_path']
    js.domain_conf.prepare_for_geogrid(js.wps_nml, js.wrf_nml, js.wrfxpy_dir, js.wps_dir)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    # do steps 2 & 3 & 4 in parallel (two execution streams)
    #  -> GEOGRID ->
    #  -> GRIB2 download -> UNGRIB ->
    proc_q = Queue()
    geogrid_proc = Process(target=run_geogrid, args=(js, proc_q))
    grib_proc = Process(target=retrieve_gribs_and_run_ungrib, args=(js, proc_q))

    logging.info('starting GEOGRID and GRIB2/UNGRIB')
    geogrid_proc.start()
    grib_proc.start()

    # wait until both tasks are done
    logging.info('waiting until both tasks are done')
    grib_proc.join()
    geogrid_proc.join()

    if proc_q.get() != 'SUCCESS':
        return

    if proc_q.get() != 'SUCCESS':
        return

    proc_q.close()

    # step 5: execute metgrid after ensuring all grids will be processed
    js.domain_conf.prepare_for_metgrid(js.wps_nml)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    logging.info("running METGRID")
    Metgrid(js.wps_dir).execute().check_output()

    send_email(js, 'metgrid', 'Job %s - metgrid complete.' % js.job_id)
    logging.info("cloning WRF into %s" % js.wrf_dir)

    # step 6: clone wrf directory, symlink all met_em* files, make namelists
    cln.clone_wrf(js.wrf_dir, [])
    symlink_matching_files(js.wrf_dir, js.wps_dir, "met_em*")

    time_ctrl = update_time_control(js.start_utc, js.end_utc, num_doms)
    js.wrf_nml['time_control'].update(time_ctrl)
    update_namelist(js.wrf_nml, js.grib_source.namelist_keys())
    if 'ignitions' in js.args:
        update_namelist(js.wrf_nml, render_ignitions(js, num_doms))

    # if we have an emissions namelist, automatically turn on the tracers
    if js.ems_nml is not None:
        logging.debug('namelist.fire_emissions given, turning on tracers')
        f90nml.write(js.ems_nml, osp.join(js.wrf_dir, 'namelist.fire_emissions'), force=True)
        js.wrf_nml['dynamics']['tracer_opt'] = [2] * num_doms

    f90nml.write(js.wrf_nml, osp.join(js.wrf_dir, 'namelist.input'), force=True)
    f90nml.write(js.fire_nml, osp.join(js.wrf_dir, 'namelist.fire'), force=True)

    # step 7: execute real.exe
    logging.info("running REAL")

    # try to run Real twice as it sometimes fails the first time
    # it's not clear why this error happens
    try:
        Real(js.wrf_dir).execute().check_output()
    except Exception as e:
        logging.error('Real step failed with exception %s, retrying ...' % str(e))
        Real(js.wrf_dir).execute().check_output()

    # step 7b: if requested, do fuel moisture DA
    if js.fmda is not None:
        logging.info('running fuel moisture data assimilation')
        for dom in js.fmda.domains:
            assimilate_fm10_observations(osp.join(js.wrf_dir, 'wrfinput_d%02d' % dom), None, js.fmda.token)

    # step 8: execute wrf.exe on parallel backend
    logging.info('submitting WRF job')
    send_email(js, 'wrf_submit', 'Job %s - wrf job submitted.' % js.job_id)

    js.task_id = "sim-" + js.grid_code + "-" + utc_to_esmf(js.start_utc)[:10]
    jsub.job_num = WRF(js.wrf_dir, js.qsys).submit(js.task_id, js.num_nodes, js.ppn, js.wall_time_hrs)

    send_email(js, 'wrf_exec', 'Job %s - wrf job starting now with id %s.' % (js.job_id, js.task_id))
    logging.info("WRF job %s submitted with id %s, waiting for rsl.error.0000" % (jsub.job_num, js.task_id))

    jobfile = osp.abspath(osp.join(js.workspace_path, js.job_id, 'job.json'))
    json.dump(jsub, open(jobfile, 'w'), indent=4, separators=(',', ': '))

    process_output(js.job_id)
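
# --- Illustration (not part of the pipeline) ---------------------------------
# A hypothetical invocation of execute(args, job_args) above. Every key comes
# from the docstring; all values and file paths are placeholders, and JobState
# may expect further keys (e.g. postprocessing or queue-system settings) that
# are not shown here.
if __name__ == '__main__':
    from datetime import datetime, timedelta

    job_args = json.load(open('forecast_request.json'))  # the original json request (hypothetical file)
    start = datetime(2013, 6, 20, 0, 0, 0)
    args = {
        'grid_code': 'demo-grid',                        # unique grid code (placeholder)
        'sys_install_path': '/opt/wrfxpy',
        'start_utc': start,
        'end_utc': start + timedelta(hours=24),
        'workspace_path': 'wksp',
        'wps_install_path': '/opt/WPS',
        'wrf_install_path': '/opt/WRF',
        'grib_source': 'HRRR',                           # a valid GRIB2 source identifier
        'wps_namelist_path': 'etc/nlists/default.wps',
        'wrf_namelist_path': 'etc/nlists/default.input',
        'fire_namelist_path': 'etc/nlists/default.fire',
        'wps_geog_path': '/opt/WPS-GEOG',
        'email_notification': {'address': 'user@example.com', 'events': ['start', 'complete']},
    }
    execute(args, job_args)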
def retrieve_gribs_and_run_ungrib(js, grib_source, q):
    """
    This function retrieves required GRIB files and runs ungrib.

    It posts either 'SUCCESS' or 'FAILURE' on the queue on completion.

    :param js: the JobState object containing the forecast configuration
    :param grib_source: the GribSource object containing ungrib configuration
    :param q: the multiprocessing Queue into which we will send either 'SUCCESS' or 'FAILURE'
    """
    wps_dir = osp.abspath(js.wps_dir)
    grib_dir = osp.join(wps_dir, grib_source.id)
    make_clean_dir(grib_dir)
    wps_nml = js.wps_nml
    try:
        logging.info("retrieving GRIB files from %s" % grib_source.id)
        download_whole_cycle = js.get('download_whole_cycle', False)
        manifest = grib_source.retrieve_gribs(js.start_utc, js.end_utc, js.ref_utc,
                                              js.cycle_start_utc, download_whole_cycle)
        # logging.info('manifest: ' + str(manifest))

        cache_colmet = len(manifest) > 1
        have_all_colmet = False
        if cache_colmet:
            have_all_colmet = len(manifest.colmet_missing) == 0
            colmet_dir = osp.join(grib_source.cache_dir, manifest.colmet_prefix)

        logging.info('cache colmet %s, have all colmet %s' % (cache_colmet, have_all_colmet))

        if not have_all_colmet:
            # this branch is also taken if we do not cache
            grib_source.symlink_gribs(manifest.grib_files, grib_dir)

            send_email(js, 'grib2', 'Job %s - %d GRIB2 files downloaded.' % (js.job_id, len(manifest)))
            logging.info("running UNGRIB for %s" % grib_source.id)
            logging.info("step 4: patch namelist for ungrib and execute ungrib on %s files" % grib_source.id)

            update_namelist(wps_nml, grib_source.namelist_wps_keys())
            if cache_colmet:
                wps_nml['share']['start_date'] = [utc_to_esmf(manifest.colmet_files_utc[0])] * js.num_doms
                wps_nml['share']['end_date'] = [utc_to_esmf(manifest.colmet_files_utc[-1])] * js.num_doms

            # logging.info("namelist.wps for UNGRIB: %s" % json.dumps(wps_nml, indent=4, separators=(',', ': ')))
            f90nml.write(wps_nml, osp.join(grib_dir, 'namelist.wps'), force=True)

            grib_source.clone_vtables(grib_dir)
            symlink_unless_exists(osp.join(wps_dir, 'ungrib.exe'), osp.join(grib_dir, 'ungrib.exe'))

            print(grib_dir + ':')
            os.system('ls -l %s' % grib_dir)

            Ungrib(grib_dir).execute().check_output()

            print(grib_dir + ':')
            os.system('ls -l %s' % grib_dir)

            if cache_colmet:
                # move output to cache directory
                make_dir(colmet_dir)
                for f in manifest.colmet_files:
                    move(osp.join(grib_dir, f), osp.join(colmet_dir, f))

        # now all colmet files should be in the cache
        if cache_colmet:
            for f in manifest.colmet_files:
                symlink_unless_exists(osp.join(colmet_dir, f), osp.join(wps_dir, f))
        else:
            # move output
            for f in glob.glob(osp.join(grib_dir, grib_source.prefix() + '*')):
                move(f, wps_dir)

        send_email(js, 'ungrib', 'Job %s - ungrib complete.' % js.job_id)
        logging.info('UNGRIB complete for %s' % grib_source.id)
        q.put('SUCCESS')

    except Exception as e:
        logging.error('GRIB2/UNGRIB step failed with exception %s' % repr(e))
        traceback.print_exc()
        q.put('FAILURE')
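
# --- Illustration (not part of the pipeline) ---------------------------------
# A minimal, runnable f90nml round-trip mirroring the namelist patching above:
# parse (here from a string, assuming f90nml.reads is available as in recent
# f90nml releases), mutate the dict-like Namelist, and write it back with
# force=True to overwrite an existing file. The file name is illustrative.
if __name__ == '__main__':
    demo_nml = f90nml.reads("&share\n start_date = '2013-06-20_00:00:00'\n/\n")
    demo_nml['share']['interval_seconds'] = 3600
    f90nml.write(demo_nml, 'namelist.wps.demo', force=True)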