class TestIntegration(unittest.TestCase):
    """Integration tests for remote HTCondor job submission.

    ``test_submit`` talks to a live scheduler on ``localhost`` over SSH, so it
    performs real network and filesystem I/O and cleans up after itself.
    """

    # Scratch state used by the ``output``/``assert_args`` helpers below.
    expected = None
    actual = None
    msg = None
    # Directory containing this test module; fixtures live in 'test_files/'.
    base_dir = os.path.join(os.path.dirname(__file__))

    @property
    def output(self):
        """Formatted failure message combining msg, expected, and actual."""
        return '%s\nExpected: %s\nActual: %s\n' % (self.msg, self.expected, self.actual)

    @property
    def assert_args(self):
        """Convenience (expected, actual, message) triple for assert helpers."""
        return (self.expected, self.actual, self.output)

    def setUp(self):
        self.job_name = 'job_name'
        self.job = Job(self.job_name)

    def tearDown(self):
        pass

    def test_submit(self):
        """Submit a job to localhost via SSH and verify output files round-trip."""
        working_dir = os.path.join(self.base_dir, 'test_files', 'working_dir')
        self.job = Job('remote_test',
                       Templates.vanilla_transfer_files,
                       host='localhost',
                       username=os.environ['USER'],
                       private_key='~/.ssh/id_rsa',
                       remote_input_files=['../copy_test.py', 'input.txt'],
                       transfer_input_files='../input.txt',
                       executable=os.path.join(self.base_dir, 'test_files', 'copy_test.py'),
                       working_directory=working_dir)
        remote_base_path = os.path.expanduser('~/' + self.job._remote_id)
        if os.path.exists(remote_base_path):
            # BUGFIX: the original used a bare ``raise`` with no active
            # exception, which itself fails with an opaque error. Raise a
            # descriptive one so the violated precondition (stale remote
            # directory from a previous run) is obvious.
            raise RuntimeError(
                'Remote job directory already exists: %s' % remote_base_path)
        self.job.submit()
        self.assertTrue(os.path.exists(remote_base_path))
        self.job.wait()
        self.job.sync_remote_output()
        local_output = os.path.join(working_dir, self.job.name)
        self.assertTrue(os.path.exists(local_output))
        output = os.path.join(local_output, 'output.txt')
        self.assertTrue(os.path.exists(output))
        # Clean up both the remote working copy and the synced local output.
        shutil.rmtree(remote_base_path)
        shutil.rmtree(local_output)
def run_ecmwf_rapid_process(rapid_executable_location,  # path to RAPID executable
                            rapid_io_files_location,  # path to RAPID input/output directory
                            ecmwf_forecast_location,  # path to ECMWF forecasts
                            subprocess_log_directory,  # path to store HTCondor/multiprocess logs
                            main_log_directory,  # path to store main logs
                            data_store_url="",  # CKAN API url
                            data_store_api_key="",  # CKAN API Key
                            data_store_owner_org="",  # CKAN owner organization
                            app_instance_id="",  # Streamflow Prediction tool instance ID
                            sync_rapid_input_with_ckan=False,  # match Streamflow Prediction tool RAPID input
                            download_ecmwf=True,  # download recent ECMWF forecast before running
                            date_string=None,  # string of date of interest (YYYYMMDD); defaults to today (UTC)
                            ftp_host="",  # ECMWF ftp site path
                            ftp_login="",  # ECMWF ftp login name
                            ftp_passwd="",  # ECMWF ftp password
                            ftp_directory="",  # ECMWF ftp directory
                            upload_output_to_ckan=False,  # upload data to CKAN and remove local copy
                            delete_output_when_done=False,  # delete all output data from this code
                            initialize_flows=False,  # use forecast to initialize next run
                            era_interim_data_location="",  # path to ERA Interim return period data
                            create_warning_points=False,  # generate warning points for Streamflow Prediction Tool
                            autoroute_executable_location="",  # location of AutoRoute executable
                            autoroute_io_files_location="",  # path to AutoRoute input/output directory
                            geoserver_url='',  # url to API endpoint ending in geoserver/rest
                            geoserver_username='',  # username for geoserver
                            geoserver_password='',  # password for geoserver
                            mp_mode='htcondor',  # valid options are htcondor and multiprocess
                            mp_execute_directory='',  # required if using multiprocess mode
                            ):
    """
    This is the main ECMWF-RAPID process (legacy Python 2 entry point).

    Locates (or downloads via FTP) ECMWF runoff forecast folders for
    ``date_string``, optionally seeds initial flows from ERA Interim
    historical Qout files, then for every valid watershed input directory
    fans out one downscaling/RAPID job per ensemble forecast file — either
    as HTCondor jobs or via a multiprocessing pool — and finally uploads
    results to CKAN, generates warning points, and writes initialization
    files for the next run.

    Raises:
        Exception: if ``mp_mode`` is 'multiprocess' without a valid
            ``mp_execute_directory``, or if ``mp_mode`` is unrecognized.
    """
    time_begin_all = datetime.datetime.utcnow()
    if date_string == None:
        date_string = time_begin_all.strftime('%Y%m%d')

    if mp_mode == "multiprocess":
        # multiprocess workers stage files in mp_execute_directory, so it
        # must exist before any jobs are created
        if not mp_execute_directory or not os.path.exists(mp_execute_directory):
            raise Exception("If mode is multiprocess, mp_execute_directory is required ...")

    #date_string = datetime.datetime(2016,2,12).strftime('%Y%m%d')
    local_scripts_location = os.path.dirname(os.path.realpath(__file__))

    if sync_rapid_input_with_ckan and app_instance_id and data_store_url and data_store_api_key:
        #sync with data store
        ri_manager = RAPIDInputDatasetManager(data_store_url,
                                              data_store_api_key,
                                              'ecmwf',
                                              app_instance_id)
        ri_manager.sync_dataset(os.path.join(rapid_io_files_location, 'input'))

    #clean up old log files
    clean_logs(subprocess_log_directory, main_log_directory)

    #get list of correctly formatted rapid input directories in rapid directory
    rapid_input_directories = get_valid_watershed_list(os.path.join(rapid_io_files_location, "input"))

    if download_ecmwf and ftp_host:
        #download all files for today
        ecmwf_folders = sorted(download_all_ftp(ecmwf_forecast_location,
                                                'Runoff.%s*.netcdf.tar*' % date_string,
                                                ftp_host,
                                                ftp_login,
                                                ftp_passwd,
                                                ftp_directory))
    else:
        # use forecasts already present on disk for this date
        ecmwf_folders = sorted(glob(os.path.join(ecmwf_forecast_location,
                                                 'Runoff.' + date_string + '*.netcdf')))

    data_manager = None
    if upload_output_to_ckan and data_store_url and data_store_api_key:
        #init data manager for CKAN
        data_manager = ECMWFRAPIDDatasetManager(data_store_url,
                                                data_store_api_key,
                                                data_store_owner_org)

    #ADD SEASONAL INITIALIZATION WHERE APPLICABLE
    if initialize_flows:
        # NOTE(review): assumes ecmwf_folders is non-empty here; an empty
        # list would raise IndexError — confirm upstream guarantees.
        initial_forecast_date_timestep = get_date_timestep_from_forecast_folder(ecmwf_folders[0])
        seasonal_init_job_list = []
        for rapid_input_directory in rapid_input_directories:
            seasonal_master_watershed_input_directory = os.path.join(rapid_io_files_location, "input",
                                                                     rapid_input_directory)
            #add seasonal initialization if no initialization file and historical Qout file exists
            if era_interim_data_location and os.path.exists(era_interim_data_location):
                era_interim_watershed_directory = os.path.join(era_interim_data_location, rapid_input_directory)
                if os.path.exists(era_interim_watershed_directory):
                    historical_qout_file = glob(os.path.join(era_interim_watershed_directory, "Qout*.nc"))
                    if historical_qout_file:
                        seasonal_init_job_list.append((historical_qout_file[0],
                                                       seasonal_master_watershed_input_directory,
                                                       initial_forecast_date_timestep))
        if seasonal_init_job_list:
            #use multiprocessing instead of htcondor due to potential for huge file sizes
            if len(seasonal_init_job_list) > 1:
                seasonal_pool = mp_Pool()
                seasonal_pool.imap(compute_seasonal_initial_rapid_flows_multicore_worker,
                                   seasonal_init_job_list,
                                   chunksize=1)
                seasonal_pool.close()
                seasonal_pool.join()
            else:
                compute_seasonal_initial_rapid_flows_multicore_worker(seasonal_init_job_list[0])

    #prepare ECMWF files
    master_job_info_list = []
    for ecmwf_folder in ecmwf_folders:
        ecmwf_forecasts = glob(os.path.join(ecmwf_folder, 'full_*.runoff.netcdf')) + \
                          glob(os.path.join(ecmwf_folder, '*.52.205.*.runoff.netcdf'))
        #look for new version of forecasts
        if not ecmwf_forecasts:
            ecmwf_forecasts = glob(os.path.join(ecmwf_folder, '*.runoff.nc'))
        #make the largest files first
        ecmwf_forecasts.sort(key=os.path.getsize, reverse=True)

        forecast_date_timestep = get_date_timestep_from_forecast_folder(ecmwf_folder)
        print forecast_date_timestep
        #submit jobs to downsize ecmwf files to watershed
        iteration = 0
        rapid_watershed_jobs = {}
        for rapid_input_directory in rapid_input_directories:
            #keep list of jobs per watershed: 'jobs' holds CJob objects
            #(htcondor) or argument tuples (multiprocess); 'jobs_info' holds
            #metadata used later for uploads
            rapid_watershed_jobs[rapid_input_directory] = {
                'jobs': [],
                'jobs_info': []
            }
            print "Running forecasts for:", rapid_input_directory, os.path.basename(ecmwf_folder)
            watershed, subbasin = get_watershed_subbasin_from_folder(rapid_input_directory)
            master_watershed_input_directory = os.path.join(rapid_io_files_location, "input",
                                                            rapid_input_directory)
            master_watershed_outflow_directory = os.path.join(rapid_io_files_location, 'output',
                                                              rapid_input_directory,
                                                              forecast_date_timestep)
            #add USGS gage data to initialization file
            if initialize_flows:
                #update initial flows with usgs data
                update_inital_flows_usgs(master_watershed_input_directory,
                                         forecast_date_timestep)
            #create jobs for HTCondor
            for watershed_job_index, forecast in enumerate(ecmwf_forecasts):
                ensemble_number = get_ensemble_number_from_forecast(forecast)
                # directory may already exist from a previous ensemble member
                try:
                    os.makedirs(master_watershed_outflow_directory)
                except OSError:
                    pass
                #initialize HTCondor Logging Directory
                subprocess_forecast_log_dir = os.path.join(subprocess_log_directory, forecast_date_timestep)
                try:
                    os.makedirs(subprocess_forecast_log_dir)
                except OSError:
                    pass
                #get basin names
                outflow_file_name = 'Qout_%s_%s_%s.nc' % (watershed.lower(),
                                                          subbasin.lower(),
                                                          ensemble_number)
                node_rapid_outflow_file = outflow_file_name
                master_rapid_outflow_file = os.path.join(master_watershed_outflow_directory,
                                                         outflow_file_name)
                job_name = 'job_%s_%s_%s_%s_%s' % (forecast_date_timestep, watershed,
                                                   subbasin, ensemble_number, iteration)
                if mp_mode == "htcondor":
                    #create job to downscale forecasts for watershed
                    job = CJob(job_name, tmplt.vanilla_transfer_files)
                    job.set('executable', os.path.join(local_scripts_location, 'htcondor_ecmwf_rapid.py'))
                    job.set('transfer_input_files', "%s, %s, %s" % (forecast,
                                                                    master_watershed_input_directory,
                                                                    local_scripts_location))
                    job.set('initialdir', subprocess_forecast_log_dir)
                    job.set('arguments', '%s %s %s %s %s %s' % (forecast,
                                                                forecast_date_timestep,
                                                                watershed.lower(),
                                                                subbasin.lower(),
                                                                rapid_executable_location,
                                                                initialize_flows))
                    # map the node-local output filename back to the master
                    # outflow path on transfer
                    job.set('transfer_output_remaps', "\"%s = %s\"" % (node_rapid_outflow_file,
                                                                       master_rapid_outflow_file))
                    job.submit()
                    rapid_watershed_jobs[rapid_input_directory]['jobs'].append(job)
                    rapid_watershed_jobs[rapid_input_directory]['jobs_info'].append({'watershed': watershed,
                                                                                     'subbasin': subbasin,
                                                                                     'outflow_file_name': master_rapid_outflow_file,
                                                                                     'forecast_date_timestep': forecast_date_timestep,
                                                                                     'ensemble_number': ensemble_number,
                                                                                     'master_watershed_outflow_directory': master_watershed_outflow_directory,
                                                                                     })
                elif mp_mode == "multiprocess":
                    # argument tuple consumed by run_ecmwf_rapid_multiprocess_worker
                    rapid_watershed_jobs[rapid_input_directory]['jobs'].append((forecast,
                                                                                forecast_date_timestep,
                                                                                watershed.lower(),
                                                                                subbasin.lower(),
                                                                                rapid_executable_location,
                                                                                initialize_flows,
                                                                                job_name,
                                                                                master_rapid_outflow_file,
                                                                                master_watershed_input_directory,
                                                                                mp_execute_directory,
                                                                                subprocess_forecast_log_dir,
                                                                                watershed_job_index))
##                    run_ecmwf_rapid_multiprocess_worker((forecast,
##                                                         forecast_date_timestep,
##                                                         watershed.lower(),
##                                                         subbasin.lower(),
##                                                         rapid_executable_location,
##                                                         initialize_flows,
##                                                         job_name,
##                                                         master_rapid_outflow_file,
##                                                         master_watershed_input_directory,
##                                                         mp_execute_directory,
##                                                         subprocess_forecast_log_dir,
##                                                         watershed_job_index))
                else:
                    raise Exception("ERROR: Invalid mp_mode. Valid types are htcondor and multiprocess ...")
                iteration += 1

        for rapid_input_directory, watershed_job_info in rapid_watershed_jobs.iteritems():
            #add sub job list to master job list
            master_job_info_list = master_job_info_list + watershed_job_info['jobs_info']
            if mp_mode == "htcondor":
                #wait for jobs to finish then upload files
                for job_index, job in enumerate(watershed_job_info['jobs']):
                    job.wait()
                    #upload file when done
                    if data_manager:
                        upload_single_forecast(watershed_job_info['jobs_info'][job_index], data_manager)
            elif mp_mode == "multiprocess":
                pool_main = mp_Pool()
                multiprocess_worker_list = pool_main.imap_unordered(run_ecmwf_rapid_multiprocess_worker,
                                                                    watershed_job_info['jobs'],
                                                                    chunksize=1)
                if data_manager:
                    # worker returns its job index first so results can be
                    # matched to jobs_info despite unordered completion
                    for multi_job_output in multiprocess_worker_list:
                        job_index = multi_job_output[0]
                        #upload file when done
                        upload_single_forecast(watershed_job_info['jobs_info'][job_index], data_manager)
                #just in case ...
                pool_main.close()
                pool_main.join()

            #when all jobs in watershed are done, generate warning points
            if create_warning_points:
                watershed, subbasin = get_watershed_subbasin_from_folder(rapid_input_directory)
                forecast_directory = os.path.join(rapid_io_files_location, 'output',
                                                  rapid_input_directory,
                                                  forecast_date_timestep)
                era_interim_watershed_directory = os.path.join(era_interim_data_location, rapid_input_directory)
                if os.path.exists(era_interim_watershed_directory):
                    print "Generating Warning Points for", watershed, subbasin, "from", forecast_date_timestep
                    era_interim_files = glob(os.path.join(era_interim_watershed_directory, "return_period*.nc"))
                    if era_interim_files:
                        try:
                            generate_warning_points(forecast_directory,
                                                    era_interim_files[0],
                                                    forecast_directory,
                                                    threshold=10)
                            if upload_output_to_ckan and data_store_url and data_store_api_key:
                                data_manager.initialize_run_ecmwf(watershed, subbasin, forecast_date_timestep)
                                data_manager.zip_upload_warning_points_in_directory(forecast_directory)
                        except Exception, ex:
                            # best-effort: a warning-point failure must not
                            # abort the remaining watersheds
                            print ex
                            pass
                    else:
                        print "No ERA Interim file found. Skipping ..."
                else:
                    print "No ERA Interim directory found for", rapid_input_directory, ". Skipping warning point generation..."

        #initialize flows for next run
        if initialize_flows:
            #create new init flow files/generate warning point files
            for rapid_input_directory in rapid_input_directories:
                input_directory = os.path.join(rapid_io_files_location, 'input', rapid_input_directory)
                forecast_directory = os.path.join(rapid_io_files_location, 'output',
                                                  rapid_input_directory,
                                                  forecast_date_timestep)
                if os.path.exists(forecast_directory):
                    #loop through all the rapid_namelist files in directory
                    watershed, subbasin = get_watershed_subbasin_from_folder(rapid_input_directory)
                    if initialize_flows:
                        print "Initializing flows for", watershed, subbasin, "from", forecast_date_timestep
                        basin_files = find_current_rapid_output(forecast_directory, watershed, subbasin)
                        try:
                            compute_initial_rapid_flows(basin_files, input_directory, forecast_date_timestep)
                        except Exception, ex:
                            # best-effort: continue initializing the other
                            # watersheds on failure
                            print ex
                            pass
def run_ecmwf_forecast_process(
        rapid_executable_location,  # path to RAPID executable
        rapid_io_files_location,  # path to RAPID input/output directory
        ecmwf_forecast_location,  # path to ECMWF forecasts
        subprocess_log_directory,  # path to store HTCondor/multiprocess logs
        main_log_directory,  # path to store main logs
        region="",  # 1 of the 12 partitioned ECMWF files. Leave empty if using global
        data_store_url="",  # CKAN API url
        data_store_api_key="",  # CKAN API Key
        data_store_owner_org="",  # CKAN owner organization
        app_instance_id="",  # Streamflow Prediction tool instance ID
        sync_rapid_input_with_ckan=False,  # match Streamflow Prediction tool RAPID input
        download_ecmwf=True,  # download recent ECMWF forecast before running
        date_string="",  # string of date of interest
        ftp_host="",  # ECMWF ftp site path
        ftp_login="",  # ECMWF ftp login name
        ftp_passwd="",  # ECMWF ftp password
        ftp_directory="",  # ECMWF ftp directory
        delete_past_ecmwf_forecasts=True,  # deletes all past forecasts before next run
        upload_output_to_ckan=False,  # upload data to CKAN and remove local copy
        delete_output_when_done=False,  # delete all output data from this code
        initialize_flows=False,  # use forecast to initialize next run
        warning_flow_threshold=100,  # flows below this threshold will be ignored
        era_interim_data_location="",  # path to ERA Interim return period data
        create_warning_points=False,  # generate warning points for Streamflow Prediction Tool
        autoroute_executable_location="",  # location of AutoRoute executable
        autoroute_io_files_location="",  # path to AutoRoute input/output directory
        geoserver_url="",  # url to API endpoint ending in geoserver/rest
        geoserver_username="",  # username for geoserver
        geoserver_password="",  # password for geoserver
        mp_mode='htcondor',  # valid options are htcondor and multiprocess
        mp_execute_directory="",  # required if using multiprocess mode
        initialization_time_step=12,  # time step of ECMWF Forecast Process, in hours
        # doesn't appear to be used MJS 8/23/2020...
        # NOTE(review): mutable default argument ([]) — harmless while unused,
        # but should become None if this parameter is ever read.
        watersheds_with_dams_list=[],  # a list of all watersheds where dam outflows are being forced
        # doesn't appear to be used, MJS 8/23/2020...
        # NOTE(review): mutable default argument ({}) — same caveat as above.
        stream_ids_with_dams_dict={},  # a dictionary with the watershed key and a value of a list of stream IDs where dams are located
        # doesn't appear to be used, MJS 8/23/2020...
        dam_outflows={},  # a dictionary with the key as a stream ID and a value of a list of outflows
        BS_opt_dam=False,
        IS_dam_tot=0,
        IS_dam_use=0,
        dam_tot_id_file="",
        dam_use_id_file="",
        dam_file=""):
    """
    This is the main ECMWF RAPID forecast process.

    Runs the full forecast pipeline under a lock file so only one instance
    executes at a time: finds ECMWF runoff forecast folders newer than the
    last completed run, optionally seeds initial flows (seasonal average or
    historical streamflow), fans out one RAPID job per ensemble file per
    watershed (HTCondor or multiprocessing pool), uploads results to CKAN,
    generates warning points, writes initialization files for the next run,
    and optionally runs AutoRoute. All stdout is captured to a timestamped
    log file in ``main_log_directory``.

    Raises:
        ImportError: if condorpy / AutoRoute / spt_dataset_manager support is
            requested but the corresponding package is not installed.
        Exception: if ``mp_mode`` is 'multiprocess' without a valid
            ``mp_execute_directory``, or if ``mp_mode`` is unrecognized.
    """
    time_begin_all = datetime.datetime.utcnow()

    LOCAL_SCRIPTS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
    # JSON lock file guarding against concurrent runs; also records the
    # datetime of the last completed forecast.
    LOCK_INFO_FILE = os.path.join(main_log_directory, "spt_compute_ecmwf_run_info_lock.txt")

    log_file_path = os.path.join(
        main_log_directory,
        "spt_compute_ecmwf_{0}.log".format(time_begin_all.strftime("%y%m%d%H%M%S")))

    # everything printed below goes to the log file, not the console
    with CaptureStdOutToLog(log_file_path):
        if not CONDOR_ENABLED and mp_mode == 'htcondor':
            raise ImportError(
                "condorpy is not installed. Please install condorpy to use the 'htcondor' option."
            )

        if not AUTOROUTE_ENABLED and autoroute_executable_location and autoroute_io_files_location:
            raise ImportError(
                "AutoRoute is not enabled. Please install tethys_dataset_services"
                " and AutoRoutePy to use the AutoRoute option.")

        if mp_mode == "multiprocess":
            if not mp_execute_directory or not os.path.exists(mp_execute_directory):
                raise Exception(
                    "If mode is multiprocess, mp_execute_directory is required ..."
                )

        if sync_rapid_input_with_ckan and app_instance_id and data_store_url and data_store_api_key:
            # sync with data store
            ri_manager = RAPIDInputDatasetManager(data_store_url,
                                                  data_store_api_key,
                                                  'ecmwf',
                                                  app_instance_id)
            ri_manager.sync_dataset(os.path.join(rapid_io_files_location, 'input'))

        # clean up old log files
        clean_logs(subprocess_log_directory,
                   main_log_directory,
                   log_file_path=log_file_path)

        data_manager = None
        if upload_output_to_ckan and data_store_url and data_store_api_key:
            if not SPT_DATASET_ENABLED:
                raise ImportError(
                    "spt_dataset_manager is not installed. "
                    "Please install spt_dataset_manager to use the 'ckan' options."
                )
            # init data manager for CKAN
            data_manager = ECMWFRAPIDDatasetManager(data_store_url,
                                                    data_store_api_key,
                                                    data_store_owner_org)

        # get list of correctly formatted rapid input directories in rapid directory
        rapid_input_directories = get_valid_watershed_list(
            os.path.join(rapid_io_files_location, "input"))

        if download_ecmwf and ftp_host:
            # get list of folders to download
            ecmwf_folders = sorted(
                get_ftp_forecast_list(
                    'Runoff.%s*%s*.netcdf.tar*' % (date_string, region),
                    ftp_host, ftp_login, ftp_passwd, ftp_directory))
        else:
            # get list of folders to run
            ecmwf_folders = sorted(
                glob(
                    os.path.join(ecmwf_forecast_location,
                                 'Runoff.' + date_string + '*.netcdf')))

        # LOAD LOCK INFO FILE
        # default "last forecast" is the Unix epoch so that, with no prior
        # lock file, every discovered forecast is considered new
        last_forecast_date = datetime.datetime.utcfromtimestamp(0)
        if os.path.exists(LOCK_INFO_FILE):
            with open(LOCK_INFO_FILE) as fp_lock_info:
                previous_lock_info = json.load(fp_lock_info)

                if previous_lock_info['running']:
                    print("Another SPT ECMWF forecast process is running.\n"
                          "The lock file is located here: {0}\n"
                          "If this is an error, you have two options:\n"
                          "1) Delete the lock file.\n"
                          "2) Edit the lock file and set \"running\" to false. \n"
                          "Then, re-run this script. \n Exiting ...".format(
                              LOCK_INFO_FILE))
                    return
                else:
                    last_forecast_date = datetime.datetime.strptime(
                        previous_lock_info['last_forecast_date'], '%Y%m%d%H')
                    run_ecmwf_folders = []
                    for ecmwf_folder in ecmwf_folders:
                        # get date
                        forecast_date = get_datetime_from_forecast_folder(
                            ecmwf_folder)
                        # if more recent, add to list
                        # check to determine if forecast time step is 12 or 24 hours
                        if initialization_time_step == 24:
                            # 24-hour mode also skips the 12Z cycle
                            if forecast_date > last_forecast_date and forecast_date.hour != 12:
                                run_ecmwf_folders.append(ecmwf_folder)
                        elif initialization_time_step == 12:
                            if forecast_date > last_forecast_date:
                                run_ecmwf_folders.append(ecmwf_folder)
                    ecmwf_folders = run_ecmwf_folders

        if not ecmwf_folders:
            print("No new forecasts found to run. Exiting ...")
            return

        # GENERATE NEW LOCK INFO FILE
        update_lock_info_file(LOCK_INFO_FILE, True,
                              last_forecast_date.strftime('%Y%m%d%H'))

        # rapid_input_directories_sub = [rapid_input_directory for rapid_input_directory in rapid_input_directories if hydroshed_index in rapid_input_directory]

        # Try/Except added for lock file
        try:
            # ADD SEASONAL INITIALIZATION WHERE APPLICABLE
            if initialize_flows:
                initial_forecast_date_timestep = get_date_timestep_from_forecast_folder(
                    ecmwf_folders[0])

                seasonal_init_job_list = []
                for rapid_input_directory in rapid_input_directories:
                    seasonal_master_watershed_input_directory = os.path.join(
                        rapid_io_files_location, "input", rapid_input_directory)
                    # add seasonal initialization if no initialization file and historical Qout file exists
                    if era_interim_data_location and os.path.exists(
                            era_interim_data_location):
                        era_interim_watershed_directory = os.path.join(
                            era_interim_data_location, rapid_input_directory)
                        if os.path.exists(era_interim_watershed_directory):
                            # INITIALIZE FROM SEASONAL AVERAGE FILE
                            seasonal_streamflow_file = glob(
                                os.path.join(era_interim_watershed_directory,
                                             "seasonal_average*.nc"))
                            if seasonal_streamflow_file:
                                seasonal_init_job_list.append(
                                    (seasonal_streamflow_file[0],
                                     seasonal_master_watershed_input_directory,
                                     initial_forecast_date_timestep,
                                     "seasonal_average_file"))
                            else:
                                # INITIALIZE FROM HISTORICAL STREAMFLOW FILE
                                historical_qout_file = glob(
                                    os.path.join(
                                        era_interim_watershed_directory,
                                        "Qout*.nc"))
                                if historical_qout_file:
                                    seasonal_init_job_list.append((
                                        historical_qout_file[0],
                                        seasonal_master_watershed_input_directory,
                                        initial_forecast_date_timestep,
                                        "historical_streamflow_file"))

                if seasonal_init_job_list:
                    # use multiprocessing instead of htcondor due to potential for huge file sizes
                    if len(seasonal_init_job_list) > 1:
                        seasonal_pool = mp_Pool()
                        seasonal_pool.imap(
                            compute_seasonal_initial_rapid_flows_multicore_worker,
                            seasonal_init_job_list,
                            chunksize=1)
                        seasonal_pool.close()
                        seasonal_pool.join()
                    else:
                        compute_seasonal_initial_rapid_flows_multicore_worker(
                            seasonal_init_job_list[0])

            # ----------------------------------------------------------------------
            # BEGIN ECMWF-RAPID FORECAST LOOP
            # ----------------------------------------------------------------------
            master_job_info_list = []
            for ecmwf_folder in ecmwf_folders:
                if download_ecmwf:
                    # download forecast
                    ecmwf_folder = download_and_extract_ftp(
                        ecmwf_forecast_location, ecmwf_folder, ftp_host,
                        ftp_login, ftp_passwd, ftp_directory,
                        delete_past_ecmwf_forecasts)

                # get list of forecast files
                ecmwf_forecasts = glob(
                    os.path.join(ecmwf_folder, '*.runoff.%s*nc' % region))

                # look for old version of forecasts
                if not ecmwf_forecasts:
                    ecmwf_forecasts = glob(os.path.join(ecmwf_folder, 'full_*.runoff.netcdf')) + \
                                      glob(os.path.join(ecmwf_folder, '*.52.205.*.runoff.netcdf'))

                if not ecmwf_forecasts:
                    # release the lock before bailing out
                    print("ERROR: Forecasts not found in folder. Exiting ...")
                    update_lock_info_file(
                        LOCK_INFO_FILE, False,
                        last_forecast_date.strftime('%Y%m%d%H'))
                    return

                # make the largest files first
                ecmwf_forecasts.sort(key=os.path.getsize, reverse=True)

                forecast_date_timestep = get_date_timestep_from_forecast_folder(
                    ecmwf_folder)
                print("Running ECMWF Forecast: {0}".format(
                    forecast_date_timestep))

                # submit jobs to downsize ecmwf files to watershed
                rapid_watershed_jobs = {}
                for rapid_input_directory in rapid_input_directories:
                    # keep list of jobs: 'jobs' holds CJob objects (htcondor)
                    # or argument tuples (multiprocess); 'jobs_info' holds
                    # metadata used for uploads
                    rapid_watershed_jobs[rapid_input_directory] = {
                        'jobs': [],
                        'jobs_info': []
                    }
                    print("Running forecasts for: {0} {1}".format(
                        rapid_input_directory,
                        os.path.basename(ecmwf_folder)))

                    watershed, subbasin = get_watershed_subbasin_from_folder(
                        rapid_input_directory)
                    master_watershed_input_directory = os.path.join(
                        rapid_io_files_location, "input",
                        rapid_input_directory)
                    master_watershed_outflow_directory = os.path.join(
                        rapid_io_files_location, 'output',
                        rapid_input_directory, forecast_date_timestep)
                    try:
                        os.makedirs(master_watershed_outflow_directory)
                    except OSError:
                        pass

                    # initialize HTCondor/multiprocess Logging Directory
                    subprocess_forecast_log_dir = os.path.join(
                        subprocess_log_directory, forecast_date_timestep)
                    try:
                        os.makedirs(subprocess_forecast_log_dir)
                    except OSError:
                        pass

                    # add USGS gage data to initialization file
                    if initialize_flows:
                        # update initial flows with usgs data
                        update_inital_flows_usgs(
                            master_watershed_input_directory,
                            forecast_date_timestep)

                    # create jobs for HTCondor/multiprocess
                    for watershed_job_index, forecast in enumerate(
                            ecmwf_forecasts):
                        ensemble_number = get_ensemble_number_from_forecast(
                            forecast)

                        # get basin names
                        outflow_file_name = 'Qout_%s_%s_%s.nc' % (
                            watershed.lower(), subbasin.lower(),
                            ensemble_number)
                        node_rapid_outflow_file = outflow_file_name
                        master_rapid_outflow_file = os.path.join(
                            master_watershed_outflow_directory,
                            outflow_file_name)

                        job_name = 'job_%s_%s_%s_%s' % (forecast_date_timestep,
                                                        watershed, subbasin,
                                                        ensemble_number)

                        rapid_watershed_jobs[rapid_input_directory][
                            'jobs_info'].append({
                                'watershed': watershed,
                                'subbasin': subbasin,
                                'outflow_file_name': master_rapid_outflow_file,
                                'forecast_date_timestep': forecast_date_timestep,
                                'ensemble_number': ensemble_number,
                                'master_watershed_outflow_directory':
                                master_watershed_outflow_directory,
                                'data_manager': data_manager  # added this to try to upload forecast in mp
                            })
                        if mp_mode == "htcondor":
                            # create job to downscale forecasts for watershed
                            job = CJob(job_name, tmplt.vanilla_transfer_files)
                            job.set(
                                'executable',
                                os.path.join(LOCAL_SCRIPTS_DIRECTORY,
                                             'htcondor_ecmwf_rapid.py'))
                            job.set(
                                'transfer_input_files', "%s, %s, %s" %
                                (forecast, master_watershed_input_directory,
                                 LOCAL_SCRIPTS_DIRECTORY))
                            job.set('initialdir', subprocess_forecast_log_dir)
                            job.set(
                                'arguments', '%s %s %s %s %s %s' %
                                (forecast, forecast_date_timestep,
                                 watershed.lower(), subbasin.lower(),
                                 rapid_executable_location, initialize_flows))
                            # map node-local output back to the master path
                            job.set(
                                'transfer_output_remaps',
                                "\"%s = %s\"" % (node_rapid_outflow_file,
                                                 master_rapid_outflow_file))
                            job.submit()
                            rapid_watershed_jobs[rapid_input_directory][
                                'jobs'].append(job)
                        elif mp_mode == "multiprocess":
                            # argument tuple consumed by run_ecmwf_rapid_multiprocess_worker
                            rapid_watershed_jobs[rapid_input_directory][
                                'jobs'].append((
                                    forecast,
                                    forecast_date_timestep,
                                    watershed.lower(),
                                    subbasin.lower(),
                                    rapid_executable_location,
                                    initialize_flows,
                                    job_name,
                                    master_rapid_outflow_file,
                                    master_watershed_input_directory,
                                    mp_execute_directory,
                                    subprocess_forecast_log_dir,
                                    watershed_job_index,
                                    initialization_time_step,
                                    # dam arguments included, MJS 8/23/2020 ........
                                    BS_opt_dam,
                                    IS_dam_tot,
                                    IS_dam_use,
                                    dam_tot_id_file,
                                    dam_use_id_file,
                                    dam_file))
                            # COMMENTED CODE FOR DEBUGGING SERIALLY
##                            run_ecmwf_rapid_multiprocess_worker((forecast,
##                                                                 forecast_date_timestep,
##                                                                 watershed.lower(),
##                                                                 subbasin.lower(),
##                                                                 rapid_executable_location,
##                                                                 initialize_flows,
##                                                                 job_name,
##                                                                 master_rapid_outflow_file,
##                                                                 master_watershed_input_directory,
##                                                                 mp_execute_directory,
##                                                                 subprocess_forecast_log_dir,
##                                                                 watershed_job_index
##                                                                 initialization_time_step))
                        else:
                            raise Exception(
                                "ERROR: Invalid mp_mode. Valid types are htcondor and multiprocess ..."
                            )

                for rapid_input_directory, watershed_job_info in rapid_watershed_jobs.items(
                ):
                    # add sub job list to master job list
                    master_job_info_list = master_job_info_list + watershed_job_info[
                        'jobs_info']
                    if mp_mode == "htcondor":
                        # wait for jobs to finish then upload files
                        for job_index, job in enumerate(
                                watershed_job_info['jobs']):
                            job.wait()
                            # upload file when done
                            if data_manager:
                                upload_single_forecast(
                                    watershed_job_info['jobs_info'][job_index],
                                    data_manager)
                    elif mp_mode == "multiprocess":
                        pool_main = mp_Pool()
                        # bind jobs_info as the first worker argument; each
                        # job tuple is passed as the second
                        func = partial(run_ecmwf_rapid_multiprocess_worker,
                                       watershed_job_info['jobs_info'])
                        multiprocess_worker_list = pool_main.imap_unordered(
                            func,
                            watershed_job_info['jobs'],
                            # watershed_job_info['jobs'],
                            chunksize=1)
                        if data_manager:
                            # worker yields its job index so results can be
                            # matched despite unordered completion
                            for multi_job_index in multiprocess_worker_list:
                                # upload file when done
                                upload_single_forecast(
                                    watershed_job_info['jobs_info']
                                    [multi_job_index], data_manager)
                        # just in case ...
                        pool_main.close()
                        pool_main.join()

                    # when all jobs in watershed are done, generate warning points
                    if create_warning_points:
                        watershed, subbasin = get_watershed_subbasin_from_folder(
                            rapid_input_directory)
                        forecast_directory = os.path.join(
                            rapid_io_files_location, 'output',
                            rapid_input_directory, forecast_date_timestep)

                        era_interim_watershed_directory = os.path.join(
                            era_interim_data_location, rapid_input_directory)
                        if os.path.exists(era_interim_watershed_directory):
                            print(
                                "Generating warning points for {0}-{1} from {2}"
                                .format(watershed, subbasin,
                                        forecast_date_timestep))
                            era_interim_files = glob(
                                os.path.join(era_interim_watershed_directory,
                                             "return_period*.nc"))
                            if era_interim_files:
                                try:
                                    generate_ecmwf_warning_points(
                                        forecast_directory,
                                        era_interim_files[0],
                                        forecast_directory,
                                        threshold=warning_flow_threshold)
                                    if upload_output_to_ckan and data_store_url and data_store_api_key:
                                        data_manager.initialize_run_ecmwf(
                                            watershed, subbasin,
                                            forecast_date_timestep)
                                        data_manager.zip_upload_warning_points_in_directory(
                                            forecast_directory)
                                except Exception as ex:
                                    # best-effort: a warning-point failure
                                    # must not abort remaining watersheds
                                    print(ex)
                                    pass
                            else:
                                print(
                                    "No ERA Interim file found. Skipping ...")
                        else:
                            print(
                                "No ERA Interim directory found for {0}. "
                                "Skipping warning point generation...".format(
                                    rapid_input_directory))

                # initialize flows for next run
                if initialize_flows:
                    # create new init flow files/generate warning point files
                    for rapid_input_directory in rapid_input_directories:
                        input_directory = os.path.join(rapid_io_files_location,
                                                       'input',
                                                       rapid_input_directory)

                        forecast_directory = os.path.join(
                            rapid_io_files_location, 'output',
                            rapid_input_directory, forecast_date_timestep)

                        if os.path.exists(forecast_directory):
                            # loop through all the rapid_namelist files in directory
                            watershed, subbasin = get_watershed_subbasin_from_folder(
                                rapid_input_directory)
                            if initialize_flows:
                                print(
                                    "Initializing flows for {0}-{1} from {2}".
                                    format(watershed, subbasin,
                                           forecast_date_timestep))
                                basin_files = find_current_rapid_output(
                                    forecast_directory, watershed, subbasin)
                                try:
                                    compute_initial_rapid_flows(
                                        basin_files, input_directory,
                                        forecast_date_timestep,
                                        initialization_time_step)
                                except Exception as ex:
                                    # best-effort: continue with the other
                                    # watersheds on failure
                                    print(ex)
                                    pass

                # run autoroute process if added
                if autoroute_executable_location and autoroute_io_files_location:
                    # run autoroute on all of the watersheds
                    run_autorapid_process(autoroute_executable_location,
                                          autoroute_io_files_location,
                                          rapid_io_files_location,
                                          forecast_date_timestep,
                                          subprocess_forecast_log_dir,
                                          geoserver_url, geoserver_username,
                                          geoserver_password, app_instance_id)

                last_forecast_date = get_datetime_from_date_timestep(
                    forecast_date_timestep)

                # update lock info file with next forecast
                update_lock_info_file(LOCK_INFO_FILE, True,
                                      last_forecast_date.strftime('%Y%m%d%H'))

            # ----------------------------------------------------------------------
            # END FORECAST LOOP
            # ----------------------------------------------------------------------
        except Exception as ex:
            # log the traceback but fall through so the lock file is released
            print_exc()
            print(ex)
            pass

        # Release & update lock info file with all completed forecasts
        update_lock_info_file(LOCK_INFO_FILE, False,
                              last_forecast_date.strftime('%Y%m%d%H'))

        if delete_output_when_done:
            # delete local datasets
            for job_info in master_job_info_list:
                try:
                    rmtree(job_info['master_watershed_outflow_directory'])
                except OSError:
                    pass
            # delete watershed folder if empty
            for item in os.listdir(
                    os.path.join(rapid_io_files_location, 'output')):
                try:
                    os.rmdir(
                        os.path.join(rapid_io_files_location, 'output', item))
                except OSError:
                    pass

        # print info to user
        time_end = datetime.datetime.utcnow()
        print("Time Begin: {0}".format(time_begin_all))
        print("Time Finish: {0}".format(time_end))
        print("TOTAL TIME: {0}".format(time_end - time_begin_all))
def run_era_interim_rapid_process(rapid_executable_location,
                                  rapid_io_files_location,
                                  ecmwf_forecast_location,
                                  era_interim_data_location,
                                  condor_log_directory,
                                  main_log_directory,
                                  data_store_url,
                                  data_store_api_key,
                                  app_instance_id,
                                  sync_rapid_input_with_ckan,
                                  download_ecmwf,
                                  download_era_interim,
                                  upload_output_to_ckan,
                                  generate_return_periods_file):
    """
    This is the main ERA Interim RAPID process.

    Optionally syncs the RAPID input directory with the CKAN data store,
    optionally downloads historical ERA Interim runoff data, submits one
    HTCondor job per correctly formatted "<watershed>-<subbasin>" input
    directory, waits for all jobs to finish, and optionally generates a
    return-periods file for each watershed.

    Parameters:
        rapid_executable_location -- path to RAPID executable
        rapid_io_files_location -- path to RAPID input/output directory
        ecmwf_forecast_location -- path to ECMWF forecasts
        era_interim_data_location -- path to ERA Interim runoff data
        condor_log_directory -- path to store HTCondor logs
        main_log_directory -- path to store main logs
        data_store_url -- CKAN API url
        data_store_api_key -- CKAN API key
        app_instance_id -- Streamflow Prediction Tool instance ID
        sync_rapid_input_with_ckan -- sync RAPID input with CKAN data store
        download_ecmwf -- NOTE(review): currently unused by this function;
                          kept for interface compatibility
        download_era_interim -- download historical ERA Interim data first
        upload_output_to_ckan -- initialize the CKAN dataset manager
        generate_return_periods_file -- create return_periods.nc per watershed
    """
    time_begin_all = datetime.datetime.utcnow()
    date_string = time_begin_all.strftime('%Y%m%d')
    #date_string = datetime.datetime(2015,2,3).strftime('%Y%m%d')
    rapid_scripts_location = os.path.dirname(os.path.realpath(__file__))

    if sync_rapid_input_with_ckan and app_instance_id \
            and data_store_url and data_store_api_key:
        #sync with data store
        ri_manager = RAPIDInputDatasetManager(data_store_url,
                                              data_store_api_key,
                                              'ecmwf',
                                              app_instance_id)
        ri_manager.sync_dataset(os.path.join(rapid_io_files_location, 'input'))

    #clean up old log files
    clean_logs(condor_log_directory, main_log_directory)

    #initialize HTCondor directory for this run
    condor_init_dir = os.path.join(condor_log_directory, date_string)
    try:
        os.makedirs(condor_init_dir)
    except OSError:
        #directory already exists
        pass

    #get list of correctly formatted rapid input directories
    #(expected format: "<watershed>-<subbasin>")
    rapid_input_base = os.path.join(rapid_io_files_location, 'input')
    rapid_input_directories = []
    for directory in os.listdir(rapid_input_base):
        if os.path.isdir(os.path.join(rapid_input_base, directory)) \
                and len(directory.split("-")) == 2:
            rapid_input_directories.append(directory)
        else:
            print(directory, "incorrectly formatted. Skipping ...")

    era_interim_folder = era_interim_data_location
    if download_era_interim:
        #download historical ERA data
        era_interim_folders = download_all_ftp(
            era_interim_data_location, 'erai_runoff_1980to20*.tar.gz.tar')
        era_interim_folder = era_interim_folders[0]

    if upload_output_to_ckan and data_store_url and data_store_api_key:
        #init data manager for CKAN
        data_manager = ECMWFRAPIDDatasetManager(data_store_url,
                                                data_store_api_key)

    #run ERA Interim processes: one HTCondor job per watershed
    iteration = 0
    job_list = []
    job_info_list = []
    for rapid_input_directory in rapid_input_directories:
        #safe to unpack: directory name was validated to have exactly one "-"
        watershed, subbasin = rapid_input_directory.split("-")
        master_watershed_input_directory = os.path.join(
            rapid_io_files_location, "input", rapid_input_directory)
        master_watershed_outflow_directory = os.path.join(
            rapid_io_files_location, 'output', rapid_input_directory)
        try:
            os.makedirs(master_watershed_outflow_directory)
        except OSError:
            #directory already exists
            pass

        #get basin names
        interim_folder_basename = os.path.basename(era_interim_folder)
        print(era_interim_folder, interim_folder_basename)
        outflow_file_name = 'Qout_%s.nc' % interim_folder_basename
        node_rapid_outflow_file = outflow_file_name
        master_rapid_outflow_file = os.path.join(
            master_watershed_outflow_directory, outflow_file_name)

        #create job to downscale forecasts for watershed
        job = CJob('job_%s_%s_%s' % (interim_folder_basename, watershed, iteration),
                   tmplt.vanilla_transfer_files)
        job.set('executable',
                os.path.join(rapid_scripts_location, 'compute_ecmwf_rapid.py'))
        job.set('transfer_input_files',
                "%s, %s" % (master_watershed_input_directory,
                            rapid_scripts_location))
        job.set('initialdir', condor_init_dir)
        job.set('arguments', '%s %s %s %s %s' % (watershed.lower(),
                                                 subbasin.lower(),
                                                 rapid_executable_location,
                                                 era_interim_folder,
                                                 ecmwf_forecast_location))
        #remap node-local outflow file back to the master output location
        job.set('transfer_output_remaps',
                "\"%s = %s\"" % (node_rapid_outflow_file,
                                 master_rapid_outflow_file))
        job.submit()
        job_list.append(job)
        job_info_list.append({
            'watershed': watershed,
            'subbasin': subbasin,
            'outflow_file_name': master_rapid_outflow_file,
            'master_watershed_outflow_directory': master_watershed_outflow_directory,
        })
        iteration += 1

    #wait for jobs to finish, then post-process each watershed
    for index, job in enumerate(job_list):
        job.wait()
        #generate return periods
        if generate_return_periods_file:
            job_info = job_info_list[index]
            watershed_output_dir = job_info['master_watershed_outflow_directory']
            erai_output_file = job_info['outflow_file_name']
            return_periods_file = os.path.join(watershed_output_dir,
                                               'return_periods.nc')
            generate_return_periods(erai_output_file, return_periods_file)

    # NOTE(review): a large dead block of commented-out CKAN upload and
    # flow-initialization code (a no-op triple-quoted string that also
    # referenced names not in this function's signature) was removed here.

    #print info to user
    time_end = datetime.datetime.utcnow()
    print("Time Begin All: " + str(time_begin_all))
    print("Time Finish All: " + str(time_end))
    print("TOTAL TIME: " + str(time_end - time_begin_all))