def determine_order_disposition(proc_cfg, username):
    """Run the order disposition step through the ESPA API.

    Connects to the API named by the ``espa_api`` option of the
    ``processing`` section, confirms that order disposition is enabled and
    then asks the API to handle the orders for ``username``.

    Args:
        proc_cfg: Configuration object providing ``get(section, option)``.
        username: User name forwarded to ``server.handle_orders()``.

    Raises:
        Exception: If the API URL is missing or invalid, if no API
            connection object is returned, or if order disposition is
            disabled.  Failures while handling the orders themselves are
            logged and not re-raised.
    """

    # Get the logger for this task
    log = logging.getLogger(LOGGER_NAME)

    api_url = proc_cfg.get('processing', 'espa_api')

    # Only attempt a connection when the URL looks plausible
    url_looks_valid = (api_url is not None and
                       api_url.startswith('http://') and
                       len(api_url) > 7)
    if not url_looks_valid:
        raise Exception('Missing or invalid API URL')

    api = api_interface.api_connect(api_url)

    # Verify the API is up
    if api is None:
        raise Exception('API server was None... exiting')

    # The system.order_disposition_enabled setting decides whether any
    # work is done at all
    enabled_flag = api.get_configuration('system.order_disposition_enabled')
    if enabled_flag.lower() != 'true':
        raise Exception('order disposition disabled... exiting')

    try:
        handled = api.handle_orders(username)
        if not handled:
            raise Exception('server.handle_orders() was not successful')

    except api_interface.APIException:
        log.exception('A protocol error occurred')

    except Exception:
        log.exception('An error occurred finalizing orders')

    finally:
        # Drop the reference to the API connection object
        api = None
def main():
    """Download and process LAADS data for a range of years.

    Command line options:
        -s/--start_year and -e/--end_year: explicit year range to process.
        --today: process the current year (and also the previous year when
            the current day of year is within the first 31 days).
        --quarterly: process from the current year back to START_YEAR.

    The auxiliary directory is read from the L8_AUX_DIR environment
    variable.  The LAADS application token comes from the module-level
    TOKEN when it is set; otherwise it is requested from the ESPA API whose
    URL is read from the processing configuration file.

    Returns:
        SUCCESS when every requested year was processed, ERROR on invalid
        arguments, missing environment/token, or a failure reported by
        getLadsData().
    """
    logger = logging.getLogger(__name__)  # Get logger for the module.

    # get the command line arguments
    parser = OptionParser()
    parser.add_option('-s', '--start_year', type='int', dest='syear',
                      default=0,
                      help='year for which to start pulling LAADS data')
    parser.add_option('-e', '--end_year', type='int', dest='eyear',
                      default=0,
                      help='last year for which to pull LAADS data')
    parser.add_option('--today', dest='today', default=False,
                      action='store_true',
                      help='process LAADS data up through the most recent '
                           'year and DOY')
    msg = ('process or reprocess all LAADS data from today back to {}'
           .format(START_YEAR))
    parser.add_option('--quarterly', dest='quarterly', default=False,
                      action='store_true', help=msg)

    (options, _) = parser.parse_args()
    syear = options.syear           # starting year
    eyear = options.eyear           # ending year
    today = options.today           # process most recent year of data
    quarterly = options.quarterly   # process today back to START_YEAR

    # check the arguments: without --today or --quarterly both years are
    # required
    if not today and not quarterly and (syear == 0 or eyear == 0):
        msg = ('Invalid command line argument combination.  Type --help '
               'for more information.')
        logger.error(msg)
        return ERROR

    # determine the auxiliary directory to store the data
    auxdir = os.environ.get('L8_AUX_DIR')
    if auxdir is None:
        msg = 'L8_AUX_DIR environment variable not set... exiting'
        logger.error(msg)
        return ERROR

    # Get the application token for the LAADS https interface.  Start from
    # the module-level TOKEN (non-ESPA processing); binding it up front
    # guarantees 'token' is always defined, even when the API connection
    # below fails.
    token = TOKEN
    if token is None:
        # ESPA Processing Environment
        # Read the processing configuration to get the URL for the ESPA API.
        # Connect to the ESPA API and get the application token for
        # downloading the LAADS data from the internal database.
        proc_cfg = retrieve_cfg('processing.conf')
        rpcurl = proc_cfg.get('processing', 'espa_api')
        server = api_connect(rpcurl)
        if server:
            token = server.get_configuration('aux.downloads.laads.token')

    if token is None:
        logger.error('Application token is None. This needs to be a valid '
                     'token provided for accessing the LAADS data. '
                     'https://ladsweb.modaps.eosdis.nasa.gov/tools-and-services/data-download-scripts/')
        return ERROR

    # Both the --today and --quarterly branches need the current date, so
    # it is computed once before branching.
    now = datetime.datetime.now()

    # if processing today then process the current year.  if the current
    # DOY is within the first month, then process the previous year as well
    # to make sure we have all the recently available data processed.
    if today:
        msg = 'Processing LAADS data up to the most recent year and DOY.'
        logger.info(msg)
        day_of_year = now.timetuple().tm_yday
        eyear = now.year
        if day_of_year <= 31:
            syear = eyear - 1
        else:
            syear = eyear

    elif quarterly:
        msg = 'Processing LAADS data back to {}'.format(START_YEAR)
        logger.info(msg)
        eyear = now.year
        syear = START_YEAR

    msg = 'Processing LAADS data for {} - {}'.format(syear, eyear)
    logger.info(msg)

    # Work backwards from the end year to the start year (inclusive)
    for yr in range(eyear, syear - 1, -1):
        msg = 'Processing year: {}'.format(yr)
        logger.info(msg)
        status = getLadsData(auxdir, yr, today, token)
        if status == ERROR:
            msg = ('Problems occurred while processing LAADS data for year {}'
                   .format(yr))
            logger.error(msg)
            return ERROR

    msg = 'LAADS processing complete.'
    logger.info(msg)
    return SUCCESS
def process(proc_cfg, developer_sleep_mode=False): """Read all lines from STDIN and process them Each line is converted to a JSON dictionary of the parameters for processing. Validation is performed on the JSON dictionary to test if valid for this mapper. After validation the generation of the products is performed. """ # Initially set to the base logger logger = EspaLogging.get_logger('base') processing_location = socket.gethostname() # Process each line from stdin for line in sys.stdin: if not line or len(line) < 1 or not line.strip().find('{') > -1: # this is how the nlineinputformat is supplying values: # 341104 {"orderid": # logger.info('BAD LINE:{}##'.format(line)) continue else: # take the entry starting at the first opening parenth to the end line = line[line.find('{'):] line = line.strip() # Reset these for each line (server, order_id, product_id) = (None, None, None) start_time = datetime.datetime.now() # Initialize so that we don't sleep dont_sleep = True try: line = line.replace('#', '') parms = json.loads(line) if not parameters.test_for_parameter(parms, 'options'): raise ValueError('Error missing JSON [options] record') # TODO scene will be replaced with product_id someday (order_id, product_id, product_type, options) = \ (parms['orderid'], parms['scene'], parms['product_type'], parms['options']) if product_id != 'plot': # Developer mode is always false unless you are a developer # so sleeping will always occur for none plotting requests # Override with the developer mode dont_sleep = developer_sleep_mode # Fix the orderid in-case it contains any single quotes # The processors can not handle single quotes in the email # portion due to usage in command lines. 
parms['orderid'] = order_id.replace("'", '') # If it is missing due to above TODO, then add it if not parameters.test_for_parameter(parms, 'product_id'): parms['product_id'] = product_id # Figure out if debug level logging was requested debug = False if parameters.test_for_parameter(options, 'debug'): debug = options['debug'] # Configure and get the logger for this order request EspaLogging.configure(settings.PROCESSING_LOGGER, order=order_id, product=product_id, debug=debug) logger = EspaLogging.get_logger(settings.PROCESSING_LOGGER) logger.info('Processing {}:{}'.format(order_id, product_id)) # Update the status in the database if parameters.test_for_parameter(parms, 'espa_api'): if parms['espa_api'] != 'skip_api': server = api_interface.api_connect(parms['espa_api']) if server is not None: status = server.update_status(product_id, order_id, processing_location, 'processing') if not status: logger.warning('Failed processing API call' ' to update_status to processing') if product_id != 'plot': # Make sure we can process the sensor tmp_info = sensor.info(product_id) del tmp_info # Make sure we have a valid output format if not parameters.test_for_parameter(options, 'output_format'): logger.warning('[output_format] parameter missing' ' defaulting to envi') options['output_format'] = 'envi' if (options['output_format'] not in parameters.VALID_OUTPUT_FORMATS): raise ValueError('Invalid Output format {}' .format(options['output_format'])) # ---------------------------------------------------------------- # NOTE: The first thing the product processor does during # initialization is validate the input parameters. 
# ---------------------------------------------------------------- destination_product_file = 'ERROR' destination_cksum_file = 'ERROR' pp = None try: # All processors are implemented in the processor module pp = processor.get_instance(proc_cfg, parms) (destination_product_file, destination_cksum_file) = \ pp.process() finally: # Free disk space to be nice to the whole system. if pp is not None: pp.remove_product_directory() # Sleep the number of seconds for minimum request duration sleep(get_sleep_duration(proc_cfg, start_time, dont_sleep)) archive_log_files(order_id, product_id) # Everything was successfull so mark the scene complete if server is not None: status = server.mark_scene_complete(product_id, order_id, processing_location, destination_product_file, destination_cksum_file, '') if not status: logger.warning('Failed processing API call to' ' mark_scene_complete') except Exception as excep: # First log the exception logger.exception('Exception encountered stacktrace follows') # Sleep the number of seconds for minimum request duration sleep(get_sleep_duration(proc_cfg, start_time, dont_sleep)) archive_log_files(order_id, product_id) if server is not None: try: status = set_product_error(server, order_id, product_id, processing_location) except Exception: logger.exception('Exception encountered stacktrace' ' follows') finally: # Reset back to the base logger logger = EspaLogging.get_logger('base')
def process_requests(cron_cfg, proc_cfg, args,
                     queue_priority, request_priority):
    """Retrieves and kicks off processes

    Queries the API service to see if there are any requests that need to be
    processed with the specified type, priority and/or user.  If there are,
    this method builds and executes a hadoop job and updates the status for
    each request through the API service.

    Args:
        cron_cfg (ConfigParser): Configuration for ESPA cron.
        proc_cfg (ConfigParser): Configuration for ESPA processing.
        args (struct): The arguments retrieved from the command line
            (``limit``, ``user`` and ``product_types`` are used).
        queue_priority (str): The queue to use or None.
        request_priority (str): The request to use or None.

    Returns:
        Nothing is returned.

    Raises:
        Exception(message): If the job count cannot be determined, the API
            URL is missing/invalid, HOME is not set, the API does not
            respond, a required landsatds.* setting is undefined, or on
            demand processing is disabled.  Errors raised after the request
            query starts are logged and not re-raised.
    """

    # Get the logger for this task
    logger = logging.getLogger(LOGGER_NAME)

    # check the number of hadoop jobs and don't do anything if they
    # are over a limit
    job_limit = cron_cfg.getint('hadoop', 'max_jobs')
    cmd = "hadoop job -list|awk '{print $1}'|grep -c job 2>/dev/null"
    try:
        job_count = execute_cmd(cmd)
    except Exception as e:
        # A count of zero is reported through an exception whose text
        # contains the command output (presumably because grep -c exits
        # non-zero when nothing matches -- confirm against execute_cmd).
        # Exceptions have no 'message' attribute on Python 3, so fall back
        # to str().
        message = getattr(e, 'message', '') or str(e)
        if 'Stdout/Stderr is: 0' in message:
            job_count = 0
        else:
            # Bare raise keeps the original traceback
            raise

    if int(job_count) >= int(job_limit):
        logger.warning('Detected {0} Hadoop jobs running'.format(job_count))
        logger.warning('No additional jobs will be run until job count'
                       ' is below {0}'.format(job_limit))
        return

    rpcurl = proc_cfg.get('processing', 'espa_api')
    server = None

    # Create a server object if the rpcurl seems valid
    if (rpcurl is not None and
            rpcurl.startswith('http://') and
            len(rpcurl) > 7):

        server = api_interface.api_connect(rpcurl)
    else:
        raise Exception('Missing or invalid environment variable ESPA_API')

    # All hadoop paths are derived from HOME; fail with a clear message
    # instead of an obscure error from os.path.join(None, ...)
    home_dir = os.environ.get('HOME')
    if home_dir is None:
        raise Exception('HOME environment variable is not defined...'
                        ' exiting')
    hadoop_executable = os.path.join(home_dir, 'bin/hadoop/bin/hadoop')

    # Verify API server
    if server is None:
        raise Exception('ESPA API did not respond... exiting')

    user = server.get_configuration('landsatds.username')
    if user is None:
        raise Exception('landsatds.username is not defined... exiting')

    password = server.get_configuration('landsatds.password')
    if password is None:
        raise Exception('landsatds.password is not defined... exiting')

    # The host value is not used below; it is only verified to be defined
    host = server.get_configuration('landsatds.host')
    if host is None:
        raise Exception('landsatds.host is not defined... exiting')

    # Use ondemand_enabled to determine if we should be processing or not
    ondemand_enabled = server.get_configuration('system.ondemand_enabled')

    # Determine the appropriate hadoop queue to use
    hadoop_job_queue = get_queue_name(cron_cfg, queue_priority)

    if not ondemand_enabled.lower() == 'true':
        raise Exception('on demand disabled... exiting')

    # Create a partial function to reduce duplication in some of the
    # following code
    proc_cmdenv = partial(gen_cmdenv_from_cfg,
                          cfg=proc_cfg, section='processing')

    try:
        logger.info('Checking for requests to process...')
        requests = server.get_scenes_to_process(int(args.limit), args.user,
                                                request_priority,
                                                list(args.product_types))
        if requests:
            # Figure out the name of the order file
            stamp = datetime.now()
            job_name = ('{0}_{1}_{2}_{3}_{4}_{5}-{6}-espa_job'
                        .format(stamp.month, stamp.day, stamp.year,
                                stamp.hour, stamp.minute, stamp.second,
                                queue_priority))

            logger.info(' '.join(['Found requests to process,',
                                  'generating job name:', job_name]))

            job_filename = '{0}.txt'.format(job_name)
            job_filepath = os.path.join('/tmp', job_filename)

            # Create the order file full of all the scenes requested
            # NOTE(review): this file holds credentials and is created in
            # /tmp with default permissions -- consider restricting them.
            with open(job_filepath, 'w+') as espa_fd:
                for request in requests:
                    request['espa_api'] = rpcurl

                    # Log the request before passwords are added
                    logger.info(json.dumps(request))

                    # Add the usernames and passwords to the options
                    request['options']['source_username'] = user
                    request['options']['destination_username'] = user
                    request['options']['source_pw'] = password
                    request['options']['destination_pw'] = password

                    # Serialize again since the password entries that could
                    # not be logged were added; one request per line
                    espa_fd.write(json.dumps(request) + '\n')

            # Specify the location of the order file on the hdfs
            hdfs_target = os.path.join('requests', job_filename)

            # Define command line to store the job file in hdfs
            hadoop_store_command = [hadoop_executable, 'dfs',
                                    '-copyFromLocal', job_filepath,
                                    hdfs_target]

            jars_path = os.path.join(home_dir, 'bin/hadoop/contrib/streaming',
                                     'hadoop-streaming*.jar')

            code_dir = os.path.join(home_dir, 'espa-site/processing')

            # Specify the mapper application
            mapper_path = os.path.join(code_dir, 'ondemand_mapper.py')

            # Python modules shipped to the job alongside the mapper
            support_modules = ('api_interface.py', 'config_utils.py',
                               'distribution.py', 'environment.py',
                               'espa_exception.py', 'initialization.py',
                               'landsat_metadata.py', 'logging_tools.py',
                               'parameters.py', 'processor.py', 'sensor.py',
                               'settings.py', 'staging.py', 'statistics.py',
                               'transfer.py', 'utilities.py', 'warp.py')

            # Processing configuration options passed to the job environment
            cmdenv_options = ('espa_work_dir', 'espa_distribution_method',
                              'espa_distribution_dir', 'espa_schema',
                              'espa_land_mass_polygon', 'espa_api',
                              'espa_cache_host_list', 'espa_elevation_dir',
                              'ias_data_dir', 'pythonpath', 'ledaps_aux_dir',
                              'l8_aux_dir', 'esun', 'lst_aux_dir',
                              'lst_data_dir', 'modtran_path',
                              'modtran_data_dir', 'aster_ged_server_name')

            # Define command line to execute the hadoop job
            # Be careful it is possible to have conflicts between module names
            hadoop_run_command = [
                hadoop_executable, 'jar', jars_path,
                '-D', ('mapred.task.timeout={0}'
                       .format(cron_cfg.getint('hadoop', 'timeout'))),
                '-D', 'mapred.reduce.tasks=0',
                '-D', 'mapred.job.queue.name={0}'.format(hadoop_job_queue),
                '-D', 'mapred.job.name="{0}"'.format(job_name),
                '-inputformat', ('org.apache.hadoop.mapred.'
                                 'lib.NLineInputFormat'),
                '-file', mapper_path]
            for module in support_modules:
                hadoop_run_command.extend(
                    ['-file', os.path.join(code_dir, module)])
            # When Hadoop kicks off a job task, it doesn't set $HOME
            # However matplotlib requires it to be set
            hadoop_run_command.extend(['-mapper', mapper_path,
                                       '-cmdenv', 'HOME=$HOME'])
            for option in cmdenv_options:
                hadoop_run_command.extend(
                    ['-cmdenv', proc_cmdenv(option=option)])
            hadoop_run_command.extend(['-input', hdfs_target,
                                       '-output', hdfs_target + '-out'])

            # Define the executables to clean up hdfs
            hadoop_delete_request_command1 = [hadoop_executable, 'dfs',
                                              '-rmr', hdfs_target]
            hadoop_delete_request_command2 = [hadoop_executable, 'dfs',
                                              '-rmr', hdfs_target + '-out']

            logger.info('Storing request file to hdfs...')
            output = ''
            try:
                cmd = ' '.join(hadoop_store_command)
                logger.info('Store cmd:{0}'.format(cmd))

                output = execute_cmd(cmd)
            except Exception:
                msg = 'Error storing files to HDFS... exiting'
                # Record the underlying failure before it is replaced
                logger.exception(msg)
                raise Exception(msg)
            finally:
                if output:
                    logger.info(output)

                # The local copy is removed whether or not the store worked
                logger.info('Deleting local request file copy [{0}]'
                            .format(job_filepath))
                os.unlink(job_filepath)

            try:
                # Update the scene list as queued so they don't get pulled
                # down again now that these jobs have been stored in hdfs
                product_list = list()
                for request in requests:
                    product_list.append((request['orderid'],
                                         request['scene']))
                    logger.info('Adding scene:{0} orderid:{1} to queued list'
                                .format(request['scene'],
                                        request['orderid']))

                server.queue_products(product_list, 'CDR_ECV cron driver',
                                      job_name)

                logger.info('Running hadoop job...')
                output = ''
                try:
                    cmd = ' '.join(hadoop_run_command)
                    logger.info('Run cmd:{0}'.format(cmd))

                    output = execute_cmd(cmd)
                except Exception:
                    logger.exception('Error running Hadoop job...')
                finally:
                    if output:
                        logger.info(output)

            finally:
                # Clean up hdfs even when queueing or the job itself failed
                logger.info('Deleting hadoop job request file from hdfs....')
                output = ''
                try:
                    cmd = ' '.join(hadoop_delete_request_command1)

                    output = execute_cmd(cmd)
                except Exception:
                    logger.exception("Error deleting hadoop job request file")
                finally:
                    if output:
                        logger.info(output)

                logger.info('Deleting hadoop job output...')
                output = ''
                try:
                    cmd = ' '.join(hadoop_delete_request_command2)

                    output = execute_cmd(cmd)
                except Exception:
                    logger.exception('Error deleting hadoop job output')
                finally:
                    if output:
                        logger.info(output)

        else:
            logger.info('No requests to process....')

    except api_interface.APIException:
        logger.exception('A protocol error occurred')

    except Exception:
        logger.exception('Error Processing Ondemand Requests')

    finally:
        server = None
def process_requests(cron_cfg, proc_cfg, args,
                     queue_priority, request_priority):
    """Retrieves and kicks off processes

    Queries the API service to see if there are any requests that need to be
    processed with the specified type, priority and/or user.  If there are,
    this method builds and executes a hadoop (YARN) streaming job and updates
    the status for each request through the API service.

    Args:
        cron_cfg (ConfigParser): Configuration for ESPA cron.
        proc_cfg (ConfigParser): Configuration for ESPA processing.
        args (struct): The arguments retrieved from the command line
            (``limit``, ``user`` and ``product_types`` are used).
        queue_priority (str): The queue to use or None.
        request_priority (str): The request to use or None.

    Returns:
        Nothing is returned.

    Raises:
        Exception(message): If HOME is not set, the running application
            count cannot be determined, the API URL is missing/invalid, the
            API does not respond, a required landsatds.* setting is
            undefined, or on demand processing is disabled.  Errors raised
            after the request query starts are logged and not re-raised.
    """

    # Get the logger for this task
    logger = logging.getLogger(LOGGER_NAME)

    # Define path to hadoop commandline executables.  Everything is derived
    # from HOME; fail with a clear message instead of an obscure error from
    # os.path.join(None, ...)
    home_dir = os.environ.get('HOME')
    if home_dir is None:
        raise Exception('HOME environment variable is not defined...'
                        ' exiting')
    yarn_executable = os.path.join(home_dir, 'bin/hadoop/bin/yarn')
    hdfs_executable = os.path.join(home_dir, 'bin/hadoop/bin/hdfs')
    jars_path = os.path.join(home_dir, 'bin/hadoop/share/hadoop/tools/lib/',
                             'hadoop-streaming-*.jar')

    # check the number of hadoop jobs and don't do anything if they
    # are over a limit
    job_limit = cron_cfg.getint('hadoop', 'max_jobs')
    yarn_running_apps_command = [yarn_executable, "application", "-list"]
    try:
        cmd = ' '.join(yarn_running_apps_command)
        app_states = execute_cmd(cmd)

        # Get "total applications: N" output line from YARN; the count is
        # whatever follows the last colon on that line
        total_lines = [text for text in app_states.split('\n')
                       if 'Total number of applications' in text]
        running_line = total_lines.pop()
        job_count = running_line.split(':')[-1]
    except Exception as e:
        # Exceptions have no 'message' attribute on Python 3, so fall back
        # to str()
        message = getattr(e, 'message', '') or str(e)
        if 'Stdout/Stderr is: 0' in message:
            job_count = 0
        else:
            # Bare raise keeps the original traceback
            raise

    if int(job_count) >= int(job_limit):
        logger.warning('Detected {0} Hadoop jobs running'.format(job_count))
        logger.warning('No additional jobs will be run until job count'
                       ' is below {0}'.format(job_limit))
        return

    rpcurl = proc_cfg.get('processing', 'espa_api')
    server = None

    # Create a server object if the rpcurl seems valid
    if (rpcurl is not None and
            rpcurl.startswith('http://') and
            len(rpcurl) > 7):

        server = api_interface.api_connect(rpcurl)
    else:
        raise Exception('Missing or invalid environment variable ESPA_API')

    # Verify API server
    if server is None:
        raise Exception('ESPA API did not respond... exiting')

    user = server.get_configuration('landsatds.username')
    if user is None:
        raise Exception('landsatds.username is not defined... exiting')

    password = server.get_configuration('landsatds.password')
    if password is None:
        raise Exception('landsatds.password is not defined... exiting')

    # The host value is not used below; it is only verified to be defined
    host = server.get_configuration('landsatds.host')
    if host is None:
        raise Exception('landsatds.host is not defined... exiting')

    # Use ondemand_enabled to determine if we should be processing or not
    ondemand_enabled = server.get_configuration('system.ondemand_enabled')

    # Determine the appropriate hadoop queue to use
    hadoop_job_queue = get_queue_name(cron_cfg, queue_priority)

    if not ondemand_enabled.lower() == 'true':
        raise Exception('on demand disabled... exiting')

    # Create a partial function to reduce duplication in some of the
    # following code
    proc_cmdenv = partial(gen_cmdenv_from_cfg,
                          cfg=proc_cfg, section='processing')

    try:
        logger.info('Checking for requests to process...')
        requests = server.get_scenes_to_process(int(args.limit), args.user,
                                                request_priority,
                                                list(args.product_types))
        if requests:
            # Figure out the name of the order file
            stamp = datetime.now()
            job_name = ('{0:%Y-%m-%d-%H-%M-%S}-{1}-espa_job'
                        .format(stamp, queue_priority))

            logger.info(' '.join(['Found requests to process,',
                                  'generating job name:', job_name]))

            job_filename = '{0}.txt'.format(job_name)
            job_filepath = os.path.join('/tmp', job_filename)

            # Create the order file full of all the scenes requested
            # NOTE(review): this file holds credentials and is created in
            # /tmp with default permissions -- consider restricting them.
            with open(job_filepath, 'w+') as espa_fd:
                for request in requests:
                    request['espa_api'] = rpcurl

                    # Log the request before passwords are added
                    logger.info(json.dumps(request))

                    # Add the usernames and passwords to the options
                    request['options']['source_username'] = user
                    request['options']['destination_username'] = user
                    request['options']['source_pw'] = password
                    request['options']['destination_pw'] = password

                    # Serialize again since the password entries that could
                    # not be logged were added; one request per line
                    espa_fd.write(json.dumps(request) + '\n')

            # Specify the location of the order file on the hdfs
            hdfs_target = os.path.join('requests', job_filename)

            # Define command line to store the job file in hdfs
            hadoop_store_command = [hdfs_executable, 'dfs', '-put',
                                    job_filepath, hdfs_target]

            # Specify the mapper application
            code_dir = os.path.join(home_dir, 'espa-site/processing')
            mapper_path = 'processing/ondemand_mapper.py'

            # Define command line to execute the hadoop job
            # Be careful it is possible to have conflicts between module names
            #
            # When Hadoop kicks off a job task, it doesn't set $HOME
            # However matplotlib requires it to be set
            hadoop_run_command = [
                yarn_executable, 'jar', jars_path,
                '-D', ('mapred.task.timeout={0}'
                       .format(cron_cfg.getint('hadoop', 'timeout'))),
                '-D', 'mapred.reduce.tasks=0',
                '-D', 'mapred.job.queue.name={0}'.format(hadoop_job_queue),
                '-D', 'mapred.job.name="{0}"'.format(job_name),
                '-files', code_dir,
                '-mapper', mapper_path,
                '-input', hdfs_target,
                '-inputformat',
                'org.apache.hadoop.mapred.lib.NLineInputFormat',
                '-cmdenv', 'HOME={0}'.format(home_dir),
                '-output', hdfs_target + '-out']

            # Define the executables to clean up hdfs
            hadoop_delete_request_command1 = [hdfs_executable, 'dfs',
                                              '-rm', '-r', hdfs_target]
            hadoop_delete_request_command2 = [hdfs_executable, 'dfs',
                                              '-rm', '-r',
                                              hdfs_target + '-out']

            logger.info('Storing request file to hdfs...')
            output = ''
            try:
                cmd = ' '.join(hadoop_store_command)
                logger.info('Store cmd:{0}'.format(cmd))

                output = execute_cmd(cmd)
            except Exception:
                msg = 'Error storing files to HDFS... exiting'
                # Record the underlying failure before it is replaced
                logger.exception(msg)
                raise Exception(msg)
            finally:
                if output:
                    logger.info(output)

                # The local copy is removed whether or not the store worked
                logger.info('Deleting local request file copy [{0}]'
                            .format(job_filepath))
                os.unlink(job_filepath)

            try:
                # Update the scene list as queued so they don't get pulled
                # down again now that these jobs have been stored in hdfs
                product_list = list()
                for request in requests:
                    product_list.append((request['orderid'],
                                         request['scene']))
                    logger.info('Adding scene:{0} orderid:{1} to queued list'
                                .format(request['scene'],
                                        request['orderid']))

                server.queue_products(product_list, 'CDR_ECV cron driver',
                                      job_name)

                logger.info('Running hadoop job...')
                output = ''
                try:
                    cmd = ' '.join(hadoop_run_command)
                    logger.info('Run cmd:{0}'.format(cmd))

                    output = execute_cmd(cmd)
                except Exception:
                    logger.exception('Error running Hadoop job...')
                finally:
                    if output:
                        logger.info(output)

            finally:
                # Clean up hdfs even when queueing or the job itself failed
                logger.info('Deleting hadoop job request file from hdfs....')
                output = ''
                try:
                    cmd = ' '.join(hadoop_delete_request_command1)

                    output = execute_cmd(cmd)
                except Exception:
                    logger.exception("Error deleting hadoop job request file")
                finally:
                    if output:
                        logger.info(output)

                logger.info('Deleting hadoop job output...')
                output = ''
                try:
                    cmd = ' '.join(hadoop_delete_request_command2)

                    output = execute_cmd(cmd)
                except Exception:
                    logger.exception('Error deleting hadoop job output')
                finally:
                    if output:
                        logger.info(output)

        else:
            logger.info('No requests to process....')

    except api_interface.APIException:
        logger.exception('A protocol error occurred')

    except Exception:
        logger.exception('Error Processing Ondemand Requests')

    finally:
        server = None