def process_requests(args, logger_name, queue_priority, request_priority):
    '''
    Description:
      Queries the xmlrpc service to see if there are any requests that need
      to be processed with the specified type, priority and/or user.  If
      there are, this method builds and executes a hadoop job and updates
      the status for each request through the xmlrpc service.
    '''

    # Get the logger for this task
    logger = EspaLogging.get_logger(logger_name)

    # Check the number of hadoop jobs and don't do anything if they are
    # over a limit
    job_limit = settings.HADOOP_MAX_JOBS
    cmd = "hadoop job -list|awk '{print $1}'|grep -c job 2>/dev/null"
    try:
        job_count = utilities.execute_cmd(cmd)
    except Exception as e:
        errmsg = 'Stdout/Stderr is: 0'
        if errmsg in str(e):
            job_count = 0
        else:
            raise
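
    # Not part of the original excerpt: the comment above says to skip
    # processing when the running job count is over the limit, so a minimal
    # sketch of that check follows (it assumes job_count parses as an
    # integer).
    if int(job_count) >= int(job_limit):
        logger.warning("Hadoop job limit [%s] reached... skipping" % job_limit)
        return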
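
    # Everything below talks to the ESPA XML-RPC service: it supplies the
    # requests to process, the landsat datasource credentials, and the
    # on-demand enable/disable flag.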
    rpcurl = os.environ.get('ESPA_XMLRPC')
    server = None

    # Create a server object if the rpcurl seems valid
    if (rpcurl is not None
            and rpcurl.startswith('http://')
            and len(rpcurl) > 7):

        server = xmlrpclib.ServerProxy(rpcurl, allow_none=True)
    else:
        raise Exception("Missing or invalid environment variable ESPA_XMLRPC")

    home_dir = os.environ.get('HOME')
    hadoop_executable = "%s/bin/hadoop/bin/hadoop" % home_dir

    # Verify xmlrpc server
    if server is None:
        msg = "xmlrpc server was None... exiting"
        raise Exception(msg)

    user = server.get_configuration('landsatds.username')
    if len(user) == 0:
        msg = "landsatds.username is not defined... exiting"
        raise Exception(msg)

    pw = urllib.quote(server.get_configuration('landsatds.password'))
    if len(pw) == 0:
        msg = "landsatds.password is not defined... exiting"
        raise Exception(msg)

    host = server.get_configuration('landsatds.host')
    if len(host) == 0:
        msg = "landsatds.host is not defined... exiting"
        raise Exception(msg)

    # Use ondemand_enabled to determine if we should be processing or not
    ondemand_enabled = server.get_configuration('ondemand_enabled')

    # Determine the appropriate hadoop queue to use
    hadoop_job_queue = settings.HADOOP_QUEUE_MAPPING[queue_priority]

    if not ondemand_enabled.lower() == 'true':
        raise Exception("on demand disabled... exiting")

    try:
        logger.info("Checking for requests to process...")
        requests = server.get_scenes_to_process(int(args.limit), args.user,
                                                request_priority,
                                                list(args.product_types))
        if requests:
            # Figure out the name of the order file
            stamp = datetime.now()
            job_name = ('%s_%s_%s_%s_%s_%s-%s-espa_job'
                        % (str(stamp.month), str(stamp.day), str(stamp.year),
                           str(stamp.hour), str(stamp.minute),
                           str(stamp.second), str(queue_priority)))

            logger.info(' '.join(["Found requests to process,",
                                  "generating job name:", job_name]))

            job_filename = '%s%s' % (job_name, '.txt')
            job_filepath = os.path.join('/tmp', job_filename)

            # Create the order file full of all the scenes requested
            with open(job_filepath, 'w+') as espa_fd:
                for request in requests:
                    (orderid, options) = (request['orderid'],
                                          request['options'])

                    request['xmlrpcurl'] = rpcurl

                    # Log the requested options before passwords are added
                    line_entry = json.dumps(request)
                    logger.info(line_entry)

                    # Add the usernames and passwords to the options
                    options['source_username'] = user
                    options['destination_username'] = user
                    options['source_pw'] = pw
                    options['destination_pw'] = pw

                    request['options'] = options

                    # Need to refresh since we added password stuff that
                    # could not be logged
                    line_entry = json.dumps(request)

                    # Pad the entry so hadoop will properly split the jobs
                    filler_count = (settings.ORDER_BUFFER_LENGTH
                                    - len(line_entry))
                    request_line = ''.join([line_entry,
                                            ('#' * filler_count), '\n'])

                    # Write out the request line
                    espa_fd.write(request_line)
                # END - for scene
            # END - with espa_fd

            # Specify the location of the order file on the hdfs
            hdfs_target = 'requests/%s' % job_filename

            # Specify the mapper application
            mapper = "ondemand_mapper.py"

            # Define command line to store the job file in hdfs
            hadoop_store_command = [hadoop_executable, 'dfs',
                                    '-copyFromLocal', job_filepath,
                                    hdfs_target]

            jars = os.path.join(home_dir, 'bin/hadoop/contrib/streaming',
                                'hadoop-streaming*.jar')

            # Define command line to execute the hadoop job
            # Be careful it is possible to have conflicts between module names
            hadoop_run_command = \
                [hadoop_executable, 'jar', jars,
                 '-D', 'mapred.task.timeout=%s' % settings.HADOOP_TIMEOUT,
                 '-D', 'mapred.reduce.tasks=0',
                 '-D', 'mapred.job.queue.name=%s' % hadoop_job_queue,
                 '-D', 'mapred.job.name="%s"' % job_name,
                 '-file', '%s/espa-site/processing/%s' % (home_dir, mapper),
                 '-file', '%s/espa-site/processing/processor.py' % home_dir,
                 '-file', '%s/espa-site/processing/browse.py' % home_dir,
                 '-file', '%s/espa-site/processing/environment.py' % home_dir,
                 '-file', ('%s/espa-site/processing/initialization.py'
                           % home_dir),
                 '-file', '%s/espa-site/processing/distribution.py' % home_dir,
                 '-file', ('%s/espa-site/processing/espa_exception.py'
                           % home_dir),
                 '-file', '%s/espa-site/processing/metadata.py' % home_dir,
                 '-file', '%s/espa-site/processing/parameters.py' % home_dir,
                 '-file', '%s/espa-site/processing/solr.py' % home_dir,
                 '-file', '%s/espa-site/processing/staging.py' % home_dir,
                 '-file', '%s/espa-site/processing/statistics.py' % home_dir,
                 '-file', '%s/espa-site/processing/transfer.py' % home_dir,
                 '-file', '%s/espa-site/processing/warp.py' % home_dir,
                 '-file', ('%s/espa-site/espa_common/logger_factory.py'
                           % home_dir),
                 '-file', '%s/espa-site/espa_common/sensor.py' % home_dir,
                 '-file', '%s/espa-site/espa_common/settings.py' % home_dir,
                 '-file', '%s/espa-site/espa_common/utilities.py' % home_dir,
                 '-mapper', '%s/espa-site/processing/%s' % (home_dir, mapper),
                 '-cmdenv', 'ESPA_WORK_DIR=$ESPA_WORK_DIR',
                 '-cmdenv', 'HOME=$HOME',
                 '-cmdenv', 'USER=$USER',
                 '-cmdenv', 'LEDAPS_AUX_DIR=$LEDAPS_AUX_DIR',
                 '-cmdenv', 'L8_AUX_DIR=$L8_AUX_DIR',
                 '-cmdenv', 'ESUN=$ESUN',
                 '-input', hdfs_target,
                 '-output', hdfs_target + '-out']

            # Define the executables to clean up hdfs
            hadoop_delete_request_command1 = [hadoop_executable, 'dfs',
                                              '-rmr', hdfs_target]
            hadoop_delete_request_command2 = [hadoop_executable, 'dfs',
                                              '-rmr', hdfs_target + '-out']

            # ----------------------------------------------------------------
            logger.info("Storing request file to hdfs...")
            output = ''
            try:
                cmd = ' '.join(hadoop_store_command)
                logger.info("Store cmd:%s" % cmd)
                output = utilities.execute_cmd(cmd)
            except Exception:
                msg = "Error storing files to HDFS... exiting"
                raise Exception(msg)
            finally:
                if len(output) > 0:
                    logger.info(output)

            logger.info("Deleting local request file copy [%s]" % job_filepath)
            os.unlink(job_filepath)

            try:
                # ------------------------------------------------------------
                # Update the scene list as queued so they don't get pulled
                # down again now that these jobs have been stored in hdfs
                product_list = list()
                for request in requests:
                    orderid = request['orderid']
                    sceneid = request['scene']
                    product_list.append((orderid, sceneid))

                    logger.info("Adding scene:%s orderid:%s to queued list"
                                % (sceneid, orderid))

                server.queue_products(product_list, 'CDR_ECV cron driver',
                                      job_name)

                # ------------------------------------------------------------
                logger.info("Running hadoop job...")
                output = ''
                try:
                    cmd = ' '.join(hadoop_run_command)
                    logger.info("Run cmd:%s" % cmd)
                    output = utilities.execute_cmd(cmd)
                except Exception:
                    logger.exception("Error running Hadoop job...")
                finally:
                    if len(output) > 0:
                        logger.info(output)
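
                # Not part of the original excerpt: the two delete commands
                # defined earlier exist to clean up the request files on hdfs,
                # so a cleanup along these lines would typically follow the
                # job run (a sketch only, reusing the commands defined above).
                for cleanup_command in (hadoop_delete_request_command1,
                                        hadoop_delete_request_command2):
                    output = ''
                    try:
                        cmd = ' '.join(cleanup_command)
                        output = utilities.execute_cmd(cmd)
                    except Exception:
                        logger.exception("Error cleaning up hdfs...")
                    finally:
                        if len(output) > 0:
                            logger.info(output)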