def transfer_in(args, parser):
    """transfer in command

    The transfer in command will transfer files from a remote Globus endpoint
    to the cluster Globus endpoint.

    When used with --proc-id, the files defined in the PROC_ID file will be
    transferred.

    When used with --filter, the files returned by the API query will be
    transferred.
    """
    emop_transfer = EmopTransfer(args.config_path)
    endpoint_check = emop_transfer.check_endpoints()
    if not endpoint_check:
        print("ERROR: Not all endpoints are activated.")
        sys.exit(1)

    # Initialize so the exit logic below does not raise NameError when
    # neither --proc-id nor --filter was given.
    task_id = None
    if args.proc_id:
        task_id = emop_transfer.stage_in_proc_ids(proc_ids=[args.proc_id], wait=args.wait)
        if task_id:
            print("Transfer submitted: %s" % task_id)
        else:
            print("ERROR: Failed to submit transfer")
    elif args.filter:
        emop_query = EmopQuery(args.config_path)
        pending_pages = emop_query.pending_pages(q_filter=args.filter)  # , r_filter='page.pg_image_path,pg_ground_truth_file')
        task_id = emop_transfer.stage_in_data(data=pending_pages, wait=args.wait)
        if task_id:
            print("Transfer submitted: %s" % task_id)
        else:
            print("ERROR: Failed to submit transfer")

    if task_id:
        sys.exit(0)
    else:
        sys.exit(1)
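
# The handlers in this module take (args, parser), which matches the argparse
# sub-parser pattern of dispatching through set_defaults(func=...).  The
# function below is only a minimal illustrative sketch of how the transfer in
# subcommand could be wired up; the option and destination names are inferred
# from the attributes the handler reads (args.proc_id, args.filter, args.wait,
# args.config_path), not copied from the real emop CLI definition.
def _example_transfer_in_parser():
    import argparse
    parser = argparse.ArgumentParser(prog="emop")
    parser.add_argument("--config-path", dest="config_path", default="config.ini")
    subparsers = parser.add_subparsers()
    transfer_in_parser = subparsers.add_parser("transfer-in")
    transfer_in_parser.add_argument("--proc-id", dest="proc_id")
    transfer_in_parser.add_argument("--filter", dest="filter")
    transfer_in_parser.add_argument("--wait", action="store_true")
    # Dispatch: the selected sub-parser's handler is called as func(args, parser)
    transfer_in_parser.set_defaults(func=transfer_in)
    return parser

# Example dispatch (hypothetical invocation):
#     parser = _example_transfer_in_parser()
#     args = parser.parse_args(["transfer-in", "--proc-id", "PROC_ID"])
#     args.func(args, parser)
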
def query(args, parser):
    """query command

    The query subcommand behavior is determined by the --pending-pages and
    --avg-runtimes arguments.

    When --pending-pages is passed, the number of pending pages is printed.
    This query can be modified via the --filter argument.

    When --avg-runtimes is passed, the log files from emop-controller are
    parsed and average runtimes are printed.
    """
    emop_query = EmopQuery(args.config_path)
    # --pending-pages
    if args.query_pending_pages:
        pending_pages = emop_query.pending_pages_count(q_filter=args.filter)
        # pending_pages may legitimately be 0; only None indicates a failed query
        if pending_pages == 0 or pending_pages:
            print("Number of pending pages: %s" % pending_pages)
        else:
            print("ERROR: querying pending pages failed")
            sys.exit(1)
    # --avg-runtimes
    if args.query_avg_runtimes:
        avg_runtimes = emop_query.get_runtimes()
        if avg_runtimes:
            print("Pages completed: %d" % avg_runtimes["total_pages"])
            print("Total Page Runtime: %d seconds" % avg_runtimes["total_page_runtime"])
            print("Average Page Runtime: %d seconds" % avg_runtimes["average_page_runtime"])
            print("Jobs completed: %d" % avg_runtimes["total_jobs"])
            print("Average Job Runtime: %d seconds" % avg_runtimes["average_job_runtime"])
            print("Processes:")
            for process in avg_runtimes["processes"]:
                print("\t%s completed: %d" % (process["name"], process["count"]))
                print("\t%s Average: %d seconds" % (process["name"], process["avg"]))
                print("\t%s Total: %d seconds" % (process["name"], process["total"]))
        else:
            print("ERROR: querying average page runtimes failed")
            sys.exit(1)
    sys.exit(0)
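
# For reference, query() above expects EmopQuery.get_runtimes() to return a
# dict shaped like the one asserted in the test suite (the values below are
# illustrative, taken from the test expectations):
#
#     {
#         "total_pages": 10,
#         "total_page_runtime": 630.943,      # seconds
#         "average_page_runtime": 63.094,     # seconds
#         "total_jobs": 1,
#         "average_job_runtime": 631.018,     # seconds
#         "processes": [
#             {"name": "OCR", "count": 10, "total": 69.422, "avg": 6.942},
#             # ... one entry per process type
#         ],
#     }
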
def submit(args, parser):
    """submit command

    This command will submit jobs based on the various arguments.

    Arguments used:
        --num-jobs: Number of jobs to submit.  The default value is
            determined based on optimization logic.
        --pages-per-job: Number of pages per job.  The default value is
            determined based on optimization logic.
        --no-schedule: Currently unused
        --sim: Simulate and print the job optimization and commands to use,
            but do not actually submit jobs
        --filter: Filter to use when querying pending pages

    The default logic of this function is to determine the optimal number of
    jobs and pages per job.  Once this is determined, the necessary number of
    pages is reserved via the Dashboard API and the returned API results are
    saved to a PROC_ID input file.

    All of the reserved jobs' PROC_IDs are then processed: a Globus transfer
    is initiated and a transfer job is submitted to wait for the transfer to
    complete.

    All of the reserved page PROC_IDs are submitted as batch jobs that depend
    on the transfer job completing before they start.
    """
    # Ensure --num-jobs and --pages-per-job are both present
    # if either is used
    if (args.num_jobs and not args.pages_per_job
            or not args.num_jobs and args.pages_per_job):
        print("--num-jobs and --pages-per-job must be used together")
        parser.print_help()
        sys.exit(1)

    emop_submit = EmopSubmit(args.config_path)
    emop_query = EmopQuery(args.config_path)
    pending_pages = emop_query.pending_pages_count(q_filter=args.filter)

    # Exit if no pages to run
    if pending_pages == 0:
        print("No work to be done")
        sys.exit(0)
    if not pending_pages:
        print("Error querying pending pages")
        sys.exit(1)

    # Exit if the number of submitted jobs has reached the limit.
    # Default to 0 so optimize_submit() still has a value when the
    # scheduler is not consulted.
    current_job_count = 0
    if args.schedule:
        current_job_count = emop_submit.scheduler.current_job_count()
        if current_job_count >= emop_submit.settings.max_jobs:
            print("Job limit of %s reached." % emop_submit.settings.max_jobs)
            sys.exit(0)

    # Optimize job submission if --pages-per-job and --num-jobs were not set
    if not args.pages_per_job and not args.num_jobs:
        num_jobs, pages_per_job = emop_submit.optimize_submit(pending_pages, current_job_count, sim=args.submit_simulate)
    else:
        num_jobs = args.num_jobs
        pages_per_job = args.pages_per_job

    if args.submit_simulate:
        sys.exit(0)

    # Verify transfers are possible
    emop_transfer = EmopTransfer(args.config_path)
    endpoint_check = emop_transfer.check_endpoints(fail_on_warn=True)
    if not endpoint_check:
        print("ERROR: Not all endpoints are activated or activation expires soon.")
        sys.exit(1)

    # Loop that performs the actual reservations
    proc_ids = []
    for i in xrange(num_jobs):
        proc_id = emop_submit.reserve(num_pages=pages_per_job, r_filter=args.filter)
        if not proc_id:
            print("ERROR: Failed to reserve page")
            continue
        proc_ids.append(proc_id)

    if proc_ids:
        if args.transfer:
            task_id = emop_transfer.stage_in_proc_ids(proc_ids=proc_ids, wait=False)
            transfer_job_id = emop_submit.scheduler.submit_transfer_job(task_id=task_id)
        else:
            transfer_job_id = None
        for proc_id in proc_ids:
            job_id = emop_submit.scheduler.submit_job(proc_id=proc_id, num_pages=pages_per_job, dependency=transfer_job_id)
            emop_submit.set_job_id(proc_id=proc_id, job_id=job_id)
    sys.exit(0)
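
# The actual optimization lives in EmopSubmit.optimize_submit(); the helper
# below is only a hypothetical sketch of the kind of calculation involved:
# cap the job count at the remaining scheduler slots and spread the pending
# pages evenly across those jobs.  The names and the max_pages_per_job bound
# are assumptions for illustration, not the real implementation.
def _example_optimize(pending_pages, current_job_count, max_jobs, max_pages_per_job=100):
    available_slots = max(max_jobs - current_job_count, 0)
    if available_slots == 0 or pending_pages == 0:
        return 0, 0
    # Fewest jobs that keep each job at or below the per-job page cap
    num_jobs = min(available_slots, -(-pending_pages // max_pages_per_job))
    pages_per_job = -(-pending_pages // num_jobs)  # ceiling division
    return num_jobs, pages_per_job

# Example with hypothetical numbers: 950 pending pages, 2 running jobs,
# job limit of 10:
#     _example_optimize(950, 2, 10)  ->  (8, 119)
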
class TestEmopQuery(TestCase):
    def setUp(self):
        self.query = EmopQuery(config_path=default_config_path())

    def tearDown(self):
        pass

    def test_get_job_status_id(self):
        mock_response = {
            "total": 1,
            "subtotal": 1,
            "page": 1,
            "per_page": 1,
            "total_pages": 1,
            "results": [
                {
                    "id": 1,
                    "name": "Not Started"
                },
            ]
        }
        self.query.emop_api.get_request = MagicMock()
        self.query.emop_api.get_request.return_value = mock_response
        retval = self.query._get_job_status_id()
        self.assertEqual(1, retval)

    def test_pending_pages_count(self):
        mock_response = {
            "job_queue": {
                "count": 2
            }
        }
        self.query._get_job_status_id = MagicMock()
        self.query._get_job_status_id.return_value = 1
        self.query.emop_api.get_request = MagicMock()
        self.query.emop_api.get_request.return_value = mock_response
        retval = self.query.pending_pages_count(q_filter='{"batch_id": 1}')
        self.assertEqual(2, retval)

    def test_pending_pages_1(self):
        mock_response = load_fixture_file('job_queues_1.json')
        self.query._get_job_status_id = MagicMock()
        self.query._get_job_status_id.return_value = 1
        self.query.emop_api.get_request = MagicMock()
        self.query.emop_api.get_request.return_value = mock_response
        retval = self.query.pending_pages(q_filter='{"batch_id": 1}')
        self.assertEqual(mock_response['results'], retval)

    @xfail
    def test_pending_pages_2(self):
        mock_response = load_fixture_file('job_queues_1.json')
        expected = [
            {
                'page': {
                    "pg_ground_truth_file": "/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/1.txt",
                    'pg_image_path': '/data/eebo/e0006/40099/00001.000.001.tif',
                },
            },
            {
                'page': {
                    "pg_ground_truth_file": "/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/2.txt",
                    'pg_image_path': '/data/eebo/e0006/40099/00002.000.001.tif',
                },
            },
        ]
        self.query._get_job_status_id = MagicMock()
        self.query._get_job_status_id.return_value = 1
        self.query.emop_api.get_request = MagicMock()
        self.query.emop_api.get_request.return_value = mock_response
        retval = self.query.pending_pages(q_filter='{"batch_id": 1}', r_filter='page.pg_image_path,pg_ground_truth_file')
        self.maxDiff = None
        self.assertEqual(expected, retval)

    def test_get_runtimes_1(self):
        expected = {
            'total_pages': 10,
            'total_page_runtime': 630.943,
            'average_page_runtime': 63.094,
            'total_jobs': 1,
            'average_job_runtime': 631.018,
            'processes': [
                {'name': "OCR", 'count': 10, 'total': 69.422, 'avg': 6.942},
                {'name': "Denoise", 'count': 10, 'total': 69.579, 'avg': 6.958},
                {'name': "MultiColumnSkew", 'count': 10, 'total': 69.331, 'avg': 6.933},
                {'name': "XML_To_Text", 'count': 10, 'total': 0.205, 'avg': 0.021},
                {'name': "PageEvaluator", 'count': 10, 'total': 9.852, 'avg': 0.985},
                {'name': "PageCorrector", 'count': 10, 'total': 402.345, 'avg': 40.234},
                {'name': "JuxtaCompare", 'count': 10, 'total': 10.118, 'avg': 1.012},
            ],
        }
        self.query.settings.scheduler_logdir = os.path.dirname(fixture_file('log-1.out'))
        retval = self.query.get_runtimes()
        self.maxDiff = None
        self.assertEqual(expected, retval)