예제 #1
0
def transfer_in(args, parser):
    """Transfer in command.

    Transfers files from a remote Globus endpoint to the cluster Globus endpoint.

    When used with --proc-id, the files defined in the PROC_ID file will be transferred.

    When used with --filter, the files returned by API query will be transferred.

    Args:
        args: Parsed command-line arguments.  Reads ``config_path``, ``proc_id``,
            ``filter`` and ``wait``.
        parser: Argument parser of the command (not used by this function).

    This function never returns: it exits the process with status 0 when a
    transfer task id was obtained, and with status 1 when the endpoints are
    not activated, the submission failed, or neither --proc-id nor --filter
    was supplied.
    """
    emop_transfer = EmopTransfer(args.config_path)
    if not emop_transfer.check_endpoints():
        print("ERROR: Not all endpoints are activated.")
        sys.exit(1)

    # Bind task_id up front so that a call with neither option ends in a clean
    # non-zero exit instead of an UnboundLocalError at the final check.
    task_id = None
    if args.proc_id:
        task_id = emop_transfer.stage_in_proc_ids(proc_ids=[args.proc_id], wait=args.wait)
        if task_id:
            print("Transfer submitted: %s" % task_id)
        else:
            print("Error: Failed to submit transfer")
    elif args.filter:
        emop_query = EmopQuery(args.config_path)
        # A narrower result filter was once passed here:
        # r_filter='page.pg_image_path,pg_ground_truth_file'
        pending_pages = emop_query.pending_pages(q_filter=args.filter)
        task_id = emop_transfer.stage_in_data(data=pending_pages, wait=args.wait)
        if task_id:
            print("Transfer submitted: %s" % task_id)
        else:
            print("ERROR: Failed to submit transfer")
    else:
        print("ERROR: Either --proc-id or --filter is required")

    sys.exit(0 if task_id else 1)
예제 #2
0
def transfer_test(args, parser):
    """Transfer test command.

    The transfer test command can be used to verify transfers are functional
    between the cluster and remote Globus endpoints.

    The tasks performed:
    * Verify the endpoints via ``check_endpoints`` (exit status 1 on failure)
    * Write a small test file into the current user's home directory
    * Transfer the test file from cluster endpoint to remote endpoint
    * Display task and wait 2 minutes for task to complete

    The ``ls`` checks of ``/~/`` on both endpoints are currently disabled
    (see the commented-out section below).

    Args:
        args: Parsed command-line arguments.  Reads ``config_path`` and ``wait``.
        parser: Argument parser of the command (not used by this function).
    """
    emop_transfer = EmopTransfer(args.config_path)
    if not emop_transfer.check_endpoints():
        sys.exit(1)

    # _fail = False
    # ls_test_path = '/~/'
    # print("Testing ls ability of %s:%s" % (emop_transfer.cluster_endpoint, ls_test_path))
    # cluster_ls_data = emop_transfer.ls(emop_transfer.cluster_endpoint, ls_test_path)
    # if not cluster_ls_data:
    #     print("ERROR: ls of %s:%s" % (emop_transfer.cluster_endpoint, ls_test_path))
    #     _fail = True
    # print("Testing ls ability of %s:%s" % (emop_transfer.remote_endpoint, ls_test_path))
    # remote_ls_data = emop_transfer.ls(emop_transfer.remote_endpoint, ls_test_path)
    # if not remote_ls_data:
    #     print("ERROR: ls of %s:%s" % (emop_transfer.remote_endpoint, ls_test_path))
    #     _fail = True
    #
    # if _fail:
    #     sys.exit(1)

    print("Generating test files")
    test_input = "~/test-in.txt"
    test_output = "~/test-out.txt"
    # The context manager guarantees the handle is closed even if the write fails.
    with open(os.path.expanduser(test_input), "w+") as test_file:
        test_file.write("TEST")

    # The unexpanded "~" paths are handed to the transfer on purpose, exactly
    # as before; only the local file creation needs the expanded path.
    transfer_data = [{"src": test_input, "dest": test_output}]
    task_id = emop_transfer.start(
        src=emop_transfer.cluster_endpoint,
        dest=emop_transfer.remote_endpoint,
        data=transfer_data,
        label="emop-test",
        wait=args.wait,
    )
    # Without a task id there is nothing to display; report it like the other
    # transfer commands do instead of querying a non-existent task.
    if not task_id:
        print("ERROR: Failed to submit transfer")
        sys.exit(1)
    emop_transfer.display_task(task_id=task_id, wait=120)
예제 #3
0
def transfer_status(args, parser):
    """Transfer status command.

    Without --task-id, this only checks the cluster and remote Globus
    endpoints via ``check_endpoints``.

    With --task-id, the given Globus transfer task is displayed via
    ``display_task``; the value of --wait is forwarded to that call.

    The process exits with status 0 when the performed check returned a
    truthy value and with status 1 otherwise.
    """
    transfer = EmopTransfer(args.config_path)

    if not args.task_id:
        ok = transfer.check_endpoints()
    else:
        ok = transfer.display_task(task_id=args.task_id, wait=args.wait)

    sys.exit(0 if ok else 1)
예제 #4
0
def transfer_out(args, parser):
    """Transfer out command.

    Transfers files from the cluster Globus endpoint to a remote Globus endpoint.

    Currently this command will only transfer out when passed the --proc-id
    argument, and will parse the PROC_ID file for files to transfer.

    Args:
        args: Parsed command-line arguments.  Reads ``config_path`` and ``proc_id``.
        parser: Argument parser of the command (not used by this function).

    This function never returns: it exits the process with status 0 when a
    transfer task id was obtained, and with status 1 when the endpoints are
    not activated, --proc-id is missing, or the submission failed.
    """
    emop_transfer = EmopTransfer(args.config_path)
    if not emop_transfer.check_endpoints():
        print("ERROR: Not all endpoints are activated.")
        # Stop here, consistent with the other transfer commands; attempting a
        # transfer against inactive endpoints cannot succeed.
        sys.exit(1)

    # Bind task_id up front so a missing --proc-id is reported as a failure
    # rather than raising UnboundLocalError below.
    task_id = None
    if args.proc_id:
        task_id = emop_transfer.stage_out_proc_id(proc_id=args.proc_id)

    if task_id:
        print("Transfer submitted: %s" % task_id)
        sys.exit(0)
    else:
        print("ERROR: Failed to submit transfer")
        sys.exit(1)
 def setup(self, tmpdir):
     """Prepare a temporary output directory, a fake Globus auth file and the EmopTransfer under test."""
     self.tmpout = str(tmpdir.mkdir("out"))

     # Fake auth token whose expiry lies one year (in seconds) from now.
     one_year = 60*60*24*365
     expires_at = int(time.time()) + one_year
     self.fake_goauth_token = 'un=test|tokenid=fake-token-id|expiry=%d' % expires_at
     token_file = tmpdir.mkdir('globus').join('globus-auth')
     token_file.write(self.fake_goauth_token)

     cfg = default_settings()
     cfg.globus_auth_file = str(token_file)

     # While the patch is active, the GlobusAPIClient name inside
     # emop.emop_transfer returns the client built here from the fake settings.
     with patch('emop.emop_transfer.GlobusAPIClient') as patched_cls:
         client = GlobusAPIClient(settings=cfg)
         patched_cls.return_value = client
         self.transfer = EmopTransfer(config_path=default_config_path())
예제 #6
0
def submit(args, parser):
    """Submit command.

    This command will submit jobs based on the various arguments.

    Arguments used:
        --num-jobs: Number of jobs to submit.  The default value is determined based on optimization logic.
        --pages-per-job: Number of pages per job.  The default value is determined based on optimization logic.
        --no-schedule: When scheduling is disabled (``args.schedule`` is falsy) the job limit check is skipped.
        --sim: Simulate and print the job optimization and commands to use but do not actually submit jobs
        --filter: Filter to use when querying pending pages

    The default logic of this function is to determine optimal number of jobs and pages per job.  Once this is determined
    the necessary number of pages are reserved via Dashboard API.  The returned API results are saved to a PROC_ID input file.
    All the reserved job's PROC_IDs are then processed and, when ``args.transfer`` is set, a Globus transfer is initiated and
    a transfer job is submitted to wait for the transfer to complete.  All the reserved page PROC_IDs are submitted as batch
    jobs and will depend on the transfer job (if any) to complete before they start.

    Args:
        args: Parsed command-line arguments.  Reads ``num_jobs``, ``pages_per_job``, ``config_path``,
            ``filter``, ``schedule``, ``submit_simulate`` and ``transfer``.
        parser: Argument parser of the command; used to print the help text on invalid usage.

    This function never returns; it always ends in ``sys.exit``.  Status 1 is used for invalid
    arguments, a failed pending-pages query, or unusable endpoints; status 0 otherwise.
    """
    # --num-jobs and --pages-per-job only make sense as a pair:
    # reject the call when exactly one of them was supplied.
    if bool(args.num_jobs) != bool(args.pages_per_job):
        print("--num-jobs and --pages-per-job must be used together")
        parser.print_help()
        sys.exit(1)

    emop_submit = EmopSubmit(args.config_path)
    emop_query = EmopQuery(args.config_path)
    pending_pages = emop_query.pending_pages_count(q_filter=args.filter)

    # Exit if no pages to run
    if pending_pages == 0:
        print("No work to be done")
        sys.exit(0)

    # Any other falsy value (e.g. None) means the query itself failed.
    if not pending_pages:
        print("Error querying pending pages")
        sys.exit(1)

    # Always bind current_job_count: it is passed to optimize_submit below even
    # when scheduling is disabled, which previously raised UnboundLocalError.
    # NOTE(review): 0 assumes "no running jobs to account for" when the scheduler
    # is not consulted -- confirm this matches what optimize_submit expects.
    current_job_count = 0

    # Exit if the number of submitted jobs has reached the limit
    if args.schedule:
        current_job_count = emop_submit.scheduler.current_job_count()
        if current_job_count >= emop_submit.settings.max_jobs:
            print("Job limit of %s reached." % emop_submit.settings.max_jobs)
            sys.exit(0)

    # Optimize job submission if --pages-per-job and --num-jobs was not set
    if not args.pages_per_job and not args.num_jobs:
        num_jobs, pages_per_job = emop_submit.optimize_submit(pending_pages, current_job_count, sim=args.submit_simulate)
    else:
        num_jobs = args.num_jobs
        pages_per_job = args.pages_per_job

    # In simulation mode nothing is reserved, transferred or submitted.
    if args.submit_simulate:
        sys.exit(0)

    # Verify transfers are possible
    emop_transfer = EmopTransfer(args.config_path)
    endpoint_check = emop_transfer.check_endpoints(fail_on_warn=True)
    if not endpoint_check:
        print("ERROR: Not all endpoints are activated or activation expires soon.")
        sys.exit(1)

    # Reserve the pages for every job; a failed reservation is reported and
    # skipped so the remaining jobs can still be submitted.
    # range() is used instead of xrange() so the loop runs on Python 2 and 3.
    proc_ids = []
    for _ in range(num_jobs):
        proc_id = emop_submit.reserve(num_pages=pages_per_job, r_filter=args.filter)
        if not proc_id:
            print("ERROR: Failed to reserve page")
            continue
        proc_ids.append(proc_id)

    if proc_ids:
        if args.transfer:
            # Stage in the input of all reserved jobs with one transfer and let
            # the batch jobs depend on the job that waits for that transfer.
            task_id = emop_transfer.stage_in_proc_ids(proc_ids=proc_ids, wait=False)
            transfer_job_id = emop_submit.scheduler.submit_transfer_job(task_id=task_id)
        else:
            transfer_job_id = None
        for proc_id in proc_ids:
            job_id = emop_submit.scheduler.submit_job(proc_id=proc_id, num_pages=pages_per_job, dependency=transfer_job_id)
            emop_submit.set_job_id(proc_id=proc_id, job_id=job_id)
    sys.exit(0)
class TestEmopTransfer(TestCase):
    """Unit tests for EmopTransfer using a fake Globus auth token and mocked transfer calls."""

    @pytest.fixture(autouse=True)
    def setup(self, tmpdir):
        """Create a temporary output directory, a fake Globus auth file and the EmopTransfer under test."""
        self.tmpout = str(tmpdir.mkdir("out"))
        globus_dir = tmpdir.mkdir('globus')
        auth_file = globus_dir.join('globus-auth')
        # Fake token expiring one year (in seconds) from now.
        expiry = int(time.time()) + (60*60*24*365)
        self.fake_goauth_token = 'un=test|tokenid=fake-token-id|expiry=%d' % expiry
        auth_file.write(self.fake_goauth_token)
        settings = default_settings()
        settings.globus_auth_file = str(auth_file)
        # While the patch is active, the GlobusAPIClient name inside
        # emop.emop_transfer returns the client built from the fake settings.
        with patch('emop.emop_transfer.GlobusAPIClient') as globus_class:
            mock_globus = GlobusAPIClient(settings=settings)
            globus_class.return_value = mock_globus
            self.transfer = EmopTransfer(config_path=default_config_path())

    def setUp(self):
        """No per-test setup; the ``setup`` fixture does the work."""
        pass

    def tearDown(self):
        """No per-test teardown."""
        pass

    def test_stage_in_files_1(self):
        """stage_in_files passes src/dest pairs for the given files to start()."""
        files = ['/dne/file.txt']
        data = [
            {'src': '/dne/file.txt', 'dest': '/fdata/idhmc/emop-input/dne/file.txt'}
        ]
        self.transfer.start = MagicMock()
        self.transfer.stage_in_files(files)
        self.transfer.start.assert_called_once_with(src='idhmc#data', dest='tamu#brazos', data=data, label='emop-stage-in-files', wait=False)

    def test_stage_in_data_1(self):
        """stage_in_data extracts the file list from API results and delegates to stage_in_files()."""
        data = load_fixture_file('job_queues_1.json')
        expected_files = [
            "/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/2.txt",
            "/data/eebo/e0006/40099/00001.000.001.tif",
            "/data/eebo/e0006/40099/00002.000.001.tif",
            "/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/1.txt",
        ]
        self.transfer.stage_in_files = MagicMock()
        self.transfer.stage_in_data(data['results'])
        self.transfer.stage_in_files.assert_called_once_with(files=expected_files, wait=False)

    def test_stage_in_proc_ids(self):
        """stage_in_proc_ids reads the input payload fixture and starts a stage-in transfer for its files."""
        self.transfer.settings.payload_input_path = os.path.dirname(fixture_file('input_payload_2.json'))
        expected_data = [
            {
                'src': '/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/2.txt',
                'dest': '/fdata/idhmc/emop-input/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/2.txt',
            },
            {
                'src': '/data/eebo/e0006/40099/00001.000.001.tif',
                'dest': '/fdata/idhmc/emop-input/data/eebo/e0006/40099/00001.000.001.tif',
            },
            {
                'src': '/data/eebo/e0006/40099/00002.000.001.tif',
                'dest': '/fdata/idhmc/emop-input/data/eebo/e0006/40099/00002.000.001.tif',
            },
            {
                'src': '/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/1.txt',
                'dest': '/fdata/idhmc/emop-input/data/shared/text-xml/EEBO-TCP-pages-text/e0006/40099/1.txt',
            },
        ]

        self.transfer.start = MagicMock()
        self.transfer.stage_in_proc_ids(proc_ids=['input_payload_2'])
        self.transfer.start.assert_called_once_with(src='idhmc#data', dest='tamu#brazos', data=expected_data, label='emop-stage-in-files', wait=False)

    def test_stage_out_proc_id_1(self):
        """stage_out_proc_id starts a stage-out transfer when the payload is found under payload_output_path."""
        self.transfer.settings.payload_output_path = os.path.dirname(fixture_file('output_payload_1.json'))
        payload = EmopPayload(self.transfer.settings, 'output_payload_1')
        expected_data = [
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.txt',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.txt',
            },
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1.txt',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1.txt',
            },
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.xml',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.xml',
            },
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1.xml',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1.xml',
            }
        ]
        self.transfer.start = MagicMock(return_value='000-000-001')
        retval = self.transfer.stage_out_proc_id('output_payload_1')
        self.transfer.start.assert_called_once_with(src='tamu#brazos', dest='idhmc#data', data=expected_data, label='emop-stage-out-output_payload_1', wait=False)
        self.assertEqual('000-000-001', retval)

    def test_stage_out_proc_id_2(self):
        """stage_out_proc_id starts a stage-out transfer when the payload is found under payload_completed_path."""
        self.transfer.settings.payload_completed_path = os.path.dirname(fixture_file('output_payload_1.json'))
        payload = EmopPayload(self.transfer.settings, 'output_payload_1')
        expected_data = [
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.txt',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.txt',
            },
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1.txt',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1.txt',
            },
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.xml',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1_ALTO.xml',
            },
            {
                'src': '/fdata/idhmc/emop-output/data/shared/text-xml/IDHMC-ocr/17/152141/1.xml',
                'dest': '/data/shared/text-xml/IDHMC-ocr/17/152141/1.xml',
            }
        ]
        self.transfer.start = MagicMock(return_value='000-000-001')
        retval = self.transfer.stage_out_proc_id('output_payload_1')
        self.transfer.start.assert_called_once_with(src='tamu#brazos', dest='idhmc#data', data=expected_data, label='emop-stage-out-output_payload_1', wait=False)
        self.assertEqual('000-000-001', retval)

    def test_stage_out_proc_id_3(self):
        """stage_out_proc_id returns an empty string when no payload output is available."""
        # NOTE(review): the mocks below are attached to this local payload
        # object only; it is not visible from here whether stage_out_proc_id
        # ever consults it -- confirm the mocks have the intended effect.
        payload = EmopPayload(self.transfer.settings, 'output_payload_1')
        payload.completed_output_exists = MagicMock()
        payload.completed_output_exists.return_value = False
        payload.output_exists = MagicMock()
        payload.output_exists.return_value = False
        retval = self.transfer.stage_out_proc_id('output_payload_1')
        self.assertEqual('', retval)

    def test_stage_out_proc_id_4(self):
        """stage_out_proc_id returns an empty string for the 'invalid' payload fixture."""
        self.transfer.settings.payload_output_path = os.path.dirname(fixture_file('invalid.json'))
        payload = EmopPayload(self.transfer.settings, 'invalid')
        retval = self.transfer.stage_out_proc_id('invalid')
        self.assertEqual('', retval)

    def test_check_endpoints_1(self):
        """check_endpoints returns False when every activation check fails."""
        self.transfer._check_activation = MagicMock()
        self.transfer._check_activation.side_effect = [False, False, False, False]
        self.transfer.globus.autoactivate = MagicMock()
        self.transfer.globus.get_activate_url = MagicMock(return_value="https://globus.org/activate?ep=go%23ep1&ep_ids=foobar")
        retval = self.transfer.check_endpoints()
        self.assertEqual(False, retval)

    def test_check_endpoints_2(self):
        """check_endpoints returns True when both activation checks succeed."""
        self.transfer._check_activation = MagicMock()
        self.transfer._check_activation.side_effect = [True, True]
        retval = self.transfer.check_endpoints()
        self.assertEqual(True, retval)

    def test_start_1(self):
        """start adds one transfer item per data entry with matching source and destination paths."""
        transfer = api_client.Transfer('test', 'go#ep1', 'go#ep2')
        self.transfer.globus.create_transfer = MagicMock()
        self.transfer.globus.create_transfer.return_value = transfer
        self.transfer.globus.send_transfer = MagicMock()
        self.transfer.globus.send_transfer.return_value = 'task-id'
        data = [{'src': '/dne/file1', 'dest': '/dne/file1'}]
        self.transfer.start('go#ep1', 'go#ep2', data)
        self.assertEqual(1, len(transfer.items))
        self.assertEqual(data[0]['src'], transfer.items[0]['source_path'])
        self.assertEqual(data[0]['dest'], transfer.items[0]['destination_path'])

    @skipif(True, reason="Not yet implemented")
    def test_ls(self):
        """Placeholder: ls is not tested yet."""
        pass

    @skipif(True, reason="Not yet implemented")
    def test_display_task(self):
        """Placeholder: display_task is not tested yet."""
        pass

    def test__get_stage_in_files_from_data_1(self):
        """_get_stage_in_files_from_data returns image and ground truth paths of every page."""
        data = [
            {
                "page": {
                    "pg_image_path": '/dne/page1.txt',
                    "pg_ground_truth_file": '/dne/gt1.txt',
                    "pg_foo": "pg_bar",
                },
                "work": {
                    "wk_foo": "wk_bar",
                }
            },
            {
                "page": {
                    "pg_image_path": '/dne/page2.txt',
                    "pg_ground_truth_file": '/dne/gt2.txt',
                    "pg_foo": "pg_bar",
                },
                "work": {
                    "wk_foo": "wk_bar",
                }
            },
        ]
        expected = [
            '/dne/page1.txt',
            '/dne/gt1.txt',
            '/dne/gt2.txt',
            '/dne/page2.txt',
        ]
        retval = self.transfer._get_stage_in_files_from_data(data=data)
        self.assertEqual(expected, retval)

    def test__get_stage_in_files_from_data_2(self):
        """_get_stage_in_files_from_data omits a ground truth file that is None."""
        data = [
            {
                "page": {
                    "pg_image_path": '/dne/page1.txt',
                    "pg_ground_truth_file": '/dne/gt1.txt',
                    "pg_foo": "pg_bar",
                },
                "work": {
                    "wk_foo": "wk_bar",
                }
            },
            {
                "page": {
                    "pg_image_path": '/dne/page2.txt',
                    "pg_ground_truth_file": None,
                    "pg_foo": "pg_bar",
                },
                "work": {
                    "wk_foo": "wk_bar",
                }
            },
        ]
        expected = [
            '/dne/page1.txt',
            '/dne/gt1.txt',
            '/dne/page2.txt',
        ]
        retval = self.transfer._get_stage_in_files_from_data(data=data)
        self.assertEqual(expected, retval)

    def test__get_stage_out_data_1(self):
        """_get_stage_out_data maps every result path to a src/dest pair (order not significant)."""
        data = {
            "job_queues": {"completed": [1,2,3], "failed": []},
            "page_results": [
                {"page_id": 1, "batch_id": 2, "ocr_text_path": "/dne/1.txt", "ocr_xml_path": "/dne/1.xml"}
            ],
            "font_training_results": [
                {"work_id": 1, "batch_job_id": 2, "font_path": "/dne/font", "language_model_path": "/dne/lm", "glyph_substitution_model_path": "/dne/gsm"}
            ]
        }
        expected = [
            {'dest': '/dne/1.txt', 'src': '/fdata/idhmc/emop-output/dne/1.txt'},
            {'dest': '/dne/1.xml', 'src': '/fdata/idhmc/emop-output/dne/1.xml'},
            {'dest': '/dne/font', 'src': '/fdata/idhmc/emop-output/dne/font'},
            {'dest': '/dne/lm', 'src': '/fdata/idhmc/emop-output/dne/lm'},
            {'dest': '/dne/gsm', 'src': '/fdata/idhmc/emop-output/dne/gsm'},
        ]

        def dest_of(item):
            """Sort key: dicts are not orderable on Python 3, so sort by destination path."""
            return item['dest']

        retval = self.transfer._get_stage_out_data(data=data)
        self.maxDiff = None
        self.assertEqual(len(expected), len(retval))
        self.assertEqual(sorted(expected, key=dest_of), sorted(retval, key=dest_of))

    def test__get_stage_out_data_2(self):
        """_get_stage_out_data returns only the recursive extra transfer when it covers all result paths."""
        data = {
            "job_queues": {"completed": [1], "failed": []},
            "page_results": [
                {"page_id": 1, "batch_id": 2, "ocr_text_path": os.path.join(self.tmpout, "1.txt"), "ocr_xml_path": os.path.join(self.tmpout, "1.xml")}
            ],
            "font_training_results": [
                {"work_id": 1, "batch_job_id": 2,
                "font_path": os.path.join(self.tmpout, "font"),
                "language_model_path": os.path.join(self.tmpout, "lm"),
                "glyph_substitution_model_path": os.path.join(self.tmpout, "gsm")}
            ],
            "extra_transfers": [self.tmpout]
        }
        expected = [
            {'dest': self.tmpout, 'src': os.path.join('/fdata/idhmc/emop-output', self.tmpout), 'recursive': True},
        ]
        retval = self.transfer._get_stage_out_data(data=data)
        self.assertEqual(expected, retval)