Example #1
    def _get_jobs_output(self, gc_id_jobnum_list):
        # Get output of jobs and yield output dirs
        if len(gc_id_jobnum_list) == 0:
            return  # nothing to retrieve; PEP 479 forbids raising StopIteration inside a generator

        root_dn = os.path.join(self._path_output, 'tmp')
        try:
            if len(gc_id_jobnum_list) == 1:
                # For single jobs create single subdir
                tmp_dn = os.path.join(root_dn,
                                      md5_hex(gc_id_jobnum_list[0][0]))
            else:
                tmp_dn = root_dn
            ensure_dir_exists(tmp_dn)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % tmp_dn,
                BackendError)

        map_gc_id2jobnum = dict(gc_id_jobnum_list)
        jobs = self._write_wms_id_list(gc_id_jobnum_list)

        activity = Activity('retrieving %d job outputs' %
                            len(gc_id_jobnum_list))
        proc = LocalProcess(self._output_exec, '--noint', '--logfile',
                            '/dev/stderr', '-i', jobs, '--dir', tmp_dn)

        # yield output dirs
        todo = list(map_gc_id2jobnum.values())  # materialize so .remove() below works on Python 3
        current_jobnum = None
        for line in imap(str.strip, proc.stdout.iter(timeout=60)):
            if line.startswith(tmp_dn):
                todo.remove(current_jobnum)
                output_dn = line.strip()
                unpack_wildcard_tar(self._log, output_dn)
                yield (current_jobnum, output_dn)
                current_jobnum = None
            else:
                current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line),
                                                      current_jobnum)
        exit_code = proc.status(timeout=0, terminate=True)
        activity.finish()

        if exit_code != 0:
            if 'Keyboard interrupt raised by user' in proc.stderr.read(
                    timeout=0):
                remove_files([jobs, root_dn])
                return  # user aborted; simply end the generator
            else:
                self._log.log_process(proc,
                                      files={'jobs': SafeFile(jobs).read()})
            self._log.error('Trying to recover from error ...')
            for dn in os.listdir(root_dn):
                yield (None, os.path.join(root_dn, dn))

        # return unretrievable jobs
        for jobnum in todo:
            yield (jobnum, None)

        remove_files([jobs, tmp_dn])
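
A note on the pattern above: the generator runs an external retrieval tool through grid-control's LocalProcess, matches stdout lines against the temporary directory prefix to pair each output directory with its job number, and cleans the temporary files up with remove_files at the end. The snippet below is a minimal standalone sketch of that flow using only the standard library; the tool name 'retrieve-output' and its line format are assumptions made for illustration, not grid-control API.

import shutil
import subprocess
import tempfile

def iter_outputs(job_ids, tool='retrieve-output'):
    # Hypothetical sketch: 'retrieve-output' and its output format are invented;
    # only the overall flow (parse stdout, pair IDs with dirs, clean up) mirrors the example.
    tmp_dn = tempfile.mkdtemp()
    try:
        proc = subprocess.Popen([tool, '--dir', tmp_dn] + list(job_ids),
                                stdout=subprocess.PIPE, universal_newlines=True)
        current = None
        for line in proc.stdout:
            line = line.strip()
            if line.startswith(tmp_dn):
                yield (current, line)  # the tool printed an output directory
                current = None
            elif line in job_ids:
                current = line  # the tool announced which job comes next
        proc.wait()
    finally:
        shutil.rmtree(tmp_dn, ignore_errors=True)  # cleanup, as remove_files does above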
Example #2
    def get_jobs_output_chunk(self, tmp_dn, gc_id_jobnum_list,
                              wms_id_list_done):
        map_gc_id2jobnum = dict(gc_id_jobnum_list)
        jobs = list(self._iter_wms_ids(gc_id_jobnum_list))
        log = tempfile.mktemp('.log')
        proc = LocalProcess(self._output_exec, '--noint', '--logfile', log,
                            '--dir', tmp_dn, *jobs)
        exit_code = proc.status(timeout=20 * len(jobs), terminate=True)

        # yield output dirs
        current_jobnum = None
        for line in imap(str.strip, proc.stdout.iter(timeout=20)):
            match = re.match(self._output_regex, line)
            if match:
                wms_id = match.groupdict()['rawId']
                current_jobnum = map_gc_id2jobnum.get(
                    self._create_gc_id(wms_id))
                wms_id_list_done.append(wms_id)
                yield (current_jobnum, match.groupdict()['output_dn'])
                current_jobnum = None

        if exit_code != 0:
            if 'Keyboard interrupt raised by user' in proc.stdout.read_log():
                remove_files([log, tmp_dn])
                return  # user aborted; simply end the generator
            else:
                self._log.log_process(proc)
            self._log.error('Trying to recover from error ...')
            for dn in os.listdir(tmp_dn):
                yield (None, os.path.join(tmp_dn, dn))
        remove_files([log])
Example #3
	def _submit_jobs(self, jobnum_list, task):
		# submit_jobs: Submit a number of jobs and yield (jobnum, WMS ID, other data) sequentially
		# >>jobnum: internal ID of the Job
		# JobNum is linked to the actual *task* here
		(jdl_fn, submit_jdl_fn) = self._submit_jobs_prepare(jobnum_list, task)
		try:
			# submit all jobs simultaneously and temporarily store verbose (ClassAd) output
			activity = Activity('queuing jobs at scheduler')
			submit_args = ' -verbose -batch-name ' + task.get_description().task_name + ' ' + submit_jdl_fn
			proc = self._proc_factory.logged_execute(self._submit_exec, submit_args)

			# extract the Condor ID (WMS ID) of the jobs from output ClassAds
			jobnum_gc_id_list = []
			for line in proc.iter():
				if 'GridControl_GCIDtoWMSID' in line:
					jobnum_wms_id = line.split('=')[1].strip(' "\n').split('@')
					jobnum, wms_id = int(jobnum_wms_id[0]), jobnum_wms_id[1].strip()
					# Condor creates a default job then overwrites settings on any subsequent job
					# i.e. skip every second, but better be sure
					if (not jobnum_gc_id_list) or (jobnum not in lzip(*jobnum_gc_id_list)[0]):
						jobnum_gc_id_list.append((jobnum, self._create_gc_id(wms_id)))

			exit_code = proc.wait()
			activity.finish()
			if (exit_code != 0) or (len(jobnum_gc_id_list) < len(jobnum_list)):
				if not self._explain_error(proc, exit_code):
					self._log.error('Submitted %4d jobs of %4d expected',
						len(jobnum_gc_id_list), len(jobnum_list))
					proc.log_error(self._error_log_fn, jdl=jdl_fn)
		finally:
			remove_files([jdl_fn])

		for (jobnum, gc_id) in jobnum_gc_id_list:
			yield (jobnum, gc_id, {})
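
The ID extraction above depends on a single line format in the verbose submit output: GridControl_GCIDtoWMSID = "<jobnum>@<wms id>". A self-contained version of just that parsing step is sketched below; the sample lines are invented to show the format being parsed.

def parse_gc_id_to_wms_id(lines):
    # Yield (jobnum, wms_id) pairs from verbose submit output, skipping repeated job numbers.
    seen = set()
    for line in lines:
        if 'GridControl_GCIDtoWMSID' not in line:
            continue
        jobnum_str, wms_id = line.split('=')[1].strip(' "\n').split('@')
        jobnum = int(jobnum_str)
        if jobnum not in seen:
            seen.add(jobnum)
            yield (jobnum, wms_id.strip())

sample = ['GridControl_GCIDtoWMSID = "0@1234.0"', 'GridControl_GCIDtoWMSID = "0@1234.0"']
assert list(parse_gc_id_to_wms_id(sample)) == [(0, '1234.0')]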
Example #4
	def _write_jdl(self, jobnum_list, task):
		# construct a temporary JDL for this batch of jobs
		jdl_fd, jdl_fn = tempfile.mkstemp(suffix='.jdl')
		try:
			data = self._get_jdl_str_list(jobnum_list, task)
			safe_write(os.fdopen(jdl_fd, 'w'), data)
		except Exception:
			remove_files([jdl_fn])
			raise BackendError('Could not write jdl data to %s.' % jdl_fn)
		return jdl_fn
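
The helper above follows a common pattern: create a temporary file with tempfile.mkstemp, write to it, and remove it again if writing fails so that no half-written JDL is left behind. A standard-library-only sketch of the same pattern (without grid-control's safe_write and remove_files helpers) could look like this:

import os
import tempfile

def write_temp_file(lines, suffix='.jdl'):
    # Create the file first so that its name is known (and removable) if writing fails.
    fd, fn = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'w') as fp:
            fp.writelines(lines)
    except Exception:
        os.unlink(fn)  # do not leave a half-written file behind
        raise
    return fn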
Example #5
    def _purge_done_jobs(self, wms_id_list_done):
        purge_log_fn = tempfile.mktemp('.log')
        purge_proc = LocalProcess(resolve_install_path('glite-ce-job-purge'),
                                  '--noint', '--logfile', purge_log_fn,
                                  str.join(' ', wms_id_list_done))
        exit_code = purge_proc.status(timeout=60)
        if exit_code != 0:
            if self._explain_error(purge_proc, exit_code):
                pass
            else:
                self._log.log_process(purge_proc)
        remove_files([purge_log_fn])
Example #6
	def _get_jobs_output(self, gc_id_jobnum_list):
		if not len(gc_id_jobnum_list):
			return  # nothing to retrieve; simply end the generator

		activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
		for gc_id, jobnum in gc_id_jobnum_list:
			path = self._sandbox_helper.get_sandbox(gc_id)
			if path is None:
				yield (jobnum, None)
				continue

			# Cleanup sandbox
			output_fn_list = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)),
				self._output_fn_list))
			remove_files(ifilter(lambda x: x not in output_fn_list,
				imap(lambda fn: os.path.join(path, fn), os.listdir(path))))

			yield (jobnum, path)
		activity.finish()
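
The "Cleanup sandbox" step keeps only the files matching the configured output patterns (self._output_fn_list) and removes everything else from the sandbox directory. A standalone equivalent of that filtering, using only glob and os, could be sketched as follows (function name, path and patterns are placeholders):

import glob
import os

def prune_sandbox(path, keep_patterns):
    # Delete every regular file in 'path' that does not match one of 'keep_patterns'.
    keep = set()
    for pattern in keep_patterns:
        keep.update(glob.glob(os.path.join(path, pattern)))
    for fn in os.listdir(path):
        full_fn = os.path.join(path, fn)
        if full_fn not in keep and os.path.isfile(full_fn):
            os.unlink(full_fn)

# e.g. prune_sandbox('/tmp/sandbox', ['job.stdout', 'job.stderr', '*.root'])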
Example #7
	def __new__(cls, config, job_limit=-1, job_selector=None):
		try:
			resolve_install_path('zip')
		except Exception:
			clear_current_exception()
			return TextFileJobDB.__new__(cls, config, job_limit, job_selector)
		path_db = config.get_work_path('jobs')
		db_fn = config.get_work_path('jobs.zip')
		if os.path.exists(path_db) and os.path.isdir(path_db) and not os.path.exists(db_fn):
			activity = Activity('Converting job database')
			new_db = ZippedJobDB(config)
			try:
				old_db = TextFileJobDB(config)
				for jobnum in old_db.get_job_list():
					new_db.commit(jobnum, old_db.get_job(jobnum))
			except Exception:
				remove_files([db_fn])
				raise
			activity.finish()
		return ZippedJobDB.__new__(cls, config, job_limit, job_selector)
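
This __new__ hook converts an existing directory-based job database into a zipped one before deciding which class to instantiate. The core conversion idea, packing many small per-job files into one archive and deleting the partial archive if packing fails, can be sketched with zipfile alone; the flat file layout assumed here is purely illustrative and not necessarily how TextFileJobDB stores its jobs.

import os
import zipfile

def pack_job_dir(path_db, db_fn):
    # Pack every file directly under path_db into db_fn; drop the partial archive on failure.
    try:
        with zipfile.ZipFile(db_fn, 'w', zipfile.ZIP_DEFLATED) as archive:
            for fn in sorted(os.listdir(path_db)):
                archive.write(os.path.join(path_db, fn), arcname=fn)
    except Exception:
        if os.path.exists(db_fn):
            os.unlink(db_fn)
        raise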
Example #8
    def _submit_job(self, jobnum, task):
        # Submit job and yield (jobnum, WMS ID, other data)
        jdl_fd, jdl_fn = tempfile.mkstemp('.jdl')
        try:
            jdl_line_list = self._make_jdl(jobnum, task)
            safe_write(os.fdopen(jdl_fd, 'w'), jdl_line_list)
        except Exception:
            remove_files([jdl_fn])
            raise BackendError('Could not write jdl data to %s.' % jdl_fn)

        try:
            submit_arg_list = []
            for key_value in filter_dict(self._submit_args_dict,
                                         value_filter=identity).items():
                submit_arg_list.extend(key_value)
            submit_arg_list.append(jdl_fn)

            activity = Activity('submitting job %d' % jobnum)
            proc = LocalProcess(self._submit_exec, '--nomsg', '--noint',
                                '--logfile', '/dev/stderr', *submit_arg_list)

            wms_id = None
            stripped_stdout_iter = imap(str.strip,
                                        proc.stdout.iter(timeout=60))
            for line in ifilter(lambda x: x.startswith('http'),
                                stripped_stdout_iter):
                wms_id = line
            exit_code = proc.status(timeout=0, terminate=True)

            activity.finish()

            if (exit_code != 0) or (wms_id is None):
                if self._explain_error(proc, exit_code):
                    pass
                else:
                    self._log.log_process(
                        proc, files={'jdl': SafeFile(jdl_fn).read()})
        finally:
            remove_files([jdl_fn])
        job_data = {'jdl': str.join('', jdl_line_list)}
        return (jobnum, self._create_gc_id(wms_id), job_data)
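
The filter_dict(self._submit_args_dict, value_filter=identity) call assembles the submit options from a dict while skipping entries without a value. Roughly the same idea in plain Python, without grid-control's helpers, is shown below:

def build_arg_list(option_dict, positional):
    # Flatten {flag: value} into a command line, skipping unset (falsy) values.
    arg_list = []
    for flag, value in option_dict.items():
        if value:
            arg_list.extend([flag, value])
    arg_list.extend(positional)
    return arg_list

# build_arg_list({'--config': 'site.conf', '--vo': None}, ['job.jdl'])
# -> ['--config', 'site.conf', 'job.jdl']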
Example #9
	def _recover_jobs(self):
		proc = LocalProcess('zip', '-FF', self._db_fn, '--out', '%s.tmp' % self._db_fn)
		proc.stdin.write('y\n')
		proc.status(timeout=None)
		os.rename(self._db_fn, self._db_fn + '.broken')
		os.rename(self._db_fn + '.tmp', self._db_fn)
		tar = zipfile.ZipFile(self._db_fn, 'r', zipfile.ZIP_DEFLATED)
		remove_files([self._db_fn + '.broken'])
		broken_fn_list = []
		for tar_info_fn in tar.namelist():
			try:
				tuple(imap(lambda s: int(s[1:]), tar_info_fn.split('_', 1)))  # check name
				fp = tar.open(tar_info_fn)
				try:
					fp.read()
				finally:
					fp.close()
			except Exception:
				clear_current_exception()
				broken_fn_list.append(tar_info_fn)
		for broken in broken_fn_list:
			os.system('zip %s -d %s' % (self._db_fn, broken))
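
The recovery loop above reads back every entry of the repaired archive and collects the names that cannot be read. A self-contained check of that kind, using only zipfile, might look like this (none of it is grid-control API):

import zipfile

def find_broken_entries(zip_fn):
    # Return the names of archive members whose contents cannot be read back.
    broken = []
    with zipfile.ZipFile(zip_fn, 'r') as archive:
        for name in archive.namelist():
            try:
                with archive.open(name) as fp:
                    fp.read()
            except Exception:
                broken.append(name)
    return broken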
Example #10
    def _get_jobs_output(self, gc_id_jobnum_list):
        # Get output of jobs and yield output dirs
        if len(gc_id_jobnum_list) == 0:
            return  # nothing to retrieve; simply end the generator

        tmp_dn = os.path.join(self._path_output, 'tmp')
        try:
            if len(gc_id_jobnum_list) == 1:
                # For single jobs create single subdir
                tmp_dn = os.path.join(tmp_dn, md5_hex(gc_id_jobnum_list[0][0]))
            ensure_dir_exists(tmp_dn)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % tmp_dn,
                BackendError)

        map_gc_id2jobnum = dict(gc_id_jobnum_list)
        jobnum_list_todo = list(map_gc_id2jobnum.values())
        wms_id_list_done = []
        activity = Activity('retrieving %d job outputs' %
                            len(gc_id_jobnum_list))
        chunk_pos_iter = irange(0, len(gc_id_jobnum_list), self._chunk_size)
        for ids in imap(lambda x: gc_id_jobnum_list[x:x + self._chunk_size],
                        chunk_pos_iter):
            for (current_jobnum, output_dn) in self.get_jobs_output_chunk(
                    tmp_dn, ids, wms_id_list_done):
                unpack_wildcard_tar(self._log, output_dn)
                jobnum_list_todo.remove(current_jobnum)
                yield (current_jobnum, output_dn)
        activity.finish()

        # return unretrievable jobs
        for jobnum in jobnum_list_todo:
            yield (jobnum, None)
        self._purge_done_jobs(wms_id_list_done)
        remove_files([tmp_dn])
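
The chunking in this example, slicing gc_id_jobnum_list at every multiple of self._chunk_size, is a generic pattern; a minimal standalone generator doing the same is sketched below.

def iter_chunks(items, chunk_size):
    # Yield successive slices of 'items' with at most 'chunk_size' elements each.
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]

# list(iter_chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]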