def rename(src, dest):
    """Rename file src to dest.

    dCache paths (/pnfs/...) are renamed through uberftp when grid access
    is preferred or pnfs is not mounted; everything else uses os.rename.

    Raises IFDHError if the grid rename fails.
    """

    use_grid = (src.startswith('/pnfs/') or dest.startswith('/pnfs/')) \
               and (prefer_grid or not pnfs_is_mounted)

    if not use_grid:
        if debug:
            print('*** Larbatch_posix: Rename %s to %s using posix.' % (src, dest))
        os.rename(src, dest)
        return

    if debug:
        print('*** Larbatch_posix: Rename %s to %s using ifdh.' % (src, dest))

    src_uri = larbatch_utilities.gridftp_uri(src)
    dest_path = larbatch_utilities.dcache_path(dest)
    cmd = ['uberftp', '-rename', src_uri, dest_path]

    # Run the command in a worker thread so we can enforce a 60 s timeout.
    jobinfo = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    q = queue.Queue()
    worker = threading.Thread(target=larbatch_utilities.wait_for_subprocess,
                              args=[jobinfo, q])
    worker.start()
    worker.join(timeout=60)
    if worker.is_alive():
        if debug:
            print('*** Larbatch_posix: Terminating subprocess.')
        jobinfo.terminate()
        worker.join()

    rc = q.get()
    jobout = convert_str(q.get())
    joberr = convert_str(q.get())
    if rc != 0:
        raise IFDHError(cmd, rc, jobout, joberr)
def symlink(src, dest):
    """Make a symbolic link named dest pointing at src.

    For /pnfs/ paths with pnfs unmounted, the link is created remotely by
    running "ln -s" on the nfs server over ssh (a kerberos ticket is
    required); otherwise os.symlink is used directly.

    Raises IFDHError if the remote link creation fails.
    """

    if not (src.startswith('/pnfs/') and not pnfs_is_mounted):
        if debug:
            print('*** Larbatch_posix: Make symbolic link from %s to %s using posix.' % (src, dest))
        os.symlink(src, dest)
        return

    if debug:
        print('*** Larbatch_posix: Make symbolic link from %s to %s using nfs server.' % (src, dest))

    # Make sure we have a kerberos ticket.
    larbatch_utilities.test_ticket()

    cmd = ['ssh', larbatch_utilities.nfs_server(), 'ln', '-s', src, dest]

    # Run the command in a worker thread so we can enforce a 60 s timeout.
    jobinfo = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    q = queue.Queue()
    worker = threading.Thread(target=larbatch_utilities.wait_for_subprocess,
                              args=[jobinfo, q])
    worker.start()
    worker.join(timeout=60)
    if worker.is_alive():
        if debug:
            print('*** Larbatch_posix: Terminating subprocess.')
        jobinfo.terminate()
        worker.join()

    rc = q.get()
    jobout = convert_str(q.get())
    joberr = convert_str(q.get())
    if rc != 0:
        raise IFDHError(cmd, rc, jobout, joberr)
def listdir(path):
    """Return the contents of directory path as a list of basenames.

    dCache paths (/pnfs/...) are listed via "ifdh ls" when grid access is
    preferred or pnfs is unmounted; otherwise an "ls" subprocess with a
    60 s timeout is used (instead of os.listdir) to reduce hang risk.

    Raises OSError if path is not a directory.
    """

    if not isdir(path):
        raise OSError('%s is not a directory.' % path)

    result = []

    if path.startswith('/pnfs/') and (prefer_grid or not pnfs_is_mounted):
        if debug:
            print('*** Larbatch_posix: Listdir %s using ifdh.' % path)

        # Normalized tail of the requested path, used below to recognize
        # the parent directory, which ifdh sometimes (usually?) includes
        # in its result.
        tail = os.path.normpath(path[-6:])

        # Call "ifdh ls" and keep the basename of every entry except the
        # parent directory.  Normalizing the returned paths mainly strips
        # trailing '/' off directories.
        contents = larbatch_utilities.ifdh_ls(path, 1)
        result = [os.path.basename(nc)
                  for nc in (os.path.normpath(c.strip()) for c in contents)
                  if not nc.endswith(tail)]

    else:
        if debug:
            print('*** Larbatch_posix: Listdir %s using posix.' % path)

        # Read directory contents in a subprocess with a timeout.
        cmd = ['ls', path]
        jobinfo = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        q = queue.Queue()
        worker = threading.Thread(target=larbatch_utilities.wait_for_subprocess,
                                  args=[jobinfo, q])
        worker.start()
        worker.join(timeout=60)
        if worker.is_alive():
            if debug:
                print('*** Larbatch_posix: Terminating subprocess.')
            jobinfo.terminate()
            worker.join()

        rc = q.get()
        jobout = convert_str(q.get())
        joberr = convert_str(q.get())
        if rc == 0:
            result.extend(jobout.split())

    # Done.

    return result
def addLayerTwo(path, recreate=True):
    """Fix a zero-size dCache file that may be missing layer two.

    If path exists in dCache (/pnfs/...) with zero size, delete it and,
    when recreate is true, recreate it with "ifdh cp" so layer two is
    written.  Files outside dCache or with nonzero size are left alone.

    Raises IFDHError if the ifdh copy fails.
    """

    # Only zero-size files located in dCache are candidates.
    if not (larbatch_posix.exists(path) and path[0:6] == '/pnfs/'
            and larbatch_posix.stat(path).st_size == 0):
        return

    if recreate:
        print('Adding layer two for path %s.' % path)
    else:
        print('Deleting empty file %s.' % path)

    # Zero size files in dCache may be missing layer two.
    # Delete the file and (optionally) recreate it using ifdh.
    larbatch_posix.remove(path)
    if not recreate:
        return

    test_proxy()

    # Environment variables X509_USER_CERT and X509_USER_KEY confuse ifdh;
    # stash and clear them for the duration of the copy.
    save_vars = {}
    for var in ('X509_USER_CERT', 'X509_USER_KEY'):
        if var in os.environ:
            save_vars[var] = os.environ[var]
            del os.environ[var]

    # Do ifdh cp in a worker thread with a 60 s timeout.
    command = ['ifdh', 'cp', '/dev/null', path]
    jobinfo = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    q = queue.Queue()
    worker = threading.Thread(target=wait_for_subprocess, args=[jobinfo, q])
    worker.start()
    worker.join(timeout=60)
    if worker.is_alive():
        print('Terminating subprocess.')
        jobinfo.terminate()
        worker.join()

    rc = q.get()
    jobout = convert_str(q.get())
    joberr = convert_str(q.get())

    # Restore environment variables before reporting any error.
    for var in save_vars:
        os.environ[var] = save_vars[var]
    if rc != 0:
        raise IFDHError(command, rc, jobout, joberr)
def wait_for_subprocess(jobinfo, q):
    """Run jobinfo, put the return code, stdout, and stderr into a queue"""

    # Block until the subprocess finishes, then queue its results in the
    # fixed order (rc, stdout, stderr) that callers expect.
    out, err = jobinfo.communicate()
    q.put(jobinfo.poll())
    q.put(convert_str(out))
    q.put(convert_str(err))
def get_job(self, proc):
    """Run the proc in a 60-sec timeout queue, return stdout, stderr"""

    # Wait for the subprocess on a worker thread so a hang can be timed out.
    q = queue.Queue()
    worker = threading.Thread(target=self.wait_for_subprocess, args=[proc, q])
    worker.start()
    worker.join(timeout=60)
    if worker.is_alive():
        print('Terminating subprocess because of timeout.')
        proc.terminate()
        worker.join()

    # The queue holds rc, stdout, stderr (in that order).
    rc = q.get()
    out = convert_str(q.get())
    err = convert_str(q.get())
    if rc != 0:
        raise RuntimeError('sam_metadata_dumper returned nonzero exit status {}.'.format(rc))
    return out, err
def active_projects(defname=''):
    """Return the set of active sam project names.

    If defname is nonempty, only projects whose name starts with the
    project-name stem derived from defname are included.
    """

    result = set()
    s = samweb()

    # Project name stem used as a filter (empty stem matches everything).
    prjstem = ''
    if defname != '':
        prjstem = '%s_' % s.makeProjectName(defname).rsplit('_', 1)[0]

    # Dump the station and scan the response, one project per line.
    url = '%s/dumpStation?station=%s' % (s.get_baseurl(), get_experiment())
    furl = urlrequest.urlopen(url)
    for line in furl.readlines():
        words = line.split()
        if len(words) > 5:
            prjname = convert_str(words[0])
            if prjstem == '' or prjname.startswith(prjstem):
                result.add(prjname)

    # Done.

    return result
def get_stream(inputfile):
    """Return the sam data_stream of inputfile.

    Runs sam_metadata_dumper on the file and extracts "data_stream" from
    the resulting json metadata.

    Raises RuntimeError if the dump fails or the stream is missing.
    """

    # Extract sam metadata in form of json string.
    jobinfo = subprocess.Popen(['sam_metadata_dumper', inputfile],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    jobout, joberr = jobinfo.communicate()
    jobout = convert_str(jobout)
    joberr = convert_str(joberr)
    rc = jobinfo.poll()
    if rc != 0:
        raise RuntimeError('sam_metadata_dumper failed with status %d' % rc)

    # Work around an art bug by deleting the "runs" line before decoding.
    n = jobout.find('"runs"')
    if n >= 0:
        json_str = ''
        m = jobout.rfind('\n', 0, n)
        if m > 0:
            json_str = jobout[:m+1]
        k = jobout.find('\n', n)
        if k > n:
            json_str += jobout[k+1:]
    else:
        json_str = jobout
    # End of workaround.

    md = json.loads(json_str)[inputfile]

    # Extract the stream from the metadata dictionary.
    if 'data_stream' not in md:
        raise RuntimeError('Sam metadata does not contain stream.')
    return md['data_stream']
def xmlhelp(self):
    """Show the output of "project.py --xmlhelp" in a text window.

    Because of the way --xmlhelp is implemented in project.py, it must be
    run in a separate process; calling the project module's xmlhelp method
    directly is not sufficient.
    """

    helptext = convert_str(subprocess.check_output(['project.py', '--xmlhelp']))
    window = TextWindow()
    window.append(helptext)
def readlink(path):
    """Return the target of symbolic link path.

    For /pnfs/ paths with pnfs unmounted, the link is read remotely by
    running "readlink" on the nfs server over ssh (a kerberos ticket is
    required); otherwise os.readlink is used directly.

    Raises IFDHError if the remote readlink fails.
    """

    result = ''

    # Bug fix: the original tested the undefined name "not_pnfs_is_mounted"
    # (a NameError at runtime); the intended condition is
    # "not pnfs_is_mounted", matching symlink() above.
    if path.startswith('/pnfs/') and not pnfs_is_mounted:
        if debug:
            print('*** Larbatch_posix: Read symbolic link %s using nfs server.' % path)

        # Make sure we have a kerberos ticket.
        larbatch_utilities.test_ticket()

        cmd = ['ssh', larbatch_utilities.nfs_server(), 'readlink', path]

        # Run the command in a worker thread so we can enforce a 60 s timeout.
        jobinfo = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        q = queue.Queue()
        thread = threading.Thread(target=larbatch_utilities.wait_for_subprocess,
                                  args=[jobinfo, q])
        thread.start()
        thread.join(timeout=60)
        if thread.is_alive():
            if debug:
                print('*** Larbatch_posix: Terminating subprocess.')
            jobinfo.terminate()
            thread.join()

        rc = q.get()
        jobout = convert_str(q.get())
        joberr = convert_str(q.get())
        if rc != 0:
            raise IFDHError(cmd, rc, jobout, joberr)
        result = jobout.strip()
    else:
        if debug:
            print('*** Larbatch_posix: Read symbolic link %s using posix.' % path)
        result = os.readlink(path)

    # Done.

    return result
def fileEnstoreChecksum(path):
    """Calculate enstore compatible CRC value.

    Returns a dictionary of the form
    {"crc_value": <str>, "crc_type": <str>}.

    Raises Error if the file cannot be read directly.
    """

    crc = {}
    srm_url = project_utilities.path_to_srm_url(path)

    if srm_url == path:
        # No srm translation available: read the file and compute the
        # checksum locally.
        #
        # Bug fix: initialize f before the try block; previously, if
        # larbatch_posix.open raised, the finally clause referenced an
        # unbound f and masked the real error with an UnboundLocalError.
        f = None
        try:
            f = larbatch_posix.open(path, 'rb')
            crc = enstoreChecksum(f)
        except (IOError, OSError) as ex:
            raise Error(str(ex))
        finally:
            if f is not None:
                f.close()
    else:
        try:
            # New (clever, efficient, obscure...) way of accessing the
            # dCache stored checksum using srm, which avoids transferring
            # the entire file over the network.
            project_utilities.test_proxy()
            cmd = ['srmls', '-2', '-l', srm_url]
            srmout = convert_str(subprocess.check_output(cmd))

            # First line carries the file size; a later line carries the
            # stored adler32 (1-seeded) checksum, which we convert to the
            # 0-seeded value enstore uses.
            first = True
            crc0 = 0
            for line in srmout.split('\n'):
                if first:
                    size = int(line[2:line.find('/') - 1])
                    first = False
                    continue
                if line.find("Checksum value:") > 0:
                    ssum = line[line.find(':') + 2:]
                    crc1 = int(ssum, base=16)
                    crc0 = convert_1_adler32_to_0_adler32(crc1, size)
                    break
            crc = {"crc_value": str(crc0), "crc_type": "adler 32 crc type"}

        except Exception:
            # Fall back to the old method: transfer the file via ifdh and
            # checksum it locally.  (Narrowed from a bare "except:" so that
            # KeyboardInterrupt/SystemExit are not swallowed.)
            cmd = ['ifdh', 'cp', path, '/dev/fd/1']
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            f = p.stdout
            crc = enstoreChecksum(f)

    return crc
def update_jobs():
    """Refresh the cached global job list by running jobsub_q.

    On jobsub_q failure the cached list is left unchanged.
    """

    global jobs, server

    command = ['jobsub_q']
    if server is not None:
        command.append('--jobsub-server=%s' % server)
    command.extend(['--group=%s' % project_utilities.get_experiment(),
                    '--user=%s' % project_utilities.get_user(),
                    '--role=%s' % project_utilities.get_role()])

    jobinfo = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    jobout, joberr = jobinfo.communicate()
    jobout = convert_str(jobout)
    joberr = convert_str(joberr)
    rc = jobinfo.poll()

    # Simply return in case jobsub_q fails (do not raise JobsubError).
    if rc != 0:
        return

    jobs = jobout.split('\n')
def kill_jobs(self):
    """Kill all batch jobs belonging to the selected project and stage.

    Refreshes the batch status (with a watch cursor while waiting),
    collects the cluster ids whose work script matches the selected
    stage/project, and issues one jobsub_rm per cluster.

    Raises JobsubError if a jobsub_rm command fails.
    """

    if self.current_project_def == None:
        tkinter_messagebox.showwarning('', 'No project selected.')
        return
    if self.current_stage_def == None:
        tkinter_messagebox.showwarning('', 'No stage selected.')
        return

    # Refresh the job list, showing a watch cursor while we wait.
    top = self.winfo_toplevel()
    old_cursor = top['cursor']
    try:
        top['cursor'] = 'watch'
        top.update_idletasks()
        BatchStatus.update_jobs()
        jobs = BatchStatus.get_jobs()
        top['cursor'] = old_cursor
    except:
        top['cursor'] = old_cursor
        e = sys.exc_info()
        traceback.print_tb(e[2])
        tkinter_messagebox.showerror('', e[1])
        # Bug fix: bail out after reporting the error.  Previously execution
        # fell through with "jobs" unbound, causing a NameError below.
        return

    # Figure out which clusters to kill.  A job belongs to the selected
    # stage if its script name starts with the expected work-script name.
    cluster_ids = set()
    for job in jobs:
        words = job.split()
        if len(words) >= 2:
            jobid = words[0]
            script = words[-1]
            workscript = '%s-%s-%s.sh' % (self.current_stage_def.name,
                                          self.current_project_def.name,
                                          self.current_project_def.release_tag)
            if script.find(workscript) == 0:
                # Job ids look like "<cluster>.<process>@<server>"; kill at
                # cluster granularity.
                cp_server = jobid.split('@')
                if len(cp_server) == 2:
                    clusproc = cp_server[0]
                    server = cp_server[1]
                    cp = clusproc.split('.')
                    if len(cp) == 2:
                        cluster = cp[0]
                        process = cp[1]
                        cluster_id = '%s@%s' % (cluster, server)
                        if not cluster_id in cluster_ids:
                            cluster_ids.add(cluster_id)

    # Actually issue kill commands.
    for cluster_id in cluster_ids:
        print('Kill cluster id %s' % cluster_id)
        command = ['jobsub_rm']
        if self.current_project_def.server != '-' and self.current_project_def.server != '':
            command.append('--jobsub-server=%s' % self.current_project_def.server)
        command.append('--jobid=%s' % cluster_id)
        command.append('--role=%s' % project_utilities.get_role())
        jobinfo = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        jobout, joberr = jobinfo.communicate()
        jobout = convert_str(jobout)
        joberr = convert_str(joberr)
        rc = jobinfo.poll()
        if rc != 0:
            raise JobsubError(command, rc, jobout, joberr)

    self.update_jobs()
def __init__(self, project_element, default_first_input_list, default_input_lists):
    """Construct a project definition from a parsed xml <project> element.

    Arguments:
        project_element          - xml DOM element for the project.
        default_first_input_list - Default input file list for the first stage.
        default_input_lists      - Dictionary of default input lists, keyed by
                                   previous stage name; updated in place as
                                   stages are parsed.

    Raises:
        XMLError    - Project name or number of events not specified.
        IOError     - Missing batch script, local release tarball, or fcl
                      search directory.
        LookupError - A stage's base stage name is unknown.
    """

    # Assign default values.

    self.name = ''                # Project name.
    self.num_events = 0           # Total events (all jobs).
    self.num_jobs = 1             # Number of jobs.
    self.max_files_per_job = 0    # Max number of files per job.
    self.os = ''                  # Batch OS.
    self.resource = 'DEDICATED,OPPORTUNISTIC'    # Jobsub resources.
    self.role = ''                # Role (normally Analysis or Production).
    self.lines = ''               # Arbitrary condor commands.
    self.server = '-'             # Jobsub server.
    self.site = ''                # Site.
    self.blacklist = ''           # Blacklist.
    self.cpu = 0                  # Number of cpus.
    self.disk = ''                # Disk space (string value+unit).
    self.memory = 0               # Amount of memory (integer MB).
    self.merge = 'hadd -T'        # Histogram merging program.
    self.release_tag = ''         # Larsoft release tag.
    self.release_qual = 'debug'   # Larsoft release qualifier.
    self.version = ''             # Project version.
    self.local_release_dir = ''   # Larsoft local release directory.
    self.local_release_tar = ''   # Larsoft local release tarball.
    self.file_type = ''           # Sam file type.
    self.run_type = ''            # Sam run type.
    self.run_number = 0           # Sam run number.
    self.script = 'condor_lar.sh' # Batch script.
    self.validate_on_worker = 0   # Run post-job validation on the worker node.
    self.copy_to_fts = 0          # Copy a copy of the file to a dropbox scanned
                                  # by fts.  Note that a copy is still sent to
                                  # <outdir>.
    self.start_script = 'condor_start_project.sh'  # Sam start project script.
    self.stop_script = 'condor_stop_project.sh'    # Sam stop project script.
    self.force_dag = 0            # Force dag for sam input jobs.
    self.fclpath = []             # Fcl search path.
    self.stages = []              # List of stages (StageDef objects).
    self.parameters = {}          # Dictionary of metadata parameters.

    # Extract values from xml.

    # Project name (attribute).

    if 'name' in dict(project_element.attributes):
        self.name = str(project_element.attributes['name'].firstChild.data)
    if self.name == '':
        raise XMLError('Project name not specified.')

    # Total events (subelement).
    # Only direct children of the project element are considered, so that
    # identically named subelements of nested stages are not picked up.

    num_events_elements = project_element.getElementsByTagName('numevents')
    for num_events_element in num_events_elements:
        if num_events_element.parentNode == project_element:
            self.num_events = int(num_events_element.firstChild.data)
    if self.num_events == 0:
        raise XMLError('Number of events not specified.')

    # Number of jobs (subelement).

    num_jobs_elements = project_element.getElementsByTagName('numjobs')
    for num_jobs_element in num_jobs_elements:
        if num_jobs_element.parentNode == project_element:
            self.num_jobs = int(num_jobs_element.firstChild.data)

    # Max number of files per job (subelement).

    max_files_per_job_elements = project_element.getElementsByTagName('maxfilesperjob')
    for max_files_per_job_element in max_files_per_job_elements:
        if max_files_per_job_element.parentNode == project_element:
            self.max_files_per_job = int(max_files_per_job_element.firstChild.data)

    # OS (subelement).  Whitespace is stripped out.

    os_elements = project_element.getElementsByTagName('os')
    for os_element in os_elements:
        if os_element.parentNode == project_element:
            self.os = str(os_element.firstChild.data)
            self.os = ''.join(self.os.split())

    # Resource (subelement).  Whitespace is stripped out.

    resource_elements = project_element.getElementsByTagName('resource')
    for resource_element in resource_elements:
        if resource_element.parentNode == project_element:
            self.resource = str(resource_element.firstChild.data)
            self.resource = ''.join(self.resource.split())

    # Role (subelement).

    role_elements = project_element.getElementsByTagName('role')
    for role_element in role_elements:
        if role_element.parentNode == project_element:
            self.role = str(role_element.firstChild.data)

    # Lines (subelement).

    lines_elements = project_element.getElementsByTagName('lines')
    for lines_element in lines_elements:
        if lines_element.parentNode == project_element:
            self.lines = str(lines_element.firstChild.data)

    # Server (subelement).

    server_elements = project_element.getElementsByTagName('server')
    for server_element in server_elements:
        if server_element.parentNode == project_element:
            self.server = str(server_element.firstChild.data)

    # Site (subelement).  Whitespace is stripped out.

    site_elements = project_element.getElementsByTagName('site')
    for site_element in site_elements:
        if site_element.parentNode == project_element:
            self.site = str(site_element.firstChild.data)
            self.site = ''.join(self.site.split())

    # Blacklist (subelement).  Whitespace is stripped out.

    blacklist_elements = project_element.getElementsByTagName('blacklist')
    for blacklist_element in blacklist_elements:
        if blacklist_element.parentNode == project_element:
            self.blacklist = str(blacklist_element.firstChild.data)
            self.blacklist = ''.join(self.blacklist.split())

    # Cpu (subelement).

    cpu_elements = project_element.getElementsByTagName('cpu')
    for cpu_element in cpu_elements:
        if cpu_element.parentNode == project_element:
            self.cpu = int(cpu_element.firstChild.data)

    # Disk (subelement).  Whitespace is stripped out.

    disk_elements = project_element.getElementsByTagName('disk')
    for disk_element in disk_elements:
        if disk_element.parentNode == project_element:
            self.disk = str(disk_element.firstChild.data)
            self.disk = ''.join(self.disk.split())

    # Memory (subelement).

    memory_elements = project_element.getElementsByTagName('memory')
    for memory_element in memory_elements:
        if memory_element.parentNode == project_element:
            self.memory = int(memory_element.firstChild.data)

    # Merge (subelement).  An empty <merge> element disables merging.

    merge_elements = project_element.getElementsByTagName('merge')
    for merge_element in merge_elements:
        if merge_element.parentNode == project_element:
            if merge_element.firstChild:
                self.merge = str(merge_element.firstChild.data)
            else:
                self.merge = ''

    # Larsoft (subelement).

    larsoft_elements = project_element.getElementsByTagName('larsoft')
    if larsoft_elements:

        # Release tag (subelement).

        tag_elements = larsoft_elements[0].getElementsByTagName('tag')
        if tag_elements and tag_elements[0].firstChild != None:
            self.release_tag = str(tag_elements[0].firstChild.data)

        # Release qualifier (subelement).

        qual_elements = larsoft_elements[0].getElementsByTagName('qual')
        if qual_elements:
            self.release_qual = str(qual_elements[0].firstChild.data)

        # Local release directory or tarball (subelement).
        # A directory path is stored as local_release_dir, anything else
        # as local_release_tar.

        local_elements = larsoft_elements[0].getElementsByTagName('local')
        if local_elements:
            local = str(local_elements[0].firstChild.data)
            if larbatch_posix.isdir(local):
                self.local_release_dir = local
            else:
                self.local_release_tar = local

    # Version (subelement).  Defaults to the release tag if not given.

    version_elements = project_element.getElementsByTagName('version')
    if version_elements:
        self.version = str(version_elements[0].firstChild.data)
    else:
        self.version = self.release_tag

    # Make sure local test release directory/tarball exists, if specified.
    # Existence of non-null local_release_dir has already been tested.

    if self.local_release_tar != '' and not larbatch_posix.exists(self.local_release_tar):
        raise IOError("Local release directory/tarball %s does not exist." % self.local_release_tar)

    # Sam file type (subelement).

    file_type_elements = project_element.getElementsByTagName('filetype')
    if file_type_elements:
        self.file_type = str(file_type_elements[0].firstChild.data)

    # Sam run type (subelement).

    run_type_elements = project_element.getElementsByTagName('runtype')
    if run_type_elements:
        self.run_type = str(run_type_elements[0].firstChild.data)

    # Sam run number (subelement).

    run_number_elements = project_element.getElementsByTagName('runnumber')
    if run_number_elements:
        self.run_number = int(run_number_elements[0].firstChild.data)

    # Batch script (subelement).

    script_elements = project_element.getElementsByTagName('script')
    for script_element in script_elements:
        if script_element.parentNode == project_element:
            self.script = str(script_element.firstChild.data)

    # Make sure batch script exists, and convert into a full path
    # by resolving it with "which".

    script_path = ''
    try:
        jobinfo = subprocess.Popen(['which', self.script],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        jobout, joberr = jobinfo.communicate()
        jobout = convert_str(jobout)
        joberr = convert_str(joberr)
        rc = jobinfo.poll()
        script_path = jobout.splitlines()[0].strip()
    except:
        pass
    if script_path == '' or not larbatch_posix.access(script_path, os.X_OK):
        raise IOError('Script %s not found.' % self.script)
    self.script = script_path

    # Worker validation flag (subelement <check>).

    worker_validations = project_element.getElementsByTagName('check')
    for worker_validation in worker_validations:
        if worker_validation.parentNode == project_element:
            self.validate_on_worker = int(worker_validation.firstChild.data)

    # Worker copy-to-fts flag (subelement <copy>).

    worker_copys = project_element.getElementsByTagName('copy')
    for worker_copy in worker_copys:
        if worker_copy.parentNode == project_element:
            self.copy_to_fts = int(worker_copy.firstChild.data)

    # Start project batch script (subelement).  Specifying one forces
    # dag mode for sam input jobs.

    start_script_elements = project_element.getElementsByTagName('startscript')
    for start_script_element in start_script_elements:
        if start_script_element.parentNode == project_element:
            self.start_script = str(start_script_element.firstChild.data)
            self.force_dag = 1

    # Make sure start project batch script exists, and convert into a full path.
    # Unlike the main batch script, a missing start script is not an error
    # here (start_script may end up '').

    script_path = ''
    try:
        jobinfo = subprocess.Popen(['which', self.start_script],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        jobout, joberr = jobinfo.communicate()
        jobout = convert_str(jobout)
        joberr = convert_str(joberr)
        rc = jobinfo.poll()
        script_path = jobout.splitlines()[0].strip()
    except:
        pass
    self.start_script = script_path

    # Stop project batch script (subelement).  Specifying one forces
    # dag mode for sam input jobs.

    stop_script_elements = project_element.getElementsByTagName('stopscript')
    for stop_script_element in stop_script_elements:
        if stop_script_element.parentNode == project_element:
            self.stop_script = str(stop_script_element.firstChild.data)
            self.force_dag = 1

    # Make sure stop project batch script exists, and convert into a full path.
    # As with the start script, a missing stop script is not an error here.

    script_path = ''
    try:
        jobinfo = subprocess.Popen(['which', self.stop_script],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        jobout, joberr = jobinfo.communicate()
        jobout = convert_str(jobout)
        joberr = convert_str(joberr)
        rc = jobinfo.poll()
        script_path = jobout.splitlines()[0].strip()
    except:
        pass
    self.stop_script = script_path

    # Fcl search path (repeatable subelement).

    fclpath_elements = project_element.getElementsByTagName('fcldir')
    for fclpath_element in fclpath_elements:
        self.fclpath.append(str(fclpath_element.firstChild.data))

    # Add existing directories from $FHICL_FILE_PATH.

    if 'FHICL_FILE_PATH' in os.environ:
        for fcldir in os.environ['FHICL_FILE_PATH'].split(':'):
            if larbatch_posix.exists(fcldir):
                self.fclpath.append(fcldir)

    # Make sure all directories of fcl search path exist.

    for fcldir in self.fclpath:
        if not larbatch_posix.exists(fcldir):
            raise IOError("Fcl search directory %s does not exist." % fcldir)

    # Project stages (repeatable subelement).  Each stage's default input
    # list is the files.list of the previous stage's bookdir; the first
    # stage (empty previous-stage key) uses default_first_input_list.

    stage_elements = project_element.getElementsByTagName('stage')
    default_previous_stage = ''
    default_input_lists[default_previous_stage] = default_first_input_list
    for stage_element in stage_elements:

        # Get base stage, if any.  A stage may inherit from an earlier
        # stage of this project named by its "base" attribute.

        base_stage = None
        if 'base' in dict(stage_element.attributes):
            base_name = str(stage_element.attributes['base'].firstChild.data)
            if base_name != '':
                for stage in self.stages:
                    if stage.name == base_name:
                        base_stage = stage
                        break
                if base_stage == None:
                    raise LookupError('Base stage %s not found.' % base_name)

        self.stages.append(StageDef(stage_element,
                                    base_stage,
                                    default_input_lists,
                                    default_previous_stage,
                                    self.num_jobs,
                                    self.num_events,
                                    self.max_files_per_job,
                                    self.merge,
                                    self.cpu,
                                    self.disk,
                                    self.memory,
                                    self.validate_on_worker,
                                    self.copy_to_fts,
                                    self.script,
                                    self.start_script,
                                    self.stop_script,
                                    self.site,
                                    self.blacklist))
        default_previous_stage = self.stages[-1].name
        default_input_lists[default_previous_stage] = os.path.join(self.stages[-1].bookdir,
                                                                   'files.list')

    # Dictionary of metadata parameters.

    param_elements = project_element.getElementsByTagName('parameter')
    for param_element in param_elements:
        name = str(param_element.attributes['name'].firstChild.data)
        value = str(param_element.firstChild.data)
        self.parameters[name] = value

    # Done.

    return