def update_jobs():

    # Update the global job list by querying jobsub_q.

    global jobs, server

    command = ['jobsub_q']
    if server is not None:
        command.append('--jobsub-server=%s' % server)
    command.append('--group=%s' % project_utilities.get_experiment())
    command.append('--user=%s' % project_utilities.get_user())
    command.append('--role=%s' % project_utilities.get_role())

    # Run jobsub_q and capture its output as text
    # (universal_newlines so communicate() returns str, not bytes).

    jobinfo = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
    jobout, joberr = jobinfo.communicate()
    rc = jobinfo.poll()
    if rc != 0:
        #raise JobsubError(command, rc, jobout, joberr)
        # Simply return in case jobsub_q fails.
        return
    jobs = jobout.split('\n')
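# Usage sketch for update_jobs (illustrative only; assumes the module-level
# globals 'server' and 'jobs' defined elsewhere in this script, plus a working
# jobsub setup on the node):
#
#     server = None      # fall back to the default jobsub server
#     update_jobs()      # repopulates the global 'jobs' list from jobsub_q
#     njobs = len([j for j in jobs if j.strip() != ''])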
def getmetadata(inputfile):

    # Set up the samweb Python API client for this experiment.

    samweb = samweb_cli.SAMWebClient(experiment=project_utilities.get_experiment())

    # Extract metadata into a pipe
    # (universal_newlines so readlines() returns str lines, not bytes).

    local = project_utilities.path_to_local(inputfile)
    if local != '':
        proc = subprocess.Popen(["sam_metadata_dumper", "-H", local],
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
    else:
        url = project_utilities.path_to_url(inputfile)
        proc = subprocess.Popen(["sam_metadata_dumper", "-H", url],
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
    lines = proc.stdout.readlines()
    if local != '' and local != inputfile:
        os.remove(local)

    # Count the number of lines of output (used below to skip header and footer lines).

    num_lines = len(lines)

    # Define an empty python dictionary for the metadata.

    md = {}

    # Read the columns from the output and fill the dictionary.

    c = 0
    parents = []
    PName = False
    gen = False

    for line in lines:
        c = c + 1
        columns = line.split(" ")
        columns = [col.strip() for col in columns]
        if c >= 4 and c <= num_lines - 2:
            if columns[1] == 'dataTier':
                md['data_tier'] = columns[-1]
                if columns[-1] == 'generated':
                    gen = True
            elif columns[1] == 'endTime':
                E = time.localtime(int(columns[-1]))
                md['end_time'] = str(E[0]) + '-' + str(E[1]) + '-' + str(E[2]) + 'T' + \
                                 str(E[3]) + ':' + str(E[4]) + ':' + str(E[5])
            elif columns[1] == 'startTime':
                S = time.localtime(int(columns[-1]))
                md['start_time'] = str(S[0]) + '-' + str(S[1]) + '-' + str(S[2]) + 'T' + \
                                   str(S[3]) + ':' + str(S[4]) + ':' + str(S[5])
            elif columns[1] == 'group':
                md['group'] = columns[-1]
            elif columns[1] == 'eventCount':
                md['event_count'] = columns[-1]
            elif columns[1] == 'fclName':
                md['fcl.name'] = columns[-1]
            elif columns[1] == 'fclVersion':
                md['fcl.version'] = columns[-1]
            elif columns[1] == 'fileFormat':
                md['file_format'] = columns[-1]
            elif columns[1] == 'ubProjectStage':
                md['ub_project.stage'] = columns[-1]
            elif columns[1] == 'ubProjectVersion':
                md['ub_project.version'] = columns[-1]
            elif columns[1] == 'lastEvent':
                md['last_event'] = columns[-1]
            elif columns[1] == 'firstEvent':
                md['first_event'] = columns[-1]
            elif columns[1] == 'fileType':
                md['file_type'] = columns[-1]
            elif columns[1] == 'run':
                run = columns[-1]
            elif columns[1] == 'runType':
                run_type = columns[-1]
            elif columns[1] == 'applicationFamily':
                app_family = columns[-1]
            elif columns[1] == 'applicationVersion':
                app_version = columns[-1]
            elif columns[1] == 'process_name':
                app_name = columns[-1]
            elif columns[1] == 'ubProjectName':
                PName = True
                md['ub_project.name'] = columns[-1]
            elif columns[1] == 'parent':
                parents.append({'file_name': columns[-1]})

    # Get the other metadata field parameters.

    md['file_name'] = inputfile.split("/")[-1]
    md['file_size'] = os.path.getsize(inputfile)

    # For now, skip the checksum for dCache files.

    md['crc'] = root_metadata.fileEnstoreChecksum(inputfile)
    md['runs'] = [[run, run_type]]
    md['application'] = {'family': app_family,
                         'name': app_name,
                         'version': app_version}
    md['parents'] = parents

    # If ub_project.name is not in the internal metadata:
    # for generator files, derive it from the fcl file name (without the '.fcl' extension);
    # for all other stages, inherit it from the first parent.

    if gen:
        md['parents'] = []
        if not PName:
            md['ub_project.name'] = md['fcl.name'].split(".fcl")[0]
    else:
        if not PName:
            if 'parents' in md:
                parent = md['parents'][0]['file_name']
                mdparent = samweb.getMetadata(parent)
                if 'ub_project.name' in mdparent:
                    md['ub_project.name'] = mdparent['ub_project.name']

    return md
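# Usage sketch for getmetadata (illustrative only; the input path below is
# hypothetical, and a working sam_metadata_dumper / samweb environment is
# assumed):
#
#     md = getmetadata('/pnfs/uboone/scratch/users/someone/example_gen.root')
#     print(md['file_name'], md.get('event_count'), md.get('ub_project.name'))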
def main():

    ana = 0
    nproc = 0

    import_samweb()

    # Parse arguments.

    checkdir = ''
    logdir = ''
    outdir = ''
    declare_file = 0
    copy_to_dropbox = 0
    maintain_parentage = 0
    data_file_types = []
    args = sys.argv[1:]

    while len(args) > 0:
        if args[0] == '--dir' and len(args) > 1:
            checkdir = args[1]
            del args[0:2]
        elif args[0] == '--logfiledir' and len(args) > 1:
            logdir = args[1]
            del args[0:2]
        elif args[0] == '--outdir' and len(args) > 1:
            outdir = args[1]
            del args[0:2]
        elif args[0] == '--declare' and len(args) > 1:
            declare_file = int(args[1])
            del args[0:2]
        elif args[0] == '--copy' and len(args) > 1:
            copy_to_dropbox = int(args[1])
            del args[0:2]
        elif args[0] == '--maintain_parentage' and len(args) > 1:
            maintain_parentage = int(args[1])
            del args[0:2]
        elif args[0] == '--data_file_type' and len(args) > 1:
            data_file_types.append(args[1])
            del args[0:2]
        else:
            print('Unknown option %s' % args[0])
            return 1

    # Add default data_file_types.

    if len(data_file_types) == 0:
        data_file_types.append('root')

    status = 0    # Global status code to tell us everything is ok.

    print("Do declaration in job: %d" % declare_file)

    # Check lar exit status (if any).

    stat_filename = os.path.join(logdir, 'lar.stat')
    if project_utilities.safeexist(stat_filename):
        try:
            status = int(project_utilities.saferead(stat_filename)[0].strip())
            if status != 0:
                print('Job in subdirectory %s ended with non-zero exit status %d.' % (
                    checkdir, status))
                status = 1
        except:
            print('Bad file lar.stat in subdirectory %s.' % checkdir)
            status = 1

    if checkdir == '':
        print('No directory specified (use the --dir option.) Exiting.')
        return 1
    if logdir == '':
        print('No log file directory specified (use the --logfiledir option.) Exiting.')
        return 1

    nevts, rootfiles, hists = check_root(checkdir, logdir, data_file_types)

    # Set flag to do analysis-style validation if all of the following are true:
    #
    # 1. There is at least one valid histogram file.
    # 2. The total number of artroot files and artroot events is zero.

    if len(hists) > 0 and len(rootfiles) == 0 and nevts <= 0:
        ana = 1

    if not ana:
        if len(rootfiles) == 0 or nevts < 0:
            print('Problem with root file(s) in %s.' % checkdir)
            status = 1
    elif nevts < -1 or len(hists) == 0:
        print('Problem with analysis root file(s) in %s.' % checkdir)
        status = 1

    # Then we need to loop over rootfiles and hists because those are good.
    # Then we could make a list of those and check that the file in question for
    # declaration is in that list. Also require that the lar exit code is good for
    # declaration.

    validate_list = open('validate.list', 'w')
    file_list = open('files.list', 'w')
    ana_file_list = open('filesana.list', 'w')

    events_list = open('events.list', 'w')

    # These will be empty if the checks succeed.

    bad_list = open('bad.list', 'w')
    missing_list = open('missing_files.list', 'w')

    # Print summary.

    if ana:
        print("%d processes completed successfully." % nproc)
        print("%d total good histogram files." % len(hists))
    else:
        print("%d total good events." % nevts)
        print("%d total good root files." % len(rootfiles))
        print("%d total good histogram files." % len(hists))

    file_list_stream = {}

    # Generate bookkeeping files pertaining to artroot files.

    for rootfile in rootfiles:

        rootpath = rootfile[0]
        nev = rootfile[1]
        streamname = rootfile[2]

        # Make sure root file names do not exceed 200 characters.

        rootname = os.path.basename(rootpath)
        if len(rootname) >= 200:
            print('Filename %s in subdirectory %s is longer than 200 characters.' % (
                rootname, outdir))
            status = 1

        if streamname not in file_list_stream:
            file_list_stream[streamname] = open('files_%s.list' % streamname, 'w')
        validate_list.write(rootpath + '\n')
        file_on_scratch = os.path.join(outdir, os.path.basename(rootpath))
        file_list.write(file_on_scratch + '\n')
        file_list_stream[streamname].write(file_on_scratch + '\n')
        events_list.write('%s %d \n' % (file_on_scratch, nev))

    # Generate bookkeeping files pertaining to analysis files.

    for histfile in hists:
        validate_list.write(histfile + '\n')
        file_on_scratch = os.path.join(outdir, os.path.basename(histfile))
        ana_file_list.write(file_on_scratch + '\n')

    validate_list.close()
    file_list.close()
    ana_file_list.close()
    for streamname in list(file_list_stream.keys()):
        file_list_stream[streamname].close()
    events_list.close()

    # Decide at this point if all the checks are ok. Write to missing_files.list first.

    missing_list.write('%d \n' % status)

    if status == 0:
        bad_list.close()

        # Begin SAM declaration.

        if declare_file:

            # Declare artroot files.

            for rootfile in rootfiles:

                rootpath = rootfile[0]
                fn = os.path.basename(rootpath)
                declare_ok = False

                # Decide if we need to declare this file.
                # It is OK if the file is already declared.
                # In that case, do not try to declare it again.

                try:
                    md = samweb.getMetadata(fn)
                    if len(md) > 0:
                        declare_ok = True
                        print('File %s is already declared.' % fn)
                except:
                    declare_ok = False

                if not declare_ok:
                    print('Declaring %s' % fn)
                    expSpecificMetaData = expMetaData(project_utilities.get_experiment(), rootpath)
                    md = expSpecificMetaData.getmetadata()

                    # Decide if we want to override the internal parentage metadata.

                    if maintain_parentage == 1:

                        # Delete the old parents, if any.

                        if 'parents' in md:
                            del md['parents']

                        # Change the parentage of the file based on its parents and aunts
                        # from condor_lar.

                        jobs_parents = os.getenv('JOBS_PARENTS', '').split(" ")
                        jobs_aunts = os.getenv('JOBS_AUNTS', '').split(" ")
                        if jobs_parents[0] != '':
                            md['parents'] = [{'file_name': parent} for parent in jobs_parents]
                        if jobs_aunts[0] != '':
                            for aunt in jobs_aunts:
                                mixparent_dict = {'file_name': aunt}
                                if 'parents' not in md:
                                    md['parents'] = []
                                md['parents'].append(mixparent_dict)

                    if len(md) > 0:
                        project_utilities.test_kca()

                        # Make lack of parent files a nonfatal error.
                        # This should probably be removed at some point.

                        try:
                            samweb.declareFile(md=md)
                            declare_ok = True
                        except samweb_cli.exceptions.SAMWebHTTPError as e:
                            print(e)
                            print('SAM declare failed.')
                            return 1
                        except:
                            print('SAM declare failed.')
                            return 1
                    else:
                        print('No sam metadata found for %s.' % fn)
                        declare_ok = False
                        status = 1

                if copy_to_dropbox == 1 and declare_ok:
                    print("Copying to Dropbox")
                    dropbox_dir = project_utilities.get_dropbox(fn)
                    rootPath = os.path.join(dropbox_dir, fn)
                    jsonPath = rootPath + ".json"
                    ifdh_cp(rootpath, rootPath)

            # Declare histogram files.

            for histpath in hists:

                declare_ok = False
                fn = os.path.basename(histpath)

                # Decide if we need to declare this file.
                # It is OK if the file is already declared.
                # In that case, do not try to declare it again.

                try:
                    md = samweb.getMetadata(fn)
                    if len(md) > 0:
                        declare_ok = True
                        print('File %s is already declared.' % fn)
                except:
                    declare_ok = False

                if not declare_ok:
                    print('Declaring %s' % fn)
                    json_file = os.path.join(logdir, fn + '.json')

                    # Get metadata from the json file.

                    md = {}
                    if project_utilities.safeexist(json_file):
                        mdlines = project_utilities.saferead(json_file)
                        mdtext = ''
                        for line in mdlines:
                            mdtext = mdtext + line
                        try:
                            md = json.loads(mdtext)
                        except:
                            md = {}

                    if maintain_parentage == 1:

                        # Delete the old parents, if any.

                        if 'parents' in md:
                            del md['parents']

                        # Change the parentage of the file based on its parents and aunts
                        # from condor_lar.

                        jobs_parents = os.getenv('JOBS_PARENTS', '').split(" ")
                        jobs_aunts = os.getenv('JOBS_AUNTS', '').split(" ")
                        if jobs_parents[0] != '':
                            md['parents'] = [{'file_name': parent} for parent in jobs_parents]
                        if jobs_aunts[0] != '':
                            for aunt in jobs_aunts:
                                mixparent_dict = {'file_name': aunt}
                                if 'parents' not in md:
                                    md['parents'] = []
                                md['parents'].append(mixparent_dict)

                    if len(md) > 0 and 'file_type' in md:
                        project_utilities.test_kca()

                        # Make lack of parent files a nonfatal error.
                        # This should probably be removed at some point.

                        try:
                            samweb.declareFile(md=md)
                            declare_ok = True
                        except samweb_cli.exceptions.SAMWebHTTPError as e:
                            print(e)
                            print('SAM declare failed.')
                            declare_ok = False
                        except:
                            print('SAM declare failed.')
                            declare_ok = False
                    else:
                        print('No sam metadata found for %s.' % fn)
                        declare_ok = False

                if copy_to_dropbox == 1 and declare_ok:
                    print("Copying to Dropbox")
                    dropbox_dir = project_utilities.get_dropbox(fn)
                    rootPath = dropbox_dir + "/" + fn
                    jsonPath = rootPath + ".json"
                    ifdh_cp(histpath, rootPath)

        return status

    else:

        # Something went wrong, so make a list of bad directories and potentially
        # missing files. First get the subdir name on pnfs; this contains the job id.

        dir_on_scratch = os.path.basename(outdir)
        print('Dir on scratch ' + dir_on_scratch)
        bad_list.write('%s \n' % dir_on_scratch)
        bad_list.close()
        return status
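# Example invocation of this validation script (sketch only; the script name,
# directories, and option values below are placeholders, not tested commands):
#
#     python validate_in_job.py \
#         --dir /path/to/job/output \
#         --logfiledir /path/to/job/logs \
#         --outdir /pnfs/experiment/scratch/users/someone/output \
#         --declare 1 --copy 0 --maintain_parentage 1 --data_file_type root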