def searchFiles(searchDir,opts,fileValidationSpecs,fileValidationTools,pipelineTools):
    """Walk ``searchDir`` and collect all pipeline output files matching the validation specs.

    Parameters
    ----------
    searchDir : str
        Directory tree that is recursively searched (symlinks are followed).
    opts : object
        Option namespace; only ``opts.validateOnlyLastModified`` is read.
    fileValidationSpecs : list of dict
        Each spec needs at least the keys ``regex``, ``tool`` and ``hashString``;
        the regex must provide the named group ``processId`` and (unless the
        spec itself defines one) ``frameIdx``.
    fileValidationTools : dict
        Maps a file extension (e.g. ``".h5"``) to a validation command template
        containing a ``{file}`` placeholder.
    pipelineTools : dict
        All known pipeline tools; used to sanity-check each spec's ``tool`` key.

    Returns
    -------
    dict
        Mapping ``hash -> file-info dict`` for every file whose validation
        status is not ``"invalid"``; empty dict when nothing matched.

    Raises
    ------
    ValueError
        On malformed specs (bad regex, missing ``tool``/``hashString``/
        ``frameIdx`` keys), non-integer process ids or duplicate file hashes.
    """
    # compile all regexes up-front so a broken spec fails early
    regexes = {}
    for i, spec in enumerate(fileValidationSpecs):
        try:
            regexes[i] = re.compile(spec["regex"])
        except (re.error, KeyError, TypeError) as e:
            raise ValueError("Could not compile regex: %s" % spec["regex"]) from e

    allFiles = {}
    filesPerProc = {}

    # walk the directory tree and match every file against the specs
    for dirpath, dirs, files in os.walk(searchDir, followlinks=True):
        for file in files:
            filePath = os.path.realpath(os.path.join(dirpath, file))
            # try to match path with all regexes till one matches
            for specIdx, spec in enumerate(fileValidationSpecs):
                m = regexes[specIdx].search(filePath)
                if not m:
                    continue
                # we have a file match
                try:
                    processId = int(m.group("processId"))
                except (IndexError, ValueError, TypeError) as e:
                    raise ValueError("Non convertable processId found in filePath %s" % filePath) from e
                if processId not in filesPerProc:
                    filesPerProc[processId] = {"allFiles": [],
                                               "tools": {tool: [] for tool in pipelineTools.keys()}}

                # make dict for this file: regex groups first, then the spec
                # values (deep copy since we want a different one per file)
                f = {}
                f.update(m.groupdict())
                f.update(copy.deepcopy(spec))
                # set file status to finished (initial guess; validation below
                # fully determines this value)
                f.update({"status": "finished"})
                # format all values again with the regex results
                f = cF.formatAll(f, m.groupdict(), exceptKeys={"regex": None})

                # get tool of this file; it must be a known pipeline tool
                if "tool" in f:
                    tool = f["tool"]
                    if tool not in pipelineTools.keys():
                        raise ValueError("The tool %s is not in %s!" % (tool, str(pipelineTools.keys())))
                else:
                    raise ValueError("You need to define a 'tool' key for %s " % str(spec))

                # make the unique hash for this file
                if "hashString" in spec:
                    h = cF.makeUUID(spec["hashString"].format(**m.groupdict()))
                    f["hash"] = h
                else:
                    raise ValueError("You need to define a 'hash' key for file %s " % str(spec))

                # convert frameIdx
                if "frameIdx" in f:
                    f["frameIdx"] = int(f["frameIdx"])
                else:
                    raise ValueError("You need to define a 'frameIdx' key for %s (or in regex!) " % str(spec))

                # register the file per process and globally (hashes must be unique)
                filesPerProc[processId]["allFiles"].append(f)
                filesPerProc[processId]["tools"][tool].append(f)
                if f["hash"] not in allFiles:
                    allFiles[f["hash"]] = f
                else:
                    raise ValueError("Found files with the same hash %s, %s, this should not happen!"
                                     % (f["absPath"], allFiles[f["hash"]]["absPath"]))
                break

    if not allFiles:
        print("We found no files in folder: %s to validate!" % searchDir)
        return allFiles

    # sort files according to modified time of the output files,
    # for each tool and each process
    for procId, procFiles in filesPerProc.items():
        for tool, files in procFiles["tools"].items():
            filesPerProc[procId]["tools"][tool] = sorted(
                files, key=lambda file: os.path.getmtime(file["absPath"]))

    # determine files to validate
    filesToValidate = []
    for procId, procFiles in filesPerProc.items():
        if opts.validateOnlyLastModified:
            # validate only the last-modified file of each tool per processor;
            # all older files are considered valid
            for tool, toolFiles in procFiles["tools"].items():
                if toolFiles:
                    filesToValidate.append(toolFiles[-1])
        else:
            filesToValidate += procFiles["allFiles"]

    # validate all selected files with the command registered for their
    # extension; any failure along the way marks the file as invalid
    for fIdx, file in enumerate(filesToValidate):
        try:
            ext = os.path.splitext(file["absPath"])[1]
            try:
                validateCmd = fileValidationTools[ext]
            except KeyError:
                print("No validation command found for extentsion of file: %s" % file["absPath"])
                raise
            validateCmd = validateCmd.format(**{"file": file["absPath"]})
            try:
                out = subprocess.check_output(validateCmd.split(" ")).decode('utf-8')
            except (OSError, subprocess.CalledProcessError):
                print("Validation command %s failed!" % validateCmd)
                raise
            # NOTE(review): the command output is compared verbatim; a trailing
            # newline would mark every file invalid — confirm the validation
            # commands emit the bare status word.
            if out not in ["finished", "recover"]:
                print("Validation output %s not in list ['finished','recover']" % out)
                # original code used a bare `raise` here (RuntimeError); an
                # explicit exception reaches the same handler below
                raise ValueError("unexpected validation output")
            validationAttributes = {"status": out}
            filesToValidate[fIdx].update(validationAttributes)
        except Exception:
            # file is invalid; record that instead of aborting the whole search
            filesToValidate[fIdx]["status"] = "invalid"

    print("Validated last files of each tool in the pipeline: ",
          "\n".join([f["absPath"] + " --> " + f["status"] for f in filesToValidate]))

    # filter all invalid files out of the result
    allFiles = dict(filter(lambda x: x[1]["status"] != "invalid", allFiles.items()))
    return allFiles
def recoverFrames(opts,allFrames,framesPerIdx, pipelineTools):
    """Set up frame recovery from a previously written validation file info.

    Loads the checkpoint-file list referenced by ``opts.validationFileInfo``,
    matches those checkpoint files (by hash) against the output files of every
    tool of every frame, marks tools whose outputs all validated as finished,
    and finally schedules the file moves needed to recover each frame.
    Does nothing when ``opts.validationFileInfo`` is falsy.

    Parameters
    ----------
    opts : object
        Option namespace; only ``opts.validationFileInfo`` is read.
    allFrames : iterable of dict
        All frame dicts; each provides ``tools`` and ``fileMover`` entries.
    framesPerIdx : dict
        Maps frame index -> frame dict (same frame objects as ``allFrames``).
    pipelineTools : dict
        Tool name -> tool description with a ``dependencies`` list.

    Raises
    ------
    ValueError
        If two checkpoint files share a hash, or a finished tool lacks a
        checkpoint file for one of its output files.
    """

    def addFile(frame, file, parent=None):
        # Schedule the file move that recovers `file` for `frame`, at most once.
        if "usedFile" in file and file["usedFile"]:
            # this file has already been used
            return
        if file["status"] == STATUS_RECOVER:
            print("added File: %s (recover)" % file["relPath"])
            # add a file move to recover this file
            frame["fileMover"].append(file["fileMoveRecover"])
            # mark file as used
            file["usedFile"] = True
        elif file["status"] == STATUS_FINISHED:
            # finished files are only moved when a dependent (parent) tool needs them
            if parent:
                print("added File: %s (finished, dependent)" % file["relPath"])
                frame["fileMover"].append(file["fileMoveDependent"])
                file["usedFile"] = True

    def addTool(frame, toolName, visitedTools, parentToolName=None):
        # Depth-first walk over the tool dependency graph; each tool is visited once.
        if toolName in visitedTools:
            return
        visitedTools.add(toolName)
        frameTool = frame["tools"][toolName]
        if frameTool["status"] != STATUS_FINISHED:
            # tool not finished: add all of its existing checkpoint files ...
            for outFileProp in frameTool["outputFiles"]:
                if outFileProp["cpFile"] is not None:
                    addFile(frame, outFileProp["cpFile"], parentToolName)
            # ... and recurse into all tools it depends on
            depTools = pipelineTools[toolName]["dependencies"]
            if depTools:
                for depTool in depTools:
                    addTool(frame, depTool, visitedTools, toolName)
        elif parentToolName:
            # tool is finished, but a parent tool (always not finished) depends
            # on it: its finished checkpoint files must exist and are added
            for outFileProp in frameTool["outputFiles"]:
                if outFileProp["cpFile"] is None:
                    raise ValueError("""Trying to add non existant checkpoint file of output file %s in tool %s!""" % (str(outFileProp), toolName))
                addFile(frame, outFileProp["cpFile"], parentToolName)
        # else: finished and no parent given -> nothing to do

    # get all file info
    if opts.validationFileInfo:
        print("Setup recovery from file info===============================")
        # FIX: original passed the path as a second print() argument, which
        # printed a tuple with a literal '%s' instead of formatting it
        print("Using validation file: %s" % opts.validationFileInfo)
        checkpointFiles = cF.jsonLoad(opts.validationFileInfo)

        # index the checkpoint files by their (required unique) hash
        cpFiles = {"hash": {}, "all": []}
        for f in checkpointFiles:
            ha = f["hash"]
            cpFiles["all"].append(f)
            if ha in cpFiles["hash"]:
                raise ValueError("File %s and %s have the same hash!"
                                 % (f["absPath"], cpFiles["hash"][ha]["absPath"]))
            cpFiles["hash"][ha] = f

        print("===========================================================")
        print("Determine status of all tools =============================")
        # move over all frames, match every tool's output files against the
        # checkpoint files and flag tools whose outputs are all finished
        for frameIdx, frame in framesPerIdx.items():
            for toolName, tool in frame["tools"].items():
                finishedOutFiles = 0
                for outFileProp in tool["outputFiles"]:
                    ha = cF.makeUUID(outFileProp["hashString"])
                    outFileProp["hash"] = ha
                    if ha in cpFiles["hash"]:
                        # we found a matching checkpoint file
                        cpFile = cpFiles["hash"][ha]
                        absP = cpFile["absPath"]
                        print("Frame: %i " % frameIdx
                              + " checkpoint file matched:\n\t%s\n\thash: %s\n\tstatus: %s "
                              % (absP[:10] + '...' + absP[-20:] if len(absP) > 70 else absP,
                                 ha, cpFile["status"]))
                        outFileProp["cpFile"] = cpFile
                        if outFileProp["cpFile"]["status"] == STATUS_FINISHED:
                            finishedOutFiles += 1
                    else:
                        outFileProp["cpFile"] = None
                # a tool is finished iff all of its output files are finished
                if finishedOutFiles == len(tool["outputFiles"]):
                    tool["status"] = STATUS_FINISHED
                    print("Tool: %s -> finished" % toolName)

        # NOTE(review): a disabled "dependency check" pass used to live here; it
        # walked each frame's file dependencies and silently dropped frames that
        # could not be recovered. See version control history if it is needed.

        # setup recovery for all frames
        print("Setup pipeline tools with file info ========================")
        for frame in allFrames:
            # walk all tools in pipeline (visit each only once!)
            for tool in pipelineTools.keys():
                addTool(frame, tool, set())
        print("===============================================================")