def main():
    """Report which files to invalidate for the dataset named in an input file.

    Reads the file given as ``sys.argv[1]``, takes the dataset name from a
    ``dataset : <name>`` line among its first three lines, builds a graph
    from all the lines with ``buildGraph`` and selects the files to
    invalidate. It then prints the dataset's total event count, the number
    of selected files, their event count, the percentage of events that
    would remain, and the sorted list of selected files.

    Exits with an error message when no dataset line is found.
    """
    # Read through a context manager so the handle is closed right away.
    with open(sys.argv[1]) as handle:
        lines = [line.strip() for line in handle]

    # Look for the dataset name; slicing (rather than indexing 0..2) keeps
    # this safe for files shorter than three lines.
    dataset = None
    for line in lines[:3]:
        if line.startswith('dataset'):
            dataset = line.replace('dataset : ', '').strip()
            break
    if dataset is None:
        sys.exit("No 'dataset' line found in the first three lines of %s"
                 % sys.argv[1])
    # Single-argument parenthesized print behaves the same as a print
    # statement on Python 2 and as a function call on Python 3.
    print("'%s'" % dataset)

    # Build graph and calculate.
    graph = buildGraph(lines)
    try:
        files = colorBipartiteGraph(graph)
    except Exception:
        # Deliberate fallback: any failure of the first algorithm switches
        # to the max-degree-first one (presumably the graph was not
        # bipartite -- confirm against colorBipartiteGraph).
        files = deleteMaxDegreeFirst(graph)

    total = dbs.getEventCountDataSet(dataset)
    invalid = dbs.getEventCountDataSetFileList(dataset, files)

    print('total events %s' % total)
    print('invalidated files %s' % len(files))
    print('invalidated events %s' % invalid)
    # Skip the percentage when there are no events, avoiding a division
    # by zero.
    if total:
        print('%s%%' % (float(total - invalid) / total * 100.0))
    for f in sorted(files):
        print(f)
def main():
    """Report which files to invalidate for the dataset named in an input file.

    Reads the file given as ``sys.argv[1]``, takes the dataset name from a
    ``dataset : <name>`` line among its first three lines, builds a graph
    from all the lines with ``buildGraph``, fetches the event count of every
    graph key with ``getFileEvents`` and selects the files to invalidate.
    It then prints the dataset's total event count, the number of selected
    files, their event count, the percentage of events that would remain,
    and the sorted list of selected files.

    Exits with an error message when no dataset line is found.
    """
    # Read through a context manager so the handle is closed right away.
    with open(sys.argv[1]) as handle:
        lines = [line.strip() for line in handle]

    # Look for the dataset name; slicing (rather than indexing 0..2) keeps
    # this safe for files shorter than three lines.
    dataset = None
    for line in lines[:3]:
        if line.startswith('dataset'):
            dataset = line.replace('dataset : ', '').strip()
            break
    if dataset is None:
        sys.exit("No 'dataset' line found in the first three lines of %s"
                 % sys.argv[1])
    # Single-argument parenthesized print behaves the same as a print
    # statement on Python 2 and as a function call on Python 3.
    print("'%s'" % dataset)

    print("Building graph model")
    graph = buildGraph(lines)
    print("Getting events per file")
    events = getFileEvents(dataset, graph.keys())
    try:
        # First algorithm, which assumes bipartition.
        files = colorBipartiteGraph(graph, events)
    except Exception:
        # Deliberate fallback: any failure of the first algorithm switches
        # to the second one (presumably the graph was not bipartite --
        # confirm against colorBipartiteGraph).
        files = deleteSmallestVertexFirst(graph, events)

    total = dbs.getEventCountDataSet(dataset)
    invalid = dbs.getEventCountDataSetFileList(dataset, files)

    print('total events %s' % total)
    print('invalidated files %s' % len(files))
    print('invalidated events %s' % invalid)
    # Skip the percentage when there are no events, avoiding a division
    # by zero.
    if total:
        print('%s%%' % (float(total - invalid) / total * 100.0))
    for f in sorted(files):
        print(f)
def getFileEvents(dataset, files):
    """
    Maps every file to its number of events.

    The count for each file comes from one call to
    dbs.getEventCountDataSetFileList with a single-element file list.
    Returns a dict file -> num events (empty when no files are given).
    """
    return {
        name: dbs.getEventCountDataSetFileList(dataset, [name])
        for name in files
    }
def main():
    """Report which files to invalidate for a dataset.

    ``sys.argv[1]`` is the dataset name and ``sys.argv[2]`` the path of the
    input file. A graph is built from the stripped lines of that file with
    ``buildGraph`` and the files to invalidate are chosen with
    ``deleteMaxDegreeFirst``. Prints the dataset's total event count, the
    number of selected files, their event count, the percentage of events
    that would remain, and the selected files in the order returned.
    """
    dataset = sys.argv[1]
    # Read through a context manager so the handle is closed right away.
    with open(sys.argv[2]) as handle:
        lines = [line.strip() for line in handle]

    graph = buildGraph(lines)
    files = deleteMaxDegreeFirst(graph)

    total = dbs.getEventCountDataSet(dataset)
    invalid = dbs.getEventCountDataSetFileList(dataset, files)

    # Single-argument parenthesized print behaves the same as a print
    # statement on Python 2 and as a function call on Python 3.
    print('total events %s' % total)
    print('invalidated files %s' % len(files))
    print('invalidated events %s' % invalid)
    # A dataset with zero events would otherwise raise ZeroDivisionError.
    if total:
        print('%s%%' % (float(total - invalid) / total * 100.0))
    for f in files:
        print(f)
def main():
    """Report, per dataset, which files to invalidate.

    Two modes, selected on the command line:

    * ``--file`` without ``--doall``: the file's stripped lines are handed
      to ``buildGraphs``, whose result is iterated as dataset -> graph.
    * ``--doall``: workflow names are read from ``--file`` (one per line)
      or, without ``--file``, taken from the positional arguments. The
      output datasets of every workflow are looked up, and a graph is
      built for each from ``dbs.duplicateRunLumi``.

    For every dataset the files to invalidate are selected, then the total
    event count, the number of selected files, their event count, the
    percentage of events that would remain and the sorted file list are
    printed.

    Ends through ``parser.error`` when neither mode has usable input.
    """
    usage = "python %prog [OPTIONS]"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", "--doall", dest="doall", action="store_true", default=False,
        help=("It will analyze all datasets of the workflow from the beginning. If this option is true,"
              " you should provide a workflow name or a list of them in the --file option."))
    parser.add_option(
        "-f", "--file", dest="file",
        help=("Input file with the contents of duplicateEvents.py (a list of lumis and files)."
              " If you are using the --doall option, it should contain a list of workflows instead"))

    options, args = parser.parse_args()
    workflows = None

    if not options.doall and options.file:
        # Not doing all: the input is treated as a list of lumis and files.
        with open(options.file) as handle:
            lines = [line.strip() for line in handle]
        graphs = buildGraphs(lines)
    elif options.doall:
        if options.file:
            # The input file holds workflow names; blank lines are dropped
            # so they are not passed on as workflow names.
            with open(options.file) as handle:
                workflows = [line.strip() for line in handle if line.strip()]
        else:
            workflows = args
        # Without this check an empty list would skip the graph building
        # below and leave `graphs` undefined.
        if not workflows:
            parser.error("You should provide a workflow name or a file with"
                         " a list of workflows when using --doall")
    else:
        parser.error("You should provide an input file with the output of duplicateEvents")

    # Get the output datasets of the workflows and create the graphs.
    if workflows:
        datasets = []
        for wf in workflows:
            # `url` is not defined in this function; it is expected to be
            # available from the enclosing module.
            datasets += reqMgrClient.outputdatasetsWorkflow(url, wf)

        graphs = {}
        # Analyze each dataset.
        for dataset in datasets:
            _dup, lumis = dbs.duplicateRunLumi(dataset, verbose="dict",
                                               skipInvalid=True)
            graphs[dataset] = buildGraph(lumis)

    # NOTE(review): the dataset name is not printed, so with several
    # datasets the reports cannot be told apart -- confirm whether a
    # header line should be added.
    for dataset, graph in graphs.items():
        # Single-argument parenthesized print behaves the same as a print
        # statement on Python 2 and as a function call on Python 3.
        print("Getting events per file")
        events = getFileEvents(dataset, graph.keys())
        try:
            # First algorithm, which assumes bipartition.
            files = colorBipartiteGraph(graph, events)
        except Exception:
            # Deliberate fallback: any failure of the first algorithm
            # switches to the second one (presumably the graph was not
            # bipartite -- confirm against colorBipartiteGraph).
            files = deleteSmallestVertexFirst(graph, events)

        total = dbs.getEventCountDataSet(dataset)
        invalid = dbs.getEventCountDataSetFileList(dataset, files)

        print('total events %s' % total)
        print('invalidated files %s' % len(files))
        print('invalidated events %s' % invalid)
        # Skip the percentage when there are no events, avoiding a
        # division by zero.
        if total:
            print('%s%%' % (float(total - invalid) / total * 100.0))
        for f in sorted(files):
            print(f)