id = Column(GUID(), primary_key=True) rse = Column(String) if "-n" in opts: nparts = int(opts["-n"]) else: nparts = config.nparts(rse_name) or 1 if nparts > 1: if out_prefix is None: print("Output file path must be specified if partitioning is requested") sys.exit(1) out_list = None if out_prefix is not None: out_list = PartitionedList.create(nparts, out_prefix, zout) subdir = config.dbdump_root(rse_name) or "/" if not subdir.endswith("/"): subdir = subdir + "/" _, ignore_file_patterns = config.ignore_lists(rse_name) engine = create_engine(dbconfig.DBURL, echo=verbose) Session = sessionmaker(bind=engine) session = Session() rse = session.query(RSE).filter(RSE.rse == rse_name).first() if rse is None: print ("RSE %s not found" % (rse_name,)) sys.exit(1)
def main():
    """Filter, rewrite and re-partition a list of paths.

    Command line: ``[-n <nparts>] [-c <config> -r <rse>] [-z] -o <out_prefix> <input file> ...``

    The input files are opened as one ``PartitionedList``; each path is passed
    through the optional per-RSE "preprocess" rules read from the config
    (``starts_with``, ``ignore_list``, ``filter``, ``remove_prefix``,
    ``add_prefix``, ``rewrite``) and the surviving paths are added to a new
    partitioned list created under ``out_prefix``.  The number of written
    entries (``NWritten``) is printed at the end.

    Exits with status 2 on usage errors and with status 1 when a path does
    not satisfy the ``remove_prefix`` or ``rewrite`` rule.
    """
    opts, args = getopt.getopt(sys.argv[1:], "n:o:c:qr:z")
    opts = dict(opts)

    if not args or "-o" not in opts:
        print(Usage)
        sys.exit(2)

    nparts = None
    out_prefix = opts["-o"]
    rewrite_match = rewrite_out = filter_in = None
    remove_prefix = add_prefix = starts_with = None
    ignore_list = []

    if "-c" in opts:
        # The per-RSE configuration cannot be looked up without the RSE name;
        # report a usage error instead of failing with a KeyError.
        if "-r" not in opts:
            print("RSE name must be specified with -r when -c <config> is used")
            print(Usage)
            sys.exit(2)
        rse = opts["-r"]
        config = Config(opts["-c"])
        preprocess = config.rse_param(rse, "preprocess")
        ignore_list = config.rse_param(rse, "ignore_list") or []
        if preprocess is not None:
            # An ignore list given inside "preprocess" overrides the RSE-level one.
            ilist = preprocess.get("ignore_list")
            if ilist is not None:
                ignore_list = ilist
            filter_in = preprocess.get("filter")
            if filter_in is not None:
                filter_in = re.compile(filter_in)
            starts_with = preprocess.get("starts_with")
            remove_prefix = preprocess.get("remove_prefix")
            add_prefix = preprocess.get("add_prefix")
            rewrite = preprocess.get("rewrite", {})
            if rewrite:
                rewrite_match = re.compile(rewrite["match"])
                rewrite_out = rewrite["out"]
        nparts = config.nparts(rse)

    zout = "-z" in opts

    # -n takes precedence over the configured value.  Convert only when a
    # value is actually present: int(None) would raise TypeError and make the
    # "not specified" check below unreachable.
    if "-n" in opts:
        nparts = int(opts["-n"])
    elif nparts is not None:
        nparts = int(nparts)
    if nparts is None:
        print(
            "N parts must be specified either with -n or via the -c <config> and -r <rse>"
        )
        print(Usage)
        sys.exit(2)

    in_lst = PartitionedList.open(files=args)
    out_lst = PartitionedList.create(nparts, out_prefix, zout)

    try:
        for path in in_lst:
            if starts_with and not path.startswith(starts_with):
                continue
            if any(path.startswith(ignore_path) for ignore_path in ignore_list):
                continue
            if filter_in is not None and not filter_in.search(path):
                continue
            if remove_prefix is not None:
                if not path.startswith(remove_prefix):
                    sys.stderr.write(
                        f"Path {path} does not begin with prefix {remove_prefix}\n"
                    )
                    sys.exit(1)
                path = path[len(remove_prefix):]
            if add_prefix:
                path = add_prefix + path
            if rewrite_match is not None:
                if not rewrite_match.search(path):
                    sys.stderr.write(
                        f"Path rewrite pattern did not find a match in path {path}\n"
                    )
                    sys.exit(1)
                path = rewrite_match.sub(rewrite_out, path)
            out_lst.add(path)
    finally:
        # Close the output even when leaving through sys.exit() above.
        out_lst.close()

    print(out_lst.NWritten)
zout = "-z" in opts if "-n" in opts: nparts = int(opts["-n"]) else: nparts = config.nparts(rse) if nparts > 1: if not "-o" in opts: print ("Output prefix is required for partitioned output") print (Usage) sys.exit(2) output = opts.get("-o","out.list") out_list = PartitionedList.create(nparts, output, zout) dir_output = opts.get("-d") dir_list = PartitionedList.create(nparts, dir_output, zout) if dir_output else None server = config.scanner_server(rse) server_root = config.scanner_server_root(rse) include_sizes = config.scanner_include_sizes(rse) and not "-x" in opts purge_empty_dirs = config.scanner_param(rse, "purge_empty_dirs", default=False) if not server_root: print(f"Server root is not defined for {rse}. Should be defined as 'server_root'") sys.exit(2) my_stats = { "rse":rse, "scanner":{
def cmp3_parts(a_prefix, r_prefix, b_prefix):
    """Open the partitioned lists for the three prefixes and compare them.

    The lists are opened in the order a, r, b with ``PartitionedList.open``
    and handed to ``cmp3_lists``; its result is returned unchanged.
    """
    opened = [
        PartitionedList.open(prefix)
        for prefix in (a_prefix, r_prefix, b_prefix)
    ]
    return cmp3_lists(*opened)
def main():
    """Run the three-way comparison and write the dark and missing lists.

    Command line: ``[-z] [-s <stats file> [-S <stats key>]] <b prefix>
    <r prefix> <a prefix> <dark output> <missing output>``

    The three partitioned lists are opened and passed to ``cmp3_generator``,
    which yields ``(type, path)`` pairs.  Paths with type ``'d'`` are written
    to the dark output file, all others to the missing output file.  With
    ``-z`` the outputs are gzip-compressed and ``.gz`` is appended to their
    names when not already present.  When a stats file is given, a status
    record is stored under the stats key (default ``"cmp3"``) at start and
    again on completion.

    Exits with status 2 when the number of positional arguments is not 5.
    """
    import getopt

    t0 = time.time()
    opts, args = getopt.getopt(sys.argv[1:], "s:S:z")
    opts = dict(opts)

    # Exactly five positional arguments are unpacked below; extra arguments
    # used to surface as a ValueError instead of the usage message.
    if len(args) != 5:
        print(Usage)
        sys.exit(2)

    compress = "-z" in opts
    stats_file = opts.get("-s")
    stats_key = opts.get("-S", "cmp3")
    stats = Stats(stats_file) if stats_file else None

    b_prefix, r_prefix, a_prefix, out_dark, out_missing = args

    a_list = PartitionedList.open(a_prefix)
    r_list = PartitionedList.open(r_prefix)
    b_list = PartitionedList.open(b_prefix)

    my_stats = {
        "version": Version,
        "elapsed": None,
        "start_time": t0,
        "end_time": None,
        "missing": None,
        "dark": None,
        "missing_list_file": None,
        "dark_list_file": None,
        "b_prefix": b_prefix,
        "a_prefix": a_prefix,
        "r_prefix": r_prefix,
        "a_files": a_list.FileNames,
        "b_files": b_list.FileNames,
        "r_files": r_list.FileNames,
        "a_nfiles": a_list.NParts,
        "b_nfiles": b_list.NParts,
        "r_nfiles": r_list.NParts,
        "status": "started",
    }

    if stats is not None:
        stats[stats_key] = my_stats

    if compress:
        if not out_dark.endswith(".gz"):
            out_dark += ".gz"
        if not out_missing.endswith(".gz"):
            out_missing += ".gz"
        opener, mode = gzip.open, "wt"
    else:
        opener, mode = open, "w"

    nm = nd = 0
    # Context managers guarantee both outputs are flushed and closed even if
    # the comparison raises part-way through.
    with opener(out_dark, mode) as fd, opener(out_missing, mode) as fm:
        for t, path in cmp3_generator(a_list, r_list, b_list):
            # Paths are written as yielded, with no separator added here.
            if t == 'd':
                fd.write(path)
                nd += 1
            else:
                fm.write(path)
                nm += 1

    print("Found %d dark and %d missing replicas" % (nd, nm))
    t1 = time.time()
    my_stats.update({
        "elapsed": t1 - t0,
        "end_time": t1,
        "missing": nm,
        "dark": nd,
        "status": "done",
        "missing_list_file": out_missing,
        "dark_list_file": out_dark,
    })

    if stats is not None:
        stats[stats_key] = my_stats

    t = int(t1 - t0)
    s = t % 60
    m = t // 60
    print("Elapsed time: %dm%02ds" % (m, s))
Session = sessionmaker(bind=engine) session = Session() rse = session.query(RSE).filter(RSE.rse == rse_name).first() if rse is None: print("RSE %s not found" % (rse_name, )) sys.exit(1) rse_id = rse.id #print ("rse_id:", type(rse_id), rse_id) batch = 100000 outputs = { states: PartitionedList.create(nparts, prefix, zout) for states, prefix in filters.items() } all_replicas = '*' in all_states replicas = session.query(Replica).filter( Replica.rse_id == rse_id).yield_per(batch) if all_replicas: sys.stderr.write("including all replias\n") else: print("including replicas in states:", list(all_states), file=sys.stderr) replicas = replicas.filter(Replica.state.in_(list(all_states)))
def main():
    """Combine two path lists with a set operation and write the result.

    Command line: ``[-z] [-f] [-s <stats file> [-S <stats key>]]
    <operation> <A> <B> <output>``

    ``operation`` is one of ``and``, ``minus``, ``xor`` or ``or``.  With
    ``-f`` A, B and the output are single files; otherwise they are
    partitioned-list prefixes and A and B must have the same number of parts.
    The lists are processed part by part: each part of B is loaded into a
    set and the matching part of A is streamed against it.

    When a stats file is given, a status record is stored under the stats
    key (default ``"join"``) at start and again on completion.

    Exits with status 2 on usage errors and with status 1 when A and B have
    different numbers of parts.
    """
    import getopt

    t0 = time.time()
    opts, args = getopt.getopt(sys.argv[1:], "s:S:zf")
    opts = dict(opts)

    # Exactly four positional arguments are unpacked below.
    if len(args) != 4:
        print(Usage)
        sys.exit(2)

    stats_file = opts.get("-s")
    stats_key = opts.get("-S", "join")
    compress = "-z" in opts
    single_file = "-f" in opts

    my_stats = stats = None

    op, a_spec, b_spec, out_spec = args
    # An unrecognized operation would otherwise read everything and silently
    # produce empty output.
    if op not in ("and", "minus", "xor", "or"):
        print(Usage)
        sys.exit(2)

    if single_file:
        a_list = PartitionedList.open(files=[a_spec])
        b_list = PartitionedList.open(files=[b_spec])
        # NOTE(review): -z is not applied here because create_file()'s
        # parameters are not visible from this block - confirm.
        out_list = PartitionedList.create_file(out_spec)
    else:
        a_list = PartitionedList.open(prefix=a_spec)
        b_list = PartitionedList.open(prefix=b_spec)
        if a_list.NParts != b_list.NParts:
            print("Inconsistent number of parts: %s:%d: %s:%d" %
                  (a_spec, a_list.NParts, b_spec, b_list.NParts))
            sys.exit(1)
        # Honor -z: the flag was parsed but never passed on.
        out_list = PartitionedList.create(a_list.NParts, out_spec, compress)

    if stats_file is not None:
        stats = Stats(stats_file)
        my_stats = {
            "version": Version,
            "elapsed": None,
            "start_time": t0,
            "end_time": None,
            "a_list_files": 0,
            "b_list_files": 0,
            "join_list_files": 0,
            "operation": op,
            "b_prefix": b_spec,
            "a_prefix": a_spec,
            "out_prefix": out_spec,
            "a_files": a_list.FileNames,
            "b_files": b_list.FileNames,
            "out_files": out_list.FileNames,
            "nparts": a_list.NParts,
            "status": "started",
        }
        stats[stats_key] = my_stats

    n_a_files = 0
    n_b_files = 0
    n_out_files = 0

    try:
        for pa, pb in zip(a_list.parts(), b_list.parts()):
            b_set = set(pb)
            n_b_files += len(b_set)
            for f in pa:
                n_a_files += 1
                if op == "and":
                    if f in b_set:
                        out_list.add(f)
                        n_out_files += 1
                elif op == "minus":
                    if f not in b_set:
                        out_list.add(f)
                        n_out_files += 1
                elif op == "xor":
                    # Entries present in both lists cancel out; what is left
                    # in b_set is emitted after this loop.
                    if f in b_set:
                        b_set.remove(f)
                    else:
                        out_list.add(f)
                        n_out_files += 1
                elif op == "or":
                    # Drop f from b_set so it is not emitted a second time
                    # by the loop below.
                    if f in b_set:
                        b_set.remove(f)
                    out_list.add(f)
                    n_out_files += 1
            if op in ("or", "xor"):
                for f in b_set:
                    out_list.add(f)
                    n_out_files += 1
    finally:
        # Make sure the output is flushed to disk before reporting completion.
        out_list.close()

    t1 = time.time()

    if stats is not None:
        my_stats.update({
            "elapsed": t1 - t0,
            "end_time": t1,
            "a_list_files": n_a_files,
            "b_list_files": n_b_files,
            # Was the undefined name join_list_files (NameError).
            "join_list_files": n_out_files,
            "status": "done",
        })
        stats[stats_key] = my_stats