def lsFiles(files, add='', group=50):
    """
    list a set of files in parallel (when the set is huge)

    Runs `gsutil -m ls` once per batch yielded by h.grouped(files, by)
    (presumably batches of at most `group` paths) and gathers the output of
    every batch.

    Args:
    ----
        files: gs paths
        add: additional params to add (a string inserted after `ls`)
        group: files to do in parallel

    Returns:
    -------
        a list of str. When `add` does not contain 'L': one entry per output
        line, with a trailing "TOTAL:" line removed. When `add` contains 'L':
        one entry per 'gs://' record. The text is taken from the repr of the
        raw output, so inside an 'L' entry a line break shows up as the two
        characters backslash + n.
        Returns [] when `files` is empty, and 0 (not a list) as soon as one
        batch produces no output at all.

    Raises:
    ------
        ValueError: if gsutil exits with an error other than
            "One or more URLs matched no objects"
    """
    print('listing files in gs')
    # len() rather than truthiness so array-like inputs keep working; also
    # avoids handing a batch size of 0 to h.grouped
    if len(files) == 0:
        return []
    by = min(len(files), group)
    detailed = 'L' in add
    res = []
    for sfiles in h.grouped(files, by):
        # NOTE(review): paths and `add` are interpolated unquoted into a shell
        # command; only call this with trusted values
        cmd = "gsutil -m ls " + add + " " + " ".join(sfiles)
        data = subprocess.run(cmd, capture_output=True, shell=True)
        if data.returncode != 0 and "One or more URLs matched no objects" not in str(data.stderr):
            raise ValueError('issue with the command: ' + str(data.stderr))
        raw = str(data.stdout)
        # "b''" is 3 characters long: the batch printed nothing
        if len(raw) < 4:
            return 0
        # drop the b'...' wrapper of the bytes repr
        text = raw[2:-1]
        if detailed:
            entries = text.split('\\ngs://')
            # the first record still carries its own scheme; remove it so that
            # every entry ends up with exactly one 'gs://' prefix
            if entries[0].startswith('gs://'):
                entries[0] = entries[0][len('gs://'):]
            res += ['gs://' + entry for entry in entries]
        else:
            res += text.split('\\n')[:-1]
            if res and "TOTAL:" in res[-1]:
                res = res[:-1]
    return res
def catFiles(files, group=50, split=False, cut=False):
    """
    read the content of a set of files in parallel (when the set is huge)

    Runs `gsutil -m cat` once per batch yielded by h.grouped(files, by)
    (presumably batches of at most `group` paths) and prints a progress
    fraction before each batch.

    Args:
    ----
        files: gs paths
        group: files to do in parallel
        cut: split each batch's output into chunks of size cut (a trailing
            chunk shorter than cut is dropped). Takes precedence over split
        split: separator to split each batch's output on. The output is the
            repr text of the raw bytes, so a line break is matched by the two
            characters backslash + n

    Returns:
    -------
        a list of str holding the chunks / pieces / whole outputs of every
        batch. Returns [] when `files` is empty, 0 (not a list) as soon as one
        batch produces no output, and the results gathered so far when gsutil
        fails with an error other than "One or more URLs matched no objects"
        (the error is printed, not raised).
    """
    # len() rather than truthiness so array-like inputs keep working; also
    # avoids handing a batch size of 0 to h.grouped
    if len(files) == 0:
        return []
    by = min(len(files), group)
    nbatches = len(files) / by
    res = []
    for i, sfiles in enumerate(h.grouped(files, by)):
        print(i / nbatches)
        # NOTE(review): paths are interpolated unquoted into a shell command;
        # only call this with trusted values
        data = subprocess.run("gsutil -m cat " + " ".join(sfiles), capture_output=True, shell=True)
        if data.returncode != 0 and "One or more URLs matched no objects" not in str(data.stderr):
            # best effort: report the failure and hand back what was read so far
            print(ValueError('issue with the command: ' + str(data.stderr)))
            return res
        raw = str(data.stdout)
        # "b''" is 3 characters long: the batch printed nothing
        if len(raw) < 4:
            return 0
        # drop the b'...' wrapper of the bytes repr
        resa = raw[2:-1]
        if cut:
            res += [resa[k * cut:(k + 1) * cut] for k in range(len(resa) // cut)]
        elif split:
            res += resa.split(split)
        else:
            res += [resa]
    return res
def cpFiles(files, location, group=50):
    """
    copy a set of files in parallel (when the set is huge)

    Runs `gsutil -m cp` once per batch yielded by h.grouped(files, by)
    (presumably batches of at most `group` paths). Stops at the first batch
    whose command returns a non-zero status.

    Args:
    ----
        files: gs paths
        location: destination passed as last argument to `gsutil cp`
        group: files to do in parallel
    """
    # len() rather than truthiness so array-like inputs keep working; also
    # avoids handing a batch size of 0 to h.grouped
    if len(files) == 0:
        return
    by = min(len(files), group)
    for sfiles in h.grouped(files, by):
        # NOTE(review): paths are interpolated unquoted into a shell command;
        # only call this with trusted values
        code = os.system("gsutil -m cp " + " ".join(sfiles) + " " + location)
        if code != 0:
            print('pressed ctrl+c or command failed')
            break
def mvFiles(files, location, group=50, listen_to_errors=False):
    """
    move a set of files in parallel (when the set is huge)

    Runs `gsutil -m mv` once per batch yielded by h.grouped(files, by)
    (presumably batches of at most `group` paths).

    Args:
    ----
        files: gs paths
        location: to move the files to
        group: files to do in parallel
        listen_to_errors: when True, print a message and stop at the first
            batch whose command returns a non-zero status; when False (the
            default) failures are ignored and the remaining batches still run
    """
    # len() rather than truthiness so array-like inputs keep working; also
    # avoids handing a batch size of 0 to h.grouped
    if len(files) == 0:
        return
    by = min(len(files), group)
    for sfiles in h.grouped(files, by):
        # NOTE(review): paths are interpolated unquoted into a shell command;
        # only call this with trusted values
        code = os.system("gsutil -m mv " + " ".join(sfiles) + " " + location)
        if code != 0 and listen_to_errors:
            print('pressed ctrl+c or command failed')
            break
def rmFiles(files, group=50, add='', dryrun=True):
    """
    remove a set of files in parallel (when the set is huge)

    Builds one `gsutil -m rm` command per batch yielded by
    h.grouped(files, by) (presumably batches of at most `group` paths).

    Args:
    ----
        files: gs paths
        group: number to do in parallel
        add: additional gsutil rm params
        dryrun: when True (the default) the commands are only printed and
            nothing is deleted; when False they are executed, stopping at the
            first one that returns a non-zero status
    """
    # len() rather than truthiness so array-like inputs keep working; also
    # avoids handing a batch size of 0 to h.grouped
    if len(files) == 0:
        return
    by = min(len(files), group)
    # built once: prefixing inside the loop would add one more leading space
    # to the options on every batch
    options = ' ' + add if add else ''
    for sfiles in h.grouped(files, by):
        # NOTE(review): paths and `add` are interpolated unquoted into a shell
        # command; only call this with trusted values
        cmd = "gsutil -m rm" + options + "".join(' ' + val for val in sfiles)
        if dryrun:
            print(cmd)
            continue
        code = os.system(cmd)
        if code != 0:
            print('pressed ctrl+c or command failed')
            break