Пример #1
0
def lsFiles(files, add='', group=50):
    """
    List a set of gs paths with `gsutil -m ls`, in batches (when the set is huge).

    Args:
    ----
        files: gs paths
        add: additional params to add to the `gsutil ls` command
        group: maximum number of paths passed to a single gsutil call

    Returns:
    -------
        A list accumulated over all batches: one entry per output line by
        default (a trailing "TOTAL:" line is dropped), or one entry per
        'gs://' record when `add` contains an 'L'.
        An empty list when `files` is empty.
        0 as soon as one batch produces no output at all.

    Raises:
    ------
        ValueError: if gsutil exits with an error other than
            "One or more URLs matched no objects"
    """
    print('listing files in gs')
    # len() rather than truthiness so array-like inputs are accepted too
    if len(files) == 0:
        # nothing to list; also avoids asking the helper for groups of size 0
        return []
    by = min(len(files), group)
    long_listing = 'L' in add
    res = []
    for sfiles in h.grouped(files, by):
        # NOTE: the command is run through the shell, so paths containing
        # spaces or shell metacharacters are not protected here.
        cmd = "gsutil -m ls " + add + " " + " ".join(sfiles)
        data = subprocess.run(cmd, capture_output=True, shell=True)
        if data.returncode != 0:
            if "One or more URLs matched no objects" not in str(data.stderr):
                raise ValueError('issue with the command: ' + str(data.stderr))
        if not data.stdout:
            return 0
        if long_listing:
            # Kept in its historical form: the escaped text of the raw bytes
            # (newlines appear as backslash + n, first entry carries the b'
            # prefix). Callers may parse that form - TODO confirm before
            # normalising it.
            res += ['gs://' + i for i in str(data.stdout).split('\\ngs://')]
        else:
            # Decode instead of slicing str(bytes), so non-ASCII characters
            # and backslashes in object names come back intact.
            listing = data.stdout.decode('utf-8', errors='backslashreplace')
            # the last element is what follows the final newline
            res += listing.split('\n')[:-1]
            if res and "TOTAL:" in res[-1]:
                res = res[:-1]
    return res
Пример #2
0
def catFiles(files, group=50, split=False, cut=False):
    """
    Read the content of a set of files with `gsutil -m cat`, in batches (when the set is huge).

    The output of each batch is handled as the escaped text of the raw bytes
    (a newline appears as a backslash followed by n).

    Args:
    ----
        files: gs paths
        group: maximum number of paths passed to a single gsutil call
        split: if set (and `cut` is not), split each batch output on this
            separator, e.g. \\n
        cut: if set, split each batch output into chunks of `cut` characters;
            a shorter final chunk is kept

    Returns:
    -------
        A list of strings accumulated over all batches (one per batch when
        neither `cut` nor `split` is given).
        An empty list when `files` is empty.
        The partial list gathered so far if gsutil fails with an error other
        than "One or more URLs matched no objects" (the error is printed).
        0 as soon as one batch produces no output at all.
    """
    # len() rather than truthiness so array-like inputs are accepted too
    if len(files) == 0:
        # nothing to read; also avoids a 0/0 in the progress computation
        return []
    by = min(len(files), group)
    res = []
    for i, sfiles in enumerate(h.grouped(files, by)):
        # progress: fraction of batches already processed
        print(i / (len(files) / by))
        cmd = "gsutil -m cat " + " ".join(sfiles)
        data = subprocess.run(cmd, capture_output=True, shell=True)
        if data.returncode != 0:
            if "One or more URLs matched no objects" not in str(data.stderr):
                # best effort: report the failure and hand back what we have
                print(ValueError('issue with the command: ' + str(data.stderr)))
                return res
        if not data.stdout:
            return 0
        # strip the b'...' wrapper of the bytes repr
        resa = str(data.stdout)[2:-1]
        if cut:
            # step through the text so the trailing partial chunk is not lost
            res += [resa[start:start + cut] for start in range(0, len(resa), cut)]
        elif split:
            res += resa.split(split)
        else:
            res += [resa]
    return res
Пример #3
0
def cpFiles(files, location, group=50):
    """
    Copy a set of files with `gsutil -m cp`, in batches (when the set is huge).

    Stops at the first batch whose command returns a non-zero status.

    Args:
    ----
        files: gs paths
        location: where to copy the files to
        group: maximum number of paths passed to a single gsutil call
    """
    # len() rather than truthiness so array-like inputs are accepted too
    if len(files) == 0:
        # nothing to copy; also avoids asking the helper for groups of size 0
        return
    by = min(len(files), group)
    for sfiles in h.grouped(files, by):
        code = os.system("gsutil -m cp " + " ".join(sfiles) + " " + location)
        if code != 0:
            print('pressed ctrl+c or command failed')
            break
Пример #4
0
def mvFiles(files, location, group=50, listen_to_errors=False):
    """
    Move a set of files with `gsutil -m mv`, in batches (when the set is huge).

    Args:
    ----
        files: gs paths
        location: to move the files to
        group: maximum number of paths passed to a single gsutil call
        listen_to_errors: if True, stop at the first batch whose command
            returns a non-zero status; if False (default), failures are
            ignored and the remaining batches are still run
    """
    # len() rather than truthiness so array-like inputs are accepted too
    if len(files) == 0:
        # nothing to move; also avoids asking the helper for groups of size 0
        return
    by = min(len(files), group)
    for sfiles in h.grouped(files, by):
        code = os.system("gsutil -m mv " + " ".join(sfiles) + " " + location)
        if code != 0 and listen_to_errors:
            print('pressed ctrl+c or command failed')
            break
Пример #5
0
def rmFiles(files, group=50, add='', dryrun=True):
    """
    Remove a set of files with `gsutil -m rm`, in batches (when the set is huge).

    Args:
    ----
        files: gs paths
        group: maximum number of paths passed to a single gsutil call
        add: additional gsutil rm params
        dryrun: if True (default), only print the commands without running
            them; if False, run them and stop at the first batch whose
            command returns a non-zero status
    """
    # len() rather than truthiness so array-like inputs are accepted too
    if len(files) == 0:
        # nothing to remove; also avoids asking the helper for groups of size 0
        return
    by = min(len(files), group)
    # built once so `add` is not re-prefixed with a space on every batch
    prefix = ["gsutil", "-m", "rm"]
    if add:
        prefix.append(add)
    for sfiles in h.grouped(files, by):
        cmd = " ".join(prefix + list(sfiles))
        if dryrun:
            print(cmd)
        else:
            code = os.system(cmd)
            if code != 0:
                print('pressed ctrl+c or command failed')
                break