示例#1
0
def _bgzip_from_fastq(data):
    """Return a prepared fastq path inside the ``align_prep`` work directory.

    Reads ``data["in_file"]`` and decides, from its extension, size, whether it
    is remote, and the sample configuration, if the file needs bgzipping and/or
    gunzipping. Inputs that need any processing (or are remote) are passed to
    ``_bgzip_file``; all others are symlinked under a sample-prefixed name.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    is_remote = objectstore.is_remote(in_file)
    if is_remote:
        if in_file.endswith(".bz2"):
            needs_bgzip, needs_gunzip = True, True
        elif not tz.get_in(["config", "algorithm", "align_split_size"], data):
            # remote input with no split size configured: no recompression
            needs_bgzip, needs_gunzip = False, False
        else:
            needs_bgzip, needs_gunzip = True, False
    elif os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz"):
        if needs_convert or dd.get_trim_ends(data):
            # quality conversion or end trimming requested: set both flags
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert
            or dd.get_trim_ends(data) or is_remote):
        return _bgzip_file(in_file, data["config"], work_dir, needs_bgzip,
                           needs_gunzip, needs_convert, data)
    # Nothing to do to the file itself: expose it under a sample-specific name.
    link_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    out_file = os.path.join(work_dir, link_name)
    utils.symlink_plus(in_file, out_file)
    return out_file
示例#2
0
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Write a compressed copy of ``in_file`` into ``work_dir`` and return its path.

    The output name is the input's base name with any ".bz2" removed, plus a
    ".gz" suffix when the input name does not already end in ".gz". An existing
    output file is reused as-is. Otherwise a shell command is assembled and run
    inside a file transaction:

    - when ``needs_bgzip`` is set, the input (optionally decompressed with
      gunzip/bunzip2, or replaced by the ``fastq_convert_pipe_cl`` result) is
      fed to the configured bgzip command;
    - otherwise, for remote inputs, the content is streamed with ``cat`` and
      piped through ``bgzip -c`` only if conversion or trimming is requested.

    Raises:
        ValueError: if bgzip is not requested and the input is not remote.
    """
    out_name = os.path.basename(in_file).replace(".bz2", "")
    if not in_file.endswith(".gz"):
        out_name += ".gz"
    out_file = os.path.join(work_dir, out_name)
    if utils.file_exists(out_file):
        return out_file

    def _convert_or_trim():
        # Looked up lazily: trim settings are only consulted when conversion is off.
        return needs_convert or dd.get_trim_ends(data)

    with file_transaction(config, out_file) as tx_out_file:
        bgzip = tools.get_bgzip_cmd(config)
        # Record remoteness before in_file is rewritten for the command line.
        is_remote = objectstore.is_remote(in_file)
        in_file = objectstore.cl_input(
            in_file,
            unpack=needs_gunzip or needs_convert or needs_bgzip or dd.get_trim_ends(data))
        if _convert_or_trim():
            in_file = fastq_convert_pipe_cl(in_file, data)
        if needs_gunzip and not _convert_or_trim():
            if in_file.endswith(".bz2"):
                template = "bunzip2 -c {in_file} |"
            else:
                template = "gunzip -c {in_file} |"
            gunzip_cmd = template.format(in_file=in_file)
            bgzip_in = "/dev/stdin"
        else:
            gunzip_cmd, bgzip_in = "", in_file
        if needs_bgzip:
            cmd = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=gunzip_cmd, bgzip=bgzip, bgzip_in=bgzip_in,
                tx_out_file=tx_out_file)
            do.run(cmd, "bgzip input file")
        elif is_remote:
            bgzip = "| bgzip -c" if _convert_or_trim() else ""
            cmd = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=in_file, bgzip=bgzip, tx_out_file=tx_out_file)
            do.run(cmd, "Get remote input")
        else:
            raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                 needs_gunzip, needs_convert))
    return out_file
示例#3
0
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Produce a compressed version of ``in_file`` in ``work_dir``; return the output path.

    Output naming: base name of the input, ".bz2" stripped, ".gz" appended
    unless the input name already ends with ".gz". If that file already exists
    nothing is run. Otherwise, within a file transaction, one of two shell
    commands is executed:

    - ``needs_bgzip``: the configured bgzip command compresses the input, which
      is first piped through gunzip/bunzip2 when ``needs_gunzip`` is set and no
      conversion/trimming applies;
    - remote input without ``needs_bgzip``: ``cat`` of the input, piped through
      ``bgzip -c`` only when conversion or trimming is requested.

    Raises:
        ValueError: when neither ``needs_bgzip`` is set nor the input is remote.
    """
    out_name = os.path.basename(in_file).replace(".bz2", "")
    if not in_file.endswith(".gz"):
        out_name += ".gz"
    out_file = os.path.join(work_dir, out_name)
    if utils.file_exists(out_file):
        return out_file

    def _convert_or_trim():
        # Lazy on purpose: trim settings are only read when conversion is off.
        return needs_convert or dd.get_trim_ends(data)

    with file_transaction(config, out_file) as tx_out_file:
        bgzip = tools.get_bgzip_cmd(config)
        # Must be checked before in_file is replaced by its command-line form.
        is_remote = objectstore.is_remote(in_file)
        in_file = objectstore.cl_input(
            in_file,
            unpack=needs_gunzip or needs_convert or needs_bgzip or dd.get_trim_ends(data))
        if _convert_or_trim():
            in_file = fastq_convert_pipe_cl(in_file, data)
        if needs_gunzip and not _convert_or_trim():
            if in_file.endswith(".bz2"):
                template = "bunzip2 -c {in_file} |"
            else:
                template = "gunzip -c {in_file} |"
            gunzip_cmd = template.format(in_file=in_file)
            bgzip_in = "/dev/stdin"
        else:
            gunzip_cmd, bgzip_in = "", in_file
        if needs_bgzip:
            cmd = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=gunzip_cmd, bgzip=bgzip, bgzip_in=bgzip_in,
                tx_out_file=tx_out_file)
            do.run(cmd, "bgzip input file")
        elif is_remote:
            bgzip = "| bgzip -c" if _convert_or_trim() else ""
            cmd = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=in_file, bgzip=bgzip, tx_out_file=tx_out_file)
            do.run(cmd, "Get remote input")
        else:
            raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                 needs_gunzip, needs_convert))
    return out_file
示例#4
0
def _bgzip_from_fastq(data):
    """Return a prepared fastq path for ``data["in_file"]``.

    Works out whether the input needs bgzipping and/or gunzipping from its
    extension, size, whether it is remote, and the sample configuration. Inputs
    needing any processing (or remote inputs) go through ``_bgzip_file``.
    Otherwise the file is symlinked into the ``align_prep`` work directory, or,
    when ``data["is_cwl"]`` is set, either returned unchanged (a ".gbi" file
    exists next to it) or copied.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    is_remote = objectstore.is_remote(in_file)
    if is_remote:
        if in_file.endswith(".bz2"):
            needs_bgzip, needs_gunzip = True, True
        elif not tz.get_in(["config", "algorithm", "align_split_size"], data):
            # remote input with no split size configured: no recompression
            needs_bgzip, needs_gunzip = False, False
        else:
            needs_bgzip, needs_gunzip = True, False
    elif os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz"):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert
            or dd.get_trim_ends(data) or is_remote):
        return _bgzip_file(in_file, data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    link_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    out_file = os.path.join(work_dir, link_name)
    if not data.get("is_cwl"):
        utils.symlink_plus(in_file, out_file)
        return out_file
    # We cannot symlink in CWL, but may be able to use inputs or copy
    if utils.file_exists(in_file + ".gbi"):
        # Has grabix indexes, we're okay to go
        return in_file
    return utils.copy_plus(in_file, out_file)
示例#5
0
def _bgzip_from_fastq(data):
    """Return a prepared fastq path for ``data["in_file"]``.

    ``data["in_file"]`` may be a single path or a list/tuple of paths; the
    bgzip/gunzip decision is made from the first entry only. When any
    processing is needed, the input is remote, or more than one input file was
    supplied, the original ``data["in_file"]`` value is passed to
    ``_bgzip_file``. Otherwise the first file is handed to
    ``_symlink_or_copy_grabix`` with a sample-prefixed target in the
    ``align_prep`` work directory.
    """
    orig_input = data["in_file"]
    multi_input = isinstance(orig_input, (list, tuple))
    in_file = orig_input[0] if multi_input else orig_input
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    is_remote = objectstore.is_remote(in_file)
    if is_remote:
        if in_file.endswith(".bz2"):
            needs_bgzip, needs_gunzip = True, True
        elif not tz.get_in(["config", "algorithm", "align_split_size"], data):
            # remote input with no split size configured: no recompression
            needs_bgzip, needs_gunzip = False, False
        else:
            needs_bgzip, needs_gunzip = True, False
    elif os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz"):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or is_remote or (multi_input and len(orig_input) > 1)):
        return _bgzip_file(orig_input, data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    link_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    return _symlink_or_copy_grabix(in_file, os.path.join(work_dir, link_name), data)
示例#6
0
def _ready_gzip_fastq(in_files, data):
    """Report whether the fastq inputs can be used as-is.

    True only when every non-empty entry of ``in_files`` ends in ".gz", the
    quality format is not "illumina", ``align_split_size`` is exactly ``False``,
    the first input is not remote and no end trimming is configured.
    """
    all_gzipped = all([(not fname) or fname.endswith(".gz") for fname in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    # Any value other than a literal False counts as splitting being enabled.
    do_splitting = split_size is not False
    if not all_gzipped or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    return not needs_trim
示例#7
0
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Report whether the gzipped fastq inputs can be used without further preparation.

    True only when every non-empty entry of ``in_files`` ends in ".gz", the
    quality format is not "illumina", the align split size is exactly
    ``False``, the first input is not remote, and neither end trimming nor
    downsampling is configured.

    With ``require_bgzip`` set, each gzipped input must additionally yield a
    false first value from ``_check_gzipped_input``; leaving it unset avoids
    forcing bgzip when indexed files are not needed.
    """
    all_gzipped = all([(not fname) or fname.endswith(".gz") for fname in in_files])
    if require_bgzip and all_gzipped:
        bgzip_ok = [(not fname) or not _check_gzipped_input(fname, data)[0]
                    for fname in in_files]
        all_gzipped = all(bgzip_ok)
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    # Any value other than a literal False counts as splitting being enabled.
    do_splitting = dd.get_align_split_size(data) is not False
    if not all_gzipped or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    if needs_trim:
        return False
    return not get_downsample_params(data)
示例#8
0
def _ready_bgzip_fastq(in_files, data):
    """Report whether the fastq inputs are already bgzipped and usable as-is.

    True only when every non-empty entry of ``in_files`` ends in ".gz" and
    yields a false first value from ``_check_gzipped_input``, the quality
    format is not "illumina", ``align_split_size`` is exactly ``False``, the
    first input is not remote and no end trimming is configured.
    """
    all_gzipped = all([(not fname) or fname.endswith(".gz") for fname in in_files])
    # The per-file check only runs when every name looks gzipped.
    all_bgzipped = all_gzipped and all(
        [(not fname) or not _check_gzipped_input(fname, data)[0] for fname in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    # Any value other than a literal False counts as splitting being enabled.
    do_splitting = split_size is not False
    if not all_bgzipped or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    return not needs_trim
示例#9
0
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Report whether the gzipped fastq inputs need no conversion, splitting or other prep.

    Every non-empty entry of ``in_files`` must end in ".gz"; in addition the
    quality format must not be "illumina", the align split size must be exactly
    ``False``, the first input must not be remote, and end trimming and
    downsampling must both be unconfigured.

    When ``require_bgzip`` is set, each gzipped input must also yield a false
    first value from ``_check_gzipped_input``. Leaving it unset avoids forcing
    bgzip if indexed files are not needed.
    """
    all_gzipped = all([(not fname) or fname.endswith(".gz") for fname in in_files])
    if require_bgzip and all_gzipped:
        bgzip_ok = [(not fname) or not _check_gzipped_input(fname, data)[0]
                    for fname in in_files]
        all_gzipped = all(bgzip_ok)
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    # Any value other than a literal False counts as splitting being enabled.
    do_splitting = dd.get_align_split_size(data) is not False
    if not all_gzipped or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    if needs_trim:
        return False
    return not get_downsample_params(data)
示例#10
0
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Build a seqtk command line for quality conversion and/or end trimming.

    Reads from ``in_file`` (passed through ``objectstore.cl_input``) or from
    /dev/stdin when no file is given. An "illumina" quality format adds a
    ``seqtk seq -Q64 -V`` stage; configured trim ends add a ``seqtk trimfq``
    stage when either trim value is non-zero. The read number comes from
    ``data["read_num"]`` if present, else from ``read_num``: read 0 uses the
    first two trim values, any other read uses the third and fourth.

    Returns the stages joined with " | ", or an empty string when no stage applies.
    """
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    in_file = objectstore.cl_input(in_file) if in_file else "/dev/stdin"
    stages = []
    if needs_convert:
        stages.append("{seqtk} seq -Q64 -V {in_file}".format(seqtk=seqtk, in_file=in_file))
    if trim_ends:
        if data.get("read_num", read_num) == 0:
            left_trim, right_trim = trim_ends[0:2]
        else:
            left_trim, right_trim = trim_ends[2:4]
        if left_trim or right_trim:
            # After a conversion stage the trimmer reads the piped output instead.
            trim_infile = "/dev/stdin" if needs_convert else in_file
            stages.append("{seqtk} trimfq -b {left_trim} -e {right_trim} {trim_infile}".format(
                seqtk=seqtk, left_trim=left_trim, right_trim=right_trim,
                trim_infile=trim_infile))
    return " | ".join(stages)
示例#11
0
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Assemble the seqtk pipeline used to prepare fastq input.

    The input is ``in_file`` (passed through ``objectstore.cl_input``) or
    /dev/stdin when it is not supplied. Two optional stages are produced:

    - ``seqtk seq -Q64 -V`` when the quality format is "illumina";
    - ``seqtk trimfq -b LEFT -e RIGHT`` when trim ends are configured and
      either value is non-zero. Read 0 (from ``data["read_num"]``, falling
      back to ``read_num``) uses the first two trim values; other reads use
      the third and fourth.

    Returns the stages joined with " | ", or "" if neither stage is needed.
    """
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    in_file = objectstore.cl_input(in_file) if in_file else "/dev/stdin"
    stages = []
    if needs_convert:
        stages.append("{seqtk} seq -Q64 -V {in_file}".format(seqtk=seqtk, in_file=in_file))
    if trim_ends:
        if data.get("read_num", read_num) == 0:
            left_trim, right_trim = trim_ends[0:2]
        else:
            left_trim, right_trim = trim_ends[2:4]
        if left_trim or right_trim:
            # The trim stage consumes the converter's output when both stages run.
            trim_infile = "/dev/stdin" if needs_convert else in_file
            stages.append("{seqtk} trimfq -b {left_trim} -e {right_trim} {trim_infile}".format(
                seqtk=seqtk, left_trim=left_trim, right_trim=right_trim,
                trim_infile=trim_infile))
    return " | ".join(stages)