Пример #1
0
def bwa_mem(args, param_dict=None):
    """
    Build the docker command line that runs ``bwa mem`` on the input reads.

    The directory containing the first R1 file is mounted at ``/data`` in the
    container and every fastq file is addressed as ``/data/<basename>``.
    NOTE: fastq files that live in a different directory than ``args.R1[0]``
    are therefore not reachable from inside the container.

    Input handling, following the advice on
    http://sourceforge.net/p/bio-bwa/mailman/message/31053122/ :
        - a single file per read set is passed to bwa as a plain path;
        - several files per read set are streamed as one input through a
          quoted ``'<zcat ...'`` (first file ends in ``.gz``) or
          ``'<cat ...'`` token;
        - ``args.R2`` is optional; ``None`` or an empty list means
          single-end data.

    Args:
        args: parsed options; reads ``R1`` (non-empty list of fastq paths),
            ``R2`` (list of fastq paths or None), ``p`` (value given to
            ``-t``) and ``out_dir``.
        param_dict: extra bwa options, rendered by ``join_params``.

    Returns:
        tuple ``(cmd, aligned)`` where ``cmd`` is the full shell command and
        ``aligned`` is the value of ``file_cfg["aligned"](args)`` that the
        command redirects its output to.
    """
    def container_path(path):
        # Only the base name is kept: the host directory is mounted at /data.
        return os.path.join("/data", os.path.basename(os.path.abspath(path)))

    def read_source(paths):
        """Return the bwa input token for one read set (R1 or R2)."""
        mounted = [container_path(p) for p in paths]
        if len(mounted) == 1:
            return mounted[0]
        # Choose the decompressor from THIS read set's own first file; the
        # R2 set used to be judged by the extension of the R1 files.
        reader = "zcat" if paths[0].endswith(".gz") else "cat"
        return "'<{} {}'".format(reader, " ".join(mounted))

    data_dir = os.path.dirname(os.path.abspath(args.R1[0]))
    inputs = [read_source(args.R1)]
    # An empty R2 list is treated like None instead of emitting a dangling
    # "'<cat '" token.
    if args.R2:
        inputs.append(read_source(args.R2))
    in_fq = " ".join(inputs)

    aligned = file_cfg["aligned"](args)
    bwa_cmd = " ".join([
        "bwa mem -t {_p} -M".format(_p=args.p),
        join_params(param_dict),
        ref_file_cfg[version_cfg["REF_VERSION"]]["fa"],
        in_fq,
        "> {}".format(aligned),
    ])

    cmd = DOCKER_RUN + \
        r""" -v {_data_d}:/data bwa:{_bwa_v} bash -c "{_bwa_c}" """
    # _ref_v and _out_d are not used by the literal above; presumably they
    # fill placeholders inside DOCKER_RUN (defined elsewhere).
    cmd = cmd.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        _bwa_v=_version,
        _data_d=data_dir,
        _bwa_c=bwa_cmd)
    return cmd, aligned
Пример #2
0
def picard_sort(args, param_dict=None):
    """
    Assemble the docker command that runs Picard ``SortSam`` with
    ``SORT_ORDER=coordinate`` on the aligned reads.

    Args:
        args: parsed options; ``out_dir`` is read here and ``args`` is also
            handed to the ``file_cfg`` name builders.
        param_dict: extra SortSam options, rendered by ``join_params``.

    Returns:
        tuple ``(command, sorted_path)`` where ``sorted_path`` is the sorted
        file name joined onto ``args.out_dir``.
    """
    template = DOCKER_RUN + (
        " picard:{_v} "
        "    SortSam {param} I={aligned} O={sort}"
        " TMP_DIR=/out_dir SORT_ORDER=coordinate"
    )
    fields = {
        "_ref_v": version_cfg["REF_VERSION"],
        "_out_d": args.out_dir,
        "_v": _version,
        "aligned": file_cfg["aligned"](args),
        "sort": file_cfg["sorted"](args),
        "param": join_params(param_dict),
    }
    command = template.format(**fields)
    sorted_path = os.path.join(args.out_dir, file_cfg["sorted"](args))
    return command, sorted_path
Пример #3
0
def picard_dedup(args, param_dict=None):
    """
    Assemble the docker command that runs Picard ``MarkDuplicates`` on the
    sorted reads, writing a metrics file and an index
    (``CREATE_INDEX=true``).

    Args:
        args: parsed options; ``out_dir`` is read here and ``args`` is also
            handed to the ``file_cfg`` name builders.
        param_dict: extra MarkDuplicates options, rendered by ``join_params``.

    Returns:
        tuple ``(command, [dedup_file, metrics_file])``.
    """
    template = DOCKER_RUN + (
        " picard:{_v} "
        "    MarkDuplicates {param} I={sort} O={dedup}"
        " METRICS_FILE={matrics} CREATE_INDEX=true"
    )
    fields = {
        "_v": _version,
        "_ref_v": version_cfg["REF_VERSION"],
        "_out_d": args.out_dir,
        "sort": file_cfg["sorted"](args),
        "param": join_params(param_dict),
        "dedup": file_cfg["dedup"](args),
        "matrics": file_cfg["matrics"](args),
    }
    command = template.format(**fields)
    outputs = [file_cfg["dedup"](args), file_cfg["matrics"](args)]
    return command, outputs
Пример #4
0
def gatk_haplotypecaller(args, param_dict=None):
    """
    The HaplotypeCaller is capable of calling SNPs and indels simultaneously via local de-novo
    assembly of haplotypes in an active region. In other words, whenever the program encounters
    a region showing signs of variation, it discards the existing mapping information and completely
    reassembles the reads in that region.

    This function only builds the docker command line; it runs HaplotypeCaller
    with ``--emitRefConfidence GVCF`` on the recalibrated reads.

    Args:
        args: parsed options; reads ``p`` (value given to ``-nct``) and
            ``out_dir``; ``args`` is also handed to the ``file_cfg`` name
            builders.
        param_dict: extra HaplotypeCaller options, rendered by ``join_params``.

    Returns:
        tuple ``(cmd, gvcf)`` where ``gvcf`` is the output file named by
        ``file_cfg["gvcf"](args)``.
    """
    # The leading space keeps the image name from being glued onto the last
    # token of DOCKER_RUN, matching every other command builder in this file.
    cmd = DOCKER_RUN + """ gatk:{_v} -T HaplotypeCaller {param} -nct {_p} -R {_R}\
    -I {bqsr} --emitRefConfidence GVCF --dbsnp {_dbsnp_vcf} -o {gvcf}"""
    # _ref_v and _out_d are not used by the literal above; presumably they
    # fill placeholders inside DOCKER_RUN (defined elsewhere).
    cmd = cmd.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        param=join_params(param_dict),
        _p=args.p,
        _v=_version,
        _dbsnp_vcf=ref_file_cfg[version_cfg["REF_VERSION"]]["dbsnp"],
        _R=ref_file_cfg[version_cfg["REF_VERSION"]]["fa"],
        bqsr=file_cfg["bqsr"](args),
        gvcf=file_cfg["gvcf"](args))
    return cmd, file_cfg["gvcf"](args)
Пример #5
0
def gatk_printread(args, param_dict=None):
    """
    Assemble the docker command that runs GATK ``PrintReads`` on the
    de-duplicated reads, applying the recalibration table via ``-BQSR``.

    PrintReads is a generic utility tool for manipulating sequencing data in
    SAM/BAM format.

    Args:
        args: parsed options; reads ``p`` (value given to ``-nct``) and
            ``out_dir``; ``args`` is also handed to the ``file_cfg`` name
            builders.
        param_dict: extra PrintReads options, rendered by ``join_params``.

    Returns:
        tuple ``(command, bqsr_file)``.
    """
    template = DOCKER_RUN + (
        " gatk:{_v} -T PrintReads {param} -nct {_p}"
        "    -R {_R} -I {dedup} -BQSR {table} -o {bqsr}"
    )
    fields = {
        "_ref_v": version_cfg["REF_VERSION"],
        "_out_d": args.out_dir,
        "param": join_params(param_dict),
        "_p": args.p,
        "_v": _version,
        "_R": ref_file_cfg[version_cfg["REF_VERSION"]]["fa"],
        "dedup": file_cfg["dedup"](args),
        "table": file_cfg["table"](args),
        "bqsr": file_cfg["bqsr"](args),
    }
    command = template.format(**fields)
    return command, file_cfg["bqsr"](args)
Пример #6
0
def bwa_index(args, param_dict=None):
    """
    Assemble the two chained docker commands that index the reference.

    The first command creates a container named after the reference version
    from the ``reference:<version>`` image, exposing ``/ref`` as a volume.
    The second runs ``bwa index`` on the reference fasta in a ``bwa``
    container that mounts the volumes of the first.

    Args:
        args: accepted for signature parity with the other builders; unused.
        param_dict: extra ``bwa index`` options, rendered by ``join_params``.

    Returns:
        tuple ``(command, None)``; the two commands are joined with ``&&``.
    """
    ref_version = version_cfg["REF_VERSION"]

    create_template = (
        "docker create "
        "    -v /ref "
        "    --name {_ref_v} reference:{_ref_v}"
    )
    create_cmd = create_template.format(_ref_v=ref_version)

    index_template = (
        "docker run "
        "    --rm "
        "    --volumes-from {_ref_v} "
        "    -w /ref "
        "    bwa:{_bwa_v} "
        "    bwa index {param} {in_f} "
    )
    index_cmd = index_template.format(
        in_f=ref_file_cfg[ref_version]["fa"],
        param=join_params(param_dict),
        _ref_v=ref_version,
        _bwa_v=_version,
    )

    return create_cmd + " && " + index_cmd, None
Пример #7
0
def gatk_bqsr(args, param_dict=None):
    """
    Assemble the docker command that runs GATK ``BaseRecalibrator`` on the
    de-duplicated reads, using the dbSNP file as ``-knownSites``.

    Base quality score recalibration (BQSR) models systematic errors in the
    base quality scores empirically and adjusts the scores accordingly. This
    step builds the recalibration table from the data and the known variants;
    the table is the output named by ``-o``.

    Args:
        args: parsed options; reads ``p`` (value given to ``-nct``) and
            ``out_dir``; ``args`` is also handed to the ``file_cfg`` name
            builders.
        param_dict: extra BaseRecalibrator options, rendered by
            ``join_params``.

    Returns:
        tuple ``(command, table_file)``.
    """
    template = DOCKER_RUN + (
        " gatk:{_v} -T BaseRecalibrator {param} -nct {_p}"
        "    -R {_R} -I {dedup} -knownSites {_dbsnp_vcf} -o {table}"
    )
    fields = {
        "_ref_v": version_cfg["REF_VERSION"],
        "_out_d": args.out_dir,
        "param": join_params(param_dict),
        "_p": args.p,
        "_v": _version,
        "_R": ref_file_cfg[version_cfg["REF_VERSION"]]["fa"],
        "_dbsnp_vcf": ref_file_cfg[version_cfg["REF_VERSION"]]["dbsnp"],
        "dedup": file_cfg["dedup"](args),
        "table": file_cfg["table"](args),
    }
    command = template.format(**fields)
    return command, file_cfg["table"](args)