示例#1
0
def buildCoverageStats(infile, outfile):
    '''Generate coverage statistics for `infile` over a reformatted baits file.

    The baits file named by ``PARAMS["roi_baits"]`` is rewritten into a
    temporary file next to `infile` (the header of `infile` followed by
    the bait rows reformatted to five tab-separated columns).  That
    temporary file is handed to ``mappingqc.buildPicardCoverageStats``
    and zapped afterwards.

    Arguments
    ---------
    infile : string
        Alignment file; its header is extracted with ``samtools view -H``
        and it is the first argument of the coverage helper.
    outfile : string
        Output filename, passed through to the coverage helper.
    '''

    # TS check whether this is always required or specific to current baits
    # file

    # baits file requires modification to make picard accept it
    # this is performed before CalculateHsMetrics

    # NOTE(review): `to_cluster` is not referenced again in this function;
    # presumably P.run reads it from the caller's local variables - confirm.
    to_cluster = USECLUSTER
    baits = PARAMS["roi_baits"]
    # Temporary, reformatted copy of the baits file, written next to infile.
    modified_baits = infile + "_temp_baits_final.bed"
    # NOTE(review): `regions` is not used below; the modified baits file is
    # passed as both the third and the fourth argument of the coverage helper.
    regions = PARAMS["roi_regions"]
    # Shell steps (placeholders match the local variable names above):
    #   1. dump the header of infile to <infile>_temp_header.txt
    #   2. skip the first two lines of the baits file and print columns
    #      1-3, a literal "+" and column 4, tab-separated
    #   3. concatenate header and reformatted rows into modified_baits
    #   4. remove the two intermediate files
    statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt;
                awk 'NR>2' %(baits)s |
                awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}'
                > %(infile)s_temp_baits.bed;
                cat  %(infile)s_temp_header.txt %(infile)s_temp_baits.bed
                > %(modified_baits)s;
                rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt
                '''
    P.run(statement)

    mappingqc.buildPicardCoverageStats(
        infile, outfile, modified_baits, modified_baits)

    # The temporary baits file is no longer needed once the stats are built.
    iotools.zap_file(modified_baits)
示例#2
0
def GATKpreprocessing(infile, outfile):
    '''Add read groups, realign around indels and recalibrate base
       quality scores for a BAM file.

    Three helpers from ``exome`` are run in sequence (GATKReadGroups,
    GATKIndelRealign, GATKBaseRecal).  Each intermediate file is zapped
    as soon as the following stage has consumed it.

    Arguments
    ---------
    infile : string
        Input BAM file.
    outfile : string
        Final output filename.  The intermediate filenames are derived
        from it by replacing ``.bqsr`` with ``.readgroups.bqsr`` and
        ``.realign.bqsr``.
    '''

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # Intermediate outputs of the read-group and realignment stages.
    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    exome.GATKReadGroups(infile, outfile1, genome,
                         PARAMS["readgroup_library"],
                         PARAMS["readgroup_platform"],
                         PARAMS["readgroup_platform_unit"])

    exome.GATKIndelRealign(outfile1, outfile2, genome,
                           PARAMS["gatk_threads"])

    # The read-group file has been consumed by the realignment stage.
    iotools.zap_file(outfile1)

    exome.GATKBaseRecal(outfile2, outfile, genome,
                        PARAMS["gatk_dbsnp"],
                        PARAMS["gatk_solid_options"])

    # The realigned file has been consumed by the recalibration stage.
    iotools.zap_file(outfile2)
示例#3
0
def realignMatchedSample(infile, outfile):
    '''Realign the merged control/tumour BAM around indels.

    Realigning both samples together is meant to avoid
    sample-specific realignment artefacts.  The input file is zapped
    once the realignment helper has returned.

    Arguments
    ---------
    infile : string
        Merged BAM file; zapped afterwards.
    outfile : string
        Output filename, passed to ``exome.GATKIndelRealign``.
    '''
    reference_fasta = "{}/{}.fa".format(
        PARAMS["bwa_index_dir"], PARAMS["genome"])

    exome.GATKIndelRealign(infile, outfile, reference_fasta)
    iotools.zap_file(infile)
示例#4
0
def mergeSampleBams(infile, outfile):
    '''Merge the control BAM with its matching tumour BAM.

    Every tumour-side name is derived from the control-side name by
    replacing ``PARAMS["sample_control"]`` with
    ``PARAMS["sample_tumour"]``.  Both inputs first get new read groups
    (picard AddOrReplaceReadGroups) written into a temporary directory,
    then the two re-headered files are merged into `outfile`, which is
    indexed.  Finally both original inputs are zapped.

    Arguments
    ---------
    infile : string
        Control BAM file; the tumour BAM filename is derived from it.
    outfile : string
        Filename of the merged BAM.
    '''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files

    # NOTE(review): `to_cluster` and `job_memory` are not referenced by name
    # below; presumably P.run reads them from the caller's locals - confirm.
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    # Scratch directory for the re-headered BAMs; the shell statement removes
    # it again with its final `rm -rf`.
    tmpdir_gatk = P.get_temp_dir(shared=True)

    # NOTE(review): `outfile_tumor` is not referenced in the statement below.
    outfile_tumor = outfile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    infile_tumor = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    # Basenames are used for the temporary copies inside tmpdir_gatk.
    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    # Track names are used as read-group sample (RGSM) and ID values.
    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    # NOTE(review): `control_id` and `tumor_id` are not referenced in the
    # statement below.
    control_id = "Control.bam"
    tumor_id = control_id.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    # The statement is assembled from four ';'-separated parts; its
    # %(name)s placeholders match the local variable names defined above.
    # 1. new read groups for the control BAM
    statement = '''picard AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    # 2. new read groups for the tumour BAM
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    # 3. merge the two re-headered BAMs into outfile
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    # 4. index the merged BAM and remove the scratch directory
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)
    # Both original inputs are zapped after the merge statement has run.
    iotools.zap_file(infile)
    iotools.zap_file(infile_tumor)
示例#5
0
def splitMergedRealigned(infile, outfile):
    '''Split the merged, realigned BAM back into control and tumour BAMs.

    Reads are selected per read group with ``samtools view -r``; the
    read-group names are derived from the basename of `infile`.  Both
    outputs are indexed and the merged input is zapped afterwards.

    Arguments
    ---------
    infile : string
        Merged BAM whose name ends in ``.realigned.bqsr.bam``.
    outfile : string
        Control output BAM; the tumour output name is derived from it by
        replacing ``PARAMS["sample_control"]`` with
        ``PARAMS["sample_tumour"]``.
    '''
    stem = P.snip(os.path.basename(infile), ".realigned.bqsr.bam")
    control_label = PARAMS["sample_control"]
    tumour_label = PARAMS["sample_tumour"]

    # These names are referenced by the placeholders in the template below.
    track = stem + ".bqsr"
    track_tumor = track.replace(control_label, tumour_label)
    outfile_tumor = outfile.replace(control_label, tumour_label)

    statement = '''samtools view -hb %(infile)s
                   -r %(track)s > %(outfile)s;
                   samtools view -hb %(infile)s
                   -r %(track_tumor)s > %(outfile_tumor)s;
                   samtools index %(outfile)s;
                   samtools index %(outfile_tumor)s;'''
    P.run(statement)

    # The merged file is no longer needed once both halves are written.
    iotools.zap_file(infile)
示例#6
0
def fastq_align_paired(infiles, outfile):
    """
    Aligns a pair of fq files and writes a sorted bam file.

    The aligner output is converted to bam with samtools view and then
    sorted. If a blacklist is configured (``genome_blacklist``), reads
    overlapping it are removed with bedtools intersect; otherwise the
    sorted bam simply replaces the unsorted one. The input fastq files
    are zapped afterwards.

    Arguments
    ---------
    infiles : sequence
        Exactly two fastq filenames (read 1, read 2).
    outfile : string
        Filename of the final bam file.

    """

    fq1, fq2 = infiles
    sorted_bam = outfile.replace(".bam", "_sorted.bam")

    # The command template below is filled in by P.run from PARAMS and from
    # this function's local variables (fq1, fq2, sorted_bam, ... exist only
    # as locals). The aligner must therefore be stored under the same name
    # as its placeholder, otherwise the "bowtie2" default is never applied
    # when PARAMS has no "aligner_aligner" entry.
    aligner_aligner = P.PARAMS.get("aligner_aligner", "bowtie2")
    aligner_options = P.PARAMS.get("aligner_options", "")
    blacklist = P.PARAMS.get("genome_blacklist", "")

    statement = [
        "%(aligner_aligner)s -x %(aligner_index)s -1 %(fq1)s -2 %(fq2)s %(aligner_options)s |",
        "samtools view - -b > %(outfile)s &&",
        "samtools sort -@ %(pipeline_n_cores)s -o %(sorted_bam)s %(outfile)s",
    ]

    if blacklist:
        # Uses bedtools intersect to remove blacklisted regions
        statement.append(
            "&& bedtools intersect -v -b %(blacklist)s -a %(sorted_bam)s > %(outfile)s"
        )
        statement.append("&& rm -f %(sorted_bam)s")

    else:
        statement.append("&& mv %(sorted_bam)s %(outfile)s")

    P.run(
        " ".join(statement),
        job_queue=P.PARAMS["pipeline_cluster_queue"],
        job_threads=P.PARAMS["pipeline_n_cores"],
        job_condaenv=P.PARAMS["conda_env"],
    )

    # Zeros the trimmed fastq files
    for fn in infiles:
        zap_file(fn)
示例#7
0
def clean(files, logfile):
    '''clean up files by zapping them.

    Every entry of `files` is passed to ``iotools.zap_file``. For each
    file that was zapped, one line with the filename, a timestamp, the
    link destination and the stat fields returned by ``zap_file`` is
    appended to `logfile`; a header line is written first if `logfile`
    does not exist yet.

    If the ``dryrun`` parameter is set, nothing is zapped and `logfile`
    is not touched; the files are only counted.

    Arguments
    ---------
    files : list
        Filenames to clean up. Each entry is handed to
        ``iotools.zap_file`` as is.
    logfile : string
        Filename of logfile.

    Returns
    -------
    counter : E.Counter
        Counts of files seen (``files``), files zapped (``zapped``) and
        zapped files that had a link destination (``links``).

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    # The log is only opened for real runs; it stays None in a dry run so
    # that the cleanup below never touches an unbound name.
    outfile = None
    if not dry_run:
        write_header = not os.path.exists(logfile)
        outfile = iotools.open_file(logfile, "w" if write_header else "a")

    c = E.Counter()
    try:
        if outfile is not None and write_header:
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))

        for fn in files:
            c.files += 1
            if dry_run:
                continue

            stat, linkdest = iotools.zap_file(fn)
            if stat is None:
                # zap_file reported nothing to record for this file
                continue

            c.zapped += 1
            if linkdest is not None:
                c.links += 1
            outfile.write(
                "%s\t%s\t%s\t%s\n" %
                (fn, time.asctime(time.localtime(time.time())), linkdest,
                 "\t".join([str(getattr(stat, x)) for x in fields])))
    finally:
        # Close the log even if zapping one of the files raised.
        if outfile is not None:
            outfile.close()

    get_logger().info("zapped: %s" % (c))

    return c
示例#8
0
def zap_files(files):
    '''Call zap_file on every entry of `files`, in order.

    Arguments
    ---------
    files : iterable
        Filenames to hand to zap_file one by one.
    '''
    for path in files:
        zap_file(path)
示例#9
0
def fastq_align(infiles, outfile):
    """
    Aligns fq files with STAR and writes a sorted bam file.

    STAR writes an unsorted bam file (``<prefix>Aligned.out.bam``, where
    the prefix is `outfile` without ``.bam``). That file is sorted with
    samtools; if ``genome_blacklist`` is configured, reads overlapping it
    are removed with bedtools intersect, otherwise the sorted file is
    moved to `outfile`. The unsorted STAR output is deleted and the input
    fastq files are zapped afterwards.

    Arguments
    ---------
    infiles : sequence
        Fastq filenames, passed space-separated to ``--readFilesIn``.
    outfile : string
        Filename of the final bam file.

    """

    # NOTE(review): `basename` is not referenced by the command below.
    basename = os.path.basename(outfile).replace(".bam", "")
    sorted_bam = outfile.replace(".bam", "_sorted.bam")
    blacklist = P.PARAMS.get("genome_blacklist", "")

    # STAR uses this as output prefix and appends "Aligned.out.bam" to it.
    star_prefix = outfile.replace(".bam", "")
    star_bam = star_prefix + "Aligned.out.bam"

    star_cmd = " ".join(
        [
            "STAR",
            "--genomeDir", P.PARAMS["aligner_index"],
            "--readFilesIn", " ".join(infiles),
            "--readFilesCommand", "cat",
            "--outSAMtype", "BAM Unsorted",
            "--runThreadN", str(P.PARAMS["pipeline_n_cores"]),
            "--outFileNamePrefix", star_prefix,
            P.PARAMS["aligner_options"] or "",
        ]
    )

    sort_cmd = " ".join(
        [
            "samtools", "sort",
            "-@", str(P.PARAMS["pipeline_n_cores"]),
            "-o", sorted_bam,
            star_bam,
        ]
    )

    # Either filter the sorted bam against the blacklist or just rename it.
    filter_cmd = (
        " ".join(
            [
                "bedtools", "intersect", "-v",
                "-a", sorted_bam,
                "-b", blacklist,
                ">", outfile,
                "&&", "rm", "-f", sorted_bam,
            ]
        )
        if blacklist
        else " ".join(["mv", sorted_bam, outfile])
    )

    cleanup_cmd = " ".join(["rm", "-f", star_bam])

    # The spacing around "&&" and the line breaks reproduce the submitted
    # command text exactly.
    command = (
        f"{star_cmd}     && \n"
        f"            {sort_cmd}  && \n"
        f"            {filter_cmd} && \n"
        f"            {cleanup_cmd}"
    )

    P.run(
        command,
        job_queue=P.PARAMS["pipeline_cluster_queue"],
        job_threads=P.PARAMS["pipeline_n_cores"],
        job_memory="32G",
        job_condaenv=P.PARAMS["conda_env"],
    )

    # Zeros the trimmed fastq files
    for fastq in infiles:
        zap_file(fastq)