Example #1
def summarizeMACSFDR(infiles, outfile):
    '''compile table with peaks that would remain after filtering
    by fdr.
    '''

    fdr_thresholds = numpy.arange(0, 1.05, 0.05)

    outf = iotools.openFile(outfile, "w")
    outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds)))

    for infile in infiles:
        called = []
        track = P.snip(os.path.basename(infile), ".macs")
        infilename = infile + "_peaks.xls.gz"
        inf = iotools.openFile(infilename)
        peaks = list(WrapperMACS.iteratePeaks(inf))

        for threshold in fdr_thresholds:
            called.append(len([x for x in peaks if x.fdr <= threshold]))

        outf.write("%s\t%s\n" % (track, "\t".join(map(str, called))))

    outf.close()
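
The per-threshold counting above depends only on each peak's FDR value, so the core logic can be checked without MACS output. A minimal sketch, assuming a stand-in record with an ``fdr`` attribute (the ``Peak`` namedtuple below is hypothetical and only illustrates the thresholding):

import numpy
from collections import namedtuple

# Hypothetical stand-in for a WrapperMACS peak record; only the fdr field matters here.
Peak = namedtuple("Peak", "fdr")

peaks = [Peak(0.01), Peak(0.04), Peak(0.20), Peak(0.60)]
fdr_thresholds = numpy.arange(0, 1.05, 0.05)

# Count how many peaks would survive each FDR cutoff.
called = [len([p for p in peaks if p.fdr <= t]) for t in fdr_thresholds]
print(dict(zip(fdr_thresholds.round(2), called)))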
def loadTranscriptomeValidation(infiles, outfile):
    '''load transcriptome validation data into database.'''

    to_cluster = USECLUSTER

    headers = ",".join(
        [P.tablequote(P.snip(x, ".accepted.bam")) for x in infiles])
    infiles = " ".join(["%s.log" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = '''
    cgat combine_tables
         --header-names=%(headers)s
         %(infiles)s
    | cgat table2table --transpose
    | perl -p -e "s/bin/track/"
    | cgat csv2db
         --table=%(tablename)s
    > %(outfile)s
    '''

    P.run()
Example #3
def makeSalmonIndex(infile, outfile):
    # Long transcripts cause indexing to use lots of memory?
    job_memory = "64G"
    job_threads = 1

    gtf_basename = P.snip(os.path.basename(infile), ".gtf.gz")
    transcript_fasta = "salmon_index/" + gtf_basename + "transcripts.fa"
    fastaref = PARAMS["portcullis_fastaref"]
    index_options = PARAMS["salmon_indexoptions"]
    tmpfile = P.get_temp_filename()

    statement = '''
    gunzip -c %(infile)s > %(tmpfile)s;
    gffread %(tmpfile)s -g %(fastaref)s -w %(transcript_fasta)s;
    salmon index
      -p %(job_threads)s
      %(index_options)s
      -t %(transcript_fasta)s
      -i %(outfile)s
      --perfectHash;
    rm %(tmpfile)s
    '''
    P.run(statement)
Example #4
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))
    outs = iotools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(iotools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
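
In the "leftright" mode above, each interval of length l gets two flanking windows of the same length: one immediately upstream (note that bed.end - l equals bed.start) and one immediately downstream, both clipped to the contig. A minimal sketch of the coordinate arithmetic with illustrative values:

# Illustrative coordinates only: a 60 bp interval on a 1000 bp contig.
lcontig = 1000
start, end = 400, 460
l = end - start

# Left flank: same-length window immediately upstream, clipped at the contig start.
left = (max(0, start - l), end - l)          # (340, 400); end - l == start
# Right flank: same-length window immediately downstream, clipped at the contig end.
right = (start + l, min(lcontig, end + l))   # (460, 520)

print(left, right)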
Example #5
def mergeSummarizedContextStats(infiles, outfile, samples_in_columns=False):
    """combine output from :func:`summarizeTagsWithinContext`.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format
    outfile : string
        Output filename in :term:`tsv` format.
    samples_in_columns :
        If True, put samples in columns. The default is to put them
        in rows.
    """

    header = ",".join(
        [P.snip(os.path.basename(x), ".contextstats.tsv.gz") for x in infiles])
    filenames = " ".join(infiles)

    if not samples_in_columns:
        transpose_cmd = \
            """| cgat table2table
            --transpose"""
    else:
        transpose_cmd = ""

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    %(transpose_cmd)s
    | gzip
    > %(outfile)s
    """

    P.run(statement)
Example #6
def loadBigWigStats(infiles, outfile):
    '''merge and load bigwig summary for all wiggle files.

    Summarise and merge bigwig files for all samples and load into a
    table called bigwig_stats

    Parameters
    ----------
    infiles : list
       Input filenames in :term:`bigwig` format
    outfile : string
        Output filename, the table name is derived from `outfile`.
    '''

    data = " ".join([
        '<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' % x
        for x in infiles
    ])
    headers = ",".join([P.snip(os.path.basename(x), ".bw") for x in infiles])

    load_statement = P.build_load_statement(P.toTable(outfile),
                                            options="--add-index=track")

    statement = '''cgat combine_tables
    --header-names=%(headers)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(data)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run()
def runPCA(infile, outfile, rownames=1):
    '''
    run principal components analysis on
    normalised matrix
    '''
    #    ncol = len(open(infile).readline().strip("\n").split("\t"))
    # read in and format data
    R('''dat <- read.csv("%s",
                          header=T,
                          stringsAsFactors=F,
                          sep="\t",
                          row.names=%i)''' % (infile, rownames))
    # run PCA
    R('''pc.dat <- prcomp(as.matrix(t(dat)))''')

    # get scores
    R('''pc.dat.scores <- data.frame(pc.dat$x)''')
    R('''pc.dat.scores$sample <- rownames(pc.dat.scores)''')
    R('''pc.dat.scores <- pc.dat.scores[, c("sample", 
                                          colnames(pc.dat.scores)[1:ncol(pc.dat.scores)-1])]'''
      )
    R('''write.table(pc.dat.scores,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outfile)

    # get the variance explained
    outf_ve = P.snip(outfile, ".tsv") + ".ve.tsv"
    R('''ve <- data.frame(summary(pc.dat)$importance)''')
    R('''ve <- ve[2,]''')
    R('''write.table(ve,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outf_ve)
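
For reference, the scores and the variance-explained row produced by the R calls above can be reproduced with plain numpy (R's prcomp centres the columns and uses an SVD; scaling is off by default). A minimal sketch on a toy samples-by-features matrix, purely illustrative and not part of the pipeline:

import numpy as np

# Toy matrix: rows = samples, columns = features (mirrors t(dat) in the R code).
X = np.array([[1.0, 2.0, 3.0],
              [2.0, 1.0, 4.0],
              [3.0, 3.0, 2.0],
              [4.0, 2.0, 1.0]])

Xc = X - X.mean(axis=0)                  # column-centre, as prcomp does
U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
scores = U * s                           # equivalent of pc.dat$x (up to sign)
explained = s ** 2 / np.sum(s ** 2)      # "Proportion of Variance" row of summary(pc.dat)$importance
print(scores)
print(explained)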
Example #8
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    # Load into database
    P.load(outf.name,
           outfile,
           options="--add-index=track")
    os.unlink(outf.name)
Example #9
def buildContigBed(infile, outfile):
    '''
    Gets the contig sizes and co-ordinates from an indexed genome :term:`fasta`
    file and outputs them to :term:`BED` format
    Parameters
    ----------
    infile : str
      infile is constructed from `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file
    Returns
    -------
    outfile : str
      :term:`BED` format file containing contig name, value (0) and contig size
      in nucleotides.  The output file name is defined in
      `PARAMS: interface_contigs_bed`
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = iotools.open_file(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\t%i\n" % (contig, 0, size))

    outs.close()
Example #10
def build_bam_stats(infiles, outfile):
    '''count number of reads mapped, duplicates, etc.
    Excludes regions overlapping repetitive RNA sequences
    Parameters
    ----------
    infiles : list
    infiles[0] : str
       Input filename in :term:`bam` format
    infiles[1] : str
       Input filename with number of reads per sample
    outfile : str
       Output filename with read stats
    annotations_interface_rna_gtf : str
        :term:`PARAMS`. :term:`gtf` format file with repetitive rna
    '''

    job_memory = "32G"

    # Only one sample
    if len(infiles) == 3:
        bamfile, readsfile, rna_file = infiles
    # If there are multiple samples, pick the .nreads file whose name
    # matches the bam file
    else:
        bamfile = infiles[0]
        rna_file = infiles[-1]
        # Derive the sample name: strip the directory, then take the
        # basename up to the first '.'
        bam_name = bamfile.split('/')[1].split('.')[0]
        for i in range(1, len(infiles) - 1):
            nread_name = infiles[i].split('/')[1].split('.')[0]
            if bam_name == nread_name:
                readsfile = infiles[i]
                break
            else:
                continue

    nreads = ModuleTrna.getNumReadsFromReadsFile(readsfile)
    track = P.snip(os.path.basename(readsfile), ".nreads")

    # if a fastq file exists, submit for counting
    if os.path.exists(track + ".fastq.gz"):
        fastqfile = track + ".fastq.gz"
    elif os.path.exists(track + ".fastq.1.gz"):
        fastqfile = track + ".fastq.1.gz"
    else:
        fastqfile = None

    if fastqfile is not None:
        fastq_option = "--fastq-file=%s" % fastqfile
    else:
        fastq_option = ""

    statement = '''
    cgat bam2stats
         %(fastq_option)s
         --force-output
         --mask-bed-file=%(rna_file)s
         --ignore-masked-reads
         --num-reads=%(nreads)i
         --output-filename-pattern=%(outfile)s.%%s
    < %(bamfile)s
    > %(outfile)s
    '''

    P.run(statement)
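
The sample matching above assumes paths of the form <dir>/<sample>.<ext> (it splits on '/' and takes the second element). A minimal, slightly more path-agnostic sketch of the same matching using os.path.basename; the filenames are illustrative:

import os

bamfile = "mapping.dir/sampleA.bam"
nreads_files = ["nreads.dir/sampleB.nreads", "nreads.dir/sampleA.nreads"]

# Match the .nreads file whose basename up to the first '.' equals the bam's.
bam_name = os.path.basename(bamfile).split(".")[0]
readsfile = next(f for f in nreads_files
                 if os.path.basename(f).split(".")[0] == bam_name)
print(readsfile)  # nreads.dir/sampleA.nreads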
Example #11
def buildUngappedContigBed(infile, outfiles):
    '''
    Constructs :term:`BED` format files containing both gapped and ungapped
    contig sizes from an index genome :term:`fasta` file.

    Parameters
    ----------
    infile: str
      infile is constructed from `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    assembly_gaps_min_size: int
      `PARAMS` - the minimum size (in nucleotides) for an assembly gap

    Returns
    -------
    outfiles: list
      two separate :term:`BED` format output files containing the contig sizes
      for contigs with and without gaps.  The names are defined
      in the `PARAMS` `interface_contigs_ungapped_bed` and
      `interface_gaps_bed` parameters.
    '''

    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs_nogap = iotools.open_file(outfiles[0], "w")
    outs_gap = iotools.open_file(outfiles[1], "w")
    min_gap_size = PARAMS["assembly_gaps_min_size"]

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():

        seq = fasta.getSequence(contig)

        def gapped_regions(seq):
            is_gap = seq[0] == "N"
            last = 0
            for x, c in enumerate(seq):
                if c == "N":
                    if not is_gap:
                        last = x
                        is_gap = True
                else:
                    if is_gap:
                        yield (last, x)
                        last = x
                        is_gap = False
            if is_gap:
                yield last, size

        last_end = 0
        for start, end in gapped_regions(seq):
            if end - start < min_gap_size:
                continue

            if last_end != 0:
                outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, start))
            outs_gap.write("%s\t%i\t%i\n" % (contig, start, end))
            last_end = end

        if last_end < size:
            outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, size))

    outs_nogap.close()
    outs_gap.close()
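
The nested gapped_regions generator is the part worth testing in isolation: it yields half-open (start, end) coordinates for every run of 'N' characters. A standalone sketch on a toy sequence (here size is taken as len(seq); in the function above it is the contig size from the enclosing loop):

def gapped_regions(seq):
    size = len(seq)
    is_gap = seq[0] == "N"
    last = 0
    for x, c in enumerate(seq):
        if c == "N":
            if not is_gap:
                last = x
                is_gap = True
        else:
            if is_gap:
                yield last, x
                last = x
                is_gap = False
    if is_gap:
        yield last, size

print(list(gapped_regions("ACGTNNNNACGTNNACGT")))  # [(4, 8), (12, 14)]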
Example #12
# Pipeline configuration
###################################################
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

# Add automatically created files to the interface.  This is required
# when the pipeline is peeked.  The statement below will
# add the following to the dictionary:
#
# "geneset.dir/lincrna_gene_tss.bed.gz" maps to
# "interface_geneset_lincrna_gene_tss_bed"
PARAMS.update(
    dict([("interface_geneset_%s" %
           re.sub("[.]", "_", os.path.basename(P.snip(x, ".gz"))), x)
          for x in glob.glob('geneset.dir/*.bed.gz')]))
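
The mapping described in the comment can be checked in isolation: strip '.gz', take the basename, and replace dots with underscores to form the interface key. A minimal sketch (plain slicing stands in for P.snip; the filename is the one quoted in the comment above):

import os
import re

x = "geneset.dir/lincrna_gene_tss.bed.gz"
key = "interface_geneset_%s" % re.sub(
    "[.]", "_", os.path.basename(x[:-len(".gz")]))
print(key)  # interface_geneset_lincrna_gene_tss_bed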


def connect():
    '''connect to database.'''

    dbh = sqlite3.connect(PARAMS["database_name"])
    return dbh


def connectToUCSC():
    return gtfsubset.connectToUCSC(host=PARAMS["ucsc_host"],
                                   user=PARAMS["ucsc_user"],
                                   database=PARAMS["ucsc_database"])
def matchBgSequenceComposition(gc_load_files,
                               background,
                               foreground,
                               fasta_file,
                               outfile,
                               database="csvdb",
                               header_line=True,
                               bg_stat="pCpG",
                               stat="fisher"):
    '''
    take the background set and subset it for intervals with a sequence
    composition distribution that is the same as the foreground set.
    Subsetting is done without replacement.
    This requires that the background set is sufficiently large, if the
    returned matched_background set is <90% size of foreground set, the
    pipeline will crash.
    '''
    background_file = open(background)
    foreground_file = open(foreground)

    if header_line:
        background_file.readline()
        foreground_file.readline()

    background_set = set()
    foreground_set = set()
    for interval_id in background_file.readlines():
        background_set.add(interval_id[:-1])
    for interval_id in foreground_file.readlines():
        foreground_set.add(interval_id[:-1])

    dbh = sqlite3.connect(database)
    cc = dbh.cursor()
    tablenames = [
        filenameToTablename(P.snip(os.path.basename(x), ".load"))
        for x in gc_load_files
    ]

    # jj: cpg scores rounded to three dp.
    # background dict - key <cpg score>: val <set of gene_ids with that score>
    # foreground dict - key <gene_id>: val <cpg score>

    gc = {"background": collections.defaultdict(set), "foreground": {}}
    for tablename in tablenames:

        # MM: need to make sure `-` in filenames don't break the sql statement

        tablename = tablename.replace("-", "_")
        for data in cc.execute("""SELECT * FROM %s;""" % tablename):
            interval_id = data[3].split(" ")[0]
            cpg = data[2]

            # jj: bin the background by cpg score rounded to three decimal places
            cpg_str = "%.3f" % cpg
            if re.search("background", tablename):
                if interval_id in background_set:
                    gc["background"][cpg_str].add(interval_id)

            elif re.search("foreground", tablename):
                if interval_id in foreground_set:
                    gc["foreground"][interval_id] = cpg_str

            else:
                raise ValueError("Unrecognized table name %s. Should contain"
                                 "'foreground' or 'background'" % tablename)

    # debug: pickle and dump the gc dict
    pickle_file = P.snip(foreground, ".foreground.tsv") + ".p"
    pickle.dump(gc, open(pickle_file, "wb"))

    # match the background set to the foreground set by taking a random
    # background interval with the same sequence composition as each
    # foreground interval.
    outf = open(outfile, "w")
    if header_line:
        outf.write("gene_id\n")

    # jj: sample background gene_ids without replacement
    matched_background = set()
    X = 0
    for interval, cpg in gc["foreground"].items():

        # print("Finding background for foreground gene: %s (%s)" %
        # (interval, cpg))
        if cpg in list(gc["background"].keys()):

            # get set of bg gene_ids with relevant cpg score
            bg_gene_ids = gc["background"][cpg]

            # print "There are %i background genes in total" % len(bg_gene_ids)
            # remove foreground genes from background set
            bg_gene_ids = bg_gene_ids - foreground_set

            # print("There are %i background genes after removing foreground" %
            # len(bg_gene_ids))

            if bg_gene_ids:
                # select one gene_id to add to matched_background

                bg_id = random.sample(gc["background"][cpg], 1)[0]
                matched_background.add(bg_id)
                # remove selected background gene_id from set

                gc["background"][cpg].remove(bg_id)
            else:
                X += 1
                E.warn("Missing background gene for %s %s, no gene with"
                       " matching %s" %
                       (foreground_file.name, interval, bg_stat))

        else:
            X += 1
            E.warn("Missing background gene for %s %s, no gene with"
                   " matching %s" % (foreground_file.name, interval, bg_stat))

    # Hack
    # jj: check that the matched background is at most 10% smaller than the
    # foreground set
    # MM: only need to check sufficient background size for Fisher's exact test
    if stat == "fisher":
        assert len(matched_background) > 0.9 * len(foreground_set), (
            "There are insufficient genes with matched background to perform"
            " test for sample %s" % foreground_file.name)
    print("Number of genes with no available background: %i" % X)
    print("Foreground set: %i" % len(foreground_set))
    print("Background set: %i" % len(matched_background))
    outf.write("\n".join(matched_background) + "\n")
    outf.close()
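
Stripped of the database access, the matching step samples one background gene per foreground gene from the bin with the same composition score, without replacement. A minimal sketch with toy bins (gene identifiers and scores are illustrative):

import collections
import random

foreground = {"geneA": "0.450", "geneB": "0.450", "geneC": "0.800"}
background = collections.defaultdict(set, {
    "0.450": {"bg1", "bg2"},
    "0.800": {"bg3"},
})

matched_background = set()
for gene, cpg in foreground.items():
    candidates = background[cpg]
    if candidates:
        # sample without replacement: remove the chosen id from its bin
        bg_id = random.choice(sorted(candidates))
        matched_background.add(bg_id)
        candidates.remove(bg_id)
print(matched_background)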
Example #14
    def buildStatement(self, *args, **PARAMS):
        """
        Generate run statement for processing single, paired, or paired
        + singleton samples. 

        Required arguments: 
        index
        reference
        
        """

        run_options = PARAMS["sortmerna_run_options"]
        threads = PARAMS["sortmerna_threads"]

        # A comma separated list of references
        references = PARAMS["sortmerna_reference"]
        references = ' --ref '.join(references.split(','))
        # All listed references must be pre-indexed in this location
        index_dir = PARAMS[
            "sortmerna_index"]  # Check this isn't automatically passed.

        tmpf = P.get_temp_dir('.')
        tmpf_kvdb = os.path.join(tmpf, 'kvdb')
        tmpf_readb = os.path.join(tmpf, 'readb')

        if not self.fastn2:
            # Run sortMeRNA for single reads
            in_fastn1 = self.fastn1
            in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True)
            out_prefix = os.path.join(self.outdir, in_prefix)

            # Run sortMeRNA for single reads
            statement = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn1)s"
                " --ref %(references)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --aligned %(out_prefix)s_aligned"  # output location of aligned seq
                " --other %(out_prefix)s_unaligned"  # output location of unalinged seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --threads %(threads)s"
                " --zip-out" % locals())

        else:
            # Run sortMeRNA for paired reads
            in_fastn1 = self.fastn1
            in_fastn2 = self.fastn2
            in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True)
            out_prefix = os.path.join(self.outdir, in_prefix)
            statement = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn1)s"  # First read file
                " --reads %(in_fastn2)s"  # Second read file
                " --ref %(references)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --aligned %(out_prefix)s_aligned"  # output location of aligned seq
                " --other %(out_prefix)s_unaligned"  # output location of unalinged seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --paired_in"  # If one read is aligned, both are output to aligned file
                " --out2"  # Output paired reads to separate files
                " --threads %(threads)s"
                " --zip-out" % locals())

        if self.fastn3 and not PARAMS.get('sortmerna_skip_singletons', False):
            in_fastn3 = self.fastn3
            statement_2 = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn3)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --ref %(references)s"
                " --aligned %(out_prefix)s_aligned_singleton"  # output location of aligned seq
                " --other  %(out_prefix)s_unaligned_singleton"  # output location of unalinged seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --threads %(threads)s"
                " --zip-out" % locals())

            statement = " && ".join([
                statement,
                "rm -rf %(tmpf)s/*" % locals(),  # location of tmp_readb & kvdb
                statement_2,
                "rm -rf %(tmpf)s" % locals()
            ])

        return statement, run_options
Example #15
def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : list
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.to_table(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s"""

    to_cluster = False

    P.run(statement)

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """

        to_cluster = False

        P.run(statement)

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """

        to_cluster = False

        P.run(statement)
Example #16
def getID(infile):
    return P.snip(os.path.basename(infile),
                  ".mutect.snp.annotated.filtered.vcf")
Example #17
import os
import glob
from pathlib import Path
from ruffus import *
from cgatcore import pipeline as P

# load options from the config file
PARAMS = P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0], "pipeline.yml"])

# get all files within the directory to process
SEQUENCEFILES = "*fastq.gz"

SEQUENCEFILES_REGEX = regex(r"(\S+).(fastq.gz)")

scriptsdir = os.path.dirname(os.path.abspath(__file__))
scriptsdir = P.snip(scriptsdir, "pipelines") + "scripts"
PARAMS["scriptsdir"] = scriptsdir

reportdir = os.path.dirname(os.path.abspath(__file__))
reportdir = os.path.join(reportdir, "pipeline_docs", "Rmd")
PARAMS["reportdir"] = reportdir

########################################################
########################################################
########################################################
# Run humann3 on concatenated fastq.gz

# produces a humann3.dir which contains
# a folder for each sample, which contains
# pathcoverage, pathabundance and genefamilies files.
########################################################
Example #18
def maskLowComplexity(fastq1, outfile):
    '''Either softmask low complexity regions, or remove reads with a large
    proportion of low complexity sequence.

    Uses BBTools scripts bbduk.sh (removal), or bbmask.sh. 

    Entropy is calculated as the Shannon entropy of kmers of a specified size
    within a sliding window. Ranges from 0: mask nothing, 0.0001: mask
    homopolymers, 1: mask everything.
    '''

    bb_options = ' '.join(PARAMS['dust_options'].split(','))

    # bbmap assumes the file format based on the output being *fq.gz
    # I can't find any instructions as to how to override this.
    if IS_PAIRED:
        fastq2 = P.snip(fastq1, '.1.gz') + '.2.gz'
        fastq3 = P.snip(fastq1, '.1.gz') + '.3.gz'

        outfile1 = P.snip(outfile, '.1.gz') + '.1.fq.gz'
        outfile2 = P.snip(outfile, '.1.gz') + '.2.fq.gz'
        outfile3 = P.snip(outfile, '.1.gz') + '.3.fq.gz'

        out_disc1 = P.snip(outfile,
                           '_masked.fastq.1.gz') + '_discarded.fastq.1.fq.gz'
        out_disc2 = P.snip(outfile,
                           '_masked.fastq.1.gz') + '_discarded.fastq.2.fq.gz'
        out_disc3 = P.snip(outfile,
                           '_masked.fastq.1.gz') + '_discarded.fastq.3.fq.gz'

        if PARAMS['dust_discard_low_complexity']:
            statement1 = ("bbduk.sh"
                          "  in=%(fastq1)s"
                          "  in2=%(fastq2)s"
                          "  out=%(outfile1)s"
                          "  out2=%(outfile2)s"
                          "  outm=%(out_disc1)s"
                          "  outm2=%(out_disc2)s"
                          "  entropy=%(dust_entropy)s"
                          "  threads=%(dust_threads)s"
                          "  %(bb_options)s"
                          "  &> %(outfile)s.log")
            if IOTools.open_file(fastq3).read(1):
                statement2 = (" bbduk.sh"
                              "  in=%(fastq3)s"
                              "  out=%(outfile3)s"
                              "  outm=%(out_disc3)s"
                              "  entropy=%(dust_entropy)s"
                              "  threads=%(dust_threads)s"
                              "  %(bb_options)s"
                              "  &>> %(outfile)s.log")
            else:
                statement2 = (" touch %(outfile3)s  %(out_disc3)s")

            statement = " && ".join([statement1, statement2])

            P.run(statement, job_options=PARAMS['dust_run_options'])

        else:
            statement1 = ("bbmask.sh"
                          "  in=%(fastq1)s"
                          "  out=%(outfile1)s"
                          "  entropy=%(dust_entropy)s"
                          "  threads=%(dust_threads)s"
                          "  overwrite=t"
                          "  lowercase=t"
                          "  %(bb_options)s"
                          "  &> %(outfile)s.log &&"
                          " bbmask.sh"
                          "  in=%(fastq2)s"
                          "  out=%(outfile2)s"
                          "  entropy=%(dust_entropy)s"
                          "  threads=%(dust_threads)s"
                          "  overwrite=t"
                          "  lowercase=t"
                          "  %(bb_options)s"
                          "  &>> %(outfile)s.log")
            if IOTools.open_file(fastq3).read(1):
                statement2 = (" bbmask.sh"
                              "  in=%(fastq3)s"
                              "  out=%(outfile3)s"
                              "  entropy=%(dust_entropy)s"
                              "  threads=%(dust_threads)s"
                              "  overwrite=t"
                              "  lowercase=t"
                              "  %(bb_options)s"
                              "  &>> %(outfile)s.log")
            else:
                statement2 = (" touch %(outfile3)s")

            statement = " && ".join([statement1, statement2])

            P.run(statement, job_options=PARAMS['dust_run_options'])

        # Renaming files because of bbmap idiosyncrasies
        of1 = P.snip(outfile1, '.fq.gz') + '.gz'
        of2 = P.snip(outfile2, '.fq.gz') + '.gz'
        of3 = P.snip(outfile3, '.fq.gz') + '.gz'
        os.rename(outfile1, of1)
        os.rename(outfile2, of2)
        os.rename(outfile3, of3)

        if PARAMS['dust_discard_low_complexity']:
            od1 = P.snip(out_disc1, '.fq.gz') + '.gz'
            od2 = P.snip(out_disc2, '.fq.gz') + '.gz'
            od3 = P.snip(out_disc3, '.fq.gz') + '.gz'
            os.rename(out_disc1, od1)
            os.rename(out_disc2, od2)
            os.rename(out_disc3, od3)

    else:
        outfile1 = P.snip(outfile, '.gz') + '.fq.gz'
        out_disc = P.snip(outfile,
                          '_masked.fastq.1.gz') + '_discarded.fastq.1.fq.gz'

        if PARAMS['dust_discard_low_complexity']:
            statement = ("bbduk.sh"
                         " in=%(fastq1)s"
                         " out=%(outfile1)s"
                         " outm=%(out_disc)s"
                         " entropy=%(dust_entropy)s"
                         " threads=%(dust_threads)s"
                         " lowercase=t"
                         " %(bb_options)s"
                         " &> %(outfile)s.log")

            P.run(statement, job_options=PARAMS['dust_run_options'])

        else:
            statement = ("bbmask.sh"
                         " in=%(fastq1)s"
                         " out=%(outfile1)s"
                         " entropy=%(dust_entropy)s"
                         " threads=%(dust_threads)s"
                         " lowercase=t"
                         " %(bb_options)s"
                         " &> %(outfile.log")

            P.run(statement, job_options=PARAMS['dust_run_options'])

        os.rename(outfile1, outfile)
        if PARAMS['dust_discard_low_complexity']:
            od1 = P.snip(out_disc, '.fq.gz') + '.gz'
            os.rename(out_disc, od1)
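
The renaming block exists because the outputs are written with a '.fq.gz' extension so that bbduk/bbmask infer the format, and then renamed back to the '.fastq.N.gz' names downstream tasks expect. A one-line sketch of the suffix swap on an illustrative filename:

outfile1 = "sampleA_masked.fastq.1.fq.gz"      # name as written for bbduk/bbmask
of1 = outfile1[:-len(".fq.gz")] + ".gz"        # what P.snip(outfile1, '.fq.gz') + '.gz' produces
print(of1)                                     # sampleA_masked.fastq.1.gz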
Example #19
def removeHost(fastq1, outfile):
    '''Remove host contamination using bmtagger'''

    outf_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.txt'
    outf_host_stub = P.snip(outf_host, '.txt') + '_toremove'

    # Currently disabled. Has no effect. See drop_fastq.py
    # # Whether to keep pair if a read is identified as host.
    # if PARAMS['bmtagger_keep_pairs']:
    #     keep_pairs = True
    #     E.info("BMTagger: reads with a pair identified as host will be"
    #            " discarded")
    # else:
    #     keep_pairs = False
    #     E.info("BMTagger: reads with a pair identified as host will be"
    #            " kept as singletons (assuming they are not also identified"
    #            " as host)")

    if IS_PAIRED:
        fastq2 = P.snip(fastq1, '.1.gz') + '.2.gz'
        fastq3 = P.snip(fastq1, '.1.gz') + '.3.gz'

        to_remove_paired = P.get_temp_filename('.')
        to_remove_singletons = P.get_temp_filename('.')

        # In some cases, it may be desirable to screen against multiple hosts.
        indexes = zip(PARAMS['bmtagger_bitmask'].split(','),
                      PARAMS['bmtagger_srprism'].split(','))
        for n, indexes in enumerate(indexes, 1):
            n = str(n)
            bitmask, srprism = indexes

            # Screen the paired reads, then singletons
            tmpdir1 = P.get_temp_dir('.')
            tmpdir2 = P.get_temp_dir('.')

            tmpf1 = P.get_temp_filename('.')
            tmpf2 = P.get_temp_filename('.')
            tmpf3 = P.get_temp_filename('.')

            # bmtagger truncates fasta headers...  sed 's/[[:space:]]\+/__/g'
            # It won't accept... sed 's|[[:space:]].*$|/1|'
            # It also fails if fastq1 header differs from fastq2
            statement1 = (
                "zcat %(fastq1)s > %(tmpf1)s &&"
                " zcat %(fastq2)s > %(tmpf2)s &&"
                " bmtagger.sh"
                "  -b %(bitmask)s"
                "  -x %(srprism)s"
                "  -T %(tmpdir1)s"
                "  -q1"  # Input is fastq
                "  -1 %(tmpf1)s"
                "  -2 %(tmpf2)s"
                "  -o %(outf_host_stub)s_paired%(n)s"
                "  &> %(outfile)s.log &&"
                " cat %(outf_host_stub)s_paired%(n)s"
                "  >> %(to_remove_paired)s &&"
                " rm -rf %(tmpdir1)s %(tmpf1)s %(tmpf2)s"
                "  %(outf_host_stub)s_paired%(n)s")

            # Screen the singletons
            if IOTools.open_file(fastq3).read(1):
                statement2 = (
                    "zcat %(fastq3)s > %(tmpf3)s &&"
                    " bmtagger.sh"
                    "  -b %(bitmask)s"
                    "  -x %(srprism)s"
                    "  -T %(tmpdir2)s"
                    "  -q1"  # Input is fastq
                    "  -1 %(tmpf3)s"
                    "  -o %(outf_host_stub)s_singletons%(n)s"
                    " &>> %(outfile)s.log &&"
                    " cat %(outf_host_stub)s_singletons%(n)s"
                    "  >> %(to_remove_singletons)s &&"
                    " rm -rf %(tmpdir2)s %(tmpf3)s"
                    "  %(outf_host_stub)s_singletons%(n)s")
            else:
                statement2 = ("touch  %(to_remove_singletons)s &&"
                              " rm -rf %(tmpdir2)s %(tmpf3)s")

            statement = " && ".join([statement1, statement2])

            P.run(statement, job_options=PARAMS['bmtagger_run_options'])

        # Drop host contaminated reads
        # A hack due to the fact that BMTagger truncates fastq identifiers
        # TO DO: Look at bmtagger/.../bin/extract_fullseq
        drop_script = os.path.join(
            os.path.splitext(__file__)[0], 'drop_fastqs.py')

        fastq1_out = outfile
        fastq2_out = P.snip(outfile, '.1.gz') + '.2.gz'
        fastq3_out = P.snip(outfile, '.1.gz') + '.3.gz'

        fastq1_host = P.snip(outfile,
                             '_dehost.fastq.1.gz') + '_host.fastq.1.gz'
        fastq2_host = P.snip(outfile,
                             '_dehost.fastq.1.gz') + '_host.fastq.2.gz'
        fastq3_host = P.snip(outfile,
                             '_dehost.fastq.1.gz') + '_host.fastq.3.gz'

        statement = ("python %(drop_script)s"
                     " --fastq1 %(fastq1)s"
                     " --fastq2 %(fastq2)s"
                     " --fastq3 %(fastq3)s"
                     " --to-drop-paired %(to_remove_paired)s"
                     " --to-drop-single %(to_remove_singletons)s"
                     " --fastq-out1 %(fastq1_out)s"
                     " --fastq-out2 %(fastq2_out)s"
                     " --fastq-out3 %(fastq3_out)s"
                     " --fastq-drop1 %(fastq1_host)s"
                     " --fastq-drop2 %(fastq2_host)s"
                     " --fastq-drop3 %(fastq3_host)s"
                     " &>> %(outfile)s.log")

        P.run(statement)

        os.unlink(to_remove_paired)
        os.unlink(to_remove_singletons)

    else:
        indexes = zip(PARAMS['bmtagger_bitmask'].split(','),
                      PARAMS['bmtagger_srprism'].split(','))
        to_remove = P.get_temp_filename('.')

        for n, indexes in enumerate(indexes, 1):
            n = str(n)
            bitmask, srprism = indexes
            # Screen the singletons
            tmpdir1 = P.get_temp_dir('.')
            tmpf = P.get_temp_filename('.')

            statement = (
                "zcat %(fastq1)s > %(tmpf)s &&"
                " bmtagger.sh"
                "  -b %(bitmask)s"
                "  -x %(srprism)s"
                "  -T %(tmpdir1)s"
                "  -q1"  # Input is fastq
                "  -1 %(tmpf)s"
                "  -o %(outf_host_stub)s_%(n)s"
                "  &>> %(outfile)s.log &&"
                " cat %(outf_host_stub)s_%(n)s >> %(to_remove)s"
                " rm -rf %(tmpdir1)s %(tmpf)s %(outf_host_stub)s_%(n)s")

            P.run(statement, job_options=PARAMS['bmtagger_run_options'])

        # Drop host contaminated reads
        drop_script = os.path.join(
            os.path.splitext(__file__)[0], 'drop_single_fastqs.py')

        fastq_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.1.gz'

        statement = ("python %(drop_script)s"
                     " --fastq1 %(fastq1)s"
                     " --to-drop-single %(to_remove)s"
                     " --fastq-out1 %(outfile)s"
                     " --fastq-drop1 %(fastq_host)s"
                     " &>> %(outfile)s.log")
        P.run(statement)

        os.unlink(to_remove)
Example #20
def removeAdapters(fastq1, outfile1):
    '''Remove adapters using Trimmomatic'''

    if IS_PAIRED:
        fastq2 = P.snip(fastq1, FASTQ1_SUFFIX) + FASTQ2_SUFFIX
        outfile2 = P.snip(outfile1, '.fastq.1.gz') + '.fastq.2.gz'
        outf1_singletons = P.snip(outfile1, '.fastq.1.gz') + '.fastq.1s.gz'
        outf2_singletons = P.snip(outfile1, '.fastq.1.gz') + '.fastq.2s.gz'
        outf_singletons = P.snip(outfile1, '.fastq.1.gz') + '.fastq.3.gz'
        logfile = P.snip(outfile1, '.fastq.1.gz') + '.trim.log'
        logfile2 = P.snip(outfile1, '.fastq.1.gz') + '.log'

        statement = (
            "java -Xmx5g -jar %(trimmomatic_jar_path)s PE"
            " -threads %(trimmomatic_n_threads)s"
            " -phred%(phred_format)s"
            " -trimlog %(logfile)s"
            " %(fastq1)s"  # input read 1
            " %(fastq2)s"  # input read 2
            " %(outfile1)s"  # output read 1
            " %(outf1_singletons)s"  # output unpaired read 1
            " %(outfile2)s"  # output read 2
            " %(outf2_singletons)s"  # output unpaired read 2
            " ILLUMINACLIP:"
            "%(trimmomatic_adapters)s:"
            "%(trimmomatic_seed_mismatches)s:"
            "%(trimmomatic_score_palendromic)s:"
            "%(trimmomatic_score_simple)s:"
            "%(trimmomatic_min_adapter_len)s:"
            "%(trimmomatic_keep_both_reads)s"
            " LEADING:%(trimmomatic_quality_leading)s"
            " TRAILING:%(trimmomatic_quality_trailing)s"
            " MINLEN:%(trimmomatic_minlen)s"
            " &> %(logfile2)s &&"
            " gzip -f %(logfile)s &&"
            " cat %(outf1_singletons)s %(outf2_singletons)s "
            "  > %(outf_singletons)s &&"
            " rm -f %(outf1_singletons)s && rm -f %(outf2_singletons)s")

        P.run(statement, job_options=PARAMS['trimmomatic_run_options'])

    else:
        logfile = P.snip(outfile1, '.fastq.1.gz') + '.trim.log'
        logfile2 = P.snip(outfile1, '.fastq.1.gz') + '.log'

        statement = (
            "java -Xmx5g -jar %(trimmomatic_jar_path)s PE"
            " -threads %(trimmomatic_n_threads)s"
            " -phred%(phred_format)s"
            " -trimlog %(logfile)s"
            " %(fastq1)s"  # input read 1
            " %(outfile1)s"  # output read 1
            " ILLUMINACLIP:"
            "%(trimmomatic_adapters)s:"
            "%(trimmomatic_seed_mismatches)s:"
            "%(trimmomatic_score_palendromic)s:"
            "%(trimmomatic_score_simple)s"
            "%(trimmomatic_min_adapter_len)s:"
            "%(trimmomatic_keep_both_reads)s"
            " LEADING:%(trimmomatic_quality_leading)s"
            " TRAILING:%(trimmomatic_quality_trailing)s"
            " MINLEN:%(trimmomatic_minlen)s"
            " &> %(logfile2)s &&"
            " gzip -f %(logfile)s")

        P.run(statement, job_options=PARAMS['trimmomatic_run_options'])
Example #21
    '''
    generic split by newline and tab for reading tsv files
    '''
    return line[:-1].split("\t")


#########################################################################
#########################################################################
#########################################################################


@follows(mkdir("gtfs"))
@merge([PARAMS["genesets_abinitio_coding"], PARAMS["genesets_reference"]],
       os.path.join(
           "gtfs",
           P.snip(PARAMS["genesets_abinitio_coding"], ".gtf.gz") +
           "_coding.gtf.gz"))
def buildCodingGeneSet(infiles, outfile):
    '''
    takes the output from cuffcompare of a transcript
    assembly and filters for annotated protein coding
    genes.

    NB "pruned" refers to nomenclature in the transcript
    building pipeline - transcripts that appear in at least
    two samples.

    Because an abinitio assembly will often contain
    fragments of known transcripts and describe them as
    novel, the default behaviour is to produce a set that
    is composed of 'complete' or 'contained' transcripts
Example #22
def loadPicardMetrics(infiles, outfile, suffix,
                      pipeline_suffix=".picard_stats",
                      tablename=None):
    '''load picard metrics.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.

    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    filenames = ["%s.%s" % (x, suffix) for x in infiles]

    first = True
    for filename in filenames:
        track = P.snip(os.path.basename(filename), "%s.%s" %
                       (pipeline_suffix, suffix))

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = iotools.open_file(filename, "r").readlines()

        # extract metrics part
        rx_start = re.compile("## METRICS CLASS")
        for n, line in enumerate(lines):
            if rx_start.search(line):
                lines = lines[n + 1:]
                break

        for n, line in enumerate(lines):
            if not line.strip():
                lines = lines[:n]
                break

        if len(lines) == 0:
            E.warn("no lines in %s: %s" % (track, filename))
            continue

        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            fields = lines[0][:-1].split("\t")
        else:
            f = lines[0][:-1].split("\t")
            if f != fields:
                raise ValueError(
                    "file %s has different fields: expected %s, got %s" %
                    (filename, fields, f))

        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()

    P.load(outf.name,
           outfile,
           tablename=tablename,
           options="--add-index=track --allow-empty-file")

    os.unlink(outf.name)
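
The slicing between the "## METRICS CLASS" marker and the first blank line is the only non-obvious part of the parser above. A minimal sketch on an inlined, illustrative Picard-style snippet (startswith stands in for the compiled regex):

text = """## htsjdk.samtools.metrics.StringHeader
# picard.analysis.CollectAlignmentSummaryMetrics
## METRICS CLASS\tpicard.analysis.AlignmentSummaryMetrics
CATEGORY\tTOTAL_READS\tPF_READS
PAIR\t1000\t990

## HISTOGRAM
"""
lines = text.splitlines(keepends=True)

# keep only the lines between "## METRICS CLASS" and the first blank line
for n, line in enumerate(lines):
    if line.startswith("## METRICS CLASS"):
        lines = lines[n + 1:]
        break
for n, line in enumerate(lines):
    if not line.strip():
        lines = lines[:n]
        break
print("".join(lines), end="")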
Example #23
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    to_cluster = False

    P.run(statement)
Example #24
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        genelist = pandas.read_csv(
            iotools.openFile(infile),
            index_col=0,
            sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_foreground_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_foreground_max_threshold" % track)]
        genesets.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' % (track,
                                                   min_threshold,
                                                   field,
                                                   max_threshold))

        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_background_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_background_max_threshold" % track)]

        E.info('%s: background: %f <= %s <= %f' % (track,
                                                   min_threshold,
                                                   field,
                                                   max_threshold))
        backgrounds.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" % (track,
                                     len(genesets[-1]),
                                     len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with iotools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with iotools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with iotools.openFile(outfile + ".matrix.gz", "w") as outf:
        iotools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with iotools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        iotools.writeMatrix(outf, matrix, headers, headers)
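
The foreground/background selection above is a straightforward pandas filter on a per-track threshold window. A minimal sketch with a toy gene list (the column name and thresholds are illustrative, not taken from any configuration file):

import pandas

genelist = pandas.DataFrame(
    {"l2fold": [2.5, -0.1, 1.2, 0.0]},
    index=["g1", "g2", "g3", "g4"])

field, min_threshold, max_threshold = "l2fold", 1.0, 10.0

# foreground = genes whose value falls inside the configured window
foreground = set(genelist[
    (genelist[field] >= min_threshold) &
    (genelist[field] <= max_threshold)].index)
print(foreground)  # {'g1', 'g3'}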
Example #25
              --output-filename-pattern=%%DIR%%/
              --deseq-fit-type=%(deseq_fit_type)s
              --deseq-dispersion-method=%(deseq_dispersion_method)s
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design), "diff_methylation/%s_%s.deseq.gz" %
         (P.snip(os.path.basename(data),
                 ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that
    it contains a similar output and similar fdr compared to cuffdiff.
    '''

    runDE(infiles, outfile, "deseq")


#########################################################################
#########################################################################
Example #26
def loadGeneListMatrix(infile, outfile):
    '''load gene list matrix into table.'''
    track = P.snip(infile, ".tsv.gz")
    P.load(infile, outfile, tablename="%s_foreground" % track)
    P.load(infile + ".bg.tsv.gz", outfile, tablename="%s_background" % track)
Example #27
    def getRunStatement(self, infile, outfile, controlfile):
        """
        Generate a specific run statement for each peakcaller class
        """

        # generate outfile prefix
        dir_name = os.path.dirname(outfile)
        infile_stub = P.snip(os.path.basename(infile), ".bam")
        control_stub = P.snip(os.path.basename(controlfile), ".bam")
        outfile_stub = infile_stub + "_VS_" + control_stub
        outfile_stub = os.path.join(dir_name, outfile_stub)

        # build macs2 commandline statement
        statement = [("macs2 callpeak"
                      " --treatment %(infile)s"
                      " --control %(controlfile)s"
                      " --verbose=10")]

        # add additional parameters
        # currently the input read format has to be bam bc of ruffus regex
        statement.append("--format BAM")
        statement.append("--name %s" % outfile_stub)
        # require genome size, if it is not specified try to take from genome
        if not re.search("-g\s|--gsize",
                         self.PARAMS_PEAKCALLER["macs2_options_parameters"]):
            statement.append(
                "--gsize %s" %
                self.PARAMS_PEAKCALLER["macs2_options_genome_prefix"][:2])

        # set threshold for lax peak calling
        if self.PARAMS_PEAKCALLER["macs2_options_fdr"]:
            if self.PARAMS_PEAKCALLER["macs2_options_pvalue"]:
                raise Exception("Value specified for both macs2 options"
                                " -pvalue and -fdr please select one or"
                                " other option, but not both")
            else:
                threshold = "--qvalue " + \
                    str(self.PARAMS_PEAKCALLER["macs2_options_fdr"])
        elif self.PARAMS_PEAKCALLER["macs2_options_pvalue"]:
            threshold = "--pvalue=" + \
                str(self.PARAMS_PEAKCALLER["macs2_options_pvalue"])
        else:
            raise Exception("Must specify a value for either"
                            " macs2_options_pvalue or macs2_options_fdr,"
                            " but not both")
        statement.append(threshold)

        # deal with duplicate reads
        if self.PARAMS_PEAKCALLER["macs2_options_keep_duplicates"]:
            statement.append(
                "--keep-dup %s" %
                self.PARAMS_PEAKCALLER["macs2_options_keep_duplicates"])

        # add additional parameters
        statement.append(self.PARAMS_PEAKCALLER["macs2_options_parameters"])

        # write log information to sentinel file
        statement.append(">& %(outfile)s")

        statement = (" ".join(statement) % locals())

        return statement
Example #28
def buildDMRStats(tables, method, outfile, dbhandle):
    '''build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    genes in output from (:mod:`scripts/runExpression`).

    This method also creates diagnostic plots in the
    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. Tab-separated file summarizing the number of tested,
        significant, up/down and two-fold changed windows per comparison.

    '''
    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = iotools.openFile(outfile, "w")
    outf.write("\t".join((
        "tileset",
        "design",
        "track1",
        "track2",
        "tested",
        "\t".join(["status_%s" % x for x in keys_status]),
        "significant",
        "up",
        "down",
        "twofold",
        "twofold_up",
        "twofold_down",
    )) + "\n")

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            return collections.defaultdict(int, [(tuple(x[:l]), x[l])
                                                 for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        status = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, status,
            COUNT(*) FROM %(tablename)s 
            GROUP BY treatment_name,control_name,status""" %
                locals()).fetchall(), 3)
        signif = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s 
            WHERE significant
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        fold2 = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s
            WHERE (l2fold >= 1 or l2fold <= -1) AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())
        up = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 0 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        down = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < 0 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        fold2up = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 1 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        fold2down = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < -1 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        groups = list(tested.keys())

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(
                map(str, (tileset, design, treatment_name, control_name,
                          tested[k], "\t".join([
                              str(status[(treatment_name, control_name, x)])
                              for x in keys_status
                          ]), signif[(k)], up[k], down[k], fold2[k],
                          fold2up[k], fold2down[k]))) + "\n")

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle, '''SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant''' % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = list(zip(*data))

            pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals(
            )
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10(length)',
                            ylab='log10(pvalue)',
                            log="x",
                            pch=20,
                            cex=.1)

            R['dev.off']()

    outf.close()
Example #29
def intersectionHeatmap(infiles, outfile):
    ''' calculate the intersection between the infiles and plot'''

    pandas2ri.activate()

    name2genes = {}
    df = pd.DataFrame(columns=["id_1", "id_2", "intersection", "perc"])

    ix = 0
    for inf in infiles:

        name = P.snip(os.path.basename(inf)).split(".")[0]
        name = name.replace(".", "_")

        with iotools.open_file(inf, "r") as f:
            genes = set()

            for line in f:
                if line[0] == "#":
                    continue

                values = line.strip().split("\t")
                info = values[7].split(";")

                gene_name = None
                for x in info:
                    if x.split("=")[0] == "SNPEFF_GENE_NAME":
                        gene_name = x.split("=")[1]
                        break

                # if no gene name is found, the line is skipped
                if gene_name:
                    genes.update((gene_name, ))

        name2genes[name] = genes
        df.loc[ix] = [name, name, len(genes), 1.0]
        ix += 1

    for pair in itertools.permutations(list(name2genes.keys()), 2):
        id_1, id_2 = pair
        intersection = len(name2genes[id_1].intersection(name2genes[id_2]))
        not_intersecting = len(name2genes[id_1].symmetric_difference(
            name2genes[id_2]))
        intersection_perc = float(intersection) / (intersection +
                                                   not_intersecting)

        df.loc[ix] = [id_1, id_2, intersection, intersection_perc]
        ix += 1

    variant = os.path.basename(outfile).replace("overlap_", "").replace(
        "_heatmap.png", "")

    plotIntersectionHeatmap = R('''
    function(df){
    library(ggplot2)
    m_txt = element_text(size=15)
    m_txt_90 = element_text(size=15, angle=90, vjust=0.5, hjust=1)
    l_txt = element_text(size=20)

    p = ggplot(df, aes(id_1, id_2, fill=100*perc)) +
    geom_tile() +
    geom_text(aes(label=intersection), size=3) +
    scale_fill_gradient(name="Intersection (%%)", limits=c(0,100),
                       low="yellow", high="dodgerblue4") +
    theme(axis.text.x = m_txt_90, axis.text.y = m_txt,
          legend.text = m_txt, legend.title = m_txt,
          aspect.ratio=1) +
    xlab("") + ylab("") +
    ggtitle("%(variant)s")

    ggsave("%(outfile)s", width=10, height=10)
    }''' % locals())

    plotIntersectionHeatmap(df)
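
Because intersection plus symmetric difference equals the union, the perc column computed above is simply the Jaccard index of the two gene sets. A minimal sketch with toy sets (gene names are illustrative):

import itertools

name2genes = {"sampleA": {"TP53", "BRCA1", "EGFR"},
              "sampleB": {"TP53", "EGFR", "MYC"}}

for id_1, id_2 in itertools.permutations(name2genes, 2):
    intersection = len(name2genes[id_1] & name2genes[id_2])
    union = len(name2genes[id_1] | name2genes[id_2])
    # intersection / (intersection + symmetric difference) == intersection / union
    print(id_1, id_2, intersection, intersection / union)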
Example #30
def sortByPosition(infile, outfile):
    '''Sort BAM file by position.'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort %(infile)s %(track)s;'''
    P.run()