Example #1
def loadTranscriptStats(infile, outfile):
    '''compute and load transcript properties into database.

    The method calls :doc:`gtf2table` with the following counters:
    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--add-index=transcript_id "
                                            "--map=gene_id:str")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --reporter=transcripts
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
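A note on the %(name)s placeholders used throughout these examples: P.run() interpolates them from PARAMS and from the caller's local variables before executing the statement, as CGAT-style pipelines do. A minimal sketch of that convention in plain Python (the variable values are illustrative):

infile = "geneset.gtf.gz"
outfile = "transcript_stats.load"
statement = "gunzip < %(infile)s | wc -l > %(outfile)s"
# P.run-style interpolation against the local namespace
print(statement % locals())
# gunzip < geneset.gtf.gz | wc -l > transcript_stats.load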
Example #2
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

       context_stats
       bam_stats
    '''

    dbh = connect()

    tablename = P.to_table(outfile)
    view_type = "TABLE"
    tables = ((
        "bam_stats",
        "track",
    ), (
        "context_stats",
        "track",
    ))

    # do not use: ("picard_stats_alignment_summary_metrics", "track"),)
    # as there are multiple rows per track for paired-ended data.

    P.create_view(dbh, tables, tablename, outfile, view_type)
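P.create_view itself is not shown here; the following is a hedged sqlite3 sketch of what the (table, join column) specification above describes, joining bam_stats and context_stats on their shared track column. The toy tables and their columns are invented, and view_type = "TABLE" is taken to mean the join is materialised:

import sqlite3

dbh = sqlite3.connect(":memory:")
dbh.execute("CREATE TABLE bam_stats (track TEXT, reads INT)")
dbh.execute("CREATE TABLE context_stats (track TEXT, contexts INT)")
dbh.execute("INSERT INTO bam_stats VALUES ('s1', 100)")
dbh.execute("INSERT INTO context_stats VALUES ('s1', 42)")
dbh.execute("""CREATE TABLE view_mapping AS
               SELECT b.track AS track, b.reads, c.contexts
               FROM bam_stats b
               JOIN context_stats c ON b.track = c.track""")
print(dbh.execute("SELECT * FROM view_mapping").fetchall())
# [('s1', 100, 42)]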
Example #3
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [
        x[0] for x in cc.execute("SELECT motif FROM motif_info").fetchall()
    ]

    for motif in motifs:

        tmpf = P.get_temp_file(".")

        for infile in infiles:
            table = P.to_table(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s WHERE motif = '%(motif)s' AND
                    start IS NOT NULL""" % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(PARAMS["exportdir"], "motifs",
                               "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run(statement)

        os.unlink(tmpf.name)
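The SQL above is assembled with % locals(); with sqlite3 the motif and track values can instead be bound as parameters, which sidesteps quoting problems (identifiers such as the table name still have to be interpolated). A hedged, self-contained sketch with an invented toy table:

import sqlite3

dbh = sqlite3.connect(":memory:")
dbh.execute('CREATE TABLE sample_mast '
            '(contig TEXT, start INT, "end" INT, motif TEXT, evalue REAL)')
dbh.execute("INSERT INTO sample_mast VALUES ('chr1', 10, 20, 'M1', 0.01)")
table, track, motif = "sample_mast", "sample", "M1"
cc = dbh.cursor()
for x in cc.execute(
        "SELECT contig, start, end, ?, evalue "
        "FROM %(table)s WHERE motif = ? AND start IS NOT NULL" % locals(),
        (track, motif)):
    print("\t".join(map(str, x)))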
Example #4
def loadGeneStats(infile, outfile):
    """compute and load gene statistics to database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A :term:`gtf` file which is output from :meth:`buildGenes`
    outfile : string
        A log file. The table name is derived from `outfile`.
        e.g. bam_stats.load
    """

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
Example #5
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContex` and load into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in `suffix`.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    header = ",".join([P.snip(os.path.basename(x), suffix) for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run(statement)
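P.snip, used here and in most of these examples to derive track names, strips a known suffix from a string. A hedged pure-Python equivalent:

import os

def snip(s, suffix):
    # assumption: P.snip removes a trailing suffix and complains if absent
    assert s.endswith(suffix), "%s does not end in %s" % (s, suffix)
    return s[:-len(suffix)]

print(snip(os.path.basename("tags.dir/liver.contextstats.tsv.gz"),
           ".contextstats.tsv.gz"))
# liver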
Example #6
def loadPeptideSequences(infile, outfile):
    '''load ENSEMBL peptide file into database

    This method removes empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.

    '''

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=protein_id "
                                            "--map=protein_id:str")

    statement = '''gunzip
    < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | cgat fasta2fasta --method=filter
    --filter-method=min-length=1
    | cgat fasta2table --section=length
    --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
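The perl one-liner above trims FASTA header lines to the bare identifier by deleting everything after the first space; a hedged Python rendering of the same substitution:

import re

line = ">ENSMUSP00000118372 pep scaffold:GRCm38"
print(re.sub(r"^(>\S+) .*$", r"\1", line))
# >ENSMUSP00000118372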
Example #7
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.

    """
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, job_memory=PARAMS["job_memory"])
Example #8
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run(statement)
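combine_tables is steered here by --regex-start and --take; a hedged sketch of the per-file extraction this asks for, assuming the usual picard layout where a "## HISTOGRAM" marker is followed by a header row and data rows up to the next blank line:

def read_histogram(lines):
    rows, in_hist = [], False
    for line in lines:
        if line.startswith("## HISTOGRAM"):
            in_hist = True
            continue
        if in_hist:
            if not line.strip():
                break
            rows.append(line.rstrip("\n").split("\t")[:2])
    return rows

example = ["## HISTOGRAM\tjava.lang.Integer\n",
           "insert_size\tcount\n", "100\t5\n", "101\t7\n", "\n"]
print(read_histogram(example))
# [['insert_size', 'count'], ['100', '5'], ['101', '7']]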
Example #9
def loadMutectFilteringSummary(infile, outfile):
    '''Load mutect extended output into database'''

    dbh = connect()
    tablename = P.to_table(outfile)
    statement = '''cat %(infile)s |
                   cgat csv2db
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s'''
    P.run(statement)
Example #10
def loadGeneInformation(infile,
                        outfile,
                        only_proteincoding=False,
                        job_memory="4G"):
    '''load gene-related attributes from :term:`gtf` file into database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output filename, contains logging information. The
       table name is derived from the filename of outfile.
    only_proteincoding : bool
       If True, only consider protein coding genes.

    '''

    table = P.to_table(outfile)

    if only_proteincoding:
        filter_cmd = """cgat gtf2gtf
        --method=filter --filter-method=proteincoding"""
    else:
        filter_cmd = "cat"

    load_statement = P.build_load_statement(table,
                                            options="--add-index=gene_id "
                                            "--add-index=gene_name "
                                            "--map=gene_name:str")

    statement = '''
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | cgat gtf2gtf
    --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv
    --attributes-as-columns --output-only-attributes -v 0
    | cgat csv-cut
    --remove exon_id transcript_id transcript_name protein_id exon_number
    | (read h; echo "$h"; sort)
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement, job_memory=job_memory)
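The (read h; echo "$h"; sort) fragment in the statement above is a header-preserving sort: the first line is consumed and re-emitted before the remaining lines are sorted. A hedged Python equivalent:

text = "gene_id\tgene_name\nzfp36\tZfp36\nactb\tActb\n"
header, *rows = text.splitlines()
print("\n".join([header] + sorted(rows)))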
Example #11
def loadVCFstats(infiles, outfile):
    '''Import variant statistics into SQLite'''
    filenames = " ".join(infiles)
    tablename = P.to_table(outfile)
    csv2db_options = PARAMS["csv2db_options"]
    E.info("Loading vcf stats...")
    statement = '''cgat vcfstats2db
                   %(filenames)s >> %(outfile)s; '''
    statement += '''cat vcfstats.txt |
                    cgat csv2db %(csv2db_options)s
                    --allow-empty-file --add-index=track --table=vcf_stats
                    >> %(outfile)s; '''
    P.run(statement)
Example #12
def exportPeakLocations(infile, outfile):
    '''export peak locations
    '''

    dbh = connect()
    outf = IOTools.open_file(outfile, "w")
    cc = dbh.cursor()
    table = P.to_table(infile)
    for x in cc.execute("""SELECT contig, peakcenter,
    peakcenter+1, interval_id, peakval
    FROM %(table)s """ % locals()):
        outf.write("\t".join(map(str, x)) + "\n")
    outf.close()
Example #13
def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    Arguments
    ---------
    infiles : list
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()

        for line in lines:
            count = line.split("\t")[1]
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
Example #14
def loadMotifSequenceComposition(infile, outfile):
    '''compute sequence composition of sequences used for ab-initio search.'''

    load_statement = P.build_load_statement(
        P.to_table(outfile))

    statement = '''
    cgat fasta2table
        --section=na
        --log=%(outfile)s
    < %(infile)s
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
Example #15
def loadMotifSequenceComposition(infile, outfile):
    '''compute sequence composition of sequences used for ab-initio search.'''

    tablename = P.to_table(outfile)

    statement = '''
    cgat fasta2table
        --section=na
        --log=%(outfile)s
    < %(infile)s
    | cgat csv2db
        %(csv2db_options)s
        --table=%(tablename)s
    > %(outfile)s'''

    P.run(statement)
Example #16
def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.to_table(outfile)

    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file

    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.iter("motif"):
        name = motif.get("id")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(("target_name", "query_id", "target_id",
                                     "optimal_offset", "pvalue", "evalue",
                                     "qvalue", "Overlap", "query_consensus",
                                     "target_consensus", "orientation")) +
                          "\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
Example #17
def loadProteinStats(infile, outfile):
    '''compute and load protein sequence properties into database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
       Filename of ENSEMBL peptide file in :term:`fasta` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=protein_id "
                                            "--map=protein_id:str")

    statement = '''
    gunzip < %(infile)s
    | cgat fasta2fasta
    --method=filter
    --filter-method=min-length=1
    | awk 'match($0, /(>[a-zA-Z]+[0-9]+)(\.[0-9])*(.*)/, a) {print a[1], a[3]}
    !/^>/ {print}'
    | cgat fasta2table
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
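The awk step above strips a trailing version number from identifiers such as >ENSMUSP00000118372.1 while keeping the description; a hedged re-based equivalent:

import re

line = ">ENSMUSP00000118372.1 pep scaffold:GRCm38"
print(re.sub(r"^(>[A-Za-z]+\d+)(\.\d+)?", r"\1", line))
# >ENSMUSP00000118372 pep scaffold:GRCm38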
Example #18
def loadGeneCoordinates(infile, outfile):
    '''merge transcripts to generate the genomic coordinates per gene
    and load '''

    # TS. remove transcript_id column as this is now meaningless
    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--ignore-column=transcript_id "
                                            "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=merge-transcripts
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
Example #19
def loadTranscript2Gene(infile, outfile):
    '''build a map of transcript to gene from gtf file and load into database.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.
    '''
    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--add-index=transcript_id ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv --output-map=transcript2gene -v 0
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
Example #20
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into database.
    '''

    bedfile = infile

    track = Sample(filename=P.snip(infile, ".bed.gz"))
    bamfiles, offsets = getAssociatedBAMFiles(track)
    control = ""

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    assert (len(bamfiles) == 1)
    bamfile = bamfiles[0]
    offset = offsets[0]

    tablename = P.to_table(outfile)

    statement = '''zcat %(bedfile)s
                | awk '{printf("%%s\\t%%i\\t%%i\\t%%i\\n", $1,$2,$3,++a)}'
                | cgat bed2table
                           --counter=peaks
                           --bam-file=%(bamfile)s
                           --offset=%(offset)i
                           --bed-header=contig,start,end,interval_id
                           %(control)s
                           --output-all-fields
                           --log=%(outfile)s
                | cgat csv2db %(csv2db_options)s
                       --add-index=contig,start
                       --add-index=interval_id
                       --table=%(tablename)s
                       --allow-empty-file
                > %(outfile)s'''

    P.run(statement)
Example #21
def loadStrandSpecificity(infiles, outfile,
                          suffix="strand",
                          tablename=None):
    '''collate per-sample strand-specificity tables and load into database.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        name = P.snip(os.path.basename(infile), ".strand")

        table = pd.read_csv(infile, sep="\t", comment="#")
        table["track"] = name

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["MSR", "ISR", "OSR", "ISF", "MSF", "OSF", "SF", "SR", "track"],
                                     how="outer")

    table_join.to_csv(outf, sep="\t", index=False)

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
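Merging each new table onto the accumulated one with an outer join on every column, as above, effectively stacks the per-sample rows (identical rows would collapse into one); a hedged pandas illustration with invented values:

import pandas as pd

a = pd.DataFrame({"MSR": [1], "ISR": [10], "track": ["s1"]})
b = pd.DataFrame({"MSR": [2], "ISR": [12], "track": ["s2"]})
print(b.merge(a, on=["MSR", "ISR", "track"], how="outer"))
# the same two rows as pd.concat([a, b], ignore_index=True)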
Example #22
def loadmiRNATranscripts(infile, outfile):
    '''load transcripts from a GFF3 file into the database.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gff3` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--allow-empty-file "
                                            "--header-names=feature,Name")

    statement = '''
     export LANG=en_GB.UTF-8 && zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement, job_memory=PARAMS["job_memory"])
Example #23
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--add-index=transcript_id "
                                            "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
Example #24
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header + "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match("(\S+):(\d+)..(\d+)",
                                          match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                E.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(PARAMS["mast_evalue"]), 1, 0, 0, 0,
                                     0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(PARAMS["mast_evalue"]), 1, 0, 0, 0,
                                     0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(
                str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    motif_fg,
                    contig,
                    "\t".join(map(str, controls[id]["l"])),
                    "\t".join(map(str, controls[id]["r"])),
                    str(min_evalue),
                    str(min_pvalue),
                    str(max_nmatches),
                ) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
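A quick illustration of the id convention that splitId handles: background ids have the form <track>_<id>_<pos>, and because the track itself may contain underscores the split is taken from the right:

d = "liver_R1_peak12_l".split("_")
print("_".join(d[:-2]), d[-2], d[-1])
# liver_R1 peak12 l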
Example #25
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
        for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        IOTools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert (xedges == unspiked_xedges).all()

    tmpfile = IOTools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(("expression", "fold", "fdr", "counts",
                             "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(
                map(str, (xedges[x], yedges[y], fdr, spiked_d2hist_fdr[x, y],
                          100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(
                map(str, (fdr, power, power_counts.sum().sum(), 100.0 *
                          power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
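A hedged toy version of the power computation above: per (expression, fold-change) bin, the fraction of spike-ins recovered below the FDR threshold gives the power, and bins at or above a given power level select the observed (unspiked) intervals counted as detectable. All numbers are invented for illustration:

import numpy as np

spiked_counts = np.array([[10., 0.], [5., 20.]])      # spike-ins per bin
spiked_below_fdr = np.array([[9., 0.], [1., 20.]])    # of which pass the FDR cut
with np.errstate(invalid="ignore"):                   # 0/0 bins become nan
    power = np.nan_to_num(spiked_below_fdr / spiked_counts)
power[spiked_counts == 0] = -1.0                      # flag bins without data
unspiked_counts = np.array([[100., 3.], [40., 60.]])  # observed intervals per bin
print(unspiked_counts[power >= 0.8].sum())
# 160.0 -> intervals detectable at the 80% power level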
Example #26
def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : list
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.to_table(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement)

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run(statement)

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run(statement)
Example #27
def loadEnsemblTranscriptInformation(ensembl_gtf,
                                     geneset_gtf,
                                     outfile,
                                     csvdb,
                                     set_biotype=None,
                                     set_transcript_support=None):
    '''
    Parse and annotate a geneset_gtf using the original Ensembl
    GTF attributes.

    The Ensembl GTF structure is not static, so this needs to maintain
    backwards compatibility.  Attributes that only appear in later
    versions but are used downstream should be set to default/missing
    values when they are not natively present.

    Therefore, gene_biotype is taken from the "feature" field if it is
    not present, and transcript_support_level is set to NA if missing.

    Arguments
    ---------
    ensembl_gtf: string
      Path to the Ensembl gtf containing all annotation information
      and attributes

    geneset_gtf: string
      Path to the geneset GTF to annotate with Ensembl attributes

    outfile: string
      Path to the output GTF: filtered, annotated and sorted by gene
      position

    csvdb: string
      Path to the SQLite database into which the transcript
      information table is uploaded

    set_biotype: string
      default value for the gene_biotype and transcript_biotype
      columns.  If unset and the attribute is absent, "NA" is used.

    set_transcript_support: int
      default value for the transcript_support_level column.  If
      unset and the attribute is absent, "NA" is used.
    '''

    table = P.to_table(outfile)

    gtf_file = IOTools.open_file(geneset_gtf, "rb")
    gtf_iterator = GTF.transcript_iterator(GTF.iterator(gtf_file))

    ensembl_file = IOTools.open_file(ensembl_gtf, "rb")
    ensembl_iterator = GTF.transcript_iterator(GTF.iterator(ensembl_file))

    # parse the two gtfs, creating keys from the GTF entries
    parse_ensembl = {}
    for ens_gtf in ensembl_iterator:
        for ens_trans in ens_gtf:
            ens_att = ens_trans.asDict()
            ens_vals = dict(
                zip(ens_trans.keys(),
                    [ens_trans[x] for x in ens_trans.keys()]))
            ens_att.update(ens_vals)
            parse_ensembl[ens_trans.transcript_id] = ens_att
    ensembl_file.close()

    parse_gtf = {}
    for gtf in gtf_iterator:
        for trans in gtf:
            trans_atts = trans.asDict()
            trans_vals = dict(
                zip(trans.keys(), [trans[g] for g in trans.keys()]))
            trans_atts.update(trans_vals)
            parse_gtf[trans.transcript_id] = trans_atts
    gtf_file.close()

    # convert to dataframe for easier merging, annotating
    # and ultimately SQL database insertion
    # these are large dictionaries to parse, so might
    # be quite memory and compute heavy
    ensembl_df = pd.DataFrame(parse_ensembl).T
    gtf_df = pd.DataFrame(parse_gtf).T

    # check for presence of gene_biotype and
    # transcript_support_level
    merged_df = pd.merge(gtf_df,
                         ensembl_df,
                         left_on=[cx for cx in gtf_df.columns],
                         right_on=[rx for rx in gtf_df.columns],
                         how='left')

    try:
        merged_df["transcript_support_level"]
        E.info("transcript_support_level is present")
    except KeyError:
        E.info("transcript_support_level is not present")
        if set_transcript_support:
            merged_df["transcript_support_level"] = set_transcript_support
        else:
            merged_df["transcript_support_level"] = "NA"

    try:
        merged_df["gene_biotype"]
        E.info("gene biotype is present")
        try:
            merged_df["transcript_biotype"]
            E.info("transcript biotype is present")
        except KeyError:
            E.info("transcript biotype is not present")
            if set_biotype:
                merged_df["transcript_biotype"] = set_biotype
            else:
                merged_df["transcript_biotype"] = "NA"
    except KeyError:
        E.info("gene biotype is not present")
        if set_biotype:
            merged_df["gene_biotype"] = set_biotype
            merged_df["transcript_biotype"] = set_biotype
        else:
            merged_df["gene_biotype"] = "NA"
            merged_df["transcript_biotype"] = "NA"

    # sort on gene then transcript id
    # remove exon_number and exon_id to maintain
    # compatibility with previous code
    merged_df.drop(["exon_id", "exon_number"], axis=1,
                   inplace=True, errors="ignore")

    # sort the output and load into the csvdb
    # add a multindex to use multiple SQL indices
    merged_df.sort_values(by=["gene_id", "transcript_id"], inplace=True)

    merged_df.set_index(
        ["gene_id", "gene_name", "protein_id", "transcript_id"],
        inplace=True,
        drop=True)

    merged_df.to_sql(
        name=table,
        con=sqlite3.connect(csvdb),
        if_exists='replace',
        index_label=["gene_id", "gene_name", "protein_id", "transcript_id"])
    return 1
Example #28
def loadPicardMetrics(infiles, outfile, suffix,
                      pipeline_suffix=".picard_stats",
                      tablename=None):
    '''load picard metrics.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    filenames = ["%s.%s" % (x, suffix) for x in infiles]

    first = True
    for filename in filenames:
        track = P.snip(os.path.basename(filename), "%s.%s" %
                       (pipeline_suffix, suffix))

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()

        # extract metrics part
        rx_start = re.compile("## METRICS CLASS")
        for n, line in enumerate(lines):
            if rx_start.search(line):
                lines = lines[n + 1:]
                break

        for n, line in enumerate(lines):
            if not line.strip():
                lines = lines[:n]
                break

        if len(lines) == 0:
            E.warn("no lines in %s: %s" % (track, filename))
            continue

        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            fields = lines[0][:-1].split("\t")
        else:
            f = lines[0][:-1].split("\t")
            if f != fields:
                raise ValueError(
                    "file %s has different fields: expected %s, got %s" %
                    (filename, fields, f))

        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()

    P.load(outf.name,
           outfile,
           tablename=tablename,
           options="--add-index=track --allow-empty-file")

    os.unlink(outf.name)
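A hedged standalone rendering of the metrics extraction loop above: keep the block between the ## METRICS CLASS marker and the next blank line, i.e. one header row plus the data rows:

import re

def extract_metrics(lines):
    rx_start = re.compile("## METRICS CLASS")
    for n, line in enumerate(lines):
        if rx_start.search(line):
            lines = lines[n + 1:]
            break
    for n, line in enumerate(lines):
        if not line.strip():
            return lines[:n]
    return lines

example = ["## htsjdk.samtools ...\n", "## METRICS CLASS\tpicard.analysis\n",
           "CATEGORY\tTOTAL_READS\n", "PAIR\t1000\n", "\n"]
print(extract_metrics(example))
# ['CATEGORY\tTOTAL_READS\n', 'PAIR\t1000\n']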