def calculateFalsePositiveRate(infiles, outfile):
    '''compute taxonomic false positive and true positive rates by
    comparing the known (true) community composition against the
    metaphlan estimate, per taxonomic level and abundance cutoff.
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get the corresponding estimate file: its basename is the true
    # file's basename prefixed with "metaphlan_"
    estimate_file = [inf for inf in infiles[1:]
                     if os.path.basename(inf)[len("metaphlan_"):] ==
                     os.path.basename(infiles[0])][0]
    tablename_estimate = P.toTable(os.path.basename(estimate_file))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            # the true table stores relative abundance as a fraction,
            # while metaphlan reports percentages, hence the different
            # cutoff scaling below
            for taxa in cc.execute("""SELECT taxa FROM %s WHERE level == '%s' AND relab > %f""" % (tablename_true, level, float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute("""SELECT taxon FROM %s WHERE taxon_level == '%s' AND rel_abundance > %f""" % (tablename_estimate, level, float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
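The rate definitions above reduce to plain set arithmetic; a minimal sketch with invented taxa:

# toy illustration of the fp_rate / tp_rate computation above
true_set = {"Bacteroides", "Prevotella", "Faecalibacterium"}
estimate_set = {"Bacteroides", "Prevotella", "Escherichia"}

tp = true_set.intersection(estimate_set)      # {'Bacteroides', 'Prevotella'}
fp = estimate_set.difference(true_set)        # {'Escherichia'}

fp_rate = float(len(fp)) / len(estimate_set)  # 1/3 of estimated taxa are spurious
tp_rate = float(len(tp)) / len(true_set)      # 2/3 of true taxa were recovered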
def estimateCopyNumber(infiles, outfile, params):
    """Estimate copy number based on ERCC spike in concentrations.
       Expects the location of the directory containing the
       R code as a single parameter."""

    infile, cuffnorm_load, ercc_load = infiles
    code_dir = params[0]

    cuffnorm_table = P.toTable(cuffnorm_load)
    ercc_table = P.toTable(ercc_load)

    track = outfile.split("/")[-1][: -len(".spike.norm")]
    plotname = outfile + ".png"

    # col_name = track.replace("-","_") + "_0"
    col_name = re.sub(r"[-.]", "_", track) + "_0"

    # ## connect to the database.
    con = sqlite3.connect(PARAMS["database_name"])

    # ## retrieve the spike in data
    statement = (
        """select e.gene_id, %(col_name)s as FPKM, copies_per_cell
                   from %(ercc_table)s e
                   inner join %(cuffnorm_table)s c
                   on e.gene_id=c.tracking_id
                """
        % locals()
    )

    # spikedf = PU.fetch_DataFrame(statement, PARAMS["database"])
    spikedf = pd.read_sql(statement, con)
    # rspikedf = pdcom.convert_to_r_dataframe(spikedf)

    # ## retrieve the data to normalise
    statement = (
        """ select tracking_id as gene_id, %(col_name)s as FPKM
                    from %(cuffnorm_table)s
                """
        % locals()
    )

    fpkms = pd.read_sql(statement, con)
    # rfpkms = pdcom.convert_to_r_dataframe(fpkms)

    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))

    r = R.r

    rscript = os.path.join(code_dir, PARAMS["rsource"])

    r.source(rscript)

    plotname, outfile = [os.path.abspath(x) for x in [plotname, outfile]]

    r.normalise_to_spikes(spikedf, fpkms, plotname, outfile, track)
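The sourced R function normalise_to_spikes is not shown here; as an assumption about what it does, a common spike-in normalisation fits a log-log linear model of copies per cell against FPKM on the ERCC spikes and applies it to every gene, roughly:

# a minimal sketch, assuming normalise_to_spikes fits a log-log
# regression on the spike-ins; this is not the actual R code
import numpy as np

def copies_from_spikes(spikedf, fpkms):
    # fit log10(copies_per_cell) ~ log10(FPKM) on the ERCC spike-ins
    spikes = spikedf[spikedf["FPKM"] > 0]
    slope, intercept = np.polyfit(np.log10(spikes["FPKM"]),
                                  np.log10(spikes["copies_per_cell"]), 1)
    # invert the fitted model for every expressed gene
    out = fpkms[fpkms["FPKM"] > 0].copy()
    out["copies_per_cell"] = 10 ** (slope * np.log10(out["FPKM"]) + intercept)
    return out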
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    '''

    tablename = P.toTable(outfile)
    # cannot create a view across multiple databases, so use a table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
def filterByCoverage(infiles, outfile):
    '''filter contigs in a fasta file, keeping those whose average
    coverage in the database exceeds the coverage_filter threshold.'''

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''

    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c) for x in cc
        ]))
        i += 1
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
            for x in cc
        ]))
        i += 1

    outf.close()
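For reference, the file written above is a minimal GO-style annotation table; its first lines would look roughly like this (gene ids invented):

# go_type   gene_id          go_id       description  evidence
# promotor  ENSG00000000001  GO:0000001  with_cpg     NA
# promotor  ENSG00000000003  GO:0000002  without_cpg  NA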
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
def loadProteinStats(infile, outfile):
    '''load protein statistics to database.

    The *infile* is an ENSEMBL peptide file.

    Remove empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)

    '''

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/fasta2fasta.py
    --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
    --add-index=protein_id
    --map=protein_id:str
    --table=%(table)s
    > %(outfile)s'''

    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''
    loads the summary of reads contributing to transcripts
    '''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.

    """
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run()
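A sketch of what the shell statement does to an invented RepeatMasker record (gff2bed emits 0-based starts):

# input gff line (invented):
#   chr1  RepeatMasker  repeat  1001  1200  .  +  .  class "SINE"
# after: cgat gff2bed --set-name=class | grep -v "#" | cut -f1,2,3,4
#   chr1  1000  1200  SINE
# loaded with --header-names=contig,start,stop,class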
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''count the number of genes detected by featureCounts at counts > 0
    in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
               ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])

    agg_df = grouped_df.agg({"counts": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadPeptideSequences(infile, outfile):
    """load ENSEMBL peptide file into database

    This method removes empty sequences (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372)

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        filename with logging information. The tablename is
        derived from ``outfile``.

    """

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    statement = """gunzip
    < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | python %(scriptsdir)s/fasta2fasta.py --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py --section=length
    --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadmiRNATranscripts(infile, outfile):
    '''load transcripts from a GFF3 file into the database.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gff3` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--allow-empty-file "
        "--header-names=feature,Name")

    statement = '''
     export LANG=en_GB.UTF-8 && zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.

    """

    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run()
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                      %(csv2db_options)s
                      --add-index=track
                      --table=%(tablename)s 
                   > %(outfile)s '''
    P.run()

    os.unlink(tmpfilename)
def numberGenesDetectedCufflinks(infile, outfile):
    '''count the number of genes detected at copy number > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [
        x[:-len("_0")] if x.endswith("_0") else x for x in df.columns
    ]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])

    agg_df = grouped_df.agg(
        {"value": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df,
                              index="variable",
                              values="value",
                              columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
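The melt/aggregate/pivot pattern above is easier to see on toy data; a self-contained sketch with invented FPKM values for two samples:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "tracking_id": ["g1", "g2", "g3"],
    "gene_biotype": ["protein_coding", "protein_coding", "lincRNA"],
    "sampleA": [5.0, 0.0, 2.0],
    "sampleB": [0.0, 0.0, 1.0]})

melted = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])
agg = melted.groupby(["gene_biotype", "variable"]).agg(
    {"value": lambda x: np.sum([1 for y in x if y > 0])})
agg.reset_index(inplace=True)
counts = pd.pivot_table(agg, index="variable",
                        values="value", columns="gene_biotype")
# counts: genes detected per biotype per sample
# gene_biotype  lincRNA  protein_coding
# sampleA             1               1
# sampleB             1               0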
def loadTranscriptStats(infile, outfile):
    """compute and load transcript properties into database.

    The method calls :doc:`gtf2table` with the following counters:
    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    """

    load_statement = P.build_load_statement(
        P.toTable(outfile), options="--add-index=gene_id " "--add-index=transcript_id " "--map=gene_id:str"
    )

    statement = """
    gunzip < %(infile)s |\
    python %(scriptsdir)s/gtf2table.py \
          --log=%(outfile)s.log \
          --genome=%(genome_dir)s/%(genome)s \
          --reporter=transcripts \
          --counter=position \
          --counter=length \
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadEditDistances(infile, outfile):
    '''Load distributions of edit distances as output by umi_tools dedup'''
    load_smt = P.build_load_statement(
        P.toTable(outfile), options="-i edit_distance")
    statement = ''' sed s/unique/_unique/g %(infile)s
                 | %(load_smt)s > %(outfile)s '''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open("tophat/tophat.dir/picard_align_stats.tsv", "w")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
def loadTranscriptInformation(infile, outfile,
                              only_proteincoding=False):
    '''load transcript information from a gtf file.

    *infile* is an ENSEMBL gtf file.
    '''
    table = P.toTable(outfile)

    if only_proteincoding:
        filter_cmd = """python %(scriptsdir)s/gtf2gtf.py
        --method=filter --filter-method=proteincoding""" % PARAMS
    else:
        filter_cmd = "cat"

    statement = '''zcat < %(infile)s
    | awk '$3 == "CDS"'
    | grep "transcript_id"
    | python %(scriptsdir)s/gtf2gtf.py
    --method=sort --sort-order=gene+transcript
    | python %(scriptsdir)s/gtf2tsv.py
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py --remove exon_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1 | uniq
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --add-index=transcript_id
              --add-index=gene_id
              --add-index=protein_id
              --add-index=gene_name
              --map=transcript_name:str
              --map=gene_name:str
              --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('tophat/tophat.dir/dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.stats")
        statfile = f
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                    | python %(scriptsdir)s/csv2db.py
                          --add-index=track
                          --table=%(tablename)s 
                    > %(outfile)s '''
    P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''
    load the counts for the multi and single exon lincRNA
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    '''
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/" 
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
            """
    P.run()
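A rough pandas analogue of the combine-and-transpose pipeline above, with two invented tracks:

import pandas as pd

# two categorical two-column tables, one per track
trackA = pd.DataFrame({"bin": ["x", "y"], "trackA": [1, 2]})
trackB = pd.DataFrame({"bin": ["x", "y"], "trackB": [3, 4]})

# combine_tables joins the files on their shared first column ...
merged = trackA.merge(trackB, on="bin")
# ... and table2table --transpose flips it so each input file
# becomes one database row, indexed by track
transposed = merged.set_index("bin").T.rename_axis("track")
# track   x  y
# trackA  1  2
# trackB  3  4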
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
def exportMotifLocations(infiles, outfile):
    '''export motif locations, writing one bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0]
              for x in cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s WHERE motif = '%(motif)s' AND
                    start IS NOT NULL""" % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContex` and load into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    header = ",".join([P.snip(os.path.basename(x), suffix) for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(P.toTable(outfile),
                                            options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''
    loads the summary of reads contributing to transcripts
    '''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                      %(csv2db_options)s
                      --add-index=track
                      --table=%(tablename)s 
                   > %(outfile)s '''
    P.run()

    os.unlink(tmpfilename)
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()
def loadGeneStats(infile, outfile):
    """compute and load gene statistics to database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    Parameters
    ----------
    infile : string
        A :term:`gtf` file which is output from :meth:`buildGenes`
    outfile : string
        A log file. The table name is derived from `outfile`.
        e.g. bam_stats.load
    """

    load_statement = P.build_load_statement(P.toTable(outfile),
                                            options="--add-index=gene_id "
                                            "--map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def filterByCoverage(infiles, outfile):
    '''filter contigs in a fasta file, keeping those whose average
    coverage in the database exceeds the coverage_filter threshold.'''

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
def loadCodingPotential(infile, outfile):
    '''load coding potential annotations and flag coding genes.'''

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | cgat csv2db
              %(csv2db_options)s
              --allow-empty-file
              --add-index=gene_id
              --map=gene_id:str
              --table=%(table)s
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite cannot do a full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''
    load the counts for the multi and single exon lincRNA
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --missing-value=0
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header-names=%(column)s,%(header)s
                      --replace-header
                      --add-index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                      for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """python %(scriptsdir)s/combine_tables.py
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run()
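A hypothetical invocation with invented file names; it would read sampleA.picard_stats.insert_size_metrics (and likewise for sampleB) and load both histograms into one table:

loadPicardHistogram(
    ["sampleA.picard_stats", "sampleB.picard_stats"],
    "picard_stats.load",
    suffix="insert_size_metrics",
    column="insert_size")
# resulting table name: picard_stats_insert_size_histogram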
def buildDMRStats(infiles, outfile):
    '''compute differential methylation stats.'''
    tablenames = [P.toTable(x) for x in infiles]
    method = P.snip(outfile, "_stats.tsv")
    PipelineMedip.buildDMRStats(tablenames,
                                method,
                                outfile,
                                dbhandle=connect())
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run()
def loadCoveredCpGs(infile, outfile):
    dbh = connect()
    tablename = P.toTable(outfile)

    statement = '''cat %(infile)s |
                python %%(scriptsdir)s/csv2db.py
                --table %(tablename)s --retry --ignore-empty
                 > %(outfile)s''' % locals()
    P.run()
def load_chunk_annotations(infile, outfile):
    '''load chunk annotations and build a joint (gene_id, exon_id) index.'''

    P.load(infile, outfile, "-i gene_id -i exon_id")

    tablename = P.toTable(outfile)
    connect().executescript('''DROP INDEX IF EXISTS %(tablename)s_joint;
                               CREATE INDEX %(tablename)s_joint ON
                                   %(tablename)s(gene_id,exon_id)''' %
                            locals())
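A hypothetical query that the composite (gene_id, exon_id) index built above is meant to accelerate (table name and ids invented):

cc = connect().cursor()
cc.execute("""SELECT * FROM chunk_annotations
              WHERE gene_id = ? AND exon_id = ?""",
           ("ENSG00000000001", 3))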
def loadCoveredCpGs(infile, outfile):
    dbh = connect()
    tablename = P.toTable(outfile)

    statement = '''cat %(infile)s |
                cgat csv2db
                --table %(tablename)s --retry --ignore-empty
                 > %(outfile)s''' % locals()
    P.run()