Example no. 1
def loadMutectExtendedOutput(infile, outfile):
    '''Load mutect extended output into database'''

    infile = infile.replace(".mutect.snp.vcf", "_call_stats.out")

    indices = "contig,position"
    P.load(infile, outfile, options="--add-index=%(indices)s" % locals())
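
A minimal usage sketch (the sample name is hypothetical); P.load derives the table name from `outfile` and adds indices on contig and position:

loadMutectExtendedOutput("sample1.mutect.snp.vcf",
                         "sample1.mutect.load")
# reads sample1_call_stats.out and loads it into the database
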
Example no. 2
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite cannot do a full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
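
A self-contained sketch of the 0/1 indicator matrix written above (transcript IDs and table membership are hypothetical):

transcripts = ["ENST01", "ENST02"]
vals = [{"ENST01"},             # transcript_ids found in the tata table
        {"ENST01", "ENST02"}]   # transcript_ids found in the cpg table
for transcript_id in transcripts:
    print("%s\t%s" % (transcript_id,
                      "\t".join([str(int(transcript_id in v)) for v in vals])))
# ENST01	1	1
# ENST02	0	1
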
Example no. 3
def loadSleuthTable(infile, outfile, transcript_info, gene_biotypes,
                    database, annotations_database):
    '''load sleuth results, annotated with transcript information,
    into the database.'''

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(transcript_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x
             for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
    transcript_id, transcript_biotype, gene_id, gene_name
    FROM annotations.%(table)s
    %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("transcript_id", drop=True, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("transcript_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=transcript_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)
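
The WHERE clause above is assembled from the comma-separated biotype list; a runnable sketch (the biotype values are hypothetical):

gene_biotypes = "protein_coding,lincRNA"
where_cmd = "WHERE " + " OR ".join(
    ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
print(where_cmd)
# WHERE gene_biotype = 'protein_coding' OR gene_biotype = 'lincRNA'
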
Example no. 4
def loadNCG(outfile):
    '''Load NCG into database'''

    infile = PARAMS["cancergenes_table"]
    # infile = "/ifs/projects/proj053/backup/NCG/cancergenes2016.tsv"

    P.load(infile, outfile, options="--add-index=symbol")
Example no. 5
def loadExonValidation(infiles, outfile):
    ''' load individual and merged exon validation stats

    For each sample, the exon validation stats are loaded into a table
    named by sample and mapper
    [sample]_[mapper]_overrun

    The alignment stats for all samples are merged and loaded
    into a single table called exon_validation

    Parameters
    ----------
    infiles : list
       Input filenames with exon validation stats
    outfile : str
       Output filename
    '''

    suffix = ".exon.validation.tsv.gz"

    P.mergeAndLoad(infiles, outfile, suffix=suffix)
    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)
Example no. 6
def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes, database,
                         annotations_database):
    '''load gene-level sleuth results, annotated with gene information,
    into the database.'''

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(gene_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
        gene_id, gene_name
        FROM annotations.%(table)s
        %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("test_id", drop=False, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("gene_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=gene_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)
Example no. 7
def loadQcMeasures(infile, outfile):
    '''
    load QC measures into CSVDB
    '''

    P.load(infile, outfile,
           options="--add-index=track")
Example no. 8
def loadSailfishCounts(infile, outfile):
    '''
    load Sailfish gene counts data into
    CSVDB
    '''

    P.load(infile, outfile)
Example no. 9
def loadSampleInfo(infile, outfile):

    P.load(
        infile,
        outfile,
        options="--header-names=format,barcode,track,lanes -i barcode -i track"
    )
Example no. 11
def loadDistances(infile, outfile):
    '''load distances to the closest genes into the database'''
    P.load(
        infile, outfile,
        "--add-index=gene_id --map=gene_id:str --add-index=closest_id --map=closest_id:str"
    )
Example no. 12
def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.
    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.getTempFile(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
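
Picard metric files consist of '#'-prefixed comment lines, a header line and a metrics row, which is what the filter above keeps; a self-contained sketch with a hypothetical, abbreviated file:

from io import StringIO

picard = StringIO("# picard CollectWgsMetrics\n"
                  "MEAN_COVERAGE\tSD_COVERAGE\n"
                  "30.2\t4.1\n")
lines = [x for x in picard.readlines()
         if not x.startswith("#") and x.strip()]
print("%s\t%s" % ("track", lines[0]), end="")    # header row
print("%s\t%s" % ("sample1", lines[1]), end="")  # metrics row for this track
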
Example no. 13
def loadCuffNormClassic(infile, outfile):
    '''load the fpkm table from cuffnorm into the database'''

    fpkm_table = os.path.dirname(infile) + "/genes.fpkm_table"

    P.load(fpkm_table, outfile,
           options='-i "tracking_id"')
Example no. 14
def loadGCContent(infile, outfile):
    '''
    load the GC content results for each background
    and foreground
    '''
    P.load(infile, outfile,
           options="--add-index=id")
Example no. 15
def loadAnnotations(infile, outfile):
    '''load variant annotations into database'''

    P.load(infile, outfile,
           options="--map=gene_id:str "
           "--add-index=gene_id "
           "--map=base_qualities:text ")
Example no. 16
def loadSailfishTpm(infile, outfile):
    '''
    load Sailfish TPM estimates into
    CSVDB
    '''

    P.load(infile, outfile)
Example no. 22
def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing samtools
        idxstats output. Filename format is expected to be 'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.getTempFile(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calculate total mapped and unmapped reads
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame([['total_mapped_reads', total_mapped_reads],
                                           ['total_reads', total_reads],
                                           ['track', track]], columns=(['region', 'mapped']))

        # reformat the df
        df = df.append(reformatted_df, ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
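
samtools idxstats emits one line per contig with four tab-separated fields (name, length, mapped, unmapped); a self-contained sketch of the wide reshape above, with hypothetical counts (the function then drops the '*' column and transposes again before loading):

import io
import pandas

idx = io.StringIO("chr1\t248956422\t105\t3\n"
                  "chr2\t242193529\t98\t2\n"
                  "*\t0\t0\t7\n")
df = pandas.read_csv(idx, sep="\t", header=None,
                     names=["region", "length", "mapped", "unmapped"])
wide = df.set_index("region")[["mapped"]].T   # one row, contigs as columns
print(wide)
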
Example no. 23
def loadExonValidation(infiles, outfile):
    """merge alignment stats into single tables."""
    suffix = ".exon.validation.tsv.gz"
    P.mergeAndLoad(infiles, outfile, suffix=suffix)
    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)
Example no. 24
def loadEnrichmentOfTFBS(infile, outfile):
    '''
    load the results of the enrichment
    '''

    P.load(infile,
           outfile,
           options="--add-index=matrix_id")
Example no. 26
def loadMetaInformation(infile, outfile):
    P.load(infile, outfile,
           options="--map=id:int "
           "--map=sample_id:int "
           "--map=experiment_id:int "
           "--add-index=id "
           "--add-index=experiment_id "
           "--add-index=sample_id ")
Example no. 27
def loadFimo(infile, outfile):

    P.load(
        infile,
        outfile,
        options='-H "pattern_name,sequence_name,start,stop,strand,'
        'score,p_value,q_value,matched_sequence"')
Example no. 28
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:

            distance = abs(
                match.start + match.width1 - (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id,
                           x,
                           match.start,
                           match.end,
                           strand,
                           arrangement))
    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
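
A self-contained sketch of the strand-arrangement classification above (the two-letter codes are taken verbatim from the code; reading ER/DR/SM as everted repeat, direct repeat and single motif is an assumption):

def classify(strand):
    if strand in ("+-", "-+"):
        return "ER"
    elif strand in ("++", "--"):
        return "DR"
    return "SM"

for strand in ("+-", "++", "+"):
    print(strand, classify(strand))
# +- ER
# ++ DR
# + SM
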
Example no. 31
def loadTranscriptProfile(infiles,
                          outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''load transcript profiles into one table.
    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Table name to use if `tablename` is unset.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table
        name.
    '''

    if not tablename:
        tablename = suffix

    outf = P.getTempFile(".")

    table_count = 0
    table_join = None

    for infile in infiles:

        matrix_file = str(infile) + (
            ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz")
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")
    table_join.to_csv(outf, sep="\t", index=False)

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)
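
The per-track matrices are merged on their shared bin columns; a self-contained sketch with two hypothetical tracks:

import pandas as pd

a = pd.DataFrame({"bin": [0, 1], "region": ["exon", "exon"],
                  "region_bin": [0, 1], "sampleA": [3, 5]})
b = pd.DataFrame({"bin": [0, 1], "region": ["exon", "exon"],
                  "region_bin": [0, 1], "sampleB": [2, 4]})
print(b.merge(a, on=["bin", "region", "region_bin"], how="left"))
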
Example no. 32
def loadCPCResults(infile, outfile):
    '''
    load the results of the CPC analysis
    '''

    P.load(infile,
           outfile,
           options="--header-names=transcript_id,feature,C_NC,CP_score "
           "--add-index=transcript_id")
Example no. 33
def load_chunk_annotations(infile, outfile):

    P.load(infile, outfile, "-i gene_id -i exon_id")

    tablename = P.toTable(outfile)
    connect().executescript('''DROP INDEX IF EXISTS %(tablename)s_joint;
                               CREATE INDEX %(tablename)s_joint ON
                                   %(tablename)s(gene_id,exon_id)''' %
                            locals())
Example no. 34
def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database'''

    if infile.endswith("indels.annotated.filtered.tsv"):
        indices = "CHROM,POS,SNPEFF_GENE_NAME"
    elif infile.endswith("mutect.snp.annotated.filtered.tsv"):
        indices = "CHROM,POS,SNPEFF_GENE_NAME"
    else:
        raise ValueError("unexpected input file %s" % infile)

    P.load(infile, outfile, options="--add-index=%(indices)s" % locals())
Example no. 37
def loadPolyphenMap(infile, outfile):
    '''load polyphen input data.'''

    P.load(infile + ".map",
           outfile,
           options="--add-index=snp_id "
           "--add-index=track,transcript_id "
           "--add-index=contig,pos "
           "--add-index=protein_id "
           "--add-index=transcript_id ")
Example no. 40
def build_db(infiles, outfile):
    '''
    Stores data generated throughout the pipeline in an sqlite database.
    The structure of the data tables and the database is designed for
    compatibility with the shiny app.
    '''

    # record merged_filter_summary, merged_qc_summary,
    # merged_taxonomy, merged_abundance_id
    # and yml table in database
    P.load(infiles, outfile)
Example no. 41
def loadPermuteMATS(infile, outfile):
    '''load rMATS permutation results

    Loads rMATS permutation summary results into relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS permutation results
    outfile: .load file
    '''

    P.load(infile, outfile)
Example no. 42
def loadCollateMATS(infile, outfile):
    '''load rMATS summary into relational database

    Loads rMATS summary results into relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS results
    outfile: .load file
    '''

    P.load(infile, outfile)
Example no. 45
def loadCountReads(infiles,
                   outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.
    Arguments
    ---------
    infiles : string
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.openFile(filename, "r").readlines()

        for line in lines:
            count = line.split("\t")[1].strip()
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
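
Each .nreads file holds tab-separated label/count lines; a minimal sketch of the per-line parse (line content and track name are hypothetical):

line = "nreads\t1234567\n"
count = line.split("\t")[1].strip()
print("%s\t%s" % ("sample1", count))   # sample1<TAB>1234567
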
Example no. 46
def loadManualAnnotations(infile, outfile):
    '''prefix each gene_id in `infile` with its annotation name and
    load the result into the database.'''

    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.openFile(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.openFile(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
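
A self-contained sketch of the rewrite above (annotation name and gene_id are hypothetical):

annotation = "tf_targets"    # P.snip("tf_targets_annotations.tsv", "_annotations.tsv")
line = "ENSG00000139618\n"   # one gene_id per line of the input
print("%s\tgene_id" % annotation)
print("%s\t%s" % (annotation, line), end="")
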
Example no. 48
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = gzip.open(f, "rt").readlines()
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effect.txt",
           outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            E.info("reading %s" % statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = gzip.open(statfile, "rt").readlines()
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()

        P.load(outf.name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
Example no. 49
def load_last_exon_chunks(infile, outfile):
    '''Load gene and exon_ids for last exons into database'''

    from CGAT import GTF

    with P.getTempFile(shared=True) as tmpfile:
        tmpfile.write("gene_id\tchunk_id\n")
        for exon in GTF.iterator(IOTools.openFile(infile)):
            tmpfile.write("\t".join(
                [exon.gene_id, re.sub(";", "", exon["exon_id"])]) + "\n")
        tmpfn = tmpfile.name

    P.load(tmpfn, outfile, options="-i gene_id -i chunk_id")
    os.unlink(tmpfn)
Example no. 52
def loadMATS(infile, outfile):
    '''load RMATS results into relational database

    Loads rMATS results into relational database.
    Continues if table empty.

    Parameters
    ----------
    infile: term:`tsv` file containing one type of rMATS results.
    outfile: .load file
    '''
    try:
        P.load(infile, outfile)
    except Exception:
        # empty rMATS tables make P.load fail; create the flag file
        # so the pipeline can continue
        P.touch(outfile)
Example no. 53
def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample'''

    tmp = P.getTempFilename(shared=True)
    results = []
    for infile in infiles:
        count = IOTools.getNumLines(infile)
        method, track = re.match(
            r"dedup_(.+)\.dir/(.+)\.clusters\.bedgraph", infile).groups()
        results.append((method, track, count))

    IOTools.writeLines(tmp, results, header=["method", "track", "count"])

    P.load(tmp, outfile)
    os.unlink(tmp)
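
A self-contained sketch of the path parse above (the file path is hypothetical):

import re

infile = "dedup_directional.dir/sample1.clusters.bedgraph"
method, track = re.match(
    r"dedup_(.+)\.dir/(.+)\.clusters\.bedgraph", infile).groups()
print(method, track)   # directional sample1
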
Example no. 55
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Example no. 56
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)