Example no. 1
def loadMutectExtendedOutput(infile, outfile):
    '''Load mutect extended output into database'''

    infile = infile.replace(".mutect.snp.vcf", "_call_stats.out")

    indices = "contig,position"
    P.load(infile, outfile, options="--add-index=%(indices)s" % locals())
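
A minimal usage sketch (the sample name is hypothetical); P.load derives the table name from `outfile` and adds indices on contig and position:

loadMutectExtendedOutput("sample1.mutect.snp.vcf",
                         "sample1.mutect.load")
# reads sample1_call_stats.out and loads it into the database
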
Example no. 2
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite cannot do a full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
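
A self-contained sketch of the 0/1 indicator matrix written above (transcript IDs and table membership are hypothetical):

transcripts = ["ENST01", "ENST02"]
vals = [{"ENST01"},             # transcript_ids found in the tata table
        {"ENST01", "ENST02"}]   # transcript_ids found in the cpg table
for transcript_id in transcripts:
    print("%s\t%s" % (transcript_id,
                      "\t".join([str(int(transcript_id in v)) for v in vals])))
# ENST01	1	1
# ENST02	0	1
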
Example no. 3
def loadSleuthTable(infile, outfile, transcript_info, gene_biotypes,
                    database, annotations_database):
    '''load sleuth results, annotated with transcript information,
    into the database.'''

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(transcript_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x
             for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
    transcript_id, transcript_biotype, gene_id, gene_name
    FROM annotations.%(table)s
    %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("transcript_id", drop=True, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("transcript_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=transcript_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)
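
The WHERE clause above is assembled from the comma-separated biotype list; a runnable sketch (the biotype values are hypothetical):

gene_biotypes = "protein_coding,lincRNA"
where_cmd = "WHERE " + " OR ".join(
    ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
print(where_cmd)
# WHERE gene_biotype = 'protein_coding' OR gene_biotype = 'lincRNA'
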
Example no. 4
def loadNCG(outfile):
    '''Load NCG into database'''

    infile = PARAMS["cancergenes_table"]
    # infile = "/ifs/projects/proj053/backup/NCG/cancergenes2016.tsv"

    P.load(infile, outfile, options="--add-index=symbol")
Example no. 5
def loadExonValidation(infiles, outfile):
    ''' load individual and merged exon validation stats

    For each sample, the exon validation stats are loaded into a table
    named by sample and mapper
    [sample]_[mapper]_overrun

    The alignment stats for all samples are merged and loaded
    into a single table called exon_validation

    Parameters
    ----------
    infiles : list
       Input filenames with exon validation stats
    outfile : str
       Output filename
    '''

    suffix = ".exon.validation.tsv.gz"

    P.mergeAndLoad(infiles, outfile, suffix=suffix)
    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)
Example no. 6
def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes, database,
                         annotations_database):
    '''load gene-level sleuth results, annotated with gene information,
    into the database.'''

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(gene_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
        gene_id, gene_name
        FROM annotations.%(table)s
        %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("test_id", drop=False, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("gene_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=gene_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)
Example no. 7
def loadQcMeasures(infile, outfile):
    '''
    load QC measures into CSVDB
    '''

    P.load(infile, outfile,
           options="--add-index=track")
Example no. 8
def loadSailfishCounts(infile, outfile):
    '''
    load Sailfish gene counts data into
    CSVDB
    '''

    P.load(infile, outfile)
Example no. 9
def loadSampleInfo(infile, outfile):

    P.load(
        infile,
        outfile,
        options="--header-names=format,barcode,track,lanes -i barcode -i track"
    )
Example no. 11
def loadDistances(infile, outfile):
    '''load distances to the closest genes into the database'''
    P.load(
        infile, outfile,
        "--add-index=gene_id --map=gene_id:str --add-index=closest_id --map=closest_id:str"
    )
Example no. 12
def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.
    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.getTempFile(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
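
Picard metric files consist of '#'-prefixed comment lines, a header line and a metrics row, which is what the filter above keeps; a self-contained sketch with a hypothetical, abbreviated file:

from io import StringIO

picard = StringIO("# picard CollectWgsMetrics\n"
                  "MEAN_COVERAGE\tSD_COVERAGE\n"
                  "30.2\t4.1\n")
lines = [x for x in picard.readlines()
         if not x.startswith("#") and x.strip()]
print("%s\t%s" % ("track", lines[0]), end="")    # header row
print("%s\t%s" % ("sample1", lines[1]), end="")  # metrics row for this track
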
Example no. 13
def loadCuffNormClassic(infile, outfile):
    '''load the fpkm table from cuffnorm into the database'''

    fpkm_table = os.path.dirname(infile) + "/genes.fpkm_table"

    P.load(fpkm_table, outfile,
           options='-i "tracking_id"')
Example no. 14
def loadGCContent(infile, outfile):
    '''
    load the GC content results for each background
    and foreground
    '''
    P.load(infile, outfile,
           options="--add-index=id")
Example no. 15
def loadAnnotations(infile, outfile):
    '''load variant annotations into database'''

    P.load(infile, outfile,
           options="--map=gene_id:str "
           "--add-index=gene_id "
           "--map=base_qualities:text ")
Example no. 16
def loadSailfishTpm(infile, outfile):
    '''
    load Sailfish TPM estimates into
    CSVDB
    '''

    P.load(infile, outfile)
Example no. 22
def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing samtools
        idxstats output. Filename format is expected to be 'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.getTempFile(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calculate total mapped and unmapped reads
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame([['total_mapped_reads', total_mapped_reads],
                                           ['total_reads', total_reads],
                                           ['track', track]], columns=(['region', 'mapped']))

        # reformat the df
        df = df.append(reformatted_df, ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
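
samtools idxstats emits one line per contig with four tab-separated fields (name, length, mapped, unmapped); a self-contained sketch of the wide reshape above, with hypothetical counts (the function then drops the '*' column and transposes again before loading):

import io
import pandas

idx = io.StringIO("chr1\t248956422\t105\t3\n"
                  "chr2\t242193529\t98\t2\n"
                  "*\t0\t0\t7\n")
df = pandas.read_csv(idx, sep="\t", header=None,
                     names=["region", "length", "mapped", "unmapped"])
wide = df.set_index("region")[["mapped"]].T   # one row, contigs as columns
print(wide)
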
Example no. 23
def loadExonValidation(infiles, outfile):
    """merge alignment stats into single tables."""
    suffix = ".exon.validation.tsv.gz"
    P.mergeAndLoad(infiles, outfile, suffix=suffix)
    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)
Example no. 24
def loadEnrichmentOfTFBS(infile, outfile):
    '''
    load the results of the enrichment
    '''

    P.load(infile,
           outfile,
           options="--add-index=matrix_id")
Example no. 26
def loadMetaInformation(infile, outfile):
    P.load(infile, outfile,
           options="--map=id:int "
           "--map=sample_id:int "
           "--map=experiment_id:int "
           "--add-index=id "
           "--add-index=experiment_id "
           "--add-index=sample_id ")
Example no. 27
def loadFimo(infile, outfile):

    P.load(
        infile,
        outfile,
        options='-H "pattern_name,sequence_name,start,stop,strand,'
        'score,p_value,q_value,matched_sequence"')
Example no. 28
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:

            distance = abs(
                match.start + match.width1 - (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id,
                           x,
                           match.start,
                           match.end,
                           strand,
                           arrangement))
    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
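
A self-contained sketch of the strand-arrangement classification above (the two-letter codes are taken verbatim from the code; reading ER/DR/SM as everted repeat, direct repeat and single motif is an assumption):

def classify(strand):
    if strand in ("+-", "-+"):
        return "ER"
    elif strand in ("++", "--"):
        return "DR"
    return "SM"

for strand in ("+-", "++", "+"):
    print(strand, classify(strand))
# +- ER
# ++ DR
# + SM
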
Example no. 31
def loadTranscriptProfile(infiles,
                          outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''load transcript profiles into one table.
    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Table name to use if `tablename` is unset.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table
        name.
    '''

    if not tablename:
        tablename = suffix

    outf = P.getTempFile(".")

    table_count = 0
    table_join = None

    for infile in infiles:

        matrix_file = str(infile) + (
            ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz")
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")
    table_join.to_csv(outf, sep="\t", index=False)

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)
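
The per-track matrices are merged on their shared bin columns; a self-contained sketch with two hypothetical tracks:

import pandas as pd

a = pd.DataFrame({"bin": [0, 1], "region": ["exon", "exon"],
                  "region_bin": [0, 1], "sampleA": [3, 5]})
b = pd.DataFrame({"bin": [0, 1], "region": ["exon", "exon"],
                  "region_bin": [0, 1], "sampleB": [2, 4]})
print(b.merge(a, on=["bin", "region", "region_bin"], how="left"))
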
Example no. 32
def loadCPCResults(infile, outfile):
    '''
    load the results of the CPC analysis
    '''

    P.load(infile,
           outfile,
           options="--header-names=transcript_id,feature,C_NC,CP_score "
           "--add-index=transcript_id")
Example no. 33
def load_chunk_annotations(infile, outfile):

    P.load(infile, outfile, "-i gene_id -i exon_id")

    tablename = P.toTable(outfile)
    connect().executescript('''DROP INDEX IF EXISTS %(tablename)s_joint;
                               CREATE INDEX %(tablename)s_joint ON
                                   %(tablename)s(gene_id,exon_id)''' %
                            locals())
Example no. 34
def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database'''

    if infile.endswith("indels.annotated.filtered.tsv"):
        indices = "CHROM,POS,SNPEFF_GENE_NAME"
    elif infile.endswith("mutect.snp.annotated.filtered.tsv"):
        indices = "CHROM,POS,SNPEFF_GENE_NAME"
    else:
        raise ValueError("unexpected input file %s" % infile)

    P.load(infile, outfile, options="--add-index=%(indices)s" % locals())
Example no. 37
def loadPolyphenMap(infile, outfile):
    '''load polyphen input data.'''

    P.load(infile + ".map",
           outfile,
           options="--add-index=snp_id "
           "--add-index=track,transcript_id "
           "--add-index=contig,pos "
           "--add-index=protein_id "
           "--add-index=transcript_id ")
Example no. 40
def build_db(infiles, outfile):
    '''
    Stores data generated throughout the pipeline in an sqlite database.
    The structure of the data tables and the database is designed for
    compatibility with the shiny app.
    '''

    # record merged_filter_summary, merged_qc_summary,
    # merged_taxonomy, merged_abundance_id
    # and yml table in database
    P.load(infiles, outfile)
Example no. 41
def loadPermuteMATS(infile, outfile):
    '''load rMATS permutation results

    Loads rMATS permutation summary results into relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS permutation results
    outfile: .load file
    '''

    P.load(infile, outfile)
Example no. 42
def loadCollateMATS(infile, outfile):
    '''load rMATS summary into relational database

    Loads rMATS summary results into relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS results
    outfile: .load file
    '''

    P.load(infile, outfile)
Example no. 45
def loadCountReads(infiles,
                   outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.
    Arguments
    ---------
    infiles : string
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.openFile(filename, "r").readlines()

        for line in lines:
            count = line.split("\t")[1].strip()
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
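
Each .nreads file holds tab-separated label/count lines; a minimal sketch of the per-line parse (line content and track name are hypothetical):

line = "nreads\t1234567\n"
count = line.split("\t")[1].strip()
print("%s\t%s" % ("sample1", count))   # sample1<TAB>1234567
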
Example no. 46
def loadManualAnnotations(infile, outfile):
    '''prefix each gene_id in `infile` with its annotation name and
    load the result into the database.'''

    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.openFile(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.openFile(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
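
A self-contained sketch of the rewrite above (annotation name and gene_id are hypothetical):

annotation = "tf_targets"    # P.snip("tf_targets_annotations.tsv", "_annotations.tsv")
line = "ENSG00000139618\n"   # one gene_id per line of the input
print("%s\tgene_id" % annotation)
print("%s\t%s" % (annotation, line), end="")
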
Example no. 48
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = gzip.open(f, "rt").readlines()
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effect.txt",
           outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            E.info("reading %s" % statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = gzip.open(statfile, "rt").readlines()
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()

        P.load(outf.name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
Example no. 49
def load_last_exon_chunks(infile, outfile):
    '''Load gene and exon_ids for last exons into database'''

    from CGAT import GTF

    with P.getTempFile(shared=True) as tmpfile:
        tmpfile.write("gene_id\tchunk_id\n")
        for exon in GTF.iterator(IOTools.openFile(infile)):
            tmpfile.write("\t".join(
                [exon.gene_id, re.sub(";", "", exon["exon_id"])]) + "\n")
        tmpfn = tmpfile.name

    P.load(tmpfn, outfile, options="-i gene_id -i chunk_id")
    os.unlink(tmpfn)
Example no. 52
def loadMATS(infile, outfile):
    '''load RMATS results into relational database

    Loads rMATS results into relational database.
    Continues if table empty.

    Parameters
    ----------
    infile: term:`tsv` file containing one type of rMATS results.
    outfile: .load file
    '''
    try:
        P.load(infile, outfile)
    except Exception:
        # empty rMATS tables make P.load fail; create the flag file
        # so the pipeline can continue
        P.touch(outfile)
Example no. 53
def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample'''

    tmp = P.getTempFilename(shared=True)
    results = []
    for infile in infiles:
        count = IOTools.getNumLines(infile)
        method, track = re.match(
            r"dedup_(.+)\.dir/(.+)\.clusters\.bedgraph", infile).groups()
        results.append((method, track, count))

    IOTools.writeLines(tmp, results, header=["method", "track", "count"])

    P.load(tmp, outfile)
    os.unlink(tmp)
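
A self-contained sketch of the path parse above (the file path is hypothetical):

import re

infile = "dedup_directional.dir/sample1.clusters.bedgraph"
method, track = re.match(
    r"dedup_(.+)\.dir/(.+)\.clusters\.bedgraph", infile).groups()
print(method, track)   # directional sample1
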
Example no. 55
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Example no. 56
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)