Exemplo n.º 1
0
def loadGeneStats(infile, outfile):
    """Compute per-gene statistics and load them into the database.

    The gzipped input is decompressed and piped through
    ``cgat gtf2table`` with these counters:

    * position - gene position
    * length - gene/exon lengths
    * composition-na - gene nucleotide composition

    The resulting table is piped into the load command, whose output
    is redirected to `outfile`.

    Parameters
    ----------
    infile : string
        Gzip-compressed :term:`gtf` file (output of :meth:`buildGenes`).
    outfile : string
        File receiving the loader output. The table name is obtained
        from it with ``P.toTable``; ``gtf2table`` logs to
        ``<outfile>.log``.
    """
    # NOTE: P.run() is called without arguments, so it presumably picks
    # up `statement` and the names used in its %(...)s placeholders from
    # this function's local scope -- those names must not be renamed.
    table = P.toTable(outfile)
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id --map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 2
0
def loadPeptideSequences(infile, outfile):
    """Load an ENSEMBL peptide file into the database.

    Empty sequences are removed before loading (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372).

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        Filename with logging information. The tablename is
        derived from ``outfile``.

    """
    # The loader options must be separated by a space; previously the
    # two literals fused into the single token
    # "--add-protein_id--map=protein_id:str". The index option is
    # spelled like the ``--add-index=<column>`` options used by the
    # other load tasks.
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # * The first perl step truncates FASTA header lines at the first
    #   blank. The guard is a real regex match now; the former
    #   ``if ("^>")`` tested a constant string and was always true.
    # * The last perl step renames the ``id`` header column; it is
    #   anchored so that only a leading ``id`` is rewritten.
    # NOTE: P.run() presumably reads `statement` and the placeholder
    # names from the local scope, so these names must be kept.
    statement = """gunzip
    < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | python %(scriptsdir)s/fasta2fasta.py --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py --section=length
    --section=sequence
    | perl -p -e 's/^id/protein_id/'
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
Exemplo n.º 3
0
def loadGeneStats(infile, outfile):
    """Run ``gtf2table.py`` on a gene set and load the result.

    The decompressed :term:`gtf` stream is annotated by
    ``gtf2table.py`` using three counters:

    * position - gene position
    * length - gene/exon lengths
    * composition-na - gene nucleotide composition

    and the table is then piped into the database load command.

    Parameters
    ----------
    infile : string
        Gzip-compressed :term:`gtf` file, as produced by
        :meth:`buildGenes`.
    outfile : string
        Receives the output of the load command. ``P.toTable`` turns
        this name into the table name, and ``gtf2table.py`` writes its
        log to ``<outfile>.log``.
    """
    # `load_statement` is referenced by name in the template below;
    # P.run() (called without arguments) presumably resolves it from
    # the local scope.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --counter=position
          --counter=length
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 4
0
def loadPeptideSequences(infile, outfile):
    '''Load an ENSEMBL peptide file into the database.

    Empty sequences are removed before loading (see for example
    transcript:ENSMUST00000151316, ENSMUSP00000118372).

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        Filename with logging information. The tablename is
        derived from ``outfile``.

    '''
    # A separating space is required between the two options; without
    # it they were passed as the single token
    # "--add-protein_id--map=protein_id:str". The index option follows
    # the ``--add-index=<column>`` form used by the other load tasks.
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # * Header lines (starting with ">") are cut at the first blank;
    #   the guard now actually matches the line instead of testing the
    #   always-true string "^>".
    # * The ``id`` column header is renamed to ``protein_id``; the
    #   substitution is anchored to the start of the line.
    # NOTE: P.run() presumably reads `statement` and the placeholder
    # names from the local scope, so these names must be kept.
    statement = '''gunzip
    < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | cgat fasta2fasta --method=filter
    --filter-method=min-length=1
    | cgat fasta2table --section=length
    --section=sequence
    | perl -p -e 's/^id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 5
0
def loadTranscriptStats(infile, outfile):
    """Compute per-transcript properties and load them into the database.

    ``gtf2table.py`` is run with ``--reporter=transcripts`` and the
    following counters:

    * position - gene position
    * length - gene/exon lengths
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`;
       ``gtf2table.py`` logs to ``<outfile>.log``.

    """
    # P.run() is invoked without arguments and presumably looks up
    # `statement` and its placeholder names in the local scope.
    table = P.toTable(outfile)
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id --add-index=transcript_id --map=gene_id:str")

    # The trailing backslashes are Python line continuations inside the
    # (non-raw) string literal, so the gtf2table call is one line.
    statement = """
    gunzip < %(infile)s |\
    python %(scriptsdir)s/gtf2table.py \
          --log=%(outfile)s.log \
          --genome=%(genome_dir)s/%(genome)s \
          --reporter=transcripts \
          --counter=position \
          --counter=length \
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
def loadEditDistances(infile, outfile):
    '''Load distributions of edit distances as output by umi_tools dedup.

    ``sed`` rewrites every ``unique`` in `infile` to ``_unique`` and the
    result is piped into the load command, which is given
    ``-i edit_distance`` (presumably an index on that column).
    '''
    # `load_smt` is referenced by name in the statement template, which
    # P.run() presumably fills in from the local scope.
    load_smt = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="-i edit_distance")

    statement = ''' sed s/unique/_unique/g %(infile)s
                 | %(load_smt)s > %(outfile)s '''

    P.run()
Exemplo n.º 7
0
def loadRepeats(infile, outfile):
    """Load the genomic locations of repeats into the database.

    The repeat annotation is converted to bed with ``cgat gff2bed``,
    comment lines are dropped and the first four columns are loaded
    under the header ``contig,start,stop,class``.

    Arguments
    ---------
    infile : string
        Gzip-compressed :term:`gff` file with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.

    """
    table = P.toTable(outfile)

    # Not used directly in this function; presumably picked up by
    # P.run() from the local scope as the job's memory request.
    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        table,
        options="--add-index=class --header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
Exemplo n.º 8
0
def loadTranscripts(infile, outfile):
    '''Load the transcripts of a GTF file into the database.

    The file is converted to a table with ``gtf2tsv.py``. The load
    command is asked to index ``gene_id`` and ``transcript_id`` and to
    accept empty input.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    # Note the trailing blank in the option string; it is part of the
    # original options and is kept as is.
    load_options = ("--add-index=gene_id --add-index=transcript_id "
                    "--allow-empty-file ")
    load_statement = P.build_load_statement(P.toTable(outfile),
                                            options=load_options)

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 9
0
def loadmiRNATranscripts(infile, outfile):
    '''Load transcripts from a GFF3 file into the database.

    ``cgat gtf2tsv --is-gff3`` expands the attributes into columns;
    comment lines are removed and columns 3 and 12 are loaded under the
    header ``feature,Name``.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gff3` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    table = P.toTable(outfile)

    # Unused in this function body; presumably consumed by P.run() from
    # the local scope as the job's memory request.
    job_memory = PARAMS["job_memory"]

    load_statement = P.build_load_statement(
        table,
        options="--allow-empty-file --header-names=feature,Name")

    statement = '''
     export LANG=en_GB.UTF-8 && zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    |%(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 10
0
def loadGO(infile, outfile, tablename):
    """Import GO results into a single table.

    All ``*.overall`` files found in the directory ``<infile>.dir`` are
    concatenated with ``cat_tables.py`` and loaded into `tablename`.
    If that directory does not exist, `outfile` is only touched.

    Arguments
    ---------
    infile : string
        Name whose ``.dir`` companion directory holds the GO results.
    outfile : string
        File receiving the output of the load command.
    tablename : string
        Name of the database table to create.

    """
    # `indir` is referenced by name in the statement template.
    indir = infile + ".dir"

    if os.path.exists(indir):
        load_statement = P.build_load_statement(
            tablename=tablename,
            options="--allow-empty-file --add-index=category --add-index=goid ")

        # P.run() presumably resolves `statement` and its placeholders
        # from the local scope.
        statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
        P.run()
    else:
        # nothing to load - just mark the task as done
        P.touch(outfile)
Exemplo n.º 11
0
def loadTranscriptStats(infile, outfile):
    '''Compute per-transcript properties and load them into the database.

    ``cgat gtf2table`` is run with ``--reporter=transcripts`` and these
    counters:

    * position - gene position
    * length - gene/exon lengths
    * composition-na - gene nucleotide composition

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`;
       ``gtf2table`` logs to ``<outfile>.log``.

    '''
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --add-index=transcript_id --map=gene_id:str")

    # The backslash-newline pairs are consumed by Python (the literal is
    # not raw), joining the gtf2table call onto one line.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''
    gunzip < %(infile)s |\
    cgat gtf2table \
          --log=%(outfile)s.log \
          --genome=%(genome_dir)s/%(genome)s \
          --reporter=transcripts \
          --counter=position \
          --counter=length \
          --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 12
0
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''Extract a histogram from picard output files and load it into
    the database.

    For each entry of `infiles` the file ``<entry>.<suffix>`` is
    used if it exists. The existing files are merged with
    ``combine_tables.py`` (starting at the ``## HISTOGRAM`` marker) and
    the merged table is loaded. If none of the files exists a warning
    is issued and nothing is run.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile; the output of the load command is appended to it.
    suffix : string
        Suffix to append to table name and to each input filename.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``, with ``_metrics`` replaced by ``_histogram``.
    '''

    if not tablename:
        derived = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = derived.replace("_metrics", "_histogram")

    # pair every track with its histogram file; some might be missing
    candidates = [(track, "%s.%s" % (track, suffix)) for track in infiles]
    present = [(track, path) for track, path in candidates
               if os.path.exists(path)]

    if not present:
        E.warn("no files for %s" % tablename)
        return

    # `header` and `filenames` feed the templates below
    header = ",".join(P.snip(os.path.basename(track), pipeline_suffix)
                      for track, _ in present)
    filenames = " ".join(path for _, path in present)

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_options = ("--add-index=track  --header-names=%s,%s"
                    " --allow-empty-file --replace-header") % (column, header)
    load_statement = P.build_load_statement(tablename, options=load_options)

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = """python %(scriptsdir)s/combine_tables.py
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run()
Exemplo n.º 13
0
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''Extract a histogram from picard output files and load it into
    the database.

    Only inputs for which ``<infile>.<suffix>`` exists are considered.
    Their histogram sections (from the ``## HISTOGRAM`` marker onwards)
    are merged by ``cgat combine_tables`` and loaded; when no such file
    exists a warning is logged and the function returns early.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile; loader output is appended to it.
    suffix : string
        Suffix to append to table name and to each input filename.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``, with ``_metrics`` replaced by ``_histogram``.
    '''

    if not tablename:
        tablename = ("%s_%s" % (P.toTable(outfile), suffix)).replace(
            "_metrics", "_histogram")

    # some files might be missing - collect tracks and files in step
    tracks = []
    histograms = []
    for infile in infiles:
        histogram = "%s.%s" % (infile, suffix)
        if os.path.exists(histogram):
            tracks.append(P.snip(os.path.basename(infile), pipeline_suffix))
            histograms.append(histogram)

    if not histograms:
        E.warn("no files for %s" % tablename)
        return

    # both names are referenced by the templates below
    header = ",".join(tracks)
    filenames = " ".join(histograms)

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    option_template = ("--add-index=track "
                       " --header-names=%s,%s"
                       " --allow-empty-file"
                       " --replace-header")
    load_statement = P.build_load_statement(
        tablename,
        options=option_template % (column, header))

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run()
Exemplo n.º 14
0
def loadGeneInformation(infile, outfile, only_proteincoding=False):
    """Load gene-related attributes from a :term:`gtf` file into the database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Output filename, contains logging information. The
       table name is derived from the filename of outfile.
    only_proteincoding : bool
       If True, only consider protein coding genes.

    """

    # Not used directly here; presumably read by P.run() from the local
    # scope as the job's memory request.
    job_memory = "4G"
    table = P.toTable(outfile)

    # `filter_cmd` is spliced into the pipe below; "cat" is a no-op
    # stand-in when no filtering is requested.
    if only_proteincoding:
        filter_cmd = (
            """python %(scriptsdir)s/gtf2gtf.py
        --method=filter --filter-method=proteincoding"""
            % PARAMS
        )
    else:
        filter_cmd = "cat"

    # Every option needs a separating space: previously
    # "--add-index=gene_name" and "--map=gene_name:str" were fused into
    # the single token "--add-index=gene_name--map=gene_name:str".
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--map=gene_name:str"
    )

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = """
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | python %(scriptsdir)s/gtf2gtf.py
    --method=sort --sort-order=gene+transcript
    | python %(scriptsdir)s/gtf2tsv.py
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py
    --remove exon_id transcript_id transcript_name protein_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1
    | uniq
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
Exemplo n.º 15
0
def loadGeneInformation(infile, outfile, only_proteincoding=False):
    '''Load gene-related attributes from a :term:`gtf` file into the database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Output filename, contains logging information. The
       table name is derived from the filename of outfile.
    only_proteincoding : bool
       If True, only consider protein coding genes.

    '''

    # Not used directly here; presumably read by P.run() from the local
    # scope as the job's memory request.
    job_memory = "4G"
    table = P.toTable(outfile)

    # `filter_cmd` is spliced into the pipe below; "cat" is a no-op
    # stand-in when no filtering is requested. The command contains no
    # placeholders, so it is used verbatim.
    if only_proteincoding:
        filter_cmd = """cgat gtf2gtf
        --method=filter --filter-method=proteincoding"""
    else:
        filter_cmd = "cat"

    # Every option needs a separating space: previously
    # "--add-index=gene_name" and "--map=gene_name:str" were fused into
    # the single token "--add-index=gene_name--map=gene_name:str".
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--map=gene_name:str")

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | cgat gtf2gtf
    --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py
    --remove exon_id transcript_id transcript_name protein_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 16
0
def loadMotifSequenceComposition(infile, outfile):
    '''Compute the sequence composition of sequences used for ab-initio
    search and load it into the database.

    `infile` is fed on stdin to ``fasta2table.py --section=na`` and the
    resulting table is piped into the load command.
    '''
    table = P.toTable(outfile)
    load_statement = P.build_load_statement(table)

    # NOTE(review): fasta2table logs to the same path that also receives
    # the loader's stdout -- confirm this is intended.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''
    python %(scriptsdir)s/fasta2table.py
        --section=na
        --log=%(outfile)s
    < %(infile)s
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 17
0
def loadTranscriptInformation(infile, outfile,
                              only_proteincoding=False):
    '''Load transcript-related attributes from a :term:`gtf` file into
    the database.

    Only ``CDS`` features carrying a ``transcript_id`` are kept. They
    are sorted by gene and transcript, their attributes (9th column of
    the gtf file) are turned into columns, ``exon_id`` and
    ``exon_number`` are removed and duplicate rows are collapsed before
    loading.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Output filename, contains logging information. The
       table name is derived from the filename of outfile.
    only_proteincoding : bool
       If True, only consider protein coding genes.

    '''
    table = P.toTable(outfile)

    # "cat" is a no-op stand-in when no filtering is requested, so the
    # default pipeline output is unchanged.
    if only_proteincoding:
        filter_cmd = """cgat gtf2gtf
        --method=filter --filter-method=proteincoding"""
    else:
        filter_cmd = "cat"

    # Every option needs a separating space; previously the last four
    # literals were fused into one token.
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id "
        "--add-index=gene_name "
        "--add-index=protein_id "
        "--add-index=transcript_id "
        "--map=gene_name:str")

    # `filter_cmd` is now part of the pipe; it used to be computed but
    # never used, so `only_proteincoding` had no effect.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''zcat < %(infile)s
    | %(filter_cmd)s
    | awk '$3 == "CDS"'
    | grep "transcript_id"
    | cgat gtf2gtf
    --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv
    --attributes-as-columns --output-only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py --remove exon_id exon_number
    | %(pipeline_scriptsdir)s/hsort 1 | uniq
    | %(load_statement)s
    > %(outfile)s'''
    P.run()
Exemplo n.º 18
0
def loadPolyphen(infile, outfile):
    '''Load polyphen results into the database.

    The gzipped result file is decompressed and cleaned with perl
    (``o_acc`` is renamed to ``protein_id``, runs of blanks and a
    leading ``#`` are removed) before it is piped into the load
    command.
    '''
    table = P.toTable(outfile)
    load_statement = P.build_load_statement(
        table,
        options="--add-index=snp_id --add-index=protein_id --map=effect:str")

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run()
Exemplo n.º 19
0
def loadProteinStats(infile, outfile):
    '''Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
       Filename of ENSEMBL peptide file in :term:`fasta` format
       (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # the awk statement truncates ids ENSPXXX.1 to ENSPXXX
    # necessary for downstream compatibility (e.g. seleno list)
    #
    # The literal is a raw string: it contains the regex escapes "\."
    # and "\S", which are invalid escape sequences in a normal string
    # literal (a SyntaxWarning on recent Python versions). The
    # resulting text is unchanged.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = r'''
    gunzip < %(infile)s
    | cgat fasta2fasta
    --method=filter
    --filter-method=min-length=1
    | awk 'match($0, /(>ENS[A-Z]+[0-9]+)(\.[0-9])*(.*)/, a) {print a[1], a[3]}
    !/^>/ {print}'
    | cgat fasta2table
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
def loadPolyphen(infile, outfile):
    '''Load polyphen results into the database.

    After decompression a perl one-liner renames ``o_acc`` to
    ``protein_id``, strips runs of blanks and removes a leading ``#``;
    the cleaned table is then handed to the load command.
    '''
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=snp_id --add-index=protein_id --map=effect:str")

    # `statement` and the names in its placeholders are presumably read
    # by P.run() from the local scope.
    statement = '''
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run()
Exemplo n.º 21
0
def loadProteinStats(infile, outfile):
    '''Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
       Filename of ENSEMBL peptide file in :term:`fasta` format
       (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # The awk step rewrites header lines so that a ".<digit>" version
    # suffix following the identifier is dropped; sequence lines are
    # passed through unchanged.
    #
    # The literal is a raw string: it contains the regex escapes "\."
    # and "\S", which are invalid escape sequences in a normal string
    # literal (a SyntaxWarning on recent Python versions). The
    # resulting text is unchanged.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = r'''
    gunzip < %(infile)s
    | cgat fasta2fasta
    --method=filter
    --filter-method=min-length=1
    | awk 'match($0, /(>[a-zA-Z]+[0-9]+)(\.[0-9])*(.*)/, a) {print a[1], a[3]}
    !/^>/ {print}'
    | cgat fasta2table
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 22
0
def loadGeneCoordinates(infile, outfile):
    """Merge transcripts into per-gene genomic coordinates and load them.

    ``gtf2gtf.py --method=merge-transcripts`` collapses the transcripts
    and ``gtf2tsv.py`` converts the result into the table that is
    loaded.
    """
    table = P.toTable(outfile)

    # TS. remove transcript_id column as this is now meaningless
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id --ignore-column=transcript_id --allow-empty-file ")

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py
    --method=merge-transcripts
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
Exemplo n.º 23
0
def loadGeneCoordinates(infile, outfile):
    '''Derive one genomic interval per gene and load it into the database.

    The transcripts in the gzipped input are merged with
    ``gtf2gtf.py --method=merge-transcripts``; the merged records are
    converted by ``gtf2tsv.py`` and piped into the load command.
    '''
    # TS. remove transcript_id column as this is now meaningless
    load_options = ("--add-index=gene_id "
                    "--ignore-column=transcript_id "
                    "--allow-empty-file ")
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options=load_options)

    # `statement` and the names in its placeholders are presumably read
    # by P.run() from the local scope.
    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2gtf.py
    --method=merge-transcripts
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 24
0
def loadProteinStats(infile, outfile):
    '''Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
       Filename of ENSEMBL peptide file in :term:`fasta` format
       (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''

    load_statement = P.build_load_statement(P.toTable(outfile),
                                            options="--add-index=protein_id "
                                            "--map=protein_id:str")

    # The literal is a raw string: it contains the regex escape "\S",
    # which is an invalid escape sequence in a normal string literal
    # (a SyntaxWarning on recent Python versions). The resulting text
    # is unchanged.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = r'''
    gunzip < %(infile)s
    | python %(scriptsdir)s/fasta2fasta.py
    --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 25
0
def loadProteinStats(infile, outfile):
    """Compute and load protein sequence properties into the database.

    The method computes amino acid composition, length, and hash
    for each peptide sequence.

    The method calls :doc:`fasta2table` with the following counters:

    * length - protein sequence length
    * hid - protein sequence hash identifier
    * aa - protein sequence composition

    Arguments
    ---------
    infile : string
       Filename of ENSEMBL peptide file in :term:`fasta` format
       (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    """

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=protein_id " "--map=protein_id:str"
    )

    # The literal is a raw string: it contains the regex escape "\S",
    # which is an invalid escape sequence in a normal string literal
    # (a SyntaxWarning on recent Python versions). The resulting text
    # is unchanged.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = r"""
    gunzip < %(infile)s
    | python %(scriptsdir)s/fasta2fasta.py
    --method=filter
    --filter-method=min-length=1
    | python %(scriptsdir)s/fasta2table.py
    --log=%(outfile)s
    --sequence-type=aa
    --section=length
    --section=hid
    --section=aa
    --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
Exemplo n.º 26
0
def loadTranscript2Gene(infile, outfile):
    """Build a transcript-to-gene map from a gtf file and load it.

    The map is produced by ``gtf2tsv.py --output-map=transcript2gene``
    and piped into the load command.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.
    """
    table = P.toTable(outfile)
    # the trailing blank is part of the original option string
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id --add-index=transcript_id ")

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = """
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py --output-map=transcript2gene -v 0
    | %(load_statement)s
    > %(outfile)s"""

    P.run()
Exemplo n.º 27
0
def loadGeneCoordinates(infile, outfile):
    '''Merge transcripts into per-gene genomic coordinates and load them.

    ``cgat gtf2gtf --method=merge-transcripts`` collapses the
    transcripts of the gzipped input; ``cgat gtf2tsv`` turns the result
    into the table that is loaded.
    '''
    table = P.toTable(outfile)

    # TS. remove transcript_id column as this is now meaningless
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id --ignore-column=transcript_id --allow-empty-file ")

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=merge-transcripts
    | cgat gtf2tsv
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 28
0
def loadDapars(infiles, outfile):
    '''Munge the DaPars output to separate transcript and gene ids,
    and load it into the database.

    The input tables are concatenated with ``combine_tables.py`` (one
    ``track`` per file, taken from the directory name). ``|`` is then
    turned into a tab and, on the first line only, the ``Gene`` header
    is expanded into ``transcript_id``, ``gene_id``, ``chrom`` and
    ``strand``.
    '''
    # Build the load command first; P.run() presumably resolves it (and
    # the other placeholder names) from the local scope at call time.
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="-i track -i gene_id -i transcript_id")

    # the template expects a blank-separated list of filenames
    infiles = " ".join(infiles)

    statement = '''python %(scriptsdir)s/combine_tables.py
                   --cat=track
                   --use-file-prefix
                   --regex-filename='dapars_out.dir/(.+)/dapars_out'
                   %(infiles)s -L %(outfile)s
            |   sed 's/[|]/\\t/g'
            |   sed '1!b;s/Gene/transcript_id\\tgene_id\\tchrom\\tstrand/'
            |   %(load_statement)s
            > %(outfile)s'''

    P.run()
Exemplo n.º 29
0
def loadTranscript2Gene(infile, outfile):
    '''Build a transcript-to-gene map from a gtf file and load it.

    ``gtf2tsv.py --output-map=transcript2gene`` writes the map, which
    is piped straight into the load command.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.
    '''
    # the trailing blank is part of the original option string
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=gene_id --add-index=transcript_id ")

    # `statement` and the names in its placeholders are presumably read
    # by P.run() from the local scope.
    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py --output-map=transcript2gene -v 0
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 30
0
def loadmiRNATranscripts(infile, outfile):
    '''Load transcripts from a GFF3 file into the database.

    The attributes are expanded into columns by
    ``cgat gtf2tsv --is-gff3``; after dropping comment lines, columns 3
    and 12 are loaded under the header ``feature,Name``.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gff3` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--allow-empty-file --header-names=feature,Name")

    # `statement` and the names in its placeholders are presumably read
    # by P.run() from the local scope.
    statement = '''
     export LANG=en_GB.UTF-8 && zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    |%(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 31
0
def loadBigWigStats(infiles, outfile):
    '''Merge and load the bigwig summaries of all wiggle files.

    ``bigWigInfo`` is run on every input through a shell process
    substitution; the per-file summaries are merged with
    ``cgat combine_tables``, transposed and loaded.

    Parameters
    ----------
    infiles : list
       Input filenames in :term:`bigwig` format
    outfile : string
        Output filename, the table name is derived from `outfile`.
    '''
    # one process substitution per file: "key: value" lines become
    # tab-separated, blanks and thousands separators are removed
    info_template = '<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")'

    # `data` and `headers` are referenced by the statement template
    data = " ".join(info_template % x for x in infiles)
    headers = ",".join(P.snip(os.path.basename(x), ".bw") for x in infiles)

    load_statement = P.build_load_statement(
        tablename=P.toTable(outfile),
        options="--add-index=track")

    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = '''cgat combine_tables
    --header-names=%(headers)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(data)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run()
Exemplo n.º 32
0
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """Merge output from :func:`summarizeTagsWithinContext` and load it
    into the database.

    The per-track tables are merged with ``cgat combine_tables``, the
    ``bin`` column is renamed to ``track``, ``?`` is replaced by ``Q``,
    and the table is transposed before loading.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    # `header` and `filenames` are referenced by the statement template
    header = ",".join([P.snip(os.path.basename(x), suffix)
                       for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    # The literal is a raw string: it contains the regex escape "\?",
    # which is an invalid escape sequence in a normal string literal
    # (a SyntaxWarning on recent Python versions). The resulting text
    # is unchanged.
    # P.run() presumably resolves `statement` and its placeholders from
    # the local scope.
    statement = r"""cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run()
Exemplo n.º 33
0
def loadTranscripts(infile, outfile):
    '''Load the transcripts of a GTF file into the database.

    ``gtf2tsv.py`` converts the decompressed input into a table. The
    load command is asked to index ``gene_id`` and ``transcript_id``
    and to tolerate empty input.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    table = P.toTable(outfile)
    # the trailing blank is part of the original option string
    load_statement = P.build_load_statement(
        table,
        options="--add-index=gene_id --add-index=transcript_id --allow-empty-file ")

    # `statement` and the names in its placeholders are presumably read
    # by P.run() from the local scope.
    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2tsv.py
    | %(load_statement)s
    > %(outfile)s'''

    P.run()
Exemplo n.º 34
0
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.

    Three groups of tables are loaded; ``<track>`` is the name derived
    from `outfile` via ``P.toTable``:

    * ``hypergeometric_<track>_summary`` from `infile`.
    * ``hypergeometric_<track>_<section>`` for the sections ``results``,
      ``parameters`` and ``withgenes``, combined from the files matching
      ``hypergeometric.dir/<track>.tsv.dir/*.<section>``.
    * ``hypergeometric_<track>_<ontology>_<measure>`` for every ontology
      listed in the summary table and the measures ``l2fold``,
      ``l10pvalue`` and ``l10qvalue``, read from
      ``<infile>.dir/all_alldesc.<ontology>.<measure>``.

    Arguments
    ---------
    infile : string
        File loaded as the summary table. ``<infile>.dir`` is expected
        to contain the per-ontology files.
    outfile : string
        File that the output of the load commands is written to. The
        track name is derived from `outfile`.
    '''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [
        x[0] for x in Database.executewait(
            dbh, '''SELECT DISTINCT ontology FROM %s''' %
            tablename).fetchall()
    ]

    # NOTE(review): ``genelists`` is not used further down. The query is
    # kept so that the set of database statements issued is unchanged -
    # confirm whether it can be dropped.
    genelists = [
        x[0] for x in Database.executewait(
            dbh, '''SELECT DISTINCT genelist FROM %s''' %
            tablename).fetchall()
    ]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads ``statement`` and the %(name)s values (track, section,
    # load_statement, ...) from the local variables - keep their names.
    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(tablename=tablename)

        # Raw string literal: ``\S`` is part of the regular expression
        # handed to combine_tables.py and is not a valid Python escape
        # sequence in a normal string literal.
        statement = r'''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        # Only the l2fold file is checked for existence. If it is
        # missing, all three measures of this ontology are skipped.
        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        for measure in ('l2fold', 'l10pvalue', 'l10qvalue'):
            fn = os.path.join(infile + ".dir",
                              "all_alldesc.%s.%s" % (ontology, measure))

            P.load(fn,
                   outfile,
                   tablename='hypergeometric_%s_%s_%s' % (
                       track, ontology, measure),
                   options='--allow-empty-file')
Exemplo n.º 35
0
def loadBAMStats(infiles, outfile):
    """Combine the output of :func:`buildBAMStats` and load it into the database.

    Four tables are written, with ``<table>`` derived from `outfile`:

    * ``<table>``: built from the first two columns of every input
      file and transposed before loading.
    * ``<table>_nm`` and ``<table>_nh``: built from the files
      ``<infile>.nm`` and ``<infile>.nh``.
    * ``<table>_mapq``: built from the files ``<infile>.mapq``.

    Column headers are the basenames of the input files with
    ``.readstats`` removed via ``P.snip``.

    Arguments
    ---------
    infiles : list
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    """

    # NOTE(review): P.run() takes no arguments; it presumably reads
    # ``statement`` and the %(name)s values (header, filenames, suffix,
    # load_statement, ...) from the local variables, so the names of
    # these locals must not change.
    tablename = P.toTable(outfile)
    header = ",".join(
        P.snip(os.path.basename(path), ".readstats") for path in infiles)
    # Each file is wrapped in a process substitution that keeps only
    # its first two columns.
    filenames = " ".join(
        "<( cut -f 1,2 < %s)" % path for path in infiles)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track" "  --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = '''cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s'''
    P.run()

    # Per-suffix tables; output is appended to the same outfile.
    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(
            "%s.%s" % (path, suffix) for path in infiles)

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = '''cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s '''
        P.run()

    # Mapping qualities: every row has the two columns 'all_reads' and
    # 'filtered_reads'. Only filtered_reads is loaded (--take=3).
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(
            "%s.%s" % (path, suffix) for path in infiles)

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = '''cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s '''
        P.run()
Exemplo n.º 36
0
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.

    Three groups of tables are loaded; ``<track>`` is the name derived
    from `outfile` via ``P.toTable``:

    * ``hypergeometric_<track>_summary`` from `infile`.
    * ``hypergeometric_<track>_<section>`` for the sections ``results``,
      ``parameters`` and ``withgenes``, combined from the files matching
      ``hypergeometric.dir/<track>.tsv.dir/*.<section>``.
    * ``hypergeometric_<track>_<ontology>_<measure>`` for every ontology
      listed in the summary table and the measures ``l2fold``,
      ``l10pvalue`` and ``l10qvalue``, read from
      ``<infile>.dir/all_alldesc.<ontology>.<measure>``.

    Arguments
    ---------
    infile : string
        File loaded as the summary table. ``<infile>.dir`` is expected
        to contain the per-ontology files.
    outfile : string
        File that the output of the load commands is written to. The
        track name is derived from `outfile`.
    '''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # NOTE(review): ``genelists`` is not used further down. The query is
    # kept so that the set of database statements issued is unchanged -
    # confirm whether it can be dropped.
    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads ``statement`` and the %(name)s values (track, section,
    # load_statement, ...) from the local variables - keep their names.
    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(
            tablename=tablename)

        # Raw string literal: ``\S`` is part of the regular expression
        # handed to combine_tables.py and is not a valid Python escape
        # sequence in a normal string literal.
        statement = r'''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        # Only the l2fold file is checked for existence. If it is
        # missing, all three measures of this ontology are skipped.
        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        for measure in ('l2fold', 'l10pvalue', 'l10qvalue'):
            fn = os.path.join(
                infile + ".dir", "all_alldesc.%s.%s" % (ontology, measure))

            P.load(fn,
                   outfile,
                   tablename='hypergeometric_%s_%s_%s' % (
                       track, ontology, measure),
                   options='--allow-empty-file')