Примеры использования PipelineLncRNA на Python

Язык программирования: Python

Пространство имен/Пакет: CGATPipelines

Класс/Тип: PipelineLncRNA

Примеров на hotexamples.com: 25

Python PipelineLncRNA — найдено 25 примеров. Это лучшие примеры Python-кода для CGATPipelines.PipelineLncRNA, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

CounterMultiExonGenes(3)

CounterSingleExonGenes(3)

flagExonStatus(3)

classifyLncRNAGenes(3)

CounterExonsPerGene(2)

extractMAFGeneBlocks(2)

CounterGenes(2)

CounterMultiExonTranscripts(2)

CounterSingleExonTranscripts(2)

CounterTranscripts(2)

buildFilteredLncRNAGeneSet(2)

CounterExonsPerTranscript(2)

gtfToBed12(2)

parsePhyloCSF(1)

filterMAF(1)

buildFinalLncRNAGeneSet(1)

buildRefnoncodingGeneSet(1)

buildRefcodingGeneSet(1)

buildLncRNAGeneSet(1)

buildCodingGeneSet(1)

splitAlignedFasta(1)

Пример #1

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def classifyLncRNA(infiles, outfile):
    '''
    Classify lncRNA relative to protein coding loci.

    Delegates to PipelineLncRNA.classifyLncRNAGenes with the first two
    entries of `infiles`, `outfile`, and the distance configured in
    PARAMS["lncrna_dist"].  Interval indices are created on the fly
    (the original author noted that additional annotations should
    perhaps be created instead).

    Classes, as described by the original author (the 2kb figures are
    from that description; the distance actually used is whatever
    PARAMS["lncrna_dist"] holds):

    antisense
       transcript overlapping protein coding exons on opposite strand
    antisense_upstream
       transcript < 2kb from tss on opposite strand
    antisense_downstream
       transcript < 2kb from gene end on opposite strand
    sense_upstream
       transcript < 2kb from tss on same strand
    sense_downstream
       transcript < 2kb from gene end on same strand
    intergenic
       transcript >2kb from any protein coding gene
    intronic
       overlaps protein coding gene intron on same strand
    antisense_intronic
       overlaps protein coding intron on opposite strand
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]

    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)

Пример #2

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: lesheng/cgat

def flagExonStatus(infile, outfile):
    '''
    Annotate gtf entries with their exon status.

    Two attributes are added (per the original description):
    exon_status_locus - whether the gene model is multi- or single exon
    exon_status - whether the transcript is multi- or single exon

    The work is done by PipelineLncRNA.flagExonStatus.
    '''
    call_args = (infile, outfile)
    PipelineLncRNA.flagExonStatus(*call_args)

Пример #3

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def classifyFilteredLncRNA(infiles, outfile):
    '''
    Classify all lincRNA before cpc filtering, so that any classes
    represented in the coding set that gets filtered can be defined.

    Delegates to PipelineLncRNA.classifyLncRNAGenes with the first two
    entries of `infiles` and the distance from PARAMS["lncrna_dist"].

    NOTE: This task is not included when running the full pipeline
    '''
    first_input, second_input = infiles[0], infiles[1]
    classify = PipelineLncRNA.classifyLncRNAGenes
    classify(first_input, second_input, outfile,
             dist=PARAMS["lncrna_dist"])

Пример #4

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: nishantthakur/cgat

 def buildFilteredLncRNAGeneSet(infiles, outfile):
     """
     Create a filtered lncRNA geneset.

     Per the original description, single exon lncRNA are excluded
     unless they have been seen previously, i.e. they overlap a
     previously identified lncRNA.

     The first entry of `infiles` is passed as the primary input; all
     remaining entries are passed together as the third argument.
     """
     primary_input = infiles[0]
     remaining_inputs = infiles[1:]
     PipelineLncRNA.buildFilteredLncRNAGeneSet(
         primary_input, outfile, remaining_inputs)

Пример #5

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: nishantthakur/cgat

def buildFinalLncRNAGeneSet(infile, outfile):
    """
    Build the final lncRNA gene set.

    Per the original description, the final set consists of transcripts
    that pass the initial filtering stage, i.e. that are
    multi-exonic or previously seen single exon transcripts, and that
    display low evidence for coding potential.
    """
    # Coding-potential filter: fixed result name plus the configured option.
    cpc_result_name = "lncrna_filtered_cpc_result"
    cpc_option = PARAMS["filtering_cpc"]

    PipelineLncRNA.buildFinalLncRNAGeneSet(
        infile, cpc_result_name, outfile, cpc_option)

Пример #6

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def splitLncRNAFasta(infile, outfiles):
    '''
    Split an aligned fasta file via PipelineLncRNA.splitAlignedFasta.

    PARAMS["phyloCSF_map_species_names"] is read as a comma-separated
    list of "old:new" pairs.  Each side is prefixed with ">" and the
    resulting mapping is logged and handed to splitAlignedFasta
    together with the fixed output directory.

    `outfiles` is not used in this function body.
    '''
    raw_pairs = (entry.split(":") for entry in
                 PARAMS["phyloCSF_map_species_names"].split(","))
    # Index explicitly so that extra ":"-separated fields are ignored.
    header_map = dict((">" + parts[0], ">" + parts[1])
                      for parts in raw_pairs)
    E.info("Name mapping: %s" % header_map)

    target_dir = "./phyloCSF/lncrna_fasta"
    PipelineLncRNA.splitAlignedFasta(infile, target_dir, header_map)

Пример #7

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: yangjl/cgat

def buildLncRNAGeneSet(infiles, outfile):
    '''
    Build the lncRNA gene set.

    Per the original description, this is the set of transcripts in
    the abinitio set that do not overlap any protein coding or
    pseudogene transcripts, or additional unwanted ensembl biotypes
    (exons), in a reference gene set.

    Transcripts need to have a length of at least 200 bp according to
    the original note; the threshold actually passed on is
    PARAMS["lncrna_min_length"].

    The first five entries of `infiles` are forwarded in order.
    '''
    min_length = PARAMS["lncrna_min_length"]
    leading_inputs = tuple(infiles[idx] for idx in range(5))
    PipelineLncRNA.buildLncRNAGeneSet(
        *(leading_inputs + (outfile, min_length)))

Пример #8

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def buildRefnoncodingGeneSet(infile, outfile):
    '''
    Filter the refnoncoding geneset.

    Per the original description, the filter targets entries that
    ensembl describes as:
    Ambiguous_orf
    Retained_intron
    Sense_intronic
    antisense
    Sense_overlapping
    Processed transcript

    The filtering itself is done by
    PipelineLncRNA.buildRefnoncodingGeneSet.
    '''
    build = PipelineLncRNA.buildRefnoncodingGeneSet
    build(infile, outfile)

Пример #9

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

 def buildFilteredLncRNAGeneSet(infiles, outfile):
     '''
     Create a filtered lncRNA geneset that, per the original
     description, contains previously identified gene models supplied
     in contig file.

     PARAMS["filtering_remove_single_exon"] must be "loci",
     "transcripts" or None; it is forwarded as `filter_se`.  The first
     entry of `infiles` is the primary input and the remaining entries
     are passed together as the third argument.
     '''
     single_exon_mode = PARAMS["filtering_remove_single_exon"]
     assert single_exon_mode in ["loci", "transcripts", None]

     primary_input = infiles[0]
     remaining_inputs = infiles[1:]
     PipelineLncRNA.buildFilteredLncRNAGeneSet(
         primary_input,
         outfile,
         remaining_inputs,
         filter_se=single_exon_mode)

Пример #10

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for each interval into a single fasta file.

    Receives a MAF file containing pairwise alignments and an interval
    file (unpacked here as ``bed_file``; the original note called it a
    "gtf12" file). Outputs a single fasta file containing aligned
    sequence for each interval.

    infiles: exactly two entries, unpacked as (bed_file, maf_file).
    outfile: passed through to PipelineLncRNA.extractMAFGeneBlocks.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    # NOTE(review): `to_cluster` and `statement` are never passed explicitly;
    # P.run() presumably reads them, and the %(name)s placeholders, from this
    # function's local variables -- confirm before renaming or removing any
    # of these locals.
    to_cluster = False
    # Decompress the MAF file into the temporary file.
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    # The return value is formatted with %i, so it is expected to be a count.
    E.info("%i gene_models extracted" % gene_models)
    # Remove the decompressed temporary MAF file.
    os.unlink(maf_tmp)

Пример #11

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def createMAFAlignment(infiles, outfile):
    """
    Build a filtered, gzipped MAF alignment from a directory of axt files.

    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.

    infiles: not used in this function body; the axt directory comes from
        PARAMS["phyloCSF_location_axt"].
    outfile: name ending in ".gz"; the suffix is stripped for the
        intermediate output, which is gzipped again at the end.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    # Keep files ending in "net.axt.gz" whose name does not match the
    # ignore pattern.
    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # Space-separated, sorted list for use on the command line below.
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    # NOTE(review): `to_cluster`, `statement` and the locals named in the
    # %(name)s placeholders are never passed explicitly; P.run() presumably
    # reads them from this function's local variables -- confirm before
    # renaming or removing any of them.
    to_cluster = False
    # Concatenate the axt files into tmpf1, then convert to MAF (tmpf2) with
    # axtToMaf. The original comment also mentioned removing headers; no such
    # step is visible in this statement.
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    # Alignments rejected by the filter go to "<outfile stem>_removed.maf".
    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    # filtered[0] is logged as the ignored-block count and filtered[1] as the
    # output-block count.
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    # Clean up both temporary files, then compress the two MAF outputs.
    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()

Пример #12

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def buildCodingGeneSet(infiles, outfile):
    '''
    Filter a cuffcompare'd transcript assembly for annotated protein
    coding genes (description taken from the original author).

    NB "pruned" refers to nomenclature in the transcript building
    pipeline - transcripts that appear in at least two samples.

    Because an abinitio assembly will often contain fragments of known
    transcripts and describe them as novel, the default behaviour is
    to produce a set that is composed of 'complete' or 'contained'
    transcripts, i.e. nothing novel. This may underestimate the number
    of transcripts that are actually expressed.

    The first two entries of `infiles` are forwarded, in order, to
    PipelineLncRNA.buildCodingGeneSet.
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineLncRNA.buildCodingGeneSet(first_input, second_input, outfile)

Пример #13

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: lesheng/cgat

def buildRefcodingGeneSetStats(infile, outfile):
    '''
    Write a two-line, tab-separated summary table for a gene set.

    The columns, in order, are:
    no. of transcripts
    no. genes
    number of exons per transcript
    number of exons per gene
    no. single exon transcripts
    no. multi-exon transcripts
    no. single exon genes
    no. multi-exon genes

    infile: gene set passed to PipelineLncRNA.flagExonStatus.
    outfile: path of the table to write (a header line followed by one
        row of counts, written without a trailing newline).
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    # Column names paired with the counter that fills each column, so
    # the header and the data row cannot drift out of order.
    columns = [
        ("no_transcripts", PipelineLncRNA.CounterTranscripts),
        ("no_genes", PipelineLncRNA.CounterGenes),
        ("no_exons_per_transcript", PipelineLncRNA.CounterExonsPerTranscript),
        ("no_exons_per_gene", PipelineLncRNA.CounterExonsPerGene),
        ("no_single_exon_transcripts",
         PipelineLncRNA.CounterSingleExonTranscripts),
        ("no_multi_exon_transcripts",
         PipelineLncRNA.CounterMultiExonTranscripts),
        ("no_single_exon_genes", PipelineLncRNA.CounterSingleExonGenes),
        ("no_multi_exon_genes", PipelineLncRNA.CounterMultiExonGenes),
    ]

    # Use a context manager so the handle is flushed and closed even if
    # one of the counters raises; the original never closed it.
    with open(outfile, "w") as outf:
        outf.write("\t".join([name for name, _ in columns]) + "\n")
        outf.write("\t".join([str(counter(tmpf).count())
                              for _, counter in columns]))

    # Remove the flagged temp file, its log, and the placeholder created
    # by getTempFilename (the name without the ".gz" suffix).
    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))

Пример #14

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def extractControllLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for each interval via
    PipelineLncRNA.extractMAFGeneBlocks.

    infiles: exactly two entries, unpacked as (bed_file, maf_file).
    outfile: passed through to PipelineLncRNA.extractMAFGeneBlocks.
    """
    bed_file, maf_file = infiles
    # Temporary file is created under the hard-coded "/ifs/scratch" directory.
    maf_tmp = P.getTempFilename("/ifs/scratch")
    # NOTE(review): `to_cluster` and `statement` are never passed explicitly;
    # P.run() presumably reads them, and the %(name)s placeholders, from this
    # function's local variables -- confirm before renaming or removing any
    # of these locals.
    to_cluster = False
    # Decompress the MAF file into the temporary file.
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    # The return value is formatted with %i, so it is expected to be a count.
    E.info("%i gene_models extracted" % gene_models)
    # Remove the decompressed temporary MAF file.
    os.unlink(maf_tmp)

Пример #15

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def convertControlGTFToBed12(infile, outfile):
    """
    Convert a gtf file (either ensembl lincRNA or control, per the
    original description) to bed12 format, using "transcript" as the
    third argument of PipelineLncRNA.gtfToBed12.
    """
    feature = "transcript"
    PipelineLncRNA.gtfToBed12(infile, outfile, feature)

Пример #16

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def convertGTFToBed12(infile, outfile):
    """
    Convert a gtf file to bed12 via PipelineLncRNA.gtfToBed12.

    Per the original description this transforms lncrna_final.gtf.gz
    into lncrna_final.bed.
    """
    to_bed12 = PipelineLncRNA.gtfToBed12
    to_bed12(infile, outfile, "transcript")

Пример #17

Показать файл

def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse PhyloCSF output into a temporary file and load it with P.load.

    infile: passed to PipelineLncRNA.parsePhyloCSF.
    outfile: passed to P.load, together with an index option on gene_id.

    The temporary file is removed once loading has finished or failed,
    so repeated runs do not accumulate files in the scratch directory.
    """
    # Local import keeps this block self-contained.
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--add-index=gene_id")
    finally:
        # NOTE(review): assumes P.load has finished reading tmpf by the
        # time it returns -- confirm against the Pipeline module.
        if os.path.exists(tmpf):
            os.unlink(tmpf)

Пример #18

Показать файл

def convertControlGTFToBed12(infile, outfile):
    """
    Produce a bed12 file from a gtf file.

    The original description states that the input is either the
    ensembl lincRNA gtf or a control gtf.
    """
    conversion_args = (infile, outfile, "transcript")
    PipelineLncRNA.gtfToBed12(*conversion_args)

Пример #19

Показать файл

Файл: Summary.py Проект: santayana/cgat

    def __call__(self, track, slice=None):
        """Return single- and multi-exon gene counts for `track`.

        The counts are read from "gtfs/<track>.gtf.gz" and returned as
        an odict with the keys "single_exon" and "multi_exon", in that
        order.  `slice` is not used in this method body.
        """
        gtf_path = os.path.join("gtfs", track) + ".gtf.gz"
        n_single = PipelineLncRNA.CounterSingleExonGenes(gtf_path).count()
        n_multi = PipelineLncRNA.CounterMultiExonGenes(gtf_path).count()
        return odict((("single_exon", n_single), ("multi_exon", n_multi)))

Пример #20

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset based on the genes that are present in
    the abinitio assembly (per the original description).

    Note the argument order: the second entry of `infiles` is passed
    first and the first entry second.
    '''
    second_input = infiles[1]
    first_input = infiles[0]
    PipelineLncRNA.buildRefcodingGeneSet(second_input, first_input, outfile)

Пример #21

Показать файл

def convertGTFToBed12(infile, outfile):
    """
    Produce a bed12 file from a gtf file.

    The original description names lncrna_final.gtf.gz as the input
    and lncrna_final.bed as the output.
    """
    feature = "transcript"
    PipelineLncRNA.gtfToBed12(infile, outfile, feature)

Пример #22

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: nishantthakur/cgat

def flagExonStatus(infile, outfile):
    """
    Tag gtf entries with their exon status.

    Per the original description, an attribute is added to each gtf
    entry depending on whether the lncRNA is multi or single exon.
    """
    tag_exon_status = PipelineLncRNA.flagExonStatus
    tag_exon_status(infile, outfile)

Пример #23

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: nishantthakur/cgat

def classifyFilteredLncRNA(infiles, outfile):
    """
    Classify all lincRNA before cpc filtering, so that any classes
    represented in the coding set that gets filtered can be defined.

    The first two entries of `infiles` and PARAMS["lncrna_dist"] are
    forwarded to PipelineLncRNA.classifyLncRNAGenes.
    """
    max_dist = PARAMS["lncrna_dist"]
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)

Пример #24

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse PhyloCSF output into a temporary file and load it with P.load.

    infile: passed to PipelineLncRNA.parsePhyloCSF.
    outfile: passed to P.load, together with an index option on gene_id.

    The temporary file is removed once loading has finished or failed,
    so repeated runs do not accumulate files in the scratch directory.
    """
    # Local import keeps this block self-contained.
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--index=gene_id")
    finally:
        # NOTE(review): assumes P.load has finished reading tmpf by the
        # time it returns -- confirm against the Pipeline module.
        if os.path.exists(tmpf):
            os.unlink(tmpf)

Пример #25

Показать файл

def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset restricted to genes present in the
    abinitio assembly (per the original description).

    The first two entries of `infiles` are forwarded in swapped order:
    infiles[1] first, then infiles[0].
    '''
    swapped_inputs = (infiles[1], infiles[0])
    PipelineLncRNA.buildRefcodingGeneSet(*(swapped_inputs + (outfile,)))