def classifyLncRNA(infiles, outfile):
    '''
    Classify lncRNA relative to protein coding loci.

    Each lincRNA is classified by its relationship to protein coding
    genes. Interval indices are created on the fly (additional
    annotations could perhaps be created instead). The distance
    handed to the classifier is PARAMS["lncrna_dist"]; the "2kb"
    quoted below comes from the original description of the classes:

    antisense
       transcript overlapping protein coding exons on opposite strand
    antisense_upstream
       transcript < 2kb from tss on opposite strand
    antisense_downstream
       transcript < 2kb from gene end on opposite strand
    sense_upstream
       transcript < 2kb from tss on same strand
    sense_downstream
       transcript < 2kb from gene end on same strand
    intergenic
       transcript >2kb from any protein coding gene
    intronic
       overlaps protein coding gene intron on same strand
    antisense_intronic
       overlaps protein coding intron on opposite strand

    infiles: sequence whose first two items are forwarded, in order,
        to PipelineLncRNA.classifyLncRNAGenes.
    outfile: output path forwarded to the classifier.
    '''
    first_input, second_input = infiles[0], infiles[1]
    max_dist = PARAMS["lncrna_dist"]
    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)
示例#2
0
def flagExonStatus(infile, outfile):
    '''
    Annotate each gtf entry with two extra attributes:

    exon_status_locus
        whether the gene model is multi- or single exon
    exon_status
        whether the transcript is multi- or single exon

    The work is delegated to PipelineLncRNA.flagExonStatus.
    '''
    call_args = (infile, outfile)
    PipelineLncRNA.flagExonStatus(*call_args)
def classifyFilteredLncRNA(infiles, outfile):
    '''
    Classify every lincRNA ahead of cpc filtering, so that any classes
    represented in the coding set that gets filtered can be defined.

    The first two items of infiles are forwarded, in order, to
    PipelineLncRNA.classifyLncRNAGenes together with the distance
    PARAMS["lncrna_dist"].

    NOTE: This task is not included when running the full pipeline
    '''
    classifier_args = (infiles[0], infiles[1], outfile)
    PipelineLncRNA.classifyLncRNAGenes(
        *classifier_args, dist=PARAMS["lncrna_dist"])
 def buildFilteredLncRNAGeneSet(infiles, outfile):
     """
     Build the filtered lncRNA geneset.

     Single exon lncRNA are excluded from this geneset unless they
     have been seen before, i.e. they overlap a previously
     identified lncRNA.

     infiles: the first item is the primary input; every remaining
         item is handed over as a separate sequence.
     outfile: output path forwarded to the pipeline helper.
     """
     primary = infiles[0]
     remaining = infiles[1:]
     PipelineLncRNA.buildFilteredLncRNAGeneSet(primary, outfile, remaining)
def buildFinalLncRNAGeneSet(infile, outfile):
    """
    Build the final lncRNA gene set.

    The final set is made of transcripts that passed the initial
    filtering stage, i.e. that are
    multi-exonic / previously seen single exon transcripts and
    display low evidence for coding potential.
    """
    # The coding-potential filter is driven by the fixed result name
    # below and by the PARAMS["filtering_cpc"] setting.
    cpc_result_name = "lncrna_filtered_cpc_result"
    cpc_setting = PARAMS["filtering_cpc"]
    PipelineLncRNA.buildFinalLncRNAGeneSet(
        infile, cpc_result_name, outfile, cpc_setting)
def splitLncRNAFasta(infile, outfiles):
    """
    Split an aligned fasta file into ./phyloCSF/lncrna_fasta.

    PARAMS["phyloCSF_map_species_names"] is read as a comma separated
    list of "old:new" pairs. Each side is prefixed with ">" and the
    resulting mapping is passed to PipelineLncRNA.splitAlignedFasta.
    The outfiles argument is not used in the body.
    """
    raw_pairs = (entry.split(":")
                 for entry in PARAMS["phyloCSF_map_species_names"].split(","))
    name_dict = {">" + parts[0]: ">" + parts[1] for parts in raw_pairs}
    E.info("Name mapping: %s" % name_dict)

    out_dir = "./phyloCSF/lncrna_fasta"
    PipelineLncRNA.splitAlignedFasta(infile, out_dir, name_dict)
示例#7
0
def buildLncRNAGeneSet(infiles, outfile):
    '''
    Build the lncRNA gene set.

    The set consists of transcripts from the abinitio set that do not
    overlap any protein coding or pseudogene transcripts, nor any of
    the additional unwanted ensembl biotypes (exons), in a reference
    gene set.

    Transcripts must reach a minimum length (200 bp according to the
    original description); the value used is
    PARAMS["lncrna_min_length"].

    infiles: the first five items are forwarded in order.
    '''
    required_inputs = [infiles[position] for position in range(5)]
    call_args = required_inputs + [outfile, PARAMS["lncrna_min_length"]]
    PipelineLncRNA.buildLncRNAGeneSet(*call_args)
def buildRefnoncodingGeneSet(infile, outfile):
    '''
    Filter the refnoncoding geneset for entries that ensembl
    describes as one of:

    Ambiguous_orf
    Retained_intron
    Sense_intronic
    antisense
    Sense_overlapping
    Processed transcript

    The filtering itself is done by
    PipelineLncRNA.buildRefnoncodingGeneSet.
    '''
    call_args = (infile, outfile)
    PipelineLncRNA.buildRefnoncodingGeneSet(*call_args)
 def buildFilteredLncRNAGeneSet(infiles, outfile):
     '''
     Creates a filtered lncRNA geneset that contains previously
     identified gene models supplied in contig file.

     infiles: the first item is the primary input; all remaining items
         are forwarded as a separate sequence.
     outfile: output path forwarded to the pipeline helper.

     Raises ValueError if PARAMS["filtering_remove_single_exon"] is
     not one of "loci", "transcripts" or None.
     '''
     allowed_modes = ("loci", "transcripts", None)
     filter_se = PARAMS["filtering_remove_single_exon"]
     # Explicit check instead of ``assert``: assertions are stripped
     # under ``python -O`` and an invalid setting would then be passed
     # straight through to the filter.
     if filter_se not in allowed_modes:
         raise ValueError(
             "PARAMS['filtering_remove_single_exon'] must be one of %r, "
             "got %r" % (allowed_modes, filter_se))
     PipelineLncRNA.buildFilteredLncRNAGeneSet(
         infiles[0],
         outfile,
         infiles[1:],
         filter_se=filter_se)
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a file of intervals and a gzipped MAF file containing
    pairwise alignments. Outputs a single fasta file containing aligned
    sequence for each interval.

    infiles: a pair (bed_file, maf_file).
    outfile: path forwarded to PipelineLncRNA.extractMAFGeneBlocks.

    The MAF file is decompressed into a temporary file under
    ./phyloCSF, which is removed again even when a step fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up `statement`, `to_cluster` and the names
        # interpolated into the statement from this function's local
        # variables - do not rename them; confirm against the P module.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]

        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # Always drop the decompressed copy; the existence check keeps
        # a cleanup failure from masking the original error.
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.

    The input directory is PARAMS["phyloCSF_location_axt"]; the `infiles`
    argument is not used in the body. Only files ending in "net.axt.gz"
    whose name does not match the regular expression
    PARAMS["phyloCSF_ignore"] are used.

    Two files are gzipped at the end: the filtered alignment (outfile
    without its ".gz" suffix) and a companion "<name>_removed.maf".

    NOTE(review): P.run() is called without arguments, so it presumably
    reads `statement`, `to_cluster` and the names interpolated into the
    statement from this function's local variables - keep those names
    intact; confirm against the P module.
    """
    # work on the uncompressed name; the file is gzipped in the last step
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    # collect the net axt files that are not excluded by the ignore pattern
    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # from here on axt_files is a space separated string, sorted by path,
    # ready to be interpolated into the shell statement
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    # tmpf1 receives the concatenated axt data, tmpf2 the converted maf
    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # decompress and concatenate the axt files into tmpf1, then convert
    # tmpf1 to MAF (tmpf2) with axtToMaf, prefixing target and query
    # sequence names with their genome name
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    # companion file path passed to the filter next to the main output
    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    # going by the log messages below, filtered[0] is the number of blocks
    # ignored as too short and filtered[1] the number of blocks written
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    # the intermediate files are only removed when every step above succeeded
    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    # compress both the filtered alignment and the companion file
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
def buildCodingGeneSet(infiles, outfile):
    '''
    Filter the cuffcompare output of a transcript assembly for
    annotated protein coding genes.

    NB "pruned" refers to nomenclature in the transcript building
    pipeline - transcripts that appear in at least two samples.

    An abinitio assembly will often contain fragments of known
    transcripts and describe them as novel. The default behaviour is
    therefore to produce a set composed of 'complete' or 'contained'
    transcripts, i.e. nothing novel. This may underestimate the
    number of transcripts that are actually expressed.

    infiles: the first two items are forwarded, in order, to
        PipelineLncRNA.buildCodingGeneSet.
    '''
    first_input, second_input = infiles[0], infiles[1]
    PipelineLncRNA.buildCodingGeneSet(first_input, second_input, outfile)
示例#13
0
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    Write a two-line, tab separated summary of a gene set to outfile.

    The first line is a header, the second holds the counts:
    no. of transcripts
    no. genes
    number of exons per transcript
    number of exons per gene
    no. single exon transcripts
    no. multi-exon transcripts
    no. single exon genes
    no. multi-exon genes

    The counts are computed on a temporary copy of infile that has
    been annotated by PipelineLncRNA.flagExonStatus.
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    # (column name, counter class) pairs, kept together so that header
    # and values cannot get out of step
    columns = (
        ("no_transcripts", PipelineLncRNA.CounterTranscripts),
        ("no_genes", PipelineLncRNA.CounterGenes),
        ("no_exons_per_transcript", PipelineLncRNA.CounterExonsPerTranscript),
        ("no_exons_per_gene", PipelineLncRNA.CounterExonsPerGene),
        ("no_single_exon_transcripts",
         PipelineLncRNA.CounterSingleExonTranscripts),
        ("no_multi_exon_transcripts",
         PipelineLncRNA.CounterMultiExonTranscripts),
        ("no_single_exon_genes", PipelineLncRNA.CounterSingleExonGenes),
        ("no_multi_exon_genes", PipelineLncRNA.CounterMultiExonGenes),
    )

    # the context manager guarantees the handle is flushed and closed
    # before the function returns (the handle used to be left open)
    with open(outfile, "w") as outf:
        outf.write("\t".join(name for name, _ in columns) + "\n")
        outf.write("\t".join(str(counter(tmpf).count())
                             for _, counter in columns))

    # remove the annotated temporary file, its ".log" companion and
    # the un-suffixed file name obtained from P.getTempFilename
    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
def extractControllLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for each interval of a control set.

    infiles: a pair (bed_file, maf_file); maf_file is gzipped.
    outfile: path forwarded to PipelineLncRNA.extractMAFGeneBlocks.

    The MAF file is decompressed into a temporary file under
    /ifs/scratch, which is removed again even when a step fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up `statement`, `to_cluster` and the names
        # interpolated into the statement from this function's local
        # variables - do not rename them; confirm against the P module.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]

        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # Always drop the decompressed copy; the existence check keeps
        # a cleanup failure from masking the original error.
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
def convertControlGTFToBed12(infile, outfile):
    """
    Convert a gtf file - either ensembl lincRNA or the control set -
    to bed12 format, using "transcript" as the third argument of
    PipelineLncRNA.gtfToBed12.
    """
    conversion_args = (infile, outfile, "transcript")
    PipelineLncRNA.gtfToBed12(*conversion_args)
def convertGTFToBed12(infile, outfile):
    """
    Turn lncrna_final.gtf.gz into lncrna_final.bed, using "transcript"
    as the third argument of PipelineLncRNA.gtfToBed12.
    """
    conversion_args = (infile, outfile, "transcript")
    PipelineLncRNA.gtfToBed12(*conversion_args)
示例#17
0
def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse PhyloCSF output and load it with an index on gene_id.

    infile is parsed by PipelineLncRNA.parsePhyloCSF into a temporary
    file under /ifs/scratch, which is then handed to P.load with the
    option "--add-index=gene_id". The temporary file is removed
    afterwards (it used to be left behind).

    NOTE(review): assumes P.load has finished reading the temporary
    file by the time it returns - confirm against the P module.
    """
    import os  # local import keeps this block self-contained

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--add-index=gene_id")
    finally:
        # the existence check keeps a cleanup failure from masking an
        # error raised by the parse or load step
        if os.path.exists(tmpf):
            os.unlink(tmpf)
示例#18
0
def convertControlGTFToBed12(infile, outfile):
    """
    Produce a bed12 file from a gtf file (ensembl lincRNA or control).

    The conversion is delegated to PipelineLncRNA.gtfToBed12, with
    "transcript" as its third argument.
    """
    feature = "transcript"
    PipelineLncRNA.gtfToBed12(infile, outfile, feature)
示例#19
0
文件: Summary.py 项目: santayana/cgat
    def __call__(self, track, slice=None):
        """
        Return the single and multi exon gene counts for a track.

        The counts are taken from gtfs/<track>.gtf.gz and returned as
        an ordered mapping with the keys "single_exon" and
        "multi_exon", in that order. The slice argument is not used.
        """
        gtf_path = os.path.join("gtfs", track) + ".gtf.gz"
        single_exon = PipelineLncRNA.CounterSingleExonGenes(gtf_path).count()
        multi_exon = PipelineLncRNA.CounterMultiExonGenes(gtf_path).count()
        return odict((("single_exon", single_exon),
                      ("multi_exon", multi_exon)))
def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset from the genes that are present in the
    abinitio assembly.

    Note that the first two items of infiles are handed to
    PipelineLncRNA.buildRefcodingGeneSet in swapped order
    (second item first).
    '''
    second_input, first_input = infiles[1], infiles[0]
    PipelineLncRNA.buildRefcodingGeneSet(second_input, first_input, outfile)
示例#21
0
def convertGTFToBed12(infile, outfile):
    """
    Transform lncrna_final.gtf.gz into lncrna_final.bed.

    The conversion is delegated to PipelineLncRNA.gtfToBed12, with
    "transcript" as its third argument.
    """
    feature = "transcript"
    PipelineLncRNA.gtfToBed12(infile, outfile, feature)
示例#22
0
def flagExonStatus(infile, outfile):
    """
    Add an attribute to each gtf entry that depends on whether the
    lncRNA is multi or single exon.

    The work is delegated to PipelineLncRNA.flagExonStatus.
    """
    flag_exons = PipelineLncRNA.flagExonStatus
    flag_exons(infile, outfile)
示例#23
0
def classifyFilteredLncRNA(infiles, outfile):
    """
    Classify all lincRNA before cpc filtering, in order to define any
    classes represented in the coding set that are filtered.

    The first two items of infiles are forwarded, in order, to
    PipelineLncRNA.classifyLncRNAGenes together with the distance
    PARAMS["lncrna_dist"].
    """
    first_input, second_input = infiles[0], infiles[1]
    max_dist = PARAMS["lncrna_dist"]
    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)
def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse PhyloCSF output and load it with an index on gene_id.

    infile is parsed by PipelineLncRNA.parsePhyloCSF into a temporary
    file under /ifs/scratch, which is then handed to P.load with the
    option "--index=gene_id". The temporary file is removed afterwards
    (it used to be left behind).

    NOTE(review): assumes P.load has finished reading the temporary
    file by the time it returns - confirm against the P module.
    """
    import os  # local import keeps this block self-contained

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--index=gene_id")
    finally:
        # the existence check keeps a cleanup failure from masking an
        # error raised by the parse or load step
        if os.path.exists(tmpf):
            os.unlink(tmpf)
示例#25
0
def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset based on the genes found in the
    abinitio assembly.

    The helper receives the second item of infiles first and the
    first item second.
    '''
    swapped_inputs = (infiles[1], infiles[0])
    PipelineLncRNA.buildRefcodingGeneSet(*(swapped_inputs + (outfile,)))