def classifyLncRNA(infiles, outfile):
    """Classify lncRNA relative to protein coding loci.

    Hands the first two entries of ``infiles``, ``outfile`` and the
    ``lncrna_dist`` value from ``PARAMS`` to
    ``PipelineLncRNA.classifyLncRNAGenes``.

    Notes carried over from the original author: indices for intervals are
    created on the fly, and the classes below were listed as annotations
    that "maybe should" be created -- confirm against the helper which of
    them are actually produced.

    antisense
        transcript overlapping protein coding exons on opposite strand
    antisense_upstream
        transcript < 2kb from tss on opposite strand
    antisense_downstream
        transcript < 2kb from gene end on opposite strand
    sense_upstream
        transcript < 2kb from tss on same strand
    sense_downstream
        transcript < 2kb from gene end on same strand
    intergenic
        transcript > 2kb from any protein coding gene
    intronic
        overlaps protein coding gene intron on same strand
    antisense_intronic
        overlaps protein coding intron on opposite strand
    """
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]
    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)
def flagExonStatus(infile, outfile):
    """Flag gtf entries with their exon status.

    Thin wrapper around ``PipelineLncRNA.flagExonStatus``.  Per the original
    notes, two attributes are added to each gtf entry:

    exon_status_locus
        whether the gene model is multi- or single exon
    exon_status
        whether the transcript is multi- or single exon
    """
    flag_exon_status = PipelineLncRNA.flagExonStatus
    flag_exon_status(infile, outfile)
def classifyFilteredLncRNA(infiles, outfile):
    """Classify all lincRNA before cpc filtering.

    The purpose stated by the original author is to define any classes that
    are represented in the coding set that is filtered out.  The work is
    done by ``PipelineLncRNA.classifyLncRNAGenes``, which receives the first
    two entries of ``infiles``, ``outfile`` and ``PARAMS["lncrna_dist"]``.

    NOTE: This task is not included when running the full pipeline.
    """
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]
    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)
def buildFilteredLncRNAGeneSet(infiles, outfile):
    """Create a filtered lncRNA geneset.

    Per the original notes, the geneset will not include any single exon
    lncRNA unless it has been seen previously, i.e. it overlaps a previously
    identified lncRNA.

    ``infiles[0]`` is passed to the helper as its first argument and every
    remaining entry of ``infiles`` is passed as the third argument.
    """
    primary_input = infiles[0]
    remaining_inputs = infiles[1:]
    PipelineLncRNA.buildFilteredLncRNAGeneSet(
        primary_input, outfile, remaining_inputs)
def buildFinalLncRNAGeneSet(infile, outfile):
    """Build the final lncRNA gene set.

    Per the original notes, the final set consists of transcripts that pass
    the initial filtering stage, i.e. that are:

    - multi-exonic, or previously seen single exon transcripts
    - displaying low evidence for coding potential
    """
    # filter based on coding potential
    cpc_result = "lncrna_filtered_cpc_result"
    cpc_setting = PARAMS["filtering_cpc"]
    PipelineLncRNA.buildFinalLncRNAGeneSet(
        infile, cpc_result, outfile, cpc_setting)
def splitLncRNAFasta(infile, outfiles):
    """Split an aligned fasta file, renaming species headers on the way.

    ``PARAMS["phyloCSF_map_species_names"]`` is read as a comma-separated
    list of ``old:new`` pairs.  Each side is prefixed with ``>`` and the
    resulting mapping is passed, together with ``infile`` and the fixed
    output directory, to ``PipelineLncRNA.splitAlignedFasta``.

    ``outfiles`` is not used inside this function.
    """
    out_dir = "./phyloCSF/lncrna_fasta"

    raw_mappings = PARAMS["phyloCSF_map_species_names"].split(",")
    split_pairs = [entry.split(":") for entry in raw_mappings]
    # Index rather than unpack so that only the first two fields are used.
    name_dict = dict(
        (">" + fields[0], ">" + fields[1]) for fields in split_pairs)

    E.info("Name mapping: %s" % name_dict)
    PipelineLncRNA.splitAlignedFasta(infile, out_dir, name_dict)
def buildLncRNAGeneSet(infiles, outfile):
    """Build the lncRNA gene set.

    Per the original notes, this is the set of transcripts in the abinitio
    set that do not overlap any protein coding or pseudogene transcripts, or
    additional unwanted ensembl biotypes (exons), in a reference gene set.
    Transcripts need to have a length of at least 200 bp.

    The first five entries of ``infiles`` are forwarded in order, followed
    by ``outfile`` and ``PARAMS["lncrna_min_length"]``.
    """
    input_0 = infiles[0]
    input_1 = infiles[1]
    input_2 = infiles[2]
    input_3 = infiles[3]
    input_4 = infiles[4]
    min_length = PARAMS["lncrna_min_length"]
    PipelineLncRNA.buildLncRNAGeneSet(
        input_0, input_1, input_2, input_3, input_4, outfile, min_length)
def buildRefnoncodingGeneSet(infile, outfile):
    """Filter the refnoncoding geneset.

    Per the original notes, the filter targets entries that ensembl
    describes as:

    - Ambiguous_orf
    - Retained_intron
    - Sense_intronic
    - antisense
    - Sense_overlapping
    - Processed transcript

    The filtering itself lives in
    ``PipelineLncRNA.buildRefnoncodingGeneSet``.
    """
    build_geneset = PipelineLncRNA.buildRefnoncodingGeneSet
    build_geneset(infile, outfile)
def buildFilteredLncRNAGeneSet(infiles, outfile):
    """Create a filtered lncRNA geneset.

    Per the original notes, the result contains previously identified gene
    models supplied in the contig file.

    ``infiles[0]`` is passed as the helper's first argument and all
    remaining entries of ``infiles`` as its third; the
    ``filtering_remove_single_exon`` setting is forwarded as ``filter_se``.

    Raises:
        ValueError: if ``PARAMS["filtering_remove_single_exon"]`` is not one
            of ``"loci"``, ``"transcripts"`` or ``None``.
    """
    filter_se = PARAMS["filtering_remove_single_exon"]
    # Validate explicitly: an ``assert`` would be stripped under ``python -O``
    # and a bad setting would then reach the helper unchecked.
    if filter_se not in ("loci", "transcripts", None):
        raise ValueError(
            "filtering_remove_single_exon must be 'loci', 'transcripts' "
            "or None, got %r" % (filter_se,))

    PipelineLncRNA.buildFilteredLncRNAGeneSet(
        infiles[0],
        outfile,
        infiles[1:],
        filter_se=filter_se)
def extractLncRNAFastaAlignments(infiles, outfile):
    """Extract aligned sequence for each interval into one fasta file.

    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals (``infiles`` is unpacked as
    ``bed_file, maf_file``).  The MAF file is decompressed to a temporary
    file under ``./phyloCSF`` and handed to
    ``PipelineLncRNA.extractMAFGeneBlocks``; the number it returns is logged.

    The temporary file is removed even when decompression or extraction
    fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")

    try:
        # NOTE(review): P.run() is called without arguments; it presumably
        # picks up `statement`, `to_cluster` and the %(...)s names from this
        # function's local variables -- do not rename them.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]
        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # Guard the unlink so a missing file cannot mask the real error.
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
def createMAFAlignment(infiles, outfile):
    """Build a single, length-filtered, gzipped MAF file from axt files.

    Takes all ``*net.axt.gz`` files in the directory named by
    ``PARAMS["phyloCSF_location_axt"]``, drops those matching the regular
    expression in ``PARAMS["phyloCSF_ignore"]``, converts the rest to a
    single maf file using ``axtToMaf``, and filters the maf alignments via
    ``PipelineLncRNA.filterMAF``.  Both the filtered file and the file of
    removed blocks are gzipped at the end.

    ``infiles`` is not used inside this function.  ``outfile`` is expected
    to end in ``.gz``; the suffix is stripped while working and re-created
    by the final ``gzip``.

    Raises:
        ValueError: if no axt file is left after filtering.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    selected = sorted(
        os.path.join(axt_dir, axt_file)
        for axt_file in os.listdir(axt_dir)
        if axt_file.endswith("net.axt.gz")
        and not to_ignore.search(axt_file))
    if not selected:
        # zcat without file operands would read from standard input.
        raise ValueError(
            "no net.axt.gz files left in %s after filtering" % axt_dir)
    axt_files = " ".join(selected)
    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")

    # NOTE(review): P.run() is called without arguments; it presumably reads
    # `statement`, `to_cluster` and the %(...)s names from this function's
    # local variables -- do not rename them.
    to_cluster = False
    try:
        # concatenate axt files, then convert them to maf
        statement = ("zcat %(axt_files)s"
                     " > %(tmpf1)s;"
                     " axtToMaf "
                     " -tPrefix=%(target_genome)s."
                     " -qPrefix=%(query_genome)s."
                     " %(tmpf1)s"
                     " %(target_contigs)s"
                     " %(query_contigs)s"
                     " %(tmpf2)s")
        P.run()

        E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
        E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

        removed = P.snip(outfile, ".maf") + "_removed.maf"

        filtered = PipelineLncRNA.filterMAF(tmpf2,
                                            outfile,
                                            removed,
                                            PARAMS["phyloCSF_filter_alignments"])
        E.info("%s blocks were ignored in MAF alignment"
               " because length of target alignment was too short" %
               filtered[0])
        E.info("%s blocks were output to filtered MAF alignment" %
               filtered[1])
    finally:
        # Remove the intermediates on failure as well as on success.
        for tmp_path in (tmpf1, tmpf2):
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
def buildCodingGeneSet(infiles, outfile):
    """Filter a cuffcompare'd transcript assembly for protein coding genes.

    Notes carried over from the original author:

    - "pruned" refers to nomenclature in the transcript building pipeline:
      transcripts that appear in at least two samples.
    - Because an abinitio assembly will often contain fragments of known
      transcripts and describe them as novel, the default behaviour is to
      produce a set composed of 'complete' or 'contained' transcripts,
      i.e. nothing novel.  This may underestimate the number of transcripts
      that are actually expressed.

    The first two entries of ``infiles`` and ``outfile`` are forwarded to
    ``PipelineLncRNA.buildCodingGeneSet``.
    """
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineLncRNA.buildCodingGeneSet(first_input, second_input, outfile)
def buildRefcodingGeneSetStats(infile, outfile):
    """Write summary counts for a gene set as a two-line, tab-separated table.

    Columns, in order:

    - no. of transcripts
    - no. of genes
    - average number of exons per transcript
    - average number of exons per gene
    - no. single exon transcripts
    - no. multi-exon transcripts
    - no. single exon genes
    - no. multi-exon genes

    (The original notes describe this as covering the coding and lncRNA
    genesets.)

    Exon status is first flagged into a temporary gzipped file; the
    temporary file, its ``.log`` companion and the un-suffixed temp name
    are removed afterwards, also when a step fails.  The data row is
    written without a trailing newline, as before.
    """
    header = ["no_transcripts",
              "no_genes",
              "no_exons_per_transcript",
              "no_exons_per_gene",
              "no_single_exon_transcripts",
              "no_multi_exon_transcripts",
              "no_single_exon_genes",
              "no_multi_exon_genes"]
    # Same order as the header columns.
    counters = [PipelineLncRNA.CounterTranscripts,
                PipelineLncRNA.CounterGenes,
                PipelineLncRNA.CounterExonsPerTranscript,
                PipelineLncRNA.CounterExonsPerGene,
                PipelineLncRNA.CounterSingleExonTranscripts,
                PipelineLncRNA.CounterMultiExonTranscripts,
                PipelineLncRNA.CounterSingleExonGenes,
                PipelineLncRNA.CounterMultiExonGenes]

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    try:
        PipelineLncRNA.flagExonStatus(infile, tmpf)

        # Compute every count before touching outfile so that a failing
        # counter does not leave a header-only result behind.
        values = [str(counter(tmpf).count()) for counter in counters]

        with open(outfile, "w") as outf:
            outf.write("\t".join(header) + "\n")
            outf.write("\t".join(values))
    finally:
        for tmp_path in (tmpf, tmpf + ".log", P.snip(tmpf, ".gz")):
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
def extractControllLncRNAFastaAlignments(infiles, outfile):
    """Extract aligned sequence for each control interval into a fasta file.

    ``infiles`` is unpacked as ``bed_file, maf_file``.  The MAF file is
    decompressed to a temporary file under ``/ifs/scratch`` and handed to
    ``PipelineLncRNA.extractMAFGeneBlocks``; the number it returns is logged.

    The temporary file is removed even when decompression or extraction
    fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")

    try:
        # NOTE(review): P.run() is called without arguments; it presumably
        # picks up `statement`, `to_cluster` and the %(...)s names from this
        # function's local variables -- do not rename them.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]
        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # Guard the unlink so a missing file cannot mask the real error.
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
def convertControlGTFToBed12(infile, outfile):
    """Convert either the ensembl lincRNA or the control gtf to bed12 format.

    Calls ``PipelineLncRNA.gtfToBed12`` with ``"transcript"`` as its third
    argument.
    """
    to_bed12 = PipelineLncRNA.gtfToBed12
    to_bed12(infile, outfile, "transcript")
def convertGTFToBed12(infile, outfile):
    """Transform the lncrna_final.gtf.gz into lncrna_final.bed.

    Calls ``PipelineLncRNA.gtfToBed12`` with ``"transcript"`` as its third
    argument.
    """
    to_bed12 = PipelineLncRNA.gtfToBed12
    to_bed12(infile, outfile, "transcript")
def loadLncRNAPhyloCSF(infile, outfile):
    """Parse PhyloCSF output and load it, indexed on ``gene_id``.

    ``PipelineLncRNA.parsePhyloCSF`` writes its result to a temporary file
    under ``/ifs/scratch``, which is then passed to ``P.load`` with
    ``--add-index=gene_id``.  The temporary file is removed afterwards,
    also when parsing or loading fails.
    """
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        # NOTE(review): assumes P.load has finished reading tmpf when it
        # returns -- confirm before relying on the cleanup below.
        P.load(tmpf, outfile, options="--add-index=gene_id")
    finally:
        if os.path.exists(tmpf):
            os.unlink(tmpf)
def __call__(self, track, slice=None):
    """Return single- and multi-exon gene counts for ``track``.

    The counts are taken from ``gtfs/<track>.gtf.gz`` and returned as an
    ``odict`` with the keys ``single_exon`` and ``multi_exon``, in that
    order.  ``slice`` is accepted but not used here.
    """
    gtf_path = os.path.join("gtfs", track) + ".gtf.gz"
    single_exon = PipelineLncRNA.CounterSingleExonGenes(gtf_path).count()
    multi_exon = PipelineLncRNA.CounterMultiExonGenes(gtf_path).count()
    return odict((("single_exon", single_exon),
                  ("multi_exon", multi_exon)))
def buildRefcodingGeneSet(infiles, outfile):
    """Build a refcoding geneset from genes present in the abinitio assembly.

    Note that the two inputs are handed to
    ``PipelineLncRNA.buildRefcodingGeneSet`` in swapped order: the second
    entry of ``infiles`` goes first, then the first entry.
    """
    second_input = infiles[1]
    first_input = infiles[0]
    PipelineLncRNA.buildRefcodingGeneSet(second_input, first_input, outfile)
def flagExonStatus(infile, outfile):
    """Flag gtf entries with their exon status.

    Per the original notes, an attribute is added to each gtf entry
    depending on whether the lncRNA is multi or single exon.  The work is
    done by ``PipelineLncRNA.flagExonStatus``.
    """
    flag_exon_status = PipelineLncRNA.flagExonStatus
    flag_exon_status(infile, outfile)
def classifyFilteredLncRNA(infiles, outfile):
    """Classify all lincRNA before cpc filtering.

    The purpose stated by the original author is to define any classes that
    are represented in the coding set that is filtered out.  The work is
    done by ``PipelineLncRNA.classifyLncRNAGenes``, which receives the first
    two entries of ``infiles``, ``outfile`` and ``PARAMS["lncrna_dist"]``.
    """
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]
    PipelineLncRNA.classifyLncRNAGenes(
        first_input, second_input, outfile, dist=max_dist)
def loadLncRNAPhyloCSF(infile, outfile):
    """Parse PhyloCSF output and load it, indexed on ``gene_id``.

    ``PipelineLncRNA.parsePhyloCSF`` writes its result to a temporary file
    under ``/ifs/scratch``, which is then passed to ``P.load`` with
    ``--index=gene_id``.  The temporary file is removed afterwards, also
    when parsing or loading fails.
    """
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        # NOTE(review): assumes P.load has finished reading tmpf when it
        # returns -- confirm before relying on the cleanup below.
        P.load(tmpf, outfile, options="--index=gene_id")
    finally:
        if os.path.exists(tmpf):
            os.unlink(tmpf)