def each_exon_rpkm(list_exons, bam_reader, library_size):
    """
    Compute, for every exon, the read density from unique reads and from all reads.

    Args:
        list_exons = array of Exon objects; each must provide str_genomic_pos(), whose
            return value is used both as the genomic range to quantify and as the
            key of the returned hash (format = chrom:start-end)
        bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads
            that map to each exon (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file))
        library_size = integer that is the number of mapped reads in the sample
    Returns:
        hash where k = exon position string, v = hash with 2 values:
            -'rpkm_uniq' = read density calculated from the unique-read count
            -'rpkm_all' = read density calculated from the all-read count
        If two exons share the same position string, the later one overwrites the earlier.
    """
    hash_exon_rpkm = {}
    for each_exon in list_exons:
        # Build the position string once; it is the query range AND the result key.
        str_exon_pos = each_exon.str_genomic_pos()

        # NOTE(review): the previous code passed True for BOTH counts, so 'rpkm_all'
        # was always identical to 'rpkm_uniq'. This assumes the third argument of
        # Isoform.quant_genes_rpkm() selects unique-only counting (True) versus all
        # reads (False) -- confirm against its definition.
        exon_count_uniq = Isoform.quant_genes_rpkm(bam_reader, str_exon_pos, True)
        exon_count_all = Isoform.quant_genes_rpkm(bam_reader, str_exon_pos, False)

        hash_exon_rpkm[str_exon_pos] = {
            'rpkm_uniq': Isoform.calc_read_density(
                exon_count_uniq, str_exon_pos, library_size),
            'rpkm_all': Isoform.calc_read_density(
                exon_count_all, str_exon_pos, library_size),
        }

    return hash_exon_rpkm
def sj_read_support_TEST(bam_reader, genomic_range): """ Just playing around """ hash_gr = Isoform.split_genome_pos( genomic_range) #hash_gr = hash table of genomic range #these are the counts count = 0 #count all reads that map to "genomic_range" uniq_count = 0 #count all uniquely mapped reads to "genomic_range" align_score_max_count = 0 #this also counts all uniquely mapped reads to "genomic_range" for i, a in enumerate(bam_reader.fetch(region=genomic_range)): # score = 0 # if a.optional_field( "NH" ) == 1: # score += 1 print i, "read ", i, " - ", a print i, "dir(a) = ", dir(a) print i, "a.aligned = ", a.aligned print i, "a.read = ", a.read print i, "a.read_as_aligned = ", a.read_as_aligned print i, "a._read = ", a._read print i, "a._read_as_sequenced = ", a._read_as_sequenced print i, "a.get_sam_line = ", a.get_sam_line( ) #this retrieves the read information & presents it as a .sam file line print i, ": a.cigar = ", a.cigar #this is an array that contains information for each cigar. For example, if the CIGAR is 99M, then there is only 1 element in array "a.cigar" [99M. If CIGAR is 2M4926N74M, then there are 3 elements in the array "a.cigar" [2M, 4926N, 74M] #I can split the information for each read by splitting by tab-delimiter '\t' sam_line = a.get_sam_line() list_sam = sam_line.split('\t') print i, "list_sam = ", list_sam print "----------------\n"
def create_obj_sj(hash_sj_info):
    """
    Build a SpliceJunction instance from one splice-junction record.

    Args:
        hash_sj_info = record indexed by column label (e.g. one row of a pandas
            DataFrame). The keys read here are: 'sj_id', 'sj_range', 'strand',
            'read_count', 'gene_name', 'prevalence_all', 'prevalence_control'
    Function: parses 'sj_range' with Isoform.split_genome_pos(), converts the
        numeric strand (-1 -> '-', anything else -> '+') to string form, and
        returns the SpliceJunction. The isoform ID is always passed as None and
        the intronic flag is always passed as True.
    """
    junction_id = hash_sj_info['sj_id']

    # split the range string into its chrom / start / end parts
    range_parts = Isoform.split_genome_pos(hash_sj_info['sj_range'])
    sj_chrom = range_parts['chrom']
    sj_start = range_parts['start']
    sj_end = range_parts['end']

    # strand needs to be in string format
    if int(hash_sj_info['strand']) == -1:
        strand_symbol = '-'
    else:
        strand_symbol = '+'

    supporting_reads = hash_sj_info['read_count']
    gene_symbol = hash_sj_info['gene_name']
    prevalence_samples = hash_sj_info['prevalence_all']
    prevalence_controls = hash_sj_info['prevalence_control']

    return SpliceJunction(
        junction_id,
        sj_chrom,
        sj_start,
        sj_end,
        strand_symbol,
        supporting_reads,
        gene_symbol,
        None,  # isoform_id: not taken from the record
        prevalence_samples,
        prevalence_controls,
        True,  # bool_intronic: hard-coded
    )
def all_exons_rpkm(list_exons, bam_reader, library_size):
    """
    Return the read density (RPKM) of every exon, in input order.

    Args:
        list_exons = array of Exon objects; each must provide str_genomic_pos()
        bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads
            that map to each exon (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file))
        library_size = integer that is the number of mapped reads in the sample
    Returns:
        array with one value per exon: the result of Isoform.calc_read_density()
        applied to the count from Isoform.quant_genes_rpkm(..., True)
    """
    # For each exon: count reads over its range first, then convert that count
    # to a density. Arguments are evaluated left to right, so the count is
    # obtained before the second str_genomic_pos() call.
    return [
        Isoform.calc_read_density(
            Isoform.quant_genes_rpkm(bam_reader, exon.str_genomic_pos(), True),
            exon.str_genomic_pos(),
            library_size)
        for exon in list_exons
    ]
def sj_read_support_pysam(pysam_file, genomic_range, uniq_only=False):
    """
    Count the reads that support the splice junction at 'genomic_range'.

    Args:
        pysam_file = pysam.AlignmentFile that opens up the mapped reads bam file
            (e.g. accepted_hits.bam)
        genomic_range = string that is the position of interest (format = chrom:start-end)
        uniq_only = boolean
            -True = only reads with mapq == 50 are examined; 'all_count' stays 0
            -False = every supporting read is counted in 'all_count', and those
                with mapq == 50 are also counted in 'unique_count'
    Function: a read supports the junction when the start coordinate is an
        endpoint of at least one of its aligned blocks AND the end coordinate is
        an endpoint of at least one of its aligned blocks.
        mapq == 50 is treated as "uniquely mapped" (presumably the aligner's
        convention for unique hits -- confirm for the aligner in use).
        Original author's note: much faster (maybe 6x) than SpliceJunction's
        sj_read_support().
    Returns:
        hash {'all_count': int, 'unique_count': int}
    """
    region = Isoform.split_genome_pos(genomic_range)
    total_supporting = 0
    unique_supporting = 0

    for read in pysam_file.fetch(region['chrom'], region['start'], region['end']):
        # unique-only mode rejects multi-mapped reads before inspecting blocks
        if uniq_only and read.mapq != 50:
            continue

        # both junction coordinates must coincide with a block boundary
        if not any(region['start'] in block for block in read.blocks):
            continue
        if not any(region['end'] in block for block in read.blocks):
            continue

        if uniq_only:
            unique_supporting += 1
            continue

        total_supporting += 1
        if read.mapq == 50:
            unique_supporting += 1

    return {'all_count': total_supporting, 'unique_count': unique_supporting}
#/usr/bin/python import sys from cruzdb import Genome sys.path.insert(0, "/home/mokha/Documents/Krauthammer_Lab/PythonClasses") from SVSv5 import Exon, Isoform, MultiIsoform, SpliceJunction, IsoformSJ print "------------ Algorithm: 160919_Isoform_1.py ------------" """ Reconstruct transcripts based on Splice Junctions """ #assign all splice junctions to specific gene: go through cruzdb & find end points for each gene --> assign # g = Genome( 'sqlite:////tmp/hg19.db' ) g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) #retrieve gene & print information on it based on gene = g.refGene.filter_by(name2='BRAF').all() # all_genes = g.refGene.filter_by( name2 = 'TTN' ).first() # all_genes = g.refGene.filter_by( name2 = 'AGRN' ).all() # all_genes = g.refGene.filter_by( name2 = 'AGRN' ).first() # all_genes = g.refGene.filter_by( name2 = 'DIXDC1' ).all() for each_isoform in gene: obj_iso = Isoform(each_isoform.name) #print name print obj_iso.isoform_id, ":", obj_iso.gene_sym print "obj_iso = ", obj_iso
DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv5" DIR_DATA = DIR_CURR + "/TestData" DIR_RESULTS = DIR_CURR + "/TestResults" # DIR_RESULTS = DIR_CURR + "/Results/160729_Analyze_KF" # DIR_RESULTS = DIR_CURR + "/Results/160731_Analyze_KF" # DIR_RESULTS = DIR_CURR + "/Results/160909_Analyze_KF" DIR_FUSION = DIR_PROJ + "/160510_GeneFusions" print "------------ TDD: 161002_IsoformFusion_1.py ------------" #set kinase gene annotation file KinaseFusion.set_kinasefile( DIR_FUSION + "/Data/160910_KinaseAnnots_hg38_Final.txt" ) obj_cruzdb = Genome( 'sqlite:////tmp/hg38_v2.db' ) #set cruzdb Genome database instance Isoform.set_cruzdb( obj_cruzdb ) #CASE: This returns "None" for the kinase domain for the kinase gene (TLK2 - NM_001284363) #Fusion - ASIC2:TLK2 hash_multi_isoform = { "orientation": 'fr', "chrom_start": 'chr17', "chrom_end": 'chr17', "pos_start": 34038904, "pos_end": 62565136, "read_span": 5, "read_matepair": 5, "read_matepair_break": 5 } obj_mif = MultiIsoformFusion( hash_multi_isoform ) #MIF = MultiIsoform Fusion instance for i, (k,v) in enumerate( obj_mif.isoform_fusions.iteritems() ): #k = isoformIDs in fusion (format: isoformID_1:isoformID_2), v = IsoformFusion instance
def sj_read_support(bam_reader, genomic_range): """ Args: bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads that map to SJ genomic range 'genomic_range' (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) ) genomic_range = string in format chrom:start-end. If None, then uses the SJ position recorded in "self" Function: finds reads that support splice junctions by finding reads that uniquely map to splice junction position """ # if not genomic_range: # genomic_range = self.chrom + ':' + str( self.start ) + '-' + str( self.end ) hash_gr = Isoform.split_genome_pos( genomic_range) #hash_gr = hash table of genomic range #these are the counts count = 0 #count all reads that map to "genomic_range" uniq_count = 0 #count all uniquely mapped reads to "genomic_range" align_score_max_count = 0 #this also counts all uniquely mapped reads to "genomic_range" for i, a in enumerate(bam_reader.fetch(region=genomic_range)): # score = 0 # if a.optional_field( "NH" ) == 1: # score += 1 """ NOTE: a.cigar breaks down the meaning for the cigar, finding the end position for each cigar. 
""" for cigop in a.cigar: if cigop.type == 'N' and cigop.ref_iv.start == hash_gr[ 'start'] and cigop.ref_iv.end == hash_gr['end']: print i, ": a.get_sam_line = ", a.get_sam_line( ) #this retrieves the read information & presents it as a sam line print i, ": cigop = ", cigop print i, ": a.cigar = ", a.cigar print i, ">>>>>>>>>>>>>>>>>>>>\n" # if a.optional_field( "NH" ) == 1: #check if read is uniquely-mapped read ##TEST:: see the output of read print i, ": a = ", dir( a) #see all possible properties of object print i, ": a.get_sam_line = ", a.get_sam_line( ) #this retrieves the read information & presents it as a sam line print i, ": a.from_SAM_line = ", a.from_SAM_line print i, ": dir( a.from_SAM_line ) = ", dir(a.from_SAM_line) print i, ": a.aligned = ", a.aligned print i, ": a.flag = ", a.flag print i, ": a.get_sam_line = ", a.get_sam_line, " & dir = ", dir( a.get_sam_line) print i, ": a.read = ", a.read #this is the nucleotide sequence print i, ": a.read_as_aligned = ", a.read_as_aligned #this is the nucleotide sequence. 
"a.read_as_aligned" could be the reverse-complement to "a.read" print i, ": a.from_pysam_AlignedRead = ", a.from_pysam_AlignedRead, " & dir = ", dir( a.from_pysam_AlignedRead) print i, ": NH = ", a.optional_field("NH") print i, ": aligned = ", a.aligned, " & aQual = ", a.aQual, " & read quality = ", a.read.qual print i, ": dir( a.cigar ) = ", dir(a.cigar) print i, ": dir( cigop ) = ", dir(cigop) print i, ": cigop.ref_iv = ", cigop.ref_iv, " & chrom = ", cigop.ref_iv.chrom, " & start = ", cigop.ref_iv.start, " & end = ", cigop.ref_iv.end print i, ": cigop.size = ", cigop.size print i, ": cigop.type = ", cigop.type print i, ": cigop.check = ", cigop.check, " & dir = ", dir( cigop.check) print i, ": cigop.query_from = ", cigop.query_from #CONJ: I think this refers the start of the range of nucleotides that map to the genome print i, ": cigop.query_to = ", cigop.query_to #CONJ: I think this refers the end of the range of nucleotides that map to the genome #I can split the information for each read by splitting by tab-delimiter '\t' sam_line = a.get_sam_line() list_sam = sam_line.split('\t') print i, "list_sam = ", list_sam print '------------------\n\n' count += 1 if a.optional_field("NH") == 1: uniq_count += 1 #this also counts uniquely mapped reads if a.aQual == 50: align_score_max_count += 1 ##TEST:: # print " | total reads = ", count, # print " | unique map = ", count_uniq, # print " | quality 50 count = ", align_score_max_count return { "all_count": count, "unique_count": uniq_count, "unique_count_50": align_score_max_count }
def sj_read_support_variety_reads(bam_reader, genomic_range): """ Args: bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads that map to SJ genomic range 'genomic_range' (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) ) genomic_range = string in format chrom:start-end. If None, then uses the SJ position recorded in "self" Function: finds reads that support splice junctions by finding reads that uniquely map to splice junction position Output: {"all_count": count, "unique_count": uniq_count, "unique_count_50": align_score_max_count, "count_all_variety_reads": len( list_variety_reads ), "count_unique_variety_reads": len( unique_variety_reads ) } returns a hash with the following values: -"all_count" = all reads that support splicing event with position 'genomic_range' -"unique_count" = uniquely-mapped reads supporting splicing event with position 'genomic_range'. This looks for NH == 1 -"unique_count_50" = same as "unique_count", but looks at quality of read (aQual == 50), where 50 means uniquely mapped reads -see "Protocol: 15.10.30 - Samtools" -50 (or 255): unique mapping (NH:i:1) -3: maps to 2 locations in the target (NH:i:2, but I’ve also seen NH:i:3) -2: maps to 3 locations -1: maps to 4-9 locations (NH:i:4 or higher) -0: maps to 10 or more locations TO RETRIEVE UNIQUELY MAPPED READS: use command “samtools -q 4 file.bam” means any values above 4 are unique, where for tophat2 values 0 <= x <= 3 means multiple mapping at 50 means unique -"count_all_variety_reads" = total count of all reads with different end positions that support the splicing event. This is considering all reads, therefore there will be duplicates (meaning they will have the same start & end points - think of "thickBlocks" & "thinBlocks" for UCSC Genome Browser) -"count_unique_variety_reads" = count of # of reads with different end positions. This is the actually number of reads that support the splicing event & has different end points. 
-The hypothesis is the reads with the same end points are just the same RNA fragments sequenced during RNA-sequencing, therefore the more different types of reads supporting a splicing event, the more convincine the support. """ # if not genomic_range: # genomic_range = self.chrom + ':' + str( self.start ) + '-' + str( self.end ) hash_gr = Isoform.split_genome_pos( genomic_range) #hash_gr = hash table of genomic range #these are the counts count = 0 #count all reads that map to "genomic_range" uniq_count = 0 #count all uniquely mapped reads to "genomic_range" align_score_max_count = 0 #this also counts all uniquely mapped reads to "genomic_range" list_variety_reads = [] for i, a in enumerate(bam_reader.fetch(region=genomic_range)): # score = 0 # if a.optional_field( "NH" ) == 1: # score += 1 """ NOTE: a.cigar breaks down the meaning for the cigar, finding the end position for each cigar. """ for cigop in a.cigar: if cigop.type == 'N' and cigop.ref_iv.start == hash_gr[ 'start'] and cigop.ref_iv.end == hash_gr['end']: #if splicing event matches end positions #I can split the information for each read by splitting by tab-delimiter '\t' sam_line = a.get_sam_line() list_sam = sam_line.split('\t') str_read_info = list_sam[2] + "|" + list_sam[ 3] + "|" + list_sam[5] list_variety_reads.append(str_read_info) count += 1 if a.optional_field("NH") == 1: uniq_count += 1 #this also counts uniquely mapped reads if a.aQual == 50: align_score_max_count += 1 unique_variety_reads = list(set(list_variety_reads)) print "show unique_variety_reads: " print unique_variety_reads print "list_variety_reads = ", len(list_variety_reads) print "# of unique variety reads = ", len(unique_variety_reads)
obj_tt.arr_nuc_seq[i_tp_start:] ) #retrieve from tp_start until the last nucleotide ##TEST:: print "NMD_IRREL: strand = ", obj_tt.iso_sj.strand, " | i_tp_pux = ", i_tp_pux, " | i_tp_start = ", i_tp_start, " seq = ", nmd_irrelevant_seq return nmd_irrelevant_seq print "------------ TDD: 170325_ExtractMutatedAASeq.py ------------" """ Algorithm: this tests -this algorithm will determine the mutated amino acid sequence based on aberrant splicing. """ g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) #show the end positions for each SJ get_iso_id = 'NM_001008844' get_isoform = g.refGene.filter_by(name=get_iso_id).first() print "for gene = ", get_isoform.name2, " & isoform = ", get_isoform.name for i, intron in get_isoform.introns: print "intron ", i, " - ", intron #Test SJ 2 # sj_id = 'JUNC00090748' # sj_range = 'chr12:56121123-56122061' # strand = -1 # read_count = 13 # gene_sym = 'CD63' # isoform_id = 'NM_001257400'
intron_retain_info = { 'ric_1': ric_1, 'ric_2': ric_2, 'intron_retained_1': express_intron_1[0], 'intron_retained_2': express_intron_2[0], 'constitutive_exons_express': constitutive_exons_express } return intron_retain_info print "------------ TDD: 161109_Overlapping_Elements_V2.py ------------" """ Algorithm: this will test the functions in MultiIsoform -> find_overlapping_elements(), """ g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) sample_name = 'yuhimo' path_bam = DIR_RNASEQ + '/tophat_sample_' + sample_name + '/accepted_hits.bam' bam_reader = HTSeq.BAM_Reader(path_bam) #find overlapping splice junctions # gene_sym = 'CDK11B' # sj_pos = 'chr1:1573952-1575753' gene_sym = 'MLIP' isoform_info = g.refGene.filter_by(name2=gene_sym).all() isoform_id = isoform_info[0].name sj_pos = 'chr6:54025230-54034325' hash_pos = Isoform.split_genome_pos(sj_pos) chrom = hash_pos['chrom']
hash_c3 = { 'ratio_skip_constitutive': ratio_skip_constitutive, 'exons_skipped_express': exons_skipped_express, 'constitutive_exons_express': constitutive_exons_express, 'skip_exons_rpkm': skip_exons_rpkm, 'c_exons_rpkm': c_exons_rpkm, 'alt_exons_rpkm': alt_exons_rpkm } return hash_c3 print "------------ TDD: 161111_SJ_Metrics.py ------------" g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) # sample_name = 'yuhimo' # sample_name = 'yunige' # sample_name = 'gopik' #parameters for setting up classes #example 1 # sample_name = 'yunige' # gene_sym = 'MLIP' # sj_pos = 'chr6:54025230-54034325' #example 2 sample_name = 'yufulo' gene_sym = 'MITF' sj_pos = 'chr3:70014008-70014109'
#Constants: file paths DIR_PROJ = "/home/mokha/Documents/Krauthammer_Lab" DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv5" DIR_DATA = DIR_CURR + "/TestData" DIR_RESULTS = DIR_CURR + "/TestResults" #get mapped reads DIR_RNASEQ = DIR_PROJ + "/150802_TophatSamples" print "------------ TDD: 161103_IdentifySJ.py ------------" """ Algorithm: this is meant to test SpliceJunction class to see if can determine a canonical SJ from an aberrant SJ """ g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) # test_obj = Isoform.obj_cruzdb.knownToRefSeq.filter_by( value = 'NM_005112' ).first() # print "test_obj = ", test_obj.name # sj_pos = 'chr6:32410470-32410961' # gene_sym = 'HLA-DRA' # sj_pos = 'chr4:10084802-10086066' # gene_sym = 'WDR1' sj_pos = 'chr7:140500428-140501361' gene_sym = 'BRAF' isoform_id = 'NM_004333' hash_sj = MultiIsoform.split_genome_pos(sj_pos) chrom = hash_sj['chrom']