示例#1
0
def each_exon_rpkm(list_exons, bam_reader, library_size):
    """
    Args:
        list_exons = array of Exon objects (each must provide str_genomic_pos())
        bam_reader = HTSeq.BAM_Reader instance for the sample's mapped reads (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) )
        library_size = integer that is the number of mapped reads in the sample
    Function: quantifies every exon in 'list_exons' twice & converts both counts into read densities via Isoform.calc_read_density()
    Output: hash where k = string of exon position (as returned by str_genomic_pos(), format = chrom:start-end), v = hash with the 2 keys 'rpkm_uniq' & 'rpkm_all'. An empty 'list_exons' gives an empty hash.
    """
    hash_exon_rpkm = {}
    for each_exon in list_exons:
        #the position string keys the result & feeds every Isoform call below, so build it only once
        str_exon_pos = each_exon.str_genomic_pos()

        exon_count_uniq = Isoform.quant_genes_rpkm(bam_reader, str_exon_pos,
                                                   True)
        #NOTE(review): this call used to pass True as well, which made 'rpkm_all' a copy of 'rpkm_uniq'.
        #Assumes the 3rd argument of Isoform.quant_genes_rpkm() is the "unique reads only" flag -- confirm against Isoform
        exon_count_all = Isoform.quant_genes_rpkm(bam_reader, str_exon_pos,
                                                  False)

        hash_exon_rpkm[str_exon_pos] = {
            'rpkm_uniq': Isoform.calc_read_density(exon_count_uniq,
                                                   str_exon_pos, library_size),
            'rpkm_all': Isoform.calc_read_density(exon_count_all,
                                                  str_exon_pos, library_size),
        }

    return hash_exon_rpkm
def sj_read_support_TEST(bam_reader, genomic_range):
    """
    Just playing around
    """
    hash_gr = Isoform.split_genome_pos(
        genomic_range)  #hash_gr = hash table of genomic range
    #these are the counts
    count = 0  #count all reads that map to "genomic_range"
    uniq_count = 0  #count all uniquely mapped reads to "genomic_range"
    align_score_max_count = 0  #this also counts all uniquely mapped reads to "genomic_range"
    for i, a in enumerate(bam_reader.fetch(region=genomic_range)):
        # score = 0
        # if a.optional_field( "NH" ) == 1:
        #     score += 1
        print i, "read ", i, " - ", a
        print i, "dir(a) = ", dir(a)
        print i, "a.aligned = ", a.aligned
        print i, "a.read = ", a.read
        print i, "a.read_as_aligned = ", a.read_as_aligned
        print i, "a._read = ", a._read
        print i, "a._read_as_sequenced = ", a._read_as_sequenced
        print i, "a.get_sam_line = ", a.get_sam_line(
        )  #this retrieves the read information & presents it as a .sam file line
        print i, ": a.cigar = ", a.cigar  #this is an array that contains information for each cigar. For example, if the CIGAR is 99M, then there is only 1 element in array "a.cigar" [99M. If CIGAR is 2M4926N74M, then there are 3 elements in the array "a.cigar" [2M, 4926N, 74M]

        #I can split the information for each read by splitting by tab-delimiter '\t'
        sam_line = a.get_sam_line()
        list_sam = sam_line.split('\t')
        print i, "list_sam = ", list_sam

        print "----------------\n"
示例#3
0
def create_obj_sj(hash_sj_info):
    """
    Args:
        hash_sj_info = one splice junction record indexed by column label (e.g. a row of a pandas Dataframe). Keys read here: 'sj_id', 'sj_range', 'strand', 'read_count', 'gene_name', 'prevalence_all', 'prevalence_control'
    Function: builds a SpliceJunction instance from the record
    Output: the SpliceJunction instance. The isoform ID is always passed as None & the intronic flag is always passed as True.
    """
    sj_id = hash_sj_info['sj_id']

    #break 'sj_range' into its chromosome, start & end parts
    pos = Isoform.split_genome_pos(hash_sj_info['sj_range'])
    chrom, start, end = pos['chrom'], pos['start'], pos['end']

    #SpliceJunction is given the strand as a string: -1 becomes '-', any other value becomes '+'
    if int(hash_sj_info['strand']) == -1:
        strand = '-'
    else:
        strand = '+'

    return SpliceJunction(
        sj_id,
        chrom,
        start,
        end,
        strand,
        hash_sj_info['read_count'],
        hash_sj_info['gene_name'],
        None,  #isoform_id
        hash_sj_info['prevalence_all'],  #sample_prevalence
        hash_sj_info['prevalence_control'],  #control_prevalence
        True)  #bool_intronic
示例#4
0
def all_exons_rpkm(list_exons, bam_reader, library_size):
    """
    Args:
        list_exons = array of Exon objects (each must provide str_genomic_pos())
        bam_reader = HTSeq.BAM_Reader instance for the sample's mapped reads (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) )
        library_size = integer that is the number of mapped reads in the sample
    Function: quantifies each exon with Isoform.quant_genes_rpkm() & converts the count with Isoform.calc_read_density()
    Output: array of read densities, one per exon, in the same order as 'list_exons'
    """
    return [
        Isoform.calc_read_density(
            Isoform.quant_genes_rpkm(bam_reader, each_exon.str_genomic_pos(),
                                     True),
            each_exon.str_genomic_pos(),
            library_size)
        for each_exon in list_exons
    ]
def sj_read_support_pysam(pysam_file, genomic_range, uniq_only=False):
    """
    Args:
        pysam_file = pysam.AlignmentFile that opens up the mapped reads bam file (e.g. accepted_hits.bam)
        genomic_range = string that is the position of interest (format = chrom:start-end)
        uniq_only = boolean
            -True = will only quantify the uniquely-mapped gapped reads that map to 'genomic_range'
            -False = will quantify the uniquely & non-uniquely-mapped gapped reads that map to 'genomic_range'
    Function: counts the gapped reads that support range 'genomic_range', i.e. reads that have one aligned block touching the start position & one aligned block touching the end position. NOTE that this function is much faster (maybe 6x faster) than SpliceJunction's def sj_read_support()
    Output: hash with 2 keys
        -'all_count' = number of supporting reads regardless of mapping quality. Stays 0 when uniq_only is True, since non-unique reads are skipped before the position check.
        -'unique_count' = number of supporting reads with mapping quality 50 (the value treated as "uniquely mapped" throughout this file)
    """
    hash_gr = Isoform.split_genome_pos(genomic_range)
    sj_start = hash_gr['start']
    sj_end = hash_gr['end']

    all_count = 0
    unique_count = 0
    for read in pysam_file.fetch(hash_gr['chrom'], sj_start, sj_end):
        is_unique = read.mapq == 50
        #when only unique reads are wanted, drop the others before the more expensive block scan
        if uniq_only and not is_unique:
            continue

        #the read supports the junction only if both junction ends coincide with a boundary of one of its aligned blocks
        #(presumably each element of read.blocks is a (start, end) pair -- see the pysam documentation)
        blocks = read.blocks
        if not any(sj_start in block for block in blocks):
            continue
        if not any(sj_end in block for block in blocks):
            continue

        if not uniq_only:
            all_count += 1
        if is_unique:
            unique_count += 1

    return {'all_count': all_count, 'unique_count': unique_count}
示例#6
0
#!/usr/bin/python
import sys

from cruzdb import Genome

sys.path.insert(0, "/home/mokha/Documents/Krauthammer_Lab/PythonClasses")
from SVSv5 import Exon, Isoform, MultiIsoform, SpliceJunction, IsoformSJ

#Script: looks up every refGene record of one gene (BRAF) in a local cruzdb sqlite database, wraps each record in an Isoform instance & prints it
print "------------ Algorithm: 160919_Isoform_1.py ------------"
""" Reconstruct transcripts based on Splice Junctions """

#assign all splice junctions to specific gene: go through cruzdb & find end points for each gene --> assign
# g = Genome( 'sqlite:////tmp/hg19.db' )
g = Genome('sqlite:////tmp/hg19_v2.db')
#register the database on the Isoform class before any Isoform instance is created below
Isoform.set_cruzdb(g)

#retrieve all refGene records whose gene symbol (column name2) is BRAF
gene = g.refGene.filter_by(name2='BRAF').all()
# all_genes = g.refGene.filter_by( name2 = 'TTN' ).first()
# all_genes = g.refGene.filter_by( name2 = 'AGRN' ).all()
# all_genes = g.refGene.filter_by( name2 = 'AGRN' ).first()
# all_genes = g.refGene.filter_by( name2 = 'DIXDC1' ).all()

#each record's 'name' is handed to Isoform() as the isoform identifier
for each_isoform in gene:
    obj_iso = Isoform(each_isoform.name)

    #print isoform ID & gene symbol as stored on the Isoform instance
    print obj_iso.isoform_id, ":", obj_iso.gene_sym

    print "obj_iso = ", obj_iso
示例#7
0
#Constants: file paths, all derived from DIR_PROJ (NOTE(review): DIR_PROJ is not defined in this part of the script -- it must be set earlier)
DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv5"
DIR_DATA = DIR_CURR + "/TestData"
DIR_RESULTS = DIR_CURR + "/TestResults"
# DIR_RESULTS = DIR_CURR + "/Results/160729_Analyze_KF"
# DIR_RESULTS = DIR_CURR + "/Results/160731_Analyze_KF"
# DIR_RESULTS = DIR_CURR + "/Results/160909_Analyze_KF"

DIR_FUSION = DIR_PROJ + "/160510_GeneFusions"

print "------------ TDD: 161002_IsoformFusion_1.py ------------"

#set kinase gene annotation file
KinaseFusion.set_kinasefile( DIR_FUSION + "/Data/160910_KinaseAnnots_hg38_Final.txt" )
obj_cruzdb = Genome( 'sqlite:////tmp/hg38_v2.db' )
#set cruzdb Genome database instance
Isoform.set_cruzdb( obj_cruzdb )

#CASE: This returns "None" for the kinase domain for the kinase gene (TLK2 - NM_001284363)
#Fusion - ASIC2:TLK2
#description of the single fusion event under test; this hash is the only argument given to MultiIsoformFusion below
hash_multi_isoform = { "orientation": 'fr',
"chrom_start": 'chr17',
"chrom_end": 'chr17',
"pos_start": 34038904,
"pos_end": 62565136,
"read_span": 5, 
"read_matepair": 5,
"read_matepair_break": 5 }

obj_mif = MultiIsoformFusion( hash_multi_isoform )      #MIF = MultiIsoform Fusion instance

for i, (k,v) in enumerate( obj_mif.isoform_fusions.iteritems() ):         #k = isoformIDs in fusion (format: isoformID_1:isoformID_2), v = IsoformFusion instance
def sj_read_support(bam_reader, genomic_range):
    """
    Args:
        bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads that map to SJ genomic range 'genomic_range' (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) )
        genomic_range = string in format chrom:start-end. If None, then uses the SJ position recorded in "self"
    Function: finds reads that support splice junctions by finding reads that uniquely map to splice junction position
    """
    # if not genomic_range:
    #     genomic_range = self.chrom + ':' + str( self.start ) + '-' + str( self.end )

    hash_gr = Isoform.split_genome_pos(
        genomic_range)  #hash_gr = hash table of genomic range
    #these are the counts
    count = 0  #count all reads that map to "genomic_range"
    uniq_count = 0  #count all uniquely mapped reads to "genomic_range"
    align_score_max_count = 0  #this also counts all uniquely mapped reads to "genomic_range"
    for i, a in enumerate(bam_reader.fetch(region=genomic_range)):
        # score = 0
        # if a.optional_field( "NH" ) == 1:
        #     score += 1
        """
        NOTE: a.cigar breaks down the meaning for the cigar, finding the end position for each cigar. 
    
        """
        for cigop in a.cigar:
            if cigop.type == 'N' and cigop.ref_iv.start == hash_gr[
                    'start'] and cigop.ref_iv.end == hash_gr['end']:

                print i, ": a.get_sam_line = ", a.get_sam_line(
                )  #this retrieves the read information & presents it as a sam line
                print i, ": cigop = ", cigop
                print i, ": a.cigar = ", a.cigar
                print i, ">>>>>>>>>>>>>>>>>>>>\n"

                # if a.optional_field( "NH" ) == 1:     #check if read is uniquely-mapped read
                ##TEST:: see the output of read
                print i, ": a = ", dir(
                    a)  #see all possible properties of object
                print i, ": a.get_sam_line = ", a.get_sam_line(
                )  #this retrieves the read information & presents it as a sam line
                print i, ": a.from_SAM_line = ", a.from_SAM_line
                print i, ": dir( a.from_SAM_line ) = ", dir(a.from_SAM_line)
                print i, ": a.aligned = ", a.aligned
                print i, ": a.flag = ", a.flag
                print i, ": a.get_sam_line = ", a.get_sam_line, " & dir = ", dir(
                    a.get_sam_line)
                print i, ": a.read = ", a.read  #this is the nucleotide sequence
                print i, ": a.read_as_aligned = ", a.read_as_aligned  #this is the nucleotide sequence. "a.read_as_aligned" could be the reverse-complement to "a.read"
                print i, ": a.from_pysam_AlignedRead = ", a.from_pysam_AlignedRead, " & dir = ", dir(
                    a.from_pysam_AlignedRead)
                print i, ": NH = ", a.optional_field("NH")
                print i, ": aligned = ", a.aligned, " & aQual = ", a.aQual, " & read quality = ", a.read.qual
                print i, ": dir( a.cigar ) = ", dir(a.cigar)
                print i, ": dir( cigop ) = ", dir(cigop)
                print i, ": cigop.ref_iv = ", cigop.ref_iv, " & chrom = ", cigop.ref_iv.chrom, " & start = ", cigop.ref_iv.start, " & end = ", cigop.ref_iv.end
                print i, ": cigop.size = ", cigop.size
                print i, ": cigop.type = ", cigop.type
                print i, ": cigop.check = ", cigop.check, " & dir = ", dir(
                    cigop.check)
                print i, ": cigop.query_from = ", cigop.query_from  #CONJ: I think this refers the start of the range of nucleotides that map to the genome
                print i, ": cigop.query_to = ", cigop.query_to  #CONJ: I think this refers the end of the range of nucleotides that map to the genome

                #I can split the information for each read by splitting by tab-delimiter '\t'
                sam_line = a.get_sam_line()
                list_sam = sam_line.split('\t')
                print i, "list_sam = ", list_sam

                print '------------------\n\n'

                count += 1
                if a.optional_field("NH") == 1:
                    uniq_count += 1

                #this also counts uniquely mapped reads
                if a.aQual == 50:
                    align_score_max_count += 1

    ##TEST::
    # print " | total reads = ", count,
    # print " | unique map = ", count_uniq,
    # print " | quality 50 count = ", align_score_max_count

    return {
        "all_count": count,
        "unique_count": uniq_count,
        "unique_count_50": align_score_max_count
    }
def sj_read_support_variety_reads(bam_reader, genomic_range):
    """
    Args:
        bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads that map to SJ genomic range 'genomic_range' (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) )
        genomic_range = string in format chrom:start-end. If None, then uses the SJ position recorded in "self"
    Function: finds reads that support splice junctions by finding reads that uniquely map to splice junction position
    Output:
        {"all_count": count, "unique_count": uniq_count, "unique_count_50": align_score_max_count, "count_all_variety_reads": len( list_variety_reads ), "count_unique_variety_reads": len( unique_variety_reads ) }
        returns a hash with the following values:
            -"all_count" = all reads that support splicing event with position 'genomic_range'
            -"unique_count" = uniquely-mapped reads supporting splicing event with position 'genomic_range'. This looks for NH == 1
            -"unique_count_50" = same as "unique_count", but looks at quality of read (aQual == 50), where 50 means uniquely mapped reads
                -see "Protocol: 15.10.30 - Samtools" 
                    -50 (or 255): unique mapping (NH:i:1)
                    -3: maps to 2 locations in the target (NH:i:2, but I’ve also seen NH:i:3)
                    -2: maps to 3 locations
                    -1: maps to 4-9 locations (NH:i:4 or higher)
                    -0: maps to 10 or more locations
                    TO RETRIEVE UNIQUELY MAPPED READS: use command “samtools -q 4 file.bam” means any values above 4 are unique, where for tophat2 values 0 <= x <= 3 means multiple mapping at 50 means unique
            -"count_all_variety_reads" = total count of all reads with different end positions that support the splicing event. This is considering all reads, therefore there will be duplicates (meaning they will have the same start & end points - think of "thickBlocks" & "thinBlocks" for UCSC Genome Browser)
            -"count_unique_variety_reads" = count of # of reads with different end positions. This is the actually number of reads that support the splicing event & has different end points.
                -The hypothesis is the reads with the same end points are just the same RNA fragments sequenced during RNA-sequencing, therefore the more different types of reads supporting a splicing event, the more convincine the support.
    """
    # if not genomic_range:
    #     genomic_range = self.chrom + ':' + str( self.start ) + '-' + str( self.end )

    hash_gr = Isoform.split_genome_pos(
        genomic_range)  #hash_gr = hash table of genomic range
    #these are the counts
    count = 0  #count all reads that map to "genomic_range"
    uniq_count = 0  #count all uniquely mapped reads to "genomic_range"
    align_score_max_count = 0  #this also counts all uniquely mapped reads to "genomic_range"

    list_variety_reads = []
    for i, a in enumerate(bam_reader.fetch(region=genomic_range)):
        # score = 0
        # if a.optional_field( "NH" ) == 1:
        #     score += 1
        """
        NOTE: a.cigar breaks down the meaning for the cigar, finding the end position for each cigar. 
        """
        for cigop in a.cigar:
            if cigop.type == 'N' and cigop.ref_iv.start == hash_gr[
                    'start'] and cigop.ref_iv.end == hash_gr['end']:

                #if splicing event matches end positions
                #I can split the information for each read by splitting by tab-delimiter '\t'
                sam_line = a.get_sam_line()
                list_sam = sam_line.split('\t')
                str_read_info = list_sam[2] + "|" + list_sam[
                    3] + "|" + list_sam[5]
                list_variety_reads.append(str_read_info)

                count += 1
                if a.optional_field("NH") == 1:
                    uniq_count += 1

                #this also counts uniquely mapped reads
                if a.aQual == 50:
                    align_score_max_count += 1

    unique_variety_reads = list(set(list_variety_reads))

    print "show unique_variety_reads: "
    print unique_variety_reads

    print "list_variety_reads = ", len(list_variety_reads)
    print "# of unique variety reads = ", len(unique_variety_reads)
示例#10
0
            obj_tt.arr_nuc_seq[i_tp_start:]
        )  #retrieve from tp_start until the last nucleotide

    ##TEST:: print "NMD_IRREL: strand = ", obj_tt.iso_sj.strand, " | i_tp_pux = ", i_tp_pux, " | i_tp_start = ", i_tp_start, " seq = ", nmd_irrelevant_seq

    return nmd_irrelevant_seq


#Script: fetches one refGene isoform by its ID from a local cruzdb sqlite database & prints its introns
print "------------ TDD: 170325_ExtractMutatedAASeq.py ------------"
"""
Algorithm: this tests
    -this algorithm will determine the mutated amino acid sequence based on aberrant splicing.
"""

g = Genome('sqlite:////tmp/hg19_v2.db')
#register the database on the Isoform class
Isoform.set_cruzdb(g)

#show the end positions for each SJ
get_iso_id = 'NM_001008844'
#first refGene record whose 'name' column equals the isoform ID
get_isoform = g.refGene.filter_by(name=get_iso_id).first()
print "for gene = ", get_isoform.name2, " & isoform = ", get_isoform.name
#NOTE(review): each element of 'introns' is unpacked into 2 names here. If the elements are (start, end) pairs, then 'i' is the start coordinate & not a running index -- confirm whether enumerate() was intended
for i, intron in get_isoform.introns:
    print "intron ", i, " - ", intron

#Test SJ 2
# sj_id = 'JUNC00090748'
# sj_range = 'chr12:56121123-56122061'
# strand = -1
# read_count = 13
# gene_sym = 'CD63'
# isoform_id = 'NM_001257400'
    intron_retain_info = {
        'ric_1': ric_1,
        'ric_2': ric_2,
        'intron_retained_1': express_intron_1[0],
        'intron_retained_2': express_intron_2[0],
        'constitutive_exons_express': constitutive_exons_express
    }
    return intron_retain_info


#Script: sets up the database, the BAM reader of one sample & the gene / splice junction position used for the overlapping-elements test
print "------------ TDD: 161109_Overlapping_Elements_V2.py ------------"
"""
Algorithm: this will test the functions in MultiIsoform -> find_overlapping_elements(), 
"""
g = Genome('sqlite:////tmp/hg19_v2.db')
#register the database on the Isoform class
Isoform.set_cruzdb(g)

sample_name = 'yuhimo'
#mapped reads of the sample (NOTE(review): DIR_RNASEQ is not defined in this part of the script -- it must be set earlier)
path_bam = DIR_RNASEQ + '/tophat_sample_' + sample_name + '/accepted_hits.bam'
bam_reader = HTSeq.BAM_Reader(path_bam)

#find overlapping splice junctions
# gene_sym = 'CDK11B'
# sj_pos = 'chr1:1573952-1575753'
gene_sym = 'MLIP'
#all refGene records of the gene; the first record's 'name' is used as the isoform ID
isoform_info = g.refGene.filter_by(name2=gene_sym).all()
isoform_id = isoform_info[0].name

#splice junction under test, split into its parts
sj_pos = 'chr6:54025230-54034325'
hash_pos = Isoform.split_genome_pos(sj_pos)
chrom = hash_pos['chrom']
示例#12
0
    hash_c3 = {
        'ratio_skip_constitutive': ratio_skip_constitutive,
        'exons_skipped_express': exons_skipped_express,
        'constitutive_exons_express': constitutive_exons_express,
        'skip_exons_rpkm': skip_exons_rpkm,
        'c_exons_rpkm': c_exons_rpkm,
        'alt_exons_rpkm': alt_exons_rpkm
    }

    return hash_c3


#Script: sets up the database & selects the sample, gene & splice junction position for the SJ metrics test
print "------------ TDD: 161111_SJ_Metrics.py ------------"

g = Genome('sqlite:////tmp/hg19_v2.db')
#register the database on the Isoform class
Isoform.set_cruzdb(g)

# sample_name = 'yuhimo'
# sample_name = 'yunige'
# sample_name = 'gopik'

#parameters for setting up classes
#example 1 (disabled)
# sample_name = 'yunige'
# gene_sym = 'MLIP'
# sj_pos = 'chr6:54025230-54034325'

#example 2 (active)
sample_name = 'yufulo'
gene_sym = 'MITF'
sj_pos = 'chr3:70014008-70014109'
示例#13
0
#Script: sets up paths, the database & the splice junction (position, gene, isoform) used to test the SpliceJunction class
#Constants: file paths
DIR_PROJ = "/home/mokha/Documents/Krauthammer_Lab"
DIR_CURR = DIR_PROJ + "/PythonClasses/SVSv5"
DIR_DATA = DIR_CURR + "/TestData"
DIR_RESULTS = DIR_CURR + "/TestResults"
#get mapped reads
DIR_RNASEQ = DIR_PROJ + "/150802_TophatSamples"

print "------------ TDD: 161103_IdentifySJ.py ------------"
"""
Algorithm: this is meant to test SpliceJunction class to see if can determine a canonical SJ from an aberrant SJ
"""

g = Genome('sqlite:////tmp/hg19_v2.db')
#register the database on the Isoform class
Isoform.set_cruzdb(g)

# test_obj = Isoform.obj_cruzdb.knownToRefSeq.filter_by( value = 'NM_005112' ).first()

# print "test_obj = ", test_obj.name

#alternative test cases (disabled)
# sj_pos = 'chr6:32410470-32410961'
# gene_sym = 'HLA-DRA'
# sj_pos = 'chr4:10084802-10086066'
# gene_sym = 'WDR1'
#active test case
sj_pos = 'chr7:140500428-140501361'
gene_sym = 'BRAF'
isoform_id = 'NM_004333'

#NOTE(review): split_genome_pos() is reached through MultiIsoform here, while the other snippets in this file call it on Isoform -- presumably the same helper, confirm
hash_sj = MultiIsoform.split_genome_pos(sj_pos)
chrom = hash_sj['chrom']