def Sashimiplottting(bamdir, countsin, inputpsi, genelis):
    """Total the read counts per sample and hand off to the plotting step.

    All lines of the PSI file are read, the sample names (``sample``) and
    the junction labels / gene symbols (``genelist``) are derived from it,
    every column of the counts file is summed, the totals are stored in the
    global ``sample_read`` dictionary keyed by PSI sample name, and
    ``sashmi_plot_list`` is called with the collected inputs.
    """
    inputpsi = unique.filepath(inputpsi)
    psi_handle = open(inputpsi, 'rU')
    lines = psi_handle.readlines()
    psi_handle.close()
    samp = sample(inputpsi)
    gene_label, gene_sym = genelist(inputpsi)

    countsin = unique.filepath(countsin)
    awaiting_header = True
    for raw in open(countsin, 'rU'):
        fields = cleanUpLine(raw).split('\t')
        if awaiting_header:
            awaiting_header = False
            # one running total per column named in the header
            count_sum_array = [0] * len(fields[1:])
            continue
        values = [float(entry) for entry in fields[1:]]
        count_sum_array = [total + entry for total, entry in zip(count_sum_array, values)]

    # Positional pairing: the names come from the PSI header, the totals from the counts file
    for position, sample_name in enumerate(samp):
        sample_read[sample_name] = count_sum_array[position]

    genelis = unique.filepath(genelis)
    sashmi_plot_list(bamdir, genelis, gene_label, lines, samp, gene_sym)
def retreiveAllKnownSpliceSites():
    """Load the strand of every annotated exon boundary for the current species.

    Uses a priori strand information when none is present.  The species
    code is taken from an ``AltAnalyze_report*.log`` file found in the
    parent directory of the global ``bam_file``; when no log names one, the
    global ``IndicatedSpecies`` is used.

    Returns ``(splicesite_db, chromosomes_found)``.  ``splicesite_db`` maps
    ``(chromosome, position)`` -- position kept as the string read from the
    file -- to the strand, for both the start and stop column of each row.
    ``chromosomes_found`` has one key per retained chromosome name (with
    'chr' removed), each mapped to an empty list.
    """
    import export, unique
    species_tag = ' species: '
    species = None
    parent_dir = export.findParentDir(bam_file)
    for entry in os.listdir(parent_dir):
        if 'AltAnalyze_report' not in entry or '.log' not in entry:
            continue
        log_file = unique.filepath(parent_dir + '/' + entry)
        for log_line in open(log_file, "rU"):
            log_line = log_line.rstrip()
            if species_tag in log_line:
                species = log_line.split(species_tag)[1]
    if species == None:
        species = IndicatedSpecies

    excluded_tags = ('GL0', 'GL', 'JH', 'MG')
    splicesite_db = {}
    chromosomes_found = {}
    exon_path = unique.filepath('AltDatabase/ensembl/' + species + '/' + species + '_Ensembl_exon.txt')
    awaiting_header = True
    for row in open(exon_path, 'rU'):
        if awaiting_header:
            awaiting_header = False
            continue
        # columns: gene, exon-id, chromosome, strand, exon-region-start(s), exon-region-stop(s), ...
        fields = row.rstrip('\n').split('\t')
        geneID, exon, chrom, strand, start, stop = fields[:6]
        splicesite_db[chrom, start] = strand
        splicesite_db[chrom, stop] = strand
        # names shorter than 5 characters are always kept; longer ones only without an excluded tag
        if len(chrom) < 5 or not any(tag in chrom for tag in excluded_tags):
            chromosomes_found[chrom.replace('chr', '')] = []
    return splicesite_db, chromosomes_found
def Sashimiplottting(bamdir,countsin,PSIFilename,eventsToVisualizeFilename,events=None):
    """Sum the read counts per sample, then build the Sashimi plots.

    Each column of the counts file is totalled and stored in the global
    ``count_sum_array_db`` under the column's name, with '.bed' appended
    when the name does not already contain it.  Plotting is delegated to
    ``sashmi_plot_list`` and its return value is returned.

    ``eventsToVisualizeFilename`` is resolved through ``unique.filepath``
    only when ``events`` is None.
    """
    PSIFilename = unique.filepath(PSIFilename)
    countsin = unique.filepath(countsin)
    count_sum_array = []
    rows_read = 0
    awaiting_header = True
    for raw in open(countsin, 'rU'):
        fields = cleanUpLine(raw).split('\t')
        if awaiting_header:
            awaiting_header = False
            samples = [name if '.bed' in name else name + '.bed' for name in fields[1:]]
            count_sum_array = [0] * len(samples)
            continue
        values = [float(entry) for entry in fields[1:]]
        count_sum_array = [total + entry for total, entry in zip(count_sum_array, values)]
        rows_read += 1
        # existing shortcut: stop after 30000 data rows when bamdir contains 'salomonis'
        if rows_read > 30000 and 'salomonis' in bamdir:
            break

    for position, sample_name in enumerate(samples):
        count_sum_array_db[sample_name] = count_sum_array[position]

    if events == None:
        eventsToVisualizeFilename = unique.filepath(eventsToVisualizeFilename)
    return sashmi_plot_list(bamdir, eventsToVisualizeFilename, PSIFilename, events=events)
def retreiveAllKnownSpliceSites():
    """Load the strand of every annotated exon boundary for the current species.

    Uses a priori strand information when none is present.  The species
    code is read from an ``AltAnalyze_report*.log`` file in the parent
    directory of the global ``bam_file``.  When no log names a species the
    global ``IndicatedSpecies`` is used (a NameError results if that global
    does not exist); previously ``species`` was simply left unassigned in
    that situation.

    Returns ``(splicesite_db, chromosomes_found)``.  ``splicesite_db`` maps
    ``(chromosome, position)`` -- position kept as the string read from the
    file -- to the strand, for both the start and stop column of each row.
    ``chromosomes_found`` has one key per retained chromosome name (with
    'chr' removed), each mapped to an empty list.
    """
    import export, unique
    chromosomes_found = {}
    parent_dir = export.findParentDir(bam_file)
    species = None  # stays None when no report log supplies a species
    species_tag = ' species: '
    for file_name in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file_name and '.log' in file_name:
            log_file = unique.filepath(parent_dir + '/' + file_name)
            with open(log_file, "rU") as log_contents:
                for line in log_contents:
                    line = line.rstrip()
                    if species_tag in line:
                        species = line.split(species_tag)[1]
    if species is None:
        species = IndicatedSpecies

    splicesite_db = {}
    refExonCoordinateFile = unique.filepath('AltDatabase/ensembl/' + species + '/' + species + '_Ensembl_exon.txt')
    firstLine = True
    with open(refExonCoordinateFile, 'rU') as exon_rows:
        for line in exon_rows:
            if firstLine:
                firstLine = False
                continue
            # columns: gene, exon-id, chromosome, strand, exon-region-start(s), exon-region-stop(s), ...
            t = line.rstrip('\n').split('\t')
            geneID, exon, chrom, strand, start, stop = t[:6]
            splicesite_db[chrom, start] = strand
            splicesite_db[chrom, stop] = strand
            # names shorter than 5 characters are always kept; longer ones only without these tags
            if len(chrom) < 5 or ('GL0' not in chrom and 'GL' not in chrom and 'JH' not in chrom and 'MG' not in chrom):
                chromosomes_found[chrom.replace('chr', '')] = []
    return splicesite_db, chromosomes_found
def Sashimiplottting(bamdir,countsin,inputpsi,genelis):
    """Compute per-sample read totals and start the Sashimi plotting.

    Reads the PSI file lines, obtains the sample names via ``sample`` and
    the junction labels / gene symbols via ``genelist``, adds up each
    column of the counts file, records the totals in the global
    ``sample_read`` dictionary and calls ``sashmi_plot_list``.
    """
    inputpsi = unique.filepath(inputpsi)
    psi_file = open(inputpsi, 'rU')
    lines = psi_file.readlines()
    psi_file.close()
    samp = sample(inputpsi)
    gene_label, gene_sym = genelist(inputpsi)

    countsin = unique.filepath(countsin)
    header_seen = False
    for raw_line in open(countsin, 'rU'):
        columns = cleanUpLine(raw_line).split('\t')
        if not header_seen:
            header_seen = True
            # start every column total at zero
            count_sum_array = [0] * len(columns[1:])
        else:
            numbers = [float(item) for item in columns[1:]]
            count_sum_array = [running + item for running, item in zip(count_sum_array, numbers)]

    # totals are matched to the PSI sample names by position
    for idx, name in enumerate(samp):
        sample_read[name] = count_sum_array[idx]

    genelis = unique.filepath(genelis)
    sashmi_plot_list(bamdir, genelis, gene_label, lines, samp, gene_sym)
def retreiveAllKnownSpliceSites(returnExonRetention=False,DesignatedSpecies=None,path=None):
    """Load exon annotations for the current species.

    Uses a priori strand information when none is present.

    The species is read from an ``AltAnalyze_report*.log`` file in the
    parent directory of the global ``bam_file`` (or of *path* when that
    lookup fails); otherwise the global ``IndicatedSpecies`` is used, and
    *DesignatedSpecies* if reading that global fails.  The exon file is
    the species' ``_Ensembl_exon.txt`` when the global ``ExonReference`` is
    None and ``verifyFileLength`` reports a non-zero length for it;
    in every other case ``ExonReference`` itself is used as the path.

    Returns ``(splicesite_db, chromosomes_found, gene_coord_db)``:
    * ``splicesite_db`` -- with *returnExonRetention* false, maps
      ``(chromosome, position-string)`` to strand for the start and stop of
      every row; with it true, maps ``gene:exon`` to ``[]`` for rows whose
      splice-event column (second to last) contains 'exclusion'.
    * ``chromosomes_found`` -- retained chromosome names ('chr' removed)
      mapped to empty lists.
    * ``gene_coord_db`` -- ``(gene, chromosome)`` mapped to
      ``[lowest, highest]`` integer coordinate seen for that pair.
    """
    import export, unique
    chromosomes_found = {}
    try:
        parent_dir = export.findParentDir(bam_file)
    except Exception:
        parent_dir = export.findParentDir(path)

    species = None
    species_tag = ' species: '
    for file_name in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file_name and '.log' in file_name:
            log_file = unique.filepath(parent_dir + '/' + file_name)
            with open(log_file, "rU") as log_contents:
                for line in log_contents:
                    line = line.rstrip()
                    if species_tag in line:
                        species = line.split(species_tag)[1]
    if species == None:
        try:
            species = IndicatedSpecies
        except Exception:
            species = DesignatedSpecies

    splicesite_db = {}
    gene_coord_db = {}
    # Pre-set so the check below is defined even when ExonReference is not None
    # (the default file is then never measured and ExonReference is used).
    length = 0
    try:
        if ExonReference == None:
            exon_dir = 'AltDatabase/ensembl/' + species + '/' + species + '_Ensembl_exon.txt'
            length = verifyFileLength(exon_dir)
    except Exception:
        length = 0
    if length == 0:
        exon_dir = ExonReference

    refExonCoordinateFile = unique.filepath(exon_dir)
    firstLine = True
    with open(refExonCoordinateFile, 'rU') as exon_rows:
        for line in exon_rows:
            if firstLine:
                firstLine = False
                continue
            # columns: gene, exon-id, chromosome, strand, exon-region-start(s), exon-region-stop(s),
            # constitutive_call, ens_exon_ids, splice_events, splice_junctions
            t = line.rstrip('\n').split('\t')
            geneID, exon, chrom, strand, start, stop = t[:6]
            spliceEvent = t[-2]
            gene_coord_db.setdefault((geneID, chrom), []).extend([int(start), int(stop)])
            if returnExonRetention:
                # NOTE(review): the original tested 'exclusion' twice with `or`;
                # a second keyword may have been intended -- confirm.
                if 'exclusion' in spliceEvent:
                    splicesite_db[geneID + ':' + exon] = []
            else:
                splicesite_db[chrom, start] = strand
                splicesite_db[chrom, stop] = strand
            # names shorter than 5 characters are always kept; longer ones only without these tags
            if len(chrom) < 5 or ('GL0' not in chrom and 'GL' not in chrom and 'JH' not in chrom and 'MG' not in chrom):
                chromosomes_found[chrom.replace('chr', '')] = []

    # Collapse every coordinate list to its extremes
    for key in gene_coord_db:
        coordinates = gene_coord_db[key]
        gene_coord_db[key] = [min(coordinates), max(coordinates)]
    return splicesite_db, chromosomes_found, gene_coord_db
def verifyFile(filename):
    """Return True when *filename* can be opened and yields at least one line.

    Any exception raised while resolving, opening or reading the file
    results in False.
    """
    try:
        fn = unique.filepath(filename)
        for _first_line in open(fn, 'rU'):
            return True
    except Exception:
        return False
    return False
def importIsoformAnnotations(species,platform,psievents,annotType=None,junctionPairFeatures=None,dataType='reciprocal'):
    """Attach domain or protein annotations to the junction pairs in *psievents*.

    The annotation file is ``probeset-domain-annotations-exoncomp.txt`` when
    *annotType* is 'domain', otherwise
    ``probeset-protein-annotations-exoncomp.txt``; it is read from
    ``AltDatabase/<species>/<platform>/`` for *dataType* 'reciprocal' and
    from the ``junction/`` sub-directory otherwise.

    For each line the first field (split on '|') is the junction pair.  If
    the pair is in *psievents* its formatted features are appended under
    that key.  For 'reciprocal' data the reversed pair is also checked and,
    when present, receives the inverted features.

    *junctionPairFeatures* may be an existing dictionary to extend; when
    omitted a new dictionary is created on every call (the former ``{}``
    default was a single object shared by all calls).

    Returns the dictionary of ``tuple(junctions) -> [feature string, ...]``.
    """
    if junctionPairFeatures is None:
        junctionPairFeatures = {}

    if annotType == 'domain':
        file_name = 'probeset-domain-annotations-exoncomp.txt'
    else:
        file_name = 'probeset-protein-annotations-exoncomp.txt'
    if dataType == 'reciprocal':
        fn = 'AltDatabase/' + species + '/' + platform + '/' + file_name
    else:
        fn = 'AltDatabase/' + species + '/' + platform + '/junction/' + file_name
    fn = unique.filepath(fn)

    count = 0
    with open(fn, 'rU') as annotation_rows:
        for line in annotation_rows:
            values = line.rstrip('\n').split('\t')
            junctions = values[0].split('|')
            features = formatFeatures(values[1:])
            antiFeatures = inverseFeatureDirections(features)
            pair = tuple(junctions)
            if pair in psievents:
                junctionPairFeatures.setdefault(pair, []).append(', '.join(features))
            if dataType == 'reciprocal':
                # the opposite orientation of the pair gets the inverted features
                reversed_pair = tuple(reversed(junctions))
                if reversed_pair in psievents:
                    junctionPairFeatures.setdefault(reversed_pair, []).append(', '.join(antiFeatures))
            count += 1
    print(str(count) + ' protein predictions added')
    return junctionPairFeatures
def importDatabaseEventAnnotations(species,platform):
    """Collect IDs annotated as alternative promoters or alternative C-termini.

    Reads ``AltDatabase/<species>/<platform>/<species>_Ensembl_exons.txt``
    and inspects the ``splice_events`` column of each row.  The first
    column's value is stored as 'altPromoter' when the event text contains
    'alt-N-term' or 'altPromoter', or as 'alt-C-term' when it contains
    'alt-C-term' -- in both cases only if 'cassette' is absent.

    Returns the resulting dictionary.
    """
    terminal_exons = {}
    stored = 0
    fn = unique.filepath('AltDatabase/' + species + '/' + platform + '/' + species + '_Ensembl_exons.txt')
    event_column = None  # located from the header row
    for row in open(fn, 'rU'):
        values = row.rstrip('\n').split('\t')
        if event_column is None:
            event_column = values.index('splice_events')
            continue
        exon = values[0]
        event = values[event_column]
        if 'alt-N-term' in event or 'altPromoter' in event:
            label = 'altPromoter'
        elif 'alt-C-term' in event:
            label = 'alt-C-term'
        else:
            # a 'bleedingExon'/'altFinish' category was disabled in the original code
            continue
        if 'cassette' not in event:
            terminal_exons[exon] = label
            stored += 1
    print(str(stored) + ' ' + 'terminal exon annotations stored')
    return terminal_exons
def indexdic(fname):
    """Group the first-column IDs of *fname* by the gene IDs they mention.

    The first line is skipped.  Each remaining ID is split on ':'; every
    piece containing "ENS" provides a key -- the piece itself, or the
    second element of the piece split on '-' when it contains one.  The
    full ID is appended to the global ``index_read`` under that key.

    Returns ``index_read``.
    """
    fname = unique.filepath(fname)
    header_skipped = False
    for row in open(fname, 'rU'):
        if not header_skipped:
            header_skipped = True
            continue
        full_id = row.split('\t')[0]
        for piece in full_id.split(':'):
            if "ENS" not in piece:
                continue
            if '-' in piece:
                gene = piece.split('-')[1]
            else:
                gene = piece
            index_read.setdefault(gene, []).append(full_id)
    return index_read
def importExpressionValues(filename):
    """Import tab-delimited expression values, organised per sample.

    The first line supplies the column names.  In every later line the
    first field is the gene ID and each following field is stored -- as the
    string read from the file -- under the name of its column.

    Returns ``{sample_name: {gene: value}}``.
    """
    sample_expression_db = {}
    sample_names = None
    fn = unique.filepath(filename)
    for raw in open(fn, "rU"):
        fields = UI.cleanUpLine(raw).split("\t")
        if sample_names is None:
            sample_names = fields
            continue
        gene = fields[0]
        for offset, value in enumerate(fields[1:]):
            # column 0 holds the gene ID, so values start at column 1
            sample_name = sample_names[offset + 1]
            sample_expression_db.setdefault(sample_name, {})[gene] = value
    return sample_expression_db
def extractFeatures(species,countinp):
    """Write the feature IDs of a counts file to a matching 'features.' file.

    The output path is *countinp* with 'counts.' replaced by 'features.'.
    The first field of every data row is copied to it.  If none of those
    IDs lacks a '-' before its '=', the exon rows of the species'
    ``_Ensembl_exon.txt`` are appended as ``gene:exon=chr:start-end``.

    Returns the path of the features file.  *countinp* must contain
    'counts.'; otherwise the output path is never assigned.
    """
    import export
    exons_present = False
    if 'counts.' in countinp:
        feature_file = countinp.replace('counts.', 'features.')
        fe = export.ExportFile(feature_file)
        header_skipped = False
        for row in open(countinp, 'rU'):
            if not header_skipped:
                header_skipped = True
                continue
            feature_info = row.split('\t')[0]
            fe.write(feature_info + '\n')
            # an ID without '-' in front of the '=' marks exon entries as present
            if not exons_present and '-' not in feature_info.split('=')[0]:
                exons_present = True

        ### Add exon-info if necessary
        if not exons_present:
            exons_file = unique.filepath('AltDatabase/ensembl/' + species + '/' + species + '_Ensembl_exon.txt')
            header_skipped = False
            for row in open(exons_file, 'rU'):
                if not header_skipped:
                    header_skipped = True
                    continue
                t = row.rstrip('\n').split('\t')
                # t[0]=gene, t[1]=exon, t[2]=chromosome, t[4]=start, t[5]=end
                fe.write(t[0] + ':' + t[1] + '=' + t[2] + ':' + t[4] + '-' + t[5] + '\n')
        fe.close()
    return feature_file
def importPSIJunctions(fname):
    """Append an ordered 'junction junction' string per PSI row to the global ``lis``.

    The first line is skipped.  Columns 3 and 4 hold the two junctions.
    Their exon-block numbers are compared to decide which one is written
    first (the aim being to list the exclusion junction first); when the
    IDs cannot be parsed, the file order is kept unless column 3 lacks a
    '-', in which case the two are swapped.  Every ':' in the result is
    replaced by '__'.

    Returns ``lis``.
    """
    def block_number(exon_id):
        # 'E17.2' -> 17
        return int(float(exon_id.split('.')[0][1:]))

    fname = unique.filepath(fname)
    header_skipped = False
    for row in open(fname, 'rU'):
        row = row.rstrip(os.linesep)
        if not header_skipped:
            header_skipped = True
            continue
        t = row.split('\t')
        first, second = t[2], t[3]
        try:
            ### Re-order these to have the exclusion be listed first
            j1a, j1b = first.split('-')
            j2a, j2b = second.split('-')
            j1a = block_number(j1a.split(':')[1])
            j2a = block_number(j2a.split(':')[1])
            j1b = block_number(j1b)
            j2b = block_number(j2b)
            if j1a > j2a or j1b < j2b:
                val = first + ' ' + second
            else:
                val = second + ' ' + first
        except Exception:
            if '-' not in first:
                val = second + ' ' + first
            else:
                val = first + ' ' + second
        lis.append(val.replace(":", "__"))
    return lis
def importAgilentExpressionValues(filename,array,channel_to_extract): """ Imports Agilent Feature Extraction files for one or more channels """ print '.', red_expr_db={} green_expr_db={} parse=False fn=unique.filepath(filename) for line in open(fn,'rU').xreadlines(): data = UI.cleanUpLine(line) if parse==False: if 'ProbeName' in data: headers = string.split(data,'\t') pn = headers.index('ProbeName') try: gc = headers.index('gProcessedSignal') except Exception: pass try: rc = headers.index('rProcessedSignal') except Exception: pass parse = True else: t = string.split(data,'\t') probe_name = t[pn] try: green_channel = math.log(float(t[gc])+1,2) #min is 0 except Exception: pass try: red_channel = math.log(float(t[rc])+1,2) #min is 0 except Exception: pass if 'red' in channel_to_extract: red_expr_db[probe_name] = red_channel if 'green' in channel_to_extract: green_expr_db[probe_name] = green_channel if 'red' in channel_to_extract: red_channel_db[array] = red_expr_db if 'green' in channel_to_extract: green_channel_db[array] = green_expr_db
def genelist(fname):
    """Append an ordered 'junction junction' string per PSI row to the global ``lis``.

    The first line is skipped.  Columns 3 and 4 hold the two junctions.
    Their exon-block numbers are compared to decide which one is written
    first (the aim being to list the exclusion junction first); when the
    IDs cannot be parsed, the file order is kept unless column 3 lacks a
    '-', in which case the two are swapped.  Every ':' in the result is
    replaced by '__'.

    Returns ``lis``.
    """
    def leading_block(exon_id):
        # 'E17.2' -> 17
        return int(float(exon_id.split('.')[0][1:]))

    fname = unique.filepath(fname)
    past_header = False
    for record in open(fname, 'rU'):
        record = record.rstrip(os.linesep)
        if not past_header:
            past_header = True
            continue
        t = record.split('\t')
        junction_a, junction_b = t[2], t[3]
        try:
            ### Re-order these to have the exclusion be listed first
            j1a, j1b = junction_a.split('-')
            j2a, j2b = junction_b.split('-')
            j1a = leading_block(j1a.split(':')[1])
            j2a = leading_block(j2a.split(':')[1])
            j1b = leading_block(j1b)
            j2b = leading_block(j2b)
            keep_order = j1a > j2a or j1b < j2b
        except Exception:
            keep_order = '-' in junction_a
        if keep_order:
            val = junction_a + ' ' + junction_b
        else:
            val = junction_b + ' ' + junction_a
        lis.append(val.replace(":", "__"))
    return lis
def importPSIJunctions(fname):
    """Return the ordered reciprocal junction pairs listed in a PSI file.

    The first line is skipped.  Columns 3 and 4 of every other row hold the
    two junctions; each is converted to an event name by replacing ':'
    with '__'.  The exon-block numbers of the junctions decide which event
    comes first in the pair (the aim being to list the exclusion junction
    first).  When the junction IDs cannot be parsed, the file order is kept
    unless the first event lacks a '-', in which case the two are swapped.

    Fixes relative to the previous implementation: the first event is now
    built from the first junction (it was built from the second, giving
    pairs of two identical names), and both event names are created before
    the parsing attempt so the fallback branch can always use them.

    Returns a list of ``(event, event)`` tuples.
    """
    def exon_block(exon_id):
        # 'E17.2' -> 17
        return int(float(exon_id.split('.')[0][1:]))

    All_PSI_Reciprocol_Junctions = []
    fname = unique.filepath(fname)
    header = True
    with open(fname, 'rU') as psi_rows:
        for line in psi_rows:
            line = line.rstrip(os.linesep)
            if header:
                header = False
                continue
            t = line.split('\t')
            junction1 = t[2]
            junction2 = t[3]
            event1 = junction1.replace(":", "__")  ### first listed junction
            event2 = junction2.replace(":", "__")  ### second listed junction
            try:
                ### Re-order these to have the exclusion be listed first
                j1a, j1b = junction1.split('-')
                j2a, j2b = junction2.split('-')
                j1a = exon_block(j1a.split(':')[1])
                j2a = exon_block(j2a.split(':')[1])
                j1b = exon_block(j1b)
                j2b = exon_block(j2b)
                if j1a > j2a or j1b < j2b:
                    event_pair = event1, event2
                else:
                    event_pair = event2, event1
            except Exception:
                # unparseable IDs: keep file order unless the first entry is not a junction
                event_pair = event1, event2
                if '-' not in event1:
                    event_pair = event2, event1
            All_PSI_Reciprocol_Junctions.append(event_pair)
    return All_PSI_Reciprocol_Junctions
def importCircularRNAEvents(folder, circ_p):
    """Load the circRNA events below a p-value cutoff from each result file in *folder*.

    Every file whose name contains both 'circRNA.' and '.txt' is read.  The
    'PValue' and 'logFC' columns are located from the header row; each
    later row is turned into ``circInformation(id, pval, logFC)`` and kept
    when ``pval < circ_p``.

    Returns a dictionary mapping each file name minus its last four
    characters to the list of retained objects.
    """
    dataset_events = {}
    for file_name in unique.read_directory(folder):
        if 'circRNA.' not in file_name or '.txt' not in file_name:
            continue
        events = []
        fn = unique.filepath(folder + '/' + file_name)
        header_pending = True
        for raw in open(fn, 'rU'):
            t = cleanUpLine(raw).split('\t')
            if header_pending:
                header_pending = False
                # Standard Fields from MultiPath-PSI
                for position, column in enumerate(t):
                    if column == 'PValue':
                        pv = position
                    if column == 'logFC':
                        lf = position
                continue
            pval = float(t[pv])
            logFC = float(t[lf])
            ci = circInformation(t[0], pval, logFC)
            if pval < circ_p:
                events.append(ci)
        dataset_events[file_name[:-4]] = events
    return dataset_events
def reimportFeatures(featureFile):
    """ Import the exon and gene coordinates

    The first line is skipped.  The first field of each later line is a
    feature of the form ``ID=position``.  After ':' is replaced by '__',
    the feature is split on '__' and every piece containing "ENS" or
    '00000' yields a gene key (the second element of the piece split on
    '-' when it contains one).  The feature's ``PositionData`` is stored
    under ``gene_event_db[gene][featureID]``.

    Returns ``gene_event_db``.
    """
    gene_event_db = {}
    featureFile = unique.filepath(featureFile)
    header_skipped = False
    for row in open(featureFile, 'rU'):
        if not header_skipped:
            header_skipped = True
            continue
        # example event: ENSMUSG00000025915:E17.2-E17.5=chr1:9885753-9886047
        event = row.rstrip('\n').split('\t')[0].replace(':', '__')
        for piece in event.split('__'):
            if "ENS" not in piece and '00000' not in piece:
                continue
            if '-' in piece:
                gene = piece.split('-')[1]
            else:
                gene = piece
            ### store the feature (exon or junction) position and ID separately
            featureID, position = event.split('=')
            pd = PositionData(position)
            gene_event_db.setdefault(gene, {})[featureID] = pd
    return gene_event_db
def importExpressionValues(filename):
    """Import tab-delimited expression values, keyed by sample and then gene.

    The header line provides the column names.  For each following line,
    field 0 is the gene ID and every other field is recorded unchanged (a
    string) under the name of the column it sits in.

    Returns ``{sample_name: {gene: value}}``.
    """
    fn = unique.filepath(filename)
    sample_expression_db = {}
    column_names = None
    for text in open(fn, 'rU'):
        cells = UI.cleanUpLine(text).split('\t')
        if column_names is None:
            column_names = cells
            continue
        gene = cells[0]
        column = 1
        for value in cells[1:]:
            per_gene = sample_expression_db.setdefault(column_names[column], {})
            per_gene[gene] = value
            column += 1
    return sample_expression_db
def reimportFeatures(featureFile):
    """ Import the exon and gene coordinates

    Skips the first line.  Each later line's first field is a feature
    written as ``ID=position``.  With ':' replaced by '__' the feature is
    split on '__'; each piece containing "ENS" or '00000' gives a gene key
    (for a piece with a '-', the second element of its split on '-').  A
    ``PositionData`` built from the position is stored under
    ``gene_event_db[gene][featureID]``.

    Returns ``gene_event_db``.
    """
    gene_event_db = {}
    featureFile = unique.filepath(featureFile)
    lines_seen = 0
    for text in open(featureFile, 'rU'):
        lines_seen += 1
        if lines_seen == 1:
            continue
        # example event: ENSMUSG00000025915:E17.2-E17.5=chr1:9885753-9886047
        event = text.rstrip('\n').split('\t')[0]
        event = event.replace(':', '__')
        for segment in event.split('__'):
            if not ("ENS" in segment or '00000' in segment):
                continue
            gene = segment.split('-')[1] if '-' in segment else segment
            ### store the feature (exon or junction) position and ID separately
            featureID, position = event.split('=')
            pd = PositionData(position)
            if gene not in gene_event_db:
                gene_event_db[gene] = {}
            gene_event_db[gene][featureID] = pd
    return gene_event_db
def genelist(fname):
    """Append '<column 3> <column 4>' of every line of *fname* to the global ``lis``.

    No header line is skipped.  Returns ``lis``.
    """
    fname = unique.filepath(fname)
    for row in open(fname, 'rU'):
        columns = row.rstrip(os.linesep).split('\t')
        lis.append(columns[2] + ' ' + columns[3])
    return lis
def verifyFileLength(filename):
    """Count the lines of *filename*, stopping at ten.

    Returns the number of lines read (0 to 10).  Any exception while
    resolving, opening or reading the file is swallowed and the count
    reached so far is returned.
    """
    lines_seen = 0
    try:
        for _line in open(unique.filepath(filename), 'rU'):
            lines_seen += 1
            if lines_seen >= 10:
                break
    except Exception:
        pass
    return lines_seen
def extractFeatures(species,countsFileDir):
    """Write a 'features.' file listing the feature IDs and coordinates for a counts file.

    The output path is *countsFileDir* with 'counts.' replaced by
    'features.'.  It receives, in order:
      1. the first field of every data row of the counts file;
      2. when none of those IDs lacks a '-' before its '=', one
         ``gene:exon=chr:start-end`` line per row of the species'
         ``_Ensembl_exon.txt``;
      3. one whole-gene pseudo junction ``gene:E1.1-E100.1=chr:start-end``
         per gene, built from the start of the gene's first exon row and the
         end of its last exon row, with the two positions swapped for
         strand '-'.
    Pseudo junctions are limited to genes seen in junction IDs of the
    counts file, unless no such gene was seen at all.

    Returns the path of the features file.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source -- confirm the nesting against the upstream file.
    """
    import export
    ExonsPresent=False
    lastgene = None
    lastend = None
    genes_detected={}
    count=0
    first_last_exons = {} ### Make a fake junction comprised of the first and last exon
    if 'counts.' in countsFileDir:
        ### The feature_file contains only ExonID or Gene IDs and associated coordinates
        feature_file = string.replace(countsFileDir,'counts.','features.')
        fe = export.ExportFile(feature_file)
        firstLine = True
        for line in open(countsFileDir,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                feature_info = string.split(line,'\t')[0]
                fe.write(feature_info+'\n')
                junction_annotation = string.split(feature_info,'=')[0]
                if '-' in junction_annotation:
                    ### IDs with a '-' are treated as junctions; remember their gene
                    geneid = string.split(junction_annotation,':')[0]
                    genes_detected[geneid]=[]
                if ExonsPresent == False:
                    exon = string.split(feature_info,'=')[0]
                    if '-' not in exon:
                        ExonsPresent = True
        ### Add exon-info if necessary
        exons_file = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
        firstLine = True
        for line in open(exons_file,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                line = line.rstrip('\n')
                t = string.split(line,'\t')
                gene,exon,chr,strand,start,end = t[:6]
                if gene!=lastgene:
                    ### A new gene begins: open its entry and close the previous gene's entry
                    if len(genes_detected)==0 or gene in genes_detected: ### restrict to detected genes
                        first_last_exons[gene,strand] = [(chr,start)]
                    if len(genes_detected)==0 or lastgene in genes_detected: ### restrict to detected genes
                        try: first_last_exons[lastgene,laststrand].append(lastend)
                        except Exception: pass ### occurs for the first gene
                if ExonsPresent == False:
                    fe.write(gene+':'+exon+'='+chr+':'+start+'-'+end+'\n')
                lastgene = gene; lastend = end; laststrand = strand
        ### Close the entry of the final gene in the file
        if len(genes_detected)==0 or lastgene in genes_detected:
            first_last_exons[lastgene,laststrand].append(lastend)
        ### Add a fake junction for the whole gene
        for (gene,strand) in first_last_exons:
            (chr,start),end = first_last_exons[gene,strand]
            if strand == '-':
                start,end = end,start # Strand is encoded in this annotation by orienting the two positions
            fe.write(gene+':E1.1-E100.1'+'='+chr+':'+start+'-'+end+'\n')
        fe.close()
    return feature_file ### return the location of the exon and gene coordinates file
def filepath(filename):
    """Resolve *filename* to a usable path.

    ``unique.filepath`` is used when ``unique`` can be imported and the call
    succeeds.  Otherwise *filename* is returned unchanged if ``os.listdir``
    accepts it, and joined onto the directory of the ``dirfile`` module if
    it does not.
    """
    try:
        import unique ### local to AltAnalyze
        return unique.filepath(filename)
    except Exception:
        pass
    ### Should work fine when run as a script with this (AltAnalyze code is specific for packaging with AltAnalyze)
    script_dir = os.path.dirname(dirfile.__file__)
    try:
        os.listdir(filename) ### test to see if the path can be found (then it is the full path)
    except Exception:
        return os.path.join(script_dir, filename)
    return filename
def extractFeatures(species,countinp):
    """Write a 'features.' file listing the feature IDs and coordinates for a counts file.

    The output path is *countinp* with 'counts.' replaced by 'features.'.
    It receives, in order:
      1. the first field of every data row of the counts file;
      2. when none of those IDs lacks a '-' before its '=', one
         ``gene:exon=chr:start-end`` line per row of the species'
         ``_Ensembl_exon.txt``;
      3. one whole-gene pseudo junction ``gene:E1.1-E100.1=chr:start-end``
         per gene, built from the start of the gene's first exon row and the
         end of its last exon row.
    Pseudo junctions are limited to genes seen in junction IDs of the
    counts file, unless no such gene was seen at all.

    Returns the path of the features file.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source -- confirm the nesting against the upstream file.
    """
    import export
    ExonsPresent=False
    lastgene = None
    lastend = None
    genes_detected={}
    count=0
    first_last_exons = {} ### Make a fake junction comprised of the first and last exon
    if 'counts.' in countinp:
        feature_file = string.replace(countinp,'counts.','features.')
        fe = export.ExportFile(feature_file)
        firstLine = True
        for line in open(countinp,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                feature_info = string.split(line,'\t')[0]
                fe.write(feature_info+'\n')
                junction_annotation = string.split(feature_info,'=')[0]
                if '-' in junction_annotation:
                    ### IDs with a '-' are treated as junctions; remember their gene
                    geneid = string.split(junction_annotation,':')[0]
                    genes_detected[geneid]=[]
                if ExonsPresent == False:
                    exon = string.split(feature_info,'=')[0]
                    if '-' not in exon:
                        ExonsPresent = True
        ### Add exon-info if necessary
        exons_file = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
        firstLine = True
        for line in open(exons_file,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                line = line.rstrip('\n')
                t = string.split(line,'\t')
                gene,exon,chr,strand,start,end = t[:6]
                if gene!=lastgene:
                    ### A new gene begins: open its entry and close the previous gene's entry
                    if len(genes_detected)==0 or gene in genes_detected: ### restrict to detected genes
                        first_last_exons[gene] = [(chr,start)]
                    if len(genes_detected)==0 or lastgene in genes_detected: ### restrict to detected genes
                        try: first_last_exons[lastgene].append(lastend)
                        except Exception: pass ### occurs for the first gene
                if ExonsPresent == False:
                    fe.write(gene+':'+exon+'='+chr+':'+start+'-'+end+'\n')
                lastgene = gene; lastend = end
        ### Close the entry of the final gene in the file
        if len(genes_detected)==0 or lastgene in genes_detected:
            first_last_exons[lastgene].append(lastend)
        ### Add a fake junction for the whole gene
        for gene in first_last_exons:
            (chr,start),end = first_last_exons[gene]
            fe.write(gene+':E1.1-E100.1'+'='+chr+':'+start+'-'+end+'\n')
        fe.close()
    return feature_file
def importExonCoordinates(species):
    """ Import exon block, intron block and gene coordinates

    Reads the species' ``_Ensembl_exon.txt`` (exactly ten columns per data
    row).  Start and end positions are '|'-separated integers.  Rows are
    grouped into blocks named ``gene:<exon id up to the first '.'>``.

    Returns ``(gene_coordinates, exon_block_coordinates, gene_to_exons)``:
    the first two map a gene / block to
    ``(chr, strand, lowest position, highest position)``; the third maps a
    gene to the block name of each of its rows, in file order.
    """
    fn = unique.filepath('AltDatabase/ensembl/' + species + '/' + species + '_Ensembl_exon.txt')
    gene_coordinates = {}
    gene_to_exons = {}
    exon_block_coordinates = {}
    gene_chr_strand = {}
    header_pending = True
    for raw in open(fn, 'rU'):
        t = cleanUpLine(raw).split('\t')
        if header_pending:
            header_pending = False
            continue
        (gene, exonid, chr, strand, exon_region_starts, exon_region_ends,
         constitutive_call, ens_exon_ids, splice_events, splice_junctions) = t
        positions = [int(p) for p in exon_region_starts.split('|')]
        positions += [int(p) for p in exon_region_ends.split('|')]
        exon_block = gene + ':' + exonid.split('.')[0]
        gene_chr_strand[gene] = chr, strand
        gene_to_exons.setdefault(gene, []).append(exon_block)
        gene_coordinates.setdefault(gene, []).extend(positions)
        exon_block_coordinates.setdefault(exon_block, []).extend(positions)

    # Replace each position list by (chr, strand, lowest, highest)
    for gene in gene_coordinates:
        positions = gene_coordinates[gene]
        chr, strand = gene_chr_strand[gene]
        gene_coordinates[gene] = chr, strand, min(positions), max(positions)
    for exon in exon_block_coordinates:
        positions = exon_block_coordinates[exon]
        chr, strand = gene_chr_strand[exon.split(':')[0]]
        exon_block_coordinates[exon] = chr, strand, min(positions), max(positions)

    print(str(len(gene_coordinates)) + ' ' + 'genes')
    print(str(len(exon_block_coordinates)) + ' ' + 'exons/introns')
    return gene_coordinates, exon_block_coordinates, gene_to_exons
def update_plot_settings(bamdir, group_psi_values, sample_headers):
    """Write ``Config/sashimi_plot_settings.txt`` for SashimiPlot.

    For every sample index in every group of *group_psi_values* this
    records the BAM file name (the sample header with '.bed' replaced by
    '.bam'), a color shared by all samples of the group, and the sample's
    read depth taken from the global ``sampleReadDepth``.  These lists are
    written together with fixed plotting options.
    """
    palette = [
        "red", "blue", "green", "grey", "orange", "purple",
        "yellow", "peach", "pink", "violet", "magenta", "navy",
    ] * 300
    bams = []
    sample_colors = []
    sample_coverage = []
    for color_index, group in enumerate(group_psi_values):
        # the color index advances once per group, so a group shares one color
        for index in group_psi_values[group]:
            bams.append('"' + sample_headers[index].replace(".bed", ".bam") + '"')
            sample_colors.append('"' + palette[color_index] + '"')
            sample_coverage.append(str(int(sampleReadDepth[index])))
    bams = ",".join(bams)
    sample_colors = ",".join(sample_colors)
    sample_coverage = ",".join(sample_coverage)

    export_pl = open(unique.filepath("Config/sashimi_plot_settings.txt"), "w")
    export_pl.writelines([
        "[data]\n",
        "bam_prefix = " + bamdir + "\n",
        "bam_files =[" + bams + "]\n",
        "\n",
        "[plotting]",
        "\n",
        "fig_width = 7 \nfig_height = 7 \nintron_scale = 30 \nexon_scale = 4 \nlogged = False\n",
        "font_size = 6 \nbar_posteriors = False \nnyticks = 4 \nnxticks = 4 \n",
        "show_ylabel = False \nshow_xlabel = True \nshow_posteriors = False \nnumber_junctions = True \n",
        "resolution = .5 \nposterior_bins = 40 \ngene_posterior_ratio = 5 \n",
        "colors =[" + sample_colors + "]\n",
        "coverages =[" + sample_coverage + "]\n",
        'bar_color = "b" \nbf_thresholds = [0, 1, 2, 5, 10, 20]',
    ])
    export_pl.close()
def searchDirectory(directory, var, secondary=None):
    """Return the path of the first file in *directory* whose name contains *var*.

    When *secondary* is given, the name must contain it as well.  If no
    file qualifies, the path of the last file listed is returned instead;
    an empty directory therefore fails because no file name was ever bound.
    """
    directory = unique.filepath(directory)
    for file in unique.read_directory(directory):
        if var not in file:
            continue
        if secondary == None or secondary in file:
            return directory + '/' + file
    ### if all else fails
    return directory + '/' + file
def sample(fname):
    """Return the fields of the first line of *fname* from the twelfth onward.

    The line is passed through ``cleanUpLine`` and split on tabs; an empty
    file gives an empty list.
    """
    fname = unique.filepath(fname)
    samplelis = []
    for row in open(fname, 'rU'):
        header = cleanUpLine(row).split('\t')
        samplelis.extend(header[11:])
        break  # only the first line is of interest
    return samplelis
def searchDirectory(directory,var,secondary=None):
    """Find a file in *directory* by name fragments and return its path.

    The first file whose name contains *var* -- and *secondary*, when one
    is supplied -- wins.  Without any match the last file listed is
    returned; with an empty directory the final return fails because the
    loop variable was never bound.
    """
    directory = unique.filepath(directory)
    files = unique.read_directory(directory)
    for file in files:
        matches_primary = var in file
        if matches_primary and (secondary == None or secondary in file):
            return directory + '/' + file
    ### if all else fails
    return directory + '/' + file
def sample(fname):
    """Collect the header fields at index 11 and beyond from *fname*.

    Only the first line is used, after ``cleanUpLine`` and a split on tabs.
    Returns the list of those fields (empty for an empty file).
    """
    fname = unique.filepath(fname)
    samplelis = []
    for text in open(fname, 'rU'):
        columns = cleanUpLine(text).split('\t')
        samplelis += columns[11:]
        break  # later lines are not needed
    return samplelis
def update_plot_settings(bamdir, group_psi_values, sample_headers):
    """Write the SashimiPlot settings file ``Config/sashimi_plot_settings.txt``.

    Lists, for each sample index of each group in *group_psi_values*, the
    BAM name (sample header with '.bed' swapped for '.bam'), the group's
    color and the read depth from the global ``sampleReadDepth``, followed
    by fixed plotting options.
    """
    colors = [
        'red', 'blue', 'green', 'grey', 'orange', 'purple',
        'yellow', 'peach', 'pink', 'violet', 'magenta', 'navy',
    ] * 300

    quoted_bams = []
    quoted_colors = []
    coverages = []
    color_index = 0
    for group in group_psi_values:
        for index in group_psi_values[group]:
            bam_name = sample_headers[index].replace('.bed', '.bam')
            quoted_bams.append('"' + bam_name + '"')
            quoted_colors.append('"' + colors[color_index] + '"')
            coverages.append(str(int(sampleReadDepth[index])))
        color_index += 1  ### next group gets the next color

    bams = ','.join(quoted_bams)
    sample_colors = ','.join(quoted_colors)
    sample_coverage = ','.join(coverages)

    export_pl = open(unique.filepath('Config/sashimi_plot_settings.txt'), 'w')
    sections = (
        '[data]\n',
        'bam_prefix = ' + bamdir + '\n',
        'bam_files =[' + bams + ']\n',
        '\n',
        '[plotting]',
        '\n',
        'fig_width = 7 \nfig_height = 7 \nintron_scale = 30 \nexon_scale = 4 \nlogged = False\n',
        'font_size = 6 \nbar_posteriors = False \nnyticks = 4 \nnxticks = 4 \n',
        'show_ylabel = False \nshow_xlabel = True \nshow_posteriors = False \nnumber_junctions = True \n',
        'resolution = .5 \nposterior_bins = 40 \ngene_posterior_ratio = 5 \n',
        'colors =[' + sample_colors + ']\n',
        'coverages =[' + sample_coverage + ']\n',
        'bar_color = "b" \nbf_thresholds = [0, 1, 2, 5, 10, 20]',
    )
    for section in sections:
        export_pl.write(section)
    export_pl.close()
def genelist(fname):
    """List the junction pairs of *fname* and record a symbol per gene.

    For every line (none is skipped) the string '<column 3> <column 4>' is
    collected.  The text of column 3 before its first ':' is used as a gene
    key; the first time a key is seen, column 1 is stored for it in the
    global ``gene_sym``.

    Returns ``(lis, gene_sym)``.
    """
    fname = unique.filepath(fname)
    lis = []
    for raw in open(fname, 'rU'):
        t = cleanUpLine(raw).split('\t')
        gene = t[2].split(':')[0]
        lis.append(t[2] + ' ' + t[3])
        if gene not in gene_sym:
            gene_sym[gene] = t[0]
    return lis, gene_sym
def remoteSashimiPlot(species, fl, bamdir, genelis):
    """Run the Sashimi plotting for a dataset and prefix the PDFs with gene symbols.

    *fl* is first used as an object providing ``CountsFile()`` and
    ``RootDir()``; if that fails it is used as the root directory itself and
    the counts file is looked up in ``<root>/ExpressionInput`` (last file
    whose name contains 'counts.' but not 'steady-state.txt').

    Sets the globals ``inputpsi`` and ``outputdir``, creates the output
    directory when possible, calls ``Sashimiplottting`` and finally renames
    each '.pdf' in the output directory whose name, up to the first ':',
    is a key of the gene-symbol table to
    ``<symbol>-SashimiPlot_<remaining name>``.
    """
    global inputpsi
    global outputdir
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir + '/ExpressionInput'
        for candidate in unique.read_directory(search_dir):
            if 'counts.' in candidate and 'steady-state.txt' not in candidate:
                countinp = search_dir + '/' + candidate

    inputpsi = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt'
    outputdir = root_dir + '/SashimiPlots'
    try:
        os.mkdir(unique.filepath(outputdir))
    except Exception:
        pass  # creation is best effort, as in the original code

    Sashimiplottting(bamdir, countinp, inputpsi, genelis)

    gene_label, gene_sym = genelist(inputpsi)
    for filename in os.listdir(outputdir):
        if '.pdf' not in filename:
            continue
        gene_key = filename.split(':')[0]
        if gene_key not in gene_sym:
            continue
        new_filename = str(filename)
        # the first separator present decides how the name is cut
        for separator in (':', '\\', '/'):
            if separator in filename:
                new_filename = filename.split(separator)[1]
                break
        nnname = gene_sym[gene_key] + '-SashimiPlot_' + new_filename
        os.rename(os.path.join(outputdir, filename),
                  os.path.join(outputdir, nnname))
def genelist(fname):
    """Return the junction-pair labels of a tab-delimited file plus the gene symbol lookup.

    Each line (no header row is skipped) yields the label
    ``t[2] + ' ' + t[3]``.  The part of column 3 before its first ':' becomes
    a key of the module-level ``gene_sym`` dictionary, mapped to column 1;
    keys that already exist are left untouched.

    Returns ``(labels, gene_sym)``.
    """
    fname = unique.filepath(fname)
    lis = []
    ### Context manager closes the file deterministically; xreadlines() is deprecated
    with open(fname, 'rU') as handle:
        for line in handle:
            t = cleanUpLine(line).split('\t')
            gene = t[2].split(':')
            lis.append(t[2] + ' ' + t[3])
            ### setdefault only stores the symbol the first time a gene is seen
            gene_sym.setdefault(gene[0], t[0])
    return lis, gene_sym
### remoteSashimiPlot(species, fl, bamdir, genelis)
### Prepares the inputs for Sashimiplottting() and afterwards renames the PDFs found in outputdir.
###  - fl: its CountsFile() and RootDir() methods are tried first; if that raises, fl itself is used as the root directory.
###  - A counts file is searched for in <root_dir>/ExpressionInput: a name containing 'counts.' but not 'steady-state.txt'.
###    NOTE(review): this line has been whitespace-collapsed, so whether that search runs only in the fallback path cannot be confirmed from here.
###  - Sets the globals inputpsi (<root_dir>/AltResults/AlternativeOutput/<species>_RNASeq_top_alt_junctions-PSI.txt) and
###    outputdir (<root_dir>/SashimiPlots; the preceding '/ExonPlots' assignment is overwritten immediately).
###  - os.mkdir errors for outputdir are ignored.
###  - After plotting, every file name in outputdir containing '.pdf' whose text before the first '__' is a key of gene_sym
###    (as returned by genelist) is renamed to '<gene_sym value>-SashimiPlot_<second '__'-separated part>'.
def remoteSashimiPlot(species,fl,bamdir,genelis): global inputpsi global outputdir try: countinp = fl.CountsFile() root_dir = fl.RootDir() except Exception: root_dir = fl search_dir = root_dir+'/ExpressionInput' files = unique.read_directory(search_dir) for file in files: if 'counts.' in file and 'steady-state.txt' not in file: countinp = search_dir+'/'+file inputpsi = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt' #outputdir=findParentDir(inputpsi)+"sashimiplots" outputdir = root_dir+'/ExonPlots' outputdir = root_dir+'/SashimiPlots' try: os.mkdir(unique.filepath(outputdir)) except Exception: pass #print bamdir #print countinp #print inputpsi #print genelis Sashimiplottting(bamdir,countinp,inputpsi,genelis) gene_label,gene_sym=genelist(inputpsi) for filename in os.listdir(outputdir): if '.pdf' in filename: newname=string.split(filename,'__') if newname[0] in gene_sym: new_filename = str(filename) if '__' in filename: new_filename = string.split(filename,'__')[1] elif '\\' in filename: new_filename = string.split(filename,'\\')[1] elif '/' in filename: new_filename = string.split(filename,'/')[1] nnname=gene_sym[newname[0]]+'-SashimiPlot_'+new_filename os.rename(os.path.join(outputdir, filename), os.path.join(outputdir,nnname)) else: continue
def importAgilentExpressionValues(filename, array, channel_to_extract):
    """ Imports Agilent Feature Extraction files for one or more channels

    Lines are ignored until one containing 'ProbeName' is found; that line is
    the column header.  For every later line the 'gProcessedSignal' and
    'rProcessedSignal' values are converted to log2(value + 1) and stored per
    probe name.  The per-array results are saved in the module-level
    ``red_channel_db`` / ``green_channel_db`` dictionaries under ``array``.

    filename: feature extraction file, resolved through ``unique.filepath``.
    array: key under which this file's values are stored.
    channel_to_extract: container or string tested for 'red' and 'green'.
    """
    print('.'),  ### still the Python 2 print statement: the trailing comma suppresses the newline
    red_expr_db = {}
    green_expr_db = {}
    parse = False
    fn = unique.filepath(filename)
    ### 'with' closes the file even on error; direct iteration replaces the deprecated xreadlines()
    with open(fn, 'rU') as feature_file:
        for line in feature_file:
            data = UI.cleanUpLine(line)
            if not parse:
                if 'ProbeName' in data:
                    headers = data.split('\t')
                    pn = headers.index('ProbeName')
                    ### Either signal column may be absent (list.index raises ValueError)
                    try: gc = headers.index('gProcessedSignal')
                    except ValueError: pass
                    try: rc = headers.index('rProcessedSignal')
                    except ValueError: pass
                    parse = True
                continue
            t = data.split('\t')
            probe_name = t[pn]
            ### NOTE(review): when a value cannot be parsed the previous row's value is kept
            ### and stored for this probe (or a NameError is raised on the first data row).
            ### Behaviour left unchanged - confirm whether such probes should be skipped instead.
            try: green_channel = math.log(float(t[gc]) + 1, 2)  #min is 0
            except Exception: pass
            try: red_channel = math.log(float(t[rc]) + 1, 2)  #min is 0
            except Exception: pass
            if 'red' in channel_to_extract:
                red_expr_db[probe_name] = red_channel
            if 'green' in channel_to_extract:
                green_expr_db[probe_name] = green_channel
    if 'red' in channel_to_extract:
        red_channel_db[array] = red_expr_db
    if 'green' in channel_to_extract:
        green_channel_db[array] = green_expr_db
def importSplicingEvents(folder):
    """Import the splicing events of every PSI results file found in ``folder``.

    A file is read when its name contains both 'PSI.' and '.txt'.  The first
    row of each file is the header and is used to locate the standard
    MultiPath-PSI columns 'Event-Direction', 'ClusterID', 'AltExons' and
    'Coordinates'.  Each later row becomes one ``EventInformation`` object
    built from column 1 plus those four fields.

    Returns a dictionary mapping each file name minus its last four characters
    to the list of ``EventInformation`` objects read from that file.

    Raises KeyError on the first data row of a file whose header lacks one of
    the four required columns.
    """
    dataset_events = {}
    for file_name in unique.read_directory(folder):
        if 'PSI.' not in file_name or '.txt' not in file_name:
            continue
        events = []
        ### Rebuilt for every file so a column missing from this header can
        ### never silently reuse the position found in a previous file
        column = {}
        fn = unique.filepath(folder + '/' + file_name)
        with open(fn, 'rU') as psi_file:
            for row_number, line in enumerate(psi_file):
                t = cleanUpLine(line).split('\t')
                if row_number == 0:
                    ### Standard Fields from MultiPath-PSI (the last occurrence of a repeated name wins)
                    column = dict((name, position) for position, name in enumerate(t))
                    continue
                events.append(EventInformation(
                    t[0],
                    t[column['Event-Direction']],
                    t[column['ClusterID']],
                    t[column['AltExons']],
                    t[column['Coordinates']]))
        dataset_events[file_name[:-4]] = events
    return dataset_events
### download(url, dir, file_type) -> (output_filepath, status)
### Runs download_protocol(url, dir, file_type) and reports the values from dp.getStatus().
### If that raises, the call is retried once with dir resolved through unique.filepath(); if the retry also raises,
### output_filepath becomes 'failed', status becomes the "Internet connection not established..." message and fp is
### set to filepath(dir + <last '/'-separated part of url>).
### When status does not contain 'Internet' and fp contains '.zip', '.gz' or '.tar', fp is deleted with os.remove
### (status becomes 'removed' when that succeeds); for paths containing '.tar' the name with '.gz' removed is deleted too.
### Errors from os.remove are ignored.
### Also sets the global suppress_printouts from Suppress_Printouts, falling back to 'no' if that lookup raises.
### NOTE(review): this line has been whitespace-collapsed, so the exact nesting of the cleanup branches cannot be confirmed from here.
def download(url,dir,file_type): global suppress_printouts try: suppress_printouts = Suppress_Printouts except Exception: suppress_printouts = 'no' try: dp = download_protocol(url,dir,file_type); output_filepath, status = dp.getStatus(); fp = output_filepath except Exception: try: dir = unique.filepath(dir) ### Can result in the wrong filepath exported for AltDatabase RNA-Seq zip files (don't include by default) dp = download_protocol(url,dir,file_type); output_filepath, status = dp.getStatus(); fp = output_filepath except Exception: output_filepath='failed'; status = "Internet connection not established. Re-establish and try again." fp = filepath(dir+url.split('/')[-1]) ### Remove this empty object if saved if 'Internet' not in status: if '.zip' in fp or '.gz' in fp or '.tar' in fp: #print "\nRemoving zip file:",fp try: os.remove(fp); status = 'removed' except Exception: null=[] ### Not sure why this error occurs since the file is not open #print "\nRemoving zip file:",string.replace(fp,'.gz','') if '.tar' in fp: try: os.remove(string.replace(fp,'.gz','')) except Exception: null=[] return output_filepath, status
def eCLIPimport(folder):
    """Import the peaks of every '.bed' file found in ``folder``.

    Each line of a file whose name contains '.bed' becomes one
    ``PeakInformation`` object built from: column 1 (chromosome), columns 2
    and 3 converted to int (start, end), column 6 (strand), column 7
    (annotation), the text before the first '.' of column 9 (gene) and the
    second-to-last column (symbol).

    Returns a dictionary mapping each file name minus its last four characters
    to the list of ``PeakInformation`` objects read from that file.
    """
    eCLIP_dataset_peaks = {}
    for file_name in unique.read_directory(folder):
        if '.bed' not in file_name:
            continue
        peaks = []
        fn = unique.filepath(folder + '/' + file_name)
        ### 'with' closes the handle even if a row fails to parse; xreadlines() is deprecated
        with open(fn, 'rU') as bed_file:
            for line in bed_file:
                t = cleanUpLine(line).split('\t')
                chromosome = t[0]
                start = int(t[1])
                end = int(t[2])
                strand = t[5]
                annotation = t[6]
                gene = t[8].split('.')[0]
                symbol = t[-2]
                peaks.append(PeakInformation(chromosome, start, end, strand, annotation, gene, symbol))
        eCLIP_dataset_peaks[file_name[:-4]] = peaks
    return eCLIP_dataset_peaks
def _exon_block_number(exon_region):
    """Return the integer block of a region label: 'E17.2' -> 17."""
    return int(float(exon_region.split('.')[0][1:]))


def importPSIJunctions(fname):
    """Return the reciprocal junction pairs listed in a PSI results file.

    The first line is skipped as a header.  For every other line, columns 3
    and 4 hold the two junctions of an event; ':' is replaced by '__' in both
    and they are returned as a 2-tuple.  When both junctions parse as
    '<gene>:<region>-<region>', the first listed junction is placed first if
    its start block is greater than, or its end block is smaller than, that of
    the second junction; otherwise the order is swapped.  When a junction
    cannot be parsed that way, the listed order is kept unless the first
    junction contains no '-', in which case the order is swapped.

    Returns a list of ``(event, event)`` tuples, one per data row.
    """
    All_PSI_Reciprocol_Junctions = []
    fname = unique.filepath(fname)
    ### 'with' closes the handle deterministically; xreadlines() is deprecated
    with open(fname, 'rU') as psi_file:
        header = True
        for line in psi_file:
            line = line.rstrip(os.linesep)
            if header:
                header = False
                continue
            t = line.split('\t')
            junction1 = t[2]
            junction2 = t[3]
            ### Built before the try block so the fallback below always works on THIS row's
            ### junctions (previously it could hit a NameError or reuse the previous row's values)
            event1 = junction1.replace(':', '__')  ### first listed junction
            event2 = junction2.replace(':', '__')  ### second listed junction
            try:
                ### Re-order these to have the exclusion be listed first
                j1a, j1b = junction1.split('-')
                j2a, j2b = junction2.split('-')
                j1a = _exon_block_number(j1a.split(':')[1])
                j2a = _exon_block_number(j2a.split(':')[1])
                j1b = _exon_block_number(j1b)
                j2b = _exon_block_number(j2b)
                if j1a > j2a or j1b < j2b:
                    event_pair = event1, event2
                else:
                    event_pair = event2, event1
            except (ValueError, IndexError):
                ### Junction not of the form gene:region-region (no '-', no ':' or a non-numeric block)
                event_pair = event1, event2
                if '-' not in event1:
                    event_pair = event2, event1
            All_PSI_Reciprocol_Junctions.append(event_pair)
    return All_PSI_Reciprocol_Junctions
def update_plot_settings(bamdir,group_psi_values,sample_headers):
    """Write the sample order, colors and sequence coverage of each BAM file for SashimiPlot.

    For every sample index listed in ``group_psi_values`` this records the BAM
    file name (the sample header with '.bed' replaced by '.bam'), a colour and
    the integer read depth looked up in the module-level ``sampleReadDepth``.
    The settings are written to 'Config/sashimi_plot_settings.txt', resolved
    through ``unique.filepath``.

    bamdir: text written verbatim as the ``bam_prefix`` entry.
    group_psi_values: mapping of group -> iterable of sample indexes.
    sample_headers: sample names, indexable by those sample indexes.
    """
    bams = []
    sample_colors = []
    sample_coverage = []
    colors = ['red','blue','green','grey','orange','purple','yellow','peach','pink','violet','magenta','navy']
    color_index = 0
    for group in group_psi_values:
        ### Modulo cycling replaces the former 'colors*300' padding, so the palette cannot run out
        group_color = '"' + colors[color_index % len(colors)] + '"'
        for index in group_psi_values[group]:
            g = sample_headers[index].replace('.bed','.bam')
            bams.append('"' + g + '"')
            sample_colors.append(group_color)
            sample_coverage.append(str(int(sampleReadDepth[index])))
        color_index += 1  ### next colour for the new group
    bams = ','.join(bams)
    sample_colors = ','.join(sample_colors)
    sample_coverage = ','.join(sample_coverage)
    ### 'with' closes the settings file even if one of the writes fails
    with open(unique.filepath('Config/sashimi_plot_settings.txt'),'w') as export_pl:
        export_pl.write('[data]\n')
        export_pl.write('bam_prefix = '+bamdir+'\n')
        export_pl.write('bam_files =['+bams+']\n')
        export_pl.write('\n')
        export_pl.write('[plotting]')
        export_pl.write('\n')
        export_pl.write('fig_width = 7 \nfig_height = 7 \nintron_scale = 30 \nexon_scale = 4 \nlogged = False\n')
        export_pl.write('font_size = 6 \nbar_posteriors = False \nnyticks = 4 \nnxticks = 4 \n')
        export_pl.write('show_ylabel = False \nshow_xlabel = True \nshow_posteriors = False \nnumber_junctions = True \n')
        export_pl.write('resolution = .5 \nposterior_bins = 40 \ngene_posterior_ratio = 5 \n')
        export_pl.write('colors =['+sample_colors+']\n')
        export_pl.write('coverages =['+sample_coverage+']\n')
        export_pl.write('bar_color = "b" \nbf_thresholds = [0, 1, 2, 5, 10, 20]')
def reimportFeatures(featureFile):
    """Index the events of a feature file by the gene IDs found in their names.

    The first line is skipped as a header.  For every other line the first
    tab-delimited field is the event (example:
    ENSMUSG00000025915:E17.2-E17.5=chr1:9885753-9886047); ':' is replaced by
    '__' and the result is split on '__'.  Every part containing 'ENS' or
    '00000' names a gene: the part itself, or the text after its first '-'
    when it contains one.  The rewritten event is appended to that gene's list.

    Returns a dictionary mapping gene -> list of events (with '__' in place of ':').
    """
    gene_event_db = {}
    featureFile = unique.filepath(featureFile)
    ### 'with' closes the handle deterministically; xreadlines() is deprecated
    with open(featureFile, 'rU') as feature_handle:
        header = True
        for line in feature_handle:
            if header:
                header = False
                continue
            event = line.rstrip('\n').split('\t')[0]
            event = event.replace(':', '__')
            for part in event.split('__'):
                if 'ENS' in part or '00000' in part:
                    if '-' in part:
                        ### e.g. 'E3.1-ENSG...' : the gene ID follows the dash
                        gene = part.split('-')[1]
                    else:
                        gene = part
                    gene_event_db.setdefault(gene, []).append(event)
    return gene_event_db
max_len: define the upper limitation for the length of command string. A command string will be passed to R by a temporary file if it is longer than this value. use_numpy: Used as a boolean value. A False value will disable numpy even if it has been imported. use_dict: named list will be returned a dict if use_dict is True, otherwise it will be a list of tuples (name, value). host: The computer name (or IP) on which the R interpreter is installed. The value "localhost" means that the R locates on the the localhost computer. On POSIX systems (including Cygwin environment on Windows), it is possible to use R on a remote computer if the command "ssh" works. To do that, the user need set this value, and perhaps the parameter "user". user: The user name on the remote computer. This value need to be set only if the user name is different on the remote computer. In interactive environment, the password can be input by the user if prompted. If running in a program, the user need to be able to login without typing password! ssh: The program to login to remote computer. ''' if isinstance(Robj, basestring): Robj = R(RCMD=Robj, max_len=max_len, use_numpy=use_numpy, use_dict=use_dict, host=host, user=user, ssh=ssh) rlt = Robj.run(CMDS=CMDS) if len(rlt) == 1: rlt = rlt[0] return rlt if __name__ == '__main__': import unique path = unique.filepath("AltDatabase/R/Contents/MacOS/R") r = R(RCMD='R',use_numpy=True)
### remoteGene(gene, Species, root_dir, comparison_file)
### Draws an interactive pylab figure of the transcripts of one gene. Steps visible in the body:
###  1. Publishes the selected gene, the species and the annotation/result file locations as module globals
###     (Selected_Gene, species, ExonRegion_File, Transcript_Annotations_File, Prt_*_File, UniPrt_Regions_File,
###     SplicingIndex_File, platform, microRNA_File). The splicing-index file is searched for under
###     <root_dir>/AltResults/ProcessedSpliceData/ using the comparison file's name up to its first '.'.
###  2. Calls ProteinCentricIsoformView(Selected_Gene) and unpacks its eight results: junctions, protein boundaries,
###     protein domains, transcript_db, exon_db, splice_db, microRNA_db and domainAnnotation_db (a global).
###  3. Gives each distinct domain one of six RGB colours, cycling back to the first after the sixth.
###  4. Gives each transcript a Y offset (starting at 100, 300 apart) and adds Rectangle patches for its exons
###     ('Grey' when RegCall is "UC" or the splice_db lookup fails, 'blue' for a positive splicing index, 'red' for a
###     negative one), green patches for microRNA matches, white patches plus two arrows for each junction, and patches
###     for the protein span and its domains.
###  5. Records every drawn region in CoordsBank so format_coord() can return its label for the cursor position;
###     pick events print the picked artist's label.
### Bare except blocks swallow lookup failures, so an element with missing annotation is drawn grey/unlabelled or skipped.
### NOTE(review): the CoordsBank entry for a junction uses G_start/G_end left over from the exon loop rather than
### entry[0]/entry[1] -- confirm this is intended.
### NOTE(review): both branches of the color_flag % 2 test add the same patch; only the hover text differs ('Pval' vs 'p-value').
def remoteGene(gene,Species,root_dir,comparison_file): global Transcript_Annotations_File global ExonRegion_File global Selected_Gene global Prt_Trans_File global Prt_Regions_File global Prt_Boundaries_File global SplicingIndex_File global UniPrt_Regions_File global microRNA_File global domainAnnotation_db global platform global species Selected_Gene = str(gene) species = Species comparison_name = string.split(export.findFilename(comparison_file),'.')[0] ExonRegion_File = unique.filepath("AltDatabase/ensembl/"+species+"/"+species+"_Ensembl_exon.txt") Transcript_Annotations_File = unique.filepath("AltDatabase/ensembl/"+species+"/"+species+"_Ensembl_transcript-annotations.txt") Prt_Trans_File = searchDirectory("AltDatabase/ensembl/"+species+"/",'Ensembl_Protein') Prt_Regions_File = searchDirectory("AltDatabase/ensembl/"+species+"/",'ProteinFeatures') Prt_Boundaries_File = searchDirectory("AltDatabase/ensembl/"+species+"/",'ProteinCoordinates') UniPrt_Regions_File = searchDirectory("AltDatabase/uniprot/"+species+"/",'FeatureCoordinate') SplicingIndex_File = searchDirectory(root_dir+'/AltResults/ProcessedSpliceData/','splicing-index',secondary=comparison_name) platform = getPlatform(SplicingIndex_File) microRNA_File = searchDirectory("AltDatabase/"+species+"/"+platform,'microRNAs_multiple') #print(SplicingIndex_File) total_val = ProteinCentricIsoformView(Selected_Gene) junctions = total_val[0] p_boundaries = total_val[1] p_domains = total_val[2] transcript_db = total_val[3] exon_db = total_val[4] splice_db = total_val[5] microRNA_db = total_val[6] domainAnnotation_db = total_val[7] #for i in exon_db: # print("THE", i, exon_db[i], "\n") #for i in microRNA_db: # m_test = microRNA_db[i] # print(len(m_test)) # for q in m_test: # print("microRNA", q.ExonBlock(), q.Description(), q.BP(), "\n") #for i in exon_db["ENST00000349238"]: # print(i[2].EnsemblRegion()) domain_color_list = [] for i in p_domains: ploy = p_domains[i] for a in ploy: domain_color_list.append(a[1]) 
domain_color_list = list(set(domain_color_list)) domain_color_key = {} c_color1 = [0.8, 0.6, 0.1] c_color2 = [0.1, 0.6, 0.8] c_color3 = [0.6, 0.1, 0.8] c_color4 = [0.95, 0.6, 0.3] c_color5 = [0.3, 0.6, 0.95] c_color6 = [0.6, 0.3, 0.95] FLAG = 1 for item in domain_color_list: if(FLAG == 1): domain_color_key[item] = c_color1 FLAG = FLAG + 1 continue if(FLAG == 2): domain_color_key[item] = c_color2 FLAG = FLAG + 1 continue if(FLAG == 3): domain_color_key[item] = c_color3 FLAG = FLAG + 1 continue if(FLAG == 4): domain_color_key[item] = c_color4 FLAG = FLAG + 1 continue if(FLAG == 5): domain_color_key[item] = c_color5 FLAG = FLAG + 1 continue if(FLAG == 6): domain_color_key[item] = c_color6 FLAG = 1 continue #for i in domain_color_key: #print(i, domain_color_key[i], "\n") Y = 100 Transcript_to_Y = {} for transcript in transcript_db: Transcript_to_Y[transcript] = Y Y = Y + 300 import traceback def onpick(event): #ind = event.ind print(event.artist.get_label()) #for i in domainAnnotation_db: print(i,len(domainAnnotation_db));break fig = pylab.figure() ylim = Y + 200 currentAxis = pylab.gca() #ax = pylab.axes() ax = fig.add_subplot(111) X_Pos_List = [] CoordsBank = [] for transcript in transcript_db: try: Junc_List = junctions[transcript] y_pos = Transcript_to_Y[transcript] Gene_List = exon_db[transcript] color_flag = 1 for entry in Gene_List: G_start = entry[0][0] G_end = entry[0][1] Exon_Object = entry[2] try: LabelClass = splice_db[Exon_Object.EnsemblRegion()] ExonName = Exon_Object.EnsemblExon() RegCall = LabelClass.RegCall() SplicingIndex = LabelClass.SplicingIndex() PVal = LabelClass.PVal() Midas = LabelClass.Midas() Label = "\n" + "Exon: " + str(ExonName) + "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) + "\n" Label = string.replace(Label,"\n"," ") if(RegCall == "UC"): color_choice = "Grey" else: S_Int = float(SplicingIndex) if(S_Int > 0): #color_choice = (0.7, 
0.7, 0.99) color_choice = 'blue' if(S_Int < 0): #color_choice = (0.8, 0.4, 0.4) color_choice = 'red' except: #print(traceback.format_exc());sys.exit() Label = "" color_choice = "Grey" #print("Start", G_start, "end", G_end, "Region", entry[2].EnsemblRegion()) if((color_flag % 2) == 0): currentAxis.add_patch(Rectangle((G_start, y_pos), (G_end - G_start), 50, color = color_choice, label = (entry[2].EnsemblRegion() + Label), picker = True)) y_end = y_pos + 50 try: CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: '+entry[2].EnsemblRegion()+' '+ 'SI: '+str(SplicingIndex)[:4]+' Pval: '+str(Midas)[:4])) except Exception: CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: '+entry[2].EnsemblRegion())) #print(entry[2].EnsemblRegion(),y_pos,y_end) if((color_flag % 2) != 0): currentAxis.add_patch(Rectangle((G_start, y_pos), (G_end - G_start), 50, color = color_choice, label = (entry[2].EnsemblRegion() + Label), picker = True)) y_end = y_pos + 50 try: CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: '+entry[2].EnsemblRegion()+' '+ 'SI: '+str(SplicingIndex)[:4]+' p-value: '+str(Midas)[:4])) except Exception: CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: '+entry[2].EnsemblRegion())) #print(entry[2].EnsemblRegion(),y_pos,y_end) color_flag = color_flag + 1 if(entry[2].EnsemblRegion() in microRNA_db): microRNA_object = microRNA_db[entry[2].EnsemblRegion()] mr_label = "MICRORNA MATCHES" + "\n" for class_object in microRNA_object: mr_exonname = class_object.ExonBlock() mr_desc = class_object.Description() + " " + class_object.Algorithms() #print(mr_desc) mr_label = mr_label + mr_desc + "\n" currentAxis.add_patch(Rectangle((G_start, (y_pos - 75)), (G_end - G_start), 40, color = "Green", label = (mr_label), picker = True)) y_start = y_pos - 75 y_end = y_pos - 35 CoordsBank.append((G_start, G_end, y_start, y_end, mr_desc)) for entry in Junc_List: junctionID = entry[-1] try: LabelClass = splice_db[entry[2]] RegCall = LabelClass.RegCall() SplicingIndex = 
LabelClass.SplicingIndex() PVal = LabelClass.PVal() Midas = LabelClass.Midas() Label = "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) + "\n" if(float(SplicingIndex) > 0): color_junc = "blue" if(float(SplicingIndex) < 0): color_junc = "red" if(RegCall == "UC"): color_junc = "grey" except: Label = "" color_junc = "grey" currentAxis.add_patch(Rectangle((entry[0], y_pos), (entry[1] - entry[0]), 50, color = "White", label = (str(entry[2]) + Label), picker = True)) ax.arrow(entry[0], (y_pos+50), 8, 40, label = (str(entry[2]) + Label), color = color_junc, picker = True) ax.arrow((entry[0] + 8), (y_pos+90), 11, -40, label = (str(entry[2]) + Label), color = color_junc, picker = True) y_start = y_pos y_end = y_pos + 30 #print(junctionID,y_start,y_end) CoordsBank.append((G_start, G_end, y_start, y_end, junctionID)) try: P_Bound_List = p_boundaries[transcript] E_Start = P_Bound_List[-2] E_End = P_Bound_List[-1] P_Start = P_Bound_List[1] P_End = P_Bound_List[2] #print("Boundaries: ", P_Start, P_End) X_Pos_List.append(int(E_End)) #currentAxis.add_patch(Rectangle((E_Start, y_pos), E_End, 50, color = "Blue")) try: currentAxis.add_patch(Rectangle((P_Start, (y_pos + 120)), (P_End - P_Start), 10)) except: pass p_label_list = ["DEF"] #CoordsBank.append((P_Start, P_End, y_pos, P_End - P_Start, transcript)) ### Added by NS - needs work try: P_Domain_List = p_domains[transcript] except Exception: P_Domain_List=[] for entry in P_Domain_List: #print("Domain", entry) color_domain_choice = domain_color_key[entry[1]] domain_annotation = domainAnnotation_db[entry[1]] #domain_annotation = string.replace(domain_annotation,'REGION-','') p_label = (str(entry[0]) + " " + str(domain_annotation)) #print(entry[0], entry[2], entry[3], P_Start, P_End, domain_annotation, ) Repeat_Flag = 0 for i in p_label_list: if(p_label == i): Repeat_Flag = 1 if(Repeat_Flag == 1): continue 
p_label_list.append(p_label) currentAxis.add_patch(Rectangle((entry[2], y_pos + 100), (entry[3] - entry[2]), 50, color = color_domain_choice, label= p_label, picker = True)) y_start = y_pos + 100 y_end = y_pos + 150 CoordsBank.append((entry[2], entry[3], y_start, y_end, p_label)) except Exception: pass #print(traceback.format_exc()) except: #print(traceback.format_exc()) pass pylab.ylim([0.0, ylim]) try: max_x = max(X_Pos_List) except: max_x = 5000 try: pylab.xlim([0.0, max_x]) except: pylab.xlim([0.0, 3000]) fig.canvas.mpl_connect('pick_event', onpick) def format_coord(x, y): for m in CoordsBank: if(x >= m[0] and x <= m[1] and y >= m[2] and y <= m[3]): string_display = m[4] return string_display string_display = " " return string_display ax.format_coord = format_coord #datacursor(hover=True, formatter='{label}'.format, bbox=dict(fc='yellow', alpha=1), arrowprops=None) pylab.show()
### remoteSashimiPlot(Species, fl, bamdir, eventsToVisualizeFilename, events=None, show=False)
### Prepares the inputs for Sashimiplottting() and afterwards renames the plots found in outputdir.
###  - fl: its CountsFile() and RootDir() methods are tried first; if that raises, fl itself is used as the root directory.
###  - A counts file is searched for in <root_dir>/ExpressionInput: a name containing "counts." but not "steady-state.txt".
###    NOTE(review): these lines have been whitespace-collapsed, so whether that search runs only in the fallback path cannot be confirmed from here.
###  - Sets the globals species, root_dir, PSIFilename (<root_dir>/AltResults/AlternativeOutput/<species>_RNASeq_top_alt_junctions-PSI.txt),
###    steady_state_exp_file (the "exp." file containing "steady-state"), sample_group_db (ExpressionBuilder.simplerGroupImport of the
###    other "exp." file) and outputdir (<root_dir>/SashimiPlots; the preceding "/ExonPlots" assignment is overwritten immediately).
###  - os.mkdir errors for outputdir are ignored; "TRUE" or "FALSE" is written to <outputdir>/show.txt according to show.
###  - After plotting, every ".pdf"/".png" in outputdir whose name (extension removed) starts, before the first "__", with a key of
###    the dictionary returned by Sashimiplottting is renamed to "<first item of that entry>-SashimiPlot_<second "__"-separated part>".
###    If the rename fails and the traceback text contains "already exists", the existing target is removed and the rename retried;
###    any other failure is ignored.
def remoteSashimiPlot(Species, fl, bamdir, eventsToVisualizeFilename, events=None, show=False): global PSIFilename global outputdir global root_dir global steady_state_exp_file global species species = Species try: countinp = fl.CountsFile() root_dir = fl.RootDir() except Exception: root_dir = fl search_dir = root_dir + "/ExpressionInput" files = unique.read_directory(search_dir) for file in files: if "counts." in file and "steady-state.txt" not in file: countinp = search_dir + "/" + file PSIFilename = root_dir + "/AltResults/AlternativeOutput/" + species + "_RNASeq_top_alt_junctions-PSI.txt" import ExpressionBuilder dir_list = unique.read_directory(root_dir + "/ExpressionInput") for file in dir_list: if "exp." in file and "steady-state" not in file: exp_file = root_dir + "/ExpressionInput/" + file elif "exp." in file and "steady-state" in file: steady_state_exp_file = root_dir + "/ExpressionInput/" + file global sample_group_db sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file) # outputdir=findParentDir(PSIFilename)+"sashimiplots" outputdir = root_dir + "/ExonPlots" outputdir = root_dir + "/SashimiPlots" try: os.mkdir(unique.filepath(outputdir)) except Exception: pass if show: s = open(outputdir + "/show.txt", "w") s.write("TRUE") s.close() else: s = open(outputdir + "/show.txt", "w") s.write("FALSE") s.close() geneSymbol_db = Sashimiplottting(bamdir, countinp, PSIFilename, eventsToVisualizeFilename, events=events) for filename in os.listdir(outputdir): if ".pdf" in filename or ".png" in filename: fn = string.replace(filename, ".pdf", "") fn = string.replace(fn, ".png", "") newname = string.split(fn, "__") if newname[0] in geneSymbol_db: new_filename = str(filename) if "__" in filename: new_filename = string.split(filename, "__")[1] elif "\\" in filename: new_filename = string.split(filename, "\\")[1] elif "/" in filename: new_filename = string.split(filename, "/")[1] nnname = geneSymbol_db[newname[0]][0] + "-SashimiPlot_" + new_filename try: 
os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: if "already exists" in traceback.format_exc(): ### File already exists, delete the new one try: os.remove(os.path.join(outputdir, nnname)) except Exception: pass ### Now right the new one try: os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: pass pass else: continue print ""
### remoteGene(gene)
### Draws a matplotlib (plt) figure of the transcripts of one gene using the fixed human files under ExonViewFiles/.
### Steps visible in the body:
###  1. Publishes the selected gene and the six input file locations as module globals (Selected_Gene, ExonRegion_File,
###     Transcript_Annotations_File, Prt_Trans_File, Prt_Regions_File, Prt_Boundaries_File, Etc_File).
###  2. Calls ProteinCentricIsoformView(Selected_Gene) and unpacks its first six results: junctions, protein boundaries,
###     protein domains, transcript_db, exon_db and splice_db.
###  3. Gives each distinct domain one of six RGB colours, cycling back to the first after the sixth, and prints the mapping.
###  4. Gives each transcript a Y offset (starting at 50, 200 apart) and adds Rectangle patches for its exons
###     ("Grey" when RegCall is "UC" or the splice_db lookup fails, light blue for a positive splicing index, light red
###     for a negative one), white patches plus two arrows per junction, and patches for the protein span and its domains.
###  5. Sets the axis limits (x falls back to 5000 when no protein end coordinate was collected), enables hover labels
###     through datacursor() and shows the figure.
### A transcript for which any lookup in the outer try block fails is skipped (bare except: continue).
### NOTE(review): both branches of the color_flag % 2 test add an identical patch.
def remoteGene(gene): global Transcript_Annotations_File global ExonRegion_File global Selected_Gene global Prt_Trans_File global Prt_Regions_File global Prt_Boundaries_File global Etc_File import unique Selected_Gene = gene ExonRegion_File = unique.filepath("ExonViewFiles/Hs_Ensembl_exon.txt") Transcript_Annotations_File = unique.filepath("ExonViewFiles/Hs_Ensembl_transcript-annotations.txt") Prt_Trans_File = unique.filepath("ExonViewFiles/Hs_Ensembl_Protein__65_37.txt") Prt_Regions_File = unique.filepath("ExonViewFiles/Hs_ProteinFeatures_build_65_37.txt") Prt_Boundaries_File = unique.filepath("ExonViewFiles/Hs_ProteinCoordinates_build_65_37.tab") Etc_File = unique.filepath("ExonViewFiles/Hs_RNASeq_K562_SRSF2_P95mut_vs_K562_SRSF2_WT.ExpCutoff-5.0_average-splicing-index-ProcessedSpliceData.txt") #"ENSG00000005801" #"ENSG00000110514" total_val = ProteinCentricIsoformView(Selected_Gene) junctions = total_val[0] p_boundaries = total_val[1] p_domains = total_val[2] transcript_db = total_val[3] exon_db = total_val[4] splice_db = total_val[5] #for i in exon_db["ENST00000349238"]: # print(i[2].EnsemblRegion()) domain_color_list = [] for i in p_domains: ploy = p_domains[i] for a in ploy: domain_color_list.append(a[1]) domain_color_list = list(set(domain_color_list)) domain_color_key = {} c_color1 = [0.8, 0.6, 0.1] c_color2 = [0.1, 0.6, 0.8] c_color3 = [0.6, 0.1, 0.8] c_color4 = [0.95, 0.6, 0.3] c_color5 = [0.3, 0.6, 0.95] c_color6 = [0.6, 0.3, 0.95] FLAG = 1 for item in domain_color_list: if(FLAG == 1): domain_color_key[item] = c_color1 FLAG = FLAG + 1 continue if(FLAG == 2): domain_color_key[item] = c_color2 FLAG = FLAG + 1 continue if(FLAG == 3): domain_color_key[item] = c_color3 FLAG = FLAG + 1 continue if(FLAG == 4): domain_color_key[item] = c_color4 FLAG = FLAG + 1 continue if(FLAG == 5): domain_color_key[item] = c_color5 FLAG = FLAG + 1 continue if(FLAG == 6): domain_color_key[item] = c_color6 FLAG = 1 continue for i in domain_color_key: print(i, domain_color_key[i], 
"\n") Y = 50 Transcript_to_Y = {} for transcript in transcript_db: Transcript_to_Y[transcript] = Y Y = Y + 200 import traceback ylim = Y + 200 currentAxis = plt.gca() ax = plt.axes() X_Pos_List = [] for transcript in transcript_db: try: Junc_List = junctions[transcript] y_pos = Transcript_to_Y[transcript] Gene_List = exon_db[transcript] color_flag = 1 for entry in Gene_List: G_start = entry[0][0] G_end = entry[0][1] Exon_Object = entry[2] try: LabelClass = splice_db[Exon_Object.EnsemblRegion()] ExonName = Exon_Object.EnsemblExon() RegCall = LabelClass.RegCall() SplicingIndex = LabelClass.SplicingIndex() PVal = LabelClass.PVal() Midas = LabelClass.Midas() Label = "\n" + "Exon: " + str(ExonName) + "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) if(RegCall == "UC"): color_choice = "Grey" else: S_Int = float(SplicingIndex) if(S_Int > 0): color_choice = (0.7, 0.7, 0.99) if(S_Int < 0): color_choice = (0.8, 0.4, 0.4) except: #print(traceback.format_exc());sys.exit() Label = "" color_choice = "Grey" #print("Start", G_start, "end", G_end, "Region", entry[2].EnsemblRegion()) if((color_flag % 2) == 0): currentAxis.add_patch(Rectangle((G_start, y_pos), (G_end - G_start), 50, color = color_choice, label = (entry[2].EnsemblRegion() + Label))) if((color_flag % 2) != 0): currentAxis.add_patch(Rectangle((G_start, y_pos), (G_end - G_start), 50, color = color_choice, label = (entry[2].EnsemblRegion() + Label))) color_flag = color_flag + 1 for entry in Junc_List: try: LabelClass = splice_db[entry[2]] RegCall = LabelClass.RegCall() SplicingIndex = LabelClass.SplicingIndex() PVal = LabelClass.PVal() Midas = LabelClass.Midas() Label = "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) if(float(SplicingIndex) > 0): color_junc = "blue" if(float(SplicingIndex) < 0): color_junc = "red" 
if(RegCall == "UC"): color_junc = "grey" except: Label = "" color_junc = "grey" currentAxis.add_patch(Rectangle((entry[0], y_pos), (entry[1] - entry[0]), 50, color = "White", label = (str(entry[2]) + Label))) ax.arrow(entry[0], (y_pos+50), 8, 40, label = (str(entry[2]) + Label), color = color_junc) ax.arrow((entry[0] + 8), (y_pos+90), 11, -40, label = (str(entry[2]) + Label), color = color_junc) P_Bound_List = p_boundaries[transcript] P_Domain_List = p_domains[transcript] E_Start = P_Bound_List[-2] E_End = P_Bound_List[-1] P_Start = P_Bound_List[1] P_End = P_Bound_List[2] #print("Boundaries: ", P_Start, P_End) X_Pos_List.append(int(E_End)) #currentAxis.add_patch(Rectangle((E_Start, y_pos), E_End, 50, color = "Blue")) try: currentAxis.add_patch(Rectangle((P_Start, (y_pos + 120)), (P_End - P_Start), 10, label = ("Protein: " + str(P_Bound_List[0])))) except: pass for entry in P_Domain_List: #print("Domain", entry) color_domain_choice = domain_color_key[entry[1]] currentAxis.add_patch(Rectangle((entry[2], y_pos + 100), (entry[3] - entry[2]), 50, color = color_domain_choice, label= ("Protein: " + str(entry[0]) + "\n" + "Domain: " + str(entry[1])))) except: continue plt.ylim([0.0, ylim]) try: max_x = max(X_Pos_List) except: max_x = 5000 try: plt.xlim([0.0, max_x]) except: plt.xlim([0.0, 3000]) datacursor(hover=True, formatter='{label}'.format, bbox=dict(fc='yellow', alpha=1), arrowprops=None) plt.show()
### sashmi_plot_list(bamdir, fname, gene_label, lines, samp, gene_sym)
### Reads the events file fname and collects "junction1 junction2" strings in splicing_events. The layout is chosen from
### the first row: a 'junctionID-1' column -> 'ASPIRE', an 'ANOVA' column -> 'PSI', an 'independent confirmation'
### column -> 'confirmed', 'ANOVA' in the file name -> 'ANOVA'; a first field that already contains both ' ' and ':'
### is used as-is. Rows that do not fit the chosen layout are skipped silently.
### Referencing the undefined name forceNoCompatibleEventsInFile when nothing was collected aborts with a NameError.
### Each event without ":U" / "-U" is then looked up in gene_label; the matching entry of lines (fields from index 11 on)
### splits the sample positions into values below 0.8 (list1) and the rest (list2), each cut to elements 1-4 when it
### holds more than five. If that lookup fails, list1 receives every index of samp instead. The plot settings are then
### rewritten and ssp.plot_event() is called for the junction that contains '-', with ':' replaced by '__'; plotting
### errors are skipped. Finally, files in outputdir whose name is a key of gene_sym are renamed with the symbol as prefix.
### NOTE(review): relies on the module globals dem, inputpsi and outputdir.
### NOTE(review): update_plot_settings is called here with four arguments -- confirm against its definition.
### NOTE(review): os.rename is given bare file names rather than paths inside outputdir -- confirm intended.
### NOTE(review): these lines have been whitespace-collapsed, so the exact nesting of the layout tests cannot be confirmed from here.
def sashmi_plot_list(bamdir,fname,gene_label,lines,samp,gene_sym): splicing_events=[] type = None firstLine = True for line in open(fname,'rU').xreadlines(): line = cleanUpLine(line) t = string.split(line,'\t') if firstLine: if 'junctionID-1' in t: j1i = t.index('junctionID-1') j2i = t.index('junctionID-2') type='ASPIRE' if 'ANOVA' in t: type='PSI' elif 'independent confirmation' in t: type='confirmed' elif 'ANOVA' in fname: type = 'ANOVA' firstLine=False if ' ' in t[0] and ':' in t[0]: splicing_events.append(t[0]) elif type=='ASPIRE': splicing_events.append(t[j1i] +' '+ t[j2i]) elif type=='ANOVA': try: a,b = string.split(t[0],'|') a = string.split(a,':') a = string.join(a[1:],':') splicing_events.append(a +' '+ b) except Exception: pass elif type=='PSI': try: j1,j2 = string.split(t[0],'|') a,b,c = string.split(j1,':') j1 = b+':'+c splicing_events.append(j1 +' '+ j2) except Exception: #print traceback.format_exc();sys.exit() pass elif type=='confirmed': try: event_pair1 = string.split(t[1],'|')[0] a,b,c,d = string.split(event_pair1,'-') splicing_events.append(a+'-'+b +' '+ c+'-'+d) except Exception: pass if len(splicing_events)==0: forceNoCompatibleEventsInFile print 'Exporting plots', for li in splicing_events: if ":U" in li or "-U" in li: continue else: li=cleanUpLine(li) #print li #dem[0]=['ENSG00000132424:I10.1 ENSG00000132424:E10.1-E11.1','ENSG00000146147:E10.3-E11.1 ENSG00000146147:E9.3-E15.1'] de=string.split(li,'\t') dem[0]=de #print dem[0] for key in dem: for i in range(len(dem[key])): list1=[] list2=[] try: k=gene_label.index(dem[key][i]) flag=1 lt=cleanUpLine(lines[k]) t=string.split(lt,'\t') #print t t=t[11:] #print t #list3=[] #ind=[] for x in range(len(t)): #print x,t[x] if(t[x]!=''): if float(t[x]) < 0.8: list1.append(x) #print x #print 'list1:'+str(x) else: list2.append(x) #print x # print str(x) else: continue if len(list1)>5: list1=list1[1:5] if len(list2)>5: list2=list2[1:5] #print len(list1),len(list2) except Exception: for ij in 
range(len(samp)): list1.append(ij) update_plot_settings(bamdir,list1,list2,samp) a=string.split(dem[key][i]," ") if '-' in a[1]: ch1=a[1] f=string.split(a[0],':') else: ch1=a[0] f=string.split(a[1],':') event=findParentDir(inputpsi) event=event+"trial_index/" setting =unique.filepath("Config/sashimi_plot_settings.txt") try: ch1=string.replace(ch1,':','__') except Exception: pass name=ch1 #outputdir=findParentDir(inputpsi)+"sashimiplots" try: os.makedirs(outputdir) except Exception: pass #print '********',[ch1],[event],outputdir try: ssp.plot_event(ch1,event,setting,outputdir) except Exception: #print '^^^^^^^^^^^^',[ch1],[event],outputdir;sys.exit() #print traceback.format_exc() #print "error2" #sys.exit() continue #outputdir=findParentDir(inputpsi)+"sashimiplots" for filename in os.listdir(outputdir): newname=string.split(filename,'/') #print newname[0] if newname[0] in gene_sym: new_path = gene_sym[newname[0]]+'-'+filename #new_path = string.replace() os.rename(filename,new_path) else: continue
def filepath(filename):
    """Resolve ``filename`` through ``unique.filepath`` and return the result."""
    return unique.filepath(filename)
def formatAndSubmitSplicingEventsToSashimiPlot( filename, bamdir, splicing_events, sample_group_db, groups, expandedSearch ): ### Begin exporting parameters and events for SashimiPlot visualization firstLine = True setting = unique.filepath("Config/sashimi_plot_settings.txt") psi_parent_dir = findParentDir(filename) if "PSI" not in filename: index_dir = string.split(psi_parent_dir, "ExpressionInput")[0] + "AltResults/AlternativeOutput/sashimi_index/" else: index_dir = psi_parent_dir + "sashimi_index/" spliced_junctions = [] ### Alternatively, compare to just one of the junctions for splicing_event in splicing_events: try: j1, j2 = string.split(splicing_event, " ") spliced_junctions.append(j1) spliced_junctions.append(j2) except Exception: spliced_junctions.append(splicing_event) ### single gene ID or junction if "PSI" not in filename: splicing_events_db = {} for event in splicing_events: event = string.replace(event, ":", "__") if " " in event: event = string.split(event, " ")[-1] gene = string.split(event, "__")[0] try: splicing_events_db[gene].append(event) except Exception: splicing_events_db[gene] = [event] splicing_events = splicing_events_db import collections analyzed_junctions = [] processed_events = [] for line in open(filename, "rU").xreadlines(): line = cleanUpLine(line) t = string.split(line, "\t") if firstLine: if "PSI" in filename: sampleIndexBegin = 11 sample_headers = t[sampleIndexBegin:] else: sampleIndexBegin = 1 sample_headers = t[sampleIndexBegin:] if ".bed" not in sample_headers[0]: ### Add .bed if removed manually sample_headers = map(lambda s: s + ".bed", sample_headers) index = 0 sample_group_index = {} for s in sample_headers: group = sample_group_db[s] sample_group_index[index] = group try: sampleReadDepth[index] = count_sum_array_db[s] except Exception: sampleReadDepth[index] = count_sum_array_db[s] index += 1 firstLine = False else: if "PSI" in filename: splicing_event = val = t[2] + " " + t[3] j1 = t[2] j2 = t[3] if t[2] in 
analyzed_junctions and t[3] in analyzed_junctions: continue else: splicing_event = t[0] ### The gene ID j1 = t[0] j2 = t[0] if ":U" in splicing_event or "-U" in splicing_event: continue else: ### First check to see if the full splicing event matches the entry ### If not (and not a PSI regulation hits list), look for an individual junction match if splicing_event in splicing_events or ( expandedSearch and (j1 in spliced_junctions or j2 in spliced_junctions) ): if splicing_event in processed_events: continue if j2 in processed_events: continue if j1 in processed_events: continue processed_events.append(splicing_event) processed_events.append(j1) processed_events.append(j2) # print processed_events, splicing_event if "PSI" in filename: geneID = string.split(t[2], ":")[0] symbol = t[0] analyzed_junctions.append(t[2]) analyzed_junctions.append(t[3]) else: ### For exp.dataset-steady-state.txt files geneID = splicing_event events = splicing_events[geneID] index = 0 import collections initial_group_psi_values = {} try: group_psi_values = collections.OrderedDict() except Exception: try: import ordereddict group_psi_values = ordereddict.OrderedDict() except Exception: group_psi_values = {} for i in t[sampleIndexBegin:]: ### Value PSI range in the input file try: group = sample_group_index[index] except Exception: group = None try: try: initial_group_psi_values[group].append([float(i), index]) except Exception: initial_group_psi_values[group] = [[float(i), index]] except Exception: # print traceback.format_exc();sys.exit() pass ### Ignore the NULL values index += 1 ### limit the number of events reported and sort based on the PSI values in each group if "None" in groups and len(groups) == 1: initial_group_psi_values["None"].sort() group_size = len(initial_group_psi_values["None"]) / 2 filtered_group_index1 = map(lambda x: x[1], initial_group_psi_values["None"][:group_size]) filtered_group_index2 = map(lambda x: x[1], initial_group_psi_values["None"][group_size:]) 
group_psi_values["low"] = filtered_group_index1 group_psi_values["high"] = filtered_group_index2 else: gn = 0 for group in groups: gn += 1 # if gn>4: break if group in initial_group_psi_values: initial_group_psi_values[group].sort() if len(groups) > 7: filtered_group_indexes = map(lambda x: x[1], initial_group_psi_values[group][:1]) elif len(groups) > 5: filtered_group_indexes = map(lambda x: x[1], initial_group_psi_values[group][:2]) elif len(groups) > 3: filtered_group_indexes = map(lambda x: x[1], initial_group_psi_values[group][:4]) else: filtered_group_indexes = map(lambda x: x[1], initial_group_psi_values[group][:5]) group_psi_values[group] = filtered_group_indexes try: update_plot_settings(bamdir, group_psi_values, sample_headers) except Exception: print "Cannot update the settings file. Likely permissions issue." try: reordered = reorderEvents([t[2] + " " + t[3]]) reordered = string.split(reordered[0], " ") except Exception: reordered = [t[2] + " " + t[3]] reordered = string.split(reordered[0], " ") # print reordered if "PSI" in filename: try: formatted_splice_event = string.replace(reordered[1], ":", "__") except Exception: pass ### Submit the query try: ssp.plot_event(formatted_splice_event, index_dir, setting, outputdir) success = True except Exception: success = False # print traceback.format_exc() else: for event in events: try: ssp.plot_event(event, index_dir, setting, outputdir) # print 'success' #formatted_splice_event='ENSMUSG00000000355__E4.1-E5.1' except Exception: ### If it fails, output the gene-level plot try: ssp.plot_event(geneID, index_dir, setting, outputdir) success = True except Exception: success = False # print traceback.format_exc() """ ### Second attempt if 'PSI' in filename and success==False: ### Only relevant when parsing the junction pairs but not genes try: formatted_splice_event=string.replace(reordered[0],':','__') except Exception: pass try: ssp.plot_event(formatted_splice_event,index_dir,setting,outputdir); # print 'success' 
except Exception: pass """ return processed_events
def ProteinCentricIsoformView(Selected_Gene):
    """Collect the transcript, protein, domain, microRNA and splicing data for one gene.

    Reads the annotation files named by the module-level variables
    ``Transcript_Annotations_File``, ``Prt_Trans_File``, ``microRNA_File``,
    ``ExonRegion_File``, ``SplicingIndex_File``, ``Prt_Boundaries_File``,
    ``Prt_Regions_File`` and ``UniPrt_Regions_File`` (plus the probeset file
    when ``platform`` is not 'RNASeq'), keeps the rows belonging to
    ``Selected_Gene`` and lays the exon regions of every transcript out on a
    "virtual" axis in which consecutive exon blocks are separated by a fixed
    15-unit junction gap.

    Returns:
        tuple: ``(junction_transcript_db, Protein_virtualPos, Domain_virtualPos,
        Transcript_db, exon_virtualToRealPos, ETC_dict, microRNA_db,
        domainAnnotation_db)``.
    """
    ### --- Transcript annotations: one row per exon of a transcript ---
    Transcript_List = []
    Transcript_db = {}   # transcriptID -> list of EnsemblRegionClass exon objects
    Exon_db = {}         # exonID -> list of transcriptIDs containing it
    for line in open(Transcript_Annotations_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        if(line[0] == Selected_Gene):
            transcriptID = line[-1]
            exonID = line[5]
            start = line[3]
            stop = line[4]
            strand = line[2]
            chr = line[1]
            if 'chr' not in chr:
                chr = 'chr'+chr
            exon_data = EnsemblRegionClass(start,stop,exonID,None,strand)
            exon_data.setChr(chr)
            Transcript_List.append((transcriptID, exonID))
            try: Transcript_db[transcriptID].append(exon_data)
            except Exception: Transcript_db[transcriptID]=[exon_data]
            try: Exon_db[exonID].append(transcriptID)
            except Exception: Exon_db[exonID]=[transcriptID]

    ### --- Transcript <-> protein associations (first row is skipped as a header) ---
    Transcript_Protein_db = {}
    Protein_Transcript_db = {}
    Protein_List = []
    count = 0
    for line in open(Prt_Trans_File, "rU").xreadlines():
        if(count == 0):
            count = 1
            continue
        line = line.rstrip()
        line = line.split("\t")
        if(len(line) != 3):
            continue
        geneID = line[0]
        transcriptID = line[1]
        proteinID = line[2]
        if Selected_Gene == geneID:
            Transcript_Protein_db[transcriptID] = proteinID
            Protein_Transcript_db[proteinID] = transcriptID
            Protein_List.append(proteinID)

    #MicroRNA File
    microRNA_db = {}     # exon ID -> list of MicroRNAClass objects
    for line in open(microRNA_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        try:
            gene_and_exon_id = line[0].split(":")
            current_gene_id = gene_and_exon_id[0]
            current_exon_id = gene_and_exon_id[1]
        except Exception:
            continue     # first column is not of the form gene:exon
        current_description = line[1]
        current_base_pairs = line[2]
        algorithms = line[3]
        if(current_gene_id == Selected_Gene):
            m = MicroRNAClass(current_exon_id, current_description, current_base_pairs, algorithms)
            try:
                # Stop adding entries once an exon already holds more than six
                if(len(microRNA_db[current_exon_id]) > 6):
                    continue
                microRNA_db[current_exon_id].append(m)
            except:
                microRNA_db[current_exon_id] = [m]

    Transcript_ExonRegion_db={}
    geneExonRegion_db={}
    exon_coord_db={}     # (chr, position, 'start'|'stop') -> exon region ID
    exonRegion_db={}     # (gene, exon region ID) -> EnsemblRegionClass
    AllBlocks = [("E", []), ("I", [])]
    # Store the exon region positions and later link them to the Ensembl exons
    for line in open(ExonRegion_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        geneID = line[0]
        exon_region = line[1]
        chr = line[2]
        exonID = line[1]
        strand = line[3]
        start = line[4]
        stop = line[5]
        er = EnsemblRegionClass(start,stop,exonID,exon_region,strand)
        if(geneID == Selected_Gene):
            Block_Num = exon_region[1:]
            I_E_id = exon_region[0]
            if(I_E_id == "E"):
                AllBlocks[0][1].append(Block_Num)
            if(I_E_id == "I"):
                AllBlocks[1][1].append(Block_Num)
                continue     # regions starting with "I" are not indexed by coordinate
            exon_added = False
            #Exon_List = line[7].split("|")
            exon_coord_db[chr,int(start),'start'] = exon_region
            exon_coord_db[chr,int(stop),'stop'] = exon_region
            exonRegion_db[Selected_Gene,exon_region] = er

    ### --- Non-RNASeq platforms: translate probeset IDs into exon region IDs ---
    probeset_to_ExonID={}
    if platform != 'RNASeq':
        for line in open(unique.filepath('AltDatabase/'+species+'/'+string.lower(platform)+'/'+species+'_Ensembl_probesets.txt'), "rU").xreadlines():
            line = line.rstrip()
            line = line.split("\t")
            gene = line[2]
            if gene == Selected_Gene:
                probeset = line[0]
                exon_region = line[12]
                if '.' not in exon_region:
                    exon_region = string.replace(exon_region,'-','.')
                probeset_to_ExonID[probeset] = exon_region

    ### --- Splicing-index results: (feature ID, SplicingIndexClass) pairs for the gene ---
    ETC_List = []
    for line in open(SplicingIndex_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        if ':' in line[0]:
            GeneLine = line[0].split(":")
            FeatureID = GeneLine[1]
        else:
            FeatureID = line[0]
        Gene = line[1]
        regcall = line[2]
        spl_index = line[3]
        pval = line[4]
        midas = line[5]
        S_I_data = SplicingIndexClass(regcall, spl_index, pval, midas)
        if(Gene == Selected_Gene):
            if platform != 'RNASeq':
                if FeatureID in probeset_to_ExonID:
                    FeatureID = probeset_to_ExonID[FeatureID]
                    ETC_List.append((FeatureID, S_I_data))
            else:
                try:
                    # Keep only the part of the ID before the first underscore
                    FeatureID = FeatureID.split("_")
                    FeatureID = FeatureID[0]
                    ETC_List.append((FeatureID, S_I_data))
                except:
                    pass

    ETC_dict = {}

    # Link the exon regions to the Ensembl exons
    for transcriptID in Transcript_db:
        for exon_data in Transcript_db[transcriptID]:
            start = exon_data.Start()
            stop = exon_data.Stop()
            chr = exon_data.Chr()
            strand = exon_data.Strand()
            try:
                start_exon_region = exon_coord_db[chr,start,'start']
                stop_exon_region = exon_coord_db[chr,stop,'stop']
                proceed = True
            except Exception: ### Not clear why this error occurs. Erroring region was found to be an intron region start position (I7.2 ENSMUSG00000020385)
                proceed = False
            if proceed:
                if '-' in strand:
                    stop_exon_region,start_exon_region = start_exon_region,stop_exon_region
                # Enumerate every region from the start region to the stop region;
                # region IDs are split as <block>.<number>
                regions = [start_exon_region]
                block,start_region = start_exon_region.split('.')
                start_region = int(float(start_region))
                block,stop_region = stop_exon_region.split('.')
                stop_region = int(float(stop_region))
                region = start_region+1
                while region<stop_region:
                    er = block+'.'+str(region)
                    regions.append(er)
                    region+=1
                if stop_region != start_region:
                    regions.append(stop_exon_region)
                for region in regions:
                    er = exonRegion_db[Selected_Gene,region]
                    try: Transcript_ExonRegion_db[transcriptID].append(er)
                    except: Transcript_ExonRegion_db[transcriptID] = [er]

    ### --- Lay the exon regions of each transcript out on the virtual axis ---
    # NOTE(review): `c` is presumably an alias of the collections module - confirm against the imports
    exon_virtualToRealPos= c.OrderedDict()
    junction_transcript_db = {}
    for transcriptID in Transcript_ExonRegion_db:
        position=0
        Buffer=15     # width reserved for each junction between two exon blocks
        for exon_object in Transcript_ExonRegion_db[transcriptID]:
            if position!=0:
                if last_exon != exon_object.ExonBlock():
                    # Exon block changed: record a junction and leave a gap
                    junctionID = last_exon_region+'-'+exon_object.EnsemblRegion()
                    try: junction_transcript_db[transcriptID].append((position,position+Buffer, junctionID)) ### virtual junction positions
                    except: junction_transcript_db[transcriptID] = [(position,position+Buffer, junctionID)]
                    position+=Buffer

            virtualStart = position
            virtualStop = virtualStart + exon_object.Length()
            position = virtualStop
            try: exon_virtualToRealPos[transcriptID].append(([virtualStart,virtualStop],[exon_object.Start(), exon_object.Stop()],exon_object))
            except Exception: exon_virtualToRealPos[transcriptID]=[([virtualStart,virtualStop],[exon_object.Start(), exon_object.Stop()],exon_object)]
            last_exon = exon_object.ExonBlock()
            last_exon_region = exon_object.EnsemblRegion()

    ### --- Index the splicing results by region or junction ID ---
    for i in ETC_List:
        Region = i[0]
        S_I = i[1]
        Region = Region.split("-")
        if(len(Region) > 1):
            #Delete underscores from values.
            R_Start = Region[0]
            R_End = Region[1]
            R_Start = R_Start.split("_")
            R_End = R_End.split("_")
            R_Start = R_Start[0]
            R_End = R_End[0]
            R_Final = R_Start + "-" + R_End
            R_Type = R_Final[0]
            ETC_dict[R_Final] = S_I
        else:
            Region = Region[0]
            Region = Region.split("_")
            Region = Region[0]
            Region_type = Region[0]
            ETC_dict[Region] = S_I

    ### --- Protein boundaries: first start and last stop over consecutive rows of a protein ---
    Protein_Pos_Db = {}
    last_protein=None
    stored_stop=None
    for line in open(Prt_Boundaries_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        proteinID = line[0]
        if(proteinID in Protein_List):
            Stop = int(line[-1])
            Start = int(line[-2])
            if(proteinID != last_protein):
                if stored_stop !=None:
                    Protein_Pos_Db[last_protein] = [[stored_start,stored_stop,None]]
                stored_start = int(Start)
            if(proteinID == last_protein):
                stored_stop = int(Stop)
            last_protein = str(proteinID)

    # NOTE(review): stored_start is only assigned inside the loop above, so this line
    # raises NameError when no row matched a protein of the gene - confirm that is acceptable
    Protein_Pos_Db[last_protein] = [(stored_start,stored_stop,None)]
    Protein_virtualPos = RealToVirtual(Protein_Pos_Db, exon_virtualToRealPos, Protein_Transcript_db,Transcript_ExonRegion_db)

    ### --- Protein domains / regions from the two annotation sources ---
    Domain_Pos_Db={}
    domainAnnotation_db={}
    for line in open(Prt_Regions_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        proteinID = line[0]
        if proteinID in Protein_Pos_Db:
            domain_start = int(float(line[3]))
            domain_stop = int(float(line[4]))
            domainID = line[-2]
            domainName = line[-1]
            try:
                Domain_Pos_Db[proteinID].append((domain_start,domain_stop,domainID))
            except:
                Domain_Pos_Db[proteinID] = [(domain_start,domain_stop,domainID)]
            domainAnnotation_db[domainID] = domainName

    for line in open(UniPrt_Regions_File, "rU").xreadlines():
        line = line.rstrip()
        line = line.split("\t")
        proteinID = line[0]
        if proteinID in Protein_Pos_Db:
            domain_start = int(float(line[3]))
            domain_stop = int(float(line[4]))
            # In this file the last column serves as both the ID and the name
            domainID = line[-1]
            domainName = line[-1]
            try:
                Domain_Pos_Db[proteinID].append((domain_start,domain_stop,domainID))
            except:
                Domain_Pos_Db[proteinID] = [(domain_start,domain_stop,domainID)]
            domainAnnotation_db[domainID] = domainName

    # Do the same for domain coordinates
    Domain_virtualPos = RealToVirtual(Domain_Pos_Db, exon_virtualToRealPos, Protein_Transcript_db,Transcript_ExonRegion_db)

    return_val = ((junction_transcript_db, Protein_virtualPos, Domain_virtualPos, Transcript_db, exon_virtualToRealPos, ETC_dict, microRNA_db, domainAnnotation_db))
    return return_val
def performGroupNormalization(filename,export_dir,platform): expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename) groups_dir = string.replace(export_dir,'exp.','batch.') fn=unique.filepath(filename); row_number=0; exp_db={}; relative_headers_exported = False group_db = importGroups(groups_dir) export_data = export.ExportFile(export_dir) for line in open(fn,'rU').xreadlines(): data = ExpressionBuilder.cleanUpLine(line) t = string.split(data,'\t') if data[0]=='#' and row_number==0: row_number = 0 elif row_number==0: sample_list = t[1:] new_sample_list = [] for group in group_db: group_samples = group_db[group] try: sample_index_list = map(lambda x: sample_list.index(x), group_samples) group_db[group] = sample_index_list new_sample_list+=group_samples except Exception: missing=[] for x in sample_list: if x not in t[1:]: missing.append(x) print 'missing:',missing print t print sample_list print filename, groups_dir print 'Unknown Error!!! 
Skipping cluster input file build (check column and row formats for conflicts)'; forceExit title = string.join([t[0]]+new_sample_list,'\t')+'\n' ### output the new sample order (group file order) export_data.write(title) row_number=1 else: gene = t[0] if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'): ### Convert to log2 RPKM values - or counts try: all_values = map(lambda x: math.log(float(x)+increment,2), t[1:]) except Exception: all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment) else: try: all_values = map(float,t[1:]) except Exception: all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment) row_number+=1 ### Keep track of the first gene as to write out column headers for the relative outputs gene_log_folds = [] for group in group_db: sample_index_list = group_db[group] ### Calculate log-fold values relative to the mean of all sample expression values try: values = map(lambda x: all_values[x], sample_index_list) ### simple and fast way to reorganize the samples except Exception: print len(values), sample_index_list;kill try: avg = statistics.avg(values) except Exception: values2=[] for v in values: try: values2.append(float(v)) except Exception: pass values = values2 try: avg = statistics.avg(values) except Exception: if len(values)>0: avg = values[0] else: avg = 0 try: log_folds = map(lambda x: (x-avg), values) except Exception: log_folds=[] for x in values: try: log_folds.append(x-avg) except Exception: log_folds.append('') gene_log_folds+=log_folds gene_log_folds = map(lambda x: str(x),gene_log_folds) export_data.write(string.join([gene]+gene_log_folds,'\t')+'\n') export_data.close()
def remoteSashimiPlot(Species, fl, bamdir, eventsToVisualizeFilename, events=None, show=False): global PSIFilename global outputdir global root_dir global steady_state_exp_file global species species = Species try: countinp = fl.CountsFile() root_dir = fl.RootDir() except Exception: root_dir = fl search_dir = root_dir + '/ExpressionInput' files = unique.read_directory(search_dir) for file in files: if 'counts.' in file and 'steady-state.txt' not in file: countinp = search_dir + '/' + file PSIFilename = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt' import ExpressionBuilder dir_list = unique.read_directory(root_dir + '/ExpressionInput') for file in dir_list: if 'exp.' in file and 'steady-state' not in file: exp_file = root_dir + '/ExpressionInput/' + file elif 'exp.' in file and 'steady-state' in file: steady_state_exp_file = root_dir + '/ExpressionInput/' + file global sample_group_db sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file) #outputdir=findParentDir(PSIFilename)+"sashimiplots" outputdir = root_dir + '/ExonPlots' outputdir = root_dir + '/SashimiPlots' try: os.mkdir(unique.filepath(outputdir)) except Exception: pass if show: s = open(outputdir + '/show.txt', 'w') s.write('TRUE') s.close() else: s = open(outputdir + '/show.txt', 'w') s.write('FALSE') s.close() geneSymbol_db = Sashimiplottting(bamdir, countinp, PSIFilename, eventsToVisualizeFilename, events=events) for filename in os.listdir(outputdir): if '.pdf' in filename or '.png' in filename: fn = string.replace(filename, '.pdf', '') fn = string.replace(fn, '.png', '') newname = string.split(fn, '__') if newname[0] in geneSymbol_db: new_filename = str(filename) if '__' in filename: new_filename = string.split(filename, '__')[1] elif '\\' in filename: new_filename = string.split(filename, '\\')[1] elif '/' in filename: new_filename = string.split(filename, '/')[1] nnname = geneSymbol_db[ newname[0]][0] + '-SashimiPlot_' + new_filename try: 
os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: if 'already exists' in traceback.format_exc(): ### File already exists, delete the new one try: os.remove(os.path.join(outputdir, nnname)) except Exception: pass ### Now right the new one try: os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: pass pass else: continue print ''
"iconfile": "Viewer.icns"} } setup(name=_appName, app=[_script], version=_appVersion, description=_appDescription, author=_authorName, author_email=_authorEmail, url=_authorURL, options=options, #data_files=data_files, setup_requires=["py2app"] ) import unique, shutil root_path = unique.filepath('') software_path = root_path+'/dist/AltAnalyzeViewer.app/Contents/Frameworks/Tcl.framework' shutil.rmtree(software_path) software_path = root_path+'/dist/AltAnalyzeViewer.app/Contents/Frameworks/Tk.framework' shutil.rmtree(software_path) software_path = root_path+'/dist/AltAnalyzeViewer.app/Contents/Resources/mpl-data/sample_data' shutil.rmtree(software_path) software_path = root_path+'/dist/AltAnalyzeViewer.app/Contents/Resources/lib/python2.7/matplotlib/tests' shutil.rmtree(software_path) if sys.platform.startswith("win"): ### example command: python setup.py py2exe from distutils.core import setup import py2exe import suds import numpy
def _bracketed_list(items):
    """Return ``items`` joined by commas and wrapped in square brackets."""
    return '[' + ','.join(items) + ']'


def update_plot_settings(bamdir,list1,list2,samp):
    """Rewrite Config/sashimi_plot_settings.txt for one plot.

    The samples indexed by ``list1`` are listed first and coloured red, those
    indexed by ``list2`` follow and are coloured blue. The coverage of each
    sample is read from the module-level ``sample_read`` dictionary.

    All values are computed before the file is opened, so a missing sample
    raises without truncating the existing settings file, and the bracketed
    lists are always closed - including when both index lists are empty
    (previously the closing ']' was omitted in that case).

    Args:
        bamdir: value written as ``bam_prefix``.
        list1: indexes into ``samp`` for the first (red) set of samples.
        list2: indexes into ``samp`` for the second (blue) set of samples.
        samp: sample names; '.bed' is replaced by '.bam' to name the BAM files.
    """
    ordered_samples = [samp[i] for i in list1] + [samp[j] for j in list2]
    bam_files = ['"'+name.replace('.bed','.bam')+'"' for name in ordered_samples]
    colors = ['"'+'red'+'"']*len(list1) + ['"'+'blue'+'"']*len(list2)
    coverages = [str(int(sample_read[name])) for name in ordered_samples]

    settings = [
        '[data]', '\n',
        'bam_prefix = '+bamdir+'\n',
        'bam_files ='+_bracketed_list(bam_files), '\n',
        '[plotting]', '\n',
        'fig_width = 7 \nfig_height = 7 \nintron_scale = 30 \nexon_scale = 4 \nlogged = False\n',
        'font_size = 6 \nbar_posteriors = False \nnyticks = 4 \nnxticks = 4 \n',
        'show_ylabel = False \nshow_xlabel = True \nshow_posteriors = False \nnumber_junctions = True \n',
        'resolution = .5 \nposterior_bins = 40 \ngene_posterior_ratio = 5 \n',
        'colors ='+_bracketed_list(colors), '\n',
        'coverages ='+_bracketed_list(coverages), '\n',
        'bar_color = "b" \nbf_thresholds = [0, 1, 2, 5, 10, 20]',
    ]
    ### 'with' guarantees the settings file is closed even if a write fails
    with open(unique.filepath('Config/sashimi_plot_settings.txt'),'w') as export_pl:
        export_pl.write(''.join(settings))