def annotateJunctionIDsAsExon(species,array_type):
    """Link junction/RNASeq probeset IDs to exon-array probeset IDs and save the table.

    Two annotation databases are loaded with
    ExonSeqModule.importSplicingAnnotationDatabase:

    * the platform file
      AltDatabase/<species>/<array_type>/<species>_Ensembl_junction_probesets-filtered.txt
      (for array_type 'RNASeq' the suffix 'junction_probesets-filtered' is
      swapped for 'exons'), and
    * the exon-array file AltDatabase/<species>/exon/<species>_Ensembl_probesets.txt.

    Every platform probeset whose (gene ID, exon region ID) pair is found among
    the exon-array annotations is written, next to the matching exon-array
    probeset, to
    AltDatabase/<species>/<array_type>/<species>_<array_type>-exon_probesets.txt
    as a two-column, tab-delimited file with a header row.

    Args:
        species: species code used to build the AltDatabase paths.
        array_type: platform name; used in the paths, in the output header and
            passed through to the annotation importer.

    Returns:
        None. The result is the file written to disk.
    """
    import ExonSeqModule

    junction_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_junction_probesets-filtered.txt'
    if array_type == 'RNASeq':
        junction_annotations_file = junction_annotations_file.replace('junction_probesets-filtered','exons')
    junction_exon_db = ExonSeqModule.importSplicingAnnotationDatabase(junction_annotations_file,array_type)

    exon_annotations_file = 'AltDatabase/'+species+'/exon/'+species+'_Ensembl_probesets.txt'
    exon_db = ExonSeqModule.importSplicingAnnotationDatabase(exon_annotations_file,array_type)

    ### Extract unique exon regions from Exon Array annotations
    multiple_exon_regions = {}
    unique_exon_regions = {}
    for exon_probeset_id in exon_db:
        exon_entry = exon_db[exon_probeset_id]
        geneid = exon_entry.GeneID()
        region_id = exon_entry.ExonRegionID()
        if '|' in region_id:
            # One probeset annotated to several regions: index it under each of them.
            for exonid in region_id.split('|'):
                multiple_exon_regions[geneid,exonid] = exon_entry
        else:
            unique_exon_regions[geneid,region_id] = exon_entry

    ### Add missing exons to unique
    # Single-region annotations take priority; multi-region ones only fill gaps.
    for uid in multiple_exon_regions:
        if uid not in unique_exon_regions:
            unique_exon_regions[uid] = multiple_exon_regions[uid]

    ### Extract unique exon regions from Junction Array annotation
    junction_to_exonids = {}
    for probeset in junction_exon_db:
        junction_entry = junction_exon_db[probeset]
        geneid = junction_entry.GeneID()
        region_id = junction_entry.ExonRegionID()
        if '|' in region_id:
            # When several listed regions match, the last match in the list is kept.
            for exonid in region_id.split('|'):
                if (geneid,exonid) in unique_exon_regions:
                    junction_to_exonids[probeset] = unique_exon_regions[geneid,exonid].Probeset()
        else:
            # Single-region IDs are looked up with '.' replaced by '-'.
            region_key = (geneid,region_id.replace('.','-'))
            if region_key in unique_exon_regions:
                junction_to_exonids[probeset] = unique_exon_regions[region_key].Probeset()

    output_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'-exon_probesets.txt'
    # The context manager closes the file even if a write fails part-way through.
    with open(filepath(output_file),'w') as data:
        data.write(array_type+'_probeset\texon_probeset\n')
        for probeset in junction_to_exonids:
            data.write(probeset+'\t'+junction_to_exonids[probeset]+'\n')
def getParametersAndExecute(probeset_seq_file,array_type,species,data_type):
    """Import probeset sequences for the requested data type and print the elapsed time.

    NOTE(review): a second definition with this same name appears later in
    the file; the later one is the binding left in place after import.

    Args:
        probeset_seq_file: sequence file handed to importProbesetSeqeunces;
            only used when data_type is 'critical-exons'.
        array_type: platform name; selects the annotation file and is passed
            to the importers.
        species: species code used to build the AltDatabase path.
        data_type: either 'critical-exons' or 'junctions'.

    Returns:
        The sequence database produced by the importer for data_type.

    Raises:
        ValueError: if data_type is not one of the two supported values.
    """
    if data_type not in ('critical-exons','junctions'):
        # Neither branch would run, so the timing and result names were never
        # bound and the final print failed with an unhelpful UnboundLocalError.
        raise ValueError("Unsupported data_type %r; expected 'critical-exons' or 'junctions'" % (data_type,))

    if data_type == 'critical-exons':
        annotation_dir = 'AltDatabase/'+species+'/'+array_type+'/'
        if array_type == 'RNASeq':
            probeset_annotations_file = annotation_dir+species+'_Ensembl_exons.txt'
        else:
            probeset_annotations_file = annotation_dir+species+'_Ensembl_'+array_type+'_probesets.txt'
        ###Import probe-level associations
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
        # The timer starts after the annotation import, so only the sequence import is timed.
        start_time = time.time()
        ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
        probeset_seq_db = importProbesetSeqeunces(probeset_seq_file,exon_db,species)
    else:
        start_time = time.time()
        biotype = 'gene' ### Indicates whether to store information at the level of genes or probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype)

    time_diff = int(time.time()-start_time)
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db
def getParametersAndExecute(probeset_seq_file, array_type, species, data_type):
    """Load the probeset sequence database for data_type and print how long it took.

    NOTE(review): an earlier definition with this same name precedes this one
    in the file; being the later definition, this is the one that stays bound.

    Args:
        probeset_seq_file: sequence file handed to importProbesetSeqeunces;
            only used when data_type is 'critical-exons'.
        array_type: platform name; selects the annotation file and is passed
            to the importers.
        species: species code used to build the AltDatabase path.
        data_type: either 'critical-exons' or 'junctions'.

    Returns:
        The sequence database produced by the importer for data_type.

    Raises:
        ValueError: if data_type is not one of the two supported values.
    """
    supported_data_types = ('critical-exons', 'junctions')
    if data_type not in supported_data_types:
        # With no matching branch the result and timing names stayed unbound,
        # and the final print failed with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unsupported data_type %r; expected 'critical-exons' or 'junctions'"
            % (data_type,))

    if data_type == 'critical-exons':
        annotation_prefix = ('AltDatabase/' + species + '/' + array_type + '/'
                             + species + '_Ensembl_')
        if array_type == 'RNASeq':
            probeset_annotations_file = annotation_prefix + 'exons.txt'
        else:
            probeset_annotations_file = (annotation_prefix + array_type
                                         + '_probesets.txt')
        ###Import probe-level associations
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
            probeset_annotations_file, array_type)
        # Timing deliberately begins after the annotation import, as before.
        start_time = time.time()
        ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
        probeset_seq_db = importProbesetSeqeunces(
            probeset_seq_file, exon_db, species)
    else:
        start_time = time.time()
        biotype = 'gene'  ### Indicates whether to store information at the level of genes or probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(
            species, array_type, biotype)

    time_diff = int(time.time() - start_time)
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db