def importCriticalExonLocations(species,array_type,ensembl_exon_db,force): ###ensembl_exon_db[(geneid,chr,strand)] = [[E5,exon_info]] #exon_info = (exon_start,exon_stop,exon_id,exon_annot) ###ensembl_probeset_db[geneid,chr,strand].append(probeset_data) #probeset_data = [start,stop,probeset_id,exon_class,transcript_cluster_id] gene_info_db = {} for (ens_geneid,chr,strand) in ensembl_exon_db: gene_info_db[ens_geneid] = chr,strand filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical_exon_locations.txt' array_ensembl={} ###Get the most recent gene-symbol annotations (applicable with a new Ensembl build for the same genomic build) ensembl_symbol_db = getEnsemblAnnotations(species) primary_gene_annotation_file = 'AltDatabase/'+species +'/'+ array_type +'/'+ array_type+ '_gene_annotations.txt' update.verifyFile(primary_gene_annotation_file,array_type) array_gene_annotations = JunctionArray.importGeneric(primary_gene_annotation_file) for array_geneid in array_gene_annotations: t = array_gene_annotations[array_geneid]; description=t[0];entrez=t[1];symbol=t[2] if symbol in ensembl_symbol_db and len(symbol)>0 and len(array_geneid)>0: ens_geneid = ensembl_symbol_db[symbol] if len(ens_geneid)>0: array_ensembl[array_geneid]= ens_geneid update.verifyFile(filename,array_type) ensembl_probeset_db = importJunctionLocationData(filename,array_ensembl,gene_info_db,test) print len(ensembl_probeset_db), "Genes inlcuded in",array_type,"location database" return ensembl_probeset_db
def importCriticalJunctionSeq(filename, species, array_type): update.verifyFile(filename, array_type) ### Will force download if missing fn = filepath(filename) probeset_seq_db = {} x = 0 for line in open(fn, 'rU').xreadlines(): data, newline = string.split(line, '\n') t = string.split(data, '\t') if x == 0: x = 1 else: try: probeset, probeset_seq, junction_seq = t except Exception: try: probeset, probeset_seq, junction_seq, null = t except Exception: print filename, t kill if array_type == 'RNASeq': ### Ensure the junction sequence is sufficient for searching left, right = string.split(probeset_seq, '|') if len(left) > 2 and len(right) > 2: null = [] else: probeset_seq = '' if len(probeset_seq) < 8: probeset_seq = '' probeset_seq = string.replace(probeset_seq, '|', '') probeset_seq_db[probeset] = probeset_seq, junction_seq x += 1 print len(probeset_seq_db), 'probesets with associated sequence' return probeset_seq_db
def importCriticalJunctionSeq(filename, species, array_type): update.verifyFile(filename, array_type) ### Will force download if missing fn = filepath(filename) probeset_seq_db = {} x = 0 for line in open(fn, "rU").xreadlines(): data, newline = string.split(line, "\n") t = string.split(data, "\t") if x == 0: x = 1 else: try: probeset, probeset_seq, junction_seq = t except Exception: try: probeset, probeset_seq, junction_seq, null = t except Exception: print filename, t kill if array_type == "RNASeq": ### Ensure the junction sequence is sufficient for searching left, right = string.split(probeset_seq, "|") if len(left) > 2 and len(right) > 2: null = [] else: probeset_seq = "" if len(probeset_seq) < 8: probeset_seq = "" probeset_seq = string.replace(probeset_seq, "|", "") probeset_seq_db[probeset] = probeset_seq, junction_seq x += 1 print len(probeset_seq_db), "probesets with associated sequence" return probeset_seq_db
def importCriticalExonLocations(species, array_type, ensembl_exon_db, force): ###ensembl_exon_db[(geneid,chr,strand)] = [[E5,exon_info]] #exon_info = (exon_start,exon_stop,exon_id,exon_annot) ###ensembl_probeset_db[geneid,chr,strand].append(probeset_data) #probeset_data = [start,stop,probeset_id,exon_class,transcript_cluster_id] gene_info_db = {} for (ens_geneid, chr, strand) in ensembl_exon_db: gene_info_db[ens_geneid] = chr, strand filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_critical_exon_locations.txt' array_ensembl = {} ###Get the most recent gene-symbol annotations (applicable with a new Ensembl build for the same genomic build) ensembl_symbol_db = getEnsemblAnnotations(species) primary_gene_annotation_file = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_gene_annotations.txt' update.verifyFile(primary_gene_annotation_file, array_type) array_gene_annotations = JunctionArray.importGeneric( primary_gene_annotation_file) for array_geneid in array_gene_annotations: t = array_gene_annotations[array_geneid] description = t[0] entrez = t[1] symbol = t[2] if symbol in ensembl_symbol_db and len(symbol) > 0 and len( array_geneid) > 0: ens_geneid = ensembl_symbol_db[symbol] if len(ens_geneid) > 0: array_ensembl[array_geneid] = ens_geneid update.verifyFile(filename, array_type) ensembl_probeset_db = importJunctionLocationData(filename, array_ensembl, gene_info_db, test) print len(ensembl_probeset_db ), "Genes inlcuded in", array_type, "location database" return ensembl_probeset_db
def importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype): array_ens_db={} if array_type == 'AltMouse': filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'-Ensembl_relationships.txt' update.verifyFile(filename,array_type) ### Will force download if missing fn=filepath(filename); x = 0 for line in open(fn,'r').xreadlines(): data, newline = string.split(line,'\n'); t = string.split(data,'\t') if x==0: x=1 else: array_gene,ens_gene = t try: array_ens_db[array_gene].append(ens_gene) except KeyError: array_ens_db[array_gene]=[ens_gene] filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical-junction-seq.txt' fn=filepath(filename); probeset_seq_db={}; x = 0 for line in open(fn,'r').xreadlines(): data, newline = string.split(line,'\n'); t = string.split(data,'\t') if x==0: x=1 else: probeset,probeset_seq,junction_seq = t; junction_seq=string.replace(junction_seq,'|','') probeset_seq_db[probeset] = probeset_seq,junction_seq ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data ###This short-cuts what we did in two function in ExonSeqModule with exon level data filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_junction-comparisons.txt' fn=filepath(filename); probeset_gene_seq_db={}; x = 0 for line in open(fn,'r').xreadlines(): data, newline = string.split(line,'\n'); t = string.split(data,'\t') if x==0: x=1 else: array_gene,probeset1,probeset2,critical_exons = t #; critical_exons = string.split(critical_exons,'|') probesets = [probeset1,probeset2] if array_type == 'junction' or array_type == 'RNASeq': array_ens_db[array_gene]=[array_gene] if array_gene in array_ens_db: ensembl_gene_ids = array_ens_db[array_gene] for probeset_id in probesets: if probeset_id in probeset_seq_db: probeset_seq,junction_seq = probeset_seq_db[probeset_id] if biotype == 'gene': for ensembl_gene_id in ensembl_gene_ids: probe_data = 
ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_id,array_gene,probesets,critical_exons) probe_data.SetExonSeq(probeset_seq) probe_data.SetJunctionSeq(junction_seq) try: probeset_gene_seq_db[ensembl_gene_id].append(probe_data) except KeyError: probeset_gene_seq_db[ensembl_gene_id] = [probe_data] else: ### Used for probeset annotations downstream of sequence alignment in LinkEST, analagous to exon_db for exon analyses probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_ids,array_gene,probesets,critical_exons) probe_data.SetExonSeq(probeset_seq) probe_data.SetJunctionSeq(junction_seq) probeset_gene_seq_db[probeset_id] = probe_data print len(probeset_gene_seq_db),"genes with probeset sequence associated" return probeset_gene_seq_db
def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_url,force):
    """Import UniProt annotations for one species and export the results.

    species              -- species code used in the 'AltDatabase/uniprot/' paths
    species_full         -- stored in the global species_name
    uniprot_filename_url -- URL whose last path component (minus '.gz') names the local UniProt file
    trembl_filename_url  -- same, for the TrEMBL file (only used when add_trembl_annotations is 'yes')
    force                -- 'no' imports the local UniProt file as-is; any other value
                            triggers a download unless update.verifyFile() reports more than 8

    Resets the module-level dictionaries uniprot_ensembl_db, uniprot_db and
    secondary_to_primary_db, sets species_name and uniprot_fildir, and
    finishes by calling export() and exportEnsemblUniprot(). Returns None.
    """
    # Module-level state is re-initialised on every call
    global uniprot_ensembl_db;uniprot_ensembl_db={}
    global uniprot_db;uniprot_db={}; global species_name; global uniprot_fildir
    global secondary_to_primary_db; secondary_to_primary_db={}
    import update; reload(update)

    species_name = species_full

    # NOTE(review): species_full is recomputed here but not read again in this function
    import UI; species_names = UI.getSpeciesInfo()
    species_full = species_names[species]
    species_full = string.replace(species_full,' ','_')

    # Local file names are the last URL component with '.gz' removed
    uniprot_file = string.split(uniprot_filename_url,'/')[-1]; uniprot_file = string.replace(uniprot_file,'.gz','')
    trembl_file = string.split(trembl_filename_url,'/')[-1]; trembl_file = string.replace(trembl_file,'.gz','')
    uniprot_fildir = 'AltDatabase/uniprot/'+species+'/'
    uniprot_download_fildir = 'AltDatabase/uniprot/'
    uniprot_ens_file = species+'_Ensembl-UniProt.txt'; uniprot_ens_location = uniprot_fildir+uniprot_ens_file
    uniprot_location = uniprot_download_fildir+uniprot_file
    trembl_location = uniprot_download_fildir+trembl_file

    add_trembl_annotations = 'no' ### Currently we don't need these annotations
    try: importEnsemblUniprot(uniprot_ens_location)
    except IOError:
        try:
            ### Download the data from the AltAnalyze website (if there)
            update.downloadCurrentVersion(uniprot_ens_location,species,'txt')
            importEnsemblUniprot(uniprot_ens_location)
        except Exception: null=[]  # best effort: any failure of the download/import is ignored
    try:
        # Also try the same path with 'UniProt'/'uniprot' replaced by 'Uniprot-SWISSPROT'
        uniprot_ens_location_built = string.replace(uniprot_ens_location,'UniProt','Uniprot-SWISSPROT')
        uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
        importEnsemblUniprot(uniprot_ens_location_built)
    except Exception: null=[]  # best effort: failures are ignored

    ### Import UniProt annotations
    counts = update.verifyFile(uniprot_location,'counts')
    if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
    else:
        ### Directly download the data from UniProt
        gz_filepath, status = update.download(uniprot_filename_url,uniprot_download_fildir,'')
        if status == 'not-removed':
            try: os.remove(gz_filepath) ### Not sure why this works now and not before
            except OSError: status = status  # no-op: a failed removal is ignored
        import_uniprot_db(uniprot_location)

    if add_trembl_annotations == 'yes':
        ### Import TreMBL annotations
        try:
            # NOTE(review): this alters uniprot_location, but the call below reads
            # trembl_location, so no IOError is forced by it - confirm intent
            if force == 'yes': uniprot_location += '!!!!!' ### Force an IOError
            import_uniprot_db(trembl_location)
        except IOError:
            ### Directly download the data from UniProt
            update.download(trembl_filename_url,uniprot_download_fildir,'')
            import_uniprot_db(trembl_location)
    export()
    exportEnsemblUniprot(uniprot_ens_location)
def grab_exon_level_feature_calls(species,array_type,genes_analyzed): arrayid_uniprot_file = 'AltDatabase/uniprot/'+species+'/'+'arrayid-uniprot.txt' arrayid_ensembl_file = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'-Ensembl_relationships.txt' ensembl_ft_file = 'AltDatabase/ensembl/'+species+'/'+'DomainFile_All.txt' null,null,null,protein_coordinate_file = getEnsemblRelationshipDirs(species) global uniprot_arrayid_db; uniprot_arrayid_db = {}; global arrayid_uniprot_db; arrayid_uniprot_db = {} global ensembl_arrayid_db; ensembl_arrayid_db={} if array_type == 'AltMouse': update.verifyFile(arrayid_uniprot_file,array_type) ### Will force download if missing update.verifyFile(arrayid_ensembl_file,array_type) ### Will force download if missing import_arrayid_uniprot(arrayid_uniprot_file) import_arrayid_ensembl(arrayid_ensembl_file) ###Otherwise, these databases can be built on-the-fly in downstream methods, since Ensembl will be used as the array gene id else: ensembl_arrayid_db = genes_analyzed ###ensembl to ensembl for those being analyzed in the program ensembl_protein_seq_db,ensembl_ft_db,domain_gene_counts = import_ensembl_ft_data(species,ensembl_ft_file,ensembl_arrayid_db,array_type) ###Import function domain annotations for Ensembl proteins print 'Ensembl based domain feature genes:',len(ensembl_ft_db),len(domain_gene_counts) uniprot_protein_seq_db,uniprot_ft_db,domain_gene_counts = import_uniprot_ft_data(species,protein_coordinate_file,domain_gene_counts,ensembl_arrayid_db,array_type) ###" " " " UniProt " print 'UniProt based domain feature genes:',len(uniprot_ft_db),len(domain_gene_counts) arrayid_ft_db = combineDatabases(uniprot_ft_db,ensembl_ft_db) ###arrayid relating to classes of functional domain attributes and associated proteins (ensembl and uniprot) return arrayid_ft_db,domain_gene_counts
def importCriticalJunctionSeq(filename,species,array_type): update.verifyFile(filename,array_type) ### Will force download if missing fn=filepath(filename); probeset_seq_db={}; x = 0 for line in open(fn,'rU').xreadlines(): data, newline = string.split(line,'\n'); t = string.split(data,'\t') if x==0: x=1 else: try: probeset,probeset_seq,junction_seq = t except Exception: try: probeset,probeset_seq,junction_seq, null = t except Exception: print filename,t;kill if array_type == 'RNASeq': ### Ensure the junction sequence is sufficient for searching left,right = string.split(probeset_seq,'|') if len(left)>2 and len(right)>2: null=[] else: probeset_seq = '' if len(probeset_seq) < 8: probeset_seq = '' probeset_seq=string.replace(probeset_seq,'|','') probeset_seq_db[probeset] = probeset_seq,junction_seq x+=1 print len(probeset_seq_db),'probesets with associated sequence' return probeset_seq_db
def exportAltMouseExonSequence():
    """Export critical exon and critical junction sequences for the AltMouse array.

    Reads, from 'AltDatabase/Mm/AltMouse/': the junction comparisons, the
    MASTER probeset-transcript annotations, the SEQUENCE transcript database
    (via import_existing_sequence_build) and the reversed probeset sequences.
    Writes 'AltMouse_critical-exon-seq.txt' and
    'AltMouse_critical-junction-seq.txt' in the same directory, then prints
    two counters (probe sequences located / not located in a junction sequence).
    Takes no arguments and returns None.
    """
    probeset_exon_db = {}
    x = 0
    species = 'Mm'
    array_type = 'AltMouse'

    # --- Critical exons and the probeset pairs that measure them ---
    critical_exon_import_file = 'AltDatabase/Mm/AltMouse/AltMouse_junction-comparisons.txt'
    update.verifyFile(critical_exon_import_file, array_type)
    critical_exon_db = {}
    critical_probesets = {}
    fn = filepath(critical_exon_import_file)
    # NOTE(review): no header line is skipped for this file - confirm it has none
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        gene, probeset1, probeset2, critical_exons = string.split(data, '\t')
        critical_exons = string.split(critical_exons, '|')
        for exon in critical_exons:
            # (gene, exon) -> list of 'probeset1-probeset2' comparison labels
            try:
                critical_exon_db[gene, exon].append(probeset1 + '-' + probeset2)
            except KeyError:
                critical_exon_db[gene, exon] = [probeset1 + '-' + probeset2]
        critical_probesets[probeset1] = []
        critical_probesets[probeset2] = []

    # --- Exon composition of each critical probeset ---
    probeset_annotations_file = "AltDatabase/Mm/AltMouse/MASTER-probeset-transcript.txt"
    update.verifyFile(probeset_annotations_file, array_type)
    fn = filepath(probeset_annotations_file)
    for line in open(fn, 'rU').xreadlines():
        probeset_data = cleanUpLine(line)  #remove endline
        if x == 0:
            x = 1  # first line is skipped as a header
        else:
            probeset, affygene, exons, transcript_num, transcripts, probe_type_call, ensembl, block_exon_ids, block_structure, comparison_info = string.split(probeset_data, '\t')
            if probeset in critical_probesets:
                # The last character of both fields is dropped before use
                exons = exons[:-1]
                exons = string.split(exons, '-')
                affygene = affygene[:-1]
                if '|' in exons:
                    print exons
                    # NOTE(review): no 'kill' is assigned in this function;
                    # presumably a deliberate NameError to abort - confirm
                    kill
                probeset_exon_db[probeset, affygene] = exons

    exon_protein_sequence_file = "AltDatabase/Mm/AltMouse/SEQUENCE-transcript-dbase.txt"
    update.verifyFile(exon_protein_sequence_file, array_type)
    transcript_cdna_sequence_dbase, transcript_associations, exon_sequence_database = import_existing_sequence_build(exon_protein_sequence_file)

    # --- Export 1: sequence of every critical exon ---
    critical_exon_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-exon-seq.txt'
    update.verifyFile(critical_exon_seq_export, array_type)
    fn = filepath(critical_exon_seq_export)
    data = open(fn, 'w')
    # NOTE(review): the header names do not line up with the rows written below
    # (gene:exon, probeset comparisons, exon sequence) - confirm intended
    title = ['Affygene:exon', 'critical_exon-num', 'critical-probeset-comps']
    title = string.join(title, '\t') + '\n'
    data.write(title)
    for (gene, exon_num) in critical_exon_db:
        probeset_comp_list = critical_exon_db[(gene, exon_num)]
        probeset_comp_list = string.join(probeset_comp_list, '|')
        try:  ###Restrict export to previously exported critical exons (ExonAnnotate_module)
            exon_sequence_database[(gene, exon_num)]
            esd = exon_sequence_database[(gene, exon_num)]
            exon_seq = esd.ExonSeq()
            exon_data = string.join([gene + ':' + exon_num, probeset_comp_list, exon_seq], '\t') + '\n'
            data.write(exon_data)
        except KeyError:
            null = []  # exons without a stored sequence are skipped
    data.close()

    # --- Probe sequences per probeset ---
    probeset_seq_file = 'AltDatabase/Mm/AltMouse/probeset_sequence_reversed.txt'
    update.verifyFile(probeset_seq_file, array_type)
    probeset_seq_db = {}
    x = 0
    fn = filepath(probeset_seq_file)
    for line in open(fn, 'rU').xreadlines():
        if x == 0:
            x = 1  # first line is skipped as a header
        else:
            data = cleanUpLine(line)
            t = string.split(data, '\t')
            probeset = t[0]
            probeset_seq_list = t[1:]
            probeset_seq_db[probeset] = probeset_seq_list

    # --- Export 2: junction sequence and the probed region for each probeset ---
    critical_junction_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-junction-seq.txt'
    update.verifyFile(critical_junction_seq_export, array_type)
    fn = filepath(critical_junction_seq_export)
    data = open(fn, 'w')
    x = 0  # probe sequences not located in the junction sequence
    k = 0  # probe sequences located in the junction sequence
    l = 0
    title = ['probeset', 'probeset-seq', 'junction-seq']
    title = string.join(title, '\t') + '\n'
    data.write(title)
    for (probeset, gene) in probeset_exon_db:
        junction_seq = []
        y = 0  # number of exons of this probeset that have a stored sequence
        positions = []
        try:
            # A probeset without probe sequences raises KeyError and is skipped (outer except)
            probeset_seq_list = probeset_seq_db[probeset]
            for exon_num in probeset_exon_db[(probeset, gene)]:
                try:  ###Restrict export to previously exported critical exons (ExonAnnotate_module)
                    exon_sequence_database[(gene, exon_num)]
                    esd = exon_sequence_database[(gene, exon_num)]
                    exon_seq = esd.ExonSeq()
                    strand = esd.Strand()
                    junction_seq.append(exon_seq)
                    y += 1
                    #exon_data = string.join([gene+':'+exon_num,probeset_comp_list,exon_seq],'\t')+'\n'
                    #data.write(exon_data)
                except KeyError:
                    null = []
            #if 'E5' in probeset_exon_db[(probeset,gene)]:
            if y > 0:
                # strand comes from the last exon found above
                if strand == '-':
                    junction_seq.reverse()
                junction_seq_str = string.join(junction_seq, '')
                junction_seq_str = string.upper(junction_seq_str)
                not_found = 0
                for probeset_seq in probeset_seq_list:
                    #probeset_seq = reverse_string(probeset_seq)
                    probeset_seq_rev = reverse_orientation(probeset_seq)  # NOTE(review): result is not used below
                    if probeset_seq in junction_seq_str:
                        f = string.find(junction_seq_str, probeset_seq)
                        positions.append((f, len(probeset_seq)))
                        k += 1
                    else:
                        not_found = 1
                        x += 1
                # NOTE(review): if probeset_seq_list is empty neither assignment below runs,
                # so new_probeset_seq keeps its value from an earlier probeset (or is unbound)
                if not_found == 1:
                    new_probeset_seq = probeset_seq_list[0]  ###pick the first probe sequence found
                if len(positions) > 0:
                    # Span from the start of the first match to the end of the last match
                    positions.sort()
                    new_probeset_seq = junction_seq_str[positions[0][0]:positions[-1][0] + positions[-1][1]]
                    #print new_probeset_seq,positions, probeset,probeset_exon_db[(probeset,gene)],probeset_seq_list,junction_seq;kill
                junction_seq = string.join(junction_seq, '|')  ###indicate where the junction is
                probe_seq_data = string.join([probeset, new_probeset_seq, junction_seq], '\t') + '\n'
                data.write(probe_seq_data)
        except KeyError:
            null = []
    data.close()
    print k, x
def importJunctionAnnotationDatabaseAndSequence(species, array_type, biotype): """This function imports AffyGene-Ensembl relationships, junction probeset sequences, and recipricol junction comparisons. with data stored from this function, we can match probeset sequence to mRNAs and determine which combinations of probesets can be used as match-match or match-nulls.""" array_ens_db = {} if array_type == 'AltMouse': ### Import AffyGene to Ensembl associations (e.g., AltMouse array) filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '-Ensembl_relationships.txt' update.verifyFile(filename, array_type) ### Will force download if missing fn = filepath(filename) x = 0 for line in open(fn, 'rU').xreadlines(): data, newline = string.split(line, '\n') t = string.split(data, '\t') if x == 0: x = 1 else: array_gene, ens_gene = t try: array_ens_db[array_gene].append(ens_gene) except KeyError: array_ens_db[array_gene] = [ens_gene] print len(array_ens_db), 'Ensembl-AltMouse relationships imported.' 
if array_type == 'RNASeq' and coordinateBasedMatching == True: probeset_seq_db = {} else: filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_critical-junction-seq.txt' probeset_seq_db = importCriticalJunctionSeq(filename, species, array_type) ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data ###This short-cuts what we did in two function in ExonModule with exon level data if array_type == 'AltMouse': filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_junction-comparisons.txt' update.verifyFile(filename, array_type) ### Will force download if missing elif array_type == 'junction': filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_junction_comps_updated.txt' elif array_type == 'RNASeq': filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_junction_comps.txt' fn = filepath(filename) probeset_gene_seq_db = {} added_probesets = {} pairwise_probesets = {} x = 0 for line in open(fn, 'rU').xreadlines(): data, newline = string.split(line, '\n') t = string.split(data, '\t') if x == 0: x = 1 else: if (array_type == 'junction' or array_type == 'RNASeq'): array_gene, critical_exons, excl_junction, incl_junction, probeset2, probeset1, data_source = t array_ens_db[array_gene] = [array_gene] elif array_type == 'AltMouse': array_gene, probeset1, probeset2, critical_exons = t #; critical_exons = string.split(critical_exons,'|') probesets = [probeset1, probeset2] pairwise_probesets[probeset1, probeset2] = [] if array_gene in array_ens_db: ensembl_gene_ids = array_ens_db[array_gene] for probeset_id in probesets: if probeset_id in probeset_seq_db: probeset_seq, junction_seq = probeset_seq_db[ probeset_id] if biotype == 'gene': for ensembl_gene_id in ensembl_gene_ids: if probeset_id not in added_probesets: probe_data = JunctionDataSimple( probeset_id, array_gene) probe_data.SetExonSeq(probeset_seq) 
probe_data.SetJunctionSeq(junction_seq) try: probeset_gene_seq_db[ ensembl_gene_id].append(probe_data) except KeyError: probeset_gene_seq_db[ ensembl_gene_id] = [probe_data] added_probesets[probeset_id] = [] elif array_type == 'RNASeq' and coordinateBasedMatching == True: ### Coordinate matching as opposed to sequence if biotype == 'gene': for ensembl_gene_id in ensembl_gene_ids: if probeset_id not in added_probesets: probe_data = JunctionDataSimple( probeset_id, array_gene) try: probeset_gene_seq_db[ ensembl_gene_id].append(probe_data) except KeyError: probeset_gene_seq_db[ ensembl_gene_id] = [probe_data] added_probesets[probeset_id] = [] print len(probeset_gene_seq_db), "genes with probeset sequence associated" return probeset_gene_seq_db, pairwise_probesets
def importJunctionAnnotationDatabaseAndSequence(species, array_type, biotype): """This function imports AffyGene-Ensembl relationships, junction probeset sequences, and recipricol junction comparisons. with data stored from this function, we can match probeset sequence to mRNAs and determine which combinations of probesets can be used as match-match or match-nulls.""" array_ens_db = {} if array_type == "AltMouse": ### Import AffyGene to Ensembl associations (e.g., AltMouse array) filename = "AltDatabase/" + species + "/" + array_type + "/" + array_type + "-Ensembl_relationships.txt" update.verifyFile(filename, array_type) ### Will force download if missing fn = filepath(filename) x = 0 for line in open(fn, "rU").xreadlines(): data, newline = string.split(line, "\n") t = string.split(data, "\t") if x == 0: x = 1 else: array_gene, ens_gene = t try: array_ens_db[array_gene].append(ens_gene) except KeyError: array_ens_db[array_gene] = [ens_gene] print len(array_ens_db), "Ensembl-AltMouse relationships imported." 
if array_type == "RNASeq" and coordinateBasedMatching == True: probeset_seq_db = {} else: filename = "AltDatabase/" + species + "/" + array_type + "/" + array_type + "_critical-junction-seq.txt" probeset_seq_db = importCriticalJunctionSeq(filename, species, array_type) ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data ###This short-cuts what we did in two function in ExonModule with exon level data if array_type == "AltMouse": filename = "AltDatabase/" + species + "/" + array_type + "/" + array_type + "_junction-comparisons.txt" update.verifyFile(filename, array_type) ### Will force download if missing elif array_type == "junction": filename = "AltDatabase/" + species + "/" + array_type + "/" + species + "_junction_comps_updated.txt" elif array_type == "RNASeq": filename = "AltDatabase/" + species + "/" + array_type + "/" + species + "_junction_comps.txt" fn = filepath(filename) probeset_gene_seq_db = {} added_probesets = {} pairwise_probesets = {} x = 0 for line in open(fn, "rU").xreadlines(): data, newline = string.split(line, "\n") t = string.split(data, "\t") if x == 0: x = 1 else: if array_type == "junction" or array_type == "RNASeq": array_gene, critical_exons, excl_junction, incl_junction, probeset2, probeset1, data_source = t array_ens_db[array_gene] = [array_gene] elif array_type == "AltMouse": array_gene, probeset1, probeset2, critical_exons = ( t ) # ; critical_exons = string.split(critical_exons,'|') probesets = [probeset1, probeset2] pairwise_probesets[probeset1, probeset2] = [] if array_gene in array_ens_db: ensembl_gene_ids = array_ens_db[array_gene] for probeset_id in probesets: if probeset_id in probeset_seq_db: probeset_seq, junction_seq = probeset_seq_db[probeset_id] if biotype == "gene": for ensembl_gene_id in ensembl_gene_ids: if probeset_id not in added_probesets: probe_data = JunctionDataSimple(probeset_id, array_gene) probe_data.SetExonSeq(probeset_seq) 
probe_data.SetJunctionSeq(junction_seq) try: probeset_gene_seq_db[ensembl_gene_id].append(probe_data) except KeyError: probeset_gene_seq_db[ensembl_gene_id] = [probe_data] added_probesets[probeset_id] = [] elif ( array_type == "RNASeq" and coordinateBasedMatching == True ): ### Coordinate matching as opposed to sequence if biotype == "gene": for ensembl_gene_id in ensembl_gene_ids: if probeset_id not in added_probesets: probe_data = JunctionDataSimple(probeset_id, array_gene) try: probeset_gene_seq_db[ensembl_gene_id].append(probe_data) except KeyError: probeset_gene_seq_db[ensembl_gene_id] = [probe_data] added_probesets[probeset_id] = [] print len(probeset_gene_seq_db), "genes with probeset sequence associated" return probeset_gene_seq_db, pairwise_probesets
def exportAltMouseExonSequence():
    """Export critical exon and critical junction sequences for the AltMouse array.

    Inputs (all under 'AltDatabase/Mm/AltMouse/'): junction comparisons, MASTER
    probeset-transcript annotations, the SEQUENCE transcript database (read with
    import_existing_sequence_build) and the reversed probeset sequences.
    Outputs: 'AltMouse_critical-exon-seq.txt' and 'AltMouse_critical-junction-seq.txt'.
    Prints two counters at the end (probe sequences located / not located in a
    junction sequence). Takes no arguments and returns None.
    """
    probeset_exon_db={}; x=0
    species = 'Mm'; array_type = 'AltMouse'

    ### Critical exons and the probeset pairs that measure them
    critical_exon_import_file = 'AltDatabase/Mm/AltMouse/AltMouse_junction-comparisons.txt'
    update.verifyFile(critical_exon_import_file,array_type)
    critical_exon_db={}; critical_probesets={}
    fn=filepath(critical_exon_import_file)
    ### NOTE(review): no header line is skipped for this file - confirm it has none
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        gene,probeset1,probeset2,critical_exons=string.split(data,'\t')
        critical_exons= string.split(critical_exons,'|')
        for exon in critical_exons:
            ### (gene,exon) -> list of 'probeset1-probeset2' comparison labels
            try: critical_exon_db[gene,exon].append(probeset1+'-'+probeset2)
            except KeyError: critical_exon_db[gene,exon] = [probeset1+'-'+probeset2]
        critical_probesets[probeset1]=[]; critical_probesets[probeset2]=[]

    ### Exon composition of each critical probeset
    probeset_annotations_file = "AltDatabase/Mm/AltMouse/MASTER-probeset-transcript.txt"
    update.verifyFile(probeset_annotations_file,array_type)
    fn=filepath(probeset_annotations_file)
    for line in open(fn,'rU').xreadlines():
        probeset_data = cleanUpLine(line)  #remove endline
        if x==0: x=1 ### first line is skipped as a header
        else:
            probeset,affygene,exons,transcript_num,transcripts,probe_type_call,ensembl,block_exon_ids,block_structure,comparison_info = string.split(probeset_data,'\t')
            if probeset in critical_probesets:
                ### The last character of both fields is dropped before use
                exons = exons[:-1]; exons = string.split(exons,'-')
                affygene = affygene[:-1]
                ### NOTE(review): no 'kill' is assigned in this function; presumably a deliberate NameError to abort - confirm
                if '|' in exons: print exons;kill
                probeset_exon_db[probeset,affygene]=exons

    exon_protein_sequence_file = "AltDatabase/Mm/AltMouse/SEQUENCE-transcript-dbase.txt"
    update.verifyFile(exon_protein_sequence_file,array_type)
    transcript_cdna_sequence_dbase,transcript_associations,exon_sequence_database = import_existing_sequence_build(exon_protein_sequence_file)

    ### Export 1: sequence of every critical exon
    critical_exon_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-exon-seq.txt'
    update.verifyFile(critical_exon_seq_export,array_type)
    fn=filepath(critical_exon_seq_export)
    data = open(fn,'w')
    ### NOTE(review): the header names do not line up with the rows written below (gene:exon, probeset comparisons, exon sequence) - confirm intended
    title = ['Affygene:exon','critical_exon-num','critical-probeset-comps']; title = string.join(title,'\t')+'\n'; data.write(title)
    for (gene,exon_num) in critical_exon_db:
        probeset_comp_list = critical_exon_db[(gene,exon_num)]; probeset_comp_list = string.join(probeset_comp_list,'|')
        try: ###Restrict export to previously exported critical exons (ExonAnnotate_module)
            exon_sequence_database[(gene,exon_num)]; esd = exon_sequence_database[(gene,exon_num)]
            exon_seq = esd.ExonSeq()
            exon_data = string.join([gene+':'+exon_num,probeset_comp_list,exon_seq],'\t')+'\n'
            data.write(exon_data)
        except KeyError: null=[] ### exons without a stored sequence are skipped
    data.close()

    ### Probe sequences per probeset
    probeset_seq_file = 'AltDatabase/Mm/AltMouse/probeset_sequence_reversed.txt'
    update.verifyFile(probeset_seq_file,array_type)
    probeset_seq_db={}; x=0
    fn=filepath(probeset_seq_file)
    for line in open(fn,'rU').xreadlines():
        if x == 0: x=1 ### first line is skipped as a header
        else:
            data = cleanUpLine(line); t = string.split(data,'\t')
            probeset = t[0]
            probeset_seq_list = t[1:]
            probeset_seq_db[probeset] = probeset_seq_list

    ### Export 2: junction sequence and the probed region for each probeset
    critical_junction_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-junction-seq.txt'
    update.verifyFile(critical_junction_seq_export,array_type)
    fn=filepath(critical_junction_seq_export)
    ### x counts probe sequences not located in the junction sequence, k those located; l is unused below
    data = open(fn,'w'); x=0; k=0;l=0
    title = ['probeset','probeset-seq','junction-seq']; title = string.join(title,'\t')+'\n'; data.write(title)
    for (probeset,gene) in probeset_exon_db:
        ### y counts the exons of this probeset that have a stored sequence
        junction_seq = []; y=0; positions=[]
        try:
            ### A probeset without probe sequences raises KeyError and is skipped (outer except)
            probeset_seq_list = probeset_seq_db[probeset]
            for exon_num in probeset_exon_db[(probeset,gene)]:
                try: ###Restrict export to previously exported critical exons (ExonAnnotate_module)
                    exon_sequence_database[(gene,exon_num)]; esd = exon_sequence_database[(gene,exon_num)]
                    exon_seq = esd.ExonSeq(); strand = esd.Strand()
                    junction_seq.append(exon_seq); y+=1
                    #exon_data = string.join([gene+':'+exon_num,probeset_comp_list,exon_seq],'\t')+'\n'
                    #data.write(exon_data)
                except KeyError: null=[]
            #if 'E5' in probeset_exon_db[(probeset,gene)]:
            if y>0:
                ### strand comes from the last exon found above
                if strand == '-': junction_seq.reverse()
                junction_seq_str = string.join(junction_seq,'')
                junction_seq_str = string.upper(junction_seq_str)
                not_found = 0
                for probeset_seq in probeset_seq_list:
                    #probeset_seq = reverse_string(probeset_seq)
                    ### NOTE(review): probeset_seq_rev is not used below
                    probeset_seq_rev = reverse_orientation(probeset_seq)
                    if probeset_seq in junction_seq_str:
                        f = string.find(junction_seq_str,probeset_seq)
                        positions.append((f,len(probeset_seq)))
                        k+=1
                    else:
                        not_found = 1
                        x+=1
                ### NOTE(review): if probeset_seq_list is empty neither assignment below runs, so
                ### new_probeset_seq keeps its value from an earlier probeset (or is unbound)
                if not_found == 1:
                    new_probeset_seq = probeset_seq_list[0] ###pick the first probe sequence found
                if len(positions)>0:
                    ### Span from the start of the first match to the end of the last match
                    positions.sort()
                    new_probeset_seq = junction_seq_str[positions[0][0]:positions[-1][0]+positions[-1][1]]
                    #print new_probeset_seq,positions, probeset,probeset_exon_db[(probeset,gene)],probeset_seq_list,junction_seq;kill
                junction_seq = string.join(junction_seq,'|') ###indicate where the junction is
                probe_seq_data = string.join([probeset,new_probeset_seq,junction_seq],'\t')+'\n'
                data.write(probe_seq_data)
        except KeyError: null=[]
    data.close()
    print k,x