def geneCoordinates(species,symbols): genes=[] import EnsemblImport ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True) for symbol in symbols: ens_geneid = ensembl_annotation_db[symbol] genes.append((ens_geneid,symbol)) ### Get gene genomic locations gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array') search_locations=[] for (gene,symbol) in genes: chr,strand,start,end = gene_location_db[gene] if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953' if len(chr)>6: print symbol, 'bad chromosomal reference:',chr else: search_locations.append([chr,strand,start,end,symbol])
def alignProbesetsToTranscripts(species,array_type,Analysis_type,Force, CoordinateBasedMatching = False): global force; force = Force; global analysis_type; analysis_type = Analysis_type global coordinateBasedMatching; coordinateBasedMatching = CoordinateBasedMatching """Match exon or junction probeset sequences to Ensembl and USCS mRNA transcripts""" if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq': data_type = 'junctions'; probeset_seq_file=''; biotype = 'gene' if data_type == 'junctions' and analysis_type == 'reciprocal': start_time = time.time() ### Indicates whether to store information at the level of genes or probesets probeset_seq_db,pairwise_probeset_combinations = importJunctionAnnotationDatabaseAndSequence(species,array_type,biotype) end_time = time.time(); time_diff = int(end_time-start_time) elif analysis_type == 'single': start_time = time.time() probeset_seq_db,pairwise_probeset_combinations = importAllJunctionSequences(species,array_type) end_time = time.time(); time_diff = int(end_time-start_time) print "Analyses finished in %d seconds" % time_diff elif array_type == 'exon': data_type = 'exon' probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt' ###Import probe-level associations exon_db = importSplicingAnnotationDatabase(probeset_annotations_file) start_time = time.time() probeset_seq_db = importProbesetSequences(exon_db,species) end_time = time.time(); time_diff = int(end_time-start_time) print "Analyses finished in %d seconds" % time_diff ### Match probesets to mRNAs\= import EnsemblImport if coordinateBasedMatching == True and array_type == 'RNASeq': EnsemblImport.exportTranscriptExonIDAssociations(species) matchTranscriptExonIDsToJunctionIDs(species,array_type,probeset_seq_db) ### no sequences in probeset_seq_db, just junctionIDs else: #matchTranscriptExonIDsToJunctionIDs(species,array_type,probeset_seq_db) ### no sequences in probeset_seq_db, just junctionIDs 
importEnsemblTranscriptSequence(species,array_type,probeset_seq_db) try: mRNASeqAlign.importUCSCTranscriptSequences(species,array_type,probeset_seq_db) except Exception: pass ### If the species not supported by UCSC - the UCSC file is not written, but the other mRNA_alignments files should be available probeset_seq_db={} ### Re-set db ### Import results if junction array to make comparisons valid for junction-pairs rather than a single probeset if data_type == 'junctions': ### Re-import matches from above and export matching and non-matching transcripts for each probeset to a new file import_dir = '/AltDatabase/'+species+'/SequenceData/output' g = GrabFiles(); g.setdirectory(import_dir) align_files = g.searchdirectory('mRNA_alignments') reAnalyzeRNAProbesetMatches(align_files,species,array_type,pairwise_probeset_combinations)
def findSpeciesInUniProtFiles(force): ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs import AltAnalyze ###Get species annotations from the GO-Elite config species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={} for species_full in species_annot_db: taxid=species_annot_db[species_full].TaxID() tax_db[taxid]=species_full if force == 'yes': ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl import export; import update filesearch = '_sprot_' all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch) for file in all_swissprot: gz_filepath, status = update.download(file,'uniprot_temp/','') if status == 'not-removed': try: os.remove(gz_filepath) ### Not sure why this works now and not before except OSError: status = status species_uniprot_db={}; altanalyze_species_uniprot_db={} dir=read_directory('/uniprot_temp') for filename in dir: fn=filepath('uniprot_temp/'+filename) for line in open(fn,'r').xreadlines(): data = cleanUpLine(line) if data[0:2] == 'OX': taxid = string.split(data,'=')[1][:-1] if taxid in tax_db: species_full = tax_db[taxid] elif data[0:2] == 'OS': species = data[5:] species = string.split(species,' ')[:2] species_full = string.join(species,' ') elif data[0] == '/': url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename ss = string.split(species_full,' ') if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz')) except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')] taxid = ''; species_full = '' import EnsemblImport species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db) ### Export all 
species to UniProt file relationships so this function needs to only be run once import export up = export.ExportFile('Config/uniprot-species-file.txt') for species_full in species_uniprot_db: values = species_uniprot_db[species_full] if len(values)>1: found = 'no' for (taxid,url) in values: if taxid in tax_db: if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break if found == 'yes': break else: (taxid,url) = values[0] up.write(string.join([species_full,taxid,url],'\t')+'\n') up.close()
def geneCoordinates(species,symbols): genes=[] import EnsemblImport ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True) for symbol in symbols: if symbol in ensembl_annotation_db: ens_geneid = ensembl_annotation_db[symbol] genes.append((ens_geneid,symbol)) else: print symbol, 'not found' ### Get gene genomic locations gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array') search_locations=[] for (gene,symbol) in genes: chr,strand,start,end = gene_location_db[gene] #if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953' if len(chr)>6: print symbol, 'bad chromosomal reference:',chr else: search_locations.append([chr,strand,start,end,symbol])
def exportChromosomeStrandCoordinates(species): import EnsemblImport gene_location_db = EnsemblImport.getEnsemblGeneLocations( species, 'RNASeq', 'key_by_array') import ExpressionBuilder gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations( species) export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt' export_data = export.ExportFile(export_path) import ExonAnalyze_module gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt" annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, 'RNASeq') print 'Annotations for', len(gene_location_db), 'genes imported' sorted_list = [] protein_coding = 0 for gene in gene_location_db: chr, strand, start, end = gene_location_db[gene] if gene in gene_biotype_db: biotype = gene_biotype_db[gene][-1] if biotype == 'protein_coding': protein_coding += 1 else: biotype = 'NA' if len(chr) < 7: sorted_list.append( [chr, strand, int(start), int(end), gene, biotype]) #else: print chr;sys.exit() print len(sorted_list), 'genes for typical chromosomes present' print protein_coding, 'protein coding genes present' sorted_list.sort() for values in sorted_list: chr, strand, start, end, gene, biotype = values try: symbol = annotate_db[gene].Symbol() except Exception: symbol = '' values = [gene, symbol, chr, strand, str(start), str(end), biotype] export_data.write(string.join(values, '\t') + '\n') export_data.close() print species, 'chromosome locations exported to:\n', export_path
def grabExonIntronPromoterSequences(species,array_type,data_type,output_types):
    """Export sequences for every result file found in the sequence_input directory.

    species -- species code used to build the AltDatabase paths
    array_type -- platform sub-directory name
    data_type -- 'probeset' (reads AltResults/AlternativeOutput) or 'gene' (reads ExpressionOutput)
    output_types -- one output type name, a list/tuple of names, or 'all'; with
        data_type 'probeset', 'all' expands to alt-promoter, promoter, exon,
        adjacent-exons and adjacent-introns

    For each input file and each output type one file named
    <input name minus its last four characters>-<data_type>-<output_type>.txt is
    written to the matching sequence_output directory.

    Raises ValueError for an unsupported data_type.
    """
    if data_type == 'probeset':
        sequence_input_dir = '/AltResults/AlternativeOutput/'+array_type+'/sequence_input'
    elif data_type == 'gene':
        sequence_input_dir = '/ExpressionOutput/'+array_type+'/sequence_input'
    else:
        raise ValueError("data_type must be 'probeset' or 'gene', not %r" % (data_type,))

    ### Normalise the requested output types once. The original re-wrapped
    ### output_types inside the per-file loop, producing a nested list (and a
    ### TypeError while building the filename) from the second input file on.
    if output_types == 'all' and data_type == 'probeset':
        output_type_list = ['alt-promoter','promoter','exon','adjacent-exons','adjacent-introns']
    elif isinstance(output_types,(list,tuple)):
        output_type_list = list(output_types)
    else:
        output_type_list = [output_types]

    ### Paths that do not depend on the individual input file
    export_exon_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
    gene_seq_filename = 'AltDatabase/ensembl/'+species+'/'+species+'_gene-seq-2000_flank'
    sequence_output_dir = sequence_input_dir.replace('_input','_output')

    for input_file in read_directory(sequence_input_dir):
        ### The leading '/' is dropped to obtain the relative path used by the importers
        filedir = sequence_input_dir[1:]+'/'+input_file
        filter_db = inputResultFiles(filedir,data_type)
        ensembl_probeset_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(export_exon_filename,data_type,filter_db)
        ensembl_probeset_db = EnsemblImport.import_sequence_data(gene_seq_filename,ensembl_probeset_db,species,'get_sequence')

        for output_type in output_type_list:
            filename = sequence_output_dir[1:]+'/'+input_file[:-4]+'-'+data_type+'-'+output_type+'.txt'
            exportExonIntronPromoterSequences(filename, ensembl_probeset_db,data_type,output_type)
def exportChromosomeStrandCoordinates(species): import EnsemblImport gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array') import ExpressionBuilder gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species) export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt' export_data = export.ExportFile(export_path) import ExonAnalyze_module gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt" annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq') print 'Annotations for',len(gene_location_db),'genes imported' sorted_list=[]; protein_coding=0 for gene in gene_location_db: chr,strand,start,end = gene_location_db[gene] if gene in gene_biotype_db: biotype = gene_biotype_db[gene][-1] if biotype == 'protein_coding': protein_coding+=1 else: biotype = 'NA' if len(chr)<7: sorted_list.append([chr,strand,int(start),int(end),gene,biotype]) #else: print chr;sys.exit() print len(sorted_list),'genes for typical chromosomes present' print protein_coding, 'protein coding genes present' sorted_list.sort() for values in sorted_list: chr,strand,start,end,gene,biotype=values try: symbol = annotate_db[gene].Symbol() except Exception: symbol = '' values = [gene,symbol,chr,strand,str(start),str(end),biotype] export_data.write(string.join(values,'\t')+'\n') export_data.close() print species, 'chromosome locations exported to:\n',export_path
def reformatPolyAdenylationCoordinates(species,force): """ PolyA annotations are currently only available from UCSC for human, but flat file annotations from 2003-2006 are available for multiple species. Convert these to BED format""" version={} version['Rn'] = '2003(rn3)' version['Dr'] = '2003(zv4)' version['Gg'] = '2004(galGal2)' version['Hs'] = '2006(hg8)' version['Mm'] = '2004(mm5)' print 'Exporting polyADB_2 coordinates as BED for',species ### Obtain the necessary database files url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt' output_dir = 'AltDatabase/ucsc/'+species + '/' if force == 'yes': filename, status = update.download(url,output_dir,'') else: filename = output_dir+'polyAsite.txt' ### Import the refseq to Ensembl information import gene_associations; import OBO_import; import EnsemblImport; import export try: ens_unigene = gene_associations.getGeneToUid(species,'Ensembl-UniGene') print len(ens_unigene),'Ensembl-UniGene entries imported' external_ensembl = OBO_import.swapKeyValues(ens_unigene); use_entrez='no' except Exception: ens_entrez = gene_associations.getGeneToUid(species,'Ensembl-EntrezGene') print len(ens_entrez),'Ensembl-EntrezGene entries imported' external_ensembl = OBO_import.swapKeyValues(ens_entrez); use_entrez='yes' gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array') export_bedfile = output_dir+species+'_polyADB_2_predictions.bed' print 'exporting',export_bedfile export_data = export.ExportFile(export_bedfile) header = '#'+species+'\t'+'polyADB_2'+'\t'+version[species]+'\n' export_data.write(header) fn=filepath(filename); x=0; not_found={} for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) if x==0: x=1 else: siteid,llid,chr,sitenum,position,supporting_EST,cleavage = string.split(data,'\t') if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the 
Affymetrix and UCSC convention if species in siteid: if 'NA' not in chr: chr = 'chr'+chr strand = '+'; geneid = siteid pos_start = str(int(position)-1); pos_end = position if use_entrez=='no': external_geneid = string.join(string.split(siteid,'.')[:2],'.') else: external_geneid=llid if external_geneid in external_ensembl: ens_geneid = external_ensembl[external_geneid][0] geneid += '-'+ens_geneid chr,strand,start,end = gene_location_db[ens_geneid] else: not_found[external_geneid]=[] bed_format = string.join([chr,pos_start,pos_end,geneid,'0','-'],'\t')+'\n' ### We don't know the strand, so write out both strands export_data.write(bed_format) bed_format = string.join([chr,pos_start,pos_end,geneid,'0',strand],'\t')+'\n' export_data.write(bed_format) export_data.close()
def alignProbesetsToTranscripts(species, array_type, Analysis_type, Force, CoordinateBasedMatching=False): global force force = Force global analysis_type analysis_type = Analysis_type global coordinateBasedMatching coordinateBasedMatching = CoordinateBasedMatching """Match exon or junction probeset sequences to Ensembl and USCS mRNA transcripts""" if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq': data_type = 'junctions' probeset_seq_file = '' biotype = 'gene' if data_type == 'junctions' and analysis_type == 'reciprocal': start_time = time.time( ) ### Indicates whether to store information at the level of genes or probesets probeset_seq_db, pairwise_probeset_combinations = importJunctionAnnotationDatabaseAndSequence( species, array_type, biotype) end_time = time.time() time_diff = int(end_time - start_time) elif analysis_type == 'single': start_time = time.time() probeset_seq_db, pairwise_probeset_combinations = importAllJunctionSequences( species, array_type) end_time = time.time() time_diff = int(end_time - start_time) print "Analyses finished in %d seconds" % time_diff elif array_type == 'exon': data_type = 'exon' probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt' ###Import probe-level associations exon_db = importSplicingAnnotationDatabase(probeset_annotations_file) start_time = time.time() probeset_seq_db = importProbesetSequences(exon_db, species) end_time = time.time() time_diff = int(end_time - start_time) print "Analyses finished in %d seconds" % time_diff ### Match probesets to mRNAs\= import EnsemblImport if coordinateBasedMatching == True and array_type == 'RNASeq': EnsemblImport.exportTranscriptExonIDAssociations(species) matchTranscriptExonIDsToJunctionIDs( species, array_type, probeset_seq_db ) ### no sequences in probeset_seq_db, just junctionIDs else: #matchTranscriptExonIDsToJunctionIDs(species,array_type,probeset_seq_db) ### no sequences in 
probeset_seq_db, just junctionIDs importEnsemblTranscriptSequence(species, array_type, probeset_seq_db) try: mRNASeqAlign.importUCSCTranscriptSequences(species, array_type, probeset_seq_db) except Exception: pass ### If the species not supported by UCSC - the UCSC file is not written, but the other mRNA_alignments files should be available probeset_seq_db = {} ### Re-set db ### Import results if junction array to make comparisons valid for junction-pairs rather than a single probeset if data_type == 'junctions': ### Re-import matches from above and export matching and non-matching transcripts for each probeset to a new file import_dir = '/AltDatabase/' + species + '/SequenceData/output' g = GrabFiles() g.setdirectory(import_dir) align_files = g.searchdirectory('mRNA_alignments') reAnalyzeRNAProbesetMatches(align_files, species, array_type, pairwise_probeset_combinations)
def reformatPolyAdenylationCoordinates(species, force): """ PolyA annotations are currently only available from UCSC for human, but flat file annotations from 2003-2006 are available for multiple species. Convert these to BED format""" version = {} version['Rn'] = '2003(rn3)' version['Dr'] = '2003(zv4)' version['Gg'] = '2004(galGal2)' version['Hs'] = '2006(hg8)' version['Mm'] = '2004(mm5)' print 'Exporting polyADB_2 coordinates as BED for', species ### Obtain the necessary database files url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt' output_dir = 'AltDatabase/ucsc/' + species + '/' if force == 'yes': filename, status = update.download(url, output_dir, '') else: filename = output_dir + 'polyAsite.txt' ### Import the refseq to Ensembl information import gene_associations import OBO_import import EnsemblImport import export try: ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene') print len(ens_unigene), 'Ensembl-UniGene entries imported' external_ensembl = OBO_import.swapKeyValues(ens_unigene) use_entrez = 'no' except Exception: ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene') print len(ens_entrez), 'Ensembl-EntrezGene entries imported' external_ensembl = OBO_import.swapKeyValues(ens_entrez) use_entrez = 'yes' gene_location_db = EnsemblImport.getEnsemblGeneLocations( species, 'RNASeq', 'key_by_array') export_bedfile = output_dir + species + '_polyADB_2_predictions.bed' print 'exporting', export_bedfile export_data = export.ExportFile(export_bedfile) header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n' export_data.write(header) fn = filepath(filename) x = 0 not_found = {} for line in open(fn, 'rU').xreadlines(): data = cleanUpLine(line) if x == 0: x = 1 else: siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split( data, '\t') if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention if chr == 'M': chr = 'MT' 
### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention if species in siteid: if 'NA' not in chr: chr = 'chr' + chr strand = '+' geneid = siteid pos_start = str(int(position) - 1) pos_end = position if use_entrez == 'no': external_geneid = string.join( string.split(siteid, '.')[:2], '.') else: external_geneid = llid if external_geneid in external_ensembl: ens_geneid = external_ensembl[external_geneid][0] geneid += '-' + ens_geneid chr, strand, start, end = gene_location_db[ens_geneid] else: not_found[external_geneid] = [] bed_format = string.join( [chr, pos_start, pos_end, geneid, '0', '-'], '\t' ) + '\n' ### We don't know the strand, so write out both strands export_data.write(bed_format) bed_format = string.join( [chr, pos_start, pos_end, geneid, '0', strand], '\t') + '\n' export_data.write(bed_format) export_data.close()
def getAnnotations(Species,array_type,reannotate_exon_seq,force): """Annotate Affymetrix exon array data using files Ensembl data (sync'ed to genome release).""" global species; species = Species; global test; global test_cluster test = 'no'; test_cluster = ['TC0701360']; data_type = 'mRNA' global ensembl_exon_db; global ensembl_exon_db; global exon_clusters; global exon_region_db ensembl_exon_db,ensembl_annot_db,exon_clusters,intron_clusters,exon_region_db,intron_retention_db,ucsc_splicing_annot_db,ens_transcript_db = EnsemblImport.getEnsemblAssociations(species,data_type,test) ensembl_probeset_db = importCriticalExonLocations(species,array_type,ensembl_exon_db,force) ###Get Pre-computed genomic locations for critical exons ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons(ensembl_probeset_db,exon_clusters,ensembl_exon_db,exon_region_db,intron_retention_db,intron_clusters,ucsc_splicing_annot_db); constitutive_gene_db={} ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(array_type,ensembl_probeset_db,species) print "\nCritical exon data exported coordinates, exon associations and splicing annotations exported..." ### Change filenames to reflect junction array type export_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'; ef=filepath(export_filename) export_replacement = string.replace(export_filename,'_probe','_'+array_type+'_probe') er=filepath(export_replacement); shutil.copyfile(ef,er); os.remove(ef) ### Copy file to a new name ### Export full exon seqeunce for probesets/critical exons to replace the original incomplete sequence (used for miRNA analyses) if reannotate_exon_seq == 'yes': JunctionArray.reAnnotateCriticalExonSequences(species,array_type)