def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    """Align UCSC mRNA sequences to probeset sequences and export the results.

    Reads the FASTA file AltDatabase/SPECIES/SequenceData/mrna.fa. For every
    record whose accession is a key of the mapping returned by
    importUCSCTranscriptAssociations(species), each associated gene that is
    also a key of probeset_seq_db is passed to simpleSeqMatchProtocol, and two
    tab-delimited files are written under AltDatabase/SPECIES/SequenceData/output/:
      ARRAYTYPE_UCSC-mRNA_alignments.txt           probeset, call, accession
      sequences/ARRAYTYPE_UCSC_mRNA_seqmatches.txt accession, upper-cased sequence

    Arguments:
      species -- key into UI.getSpeciesInfo() and path component of the
        AltDatabase directories.
      array_type -- prefix of the two output file names.
      probeset_seq_db -- mapping whose keys are gene IDs and whose values are
        handed unchanged to simpleSeqMatchProtocol.

    Returns None.

    Reads the module-level name `force` (it is not a parameter): when it
    equals 'yes', mrna.fa.gz is first downloaded from the UCSC FTP site. A
    failed download is ignored and the existing local file is used.

    Single-argument print(...) is used throughout; it prints the same text
    whether print is a statement or a function.
    """
    start_time = time.time()

    if force == 'yes':
        ### Download mRNA sequence file from website
        import UI
        species_names = UI.getSpeciesInfo()
        species_full = species_names[species].replace(' ', '_')
        ucsc_mRNA_dir = update.getFTPData(
            'hgdownload.cse.ucsc.edu',
            '/goldenPath/currentGenomes/' + species_full + '/bigZips',
            'mrna.fa.gz')
        output_dir = 'AltDatabase/' + species + '/SequenceData/'
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir, '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath)
                except OSError:
                    pass  ### Best effort: keep going if the archive cannot be deleted
        except Exception:
            pass  ### Occurs when file is not available for this species

    filename = 'AltDatabase/' + species + '/SequenceData/mrna.fa'
    output_root = 'AltDatabase/' + species + '/SequenceData/output/'
    dataw = export.ExportFile(output_root + array_type + '_UCSC-mRNA_alignments.txt')
    datar = export.ExportFile(output_root + 'sequences/' + array_type + '_UCSC_mRNA_seqmatches.txt')

    try:
        ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

        def exportRecord(accession, sequence_parts):
            ### Write the alignment rows (and the sequence row) for one FASTA record.
            ### Records with no header, no sequence or no gene association are skipped.
            if accession is None or not sequence_parts:
                return
            if accession not in ucsc_mrna_to_gene:
                return
            mRNA_seq = ''.join(sequence_parts).upper()
            gene_found = False
            try:
                for ens_gene in ucsc_mrna_to_gene[accession]:
                    if ens_gene in probeset_seq_db:
                        gene_found = True
                        results = simpleSeqMatchProtocol(probeset_seq_db[ens_gene], mRNA_seq)
                        for (call, probeset) in results:
                            dataw.write('\t'.join([probeset, str(call), accession]) + '\n')
            except IndexError:
                ### The previous code swallowed IndexError here silently and then
                ### merged the next record into this one; report it and move on.
                print('IndexError while matching UCSC mRNA ' + accession + '; record skipped')
                return
            if gene_found:
                datar.write('\t'.join([accession, mRNA_seq]) + '\n')

        print('Begining generic fasta import of ' + filename)
        #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
        #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
        fasta_file = open(filepath(filename), 'rU')
        try:
            accession = None
            sequence_parts = []  ### Joined once per record instead of repeated string concatenation
            for line in fasta_file:
                ### rstrip also accepts a final line that has no trailing newline
                data = line.rstrip('\n')
                if not data or data[0] == '#':
                    continue
                if data[0] == '>':
                    ### A new header ends the previous record
                    exportRecord(accession, sequence_parts)
                    accession = data.split(' ')[0][1:]
                    sequence_parts = []
                else:
                    sequence_parts.append(data)
            ### The last record has no following header, so flush it explicitly
            exportRecord(accession, sequence_parts)
        finally:
            fasta_file.close()
    finally:
        dataw.close()
        datar.close()

    time_diff = int(time.time() - start_time)
    print("UCSC mRNA sequences analyzed in %d seconds" % time_diff)
def findSpeciesInUniProtFiles(force):
    """Export the species / TaxID / file-URL table for the UniProt Swiss-Prot files.

    Scans every file listed by read_directory('/uniprot_temp'), collects a
    (taxid, url) pair for each record whose species name consists of exactly
    two words, removes duplicates with
    EnsemblImport.eliminate_redundant_dict_values and writes one
    tab-delimited line (species, taxid, url) per species to
    Config/uniprot-species-file.txt.

    Arguments:
      force -- when equal to 'yes', every file matching '_sprot_' in the
        UniProt taxonomic_divisions FTP directory is first downloaded into
        uniprot_temp/.

    Returns None.
    """
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for annotated_species in species_annot_db:
        tax_db[species_annot_db[annotated_species].TaxID()] = annotated_species

    if force == 'yes':
        ### Should only need to be run if UniProt changes its species to file associations or new species supported by Ensembl
        import export
        import update
        all_swissprot = update.getFTPData(
            'ftp.expasy.org',
            '/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',
            '_sprot_')
        for remote_file in all_swissprot:
            gz_filepath, status = update.download(remote_file, 'uniprot_temp/', '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath)
                except OSError:
                    pass  ### Best effort: keep going if the archive cannot be deleted

    species_uniprot_db = {}
    url_root = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'
    for filename in read_directory('/uniprot_temp'):
        fn = filepath('uniprot_temp/' + filename)
        ### Start every file with empty record state so that values left over from
        ### the GO-Elite loop or a previous file cannot be attributed to a record
        taxid = ''
        species_full = ''
        uniprot_file = open(fn, 'r')
        try:
            for line in uniprot_file:
                data = cleanUpLine(line)
                line_code = data[0:2]
                if line_code == 'OX':
                    ### Text after '=' minus its last character (presumably the trailing ';')
                    taxid = data.split('=')[1][:-1]
                    if taxid in tax_db:
                        species_full = tax_db[taxid]
                elif line_code == 'OS':
                    ### Keep only the first two words of the organism name
                    species_full = ' '.join(data[5:].split(' ')[:2])
                elif data[0:1] == '/':
                    ### Slicing (rather than data[0]) keeps an empty line from raising IndexError.
                    ### Presumably the '//' record terminator - store the record and reset.
                    if len(species_full.split(' ')) == 2:
                        ### Species name is in the format Homo sapiens
                        entry = (taxid, 'ftp://' + url_root + filename + '.gz')
                        try:
                            species_uniprot_db[species_full].append(entry)
                        except KeyError:
                            species_uniprot_db[species_full] = [entry]
                    taxid = ''
                    species_full = ''
        finally:
            uniprot_file.close()

    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)

    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    try:
        for species_full in species_uniprot_db:
            values = species_uniprot_db[species_full]
            if len(values) > 1:
                ### Prefer the pair whose TaxID maps back to this species in the GO-Elite
                ### config; if none does, the last pair of the list is the one written
                for (taxid, url) in values:
                    if taxid in tax_db and species_full == tax_db[taxid]:
                        print('ambiguity resolved: ' + species_full)
                        break
            else:
                (taxid, url) = values[0]
            up.write('\t'.join([species_full, taxid, url]) + '\n')
    finally:
        up.close()
def importmiRNAMap(parse_sequences, force):
    """Import miRNA target predictions from the miRNAMap target file.

    Added in AltAnalyze version 2.0, this database provides target sequences
    for several species and different databases, including miRanda, RNAhybrid
    and TargetScan. For more information see:
    http://mirnamap.mbc.nctu.edu.tw/html/about.html

    Arguments:
      parse_sequences -- when 'yes', target sequences are appended to the
        existing entries of the module-level combined_results; otherwise a
        MicroRNATargetData object is added to the module-level
        microRNA_target_db for every miRNA/gene pair.
      force -- when 'yes' (or when verifyFileAdvanced finds no local file),
        the archive is downloaded from the miRNAMap FTP site first.

    Returns the number of MicroRNATargetData objects created.

    Relies on the module-level names species, ens_gene_to_transcript,
    combined_results and microRNA_target_db. The first line of the file is
    skipped, and any row that raises an exception is ignored.
    """
    gz_filepath = verifyFileAdvanced('miRNA_targets_', species)
    if force == 'yes' or len(gz_filepath) == 0:
        import UI
        species_full = UI.getSpeciesInfo()[species].replace(' ', '_')
        miRNAMap_dir = update.getFTPData(
            'mirnamap.mbc.nctu.edu.tw',
            '/miRNAMap2/miRNA_Targets/' + species_full,
            '.txt.tar.gz')
        output_dir = 'AltDatabase/miRBS/' + species + '/'
        gz_filepath, status = update.download(miRNAMap_dir, output_dir, '')
        if status == 'not-removed':
            try:
                os.remove(gz_filepath)
            except OSError:
                pass

    fn = filepath(gz_filepath.replace('.tar.gz', ''))
    count = 0
    for row_index, line in enumerate(open(fn, 'rU').readlines()):
        data = cleanUpLine(line)
        fields = data.split('\t')
        if row_index == 0:
            continue  ### First row is not parsed
        try:
            (miRNA, ensembl_transcript_id, target_start, target_end, miRNA_seq,
             alignment, target_seq, algorithm, c1, c2, c3) = fields
            if ensembl_transcript_id in ens_gene_to_transcript:
                geneids = ens_gene_to_transcript[ensembl_transcript_id]
                ### Strip alignment gaps, upper-case and convert RNA to DNA letters
                target_seq = target_seq.replace('-', '').upper().replace('U', 'T')
                for ensembl_geneid in geneids:
                    pair = (miRNA, ensembl_geneid)
                    if parse_sequences == 'yes':
                        if pair in combined_results:
                            combined_results[pair].append(target_seq)
                    else:
                        y = MicroRNATargetData(ensembl_geneid, '', miRNA, target_seq, algorithm)
                        count += 1
                        if miRNA in microRNA_target_db:
                            microRNA_target_db[miRNA].append(y)
                        else:
                            microRNA_target_db[miRNA] = [y]
        except Exception:
            pass  ### Bad formatting

    print('%d miRNA-target relationships added for mirnamap' % count)
    return count
for line in open(fn,'rU').xreadlines(): counts+=1 if counts>10: break except Exception: counts=0 if species_name == 'counts': ### Used if the file cannot be downloaded from http://www.altanalyze.org return counts elif counts == 0: if species_name in filename: server_folder = species_name ### Folder equals species unless it is a universal file elif 'Mm' in filename: server_folder = 'Mm' ### For PicTar else: server_folder = 'all' print 'Downloading:',server_folder,filename update.downloadCurrentVersion(filename,server_folder,'txt') else: return counts if __name__ == '__main__': species = 'Hs'; #species_full = 'Drosophila_melanogaster' filename = 'AltDatabase/ucsc/'+species+'/polyaDb.txt' verifyFile(filename,species) ### Makes sure file is local and if not downloads. sys.exit() importEnsExonStructureData(species,[],[],[]);sys.exit() reformatPolyAdenylationCoordinates(species,'no');sys.exit() #test = 'yes' #test_gene = ['ENSG00000140153','ENSG00000075413'] import UCSCImport; import update knownAlt_dir = update.getFTPData('hgdownload.cse.ucsc.edu','/goldenPath/currentGenomes/'+species_full+'/database','knownAlt.txt.gz') polyA_dir = update.getFTPData('hgdownload.cse.ucsc.edu','/goldenPath/currentGenomes/'+species_full+'/database','polyaDb.txt.gz') output_dir = 'AltDatabase/ucsc/'+species + '/' UCSCImport.downloadFiles(knownAlt_dir,output_dir); UCSCImport.downloadFiles(polyA_dir,output_dir);sys.exit() ensembl_ucsc_splicing_annotations = importEnsExonStructureData(species,ensembl_gene_coordinates,ensembl_annotations,exon_annotation_db)
def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    """Align UCSC mRNA sequences to probeset sequences and export the results.

    Reads the FASTA file AltDatabase/SPECIES/SequenceData/mrna.fa. For every
    record whose accession is a key of the mapping returned by
    importUCSCTranscriptAssociations(species), each associated gene that is
    also a key of probeset_seq_db is passed to simpleSeqMatchProtocol, and two
    tab-delimited files are written under AltDatabase/SPECIES/SequenceData/output/:
      ARRAYTYPE_UCSC-mRNA_alignments.txt           probeset, call, accession
      sequences/ARRAYTYPE_UCSC_mRNA_seqmatches.txt accession, upper-cased sequence

    Arguments:
      species -- key into UI.getSpeciesInfo() and path component of the
        AltDatabase directories.
      array_type -- prefix of the two output file names.
      probeset_seq_db -- mapping whose keys are gene IDs and whose values are
        handed unchanged to simpleSeqMatchProtocol.

    Returns None.

    Reads the module-level name `force` (it is not a parameter): when it
    equals 'yes', mrna.fa.gz is first downloaded from the UCSC FTP site. A
    failed download is ignored and the existing local file is used.

    Single-argument print(...) is used throughout; it prints the same text
    whether print is a statement or a function.
    """
    start_time = time.time()

    if force == 'yes':
        ### Download mRNA sequence file from website
        import UI
        species_names = UI.getSpeciesInfo()
        species_full = species_names[species].replace(' ', '_')
        ucsc_mRNA_dir = update.getFTPData(
            'hgdownload.cse.ucsc.edu',
            '/goldenPath/currentGenomes/' + species_full + '/bigZips',
            'mrna.fa.gz')
        output_dir = 'AltDatabase/' + species + '/SequenceData/'
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir, '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath)
                except OSError:
                    pass  ### Best effort: keep going if the archive cannot be deleted
        except Exception:
            pass  ### Occurs when file is not available for this species

    filename = 'AltDatabase/' + species + '/SequenceData/mrna.fa'
    output_root = 'AltDatabase/' + species + '/SequenceData/output/'
    dataw = export.ExportFile(output_root + array_type + '_UCSC-mRNA_alignments.txt')
    datar = export.ExportFile(output_root + 'sequences/' + array_type + '_UCSC_mRNA_seqmatches.txt')

    try:
        ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

        def exportRecord(accession, sequence_parts):
            ### Write the alignment rows (and the sequence row) for one FASTA record.
            ### Records with no header, no sequence or no gene association are skipped.
            if accession is None or not sequence_parts:
                return
            if accession not in ucsc_mrna_to_gene:
                return
            mRNA_seq = ''.join(sequence_parts).upper()
            gene_found = False
            try:
                for ens_gene in ucsc_mrna_to_gene[accession]:
                    if ens_gene in probeset_seq_db:
                        gene_found = True
                        results = simpleSeqMatchProtocol(probeset_seq_db[ens_gene], mRNA_seq)
                        for (call, probeset) in results:
                            dataw.write('\t'.join([probeset, str(call), accession]) + '\n')
            except IndexError:
                ### The previous code swallowed IndexError here silently and then
                ### merged the next record into this one; report it and move on.
                print('IndexError while matching UCSC mRNA ' + accession + '; record skipped')
                return
            if gene_found:
                datar.write('\t'.join([accession, mRNA_seq]) + '\n')

        print('Begining generic fasta import of ' + filename)
        #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
        #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
        fasta_file = open(filepath(filename), 'rU')
        try:
            accession = None
            sequence_parts = []  ### Joined once per record instead of repeated string concatenation
            for line in fasta_file:
                ### rstrip also accepts a final line that has no trailing newline
                data = line.rstrip('\n')
                if not data or data[0] == '#':
                    continue
                if data[0] == '>':
                    ### A new header ends the previous record
                    exportRecord(accession, sequence_parts)
                    accession = data.split(' ')[0][1:]
                    sequence_parts = []
                else:
                    sequence_parts.append(data)
            ### The last record has no following header, so flush it explicitly
            exportRecord(accession, sequence_parts)
        finally:
            fasta_file.close()
    finally:
        dataw.close()
        datar.close()

    time_diff = int(time.time() - start_time)
    print("UCSC mRNA sequences analyzed in %d seconds" % time_diff)
def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    """Align UCSC mRNA sequences to probeset sequences and export the results.

    Reads the FASTA file AltDatabase/SPECIES/SequenceData/mrna.fa. For every
    record whose accession is a key of the mapping returned by
    importUCSCTranscriptAssociations(species), each associated gene that is
    also a key of probeset_seq_db is passed to simpleSeqMatchProtocol, and two
    tab-delimited files are written under AltDatabase/SPECIES/SequenceData/output/:
      ARRAYTYPE_UCSC-mRNA_alignments.txt           probeset, call, accession
      sequences/ARRAYTYPE_UCSC_mRNA_seqmatches.txt accession, upper-cased sequence

    Arguments:
      species -- key into UI.getSpeciesInfo() and path component of the
        AltDatabase directories.
      array_type -- prefix of the two output file names.
      probeset_seq_db -- mapping whose keys are gene IDs and whose values are
        handed unchanged to simpleSeqMatchProtocol.

    Returns None.

    Reads the module-level name `force` (it is not a parameter): when it
    equals "yes", mrna.fa.gz is first downloaded from the UCSC FTP site. A
    failed download is ignored and the existing local file is used.

    Single-argument print(...) is used throughout; it prints the same text
    whether print is a statement or a function.
    """
    start_time = time.time()

    if force == "yes":
        ### Download mRNA sequence file from website
        import UI

        species_names = UI.getSpeciesInfo()
        species_full = species_names[species].replace(" ", "_")
        ucsc_mRNA_dir = update.getFTPData(
            "hgdownload.cse.ucsc.edu",
            "/goldenPath/currentGenomes/" + species_full + "/bigZips",
            "mrna.fa.gz",
        )
        output_dir = "AltDatabase/" + species + "/SequenceData/"
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir, "")
            if status == "not-removed":
                try:
                    os.remove(gz_filepath)
                except OSError:
                    pass  ### Best effort: keep going if the archive cannot be deleted
        except Exception:
            pass  ### Occurs when file is not available for this species

    filename = "AltDatabase/" + species + "/SequenceData/mrna.fa"
    output_root = "AltDatabase/" + species + "/SequenceData/output/"
    dataw = export.ExportFile(output_root + array_type + "_UCSC-mRNA_alignments.txt")
    datar = export.ExportFile(output_root + "sequences/" + array_type + "_UCSC_mRNA_seqmatches.txt")

    try:
        ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

        def exportRecord(accession, sequence_parts):
            ### Write the alignment rows (and the sequence row) for one FASTA record.
            ### Records with no header, no sequence or no gene association are skipped.
            if accession is None or not sequence_parts:
                return
            if accession not in ucsc_mrna_to_gene:
                return
            mRNA_seq = "".join(sequence_parts).upper()
            gene_found = False
            try:
                for ens_gene in ucsc_mrna_to_gene[accession]:
                    if ens_gene in probeset_seq_db:
                        gene_found = True
                        results = simpleSeqMatchProtocol(probeset_seq_db[ens_gene], mRNA_seq)
                        for (call, probeset) in results:
                            dataw.write("\t".join([probeset, str(call), accession]) + "\n")
            except IndexError:
                ### The previous code swallowed IndexError here silently and then
                ### merged the next record into this one; report it and move on.
                print("IndexError while matching UCSC mRNA " + accession + "; record skipped")
                return
            if gene_found:
                datar.write("\t".join([accession, mRNA_seq]) + "\n")

        print("Begining generic fasta import of " + filename)
        #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
        #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
        fasta_file = open(filepath(filename), "rU")
        try:
            accession = None
            sequence_parts = []  ### Joined once per record instead of repeated string concatenation
            for line in fasta_file:
                ### rstrip also accepts a final line that has no trailing newline
                data = line.rstrip("\n")
                if not data or data[0] == "#":
                    continue
                if data[0] == ">":
                    ### A new header ends the previous record
                    exportRecord(accession, sequence_parts)
                    accession = data.split(" ")[0][1:]
                    sequence_parts = []
                else:
                    sequence_parts.append(data)
            ### The last record has no following header, so flush it explicitly
            exportRecord(accession, sequence_parts)
        finally:
            fasta_file.close()
    finally:
        dataw.close()
        datar.close()

    time_diff = int(time.time() - start_time)
    print("UCSC mRNA sequences analyzed in %d seconds" % time_diff)
if __name__ == '__main__':
    ### Developer entry point: checks that the polyA database file for one
    ### species is available locally (verifyFile downloads it otherwise), then exits.
    species = 'Hs'
    #species_full = 'Drosophila_melanogaster'
    filename = 'AltDatabase/ucsc/%s/polyaDb.txt' % species
    ### Makes sure file is local and if not downloads.
    verifyFile(filename, species)
    sys.exit()

    ### NOTE: nothing below this point runs, because the sys.exit() above ends
    ### the script. The statements also use names that are not assigned in this
    ### block (species_full, ensembl_gene_coordinates, ensembl_annotations,
    ### exon_annotation_db).
    importEnsExonStructureData(species, [], [], [])
    sys.exit()
    reformatPolyAdenylationCoordinates(species, 'no')
    sys.exit()
    #test = 'yes'
    #test_gene = ['ENSG00000140153','ENSG00000075413']
    import UCSCImport
    import update
    knownAlt_dir = update.getFTPData(
        'hgdownload.cse.ucsc.edu',
        '/goldenPath/currentGenomes/' + species_full + '/database',
        'knownAlt.txt.gz')
    polyA_dir = update.getFTPData(
        'hgdownload.cse.ucsc.edu',
        '/goldenPath/currentGenomes/' + species_full + '/database',
        'polyaDb.txt.gz')
    output_dir = 'AltDatabase/ucsc/%s/' % species
    UCSCImport.downloadFiles(knownAlt_dir, output_dir)
    UCSCImport.downloadFiles(polyA_dir, output_dir)
    sys.exit()
    ensembl_ucsc_splicing_annotations = importEnsExonStructureData(
        species, ensembl_gene_coordinates, ensembl_annotations, exon_annotation_db)