def justConvertFilenames(species, outputdir):
    """Rename plot images in ``outputdir`` so they start with a gene symbol.

    Every directory entry whose name contains ".pdf" or ".png" is examined.
    The text before the first "__" (after the ".pdf"/".png" substrings have
    been removed) is looked up in the Ensembl-Symbol table loaded for
    ``species``.  On a hit the file is renamed to
    ``<symbol>-SashimiPlot_<second piece of the name>``; when the name holds
    no "__", "\\" or "/" the whole original name is used as the second part.
    Failures of the rename itself are ignored.
    """
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species, ("hide", "Ensembl-Symbol"))
    import OBO_import
    # The reverse table is built as in the original code but is not consulted below.
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    for filename in os.listdir(outputdir):
        if ".pdf" not in filename and ".png" not in filename:
            continue
        stem = filename.replace(".pdf", "").replace(".png", "")
        gene_id = stem.split("__")[0]
        if gene_id not in gene_to_symbol:
            continue

        # Default to the full name; otherwise keep the piece that follows the
        # first separator found (checked in this fixed order).
        suffix = str(filename)
        for separator in ("__", "\\", "/"):
            if separator in filename:
                suffix = filename.split(separator)[1]
                break

        renamed = gene_to_symbol[gene_id][0] + "-SashimiPlot_" + suffix
        try:
            os.rename(os.path.join(outputdir, filename),
                      os.path.join(outputdir, renamed))
        except Exception:
            pass
def sashmi_plot_list(bamdir,eventsToVisualizeFilename,PSIFilename,events=None): import gene_associations gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol')) import OBO_import; symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) if events==None: splicing_events,expandedSearch = importSplicingEventsToVisualize(eventsToVisualizeFilename) else: ### Replace any ":" from the input events #for i in range(len(events)): events[i] = string.replace(events[i],':','__') expandedSearch = True for i in range(len(events)): gene = string.split(events[i],'__')[0] if gene in gene_to_symbol: symbol = gene_to_symbol[gene][0] elif 'ENS' not in gene or 'G0000' in gene: if gene in symbol_to_gene: ensID = symbol_to_gene[gene][0] symbol = gene events[i] = ensID ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup splicing_events = events ### optionally get from supplied variable if len(splicing_events)==0: print eventsToVisualizeFilename forceNoCompatibleEventsInFile print 'Exporting plots', ### Determine Groups for Coloring groups_file = 'None' dir_list = unique.read_directory(root_dir+'/ExpressionInput') for file in dir_list: if 'groups.' 
in file: groups_file = root_dir+'/ExpressionInput/'+file if groups_file != None: try: import ExpressionBuilder sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file) groups=[] for sample in sample_group_db: if sample_group_db[sample] not in groups: groups.append(sample_group_db[sample]) ### create an ordered list of unique group except Exception: groups = ['None'] #print traceback.format_exc() pass processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, splicing_events, sample_group_db, groups, False) mopup_events = getMopUpEvents(splicing_events, processed_events) ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide #print len(splicing_events),len(processed_events),len(mopup_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot(steady_state_exp_file,bamdir,mopup_events,sample_group_db,groups,expandedSearch) if len(processed_events)>0: mopup_events = getMopUpEvents(mopup_events, processed_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, mopup_events, sample_group_db, groups, True) return gene_to_symbol
def translateToEntrezGene(species,filename):
    """Write an EntrezGene-keyed copy of an Ensembl gene-set relationship file.

    species -- species code handed to gene_associations.getGeneToUid.
    filename -- path of the Ensembl-keyed file; the output path is the same
        string with 'Ensembl' replaced by 'EntrezGene'.

    Nothing is written when the Ensembl-EntrezGene table cannot be loaded or
    is empty.  The first input line is treated as a header and skipped.
    Two-column rows are written as ``entrez<TAB>set``; three-column rows are
    written as ``entrez<TAB>En<TAB>set`` until the first two-column row has
    been seen (the row-kind flag is sticky, as before).  Rows whose Ensembl ID
    has no EntrezGene mapping are dropped.

    Changes relative to the previous revision: both file handles are now
    closed even when an error occurs, the builtin ``type`` is no longer
    shadowed, and only a missing ID is skipped silently (write errors used to
    be swallowed by a broad ``except``).

    NOTE(review): three-column rows carry the system code 'En' although the
    IDs are EntrezGene; the sibling exporter in this project writes 'L' for
    EntrezGene rows - confirm which one readers expect.
    """
    try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
    except Exception: ens_to_entrez = {}
    if len(ens_to_entrez) == 0:
        return

    export_file = string.replace(filename,'Ensembl','EntrezGene')
    export_data = export.ExportFile(export_file)
    try:
        export_data.write('EntrezGene\tOntologyID\n')
        source_file = open(filepath(filename),'rU')
        try:
            record_type = 'pathway'
            header_skipped = False
            for line in source_file:
                if not header_skipped:
                    header_skipped = True
                    continue
                fields = string.split(cleanUpLine(line),'\t')
                if len(fields) == 2:
                    ensembl,pathway = fields
                    record_type = 'ontology'
                else:
                    ### Any other column count than three raises ValueError, as before
                    ensembl,null,pathway = fields
                if ensembl not in ens_to_entrez:
                    continue
                for entrez in ens_to_entrez[ensembl]:
                    if record_type == 'ontology':
                        export_data.write(entrez+'\t'+pathway+'\n')
                    else:
                        export_data.write(entrez+'\tEn\t'+pathway+'\n')
        finally:
            source_file.close()
    finally:
        export_data.close()
def justConvertFilenames(species, outputdir):
    """Give SashimiPlot image files in ``outputdir`` a gene-symbol prefix.

    Only names containing '.pdf' or '.png' are considered.  With those two
    substrings stripped, the part of the name before the first '__' must be a
    key of the Ensembl-Symbol table for ``species``; such files are renamed to
    ``<first symbol>-SashimiPlot_<piece after the first separator>``.  Errors
    raised by the rename are swallowed.
    """
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
    from import_scripts import OBO_import
    # Kept for parity with the original code; the result is not used here.
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    image_names = [entry for entry in os.listdir(outputdir)
                   if '.pdf' in entry or '.png' in entry]
    for image_name in image_names:
        bare_name = string.replace(string.replace(image_name, '.pdf', ''), '.png', '')
        ensembl_id = string.split(bare_name, '__')[0]
        if ensembl_id in gene_to_symbol:
            tail = str(image_name)
            if '__' in image_name:
                tail = string.split(image_name, '__')[1]
            elif '\\' in image_name:
                tail = string.split(image_name, '\\')[1]
            elif '/' in image_name:
                tail = string.split(image_name, '/')[1]
            target_name = gene_to_symbol[ensembl_id][0] + '-SashimiPlot_' + tail
            source_path = os.path.join(outputdir, image_name)
            target_path = os.path.join(outputdir, target_name)
            try:
                os.rename(source_path, target_path)
            except Exception:
                pass
def importGeneSymbols(species):
    """Load the Ensembl-Symbol table for ``species`` in both directions.

    Returns a 2-tuple: the dictionary produced by
    gene_associations.getGeneToUid and the result of passing that dictionary
    to OBO_import.swapKeyValues.
    """
    import gene_associations
    ensembl_to_symbol = gene_associations.getGeneToUid(
        species, ('hide', 'Ensembl-Symbol'))
    from import_scripts import OBO_import
    return ensembl_to_symbol, OBO_import.swapKeyValues(ensembl_to_symbol)
def importDataSimple(filename,input_type,MOD=None,Species=None):
    """Read a tab-delimited ID/value file into a dictionary.

    filename -- input path (resolved through filepath()).
    input_type -- 'LineageProfiler' stores every row as
        ``id -> list of floats`` built from all columns after the first; any
        other value expects ``id, system code, value`` rows and stores one
        float (or 'NA') per gene.
    MOD, Species -- used only when the module globals ``mod`` and
        ``species_code`` are not defined.

    Leading lines starting with '#' are skipped; the next line is the header.
    For non-LineageProfiler input the system code of the first data row
    decides whether IDs are stored as given (source system equals the MOD) or
    translated through the MOD-to-source relationship table, in which case
    only the first associated MOD ID of each row is stored.  Values '+' and
    '-' are read as 1 and -1.

    Returns ``(id_db, column_headers)``; column_headers is the third header
    field (or 'NA') for non-LineageProfiler input, else all header fields
    after the first.
    """
    id_db = {}
    fn = filepath(filename)
    uses_id_conversion = input_type != 'LineageProfiler'
    rows_read = 0 ### 0 until the header has been consumed, then 1 + number of data rows
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        fields = string.split(data,'\t')
        starts_with_hash = data[0] == '#' ### evaluated for every line, as before

        if rows_read == 0:
            if starts_with_hash:
                continue
            column_headers = fields[1:]
            if uses_id_conversion:
                try: column_headers = fields[2] ### exclude the ID, system code and p-value column headers
                except Exception: column_headers = 'NA'
            rows_read = 1
            continue

        if not uses_id_conversion:
            id_db[fields[0]] = map(float,fields[1:]) ### Applies to LineageProfiler
            rows_read += 1
            continue

        if rows_read == 1: ### get system conversions
            system_code = fields[1]
            import GO_Elite
            import OBO_import
            system_codes,source_types,mod_types = GO_Elite.getSourceData()
            source_data = system_codes[system_code]
            try:
                Mod = mod ### global established in upstream functions
                speciescode = species_code
            except Exception:
                Mod = MOD
                speciescode = Species
            source_is_mod = (source_data == Mod)
            if not source_is_mod:
                mod_source = Mod+'-'+source_data+'.txt'
                gene_to_source_id = gene_associations.getGeneToUid(speciescode,('hide',mod_source))
                source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)

        if source_is_mod:
            try: id_db[fields[0]] = float(fields[2])
            except Exception: id_db[fields[0]] = 'NA'
        elif fields[0] in source_to_gene:
            for mod_id in source_to_gene[fields[0]]:
                try: value = fields[2]
                except Exception: value = 'NA'
                if value == '+': value = 1
                elif value == '-': value = -1
                try: id_db[mod_id] = float(value)
                except Exception: id_db[mod_id] = 'NA'
                break ### only the first associated MOD ID is recorded
        rows_read += 1
    #print len(id_db),column_headers
    return id_db,column_headers
def exportSymbolRelationships(pathway_to_symbol,selected_species,pathway_type,type):
    """Export gene-set memberships keyed by Ensembl and EntrezGene IDs.

    pathway_to_symbol -- mapping of gene-set name to an iterable of symbols
        (looked up in a table passed through lowerSymbolDB).
    selected_species -- iterable of species codes, or None to use every
        entry of the database directory.
    pathway_type -- used in the output file names
        (``Ensembl-<pathway_type>.txt`` / ``EntrezGene-<pathway_type>.txt``).
    type -- relationship folder suffix (``gene-<type>``); when it contains
        'mapp' a system-code column ('En' / 'L') is written as well.

    Entries whose name contains '.' are skipped.  The EntrezGene file is only
    created when an Ensembl-EntrezGene table is available for the species.
    Symbols that cannot be resolved are skipped silently.
    """
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs = selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)

    for species in current_species_dirs:
        if '.' in species:
            continue
        relationship_dir = database_dir+'/'+species+'/gene-'+type
        ens_data = export.ExportFile(relationship_dir+'/Ensembl-'+pathway_type+'.txt')

        includes_system = 'mapp' in type
        if includes_system:
            header = 'GeneID\tSystem\tGeneSet\n'
            ens_separator = '\tEn\t'
            entrez_separator = '\tL\t'
        else:
            header = 'GeneID\tGeneSet\n'
            ens_separator = '\t'
            entrez_separator = '\t'
        ens_data.write(header)

        try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
        except Exception: ens_to_entrez = {}
        if len(ens_to_entrez)>0:
            entrez_data = export.ExportFile(relationship_dir+'/EntrezGene-'+pathway_type+'.txt')
            entrez_data.write(header)
        #print 'Exporting '+pathway_type+' databases for:',species

        try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        except Exception: gene_to_source_id = {}
        source_to_gene = lowerSymbolDB(OBO_import.swapKeyValues(gene_to_source_id))

        for pathway in pathway_to_symbol:
            for symbol in pathway_to_symbol[pathway]:
                try:
                    for gene in source_to_gene[symbol]:
                        ens_data.write(gene+ens_separator+pathway+'\n')
                        if gene in ens_to_entrez:
                            for entrez in ens_to_entrez[gene]:
                                entrez_data.write(entrez+entrez_separator+pathway+'\n')
                except Exception:
                    pass ### unknown symbol (or any other error for this symbol) - skip it

        ens_data.close()
        ### entrez_data may be unbound (or left over from an earlier species); ignore either case
        try: entrez_data.close()
        except Exception: pass
def importDataSimple(filename, input_type):
    """Read a tab-delimited ID/value file into a dictionary.

    For ``input_type == 'LineageProfiler'`` each data row is stored as
    ``id -> list of floats`` (all columns after the first).  For any other
    input type rows are ``id, system code, value``: the system code of the
    first data row selects either direct storage (source system equals the
    module global ``mod``) or translation through the ``mod``-to-source
    relationship table for the module global ``species_code``; in the
    translated case every associated MOD ID receives the row's value.

    Returns ``(id_db, column_headers)``.

    NOTE(review): a line starting with '#' resets the row counter wherever it
    occurs, so the line after it is read as a header again - confirm that
    comment lines can only appear at the top of these files.
    """
    id_db = {}
    fn = filepath(filename)
    wants_conversion = input_type != 'LineageProfiler'
    row_counter = 0  ### 0 -> next non-comment line is the header
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        columns = string.split(data, '\t')
        if data[0] == '#':
            row_counter = 0
            continue
        if row_counter == 0:
            if wants_conversion:
                column_headers = columns[2]  ### exclude the ID, system code and p-value column headers
            else:
                column_headers = columns[1:]
            row_counter = 1
            continue

        if wants_conversion:
            if row_counter == 1:  ### get system conversions
                system_code = columns[1]
                import GO_Elite
                import OBO_import
                system_codes, source_types, mod_types = GO_Elite.getSourceData()
                source_data = system_codes[system_code]
                source_is_mod = (source_data == mod)
                if not source_is_mod:
                    mod_source = mod + '-' + source_data + '.txt'
                    gene_to_source_id = gene_associations.getGeneToUid(
                        species_code, ('hide', mod_source))
                    source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            if source_is_mod:
                id_db[columns[0]] = float(columns[2])
            elif columns[0] in source_to_gene:
                for mod_id in source_to_gene[columns[0]]:
                    ### If multiple Ensembl IDs in dataset, only record the last associated fold change
                    id_db[mod_id] = float(columns[2])
        else:
            id_db[columns[0]] = map(float, columns[1:])  ### Applies to LineageProfiler
        row_counter += 1
    return id_db, column_headers
def justConvertFilenames(species,outputdir):
    """Rename '<EnsemblID>__...' plot files in ``outputdir`` by gene symbol.

    A file qualifies when its name contains '.pdf' or '.png' and the text
    before the first '__' (with '.pdf'/'.png' removed) is a key of the
    Ensembl-Symbol table loaded for ``species``.  The new name is the first
    symbol for that key, the literal '-SashimiPlot_', and the second piece of
    the old name when split on '__' (or on '\\' / '/'; the unchanged name when
    none of these occurs).  A failing rename is ignored.
    """
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
    import OBO_import
    ### Reverse lookup is still built, although nothing below reads it
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    for filename in os.listdir(outputdir):
        is_plot = ('.pdf' in filename) or ('.png' in filename)
        if not is_plot:
            continue
        lookup_key = filename.replace('.pdf','').replace('.png','').split('__')[0]
        if lookup_key in gene_to_symbol:
            remainder = str(filename)
            pieces = None
            if '__' in filename: pieces = filename.split('__')
            elif '\\' in filename: pieces = filename.split('\\')
            elif '/' in filename: pieces = filename.split('/')
            if pieces is not None:
                remainder = pieces[1]
            symbol_name = gene_to_symbol[lookup_key][0]+'-SashimiPlot_'+remainder
            try:
                os.rename(os.path.join(outputdir,filename),os.path.join(outputdir,symbol_name))
            except Exception:
                pass
def CreateFilesMonocle(filename, rawExpressionFile, species='Hs'):
    """Write the sample, gene and data tables used as Monocle input.

    filename -- clustered expression file.  Lines starting with '#' are
        skipped, the first remaining line supplies the sample headers and
        every later line contributes a gene ID (first column, text before
        the first space).
    rawExpressionFile -- expression file whose rows are copied verbatim to
        ``data.txt`` when their ID, or the first symbol mapped to that ID, is
        among the gene IDs collected from ``filename``.
    species -- species code for the Ensembl-Symbol lookup; if the lookup
        fails no symbol translation is attempted.

    Three files are written next to ``filename``: ``sample.txt`` (one row per
    sample, 'name<TAB>group' order reversed from the 'group:name' header,
    prefixed with 'X' when it starts with a digit), ``gene.txt`` (IDs of the
    exported rows) and ``data.txt`` (header with ':' replaced by '-', then
    the matching rows).  The working directory is changed to the parent
    directory of ``filename``.

    Changes relative to the previous revision: gene membership is tested
    against a set instead of scanning a list (and calling ``list.index``) for
    every expression row, the clustered file is no longer accumulated in
    memory, all file handles are closed even on error, and a clustered file
    without a header row yields an empty sample table instead of a NameError.
    """
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(
            species, ('hide', 'Ensembl-Symbol'))
    except Exception:
        gene_to_symbol = {}

    #Create the files for Monocle
    parent_dir = findParentDir(filename)
    setWorkingDirectory(parent_dir[:-1])
    try:
        os.mkdir(parent_dir[:-1])
    except Exception:
        pass  # best effort, as before (the directory normally exists already)

    data_name = parent_dir + '/data.txt'
    gene_name = parent_dir + '/gene.txt'
    sample_name = parent_dir + '/sample.txt'

    export_cdt = open(sample_name, 'w')
    try:
        export_gene = open(gene_name, 'w')
        try:
            gene_ids = set()
            array_names = []
            header_seen = False
            clustered = open(filename, 'rU')
            try:
                for line in clustered:
                    data = cleanUpLine(line)
                    if data[0] == '#':
                        continue
                    fields = data.split('\t')
                    if header_seen:
                        gene_ids.add(fields[0].split(" ")[0])
                        continue
                    for entry in fields[1:]:
                        if '::' in entry:
                            parts = entry.split("::")
                        else:
                            parts = entry.split(":")
                        ent = '\t'.join(reversed(parts))
                        if ent[0].isdigit():
                            ent = 'X' + ent
                        array_names.append(ent)
                    header_seen = True
            finally:
                clustered.close()

            export_cdt.write('\t'.join(['', 'Group']) + '\n')
            for row in array_names:
                export_cdt.write(row + '\n')
            export_gene.write('\t'.join(['', 'gene_short_name']) + '\n')

            export_object = open(data_name, 'w')
            try:
                raw_expression = open(rawExpressionFile, 'rU')
                try:
                    first_row = True
                    for line in raw_expression:
                        data = cleanUpLine(line)
                        t = data.split('\t')
                        uid = t[0]
                        if first_row:
                            new_headers = [h.replace(':', '-') for h in t[1:]]
                            export_object.write(
                                '\t'.join(['UID'] + new_headers) + '\n')
                            first_row = False
                            continue
                        matched = uid in gene_ids
                        if not matched and uid in gene_to_symbol:
                            # Fall back to the first symbol for this ID
                            matched = gene_to_symbol[uid][0] in gene_ids
                        if matched:
                            export_object.write(line)
                            export_gene.write(uid + '\n')
                finally:
                    raw_expression.close()
            finally:
                export_object.close()
        finally:
            export_gene.close()
    finally:
        export_cdt.close()
def sashmi_plot_list(bamdir, eventsToVisualizeFilename, PSIFilename, events=None): try: import gene_associations gene_to_symbol = gene_associations.getGeneToUid( species, ('hide', 'Ensembl-Symbol')) from import_scripts import OBO_import symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) except Exception: symbol_to_gene = {} if events == None: splicing_events, expandedSearch = importSplicingEventsToVisualize( eventsToVisualizeFilename) else: ### Replace any ":" from the input events #for i in range(len(events)): events[i] = string.replace(events[i],':','__') expandedSearch = True for i in range(len(events)): gene = string.split(events[i], '__')[0] if gene in gene_to_symbol: symbol = gene_to_symbol[gene][0] elif 'ENS' not in gene or 'G0000' in gene: if gene in symbol_to_gene: ensID = symbol_to_gene[gene][0] symbol = gene events[ i] = ensID ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup splicing_events = events ### optionally get from supplied variable if len(splicing_events) == 0: print eventsToVisualizeFilename forceNoCompatibleEventsInFile print 'Exporting plots', ### Determine Groups for Coloring groups_file = 'None' dir_list = unique.read_directory(root_dir + '/ExpressionInput') for file in dir_list: if 'groups.' 
in file: groups_file = root_dir + '/ExpressionInput/' + file if groups_file != None: try: import ExpressionBuilder sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file) groups = [] for sample in sample_group_db: if sample_group_db[sample] not in groups: groups.append(sample_group_db[sample] ) ### create an ordered list of unique group except Exception: groups = ['None'] #print traceback.format_exc() pass processed_events = formatAndSubmitSplicingEventsToSashimiPlot( PSIFilename, bamdir, splicing_events, sample_group_db, groups, False) mopup_events = getMopUpEvents(splicing_events, processed_events) ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide #print len(splicing_events),len(processed_events),len(mopup_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot( steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups, expandedSearch) if len(processed_events) > 0: mopup_events = getMopUpEvents(mopup_events, processed_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot( PSIFilename, bamdir, mopup_events, sample_group_db, groups, True) return gene_to_symbol
def importDataSimple(filename, input_type, MOD=None, Species=None):
    """Load IDs and their values from a tab-delimited file.

    filename -- input path, resolved with filepath().
    input_type -- with 'LineageProfiler' each row becomes
        ``id -> list of floats`` (columns after the first).  Otherwise rows
        are ``id, system code, value`` and one float, or 'NA' when the value
        is missing or not numeric, is stored per ID.
    MOD, Species -- fallbacks for the module globals ``mod`` and
        ``species_code`` when those are undefined.

    '#' lines before the header are ignored.  The first data row's system
    code determines whether IDs are kept (its source system equals the MOD)
    or mapped to MOD IDs; when mapped, only the first MOD ID associated with
    a row's ID is stored and '+' / '-' values become 1 / -1.

    Returns the tuple ``(id_db, column_headers)``.
    """
    id_db = {}
    fn = filepath(filename)
    lineage_input = (input_type == 'LineageProfiler')
    state = 0  ### 0: header pending; 1: first data row pending; >1: later rows
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        cells = string.split(data, '\t')
        hash_line = (data[0] == '#')  ### evaluated for every line, as before
        if state == 0 and hash_line:
            pass  ### comment ahead of the header
        elif state == 0:
            if lineage_input:
                column_headers = cells[1:]
            else:
                column_headers = cells[1:]
                try:
                    column_headers = cells[2]  ### exclude the ID, system code and p-value column headers
                except Exception:
                    column_headers = 'NA'
            state = 1
        elif lineage_input:
            id_db[cells[0]] = map(float, cells[1:])  ### Applies to LineageProfiler
            state += 1
        else:
            if state == 1:  ### get system conversions
                system_code = cells[1]
                import GO_Elite
                import OBO_import
                system_codes, source_types, mod_types = GO_Elite.getSourceData()
                source_data = system_codes[system_code]
                try:
                    Mod = mod  ### global established in upstream functions
                    speciescode = species_code
                except Exception:
                    Mod = MOD
                    speciescode = Species
                if source_data == Mod:
                    source_is_mod = True
                else:
                    source_is_mod = False
                    relationship_file = Mod + '-' + source_data + '.txt'
                    gene_to_source_id = gene_associations.getGeneToUid(
                        speciescode, ('hide', relationship_file))
                    source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)

            if source_is_mod:
                try:
                    id_db[cells[0]] = float(cells[2])
                except Exception:
                    id_db[cells[0]] = 'NA'
            elif cells[0] in source_to_gene:
                for mod_id in source_to_gene[cells[0]]:
                    try:
                        raw_value = cells[2]
                    except Exception:
                        raw_value = 'NA'
                    if raw_value == '+':
                        raw_value = 1
                    elif raw_value == '-':
                        raw_value = -1
                    try:
                        id_db[mod_id] = float(raw_value)
                    except Exception:
                        id_db[mod_id] = 'NA'
                    break  ### stop after the first associated MOD ID
            state += 1
    #print len(id_db),column_headers
    return id_db, column_headers
def generateMAPPFinderScores(species_title, species_id, source, mod_db,
                             system_Codes, permute, resources_to_analyze,
                             file_dirs, parent_root):
    """Run over-representation analysis (ORA) for every input ID list.

    species_title, species_id -- species name and code; stored in the
        globals ``species_name`` and ``species_code``.
    source -- default source ID system; replaced by the system detected in
        an input file when one is reported.
    mod_db -- primary gene ID system (MOD), stored in the global ``mod``.
        The global is switched between 'EntrezGene' and 'Ensembl' when the
        requested MOD has no relationship file for the source system.
    system_Codes -- stored in the global ``system_codes`` and passed to the
        ID importers.
    permute -- number of permutations, or the string "FisherExactTest" to
        set ``use_FET`` to 'yes' and the permutation count to 0.
    resources_to_analyze -- a resource type name or 'all'
        ('Gene Ontology' is normalised to 'GeneOntology').
    file_dirs -- 4-sequence unpacked as (criterion_input_folder,
        criterion_denom_folder, output_dir, custom_sets_folder).
    parent_root -- stored in the global ``root``.

    Returns ``(ontology_to_mod_genes, mapp_to_mod_genes)`` as left by the
    last successful ontology / gene-set analysis.

    Most working state is published through module globals (declared below)
    that the helper functions called from here presumably read - verify
    against those helpers before changing any of the assignments.
    """
    global mappfinder_output_dir
    global custom_sets_folder
    global root
    root = parent_root
    global mapp_to_mod_genes
    global ontology_to_mod_genes
    global system_codes
    system_codes = system_Codes
    criterion_input_folder, criterion_denom_folder, output_dir, custom_sets_folder = file_dirs
    previous_denominator_file_dir = ''
    ontology_to_mod_genes = {}
    mapp_to_mod_genes = {}
    global test
    test = 'no'
    program_type, database_dir = unique.whatProgramIsThis()
    if resources_to_analyze == 'Gene Ontology':
        resources_to_analyze = 'GeneOntology'
    if len(output_dir) == 0:
        mappfinder_output_dir = 'input/MAPPFinder'
    else:
        mappfinder_output_dir = output_dir + '/GO-Elite_results/CompleteResults/ORA'
    global source_data
    source_data = source
    global mod
    mod = mod_db
    global species_code
    species_code = species_id
    global species_name
    species_name = species_title
    global gene_to_mapp
    global permutations
    permutations = permute
    global eliminate_redundant_genes
    eliminate_redundant_genes = 'yes'
    global permuted_z_scores
    global ontology_annotations
    global original_ontology_z_score_data
    global original_mapp_z_score_data
    global input_gene_list
    global denominator_gene_list
    global gene_file
    global denom_file_status
    global input_count
    global denom_count
    global gene_annotations
    global source_to_gene
    global use_FET
    if permutations == "FisherExactTest":
        use_FET = 'yes'  ### Use Fisher's Exact test instead of permutation-based p-values
        permutations = 0
    else:
        use_FET = 'no'
    start_time = time.time()
    gene_annotations = gene_associations.importGeneData(species_code, mod)
    ### NOTE(review): OBO_date is assigned but not read again in this function
    OBO_date = importVersionData('OBO/')
    if len(criterion_input_folder) == 0:
        import_dir = '/input/GenesToQuery/' + species_code
        import_dir_alt = import_dir[1:]  ### same path without the leading '/'
    else:
        import_dir = criterion_input_folder
        import_dir_alt = criterion_input_folder
    m = GrabFiles()
    m.setdirectory(import_dir)
    try:
        dir_list = readDirText(import_dir)  #send a sub_directory to a function to identify all files in a directory
    except Exception:
        print_out = 'Warning! Input directory location is not a valid folder. Exiting GO-Elite.'
        ForceCriticalError(print_out)
    try:
        denom_dir_list = readDirText(criterion_denom_folder)
    except Exception:
        print_out = 'Warning! Denominator directory location is not a valid folder. Exiting GO-Elite.'
        ForceCriticalError(print_out)
    if len(dir_list) == 0:
        error_message = 'No files with the extension ".txt" found in the input directory.'
        ForceCriticalError(error_message)
    if len(denom_dir_list) == 0:
        error_message = 'No files with the extension ".txt" found in the denominator directory.'
        ForceCriticalError(error_message)
    inputs_analyzed = 0
    for mappfinder_input in dir_list:  #loop through each file in the directory
        ### Per-input result stores are reset for every file
        permuted_z_scores = {}
        original_ontology_z_score_data = {}
        original_mapp_z_score_data = {}
        print 'Performing over-representation analysis (ORA) on', mappfinder_input
        gene_file_dir, gene_file = m.searchdirectory(mappfinder_input)
        ###Import Input gene/source-id lists
        input_gene_list, source_data_input, error_message = gene_associations.importUIDsForMAPPFinderQuery(
            import_dir_alt + '/' + gene_file, system_codes, 'no')
        input_count = len(input_gene_list)
        if 'WARNING!!!' in error_message:  ### Warn the user about SwissProt issues when importing the denominator
            ForceCriticalError(error_message)
        if len(criterion_denom_folder) == 0:
            denom_folder = '/input/GenesToQuery/' + species_code + '/DenominatorGenes'
        else:
            denom_folder = criterion_denom_folder
        ### NOTE(review): error_warning is built but not used below
        error_warning = "\nThe directory\n" + '[' + denom_folder + ']' + "\nwas not found. Please create the directory\nand place an appropriate denominator file\nor files in it."
        ### NOTE(review): this call duplicates the guarded one just below; an exception raised
        ### here bypasses the "No denominator file" message - confirm whether it is intentional
        denominator_file_dir = identifyGeneFiles(denom_folder, gene_file)  ###input is in input\Genes, denominator in
        try:
            denominator_file_dir = identifyGeneFiles(denom_folder, gene_file)  ###input is in input\Genes, denominator in
            denominator_file = string.split(denominator_file_dir, '/')[-1]
            print 'Using:', denominator_file, 'for the denominator.'
        except Exception:
            print_out = "WARNING: No denominator file included in\nthe Denominator directory.\nTo proceed, place all denominator\nIDs in a file in that directory."
            ForceCriticalError(print_out)
        ### The denominator is only re-imported when it differs from the previous input's denominator
        if denominator_file_dir == previous_denominator_file_dir:
            denom_file_status = 'old'
        else:
            denom_file_status = 'new'
        if denom_file_status == 'new':
            previous_denominator_file_dir = denominator_file_dir
            denominator_gene_list, source_data_denom, error_message = gene_associations.importUIDsForMAPPFinderQuery(
                denominator_file_dir, system_codes, 'no')
            denom_count = len(denominator_gene_list)
            if 'SwissProt' in error_message and 'WARNING!!!' not in error_message:
                if len(input_gene_list) == 0:
                    error_message += '\nNo valid input IDs found. Exiting GO-Elite.'
                    try:
                        UI.WarningWindow(error_message, 'Warning!!! Identifier Error')  ### Only warn, don't force an exit (if SwissProt full IDs are present)
                    except Exception:
                        None
                    sys.exit()
                else:
                    try:
                        UI.WarningWindow(error_message, 'Warning!!! Identifier Error')  ### Only warn, don't force an exit (if SwissProt full IDs are present)
                    except Exception:
                        None
            elif len(error_message) > 0:
                ForceCriticalError(error_message)
            if len(denominator_gene_list) == len(input_gene_list):
                print_out = 'Input and Denominator lists have identical counts.\nPlease load a propper denominator set (containing\nthe input list with all assayed gene IDs) before proceeding.'
                ForceCriticalError(print_out)
            original_denominator_gene_list = []
            for id in denominator_gene_list:
                original_denominator_gene_list.append(id)  ###need this to be a valid list not dictionary for permutation analysis
        if len(source_data_input) > 0:
            source_data = source_data_input  ###over-ride source_data if a source was identified from the input file
        if source_data != mod:
            ### Source IDs differ from the MOD: translate denominator (once per denominator file) and input
            if denom_file_status == 'new':
                mod_source = mod + '-' + source_data + '.txt'
                #checkDenominatorMatchesInput(input_gene_list,denominator_gene_list,gene_file) ###This is checked for the source IDs not associated MOD IDs
                try:
                    gene_to_source_id = gene_associations.getGeneToUid(species_code, mod_source)
                    print mod_source, 'imported'
                except Exception:
                    ### Retry with the other MOD; note that this changes the global ``mod``
                    try:
                        if mod == 'EntrezGene':
                            mod = 'Ensembl'
                        else:
                            mod = 'EntrezGene'
                        print 'The primary system (MOD) has been switched from', mod_db, 'to', mod, '\n(' + mod_db, 'not supported for the %s ID system).' % source_data
                        mod_source = mod + '-' + source_data + '.txt'
                        gene_to_source_id = gene_associations.getGeneToUid(species_code, mod_source)
                    except Exception:
                        print_out = "WARNING: The primary gene ID system '" + mod + "'\ndoes not support relationships with '" + source_data + "'.\nRe-run using a supported primary ID system."
                        ForceCriticalError(print_out)
                source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
                denominator_gene_list = associateInputSourceWithGene(source_to_gene, denominator_gene_list)
                ### Introduced the below method in version 1.21 to improve permutation speed (no longer need to search all source IDs)
                ### Only includes source ID to gene relationships represented in the denominator file (needed for Affymetrix)
                source_to_gene = OBO_import.swapKeyValues(denominator_gene_list)
            ###Replace input lists with corresponding MOD IDs
            input_gene_list = associateInputSourceWithGene(source_to_gene, input_gene_list)
        checkDenominatorMatchesInput(input_gene_list, denominator_gene_list, gene_file)  ###This is for only the associated MOD IDs
        gd = GrabFiles()
        gd.setdirectory('/' + database_dir + '/' + species_code + '/gene-mapp')
        available_genesets = reorganizeResourceList(gd.getAllFiles(mod))
        od = GrabFiles()
        od.setdirectory('/' + database_dir + '/' + species_code + '/gene-go')
        available_ontologies = reorganizeResourceList(od.getAllFiles(mod))
        input_gene_count = len(input_gene_list)  ###Count number of genes associated with source input IDs
        if len(input_gene_list) == 0 or len(denominator_gene_list) == 0:
            if len(input_gene_list) == 0:
                print_out = 'WARNING!!!! None of the input IDs provided map to genes for ' + mappfinder_input + '. Check to make sure the selected species is correct.'
                print_out += '\nSelected species: ' + species_name
                print_out += '\nInput ID system: ' + str(source_data_input)
                print_out += '\nPrimary ID system (MOD): ' + str(mod)
                ForceCriticalError(print_out)
            if len(denominator_gene_list) == 0:
                print_out = 'WARNING!!!! None of the denominator IDs provided map to genes for ' + denominator_file_dir + '. Check to make sure the selected species is correct.'
                print_out += '\nSelected species: ' + species_name
                ### NOTE(review): reports the ``source`` argument, not the system detected in the denominator file
                print_out += '\nDenominator ID system: ' + str(source)
                print_out += '\nPrimary ID system (MOD):' + str(mod)
                ForceCriticalError(print_out)
        elif len(available_ontologies) == 0 and len(available_genesets) == 0:
            print_out = 'WARNING!!!! No Ontology or GeneSets appear to be available for this species. Please supply and re-analyze.'
            ForceCriticalError(print_out)
        else:
            """ Perform permutation analysis and ORA on available GeneSets or Ontologies"""
            inputs_analyzed += 1
            global permute_inputs
            permute_inputs = []
            if permutations != 0 or use_FET == 'no':
                buildPermutationDatabase(original_denominator_gene_list, input_count)
            run_status = 0  ### sum of the status values returned by the ORA helpers
            ### Analyzed ontologies
            if len(available_ontologies) > 0:
                print ' Analyzing input ID list with available ontologies'
            for ontology_dir in available_ontologies:
                ontology_type = getResourceType(ontology_dir)
                permuted_z_scores = {}
                original_ontology_z_score_data = {}
                #print ontology_type, resources_to_analyze
                if resources_to_analyze == ontology_type or resources_to_analyze == 'all':
                    ontology_annotations = importOntologyAnnotations(species_code, ontology_type)
                    if ontology_annotations != None:  ### Occurs when the files are named or formatted correctly
                        status, ontology_to_mod_genes = performOntologyORA(ontology_dir)
                        run_status += status
            ### Analyzed gene-sets
            if len(available_genesets) > 0:
                print ' Analyzing input ID list with available gene-sets'
            for geneset_dir in available_genesets:
                geneset_type = getResourceType(geneset_dir)
                permuted_z_scores = {}
                original_mapp_z_score_data = {}
                if resources_to_analyze == geneset_type or resources_to_analyze == 'all':
                    status, mapp_to_mod_genes = performGeneSetORA(geneset_dir)
                    run_status += status
            if len(custom_sets_folder) > 0:
                ### Hence - Analyze User Supplied GeneSets
                permuted_z_scores = {}
                original_mapp_z_score_data = {}
                run_status += performGeneSetORA('UserSuppliedAssociations')[0]
            permute_inputs = []
            permute_mapp_inputs = []
            ontology_input_gene_count = []
            mapp_input_gene_count = []
            if run_status == 0:
                ### Returns the number of successfully analyzed gene-set databases
                program_type, database_dir = unique.whatProgramIsThis()
                print_out = "Warning!!! Either the MOD you have selected: " + mod + "\nis missing the appropriate relationshipfiles necessary to run GO-Elite\nor you have selected an invalid resource to analyze. Either replace\nthe missing MOD files in " + database_dir + '/' + species_code + ' sub-directories or\nselect a different MOD at run-time.'
                ForceCriticalError(print_out)
    end_time = time.time()
    time_diff = formatTime(start_time, end_time)
    print 'ORA analyses finished in %s seconds' % time_diff
    return ontology_to_mod_genes, mapp_to_mod_genes  ###Return the MOD genes associated with each GO term and MAPP
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None): ### Import gene-level expression raw values fn=filepath(filename); x=0; genes_added={}; gene_expression_db={} dataset_name = export.findFilename(filename) max_val=0 print 'importing:',dataset_name try: import gene_associations, OBO_import gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol')) symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) except Exception: symbol_to_gene={} for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) t = string.split(data,'\t') if x==0: if '#' not in data: for i in t[1:]: sample_headers.append(i) x=1 else: gene = t[0] try: gene = string.split(t[0],'|')[0] except Exception: pass #if '-' not in gene and ':E' in gene: print gene;sys.exit() if analysis_type == 'AltExon': try: ens_gene,exon = string.split(gene,'-')[:2] except Exception: exon = gene gene = exon if keyed_by == 'translation': ### alternative value is 'primaryID' """if gene == 'ENSMUSG00000025915-E19.3': for i in translation_db: print [i], len(translation_db); break print gene, [translation_db[gene]];sys.exit()""" try: gene = translation_db[gene] ### Ensembl annotations except Exception: pass try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid except Exception: pass if gene in tissue_specific_db: index,tissue_exp=tissue_specific_db[gene] try: genes_added[gene]+=1 except Exception: genes_added[gene]=1 proceed=True try: exp_vals = t[1:] if '' in exp_vals: ### If missing values present (PSI values) exp_vals = ['0.000101' if i=='' else i for i in exp_vals] useLog = False exp_vals = map(float, exp_vals) if platform == 'RNASeq': if max(exp_vals)>max_val: max_val = max(exp_vals) #if max(exp_vals)<3: proceed=False if useLog==False: exp_vals = map(lambda x: math.log(x+1,2),exp_vals) if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression exp_vals = 
produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls if proceed: gene_expression_db[gene] = [index,exp_vals] except Exception: print 'Non-numeric values detected:' x = 5 print t[:x] while x < t: t[x:x+5] x+=5 print 'Formatting error encountered in:',dataset_name; forceError """else: for gene in tissue_specific_db: if 'Ndufa9:ENSMUSG00000000399:I2.1-E3.1' in gene: print gene, 'dog';sys.exit() print gene;kill""" print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database' for gene in genes_added: if genes_added[gene]>1: del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy) else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression #print len(expession_subset);sys.exit() expession_subset.sort() ### This order now matches that of gene_expression_db=[] if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow to happen once importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
def performEventEnrichment(output_dir, eventDir, species):
    """Import significant splicing events from metaDataAnalysis.py comparisons
    and test for their statistical enrichment relative to the Splicing Factor
    correlated events.

    eventDir   -- folder of signature files ('.txt' files whose name contains
                  'PSI.'); the first column of every data row is an event ID
                  and the header must contain an 'Event-Direction' column.
    output_dir -- folder of splicing-factor files ('.txt' files whose name
                  contains '_'); the first column of every line is an event ID.
    species    -- species code used to translate Ensembl IDs found in file
                  names into gene symbols.

    Writes output_dir/SFEnrichmentResults/SF-correlated_SignatureScores.txt
    and returns nothing. Raises ValueError when a signature file header has no
    'Event-Direction' column.
    """
    import collections
    import mappfinder
    import UI

    ### Import the splice-ICGS significant splicing events per signature
    event_db = collections.OrderedDict()
    for name in UI.read_directory(eventDir):
        if '.txt' not in name or 'PSI.' not in name:
            continue
        signature_events = []
        event_db[name[:-4]] = signature_events
        inclusion_only = 'U2AF1-like' in name ### only rows whose second column is "inclusion" are kept for these files
        with open(eventDir + '/' + name, 'rU') as handle:
            header = True
            for line in handle:
                t = line.rstrip().split('\t')
                if header:
                    t.index('Event-Direction') ### ValueError if the expected column is absent
                    header = False
                    continue
                if not inclusion_only or t[1] == "inclusion":
                    signature_events.append(t[0])

    ### Import the splicing-factor correlated splicing events to identify associated signatures
    splicing_factor_correlated_scores = {}
    gene_to_symbol = None
    N = 80000 ### fixed total passed to Zscore and FishersExactTest
    for name in UI.read_directory(output_dir):
        if '.txt' not in name or '_' not in name:
            continue
        if 'ENS' in name:
            if gene_to_symbol is None: ### Import only once
                import gene_associations
                gene_to_symbol = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
            ensembl_id = ('ENS' + name[:-4].split('ENS')[1]).split('_')[0]
            splicing_factor = ensembl_id
            if ensembl_id in gene_to_symbol:
                splicing_factor = gene_to_symbol[ensembl_id][0]
        else:
            splicing_factor = name[:-4].split('_')[0]

        ### NOTE(review): every line, including any header line, is counted as an event - confirm intended
        with open(output_dir + '/' + name, 'rU') as handle:
            R_ls = [line.rstrip().split('\t')[0] for line in handle]
        R = len(R_ls)
        correlated_events = set(R_ls)

        for signature in event_db:
            n_ls = event_db[signature]
            n = len(n_ls)
            r_ls = correlated_events.intersection(n_ls)
            r = len(r_ls)
            ### Calculate a Z-score
            try: z = Zscore(r, n, N, R)
            except ZeroDivisionError: z = 0.0000
            ### Calculate a Z-score assuming zero matching entries
            try: null_z = Zscore(0, n, N, R)
            except ZeroDivisionError: null_z = 0.000
            ### Calculate a Fischer's Exact P-value
            pval = mappfinder.FishersExactTest(r, n, R, N)
            ### Store these data in an object
            zsd = mappfinder.ZScoreData(signature, r, n, z, null_z, n)
            zsd.SetP(pval)
            zsd.setAssociatedIDs(r_ls)
            ### Necessary format for the permutation function
            splicing_factor_correlated_scores.setdefault(splicing_factor, {})[signature] = zsd

    results_dir = output_dir + '/SFEnrichmentResults'
    result_file = results_dir + '/SF-correlated_SignatureScores.txt'
    try: os.mkdir(results_dir)
    except OSError: pass ### typically the folder already exists; any other problem surfaces when the file is opened

    column_names = ['Splicing Factor', 'Signature', 'Number Changed', 'Number Measured',
                    'Z-score', 'FisherExactP', 'AdjustedP']
    with open(result_file, 'w') as eo:
        eo.write('\t'.join(column_names) + '\n')
        ### Perform a permutation analysis to get BH adjusted p-values
        for splicing_factor in splicing_factor_correlated_scores:
            signature_db = splicing_factor_correlated_scores[splicing_factor]
            ### Updates the adjusted p-value instances
            mappfinder.adjustPermuteStats(signature_db)
            sorted_results = []
            for signature in signature_db:
                zsd = signature_db[signature]
                if float(zsd.ZScore()) > 1.96 and float(zsd.Changed()) > 2 and float(zsd.PermuteP()) < 0.05:
                    results = [splicing_factor, signature, zsd.Changed(), zsd.Measured(),
                               zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()]
                    sorted_results.append([float(zsd.PermuteP()), results])
            sorted_results.sort() ### Sort by p-value
            for (p, values) in sorted_results:
                eo.write('\t'.join(values) + '\n')
            if len(sorted_results) == 0:
                eo.write('\t'.join([splicing_factor] + ['NONE'] * 6) + '\n')
def CreateFilesMonocle(filename, rawExpressionFile, species="Hs"):
    """Export a clustered expression table as three Monocle input files.

    Reads the tab-delimited file `filename` (first row = sample headers, with
    an optional 'row_clusters-flat' second column; a 'column_clusters-flat'
    row is ignored) and writes into a 'Monocle' folder beside it:

      expressionFile.txt  -- every row shifted so its smallest numeric value
                             becomes 0; non-numeric cells become 0.000101
      sampleGroups.txt    -- sample header and the text before its first
                             '::' (or ':') as the group
      geneAnnotations.txt -- row ID and its gene_short_name (first symbol
                             found for the ID, otherwise the ID itself)

    rawExpressionFile is not used by this function. Nothing is returned.
    Raises ValueError when the input file has no header row.
    """
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(species, ("hide", "Ensembl-Symbol"))
    except Exception:
        print("gene_symbols present")
        gene_to_symbol = {}

    parent_dir = findParentDir(filename)
    setWorkingDirectory(parent_dir[:-1])
    monocle_dir = parent_dir + "/Monocle"
    try:
        os.mkdir(monocle_dir)
    except OSError:
        pass  ### typically the folder already exists

    ### Read the header and the data rows
    headers = None
    offset = 0
    key_list = []
    key_db = {}
    with open(filepath(filename), "rU") as handle:
        for line in handle:
            t = cleanUpLine(line).split("\t")
            if headers is None:
                if "row_clusters-flat" in t and "row_clusters-flat" not in t[0]:
                    offset = 1  ### an extra cluster column precedes the values
                headers = "\t".join(t[1 + offset:]) + "\n"
            elif t[0] != "column_clusters-flat":
                key_list.append(t[0])
                key_db[t[0]] = t
    if headers is None:
        raise ValueError("No header row found in " + str(filename))

    ### Shift each row so that its minimum is zero
    shifted_rows = {}
    for key in key_list:
        raw_values = key_db[key][offset + 1:]
        numeric = []
        for value in raw_values:
            try:
                numeric.append(float(value))
            except ValueError:
                pass
        row_min = min(numeric) if numeric else 0.0  ### a row without numbers previously crashed min()
        row = [key]
        for value in raw_values:
            try:
                row.append(str(float(value) - row_min))
            except ValueError:
                row.append("0.000101")
        shifted_rows[key] = row

    with open(monocle_dir + "/expressionFile.txt", "w") as export_object:
        export_object.write("\t" + headers)  ### Header is the same for each file
        for key in key_list:
            export_object.write("\t".join(shifted_rows[key]) + "\n")
    print("File written...")

    with open(monocle_dir + "/sampleGroups.txt", "w") as export_cdt:
        export_cdt.write("\tGroup\n")
        for entry in headers.split("\t"):
            entry = cleanUpLine(entry)
            separator = "::" if "::" in entry else ":"
            export_cdt.write(entry + "\t" + entry.split(separator)[0] + "\n")

    ### The original loop indexed gene_to_symbol with the builtin `id`, used `proceed`
    ### before assignment and wrote to a closed file, so any annotated row raised.
    with open(monocle_dir + "/geneAnnotations.txt", "w") as export_gene:
        export_gene.write("\tgene_short_name\n")
        for key in key_list:
            symbols = gene_to_symbol[key] if key in gene_to_symbol else None
            short_name = symbols[0] if symbols else key
            export_gene.write(key + "\t" + short_name + "\n")
def reformatPolyAdenylationCoordinates(species,force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format.

    species -- two-letter species code; must be one of Rn, Dr, Gg, Hs, Mm
               (KeyError otherwise, raised before anything is downloaded).
    force   -- 'yes' downloads polyAsite.txt through update.download; any
               other value reads AltDatabase/ucsc/<species>/polyAsite.txt.

    Writes AltDatabase/ucsc/<species>/<species>_polyADB_2_predictions.bed.
    Sites whose gene maps to Ensembl get that gene's chromosome and strand;
    unmapped sites are written once per strand. Nothing is returned.
    """
    version = {'Rn':'2003(rn3)', 'Dr':'2003(zv4)', 'Gg':'2004(galGal2)',
               'Hs':'2006(hg8)', 'Mm':'2004(mm5)'}
    build = version[species] ### fail for unsupported species before any download or file creation
    print('Exporting polyADB_2 coordinates as BED for %s' % species)

    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/'+species+'/'
    if force == 'yes':
        filename, status = update.download(url,output_dir,'')
    else:
        filename = output_dir+'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations; import OBO_import; import EnsemblImport; import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,'Ensembl-UniGene')
        print('%d Ensembl-UniGene entries imported' % len(ens_unigene))
        external_ensembl = OBO_import.swapKeyValues(ens_unigene); use_entrez='no'
    except Exception:
        ### Fall back to EntrezGene when the UniGene relationships cannot be loaded
        ens_entrez = gene_associations.getGeneToUid(species,'Ensembl-EntrezGene')
        print('%d Ensembl-EntrezGene entries imported' % len(ens_entrez))
        external_ensembl = OBO_import.swapKeyValues(ens_entrez); use_entrez='yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')

    export_bedfile = output_dir+species+'_polyADB_2_predictions.bed'
    print('exporting %s' % export_bedfile)
    export_data = export.ExportFile(export_bedfile)
    try:
        export_data.write('#'+species+'\t'+'polyADB_2'+'\t'+build+'\n')
        with open(filepath(filename),'rU') as handle:
            header_skipped = False
            for line in handle:
                data = cleanUpLine(line)
                if not header_skipped:
                    header_skipped = True
                    continue
                siteid,llid,chrom,sitenum,position,supporting_EST,cleavage = data.split('\t')
                ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
                if chrom == 'chrM': chrom = 'chrMT'
                if chrom == 'M': chrom = 'MT'
                if species not in siteid:
                    continue
                ### Previously an already prefixed name was prefixed again (chrM -> chrchrMT)
                if 'NA' not in chrom and not chrom.startswith('chr'): chrom = 'chr'+chrom
                strand = '+'; geneid = siteid
                pos_start = str(int(position)-1); pos_end = position
                if use_entrez == 'no':
                    external_geneid = '.'.join(siteid.split('.')[:2])
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-'+ens_geneid
                    chrom,strand,gene_start,gene_end = gene_location_db[ens_geneid]
                else:
                    ### We don't know the strand, so write out both strands
                    export_data.write('\t'.join([chrom,pos_start,pos_end,geneid,'0','-'])+'\n')
                export_data.write('\t'.join([chrom,pos_start,pos_end,geneid,'0',strand])+'\n')
    finally:
        export_data.close()
def CreateFilesMonocle(filename,rawExpressionFile,species='Hs'):
    """Export a clustered expression table as three Monocle input files.

    Reads the tab-delimited file `filename` (first row = sample headers, with
    an optional 'row_clusters-flat' second column and an optional
    'column_clusters-flat' row of per-sample cluster labels) and writes into a
    'Monocle' folder beside it:

      expressionFile.txt  -- every row shifted so its smallest numeric value
                             becomes 0; non-numeric cells become 0.000101
      sampleGroups.txt    -- sample header and its group: the text before
                             '::' or ':' when present, otherwise the sample's
                             'column_clusters-flat' label, otherwise the
                             header itself
      geneAnnotations.txt -- row ID and its gene_short_name (first symbol
                             found for the ID, otherwise the ID itself)

    rawExpressionFile is not used by this function. Nothing is returned.
    Raises ValueError when the input file has no header row.
    """
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
    except Exception:
        print("gene_symbols present")
        gene_to_symbol={}

    parent_dir = findParentDir(filename)
    setWorkingDirectory(parent_dir[:-1])
    monocle_dir = parent_dir+'/Monocle'
    try: os.mkdir(monocle_dir)
    except OSError: pass ### typically the folder already exists

    ### Read the header, the data rows and the optional sample cluster labels
    headers = None
    offset = 0
    clusters = None
    key_list = []
    key_db = {}
    with open(filepath(filename),'rU') as handle:
        for line in handle:
            t = cleanUpLine(line).split('\t')
            if headers is None:
                if 'row_clusters-flat' in t and 'row_clusters-flat' not in t[0]:
                    offset = 1 ### an extra cluster column precedes the values
                headers = '\t'.join(t[1+offset:])+'\n'
            elif t[0] == 'column_clusters-flat':
                clusters = [str(label) for label in t[offset+1:]]
            else:
                key_list.append(t[0])
                key_db[t[0]] = t
    if headers is None:
        raise ValueError('No header row found in '+str(filename))

    ### Shift each row so that its minimum is zero
    shifted_rows = {}
    for key in key_list:
        raw_values = key_db[key][offset+1:]
        numeric = []
        for value in raw_values:
            try: numeric.append(float(value))
            except ValueError: pass
        row_min = min(numeric) if numeric else 0.0 ### a row without numbers previously crashed min()
        row = [key]
        for value in raw_values:
            try: row.append(str(float(value)-row_min))
            except ValueError: row.append('0.000101')
        shifted_rows[key] = row

    with open(monocle_dir+'/expressionFile.txt','w') as export_object:
        export_object.write('\t'+headers) ### Header is the same for each file
        for key in key_list:
            export_object.write('\t'.join(shifted_rows[key])+'\n')
    print('File written...')

    with open(monocle_dir+'/sampleGroups.txt','w') as export_cdt:
        export_cdt.write('\tGroup\n')
        for i, entry in enumerate(headers.split('\t')):
            entry = cleanUpLine(entry)
            if '::' in entry: group = entry.split('::')[0]
            elif ':' in entry: group = entry.split(':')[0]
            elif clusters is not None and i < len(clusters): group = clusters[i]
            else: group = entry ### no cluster row was supplied (previously a NameError)
            export_cdt.write(entry+'\t'+group+'\n')

    ### The original loop indexed gene_to_symbol with the builtin `id`, used `proceed`
    ### before assignment and wrote to a closed file, so any annotated row raised.
    with open(monocle_dir+'/geneAnnotations.txt','w') as export_gene:
        export_gene.write('\tgene_short_name\n')
        for key in key_list:
            symbols = gene_to_symbol[key] if key in gene_to_symbol else None
            short_name = symbols[0] if symbols else key
            export_gene.write(key+'\t'+short_name+'\n')
def CreateFilesMonocle(filename, rawExpressionFile, species='Hs'):
    """Export a clustered expression table as three Monocle input files.

    Reads the tab-delimited file `filename` (first row = sample headers, with
    an optional 'row_clusters-flat' second column; a 'column_clusters-flat'
    row is ignored) and writes into a 'Monocle' folder beside it:

      expressionFile.txt  -- every row shifted so its smallest numeric value
                             becomes 0; non-numeric cells become 0.000101
      sampleGroups.txt    -- sample header and the text before its first
                             '::' (or ':') as the group
      geneAnnotations.txt -- row ID and its gene_short_name (first symbol
                             found for the ID, otherwise the ID itself)

    rawExpressionFile is not used by this function. Nothing is returned.
    Raises ValueError when the input file has no header row.
    """

    def _shift_to_zero(row_id, cells):
        ### Return [row_id, values...] with the row minimum subtracted from each numeric cell
        numbers = []
        for cell in cells:
            try:
                numbers.append(float(cell))
            except ValueError:
                pass
        floor = min(numbers) if numbers else 0.0  ### a row without numbers previously crashed min()
        shifted = [row_id]
        for cell in cells:
            try:
                shifted.append(str(float(cell) - floor))
            except ValueError:
                shifted.append('0.000101')
        return shifted

    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
    except Exception:
        print("gene_symbols present")
        gene_to_symbol = {}

    parent_dir = findParentDir(filename)
    setWorkingDirectory(parent_dir[:-1])
    monocle_dir = parent_dir + '/Monocle'
    try:
        os.mkdir(monocle_dir)
    except OSError:
        pass  ### typically the folder already exists

    headers = None
    offset = 0
    key_list = []
    key_db = {}
    with open(filepath(filename), 'rU') as handle:
        for line in handle:
            t = cleanUpLine(line).split('\t')
            if headers is None:
                if 'row_clusters-flat' in t and 'row_clusters-flat' not in t[0]:
                    offset = 1  ### an extra cluster column precedes the values
                headers = '\t'.join(t[1 + offset:]) + '\n'
                continue
            if t[0] == 'column_clusters-flat':
                continue
            key_list.append(t[0])
            key_db[t[0]] = t
    if headers is None:
        raise ValueError('No header row found in ' + str(filename))

    shifted_rows = dict((key, _shift_to_zero(key, key_db[key][offset + 1:])) for key in key_list)

    with open(monocle_dir + '/expressionFile.txt', 'w') as export_object:
        export_object.write('\t' + headers)  ### Header is the same for each file
        for key in key_list:
            export_object.write('\t'.join(shifted_rows[key]) + '\n')
    print('File written...')

    with open(monocle_dir + '/sampleGroups.txt', 'w') as export_cdt:
        export_cdt.write('\tGroup\n')
        for entry in headers.split('\t'):
            entry = cleanUpLine(entry)
            if '::' in entry:
                group = entry.split("::")[0]
            else:
                group = entry.split(":")[0]
            export_cdt.write(entry + '\t' + group + '\n')

    ### The original loop indexed gene_to_symbol with the builtin `id`, used `proceed`
    ### before assignment and wrote to a closed file, so any annotated row raised.
    with open(monocle_dir + '/geneAnnotations.txt', 'w') as export_gene:
        export_gene.write('\tgene_short_name\n')
        for key in key_list:
            symbols = gene_to_symbol[key] if key in gene_to_symbol else None
            short_name = symbols[0] if symbols else key
            export_gene.write(key + '\t' + short_name + '\n')
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None):
    """Import gene-level expression values for IDs present in tissue_specific_db.

    The first line without a '#' is treated as the header; its sample names
    are appended to the module-level list sample_headers. Each later row ID is
    optionally reduced to its exon ID (analysis_type 'AltExon'), optionally
    translated through translation_db (keyed_by 'translation') and finally
    through a symbol-to-gene lookup when one can be loaded for `species`.
    Rows whose final ID is in tissue_specific_db are converted to floats,
    log2(x+1) transformed for RNASeq data unless useLog is set, and optionally
    converted to detection calls (value_type 'calls').

    IDs seen exactly once are appended as [index, values] to the module-level
    list expession_subset, which is then sorted. Nothing is returned.

    If the largest RNASeq value is below 20 the file is imported a second time
    with useLog=True (previouslyRun prevents a third pass).

    Raises ValueError when a matching row cannot be processed.
    """
    ### NOTE(review): the second pass appends to sample_headers and expession_subset again;
    ### confirm that callers reset or de-duplicate those module-level lists.
    fn = filepath(filename)
    dataset_name = export.findFilename(filename)
    print('importing: %s' % dataset_name)

    try:
        import gene_associations, OBO_import
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception:
        symbol_to_gene = {} ### No symbol annotations could be loaded; IDs are used as supplied

    occurrences = {}
    matched_rows = {}
    max_val = 0
    awaiting_header = True
    with open(fn,'rU') as handle:
        for line in handle:
            data = cleanUpLine(line)
            t = data.split('\t')
            if awaiting_header:
                if '#' not in data:
                    sample_headers.extend(t[1:])
                    awaiting_header = False
                continue

            gene = t[0]
            if analysis_type == 'AltExon':
                id_parts = gene.split('-')
                if len(id_parts) > 1: gene = id_parts[1] ### keep the exon portion of gene-exon IDs
            if keyed_by == 'translation': ### alternative value is 'primaryID'
                try: gene = translation_db[gene] ### Ensembl annotations
                except (KeyError, TypeError): pass
            try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid
            except (KeyError, IndexError, TypeError): pass

            if gene not in tissue_specific_db:
                continue
            index, tissue_exp = tissue_specific_db[gene]
            occurrences[gene] = occurrences.get(gene, 0) + 1
            try:
                exp_vals = [float(value) for value in t[1:]]
                if platform == 'RNASeq':
                    row_max = max(exp_vals)
                    if row_max > max_val: max_val = row_max
                    if useLog == False:
                        exp_vals = [math.log(value+1,2) for value in exp_vals]
                if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression
                    exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls
                matched_rows[gene] = [index,exp_vals]
            except Exception:
                ### The original handler looped on "x < t" (int vs list), which never ends in
                ### Python 2, and then relied on an undefined name to abort.
                print('Non-numeric values detected:')
                print(t[:5])
                print('Formatting error encountered in: %s' % dataset_name)
                raise ValueError('Formatting error encountered in: %s' % dataset_name)

    print('%d matching genes in the dataset and tissue compendium database' % len(matched_rows))

    for gene in occurrences:
        if occurrences[gene] > 1:
            del matched_rows[gene] ### delete entries that are present in the input set multiple times (not trustworthy)
        else:
            expession_subset.append(matched_rows[gene]) ### These contain the rank order and expression
    expession_subset.sort()

    if max_val < 20 and platform == 'RNASeq' and previouslyRun == False: ### Only allow to happen once
        importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
def importPAZARAssociations():
    """Import PAZAR transcription factor targets and export them by gene symbol.

    Every '.csv' file listed for BuildDBs/PAZAR is read as 11 tab-delimited
    fields per line; lines that cannot be parsed are skipped. Each target is
    stored as a TFTargetInfo object under (species code, TF name), where the
    species code is the first letter of each of the two words in the species
    field.

    For every species the Ensembl-Symbol relationships are loaded (an empty
    lookup is used if that fails) and one line "<TF>(Source:<project>-PAZAR)
    <tab> <lower-case symbol>" is written to
    BuildDBs/PAZAR/symbol/tf-target.txt per target symbol.

    Returns the TF identifier -> symbols dictionary after it has been passed
    through gene_associations.eliminate_redundant_dict_values.
    """
    tf_to_target = {}
    for pazar_file in unique.read_directory('/BuildDBs/PAZAR'):
        if '.csv' not in pazar_file:
            continue
        with open(filepath('BuildDBs/PAZAR/'+pazar_file),'rU') as handle:
            for line in handle:
                data = cleanUpLine(line)
                try:
                    ### Each line contains the following 11 tab-delim fields:
                    ### <PAZAR TF ID> <TF Name> <PAZAR Gene ID> <ensembl gene accession> <chromosome> <gene start coordinate> <gene end coordinate> <species> <project name> <PMID> <analysis method>
                    (pazar_tf_id, tf_name, pazar_geneid, ens_gene, chrom, gene_start, gene_end,
                     organism, project, pmid, analysis_method) = data.split('\t')
                    first_word, second_word = organism.split(' ')
                    species = first_word[0]+second_word[0]
                    tft = TFTargetInfo(tf_name,ens_gene,project,pmid,analysis_method)
                except Exception:
                    continue ### Occurs due to file formatting issues (during an update?)
                tf_to_target.setdefault((species,tf_name),[]).append(tft)

    ### Translate all species data to gene symbol to export for all species
    species_tf_targets = {}
    for (species,tf_name) in tf_to_target:
        species_tf_targets.setdefault(species,{})[tf_name] = tf_to_target[species,tf_name]

    tf_to_symbol = {}
    tf_data = export.ExportFile('BuildDBs/PAZAR/symbol/tf-target.txt')
    try:
        for species in species_tf_targets:
            try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
            except Exception: gene_to_source_id = {} ### no symbol relationships could be loaded for this species
            tf_db = species_tf_targets[species]
            for tf_name in tf_db:
                for tft in tf_db[tf_name]:
                    ens_gene = tft.Ensembl()
                    if ens_gene not in gene_to_source_id:
                        continue ### targets without a symbol are not exported
                    tf_id = tf_name+'(Source:'+tft.Project()+'-PAZAR'+')'
                    for symbol in gene_to_source_id[ens_gene]:
                        symbol = symbol.lower()
                        tf_data.write(tf_id+'\t'+symbol+'\n')
                        tf_to_symbol.setdefault(tf_id,[]).append(symbol)
    finally:
        tf_data.close()
    return gene_associations.eliminate_redundant_dict_values(tf_to_symbol)
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format.

    species -- two-letter species code; must be one of Rn, Dr, Gg, Hs, Mm
               (KeyError otherwise, raised before anything is downloaded).
    force   -- 'yes' downloads polyAsite.txt through update.download; any
               other value reads AltDatabase/ucsc/<species>/polyAsite.txt.

    Writes AltDatabase/ucsc/<species>/<species>_polyADB_2_predictions.bed.
    Sites whose gene maps to Ensembl get that gene's chromosome and strand;
    unmapped sites are written once per strand. Nothing is returned.
    """
    version = {
        'Rn': '2003(rn3)',
        'Dr': '2003(zv4)',
        'Gg': '2004(galGal2)',
        'Hs': '2006(hg8)',
        'Mm': '2004(mm5)',
    }
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'  ### unsupported species fail here, before any download
    print('Exporting polyADB_2 coordinates as BED for %s' % species)

    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene')
        print('%d Ensembl-UniGene entries imported' % len(ens_unigene))
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ### Fall back to EntrezGene when the UniGene relationships cannot be loaded
        ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene')
        print('%d Ensembl-EntrezGene entries imported' % len(ens_entrez))
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print('exporting %s' % export_bedfile)
    export_data = export.ExportFile(export_bedfile)

    def write_site(chrom, first, last, name, strand):
        ### One BED line with a fixed score of 0
        export_data.write('\t'.join([chrom, first, last, name, '0', strand]) + '\n')

    try:
        export_data.write(header)
        with open(filepath(filename), 'rU') as handle:
            for row_number, line in enumerate(handle):
                data = cleanUpLine(line)
                if row_number == 0:
                    continue  ### skip the column header
                siteid, llid, chrom, sitenum, position, supporting_EST, cleavage = data.split('\t')
                ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
                if chrom == 'chrM':
                    chrom = 'chrMT'
                if chrom == 'M':
                    chrom = 'MT'
                if species not in siteid:
                    continue
                ### Previously an already prefixed name was prefixed again (chrM -> chrchrMT)
                if 'NA' not in chrom and not chrom.startswith('chr'):
                    chrom = 'chr' + chrom
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = '.'.join(siteid.split('.')[:2])
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chrom, strand, gene_start, gene_end = gene_location_db[ens_geneid]
                else:
                    ### We don't know the strand, so write out both strands
                    write_site(chrom, pos_start, pos_end, geneid, '-')
                write_site(chrom, pos_start, pos_end, geneid, strand)
    finally:
        export_data.close()