def justConvertFilenames(species, outputdir):
    """Prefix SashimiPlot image files in *outputdir* with a gene symbol.

    Every directory entry whose name contains '.pdf' or '.png' is examined.
    The text in front of the first '__' (after the extensions have been
    stripped) is looked up in the Ensembl-Symbol table for *species*; when it
    is present the file is renamed to
    ``<symbol>-SashimiPlot_<second piece of the original name>``.
    Entries that do not match are left untouched, and any error raised by the
    rename itself is ignored.
    """
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(
        species, ('hide', 'Ensembl-Symbol'))
    from import_scripts import OBO_import
    # Built for parity with the sibling helpers; not consulted below.
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    for filename in os.listdir(outputdir):
        if not ('.pdf' in filename or '.png' in filename):
            continue

        stem = filename.replace('.pdf', '').replace('.png', '')
        gene_id = stem.split('__')[0]
        if gene_id not in gene_to_symbol:
            continue

        # Keep the piece that follows the first separator found, trying
        # '__', then a backslash, then a forward slash; otherwise keep the
        # whole file name.
        suffix = str(filename)
        for separator in ('__', '\\', '/'):
            if separator in filename:
                suffix = filename.split(separator)[1]
                break

        renamed = gene_to_symbol[gene_id][0] + '-SashimiPlot_' + suffix
        try:
            os.rename(os.path.join(outputdir, filename),
                      os.path.join(outputdir, renamed))
        except Exception:
            # Best effort: a file that cannot be renamed is simply skipped.
            pass
def importGeneSymbols(species):
    """Load the Ensembl-Symbol table for *species* in both directions.

    Returns a 2-tuple: the mapping produced by
    ``gene_associations.getGeneToUid`` and the same mapping after
    ``OBO_import.swapKeyValues`` has been applied to it.
    """
    import gene_associations
    ensembl_to_symbol = gene_associations.getGeneToUid(
        species, ('hide', 'Ensembl-Symbol'))
    from import_scripts import OBO_import
    symbol_to_ensembl = OBO_import.swapKeyValues(ensembl_to_symbol)
    return ensembl_to_symbol, symbol_to_ensembl
def sashmi_plot_list(bamdir, eventsToVisualizeFilename, PSIFilename, events=None): try: import gene_associations gene_to_symbol = gene_associations.getGeneToUid( species, ('hide', 'Ensembl-Symbol')) from import_scripts import OBO_import symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) except Exception: symbol_to_gene = {} if events == None: splicing_events, expandedSearch = importSplicingEventsToVisualize( eventsToVisualizeFilename) else: ### Replace any ":" from the input events #for i in range(len(events)): events[i] = string.replace(events[i],':','__') expandedSearch = True for i in range(len(events)): gene = string.split(events[i], '__')[0] if gene in gene_to_symbol: symbol = gene_to_symbol[gene][0] elif 'ENS' not in gene or 'G0000' in gene: if gene in symbol_to_gene: ensID = symbol_to_gene[gene][0] symbol = gene events[ i] = ensID ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup splicing_events = events ### optionally get from supplied variable if len(splicing_events) == 0: print eventsToVisualizeFilename forceNoCompatibleEventsInFile print 'Exporting plots', ### Determine Groups for Coloring groups_file = 'None' dir_list = unique.read_directory(root_dir + '/ExpressionInput') for file in dir_list: if 'groups.' 
in file: groups_file = root_dir + '/ExpressionInput/' + file if groups_file != None: try: import ExpressionBuilder sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file) groups = [] for sample in sample_group_db: if sample_group_db[sample] not in groups: groups.append(sample_group_db[sample] ) ### create an ordered list of unique group except Exception: groups = ['None'] #print traceback.format_exc() pass processed_events = formatAndSubmitSplicingEventsToSashimiPlot( PSIFilename, bamdir, splicing_events, sample_group_db, groups, False) mopup_events = getMopUpEvents(splicing_events, processed_events) ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide #print len(splicing_events),len(processed_events),len(mopup_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot( steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups, expandedSearch) if len(processed_events) > 0: mopup_events = getMopUpEvents(mopup_events, processed_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot( PSIFilename, bamdir, mopup_events, sample_group_db, groups, True) return gene_to_symbol
def importDataSimple(filename, input_type, MOD=None, Species=None):
    """Read a tab-delimited file into an ID -> value dictionary.

    Leading lines that start with '#' are skipped; the first remaining line
    is treated as the header row.

    When input_type is 'LineageProfiler', every data row is stored as
    id_db[first column] = map(float, remaining columns) and column_headers is
    the header row without its first field.

    For any other input_type the second column of the first data row is read
    as a system code and resolved through GO_Elite.getSourceData().  If the
    resolved source equals the MOD, rows are stored directly under their own
    ID; otherwise IDs are translated through a '<MOD>-<source>.txt'
    relationship table and stored under the first mapped ID.  The stored
    value is float(third column), or 'NA' when that is missing or not
    numeric.  column_headers is the third header field, or 'NA'.

    MOD and Species are only used when the module-level names `mod` and
    `species_code` are not available.

    Returns the tuple (id_db, column_headers).
    """
    id_db = {}
    fn = filepath(filename)
    x = 0  # 0 until the header row has been consumed, then counts data rows from 1
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#' and x == 0:
            x = 0  ### comment line ahead of the header: ignore it
        elif x == 0:
            column_headers = t[1:]
            if input_type != 'LineageProfiler':
                try:
                    column_headers = t[
                        2]  ### keep only the third header field (index 2)
                except Exception:
                    column_headers = 'NA'
            x = 1
        else:
            if x == 1 and input_type != 'LineageProfiler':
                ### get system conversions (first data row only)
                system_code = t[1]
                import GO_Elite
                from import_scripts import OBO_import
                system_codes, source_types, mod_types = GO_Elite.getSourceData(
                )
                source_data = system_codes[system_code]
                try:
                    Mod = mod  ### global established in upstream functions
                    speciescode = species_code
                except Exception:
                    # The globals are not defined: use the arguments instead.
                    Mod = MOD
                    speciescode = Species
                if source_data == Mod:
                    source_is_mod = True
                else:
                    source_is_mod = False
                    # Build a source-ID -> MOD-ID lookup from the relationship table.
                    mod_source = Mod + '-' + source_data + '.txt'
                    gene_to_source_id = gene_associations.getGeneToUid(
                        speciescode, ('hide', mod_source))
                    source_to_gene = OBO_import.swapKeyValues(
                        gene_to_source_id)
            if input_type != 'LineageProfiler':
                if source_is_mod == True:
                    try:
                        id_db[t[0]] = float(t[2])
                    except Exception:
                        id_db[t[0]] = 'NA'
                elif t[0] in source_to_gene:
                    mod_ids = source_to_gene[t[0]]
                    for mod_id in mod_ids:
                        try:
                            value = t[2]
                        except Exception:
                            value = 'NA'
                        # '+' and '-' are accepted as shorthand for 1 and -1.
                        if value == '+':
                            value = 1
                        elif value == '-':
                            value = -1
                        try:
                            id_db[mod_id] = float(
                                value
                            )  ### a later row mapping to the same ID overwrites this value
                        except Exception:
                            id_db[mod_id] = 'NA'
                        break  # only the first mapped ID is recorded
            else:
                id_db[t[0]] = map(float, t[1:])  ### Applies to LineageProfiler
            x += 1
    #print len(id_db),column_headers
    return id_db, column_headers
def reformatPolyAdenylationCoordinates(species, force): """ PolyA annotations are currently only available from UCSC for human, but flat file annotations from 2003-2006 are available for multiple species. Convert these to BED format""" version = {} version['Rn'] = '2003(rn3)' version['Dr'] = '2003(zv4)' version['Gg'] = '2004(galGal2)' version['Hs'] = '2006(hg8)' version['Mm'] = '2004(mm5)' print 'Exporting polyADB_2 coordinates as BED for', species ### Obtain the necessary database files url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt' output_dir = 'AltDatabase/ucsc/' + species + '/' if force == 'yes': filename, status = update.download(url, output_dir, '') else: filename = output_dir + 'polyAsite.txt' ### Import the refseq to Ensembl information import gene_associations from import_scripts import OBO_import from build_scripts import EnsemblImport import export try: ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene') print len(ens_unigene), 'Ensembl-UniGene entries imported' external_ensembl = OBO_import.swapKeyValues(ens_unigene) use_entrez = 'no' except Exception: ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene') print len(ens_entrez), 'Ensembl-EntrezGene entries imported' external_ensembl = OBO_import.swapKeyValues(ens_entrez) use_entrez = 'yes' gene_location_db = EnsemblImport.getEnsemblGeneLocations( species, 'RNASeq', 'key_by_array') export_bedfile = output_dir + species + '_polyADB_2_predictions.bed' print 'exporting', export_bedfile export_data = export.ExportFile(export_bedfile) header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n' export_data.write(header) fn = filepath(filename) x = 0 not_found = {} for line in open(fn, 'rU').xreadlines(): data = cleanUpLine(line) if x == 0: x = 1 else: siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split( data, '\t') if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC 
convention if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention if species in siteid: if 'NA' not in chr: chr = 'chr' + chr strand = '+' geneid = siteid pos_start = str(int(position) - 1) pos_end = position if use_entrez == 'no': external_geneid = string.join( string.split(siteid, '.')[:2], '.') else: external_geneid = llid if external_geneid in external_ensembl: ens_geneid = external_ensembl[external_geneid][0] geneid += '-' + ens_geneid chr, strand, start, end = gene_location_db[ens_geneid] else: not_found[external_geneid] = [] bed_format = string.join( [chr, pos_start, pos_end, geneid, '0', '-'], '\t' ) + '\n' ### We don't know the strand, so write out both strands export_data.write(bed_format) bed_format = string.join( [chr, pos_start, pos_end, geneid, '0', strand], '\t') + '\n' export_data.write(bed_format) export_data.close()