def importBioMarkerGeneAssociations():
    try: biomarker_files = unique.read_directory('BuildDBs/BioMarkers/')
    except Exception: biomarker_files = unique.read_directory('/BuildDBs/BioMarkers/')
    marker_symbol_db={}
    for file in biomarker_files:
        if '.txt' in file:
            fn = filepath('BuildDBs/BioMarkers/'+file)
            x=0
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data,'\t')
                if x==0: ### Header row - locate the marker and symbol column positions
                    x = 1; y=0
                    for i in t:
                        if 'marker-in' in i: mi = y
                        if 'Symbol' in i: sy = y
                        y+=1
                else:
                    ensembl = t[0]; symbol = string.lower(t[sy]); marker = t[mi]
                    markers = string.split(marker,'|') ### Multiple markers can be pipe-delimited
                    for marker in markers:
                        try: marker_symbol_db[marker].append(symbol)
                        except Exception: marker_symbol_db[marker]=[symbol]
    marker_symbol_db = gene_associations.eliminate_redundant_dict_values(marker_symbol_db)
    return marker_symbol_db
def remoteIndexing(species, fl):
    """ Begin building a gff and index files for SashimiPlot based on the
    AltAnalyze database exon, junction and gene annotations """
    global gff_export_obj
    try:
        ### When fl is a dataset information object
        countsFileDir = fl.CountsFile() ### Counts file containing exon and junction positions
        root_dir = fl.RootDir() ### Root folder location
    except Exception:
        ### A proper object may not be supplied with this information. Use the root directory alone to infer these
        root_dir = fl
        search_dir = root_dir + '/ExpressionInput'
        files = unique.read_directory(search_dir) ### all files in ExpressionInput
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countsFileDir = search_dir + '/' + file ### counts file with exon positions
    PSIFileDir = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt'
    OutputDir = findParentDir(PSIFileDir)
    output = OutputDir + "events_sashimi.gff"
    gff_export_obj = open(output, 'w')
    ### Sometimes only junctions are in the count file so create a new file with detected junctions and all exons
    ### This information and the associated features are extracted from the counts file
    featuresEvaluated = extractFeatures(species, countsFileDir)
    ### Compile and export the coordinates to gff format and index these coordinates for fast retrieval by MISO
    Indexing(featuresEvaluated, PSIFileDir, output)
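### Illustrative usage sketch (not part of the original module); the species
### code and root directory below are assumptions for demonstration only.
def exampleRemoteIndexingUsage():
    """ When no dataset information object is available, the root results
    directory can be passed directly; the counts file is then located under
    ExpressionInput by the except branch above. """
    remoteIndexing('Hs', '/data/MyRNASeqRun')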
def importWikiPathways(selected_species,force):
    if selected_species == None: ### Process all locally supported species
        selected_species = unique.read_directory('/'+database_dir)
    importSpeciesData()
    getSourceData()
    all_species = 'no'
    if force == 'yes':
        try:
            gene_associations.convertAllGPML(selected_species,all_species) ### Downloads GPMLs and builds flat files
            status = 'built'
        except IOError:
            print 'Unable to connect to http://www.wikipathways.org'
            status = 'failed'
    else: status = 'built'
    if status == 'built':
        import BuildAffymetrixAssociations
        for species_code in selected_species:
            species_name = species_names[species_code]
            relationship_types = ['native','mapped']
            for relationship_type in relationship_types:
                #print 'Processing',relationship_type,'relationships'
                index=0
                integrate_affy_associations = 'no'
                incorporate_previous = 'yes'
                process_affygo = 'no'
                counts = BuildAffymetrixAssociations.importWikipathways(source_types,incorporate_previous,process_affygo,species_name,species_code,integrate_affy_associations,relationship_type,'over-write previous')
                index+=1
        print 'Finished integrating updated WikiPathways'
def importCircularRNAEvents(folder, circ_p):
    dataset_events = {}
    files = unique.read_directory(folder)
    for file in files:
        if 'circRNA.' in file and '.txt' in file:
            events = []
            dataset = file[:-4]
            fn = unique.filepath(folder + '/' + file)
            firstRow = True
            for line in open(fn, 'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data, '\t')
                if firstRow:
                    index = 0
                    """ Standard Fields from MultiPath-PSI """
                    for i in t:
                        if 'PValue' == i: pv = index
                        if 'logFC' == i: lf = index
                        index += 1
                    firstRow = False
                else:
                    id = t[0]
                    pval = float(t[pv])
                    logFC = float(t[lf])
                    ci = circInformation(id, pval, logFC)
                    if pval < circ_p:
                        events.append(ci)
            dataset_events[dataset] = events
    return dataset_events
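### Illustrative usage sketch (not part of the original module); the folder
### path and p-value cutoff below are assumptions for demonstration only.
def exampleImportCircularRNAEventsUsage():
    """ Import circRNA events passing a p-value cutoff from all circRNA. text
    files in a folder and report how many were retained per dataset. """
    dataset_events = importCircularRNAEvents('/data/MyRNASeqRun/circRNA', 0.05)
    for dataset in dataset_events:
        print dataset, len(dataset_events[dataset]), 'significant circRNA events'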
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    #add in code to prevent folder names from being included
    dir_list2 = []
    for file in dir_list:
        if '.txt' in file:
            dir_list2.append(file)
    return dir_list2
def combineDropSeq(input_dir):
    import unique
    files = unique.read_directory(input_dir)
    combinedGeneExpression = {}
    for input_file in files: #:70895507-70895600
        header = True
        if '.txt' in input_file:
            for line in open(input_dir + '/' + input_file, 'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data, '\t')
                if header:
                    header_row = line
                    samples = t[1:]
                    header = False
                else:
                    values = map(float, t[1:])
                    gene = t[0]
                    if gene in combinedGeneExpression:
                        prior_values = combinedGeneExpression[gene]
                        ### Element-wise sum of the prior and current counts
                        count_sum_array = [sum(value) for value in zip(*[prior_values, values])]
                    else:
                        count_sum_array = values
                    combinedGeneExpression[gene] = count_sum_array
    export_file = input_dir + '/test.txt'
    export_object = open(export_file, 'w')
    export_object.write(string.join(['UID'] + samples, '\t') + '\n')
    for gene in combinedGeneExpression:
        values = string.join(map(str, [gene] + combinedGeneExpression[gene]), '\t')
        export_object.write(values + '\n')
    export_object.close()
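### Illustrative usage sketch (not part of the original module); the input
### directory below is an assumption for demonstration only.
def exampleCombineDropSeqUsage():
    """ Sum per-gene counts across every tab-delimited matrix in a folder;
    note the combined matrix is written to test.txt inside that same folder. """
    combineDropSeq('/data/DropSeq/counts')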
def getFiles(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = [] ### Return all entries (files and folders) unfiltered
    for entry in dir_list:
        dir_list2.append(entry)
    return dir_list2
def readDirText(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = [] ###Code to prevent folder names from being included
    for entry in dir_list:
        if entry[-4:] == ".txt":
            dir_list2.append(entry)
    return dir_list2
def importReciprocalJunctions(inputpsi,PSIJunctions):
    ### Also include other predicted splicing events
    alt_dir = string.split(inputpsi,'AlternativeOutput')[0]+'AlternativeOutput'
    files = unique.read_directory(alt_dir)
    added=0
    already_added=0
    for file in files:
        if 'ASPIRE-exon-inclusion-results' in file or 'linearregres-exon-inclusion-results' in file:
            alt_exon_path = alt_dir+'/'+file
            header=True
            for line in open(alt_exon_path,'rU').xreadlines():
                line = line.rstrip(os.linesep)
                if header: header=False
                else:
                    t=string.split(line,'\t')
                    inclusion_junction = t[8]
                    exclusion_junction = t[10]
                    pair = inclusion_junction+' '+exclusion_junction
                    pair = string.replace(pair,':','__')
                    if pair in PSIJunctions:
                        already_added+=1
                    else:
                        PSIJunctions.append(pair)
                        added+=1
    return PSIJunctions
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".csv":
            dir_list2.append(entry)
    return dir_list2
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".all" or entry[-5:] == ".data" or entry[-3:] == ".fa":
            dir_list2.append(entry)
    return dir_list2
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    #add in code to prevent folder names from being included
    dir_list2 = []
    for entry in dir_list:
        #if entry[-4:] == ".txt" or entry[-4:] == ".all" or entry[-5:] == ".data" or entry[-3:] == ".fa":
        dir_list2.append(entry)
    return dir_list2
def getFiles(sub_dir,directories=True):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    for entry in dir_list:
        if directories: ### Entries without a '.' are treated as folders
            if '.' not in entry: dir_list2.append(entry)
        else: ### Entries with a '.' are treated as files
            if '.' in entry: dir_list2.append(entry)
    return dir_list2
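### Illustrative usage sketch (not part of the original module); the path
### below is an assumption for demonstration only.
def exampleGetFilesUsage():
    """ The directories flag selects entries by the presence of a '.':
    folder-like names when True, file-like names when False. """
    folders = getFiles('/data/MyRNASeqRun', directories=True)
    files = getFiles('/data/MyRNASeqRun', directories=False)
    print len(folders), 'folders and', len(files), 'files'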
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    #add in code to prevent folder names from being included
    dir_list2 = []
    for file in dir_list:
        lf = string.lower(file)
        if '.txt' in lf or '.sif' in lf or '.tab' in lf:
            dir_list2.append(file)
    return dir_list2
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = [] ###Code to prevent folder names from being included
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".csv" or ".ontology" in entry or '.obo' in entry:
            dir_list2.append(entry)
    return dir_list2
def getValidExpFile(altanalyze_rawexp_dir):
    import unique
    dir_files = unique.read_directory(altanalyze_rawexp_dir)
    valid_file = ''
    for file in dir_files:
        if 'exp.' in file and 'state.txt' not in file and 'feature' not in file:
            valid_file = altanalyze_rawexp_dir + '/' + file
            break
    return valid_file
def matrixImport(filename):
    matrix={}
    original_data={}
    headerRow=True
    for line in open(filename,'rU').xreadlines():
        original_line = line
        data = line.rstrip()
        values = string.split(data,'\t')
        if headerRow:
            group_db={}
            groups=[]
            if ':' in data:
                ### Sample headers are in the form group:sample
                group_sample_list = map(lambda x: string.split(x,':'),values[1:])
                index=1
                for (g,s) in group_sample_list:
                    try: group_db[g].append(index)
                    except Exception: group_db[g] = [index]
                    index+=1
                    if g not in groups: groups.append(g)
            else:
                ### Infer the groups from the associated ExpressionInput groups. file
                import ExpressionBuilder
                search_dir = string.split(filename,'AltResults')[0]+'ExpressionInput'
                files = unique.read_directory(search_dir)
                for file in files:
                    if 'groups.' in file:
                        sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir+'/'+file)
                index=1 ### Positions are relative to the full row (column 0 is the row ID)
                for s in values[1:]:
                    g = sample_group_db[s]
                    try: group_db[g].append(index)
                    except Exception: group_db[g] = [index]
                    index+=1
                    if g not in groups: groups.append(g)
            headerRow = False
            grouped_values=[]
            original_data['header'] = original_line
        else:
            key = values[0]
            grouped_floats=[]
            for g in groups: ### values are still strings at this point
                gvalues_list=[]
                for i in group_db[g]:
                    if values[i] != '0':
                        try: gvalues_list.append(float(values[i]))
                        except Exception: pass
                    else:
                        try: gvalues_list.append('') ### These are missing values
                        except Exception: pass
                grouped_floats.append(gvalues_list)
            matrix[key] = grouped_floats
            if '\n' not in original_line:
                original_line+='\n'
            original_data[key] = original_line
    last_line = line
    return matrix,original_data
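### Illustrative usage sketch (not part of the original module); the demo file
### name, groups and values below are assumptions for demonstration only.
def exampleMatrixImportUsage():
    """ matrixImport accepts headers in the form group:sample; without the
    colon, groups are read from the ExpressionInput groups. file instead. """
    demo = open('demo_matrix.txt', 'w')
    demo.write('UID\twt:s1\twt:s2\tko:s3\tko:s4\n')
    demo.write('GeneA\t1.2\t1.4\t0\t2.1\n') ### '0' is treated as a missing value
    demo.close()
    matrix, original_data = matrixImport('demo_matrix.txt')
    print matrix['GeneA'] ### [[1.2, 1.4], ['', 2.1]] - values grouped wt then ko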
def deleteNestedOntologyFiles(ontology_type):
    program_type,database_dir = unique.whatProgramIsThis()
    current_species_dirs = unique.read_directory('/'+database_dir)
    for species_code in current_species_dirs:
        c = GrabFiles(); c.setdirectory('/'+database_dir+'/'+species_code+'/nested')
        if ontology_type == 'GeneOntology': ontology_type = 'GO'
        file_dirs = c.searchdirectory('-'+ontology_type) ### list all nested files referencing the Ontology type
        for file in file_dirs:
            try: os.remove(filepath(database_dir+'/'+species_code+'/nested/'+file))
            except Exception: null=[]
def read_directory(sub_dir):
    try: dir_list = unique.read_directory(sub_dir)
    except Exception: dir_list = [] ### Directory does not exist
    dir_list2 = [] ###Code to prevent folder names from being included
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".csv":
            dir_list2.append(entry)
    return dir_list2
def downloadDomainAssociations(selected_species):
    paths=[]
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    for species in current_species_dirs:
        url = 'http://www.genmapp.org/go_elite/Databases/ExternalSystems/Domains/'+species+'_Ensembl-Domain.gz'
        fln,status = update.downloadSuppressPrintOuts(url,'BuildDBs/Domains/','txt')
        if 'Internet' not in status:
            paths.append((species,fln))
    return paths
def transferGOSlimGeneAssociations(selected_species):
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    for species_code in current_species_dirs:
        try:
            ens_go_file_dir = filepath(database_dir+'/'+species_code+'/gene-go/Ensembl-GOSlim.txt')
            goslim_ens_file = filepath(database_dir+'/'+species_code+'/uid-gene/Ensembl-goslim_goa.txt')
            export.copyFile(goslim_ens_file,ens_go_file_dir)
            translateToEntrezGene(species_code,ens_go_file_dir)
        except Exception: null=[]
def considerOnlyMammalian(selected_species):
    supported_mammals = ['Am','Bt','Cf','Ch','Cj','Cp','Do','Ec','Ee','Et','Fc','Gg','Go','Hs',
                         'La','Ma','Md','Me','Mi','Ml','Mm','Oa','Oc','Og','Op','Pc','Pp','Pt',
                         'Pv','Rn','Sa','Ss','St','Tb','Tn','Tr','Ts','Tt','Vp']
    filtered_species=[]
    if selected_species == None:
        selected_species = unique.read_directory('/'+database_dir)
    for i in selected_species:
        if i in supported_mammals:
            filtered_species.append(i)
    return filtered_species
def importDiseaseOntologyGeneAssocations():
    disease_ontology_files = unique.read_directory('/BuildDBs/Disease')
    symbol_to_DO={}
    for file in disease_ontology_files:
        if '_do' in file:
            fn = filepath('BuildDBs/Disease/'+file)
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data,'\t')
                if len(t)>1:
                    symbol=string.lower(t[2]); doid = t[4]
                    try: symbol_to_DO[doid].append(symbol)
                    except Exception: symbol_to_DO[doid]=[symbol]
    return symbol_to_DO
def searchDirectory(directory, var, secondary=None):
    directory = unique.filepath(directory)
    files = unique.read_directory(directory)
    for file in files:
        if var in file:
            if secondary == None:
                return directory + '/' + file
            elif secondary in file:
                return directory + '/' + file
    ### if all else fails, return the last file examined
    return directory + '/' + file
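### Illustrative usage sketch (not part of the original module); the directory
### and search string below are assumptions for demonstration only.
def exampleSearchDirectoryUsage():
    """ Return the first file whose name contains var or, when secondary is
    supplied, the first file containing both strings. """
    counts_path = searchDirectory('/data/MyRNASeqRun/ExpressionInput', 'counts.')
    print counts_path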
def importMiRAssociations(selected_species,force):
    supported_databases = unique.read_directory('/'+database_dir)
    if selected_species != None: ### Restrict by selected species
        supported_databases=selected_species
    missing_miR_associations=[]
    found_miR_associations=[]
    for species in supported_databases:
        if force == 'yes':
            try:
                fn = downloadMiRDatabases(species)
                found_miR_associations.append((species,fn))
            except Exception:
                missing_miR_associations.append(species)
    for (species,fn) in found_miR_associations:
        importMiRGeneAssociations(species,fn)
def remoteSashimiPlot(species, fl, bamdir, genelis):
    global inputpsi
    global outputdir
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir + '/ExpressionInput'
        files = unique.read_directory(search_dir)
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countinp = search_dir + '/' + file
    inputpsi = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt'
    #outputdir=findParentDir(inputpsi)+"sashimiplots"
    #outputdir = root_dir + '/ExonPlots'
    outputdir = root_dir + '/SashimiPlots'
    try: os.mkdir(unique.filepath(outputdir))
    except Exception: pass ### Output folder already exists
    #print bamdir
    #print countinp
    #print inputpsi
    #print genelis
    Sashimiplottting(bamdir, countinp, inputpsi, genelis)
    gene_label, gene_sym = genelist(inputpsi)
    for filename in os.listdir(outputdir):
        if '.pdf' in filename:
            newname = string.split(filename, ':')
            if newname[0] in gene_sym:
                new_filename = str(filename)
                if ':' in filename:
                    new_filename = string.split(filename, ':')[1]
                elif '\\' in filename:
                    new_filename = string.split(filename, '\\')[1]
                elif '/' in filename:
                    new_filename = string.split(filename, '/')[1]
                nnname = gene_sym[newname[0]] + '-SashimiPlot_' + new_filename
                os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname))
            else:
                continue
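### Illustrative usage sketch (not part of the original module); every path
### and the species code below are assumptions for demonstration only.
def exampleRemoteSashimiPlotUsage():
    """ Render SashimiPlots for the genes listed in a text file; PDFs are
    written to root_dir/SashimiPlots and renamed with gene-symbol prefixes. """
    remoteSashimiPlot('Hs', '/data/MyRNASeqRun', '/data/MyRNASeqRun/BAMs', '/data/genes.txt')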
def remoteIndexing(species,fl):
    global export_in
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir+'/ExpressionInput'
        files = unique.read_directory(search_dir)
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countinp = search_dir+'/'+file
    inputpsi = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt'
    outputdir=findParentDir(inputpsi)
    output=outputdir+"events_trial.gff"
    export_in=open(output,'w')
    Indexing(countinp,inputpsi,output)
def exportIndexes(input_dir):
    import unique
    bam_dirs = unique.read_directory(input_dir)
    print 'Building BAM index files',
    for file in bam_dirs:
        if string.lower(file[-4:]) == '.bam':
            bam_dir = input_dir+'/'+file
            bamf = pysam.AlignmentFile(bam_dir, "rb")
            ### Is there an indexed .bai for the BAM? Check.
            try:
                for entry in bamf.fetch():
                    codes = map(lambda x: x[0],entry.cigar)
                    break
            except Exception:
                ### Make a BAM index
                print '.',
                bam_dir = str(bam_dir)
                #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
                pysam.index(bam_dir)
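### Illustrative usage sketch (not part of the original module); the BAM folder
### below is an assumption for demonstration only.
def exampleExportIndexesUsage():
    """ Build missing .bai indexes for every BAM in a folder so downstream
    SashimiPlot fetches do not fail. """
    exportIndexes('/data/MyRNASeqRun/BAMs')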
def remoteIndexing(species,fl):
    global export_in
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir+'/ExpressionInput'
        files = unique.read_directory(search_dir)
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countinp = search_dir+'/'+file
    inputpsi = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt'
    outputdir=findParentDir(inputpsi)
    output=outputdir+"events_sashimi.gff"
    export_in=open(output,'w')
    ### Sometimes only junctions are in the count file so create a new file with detected junctions and all exons
    featuresEvaluated = extractFeatures(species,countinp)
    Indexing(featuresEvaluated,inputpsi,output)
def importSplicingEvents(folder):
    dataset_events = {}
    files = unique.read_directory(folder)
    for file in files:
        if 'PSI.' in file and '.txt' in file:
            events = []
            dataset = file[:-4]
            fn = unique.filepath(folder + '/' + file)
            firstRow = True
            for line in open(fn, 'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data, '\t')
                if firstRow:
                    index = 0
                    """ Standard Fields from MultiPath-PSI """
                    for i in t:
                        if 'Event-Direction' == i: ed = index
                        if 'ClusterID' == i: ci = index
                        if 'AltExons' == i: ae = index
                        if 'EventAnnotation' == i: ea = index
                        if 'Coordinates' == i: co = index
                        index += 1
                    firstRow = False
                else:
                    id = t[0]
                    event_direction = t[ed]
                    clusterID = t[ci]
                    altExons = t[ae]
                    coordinates = t[co]
                    ei = EventInformation(id, event_direction, clusterID, altExons, coordinates)
                    events.append(ei)
            dataset_events[dataset] = events
    return dataset_events
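### Illustrative usage sketch (not part of the original module); the folder
### below is an assumption for demonstration only.
def exampleImportSplicingEventsUsage():
    """ Load every PSI. event file in a folder and report the number of
    imported events per dataset. """
    dataset_events = importSplicingEvents('/data/MyRNASeqRun/AltResults/AlternativeOutput')
    for dataset in dataset_events:
        print dataset, len(dataset_events[dataset]), 'splicing events'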