def retreiveAllKnownSpliceSites(returnExonRetention=False,DesignatedSpecies=None,path=None):
    ### Uses a priori strand information when none present
    import export, unique
    chromosomes_found={}
    try: parent_dir = export.findParentDir(bam_file)
    except Exception: parent_dir = export.findParentDir(path)
    species = None
    for file in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file and '.log' in file:
            log_file = unique.filepath(parent_dir+'/'+file)
            log_contents = open(log_file, "rU")
            species_tag = ' species: '
            for line in log_contents:
                line = line.rstrip()
                if species_tag in line:
                    species = string.split(line,species_tag)[1]
    if species == None:
        try: species = IndicatedSpecies
        except Exception: species = DesignatedSpecies
    splicesite_db={}
    gene_coord_db={}
    try:
        if ExonReference==None:
            exon_dir = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
        length = verifyFileLength(exon_dir)
    except Exception:
        #print traceback.format_exc();sys.exit()
        length = 0
    if length==0:
        exon_dir = ExonReference
    refExonCoordinateFile = unique.filepath(exon_dir)
    firstLine=True
    for line in open(refExonCoordinateFile,'rU').xreadlines():
        if firstLine: firstLine=False
        else:
            line = line.rstrip('\n')
            t = string.split(line,'\t') #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
            geneID, exon, chr, strand, start, stop = t[:6]
            spliceEvent = t[-2]
            #start = int(start); stop = int(stop)
            #geneID = string.split(exon,':')[0]
            try:
                gene_coord_db[geneID,chr].append(int(start))
                gene_coord_db[geneID,chr].append(int(stop))
            except Exception:
                gene_coord_db[geneID,chr] = [int(start)]
                gene_coord_db[geneID,chr].append(int(stop))
            if returnExonRetention:
                if 'exclusion' in spliceEvent:
                    splicesite_db[geneID+':'+exon]=[]
            else:
                splicesite_db[chr,start]=strand
                splicesite_db[chr,stop]=strand
                if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
                    chromosomes_found[string.replace(chr,'chr','')] = []
    for i in gene_coord_db:
        gene_coord_db[i].sort()
        gene_coord_db[i] = [gene_coord_db[i][0],gene_coord_db[i][-1]]
    return splicesite_db,chromosomes_found,gene_coord_db
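# Usage note for the return values above (a sketch, not part of the original module):
# when returnExonRetention is False, splicesite_db is keyed by (chromosome, position-as-string)
# with the annotated strand ('+'/'-') as the value, e.g.
#   strand = splicesite_db.get(('chr1', '155160639'))   ### hypothetical coordinate; None if not a known splice site
# gene_coord_db is keyed by (geneID, chromosome) and holds [min_coord, max_coord] after the
# final loop; chromosomes_found maps 'chr'-stripped chromosome names to empty lists.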
def FilterFile(Guidefile, PSI, turn=0):
    if 'Clustering' in Guidefile:
        count = 1
    else:
        count = 0
    val = []
    head = 0
    for line in open(Guidefile, 'rU').xreadlines():
        if head > count:
            line = line.rstrip('\r\n')
            q = string.split(line, '\t')
            val.append(q[0])
        else:
            head += 1
            continue
    dire = export.findParentDir(export.findParentDir(Guidefile)[:-1])
    output_dir = dire + 'SubtypeAnalyses-Results'
    if os.path.exists(output_dir) == False:
        export.createExportFolder(output_dir)
    #output_file = output_dir+'/round'+str(turn)+'/'+export.findFilename(PSI)+'-filtered.txt'
    output_file = output_dir + '/round' + str(turn) + '/' + export.findFilename(PSI)[:-4] + '-filtered.txt'
    try: os.mkdir(output_dir + '/round' + str(turn))
    except: pass ### already exists
    if turn == 1:
        ### No need to filter this file
        shutil.copyfile(PSI, output_file)
    else:
        filterRows(PSI, output_file, filterDB=val)
    return output_file
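# filterRows is called above but not defined in this excerpt. A minimal sketch of what such a
# helper could look like, assuming a tab-delimited file whose first column holds the feature
# IDs matched against filterDB (hypothetical re-implementation, not the project's own version):
def filterRowsSketch(input_file, output_file, filterDB=None):
    eo = open(output_file, 'w')
    firstLine = True
    for line in open(input_file, 'rU').xreadlines():
        if firstLine:
            eo.write(line)  ### always keep the header row
            firstLine = False
        elif filterDB is None or line.split('\t')[0] in filterDB:
            eo.write(line)  ### retain only rows whose ID is in the filter list
    eo.close()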
def getPathwayAs(pathway_db, species_code, mod):
    begin_time = time.time()
    try: export.deleteFolder('BuildDBs/WPs') ### clear any remaining pathway files
    except Exception: null = []
    for wpid in pathway_db:
        file_type = 'gpml'
        wp_id_data = client.service.getPathwayAs(fileType=file_type, pwId=wpid, revision=0)
        wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/' + wpid + '.gpml')
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data)
        outfile.close()
        gene_system_list = string.split(wp_id_data, '\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(parent_path, species_code, mod) ### get GPML data back
        os.remove(gpml_path) ### Only store the file temporarily
    end_time = time.time()
    time_diff = float(end_time - begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print 'Running Combat...',
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)
    moved_exp_dir = export.findParentDir(expr_input_dir) + 'Non-Combat/' + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra='#') ### remove comments from the original file
    except Exception:
        None
    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)
    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    #print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)
    print 'Original expression file over-written with batch effect removal results...'
    ebat.to_csv(expr_input_dir, sep="\t")
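# The pheno table read above is assumed to be tab-delimited with sample names as the index and
# at least 'batch' and 'group' columns (formatPhenoFile is expected to produce this layout).
# A brief illustration of the assumed file layout and how the design matrix is derived from it
# (the file name and column values below are assumptions, not confirmed by this code):
#
#   sample      batch   group
#   sample_1    1       control
#   sample_2    1       treated
#   sample_3    2       control
#
# pheno = pa.read_table('groups_pheno.txt', index_col=0)        # hypothetical file name
# mod = patsy.dmatrix("group", pheno, return_type="dataframe")  # covariates preserved by Combat
# # pheno.batch is then passed to combat() as the per-sample batch assignment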
def getPathwayAs(pathway_db,species_code,mod):
    begin_time = time.time()
    for wpid in pathway_db:
        #print [wpid],'pathway_db',len(pathway_db)
        file_type = 'gpml'
        #file_type = 'application/gpml+xml'
        processor_time = str(time.clock())
        #try: export.deleteFolder('BuildDBs/WPs') ### clear any remaining pathway files
        #except Exception: pass
        #wp_id_data = client.service.getPathwayAs(fileType = file_type,pwId = wpid, revision = 0)
        kwargs = {
            'identifier': 'WP2062',
            'version': 0,
            'file_format': 'application/gpml+xml'}
        #wp_id_data = wikipathways_api_client_instance.get_pathway_as(**kwargs)
        wp_id_data = wikipathways_api_client_instance.get_pathway_as(file_format = file_type,identifier = wpid, version = 0)
        #wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/'+processor_time+'/'+wpid+'.gpml')
        #print gpml_path
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data); outfile.close()
        gene_system_list = string.split(wp_id_data,'\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(parent_path,species_code,mod) ### get GPML data back
        #os.remove(gpml_path) ### Only store the file temporarily
        try: export.deleteFolder('BuildDBs/WPs/'+processor_time) ### clear any remaining pathway files
        except Exception: pass
    end_time = time.time(); time_diff = float(end_time-begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    ud = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location() ### Only one entry
    dir = export.findParentDir(filename)
    dir = string.replace(dir,'hGlue','') ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename: ### Exclude for Affymetrix annotation files
        print_out = "File:\n"+url+"\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv)<2:
            try:
                UI.WarningWindow(print_out,'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded';force_error
        else:
            print 'cannot be downloaded';force_error
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
def retreiveAllKnownSpliceSites():
    ### Uses a priori strand information when none present
    import export, unique
    chromosomes_found={}
    parent_dir = export.findParentDir(bam_file)
    species = None
    for file in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file and '.log' in file:
            log_file = unique.filepath(parent_dir+'/'+file)
            log_contents = open(log_file, "rU")
            species_tag = ' species: '
            for line in log_contents:
                line = line.rstrip()
                if species_tag in line:
                    species = string.split(line,species_tag)[1]
    if species == None:
        species = IndicatedSpecies
    splicesite_db={}
    refExonCoordinateFile = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
    firstLine=True
    for line in open(refExonCoordinateFile,'rU').xreadlines():
        if firstLine: firstLine=False
        else:
            line = line.rstrip('\n')
            t = string.split(line,'\t') #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
            geneID, exon, chr, strand, start, stop = t[:6]
            #start = int(start); stop = int(stop)
            #geneID = string.split(exon,':')[0]
            splicesite_db[chr,start]=strand
            splicesite_db[chr,stop]=strand
            if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
                chromosomes_found[string.replace(chr,'chr','')] = []
    return splicesite_db,chromosomes_found
def unzipFiles(filename,dir):
    import zipfile
    output_filepath = filepath(dir+'/'+filename)
    try:
        zfile = zipfile.ZipFile(output_filepath)
        for name in zfile.namelist():
            if name.endswith('/'): null=[] ### Don't need to export
            else:
                if 'EnsMart' in name and 'EnsMart' in dir:
                    dir = export.findParentDir(dir[:-1]) ### Remove EnsMart suffix directory
                try: outfile = export.ExportFile(filepath(dir+name))
                except Exception: outfile = export.ExportFile(filepath(dir+name[1:]))
                outfile.write(zfile.read(name)); outfile.close()
        #print 'Zip extracted to:',output_filepath
        status = 'completed'
    except Exception, e:
        try:
            ### Use the operating system's unzip if all else fails
            extracted_path = string.replace(output_filepath,'.zip','')
            try: os.remove(extracted_path) ### This is necessary, otherwise the empty file created above will require user authorization to delete
            except Exception: null=[]
            subprocessUnzip(dir,output_filepath)
            status = 'completed'
        except IOError:
            print e
            print 'WARNING!!!! The zip file',output_filepath,'does not appear to be a valid zip archive file or is corrupt.'
            status = 'failed'
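# subprocessUnzip is used above as a fallback but is not defined in this excerpt. A minimal
# sketch of what such a helper could do, shelling out to the system's unzip utility
# (hypothetical helper name and behavior, assuming a Unix-like environment with unzip installed):
def subprocessUnzipSketch(dir, output_filepath):
    import subprocess
    ### -o overwrites existing files without prompting; -d sets the extraction directory
    retcode = subprocess.call(['unzip', '-o', output_filepath, '-d', dir])
    return retcode == 0  ### True if the system unzip reported success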
def covertAffyFormatToBED(filename, ConversionDB=None):
    print 'processing:',filename
    parent = export.findParentDir(filename)
    if ConversionDB==None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file,'mm9','mm10')
    export_obj = export.ExportFile(parent+'/'+output_file)
    fn=filepath(filename); entry_count=0; readfiles = False
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0]=='#':
            readfiles = False
        elif readfiles==False:
            readfiles = True
            if ConversionDB!=None:
                export_obj.write(line) ### Write header
        else:
            try:
                t = string.split(data[1:-1],'","')
                probeset_id,chr,strand,start,stop = t[:5]
                int(start)
                if ConversionDB==None:
                    if 'chr' in chr:
                        export_obj.write(chr+'\t'+start+'\t'+stop+'\t'+probeset_id+'\n')
                else:
                    chr,start,stop = ConversionDB[probeset_id]
                    t = [probeset_id,chr,strand,start,stop] + t[5:]
                    values = '"'+string.join(t,'","')+'"\n'
                    export_obj.write(values)
                entry_count+=1
            except Exception: pass
    export_obj.close()
    print entry_count, 'entries saved to:',parent+'/'+output_file
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    ud = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location() ### Only one entry
    dir = export.findParentDir(filename)
    dir = string.replace(dir,'hGlue','') ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    print url
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename: ### Exclude for Affymetrix annotation files
        print_out = "File:\n"+url+"\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv)<2:
            try:
                UI.WarningWindow(print_out,'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded';force_error
        else:
            print 'cannot be downloaded';force_error
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print "Running Combat...",
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)
    moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print "Moved original expression file to:"
        print "\t" + moved_exp_dir
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra="#")  ### remove comments from the original file
    except Exception:
        None
    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)
    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    # print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)
    print "Original expression file over-written with batch effect removal results..."
    ebat.to_csv(expr_input_dir, sep="\t")
def normalizeDataset(filename, output=None, normalization='quantile', platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """
    if output == None:
        output = filename
    moved_exp_dir = export.findParentDir(filename) + 'Non-Normalized/' + export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
    except Exception:
        None
    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output, sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir, filename, platform)
    print 'Exported expression input file to:', output
def retreiveAllKnownSpliceSites():
    ### Uses a priori strand information when none present
    import export, unique
    chromosomes_found={}
    parent_dir = export.findParentDir(bam_file)
    for file in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file and '.log' in file:
            log_file = unique.filepath(parent_dir+'/'+file)
            log_contents = open(log_file, "rU")
            species_tag = ' species: '
            for line in log_contents:
                line = line.rstrip()
                if species_tag in line:
                    species = string.split(line,species_tag)[1]
    splicesite_db={}
    refExonCoordinateFile = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
    firstLine=True
    for line in open(refExonCoordinateFile,'rU').xreadlines():
        if firstLine: firstLine=False
        else:
            line = line.rstrip('\n')
            t = string.split(line,'\t') #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
            geneID, exon, chr, strand, start, stop = t[:6]
            #start = int(start); stop = int(stop)
            #geneID = string.split(exon,':')[0]
            splicesite_db[chr,start]=strand
            splicesite_db[chr,stop]=strand
            if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
                chromosomes_found[string.replace(chr,'chr','')] = []
    return splicesite_db,chromosomes_found
def downloadCurrentVersion(filename, secondary_dir, file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    uds = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    for ud in uds: url_dir = ud.Location() ### Only one entry
    dir = export.findParentDir(filename)
    filename = export.findFilename(filename)
    url = url_dir + secondary_dir + '/' + filename
    file, status = download(url, dir, file_type)
    continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n" + url + "\ncould not be found on server or internet connection is unavailable."
        try:
            UI.WarningWindow(print_out, 'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded'
            die
    elif status == 'remove':
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
def visualizePathwayAssociations(filename, species, mod_type, wpid, imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file, 'w')
    if wpid == None:
        force_invalid_pathway
    global mod
    global species_code
    global graphic_link
    graphic_link = {}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename, wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir, 'GO-Elite/input', 'WikiPathways')
    else:
        root_dir += 'WikiPathways/'
    analysis_type = 'Genes'
    id_db, column_headers = importDataSimple(filename, 'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db = {}
    pathway_db[wpid] = PathwayData(None) ### only need to analyze object (method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db, species_code, mod)
    log_report.write('Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(id_db, analysis_type) ### example id_db is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db, pathway_db, 'MOD')
    if imageExport != 'png':
        file_type = 'pdf' ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, '-' + criterion_name, WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png' ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, '-' + criterion_name, WPID=wpid)
    log_report.write('Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
def viewLineageProfilerResults(filename, graphic_links):
    global graphic_link
    graphic_link = graphic_links ### This is a list of tuples containing name and file location
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file, 'w')
    root_dir = export.findParentDir(filename)
    root_dir = string.replace(root_dir, 'ExpressionOutput/Clustering', 'DataPlots')
    if 'DataPlots' not in root_dir: ### Occurs when directly supplying an input matrix by the user
        root_dir += '/DataPlots/'
        try: os.mkdir(root_dir) ### May need to create this directory
        except Exception: None
    id_db, column_headers = importDataSimple(filename, 'LineageProfiler')
    log_report.write('LineageProfiler input ID file imported successfully\n')
    pathway_db = {}
    pathway_db['WP2062'] = PathwayData('TissueFateMap')
    ### MOD and species are not particularly important for Lineage analysis
    pathway_db = getPathwayAs(pathway_db, 'Hs', 'Ensembl')
    log_report.write('Pathway data imported from GPML files obtained from webservice\n')
    i = 0
    group_id_db = {} ### store the results separately for each sample
    ### When analyzing z-scores, you can have multiple samples you wish to visualize results for (not so for regulated gene lists)
    for biological_group in column_headers:
        group_id_db[biological_group] = db = {}
        for gene in id_db:
            group_id_db[biological_group][gene] = id_db[gene][i] ### get the index value of that biological group (z-score change)
        i += 1
    for biological_group in group_id_db:
        group_specific = group_id_db[biological_group]
        analysis_type = 'Lineage'
        id_color_db = getHexadecimalColorRanges(group_specific, analysis_type) ### example "id_db" is key:tissue, value:z-score
        graphID_db = getGraphIDAssociations(id_color_db, pathway_db, 'Label')
        file_type = 'png' ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, '-' + biological_group)
        file_type = 'pdf' ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, '-' + biological_group)
    log_report.write('Pathways colored and images saved to disk. Exiting webservice.\n')
    log_report.close()
    return graphic_link
def viewLineageProfilerResults(filename, graphic_links):
    global graphic_link
    graphic_link = graphic_links ### This is a list of tuples containing name and file location
    ### Log any potential problems
    log_file = filepath("webservice.log")
    log_report = open(log_file, "w")
    root_dir = export.findParentDir(filename)
    root_dir = string.replace(root_dir, "ExpressionOutput/Clustering", "DataPlots")
    if "DataPlots" not in root_dir: ### Occurs when directly supplying an input matrix by the user
        root_dir += "/DataPlots/"
        try: os.mkdir(root_dir) ### May need to create this directory
        except Exception: None
    id_db, column_headers = importDataSimple(filename, "LineageProfiler")
    log_report.write("LineageProfiler input ID file imported successfully\n")
    pathway_db = {}
    pathway_db["WP2062"] = PathwayData("TissueFateMap")
    ### MOD and species are not particularly important for Lineage analysis
    pathway_db = getPathwayAs(pathway_db, "Hs", "Ensembl")
    log_report.write("Pathway data imported from GPML files obtained from webservice\n")
    i = 0
    group_id_db = {} ### store the results separately for each sample
    ### When analyzing z-scores, you can have multiple samples you wish to visualize results for (not so for regulated gene lists)
    for biological_group in column_headers:
        group_id_db[biological_group] = db = {}
        for gene in id_db:
            group_id_db[biological_group][gene] = id_db[gene][i] ### get the index value of that biological group (z-score change)
        i += 1
    for biological_group in group_id_db:
        group_specific = group_id_db[biological_group]
        analysis_type = "Lineage"
        id_color_db = getHexadecimalColorRanges(group_specific, analysis_type) ### example "id_db" is key:tissue, value:z-score
        graphID_db = getGraphIDAssociations(id_color_db, pathway_db, "Label")
        file_type = "png" ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, "-" + biological_group)
        file_type = "pdf" ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, "-" + biological_group)
    log_report.write("Pathways colored and images saved to disk. Exiting webservice.\n")
    log_report.close()
    return graphic_link
def getPathwayAs(pathway_db, species_code, mod):
    begin_time = time.time()
    for wpid in pathway_db:
        #print [wpid],'pathway_db',len(pathway_db)
        file_type = 'gpml'
        #file_type = 'application/gpml+xml'
        processor_time = str(time.clock())
        #try: export.deleteFolder('BuildDBs/WPs') ### clear any remaining pathway files
        #except Exception: pass
        #wp_id_data = client.service.getPathwayAs(fileType = file_type,pwId = wpid, revision = 0)
        kwargs = {
            'identifier': 'WP2062',
            'version': 0,
            'file_format': 'application/gpml+xml'
        }
        #wp_id_data = wikipathways_api_client_instance.get_pathway_as(**kwargs)
        wp_id_data = wikipathways_api_client_instance.get_pathway_as(
            file_format=file_type, identifier=wpid, version=0)
        #wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/' + processor_time + '/' + wpid + '.gpml')
        #print gpml_path
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data)
        outfile.close()
        gene_system_list = string.split(wp_id_data, '\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(parent_path, species_code, mod) ### get GPML data back
        #os.remove(gpml_path) ### Only store the file temporarily
        try: export.deleteFolder('BuildDBs/WPs/' + processor_time) ### clear any remaining pathway files
        except Exception: pass
    end_time = time.time()
    time_diff = float(end_time - begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
def importConvertedBED(filename):
    print 'processing:',filename
    parent = export.findParentDir(filename)
    fn=filepath(filename); entry_count=0; newCoordinates={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0]!='#':
            try:
                t = string.split(data,'\t')
                chr,start,stop,probeset_id = t
                int(start)
                if 'chr' in chr:
                    entry_count+=1
                    newCoordinates[probeset_id] = chr,start,stop
            except ZeroDivisionError: pass
    print entry_count, 'imported and saved.'
    return newCoordinates
def FilterFile(Guidefile,Guidefile_block,PSI,turn):
    if 'Clustering' in Guidefile:
        count=1
        flag=True
        rank_Count=0
        prev=0
    else:
        count=0
    val=[]
    head=0
    print Guidefile_block
    for line in open(Guidefile_block,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            if flag:
                if int(q[1])==prev:
                    continue
                else:
                    rank_Count+=1
                    prev=int(q[1])
        else:
            head+=1
            continue
    head=0
    print Guidefile
    for line in open(Guidefile,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            val.append(q[0])
        else:
            head+=1
            continue
    dire = export.findParentDir(PSI)
    output_dir = dire+'OncoInputs'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    output_file = output_dir+'/NMFInput-Round'+str(turn)+'.txt'
    filterRows(PSI,output_file,filterDB=val)
    return output_file,rank_Count
def importConvertedBED(filename):
    print 'processing:', filename
    parent = export.findParentDir(filename)
    fn = filepath(filename)
    entry_count = 0
    newCoordinates = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0] != '#':
            try:
                t = string.split(data, '\t')
                chr, start, stop, probeset_id = t
                int(start)
                if 'chr' in chr:
                    entry_count += 1
                    newCoordinates[probeset_id] = chr, start, stop
            except ZeroDivisionError:
                pass
    print entry_count, 'imported and saved.'
    return newCoordinates
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """
    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
    if output == None:
        output = filename
    moved_exp_dir = export.findParentDir(filename) + "Non-Quantile/" + export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print "Moved original expression file to:"
        print "\t" + moved_exp_dir
    except Exception:
        None
    exportExpressionData(output, sample_expression_db)
    print "Exported expression input file to:", output
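# RNASeq.quantileNormalizationSimple is not shown in this excerpt. A minimal sketch of the
# underlying technique (rank each sample, then replace every value with the mean of all
# samples at that rank), assuming sample_expression_db maps sample name -> {feature: value};
# this is an illustration of quantile normalization, not the project's implementation:
def quantileNormalizeSketch(sample_expression_db):
    import numpy as np
    samples = sorted(sample_expression_db)
    features = sorted(sample_expression_db[samples[0]])
    matrix = np.array([[sample_expression_db[s][f] for f in features] for s in samples]).T
    order = np.argsort(matrix, axis=0)                  ### per-sample ordering of values
    rank_means = np.sort(matrix, axis=0).mean(axis=1)   ### mean across samples at each rank
    normalized = np.empty_like(matrix)
    for j in range(matrix.shape[1]):
        normalized[order[:, j], j] = rank_means         ### write rank means back in original order
    return {s: dict(zip(features, normalized[:, j])) for j, s in enumerate(samples)}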
def normalizeDataset(filename,output = None, normalization='quantile',platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """
    if output==None: output = filename
    moved_exp_dir = export.findParentDir(filename)+'Non-Normalized/'+export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t'+moved_exp_dir
    except Exception: None
    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output,sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir,filename,platform)
    print 'Exported expression input file to:',output
def visualizePathwayAssociations(filename,species,mod_type,wpid,imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file,'w')
    if wpid == None: force_invalid_pathway
    global mod
    global species_code
    global graphic_link
    graphic_link={}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename,wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir,'GO-Elite/input','WikiPathways')
    else:
        root_dir+='WikiPathways/'
    analysis_type = 'Genes'
    id_db,column_headers = importDataSimple(filename,'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db={}
    pathway_db[wpid] = PathwayData(None) ### only need to analyze object (method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db,species_code,mod)
    log_report.write('Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(id_db,analysis_type) ### example id_db is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db,pathway_db,'MOD')
    if imageExport != 'png':
        file_type = 'pdf' ### svg, pdf, png
        getColoredPathway(root_dir,graphID_db,file_type,'-'+criterion_name,WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png' ### svg, pdf, png
        getColoredPathway(root_dir,graphID_db,file_type,'-'+criterion_name,WPID=wpid)
    log_report.write('Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """
    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
    if output == None:
        output = filename
    moved_exp_dir = export.findParentDir(filename) + 'Non-Quantile/' + export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
    except Exception:
        None
    exportExpressionData(output, sample_expression_db)
    print 'Exported expression input file to:', output
def covertAffyFormatToBED(filename, ConversionDB=None):
    print 'processing:', filename
    parent = export.findParentDir(filename)
    if ConversionDB == None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file, 'mm9', 'mm10')
    export_obj = export.ExportFile(parent + '/' + output_file)
    fn = filepath(filename)
    entry_count = 0
    readfiles = False
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0] == '#':
            readfiles = False
        elif readfiles == False:
            readfiles = True
            if ConversionDB != None:
                export_obj.write(line) ### Write header
        else:
            try:
                t = string.split(data[1:-1], '","')
                probeset_id, chr, strand, start, stop = t[:5]
                int(start)
                if ConversionDB == None:
                    if 'chr' in chr:
                        export_obj.write(chr + '\t' + start + '\t' + stop + '\t' + probeset_id + '\n')
                else:
                    chr, start, stop = ConversionDB[probeset_id]
                    t = [probeset_id, chr, strand, start, stop] + t[5:]
                    values = '"' + string.join(t, '","') + '"\n'
                    export_obj.write(values)
                entry_count += 1
            except Exception:
                pass
    export_obj.close()
    print entry_count, 'entries saved to:', parent + '/' + output_file
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    uds = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    for ud in uds: url_dir = ud.Location() ### Only one entry
    dir = export.findParentDir(filename)
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n"+url+"\ncould not be found on server or internet connection is unavailable."
        try:
            UI.WarningWindow(print_out,'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded';die
    elif status == 'remove':
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
def NMFAnalysis(filename,Rank,turn=0,strategy="conservative"): X=[] header=[] head=0 exportnam=export.findParentDir(filename)+'/NMF/round'+str(turn)+'NMFsnmf_versionr.txt'#+str(Rank)+'.txt' export_res=export.ExportFile(exportnam) exportnam_bin=export.findParentDir(filename)+'/NMF/round'+str(turn)+'NMFsnmf_binary.txt'#+str(Rank)+'.txt' export_res1=export.ExportFile(exportnam_bin) exportnam_bint=export.findParentDir(filename)+'/NMF/round'+str(turn)+'NMFsnmf_binary_t_.txt'#+str(Rank)+'.txt' export_res5=export.ExportFile(exportnam_bint) exportnam2=export.findParentDir(filename)+'/SubtypeAnalyses/round'+str(turn)+'Metadata.txt'#+str(Rank)+'.txt' export_res2=export.ExportFile(exportnam2) exportnam3=export.findParentDir(filename)+'/SubtypeAnalyses/round'+str(turn)+'Annotation.txt'#+str(Rank)+'.txt' export_res3=export.ExportFile(exportnam3) if 'Clustering' in filename: count=1 start=2 else: count=0 start=1 print filename for line in open(filename,'rU').xreadlines(): line=line.rstrip('\r\n') q= string.split(line,'\t') if head >count: val=[] val2=[] me=0.0 for i in range(start,len(q)): try: val2.append(float(q[i])) except Exception: continue me=np.median(val2) for i in range(start,len(q)): try: val.append(float(q[i])) except Exception: val.append(float(me)) X.append(val) else: export_res1.write(line) export_res.write(line) export_res1.write("\n") export_res.write("\n") header=q head+=1 continue group=defaultdict(list) sh=[] X=np.array(X) mat=[] mat=zip(*X) mat=np.array(mat) nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=10,track_factor=True) nmf_fit = nmf() W = nmf_fit.basis() W=np.array(W) H=nmf_fit.coef() H=np.array(H) sh=W.shape export_res3.write("uid\tUID\tUID\n") if int(Rank)==2: par=1 else: par=2 W=zip(*W) W=np.array(W) sh=W.shape Z=[] for i in range(sh[0]): new_val=[] val=W[i,:] num=sum(i > 0.10 for i in val) if num >40 or num <3: compstd=True else: compstd=False me=np.mean(val) st=np.std(val) #print 'V'+str(i) export_res.write('V'+str(i)) export_res1.write('V'+str(i)) for j in range(sh[1]): if compstd: if float(W[i][j])>=float(me+(par*st)): export_res1.write("\t"+str(1)) new_val.append(1) else: export_res1.write("\t"+str(0)) new_val.append(0) else: if float(W[i][j])>0.1: export_res1.write("\t"+str(1)) new_val.append(1) else: export_res1.write("\t"+str(0)) new_val.append(0) export_res.write("\t"+str(W[i][j])) Z.append(new_val) export_res.write("\n") export_res1.write("\n") Z=np.array(Z) sh=Z.shape Z_new=[] val1=[] Z1=[] dellst=[] export_res2.write("uid") export_res5.write("uid") for i in range(sh[0]): indices=[] val1=Z[i,:] sum1=sum(val1) flag=False indices=[index for index, value in enumerate(val1) if value == 1] for j in range(sh[0]): val2=[] if i!=j: val2=Z[j,:] sum2=sum([val2[x] for x in indices]) summ2=sum(val2) try: if float(sum2)/float(sum1)>0.5: if summ2>sum1: flag=True #print str(i) except Exception: continue if flag==False: Z1.append(val1) export_res2.write("\t"+'V'+str(i)) export_res5.write("\t"+'V'+str(i)) export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n") export_res2.write("\n") export_res5.write("\n") Z1=np.array(Z1) Z=Z1 Z=zip(*Z) Z=np.array(Z) sh=Z.shape print "stringency = ",[strategy] for i in range(sh[0]): val1=Z[i,:] #print sum(val1) #if sum(val)>2: if sum(val1)>2: val=[0 if x==1 else x for x in val1] else: val=val1 me=np.mean(val) st=np.std(val) export_res2.write(header[i+1]) export_res5.write(header[i+1]) for j in range(sh[1]): if strategy=="conservative": #print header[i+1] export_res2.write("\t"+str(val1[j])) 
export_res5.write("\t"+str(val1[j])) else: #print header[i+1] export_res2.write("\t"+str(val[j])) export_res5.write("\t"+str(val[j])) export_res2.write("\n") export_res5.write("\n") Z_new.append(val) Z_new=zip(*Z_new) Z_new=np.array(Z_new) sh=Z_new.shape export_res5.close() Orderedheatmap.Classify(exportnam_bint) return exportnam,exportnam_bin,exportnam2,exportnam3
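# A minimal sketch of the nimfa calls used in NMFAnalysis above, run on a small random matrix,
# showing where the basis (W) and coefficient (H) matrices come from. The toy matrix shape and
# rank below are assumptions for illustration only:
import numpy as np
import nimfa

mat = np.random.rand(20, 10)  ### e.g., samples x splicing events
nmf = nimfa.Snmf(mat, seed="nndsvd", rank=2, max_iter=20, n_run=10, track_factor=True)
nmf_fit = nmf()
W = np.array(nmf_fit.basis())  ### per-sample loadings for each NMF component
H = np.array(nmf_fit.coef())   ### per-feature weights for each NMF component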
def performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy):
    """ Run NMF and determine the number of valid clusters based on the magnitude of detected differential splicing """
    use_adjusted_p=True
    print "Running NMF analyses for dimension reduction using "+str(k)+" k - Round"+str(AnalysisRound)
    NMFResult,BinarizedOutput,metaData,Annotation=NMF_Analysis.NMFAnalysis(NMFinput,k,AnalysisRound,strategy) ### This is where we get the correct version
    print "Running metaData Analyses for finding differential splicing events"
    rootdir,CovariateQuery=metaDataAnalysis.remoteAnalysis(species,filtered_EventAnnot_dir,metaData,'PSI',0.1,use_adjusted_p,0.05,Annotation)
    counter=1
    dPSI_results_dir=rootdir+CovariateQuery
    global upd_guides
    upd_guides=[]
    name=[]
    group=[]
    grplst=[]
    for filename in os.listdir(dPSI_results_dir):
        if filename.startswith("PSI."):
            dPSI_results_fn=os.path.join(dPSI_results_dir, filename)
            dPSI_comparison_alt_name=string.replace(filename,"PSI.","")
            omitcluster=FindTopUniqueEvents(dPSI_results_fn,dPSI_comparison_alt_name,dPSI_results_dir)
            if omitcluster==0: ### Hence, clustering succeeded and did not fail in this dPSI comparison
                group.append(counter)
                name.append(string.replace(filename,"PSI.",""))
                counter+=1
    print counter, 'robust splicing subtypes identified in round',AnalysisRound
    if counter>0: #counter>2 --- changed to 0 to force NMF
        dire = export.findParentDir(full_PSI_InputFile)
        output_dir = dire+'OncoInputs'
        if os.path.exists(output_dir)==False:
            export.createExportFolder(output_dir)
        output_file = output_dir+'/SVMInput-Round'+str(AnalysisRound)+'.txt'
        ExpandSampleClusters.filterRows(full_PSI_InputFile,output_file,filterDB=upd_guides,logData=False)
        header=ExpandSampleClusters.header_file(output_file)
        print "Running SVM prediction for improved subtypes - Round"+str(AnalysisRound)
        #print 'AAAAAAAAAAAAAAAAAAAAAAAA',output_file
        #print 'BBBBBBBBBBBBBBBBBBBBBBBB',BinarizedOutput
        train=ExpandSampleClusters.TrainDataGeneration(output_file,BinarizedOutput,name)
        grplst.append(group)
        ExpandSampleClusters.Classify(header,train,output_file,grplst,name,AnalysisRound) ### This is where we write the wrong version
        header=Correlationdepletion.header_file(NMFResult)
        output_file=output_dir+'/DepletionInput-Round'+str(AnalysisRound)+".txt"
        sampleIndexSelection.filterFile(full_PSI_InputFile,output_file,header)
        print "Running Correlation Depletion - Round"+str(AnalysisRound)
        commonkeys,count=Correlationdepletion.FindCorrelations(NMFResult,output_file,name)
        Depleted=Correlationdepletion.DepleteSplicingevents(commonkeys,output_file,count,full_PSI_InputFile)
        full_PSI_InputFile=Depleted
        flag=True ### Indicates that K-means was not run - hence, another round of splice-ICGS should be performed
    """
    else:
        try:
            print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
            header=[]
            header=Kmeans.header_file(dPSI_results_fn_block)
            Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
            flag=True
        except Exception:
            print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
            print traceback.format_exc()
            AnalysisRound = True
    """
    return flag,full_PSI_InputFile
def Enrichment(Inputfile,mutdict,mutfile,Expand,header): import collections import mappfinder X=defaultdict(list) prev="" head=0 group=defaultdict(list) enrichdict=defaultdict(float) mut=export.findFilename(mutfile) dire=export.findParentDir(Inputfile) output_dir = dire+'MutationEnrichment' export.createExportFolder(output_dir) exportnam=output_dir+'/Enrichment_Results.txt' export_enrich=open(exportnam,"w") exportnam=output_dir+'/Enrichment_tophits.txt' export_hit=open(exportnam,"w") export_enrich.write("Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n") if Expand=="yes": header2=header_file(Inputfile,Expand="yes") for line in open(Inputfile,'rU').xreadlines(): if head >0: line=line.rstrip('\r\n') q= string.split(line,'\t') for i in range(1,len(q)): if q[i]==str(1): #group[q[0]].append(header2[i-1]) group[header2[i-1]].append(q[0]) else: head+=1 continue else: for line in open(Inputfile,'rU').xreadlines(): line=line.rstrip('\r\n') line=string.split(line,'\t') #for i in range(1,len(line)): group[line[2]].append(line[0]) total_Scores={} for kiy in mutdict: if kiy =="MDP": print mutdict[kiy] groupdict={} remaining=[] remaining=list(set(header) - set(mutdict[kiy])) groupdict[1]=mutdict[kiy] groupdict[2]=remaining # export_enrich1.write(kiy) for key2 in group: r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy])))) n=float(len(group[key2])) R=float(len(set(mutdict[kiy]))) N=float(len(header)) if r==0 or R==1.0: print kiy,key2,r,n,R,N pval=float(1) z=float(0) null_z = 0.000 zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) else: try: z = Zscore(r,n,N,R) except : z = 0.0000 ### Calculate a Z-score assuming zero matching entries try: null_z = Zscore(0,n,N,R) except Exception: null_z = 0.000 try: pval = mappfinder.FishersExactTest(r,n,R,N) zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) except Exception: pval=1.0 zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) #pass if kiy in total_Scores: signature_db = total_Scores[kiy] signature_db[key2]=zsd ### Necessary format for the permutation function else: signature_db={key2:zsd} total_Scores[kiy] = signature_db sorted_results=[] mutlabels={} for kiy in total_Scores: signature_db = total_Scores[kiy] ### Updates the adjusted p-value instances mappfinder.adjustPermuteStats(signature_db) for signature in signature_db: zsd = signature_db[signature] results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|') sorted_results.append([signature,float(zsd.PermuteP()),results]) sorted_results.sort() ### Sort by p-value prev="" for (sig,p,values) in sorted_results: if sig!=prev: flag=True export_hit.write(string.join(values,'\t')+'\n') if flag: if (float(values[5])>=0.5 and float(values[6])>=0.5) or float(values[5])>=0.6 : mutlabels[values[1]]=values[0] flag=False export_hit.write(string.join(values,'\t')+'\n') export_enrich.write(string.join(values,'\t')+'\n') prev=sig if len(sorted_results)==0: export_enrich.write(string.join([splicing_factor,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n') export_enrich.close() #print mutlabels return mutlabels
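# Zscore(r,n,N,R) is called in Enrichment above but not defined in this excerpt. A sketch of
# the standard MAPPFinder-style enrichment z-score (normal approximation to the hypergeometric
# distribution); the parameter semantics below are inferred from the calling code and are
# assumptions, not a confirmed copy of the project's own Zscore function:
def ZscoreSketch(r, n, N, R):
    """ r = overlapping members, n = cluster size, R = mutation-set size, N = total measured """
    import math
    N = float(N)
    expected = n * R / N  ### expected overlap by chance
    variance = n * (R / N) * (1.0 - R / N) * (1.0 - (n - 1.0) / (N - 1.0))
    return (r - expected) / math.sqrt(variance)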
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq): species = "Hs" row_method = 'hopach' column_method = 'hopach' row_metric = 'correlation' column_metric = 'euclidean' color_gradient = 'yellow_black_blue' contrast = 3 vendor = "RNASeq" GeneSelection = '' PathwaySelection = '' GeneSetSelection = 'None Selected' excludeCellCycle = False #rho_cutoff = 0.4 restrictBy = 'protein_coding' featurestoEvaluate = 'Genes' ExpressionCutoff = 0 CountsCutoff = 0 FoldDiff = 1.2 SamplesDiffering = 4 JustShowTheseIDs = '' removeOutliers = False PathwaySelection = [] array_type = "RNASeq" #rho_cutoff=0.4 gsp = UI.GeneSelectionParameters(species, array_type, vendor) gsp.setGeneSet(GeneSetSelection) gsp.setPathwaySelect(PathwaySelection) gsp.setGeneSelection(GeneSelection) gsp.setJustShowTheseIDs(JustShowTheseIDs) gsp.setNormalize('median') gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff, SamplesDiffering, removeOutliers, featurestoEvaluate, restrictBy, excludeCellCycle, column_metric, column_method, rho_cutoff) #Run splice ICGS """import UI species='Mm'; platform = "3'array"; vendor = 'Ensembl' gsp = UI.GeneSelectionParameters(species,platform,vendor) gsp.setGeneSet('None Selected') gsp.setPathwaySelect('') gsp.setGeneSelection('') gsp.setJustShowTheseIDs('') gsp.setNormalize('median') gsp.setSampleDiscoveryParameters(0,0,1.5,3, False,'PSI','protein_coding',False,'cosine','hopach',0.35)""" FilteredEventAnnot = filterEventAnnotation.FilterFile( InputFile, EventAnnot, turn) try: print "Running splice-ICGS for feature selection - Round" + str(turn) #except Exception:Rank=0 graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp, exp_threshold=0, rpkm_threshold=0, parameters=gsp) Guidefile = graphic_links3[-1][-1] Guidefile = Guidefile[:-4] + '.txt' print "Running block identification for rank analyses - Round" + str( turn) RNASeq_blockIdentification.correlateClusteredGenesParameters( Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50, ReDefinedClusterBlocks=True, filter=True) Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt' NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn) except Exception: print 'UNKNOWN ERROR!!!!!' 
print traceback.format_exc() Rank = 0 if Rank > 1: print 'Current turn:', turn, 'k =', if turn == 1: Rank = 2 elif Rank > 2: Rank = 30 else: Rank = 2 if seq == "bulk": use_adjusted_p = True else: use_adjusted_p = False print Rank print "Running NMF analyses for dimension reduction using " + str( Rank) + " ranks - Round" + str(turn) NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis( NMFinput, Rank, turn, strategy) print "Running Metadata Analyses for finding differential splicing events" rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis( 'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation) counter = 1 Guidedir = rootdir + CovariateQuery PSIdir = rootdir + 'ExpressionProfiles' global upd_guides upd_guides = [] name = [] group = [] grplst = [] for filename in os.listdir(Guidedir): if filename.startswith("PSI."): Guidefile = os.path.join(Guidedir, filename) psi = string.replace(filename, "PSI.", "") PSIfile = os.path.join(PSIdir, psi) omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir) if omitcluster == 0: group.append(counter) name.append(psi) counter += 1 if counter > 2: dire = export.findParentDir(InputFile) output_dir = dire + 'OncoInputs' if os.path.exists(output_dir) == False: export.createExportFolder(output_dir) output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt' ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False) header = ExpandSampleClusters.header_file(output_file) print "Running SVM prediction for improved subtypes - Round" + str( turn) train = ExpandSampleClusters.TrainDataGeneration( output_file, BinarizedOutput, name) grplst.append(group) ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn) header = Correlationdepletion.header_file(NMFResult) output_file = output_dir + '/DepletionInput-Round' + str( turn) + ".txt" sampleIndexSelection.filterFile(InputFile, output_file, header) print "Running Correlation Depletion - Round" + str(turn) commonkeys, count = Correlationdepletion.FindCorrelations( NMFResult, output_file, name) Depleted = Correlationdepletion.DepleteSplicingevents( commonkeys, output_file, count, InputFile) InputFile = Depleted flag = True else: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: if Rank == 1: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: flag = False return flag, InputFile, FilteredEventAnnot
            if column_method == 'None': column_method = None
        elif opt == '--row_metric': row_metric=arg
        elif opt == '--column_metric': column_metric=arg
        elif opt == '--ExpressionCutoff': ExpressionCutoff=arg
        elif opt == '--normalization': normalization=arg
        elif opt == '--rho': rho_cutoff=float(arg)
        elif opt == '--CountsCutoff': CountsCutoff=int(float(arg))
        elif opt == '--FoldDiff': FoldDiff=float(arg)
        elif opt == '--SamplesDiffering': SamplesDiffering=int(float(arg))
        elif opt == '--removeOutliers':
            removeOutliers=arg
            if removeOutliers=='yes' or removeOutliers=='True': removeOutliers = True

    print "Subtype discovery stringency:",strategy
    dire = export.findParentDir(EventAnnot)
    if EnrichmentOnly==False:
        print 'PSI input files:',EventAnnot
        print 'Using a rho-cutoff of:',rho_cutoff
        if filters==True:
            ### Filter based on a default percentage of samples with detected PSI values
            EventAnnot,SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=True)
        else:
            SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=False)
        output_dir = dire+'ExpressionInput'
        export.createExportFolder(output_dir)
        full_PSI_InputFile=output_dir+"/exp.input.txt"
        header=header_list(EventAnnot)
def Classify(header,Xobs,output_file,grplst,name,turn): count=0 start=1 Y=[] head=0 for line in open(output_file,'rU').xreadlines(): if head >count: val=[] counter2=0 val2=[] me=0.0 line=line.rstrip('\r\n') q= string.split(line,'\t') for i in range(start,len(q)): try: val2.append(float(q[i])) except Exception: continue me=np.median(val2) for i in range(start,len(q)): try: val.append(float(q[i])) except Exception: val.append(float(me)) Y.append(val) else: head+=1 continue Xobs=zip(*Xobs) Xobs=np.array(Xobs) Xobs=zip(*Xobs) Xobs=np.array(Xobs) X=grplst X=zip(*X) X=np.array(X) Y=zip(*Y) Y=np.array(Y) dire = export.findParentDir(export.findParentDir(export.findParentDir(output_file)[:-1])[:-1]) output_dir = dire+'SVMOutputs' if os.path.exists(output_dir)==False: export.createExportFolder(output_dir) exportnam1=output_dir+'/round'+str(turn)+'SVC_decision_func.txt' export_class1=open(exportnam1,"w") exportnam2=output_dir+'/round'+str(turn)+'SVC_Results.txt' export_class2=open(exportnam2,"w") regr = LinearSVC() regr.fit(Xobs,X[:,0]) q=regr.predict(Y) count=1 if len(X[:,0])>2: prob_=regr.fit(Xobs,X[:,0]).decision_function(Y) export_class1.write("uid") export_class2.write("uid") for ni in name: sub=string.split(ni,"_")[0] export_class1.write("\t"+"R"+str(turn)+"-"+sub) export_class2.write("\t"+"R"+str(turn)+"-"+sub) export_class1.write("\n") export_class2.write("\n") for iq in range(0,len(header)-1): export_class1.write(header[iq+1]) export_class2.write(header[iq+1]) for jq in range(0,len(X[:,0])): export_class1.write("\t"+str(prob_[iq][jq])) if prob_[iq][jq]>0: export_class2.write("\t"+str(1)) else: export_class2.write("\t"+str(0)) export_class1.write("\n") export_class2.write("\n") else: prob_=regr.fit(Xobs,X[:,0]).decision_function(Y) export_class1.write("uid"+"\t") export_class2.write("uid"+"\t") export_class1.write("group") export_class2.write("R"+str(turn)+"-V1"+"\t"+"R"+str(turn)+"-V2") export_class1.write("\n") export_class2.write("\n") for iq in range(0,len(header)-1): export_class1.write(header[iq+1]) export_class2.write(header[iq+1]) export_class1.write("\t"+str(prob_[iq])) if prob_[iq]>0.5: export_class2.write("\t"+str(1)+"\t"+str(0)) else: if prob_[iq]<-0.5: export_class2.write("\t"+str(0)+"\t"+str(1)) else: export_class2.write("\t"+str(0)+"\t"+str(0)) export_class1.write("\n") export_class2.write("\n") export_class2.close() Orderedheatmap.Classify(exportnam2)
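# A minimal sketch of the scikit-learn calls used in Classify above, on toy data, to show how
# the one-vs-rest decision_function values written to SVC_decision_func.txt are produced.
# The toy arrays below are assumptions for illustration only:
import numpy as np
from sklearn.svm import LinearSVC

Xtrain = np.array([[0.1, 0.9], [0.2, 0.8], [0.9, 0.1], [0.8, 0.2]])  ### training profiles
labels = np.array([1, 1, 2, 2])                                      ### cluster/group labels
clf = LinearSVC().fit(Xtrain, labels)
scores = clf.decision_function(np.array([[0.15, 0.85], [0.85, 0.15]]))
### With two classes, scores is 1-D (positive values favor the second entry of clf.classes_);
### with more than two classes it is (n_samples, n_classes), matching the per-cluster columns
### that Classify writes for each round.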
def Mergeresults(filename): Newlist = defaultdict(list) Newval = {} genelst = defaultdict(list) Allcomp = {} dire = export.findParentDir(filename) output = dire + "/Motifresults_merged.txt" #MergedOutput="/Volumes/Pass/MotifAnalyses/Bridger/Exons_MotifAnalyses/merged_output_allpvalues_nofold.txt" #output="/Volumes/Pass/MotifAnalyses/Bridger/Exons_MotifAnalyses/merged_output_allpvalues_nofold_upd.txt" output1 = open(output, "w") output = dire + "/Motifresults_zscores.txt" output2 = open(output, "w") output1.write("signature" + "\t" + "gene" + "\t" + "technique" + "\t" + "p-value" + "\t" + "log-transformed" + "\t" + "signature" + "\t" + "gene" + "\t" + "technique" + "\t" + "p-value" + "\t" + "log-transformed" + "\t" + "signature" + "\t" + "gene" + "\t" + "technique" + "\t" + "p-value" + "\t" + "log-transformed" + "\t" + "signature" + "\t" + "gene" + "\t" + "technique" + "\t" + "p-value" + "\t" + "log-transformed" + "\n") output2.write("signature" + "\t" + "gene" + "\t" + "cisbp-zscore" + "\t" + "CLIPseq-zscore" + "\t" + "GE-zscore" + "\n") for lin in open(filename, 'rU').xreadlines(): genes = [] s = lin.rstrip('\r\n') s1 = string.split(s, '\t') sig = s1[0] if s1[2] == "GE": genes = [s1[1]] else: genes = string.split(s1[1], ":") tool = s1[2] if 'Cisbp_denovo' in tool: tool = "Cisbp_denovo" if "UpstreamIntron_known" in sig: sig = string.replace(sig, "UpstreamIntron_known", "Upstream") if "Intron_known" in s1[0]: sig = string.replace(sig, "Intron_known", "Combined_intron_new") if "Exons_known" in s1[0]: sig = string.replace(sig, "Exons_known", "Exon") if "DownstreamIntron_known" in s1[0]: sig = string.replace(sig, "DownstreamIntron_known", "Downstream") for i in range(len(genes)): if tool not in genelst[sig, genes[i].upper()]: genelst[sig, genes[i].upper()].append(tool) Newval[sig, tool, genes[i].upper()] = float(s1[3]) if tool == "GE": sig1 = "Exon:" + sig Newval[sig1, tool, genes[i].upper()] = float(s1[3]) genelst[sig1, genes[i].upper()].append(tool) sig1 = "Combined_intron_new:" + sig Newval[sig1, tool, genes[i].upper()] = float(s1[3]) genelst[sig1, genes[i].upper()].append(tool) zscoredt = {} cisbp = [] clipseq = [] ge = [] for sig, genes in genelst: tools = [] cisbpact = True cisbpden = True tools = genelst[sig, genes] # if genes=="MBNL1": # print tools,sig a = len(tools) if 'Cisbp_Actual' in tools and 'Cisbp_denovo' in tools: a = a - 1 if Newval[sig, "Cisbp_Actual", genes] < Newval[sig, "Cisbp_denovo", genes]: cisbpden = False else: cisbpact = False pval = 0.0 count = 0 if a > 1: pval = 0.0 count = 0 if "Cisbp_Actual" in tools and cisbpact: count += 1 # print str(Newval[sig,"Cisbp_Actual",genes]) pval = 0.0 - math.log10(Newval[sig, "Cisbp_Actual", genes]) output1.write(sig + "\t" + genes + "\t" + "Cisbp_Actual" + "\t" + str(Newval[sig, "Cisbp_Actual", genes]) + "\t" + str(pval) + "\t") zscoredt[sig, genes] = [ pval, ] cisbp.append(pval) else: output1.write(sig + "\t" + genes + "\t" + "Cisbp_Actual" + "\t" + "NA" + "\t" + "NA" + "\t") if 'Cisbp_denovo' in tools and cisbpden: count += 1 #print str(Newval[sig,"Cisbp_denovo",genes]) pval = 0.0 - math.log10(Newval[sig, "Cisbp_denovo", genes]) output1.write(sig + "\t" + genes + "\t" + "Cisbp_denovo" + "\t" + str(Newval[sig, "Cisbp_denovo", genes]) + "\t" + str(pval) + "\t") zscoredt[sig, genes] = [ pval, ] cisbp.append(pval) else: output1.write(sig + "\t" + genes + "\t" + "Cisbp_denovo" + "\t" + "NA" + "\t" + "NA" + "\t") if (sig, genes) not in zscoredt: zscoredt[sig, genes] = [ 0.0, ] cisbp.append(0.0) if "Clipseq" in tools: count += 1 #print 
str(Newval[sig,"Clipseq",genes]) pval = 0.0 - math.log10(Newval[sig, "Clipseq", genes]) output1.write(sig + "\t" + genes + "\t" + "Clipseq" + "\t" + str(Newval[sig, "Clipseq", genes]) + "\t" + str(pval) + "\t") zscoredt[sig, genes].append(pval) clipseq.append(pval) else: output1.write(sig + "\t" + genes + "\t" + "Clipseq" + "\t" + "NA" + "\t" + "NA" + "\t") zscoredt[sig, genes].append(0.0) clipseq.append(0.0) if "GE" in tools: count += 1 #print str(Newval[sig,"GE",genes]) pval = 0.0 - math.log10(Newval[sig, "GE", genes]) output1.write(sig + "\t" + genes + "\t" + "GE" + "\t" + str(Newval[sig, "GE", genes]) + "\t" + str(pval) + "\n") zscoredt[sig, genes].append(pval) ge.append(pval) else: output1.write(sig + "\t" + genes + "\t" + "GE" + "\t" + "NA" + "\t" + "NA" + "\n") zscoredt[sig, genes].append(0.0) ge.append(0.0) meancis = np.mean(cisbp) meanclip = np.mean(clipseq) meange = np.mean(ge) sdcis = np.std(cisbp) sdclip = np.std(clipseq) sdge = np.std(ge) for sig, genes in zscoredt: scores = [] scores = zscoredt[sig, genes] if len(scores) == 3: val1 = (float(scores[0]) - float(meancis)) / float(sdcis) val2 = (float(scores[1]) - float(meanclip)) / float(sdclip) val3 = (float(scores[2]) - float(meange)) / float(sdge) output2.write(sig + "\t" + genes + "\t" + str(val1) + "\t" + str(val2) + "\t" + str(val3) + "\n") else: print "error in zscore calculation" print sig, genes
def FilterGuideGeneFile(Guidefile,Guidefile_block,expressionInputFile,iteration,platform,uniqueIDs,symbolIDs):
    """ Filters the original input expression file for Guide3 genes/events. Needed since NMF can only
    deal with positive values [Guide3 has negative values] """
    root_dir = export.findParentDir(expressionInputFile)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    if 'Clustering' in Guidefile:
        count=1
        flag=True
        rank_Count=0
        prev=0
    else:
        count=0
    val=[]
    head=0
    for line in open(Guidefile_block,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            #val.append(q[0])
            if flag:
                if int(q[1])==prev:
                    continue
                else:
                    rank_Count+=1
                    prev=int(q[1])
        else:
            head+=1
            continue
    head=0
    for line in open(Guidefile,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        n=len(q)
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            uid = q[0]
            if uid not in uniqueIDs:
                if uid in symbolIDs:
                    uid = symbolIDs[uid]
                    val.append(uid)
                else:
                    continue
            val.append(uid)
            if platform != "PSI" and head==2:
                rank_Count=rank_Count+int(q[1])
                print rank_Count
            head=head+1
        else:
            head+=1
            if platform != "PSI" and q[0]=="column_clusters-flat":
                rank_Count=int(q[n-1])
            continue
    output_dir = root_dir+'/NMF-SVM'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    output_file = output_dir+'/NMFInput-Round'+str(iteration)+'.txt'
    filterRows(expressionInputFile,output_file,filterDB=val)
    return output_file,rank_Count
def latteralMerge(files_to_merge, original_filename, outputPath=None):
    """ Merging files can be dangerous if there are duplicate IDs (e.g., gene symbols).
    To overcome issues with redundant gene IDs that are improperly matched (one row with zeros
    and the other with values), this function determines whether a lateral merge is more
    appropriate. The lateral merge: 1) checks that the IDs are the same, in the same order,
    between the two or more datasets and 2) merges the two or more matrices without looking at
    the genes. Note: this function attempts to be memory efficient and should be updated in the
    future to merge blocks of row IDs sequentially."""
    files_to_merge_revised = []
    for filename in files_to_merge:
        ### If a sparse matrix - rename and convert to flat file
        if '.h5' in filename or '.mtx' in filename:
            from import_scripts import ChromiumProcessing
            import export
            file = export.findFilename(filename)
            export_name = file[:-4]+'-filt'
            if file == 'filtered_feature_bc_matrix.h5' or file == 'raw_feature_bc_matrix.h5' or file == 'filtered_gene_bc_matrix.h5' or file == 'raw_gene_bc_matrix.h5':
                export_name = export.findParentDir(filename)
                export_name = export.findFilename(export_name[:-1])
            elif file == 'matrix.mtx.gz' or file == 'matrix.mtx':
                parent = export.findParentDir(filename)
                export_name = export.findParentDir(parent)
                export_name = export.findFilename(export_name[:-1])
            else:
                export_name = string.replace(file,'.mtx.gz','')
                export_name = string.replace(export_name,'.mtx','')
                export_name = string.replace(export_name,'.h5','')
                export_name = string.replace(export_name,'_matrix','')
            filename = ChromiumProcessing.import10XSparseMatrix(filename,'species',export_name)
        files_to_merge_revised.append(filename)
    files_to_merge = files_to_merge_revised
    print 'Files to merge:',files_to_merge
    includeFilenames = True
    file_uids = {}
    for filename in files_to_merge:
        firstRow=True
        fn=filepath(filename); x=0
        if '/' in filename:
            file = string.split(filename,'/')[-1][:-4]
        else:
            file = string.split(filename,'\\')[-1][:-4]
        for line in open(fn,'rU').xreadlines():
            data = cleanUpLine(line)
            if '\t' in data:
                t = string.split(data,'\t')
            elif ',' in data:
                t = string.split(data,',')
            else:
                t = string.split(data,'\t')
            if firstRow:
                firstRow = False
            else:
                uid = t[0]
                try: file_uids[file].append(uid)
                except: file_uids[file] = [uid]
    perfectMatch = True
    for file1 in file_uids:
        uids1 = file_uids[file1]
        for file2 in file_uids:
            uids2 = file_uids[file2]
            if uids1 != uids2:
                print file1,file2
                perfectMatch = False
    if perfectMatch:
        print 'All ordered IDs match in the files ... performing lateral merge instead of key ID merge to prevent multi-matches...'
        firstRow=True
        increment = 5000
        low = 1
        high = 5000
        added = 1
        eo = open(output_dir+'/MergedFiles.txt','w')
        import collections
        def exportMergedRows(low,high):
            uid_values=collections.OrderedDict()
            for filename in files_to_merge:
                fn=filepath(filename); x=0; file_uids = {}
                if '/' in filename:
                    file = string.split(filename,'/')[-1][:-4]
                else:
                    file = string.split(filename,'\\')[-1][:-4]
                firstRow=True
                row_count = 0
                uids=[] ### Over-ride this for each file
                for line in open(fn,'rU').xreadlines():
                    row_count+=1
                    if row_count<=high and row_count>=low:
                        data = cleanUpLine(line)
                        if '\t' in data:
                            t = string.split(data,'\t')
                        elif ',' in data:
                            t = string.split(data,',')
                        else:
                            t = string.split(data,'\t')
                        if firstRow and low==1:
                            file = string.replace(file,'_matrix_CPTT','')
                            if includeFilenames:
                                header = [s + "."+file for s in t[1:]] ### add filename suffix
                            else:
                                header = t[1:]
                            try: uid_values[row_count]+=header
                            except: uid_values[row_count]=header
                            uids.append('UID')
                            firstRow=False
                        else:
                            uid = t[0]
                            try: uid_values[row_count] += t[1:]
                            except: uid_values[row_count] = t[1:]
                            uids.append(uid)
            i=0
            for index in uid_values:
                uid = uids[i]
                eo.write(string.join([uid]+uid_values[index],'\t')+'\n')
                i+=1
            print 'completed',low,high
        uid_list = file_uids[file]
        while (len(uid_list)+increment)>high:
            exportMergedRows(low,high)
            high+=increment
            low+=increment
        eo.close()
        return True
    else:
        print 'Different identifier order in the input files encountered...'
        return False
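### latteralMerge only proceeds when every file lists identical row IDs in identical order; it then
### concatenates columns block-by-block rather than joining rows on keys. A minimal sketch of that
### order check and of one merged block, using illustrative names (rows are already split on tabs):
def sameRowOrderSketch(id_lists):
    first = id_lists[0]
    return all(ids == first for ids in id_lists[1:])

def lateralMergeRowsSketch(rows_per_file, out):
    """ rows_per_file: one list of split rows per input file, all in the same row order
    (header first). The row ID is taken once from the first file; the remaining columns
    of every other file are appended laterally. """
    for rows in zip(*rows_per_file):
        merged = list(rows[0])            # ID + columns from the first file
        for other in rows[1:]:
            merged += other[1:]           # append columns only, drop the duplicate ID
        out.write('\t'.join(merged) + '\n')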
py2app_dirs = py2app_ge_dirs + py2app_aa_dirs
for i in py2app_aa_dirs:
    i = string.replace(i,'AltAnalyze.app','AltAnalyzeViewer.app')
    py2app_dirs.append(i)

if ('linux' in sys.platform or 'posix' in sys.platform) and getattr(sys, 'frozen', False):
    ### For PyInstaller
    application_path = os.path.dirname(sys.executable)
    #application_path = sys._MEIPASS ### should be the same as the above
else:
    if '..' in __file__:
        """ Indicates the file calling unique.py is in a subdirectory """
        try:
            if '.py' in __file__:
                import export
                application_path = export.findParentDir(string.split(__file__,'..')[0][:-1])
            else:
                application_path = os.getcwd()
        except Exception:
            application_path = os.getcwd()
    else:
        application_path = os.path.dirname(__file__)
    if len(application_path)==0:
        application_path = os.getcwd()

if 'AltAnalyze?' in application_path:
    application_path = string.replace(application_path,'//','/')
    application_path = string.replace(application_path,'\\','/') ### If /// present
    application_path = string.split(application_path,'AltAnalyze?')[0]
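### The block above resolves the runtime directory differently for frozen (PyInstaller/py2app)
### builds versus running from source. A compact sketch of the same decision, assuming only the
### standard library (the original additionally normalizes py2app paths via export.findParentDir):
import os, sys

def resolveApplicationPathSketch():
    if getattr(sys, 'frozen', False):
        # Frozen builds: the executable's directory is the application root
        return os.path.dirname(sys.executable)
    try:
        path = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        path = ''
    return path if path else os.getcwd()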
def processBarcodes(viral_barcode_file, cell_cluster_file, reference_48mers): eo = export.ExportFile(viral_barcode_file[:-4] + '-cleaned.txt') parent = export.findParentDir(viral_barcode_file) eom = export.ExportFile(parent + '/MultiLin-cells.txt') ### Import a file with the sample names in the groups file in the correct order viral_barcodes = {} repair = {} short = {} cluster_header = [] cell_clusters = {} for line in open(cell_cluster_file, 'rU').xreadlines(): data = cleanUpLine(line) cell, cluster, cluster_name = string.split(data, '\t') cell_clusters[cell] = cluster_name if cluster_name not in cluster_header: cluster_header.append(cluster_name) cells_with_virus = {} for line in open(viral_barcode_file, 'rU').xreadlines(): data = cleanUpLine(line) cellular, viral = string.split(data, '\t') if cellular in cell_clusters: try: if viral not in cells_with_virus[cellular]: cells_with_virus[cellular].append(viral) except Exception: cells_with_virus[cellular] = [viral] if len(viral) < 48: #if len(viral)<38: if viral not in repair: repair[viral] = [cellular] else: if cellular not in repair[viral]: repair[viral].append(cellular) else: #short[viral[:35]]=viral try: if cellular not in viral_barcodes[viral]: viral_barcodes[viral].append(cellular) except Exception: viral_barcodes[viral] = [cellular] ### Repair the short sequences for viral_short in repair: cellular_barcodes = repair[viral_short] if viral_short[:35] in short: viral = short[viral_short[:35]] for cellular in cellular_barcodes: try: if cellular not in viral_barcodes[viral]: viral_barcodes[viral].append(cellular) except Exception: viral_barcodes[viral] = [cellular] print len(viral_barcodes), 'unique viral barcodes present' #print cells_with_virus['ACGCCGATCTGTTGAG'] #print cells_with_virus['CAGAATCCAAACTGCT'] #sys.exit() if reference_48mers != None: valid_barcodes = 0 for viral in viral_barcodes: if viral in reference_48mers: valid_barcodes += 1 print valid_barcodes, 'unique valid viral barcodes present' #""" ### If the viral barcodes have frequent errors - associate the error with the reference in a cell-specific manner ### Only one virus for cell should be present unless it is a doublet print len(cells_with_virus), 'cells with viral barcodes' doublet_cell = {} mismatch_to_match = {} cells_with_valid_barcodes = 0 viral_barcodes_overide = {} cellular_barcodes_overide = {} for cellular in cells_with_virus: cell_5prime = {} cell_3prime = {} ref_sequences = [] if len(cells_with_virus[cellular]) > 1: for i in cells_with_virus[cellular]: try: cell_5prime[i[:10]].append(i) except Exception: cell_5prime[i[:10]] = [i] try: cell_3prime[i[-10:]].append(i) except Exception: cell_3prime[i[-10:]] = [i] if reference_48mers == None: ref_sequences.append(i) elif i in reference_48mers: ref_sequences.append(i) if len(ref_sequences) > 0: cells_with_valid_barcodes += 1 ### Determine how many cells have valid viral barcodes cell_5prime_ls = [] cell_3prime_ls = [] for i in cell_5prime: cell_5prime_ls.append([len(cell_5prime[i]), i]) for i in cell_3prime: cell_3prime_ls.append([len(cell_3prime[i]), i]) cell_5prime_ls.sort() cell_3prime_ls.sort() for seq in ref_sequences: if cell_5prime_ls[-1][1] in seq and cell_3prime_ls[-1][ 1] in seq: ref_seq = seq try: viral_barcodes_overide[ref_seq].append(cellular) except: viral_barcodes_overide[ref_seq] = [cellular] cellular_barcodes_overide[cellular] = [ref_seq] for y in cell_5prime[cell_5prime_ls[-1][1]]: mismatch_to_match[y] = ref_seq for y in cell_3prime[cell_3prime_ls[-1][1]]: mismatch_to_match[y] = ref_seq else: for 
i in cells_with_virus[cellular]: if reference_48mers == None: cells_with_valid_barcodes += 1 elif i in reference_48mers: cells_with_valid_barcodes += 1 ### Determine how many cells have valid viral barcodes try: viral_barcodes_overide[i].append(cellular) except: viral_barcodes_overide[i] = [cellular] viral_barcodes = viral_barcodes_overide cells_with_virus = cellular_barcodes_overide ### Update the viral_barcodes dictionary viral_barcodes2 = {} cells_with_virus2 = {} for v in viral_barcodes: cell_barcodes = viral_barcodes[v] proceed = False if v in mismatch_to_match: v = mismatch_to_match[v] proceed = True elif reference_48mers == None: proceed = True elif v in reference_48mers: proceed = True if proceed: if v in viral_barcodes2: for c in cell_barcodes: if c not in viral_barcodes2: viral_barcodes2[v].append(c) else: viral_barcodes2[v] = cell_barcodes print cells_with_valid_barcodes, 'cells with valid viral barcodes.' viral_barcodes = viral_barcodes2 ### Update the cells_with_virus dictionary for v in viral_barcodes: cell_barcodes = viral_barcodes[v] for c in cell_barcodes: if c in cells_with_virus2: if v not in cells_with_virus2[c]: cells_with_virus2[c].append(v) else: cells_with_virus2[c] = [v] cells_with_virus = cells_with_virus2 for c in cells_with_virus: if len(cells_with_virus[c]) > 1: doublet_cell[c] = [] print len(doublet_cell), 'doublets' #print cells_with_virus['ACGCCGATCTGTTGAG'] #print cells_with_virus['CAGAATCCAAACTGCT'] #sys.exit() print len(cells_with_virus), 'updated cells with virus' print len(viral_barcodes), 'updated unique viral barcodes' #""" #reference_48mers={} multi_cell_mapping = 0 unique_cells = {} multiMappingFinal = {} import collections import unique event_db = collections.OrderedDict() for cluster in cluster_header: event_db[cluster] = '0' k_value = 1 import unique cluster_hits_counts = {} cluster_pairs = {} custom = [] cells_per_pattern = {} for viral in viral_barcodes: clusters = [] k = len(unique.unique(viral_barcodes[viral])) if k > k_value: proceed = True if reference_48mers == None: proceed = True elif len(reference_48mers) > 0: if viral in reference_48mers: proceed = True else: proceed = False if proceed: viral_cluster_db = copy.deepcopy(event_db) ### copy this multi_cell_mapping += 1 cell_tracker = [] multilin = [] all_cells = [] for cell in viral_barcodes[viral]: #if cell not in doublet_cell: cell_tracker.append(cell) try: unique_cells[cell].append(viral) except: unique_cells[cell] = [viral] if cell in cell_clusters: cluster = cell_clusters[cell] if 'Multi-Lin' == cluster: multilin.append(cell) all_cells.append(cell) viral_cluster_db[cluster] = '1' clusters.append(cluster) c1 = unique.unique(clusters) c2 = string.join(c1, '|') try: cells_per_pattern[c2] += all_cells except: cells_per_pattern[c2] = all_cells #if c1 == ['Multi-Lin c4-Mast']: #if c1 == ['MultiLin','MEP','Myelo-1'] or c1 == ['MultiLin','MEP','Myelo-2'] or c1 == ['MultiLin','MEP','Myelo-4']: #if 'Multi-Lin c4-Mast' in c1 and ('ERP-primed' not in c1 and 'MEP' not in c1 and 'MKP-primed' not in c1 and 'MKP' not in c1 and 'ERP' not in c1) and 'Monocyte' not in c1 and 'e-Mono' not in c1 and ('Gran' in c1 or 'Myelo-1' in c1 or 'Myelo-2' in c1 and 'Myelo-3' in c1 and 'Myelo-4' in c1): #if 'Multi-Lin' in c1 and ('e-Mono' in c1 or 'Monocyte' in c1) and ('ERP-primed' in c1 or 'MEP' in c1 or 'MKP-primed' in c1 or 'MKP' in c1) and ('Gran' in c1 or 'Myelo-4' in c1 or 'Myelo-1' in c1 or 'Myelo-2' in c1 or 'Myelo-3' in c1): if 'Multi-Lin' in c1: for cell in multilin: eom.write( string.join(c1, '|') + '\t' 
+ cell + '\t' + viral + '\n') custom += viral_barcodes[viral] #print 'custom:',custom multiMappingFinal[viral] = viral_cluster_db ### Count the number of cluster pairs to make a weighted network for c1 in clusters: for c2 in clusters: if c1 != c2: try: cx = cluster_pairs[c1] try: cx[c2] += 1 except: cx[c2] = 1 except: cx = {} cx[c2] = 1 cluster_pairs[c1] = cx clusters = string.join(unique.unique(clusters), '|') try: cluster_hits_counts[clusters] += 1 except Exception: cluster_hits_counts[clusters] = 1 #sys.exit() #print custom for cluster in cluster_pairs: cluster_counts = [] cx = cluster_pairs[cluster] for c2 in cx: count = cx[c2] cluster_counts.append([count, c2]) cluster_counts.sort() cluster_counts.reverse() #print cluster, cluster_counts print len(multiMappingFinal) final_ranked_cluster_hits = [] for clusters in cluster_hits_counts: final_ranked_cluster_hits.append( [cluster_hits_counts[clusters], clusters]) final_ranked_cluster_hits.sort() final_ranked_cluster_hits.reverse() for (counts, clusters) in final_ranked_cluster_hits: try: print str(counts) + '\t' + clusters + '\t' + str( len(unique.unique(cells_per_pattern[clusters]))) #print cells_per_pattern[clusters];sys.exit() except: print str(counts) + '\t' + clusters eo.write(string.join(['UID'] + cluster_header, '\t') + '\n') for viral_barcode in multiMappingFinal: cluster_db = multiMappingFinal[viral_barcode] hits = [] for cluster in cluster_db: hits.append(cluster_db[cluster]) eo.write(string.join([viral_barcode] + hits, '\t') + '\n') eo.close() eo = export.ExportFile(viral_barcode_file[:-4] + '-cells-' + str(k_value) + '.txt') for cell in unique_cells: #eo.write(cell+'\t1\t1\t'+str(len(unique_cells[cell]))+'\t'+string.join(unique_cells[cell],'|')+'\n') eo.write(cell + '\t1\t1\t\n') eo.close()
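### processBarcodes collapses sequencing errors so that, ideally, each cell carries a single viral
### barcode; cells left with more than one are counted as doublets, and each barcode shared by
### several cells votes for the combination of clusters (lineage pattern) those cells belong to.
### A minimal sketch of that bookkeeping, with illustrative names and the same dictionaries used above:
from collections import defaultdict

def summarizeBarcodePatternsSketch(viral_barcodes, cells_with_virus, cell_clusters):
    doublets = [c for c in cells_with_virus if len(cells_with_virus[c]) > 1]
    pattern_counts = defaultdict(int)
    for viral in viral_barcodes:
        cells = viral_barcodes[viral]
        if len(cells) > 1:   # only barcodes shared by multiple cells define a pattern
            clusters = sorted(set(cell_clusters[c] for c in cells if c in cell_clusters))
            if clusters:
                pattern_counts['|'.join(clusters)] += 1
    return doublets, dict(pattern_counts)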
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, turn): """ This function is used perform a single-iteration of the OncoSplice workflow (called from main), including the unsupervised splicing analysis (splice-ICGS) and signature depletion """ ### Filter the EventAnnotation PSI file with non-depleted events from the prior round FilteredEventAnnot = filterEventAnnotation.FilterFile( InputFile, EventAnnot, turn) try: print "Running splice-ICGS for feature selection - Round" + str(turn) ### Reset the below variables which can be altered in prior rounds gsp.setGeneSelection('') gsp.setGeneSet('None Selected') gsp.setPathwaySelect([]) if forceBroadClusters == True: ### Find Broad clusters with at least 25% of all samples originalSamplesDiffering = gsp.SamplesDiffering() gsp.setSamplesDiffering(int(SampleNumber * 0.25)) print 'Number varying samples to identify:', gsp.SamplesDiffering() graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp, exp_threshold=0, rpkm_threshold=0, parameters=gsp) if forceBroadClusters == True: gsp.setSamplesDiffering(originalSamplesDiffering) Guidefile = graphic_links3[-1][-1] Guidefile = Guidefile[:-4] + '.txt' print "Running block identification for rank analyses - Round" + str( turn) ### Parameters are fixed as they are distinct RNASeq_blockIdentification.correlateClusteredGenesParameters( Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50, ReDefinedClusterBlocks=True, filter=True) Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt' NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn) except Exception: print 'UNKNOWN ERROR!!!!! Setting Rank=0' #print traceback.format_exc() Rank = 0 if Rank > 1: ### ADJUST THE RANKS - MUST UPDATE!!!! 
if turn == 1: if force_broad_round1: #Rank=2 Rank = Rank else: if Rank > 2: Rank = 30 else: if Rank > 2: Rank = 30 if seq == "bulk": use_adjusted_p = True else: use_adjusted_p = False print "Running NMF analyses for dimension reduction using " + str( Rank) + " ranks - Round" + str(turn) NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis( NMFinput, Rank, turn, strategy) print "Running Metadata Analyses for finding differential splicing events" rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis( 'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation) counter = 1 Guidedir = rootdir + CovariateQuery PSIdir = rootdir + 'ExpressionProfiles' global upd_guides upd_guides = [] name = [] group = [] grplst = [] for filename in os.listdir(Guidedir): if filename.startswith("PSI."): Guidefile = os.path.join(Guidedir, filename) psi = string.replace(filename, "PSI.", "") PSIfile = os.path.join(PSIdir, psi) omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir) if omitcluster == 0: group.append(counter) name.append(psi) counter += 1 if counter > 2: dire = export.findParentDir(InputFile) output_dir = dire + 'OncoInputs' if os.path.exists(output_dir) == False: export.createExportFolder(output_dir) output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt' ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False) header = ExpandSampleClusters.header_file(output_file) print "Running SVM prediction for improved subtypes - Round" + str( turn) train = ExpandSampleClusters.TrainDataGeneration( output_file, BinarizedOutput, name) grplst.append(group) ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn) header = Correlationdepletion.header_file(NMFResult) output_file = output_dir + '/DepletionInput-Round' + str( turn) + ".txt" sampleIndexSelection.filterFile(InputFile, output_file, header) print "Running Correlation Depletion - Round" + str(turn) commonkeys, count = Correlationdepletion.FindCorrelations( NMFResult, output_file, name) Depleted = Correlationdepletion.DepleteSplicingevents( commonkeys, output_file, count, InputFile) InputFile = Depleted flag = True else: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: if Rank == 1: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: flag = False return flag, InputFile, FilteredEventAnnot
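### CompleteWorkflow is one round of the iterative OncoSplice procedure: splice-ICGS feature
### selection, rank estimation, NMF (or a K-means fallback), metadata/SVM subtype assignment and
### correlation depletion of the discovered signatures. A hypothetical driver loop is sketched
### below to show how the returned flag and depleted PSI file feed the next round; the real main()
### in this module may choose rho_cutoff, strategy, forceBroadClusters and round limits differently.
def runOncoSpliceRoundsSketch(psi_file, event_annot, gsp, max_rounds=5, rho_cutoff=0.4):
    current_input = psi_file
    current_annot = event_annot
    for turn in range(1, max_rounds + 1):
        flag, current_input, current_annot = CompleteWorkflow(
            current_input, current_annot, rho_cutoff, 'conservative', 'bulk',
            gsp, False, turn)
        if not flag:   # no further signatures were found/depleted; stop iterating
            break
    return current_input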
def parseResultfolders(motifdir, GEdir, SFlist): sfs = [] for lin in open(SFlist, 'rU').xreadlines(): s = lin.rstrip('\r\n') s1 = string.split(s, '\t') sfs.append(s1[0]) mappingdict = defaultdict(list) allden = [] for filename in os.listdir(motifdir): name = filename mapping = [] dellst = [] if "._" not in filename and "Events" not in filename: fol = os.path.join(motifdir, filename) if os.path.isdir(fol): #for filename2 in os.listdir(fol): #filnam2=os.path.join(fol,filename2) #if "._" not in filnam2: # if os.path.isdir(filnam2): # #print filnam2 # flag=0 # if "._" not in filename2: # name=filename+":"+filename2 # flag=1 # # if flag==1: for filename3 in os.listdir(fol): if filename3 == "finalResults.tab": clipres = os.path.join(fol, filename3) for lin in open(clipres, 'rU').xreadlines(): q = lin.rstrip('\r\n') q1 = string.split(q, '\t') clipnam = q1[0] + ":" + q1[1] + ":" + q1[2] mappingdict[name, clipnam, "Clipseq"] = q1[11] if filename3 == "output_TF_strand": knownrbp = os.path.join(fol, filename3) for filename4 in os.listdir(knownrbp): if filename4 == "knownResults.txt": filenam4 = os.path.join(knownrbp, filename4) try: head = 0 for line in open(filenam4, 'rU').xreadlines(): q = line.rstrip('\r\n') q1 = string.split(q, '\t') if head == 0: motif = q1.index('Motif Name') pval = q1.index('P-value') head = 1 continue else: mappingdict[ name, q1[motif], "Cisbp_Actual"] = q1[pval] except Exception: continue if filename3 == "output1": denovorbp = os.path.join(fol, filename3) for filename4 in os.listdir(denovorbp): if filename4 == "homerResults.html": denolink = "file://" + str( os.path.join(denovorbp, filename4)) #print denolink html = urllib2.urlopen(denolink).read() soup = BeautifulSoup(html) for table in soup.find_all('table'): for row in table.find_all('tr'): col = map( cell_text, row.find_all(re.compile('t[dh]'))) if col[2] == "P-value": continue else: motname = string.split( col[7], "(")[0] mapping.append([ name + ";" + motname, float(col[2]) ]) #mappingdict[name,motname,"Cisbp_denovo"]=col[2] if filename3 == "output2": denovorbp = os.path.join(fol, filename3) for filename4 in os.listdir(denovorbp): if filename4 == "homerResults.html": denolink = "file://" + str( os.path.join(denovorbp, filename4)) #print denolink html = urllib2.urlopen(denolink).read() soup = BeautifulSoup(html) for table in soup.find_all('table'): for row in table.find_all('tr'): col = map( cell_text, row.find_all(re.compile('t[dh]'))) if col[2] == "P-value": continue else: motname = string.split( col[7], "(")[0] mapping.append([ name + ";" + motname, float(col[2]) ]) #mappingdict[name,motname,"Cisbp_denovo"]=col[2] if filename3 == "output3": denovorbp = os.path.join(fol, filename3) for filename4 in os.listdir(denovorbp): if filename4 == "homerResults.html": denolink = "file://" + str( os.path.join(denovorbp, filename4)) #print denolink html = urllib2.urlopen(denolink).read() soup = BeautifulSoup(html) for table in soup.find_all('table'): for row in table.find_all('tr'): col = map( cell_text, row.find_all(re.compile('t[dh]'))) if col[2] == "P-value": continue else: motname = string.split( col[7], "(")[0] mapping.append([ name + ";" + motname, float(col[2]) ]) #mappingdict[name,motname,"Cisbp_denovo"]=col[2] if filename3 == "output4": denovorbp = os.path.join(fol, filename3) for filename4 in os.listdir(denovorbp): if filename4 == "homerResults.html": denolink = "file://" + str( os.path.join(denovorbp, filename4)) #print denolink html = urllib2.urlopen(denolink).read() soup = BeautifulSoup(html) for table in 
soup.find_all('table'): for row in table.find_all('tr'): col = map( cell_text, row.find_all(re.compile('t[dh]'))) if col[2] == "P-value": continue else: motname = string.split( col[7], "(")[0] mapping.append([ name + ";" + motname, float(col[2]) ]) #mappingdict[name,motname,"Cisbp_denovo"]=col[2] if filename3 == "output5": denovorbp = os.path.join(fol, filename3) for filename4 in os.listdir(denovorbp): if filename4 == "homerResults.html": denolink = "file://" + str( os.path.join(denovorbp, filename4)) #print denolink html = urllib2.urlopen(denolink).read() soup = BeautifulSoup(html) for table in soup.find_all('table'): for row in table.find_all('tr'): col = map( cell_text, row.find_all(re.compile('t[dh]'))) if col[2] == "P-value": continue else: motname = string.split( col[7], "(")[0] mapping.append([ name + ";" + motname, float(col[2]) ]) #print name,motname,col[2] #sys.exit() #mappingdict[name,motname,"Cisbp_denovo"]=col[2] mapping.sort(key=lambda x: x[0]) mapping.sort(key=lambda x: x[1]) #prev="" #output=os.path.join(motifdir,"test.txt") #output_w=open(output,"a") for i in range(len(mapping)): if mapping[i][0] not in dellst: mot = string.split(mapping[i][0], ";")[1] genes = [] genes = string.split(mot, ":")[1:] allden.append([filename, mot, genes, mapping[i][1]]) #output_w.write(mapping[i][0]+"\t"+str(mapping[i][1])) # output_w.write("\n") dellst.append(mapping[i][0]) final = {} for i in range(len(allden)): de = [] de = allden[i] for q in de[2]: if q in final: if de[3] < final[q][1]: final[q] = [de[0], de[3], de[1]] else: final[q] = [de[0], de[3], de[1]] for genes in final: de = [] de = final[genes] mappingdict[de[0], de[2], "Cisbp_denovo"] = str(de[1]) for filename in os.listdir(GEdir): if "GE" in filename and "._GE" not in filename: InputFile = os.path.join(GEdir, filename) name = string.replace(filename, "GE.", "") name = string.replace(name, "_vs_Others.txt", "") head = 0 for line in open(InputFile, 'rU').xreadlines(): q = line.rstrip('\r\n') q1 = string.split(q, '\t') if head == 0: symbol = q1.index('Symbol') adjp = q1.index('adjp') head = 1 continue else: if q1[symbol] in sfs: mappingdict[name, q1[symbol], "GE"] = q1[adjp] dire = export.findParentDir(motifdir) output_dir = dire + 'MotifResults' export.createExportFolder(output_dir) output = output_dir + "/Motifresults.txt" #output=os.path.join(motifdir,"merged_output_allpvalues_nofold.txt") output1 = open(output, "w") #output1.write("signature"+"\t"+"gene"+"\t"+"tool"+"\t"+"p-value"+"\n") for name, gene, key in mappingdict: output1.write(name + "\t" + gene + "\t" + key + "\t" + mappingdict[name, gene, key] + "\n") output1.close() return output
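### parseResultfolders repeats the same homerResults.html parsing for the HOMER de novo folders
### output1 through output5. The sketch below folds those five branches into a single loop with the
### same behavior; it assumes the module's cell_text helper (passed in here), urllib2 and a bs4
### BeautifulSoup import, matching the calls used above.
import os, re
import urllib2
from bs4 import BeautifulSoup

def parseHomerDenovoSketch(fol, name, mapping, cell_text):
    for subdir in ('output1', 'output2', 'output3', 'output4', 'output5'):
        html_path = os.path.join(fol, subdir, 'homerResults.html')
        if not os.path.exists(html_path):
            continue
        html = urllib2.urlopen('file://' + html_path).read()
        soup = BeautifulSoup(html)
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                col = map(cell_text, row.find_all(re.compile('t[dh]')))
                if len(col) < 8 or col[2] == 'P-value':   # skip malformed/header rows
                    continue
                motname = col[7].split('(')[0]
                mapping.append([name + ';' + motname, float(col[2])])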
def NMFAnalysis(expressionInputFile,NMFinputDir,Rank,platform,iteration=0,strategy="conservative"): root_dir = export.findParentDir(NMFinputDir)[:-1] if 'ExpressionInput' in root_dir: root_dir = export.findParentDir(root_dir) if 'NMF-SVM' in root_dir: root_dir = export.findParentDir(root_dir) export.findFilename(NMFinputDir) X=[] header=[] head=0 exportnam=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_versionr'+str(Rank)+'.txt' export_res=export.ExportFile(exportnam) exportnam_bin=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary'+str(Rank)+'.txt' export_res1=export.ExportFile(exportnam_bin) exportnam_bint=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary_t_'+str(Rank)+'.txt' export_res5=export.ExportFile(exportnam_bint) MF_input = root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt' export.customFileCopy(expressionInputFile,root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt') export_res4=open(string.replace(MF_input,'exp.','groups.'),"w") export_res7=open(string.replace(MF_input,'exp.','comps.'),"w") exportnam2=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Metadata'+str(Rank)+'.txt' export_res2=export.ExportFile(exportnam2) exportnam3=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Annotation'+str(Rank)+'.txt' export_res3=export.ExportFile(exportnam3) #if 'Clustering' in NMFinputDir: # count=1 # start=2 #else: count=0 start=1 #print Rank for line in open(NMFinputDir,'rU').xreadlines(): line=line.rstrip('\r\n') q= string.split(line,'\t') if head >count: val=[] val2=[] me=0.0 for i in range(start,len(q)): try: val2.append(float(q[i])) except Exception: continue me=np.median(val2) for i in range(start,len(q)): try: val.append(float(q[i])) except Exception: val.append(float(me)) #if q[1]==prev: X.append(val) else: export_res1.write(line) export_res.write(line) export_res1.write("\n") #export_res4.write(line) #export_res4.write("\n") export_res.write("\n") header=q head+=1 continue group=defaultdict(list) sh=[] X=np.array(X) #print X.shape mat=[] #mat=X mat=zip(*X) mat=np.array(mat) #print mat.shape #model = NMF(n_components=15, init='random', random_state=0) #W = model.fit_transform(mat) nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=1,track_factor=False,theta=0.95) nmf_fit = nmf() W = nmf_fit.basis() W=np.array(W) #np.savetxt("basismatrix2.txt",W,delimiter="\t") H=nmf_fit.coef() H=np.array(H) # np.savetxt("coefficientmatrix2.txt",H,delimiter="\t") #print W.shape sh=W.shape export_res3.write("uid\tUID\tUID\n") if int(Rank)==2: par=1 else: par=2 #for i in range(sh[1]): # val=W[:,i] # me=np.mean(val) # st=np.std(val) # export_res2.write(header[i+1]) # for j in range(sh[0]): # if float(W[i][j])>=float(me+(par*st)): # # export_res2.write("\t"+str(1)) # else: # export_res2.write("\t"+str(0)) # # export_res2.write("\n") if platform != 'PSI': sh=W.shape Z=[] export_res5.write("uid") export_res2.write("uid") for i in range(sh[1]): export_res5.write("\t"+'V'+str(i)) export_res2.write("\t"+'V'+str(i)) export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n") export_res5.write("\n") export_res2.write("\n") export_res3.write("\n") for i in range(sh[0]): new_val=[] val=W[i,:] export_res2.write(header[i+1]) export_res5.write(header[i+1]) export_res4.write(header[i+1]) flag=True for j in range(sh[1]): if W[i][j]==max(val) and flag: export_res5.write("\t"+str(1)) export_res2.write("\t"+str(1)) new_val.append(1) export_res4.write("\t"+str(j+1)+"\t"+'V'+str(j)) flag=False else: export_res5.write("\t"+str(0)) 
export_res2.write("\t"+str(0)) new_val.append(0) Z.append(new_val) export_res5.write("\n") export_res2.write("\n") export_res4.write("\n") W=zip(*W) W=np.array(W) sh=W.shape Z=zip(*Z) Z=np.array(Z) for i in range(sh[0]): export_res.write('V'+str(i)) export_res1.write('V'+str(i)) for j in range(sh[1]): export_res.write("\t"+str(W[i][j])) export_res1.write("\t"+str(Z[i][j])) export_res.write("\n") export_res1.write("\n") export_res.close() export_res1.close() export_res2.close() export_res5.close() Orderedheatmap.Classify(exportnam_bint) return exportnam,exportnam_bin,exportnam2,exportnam3 else: W=zip(*W) W=np.array(W) sh=W.shape Z=[] for i in range(sh[0]): new_val=[] val=W[i,:] num=sum(i > 0.10 for i in val) if num >40 or num <3: compstd=True else: compstd=False me=np.mean(val) st=np.std(val) #print 'V'+str(i) export_res.write('V'+str(i)) export_res1.write('V'+str(i)) for j in range(sh[1]): if compstd: if float(W[i][j])>=float(me+(par*st)): export_res1.write("\t"+str(1)) new_val.append(1) else: export_res1.write("\t"+str(0)) new_val.append(0) else: if float(W[i][j])>0.1: export_res1.write("\t"+str(1)) new_val.append(1) else: export_res1.write("\t"+str(0)) new_val.append(0) export_res.write("\t"+str(W[i][j])) Z.append(new_val) export_res.write("\n") export_res1.write("\n") # Z=zip(*Z) Z=np.array(Z) sh=Z.shape Z_new=[] val1=[] Z1=[] dellst=[] export_res2.write("uid") export_res5.write("uid") for i in range(sh[0]): indices=[] val1=Z[i,:] sum1=sum(val1) flag=False indices=[index for index, value in enumerate(val1) if value == 1] for j in range(sh[0]): val2=[] if i!=j: val2=Z[j,:] sum2=sum([val2[x] for x in indices]) summ2=sum(val2) try: if float(sum2)/float(sum1)>0.5: if summ2>sum1: flag=True #print str(i) except Exception: continue if flag==False: Z1.append(val1) export_res2.write("\t"+'V'+str(i)) export_res5.write("\t"+'V'+str(i)) export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n") export_res2.write("\n") export_res5.write("\n") Z1=np.array(Z1) Z=Z1 Z=zip(*Z) Z=np.array(Z) sh=Z.shape for i in range(sh[0]): val1=Z[i,:] #print sum(val1) #if sum(val)>2: if sum(val1)>2: val=[0 if x==1 else x for x in val1] else: val=val1 me=np.mean(val) st=np.std(val) export_res2.write(header[i+1]) export_res5.write(header[i+1]) for j in range(sh[1]): if strategy=="conservative": export_res2.write("\t"+str(val1[j])) export_res5.write("\t"+str(val1[j])) else: export_res2.write("\t"+str(val[j])) export_res5.write("\t"+str(val[j])) export_res2.write("\n") export_res5.write("\n") Z_new.append(val) Z_new=zip(*Z_new) Z_new=np.array(Z_new) sh=Z_new.shape export_res5.close() Orderedheatmap.Classify(exportnam_bint) if strategy=="conservative": return exportnam,exportnam_bin,exportnam2,exportnam3 else: return exportnam,exportnam_bin,exportnam2,exportnam3