def importPGF(dir,species,filename):
    fn=filepath(filename); probe_db = {}; x=0
    psr_file = dir+'/'+species+'/'+array_type+'/'+species+'_probeset-psr.txt'
    psr_file = string.replace(psr_file,'affymetrix/LibraryFiles/','')
    try: eo = export.ExportFile(filepath(psr_file))
    except Exception: eo = export.ExportFile(filepath(psr_file[1:]))
    for line in open(fn,'rU').xreadlines():
        if line[0] != '#':
            data = cleanUpLine(line); x+=1
            t = string.split(data,'\t')
            if len(t)==2 or len(t)==3:
                if len(t[0])>0:
                    probeset = t[0]; type = t[1]
                    eo.write(probeset+'\t'+t[-1]+'\n') ### Used for HTA array where we need to have PSR to probeset IDs
            else:
                try:
                    probe = t[2]
                    #if probeset == '10701621': print probe
                    try: probe_db[probeset].append(probe)
                    except KeyError: probe_db[probeset] = [probe]
                except Exception: null=[]
    eo.close()
    new_file = dir+'/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
    new_file = string.replace(new_file,'affymetrix/LibraryFiles/','')
    headers = 'probeset\t'+'probe\n'; n=0
    try: data = export.ExportFile(filepath(new_file))
    except Exception: data = export.ExportFile(filepath(new_file[1:]))
    data.write(headers)
    for probeset in probe_db:
        for probe in probe_db[probeset]:
            data.write(probeset+'\t'+probe+'\n'); n+=1
    data.close()
    print n, 'Entries exported for', new_file

def unzipFiles(filename,dir):
    import zipfile
    output_filepath = filepath(dir+'/'+filename)
    try:
        zfile = zipfile.ZipFile(output_filepath)
        for name in zfile.namelist():
            if name.endswith('/'): null=[] ### Don't need to export
            else:
                if 'EnsMart' in name and 'EnsMart' in dir:
                    dir = export.findParentDir(dir[:-1]) ### Remove EnsMart suffix directory
                try: outfile = export.ExportFile(filepath(dir+name))
                except Exception: outfile = export.ExportFile(filepath(dir+name[1:]))
                outfile.write(zfile.read(name)); outfile.close()
        #print 'Zip extracted to:',output_filepath
        status = 'completed'
    except Exception, e:
        try:
            ### Use the operating system's unzip if all else fails
            extracted_path = string.replace(output_filepath,'.zip','')
            try: os.remove(extracted_path) ### This is necessary, otherwise the empty file created above will require user authorization to delete
            except Exception: null=[]
            subprocessUnzip(dir,output_filepath)
            status = 'completed'
        except IOError:
            print e
            print 'WARNING!!!! The zip file',output_filepath,'does not appear to be a valid zip archive file or is corrupt.'
            status = 'failed'
    return status

def exportCorrelationResults(exp_input):
    input_file = export.findFilename(exp_input)
    if '.txt' in exp_output_file:
        corr_output_file = string.replace(exp_output_file,'DATASET','LineageCorrelations')
    else: ### Occurs when processing a non-standard AltAnalyze file
        corr_output_file = exp_output_file+'/'+input_file
    corr_output_file = string.replace(corr_output_file,'.txt','-'+coding_type+'-'+compendiumPlatform+'.txt')
    if analysis_type == 'AltExon':
        corr_output_file = string.replace(corr_output_file,coding_type,'AltExon')
    filename = export.findFilename(corr_output_file)
    score_data = export.ExportFile(corr_output_file)
    zscore_output_dir = None ### only assigned when use_scipy is True - prevents a NameError at the return below
    if use_scipy:
        zscore_output_dir = string.replace(corr_output_file,'.txt','-zscores.txt')
        probability_data = export.ExportFile(zscore_output_dir)
        #adjustPValues()
        replacePearsonPvalueWithZscore()
    ### Make title row
    headers = ['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            headers.append(sample)
        break
    title_row = string.join(headers,'\t')+'\n'
    score_data.write(title_row)
    if use_scipy:
        probability_data.write(title_row)
    ### Export correlation data
    tissue_scores = {}; tissue_probabilities = {}
    tissue_score_list = [] ### store and rank tissues according to max(score)
    for tissue in tissue_comparison_scores:
        scores = []; probabilities = []
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            scores.append(r); probabilities.append(p)
        tissue_score_list.append((max(scores),tissue))
        tissue_scores[tissue] = string.join(map(str,[tissue]+scores),'\t')+'\n' ### export line
        if use_scipy:
            tissue_probabilities[tissue] = string.join(map(str,[tissue]+probabilities),'\t')+'\n'
    tissue_score_list.sort(); tissue_score_list.reverse()
    for (score,tissue) in tissue_score_list:
        score_data.write(tissue_scores[tissue])
        if use_scipy:
            probability_data.write(tissue_probabilities[tissue])
    score_data.close()
    if use_scipy:
        probability_data.close()
    print filename,'exported...'
    return zscore_output_dir

def statisticallyFilterTransposedFile(input_file,output_file,threshold,minGeneCutoff=499,binarize=True):
    """ The input file is a large expression matrix with the rows as cells and the columns as genes to filter """
    if 'exp.' in input_file:
        counts_file = string.replace(input_file,'exp.','geneCount.')
    else:
        counts_file = input_file[:-4]+'-geneCount.txt'
    import export
    eo = export.ExportFile(counts_file)
    eo.write('Sample\tGenes Expressed(threshold:'+str(threshold)+')\n')
    eo_full = export.ExportFile(output_file)
    sample_expressed_genes={}
    header=True
    count_sum_array=[]
    cells_retained=0
    for line in open(input_file,'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            t = string.split(data,',')
        else:
            t = string.split(data,'\t')
        if header:
            eo_full.write(line)
            gene_len = len(t)
            genes = t[1:]
            header=False
        else:
            cell = t[0]
            values = map(float,t[1:])
            binarized_values = []
            for v in values:
                if v>threshold:
                    if binarize: ### do not count the individual read counts, only if a gene is expressed or not
                        binarized_values.append(1)
                    else:
                        binarized_values.append(v) ### When summarizing counts and not genes expressed
                else:
                    binarized_values.append(0)
            genes_expressed = sum(binarized_values)
            if genes_expressed>minGeneCutoff:
                eo_full.write(line)
                cells_retained+=1
            eo.write(cell+'\t'+str(genes_expressed)+'\n')
    eo.close()
    eo_full.close()
    print cells_retained, 'Cells with genes expressed above the threshold'

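### A minimal usage sketch for statisticallyFilterTransposedFile: the paths below are
### hypothetical. With binarize=True, each cell (row) is kept only when more than
### minGeneCutoff genes exceed the count threshold; a per-cell gene-count summary is
### written alongside (here as geneCount.tenX-matrix.txt).
def exampleStatisticallyFilterTransposedFile():
    statisticallyFilterTransposedFile('exp.tenX-matrix.txt','exp.tenX-matrix-filtered.txt',1,minGeneCutoff=499)
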
def annotateMetaProbesetGenes(summary_exp_file,expression_file,metaprobeset_file,species):
    metaprobeset_cv_file = string.replace(metaprobeset_file,species+'_',species+'_Conversion_')
    metaprobeset_cv_file = string.replace(metaprobeset_cv_file,'.mps','.txt')
    fn=filepath(metaprobeset_cv_file); uid_db={}
    for line in open(fn,'rU').xreadlines():
        data = UI.cleanUpLine(line)
        uid,ens_gene = string.split(data,'\t')
        uid_db[uid] = ens_gene
    export_data = export.ExportFile(expression_file)
    fn=filepath(summary_exp_file); x=0
    for line in open(fn,'rU').xreadlines():
        if line[0] == '#': null=[]
        elif x == 0:
            export_data.write(line); x+=1
        else:
            data = cleanUpLine(line)
            t = string.split(data,'\t')
            uid = t[0]; ens_gene = uid_db[uid]
            export_data.write(string.join([ens_gene]+t[1:],'\t')+'\n')
    export_data.close()

def getColoredPathwayTest():
    fileType = 'png' ### svg, pdf
    graphId = "ffffff90"
    wpid = "WP2062"
    color = "0000ff"
    graphId = ["ffffff90","ffffffe5"]
    color = ["0000ff","0000ff"]
    ### revision = 0 is the most current version
    #file = client.service.getColoredPathway(pwId=wpid,revision=0,graphId=graphId,color=color,fileType=fileType)
    ### kwargs documents the equivalent arguments for the wikipathways-api-client call below
    kwargs = {
        'identifier': 'WP2062',
        'version': 0,
        'element_identifiers': ["ffffff90","ffffffe5"],
        'colors': ["#0000FF","#0000FF"],
        'file_format': 'image/svg+xml'}
    file = wikipathways_api_client_instance.get_colored_pathway(identifier=wpid,version=0,element_identifiers=graphId,colors=color,file_format=fileType)
    #file = base64.b64decode(file) ### decode this file
    outfile = export.ExportFile(wpid+'.png')
    outfile.write(file)
    outfile.close()

def getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type):
    """ Adapted from compareProteinFeatures but for one isoform and returns genomic coordinates for each feature
    This function is designed to export all unique isoforms rather than just comparison isoforms """
    import export
    export_file = 'AltDatabase/ensembl/'+species+'/ProteinFeatureIsoform_complete.txt'
    export_data = export.ExportFile(export_file)
    failed = 0
    worked = 0
    failed_ac=[]
    for gene in protein_ft_db:
        transcript_feature_db={}
        for ft in protein_ft_db[gene]:
            try:
                ft_name = ft.PrimaryAnnot(); annotation = ft.SecondaryAnnot()
                for (mRNA,type) in gene_transcript_db[gene]:
                    try:
                        protein,protein_seq = mRNA_protein_seq_db[mRNA]
                        error = False
                    except Exception:
                        failed_ac.append(mRNA)
                        error = True
                    if error == False:
                        if ft.DomainSeq() in protein_seq:
                            #if coordinate_type == 'genomic':
                            pos1_genomic = ft.GenomicStart(); pos2_genomic = ft.GenomicStop()
                            #else:
                            pos1 = str(ft.DomainStart()); pos2 = str(ft.DomainEnd())
                            ### There are often many features that overlap within a transcript, so consistently pick just one
                            if mRNA in transcript_feature_db:
                                db = transcript_feature_db[mRNA]
                                if (pos1,pos2) in db:
                                    db[pos1,pos2].append([pos1_genomic,pos2_genomic,protein,ft_name,annotation])
                                else:
                                    db[pos1,pos2]=[[pos1_genomic,pos2_genomic,protein,ft_name,annotation]]
                            else:
                                db={}
                                db[pos1,pos2]=[[pos1_genomic,pos2_genomic,protein,ft_name,annotation]]
                                transcript_feature_db[mRNA] = db
                            #values = [mRNA,protein,pos1,pos2,ft_name,annotation]; unique_entries.append(values)
                            worked+=1
            except IOError:
                failed+=1
        for transcript in transcript_feature_db:
            db = transcript_feature_db[transcript]
            for (pos1,pos2) in db:
                db[pos1,pos2].sort() ### Pick the alphabetically listed first feature
                pos1_genomic,pos2_genomic,protein,ft_name,annotation = db[pos1,pos2][0]
                values = [transcript,protein,pos1,pos2,pos1_genomic,pos2_genomic,ft_name,annotation]
                export_data.write(string.join(values,'\t')+'\n')
    export_data.close()
    print failed,'features failed to have corresponding aligned genomic locations out of', worked+failed
    failed_ac = unique.unique(failed_ac)
    print len(failed_ac),'mRNAs without identified/in silico derived proteins' ### Appear to be ncRNAs without ATGs
    print failed_ac[:20]

def exportCorrelationResults():
    corr_output_file = string.replace(exp_output_file,'DATASET','TissueCorrelations')
    corr_output_file = string.replace(corr_output_file,'.txt','-'+coding_type+'.txt')
    if analysis_type == 'exonLevel':
        corr_output_file = string.replace(corr_output_file,coding_type,'AltExon')
    data = export.ExportFile(corr_output_file)
    ### Make title row
    headers = ['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r,sample) in tissue_comparison_scores[tissue]:
            headers.append(sample)
        break
    title_row = string.join(headers,'\t')+'\n'
    data.write(title_row)
    ### Export correlation data
    for tissue in tissue_comparison_scores:
        values = [tissue]
        for (r,sample) in tissue_comparison_scores[tissue]:
            values.append(str(r))
        values = string.join(values,'\t')+'\n'
        data.write(values)
    data.close()
    print corr_output_file,'exported...'

def reformatHeatmapFile(input_file):
    import unique
    export_file = string.replace(input_file,'Clustering-','Input-')
    eo = export.ExportFile(export_file)
    first_row = True
    fn = filepath(input_file)
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if first_row == True:
            if 'column_clusters-flat' not in t:
                array_names = []
                for i in t[2:]:
                    array_names.append(string.replace(i,':','-'))
                    #array_names.append(i)
            elif 'column_clusters-flat' in t:
                array_clusters = t[2:]
                unique_clusters = unique.unique(array_clusters)
                ind=0; headers=[]
                for c in array_clusters:
                    headers.append(c+'::'+array_names[ind])
                    ind+=1
                headers = string.join(['uid']+headers,'\t')+'\n'
                eo.write(headers)
                first_row = False
        else:
            values = string.join([t[0]]+t[2:],'\t')+'\n'
            eo.write(values)
    eo.close() ### flush the reformatted file before returning its path
    return export_file, len(unique_clusters)

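### A hedged usage sketch for reformatHeatmapFile: it expects an AltAnalyze heatmap
### text file whose second header row contains 'column_clusters-flat'; the path
### below is hypothetical.
def exampleReformatHeatmapFile():
    export_file, cluster_number = reformatHeatmapFile('Clustering-MarkerFinder-heatmap.txt')
    print cluster_number, 'column clusters written to', export_file
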
def checkForDuplicateIDs(input_file):
    first_row = True
    key_db={}; key_list=[]
    fn = filepath(input_file)
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if first_row == True:
            headers = line
            first_row = False
        else:
            key = t[0]
            key_list.append(key)
            key_db[key] = t
    if len(key_db) != len(key_list): ### Duplicate IDs present
        print 'Duplicate IDs present... writing a cleaned-up version of the input file:'
        input_file = input_file[:-4]+'-clean.txt'
        export_text = export.ExportFile(input_file) ### create a new input file
        export_text.write(headers) ### Header is the same for each file
        for key in key_db:
            t = key_db[key]
            export_text.write(string.join(t,'\t')+'\n') ### Write z-score values and row names
        export_text.close()
        print 'File written...'
    return input_file

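### A hedged usage sketch (hypothetical path): when duplicate row IDs are found, a
### '-clean.txt' version is written and its path returned; otherwise the original
### path is returned unchanged, so it is safe to continue with the return value.
def exampleCheckForDuplicateIDs():
    cleaned_file = checkForDuplicateIDs('exp.demo-dataset.txt')
    print 'proceeding with:', cleaned_file
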
def translateToEntrezGene(species,filename):
    x=0; type = 'pathway'
    try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
    except Exception: ens_to_entrez ={}
    if len(ens_to_entrez)>0:
        export_file = string.replace(filename,'Ensembl','EntrezGene')
        export_data = export.ExportFile(export_file)
        export_data.write('EntrezGene\tOntologyID\n')
        fn=filepath(filename)
        for line in open(fn,'rU').xreadlines():
            if x==0: x=1
            else:
                data = cleanUpLine(line)
                try:
                    ensembl,pathway = string.split(data,'\t')
                    type = 'ontology'
                except Exception:
                    ensembl,null,pathway = string.split(data,'\t')
                try:
                    entrezs = ens_to_entrez[ensembl]
                    for entrez in entrezs:
                        if type == 'ontology':
                            export_data.write(entrez+'\t'+pathway+'\n')
                        else:
                            export_data.write(entrez+'\tEn\t'+pathway+'\n')
                except Exception: null=[]
        export_data.close()

def filterRows(input_file,output_file,filterDB=None,logData=False):
    orderlst={}
    counter=[]
    export_object = export.ExportFile(output_file)
    firstLine = True
    Flag = 0
    #print len(filterDB)
    for line in open(input_file,'rU').xreadlines():
        flag1 = 0
        data = cleanUpLine(line)
        values = string.split(data,'\t')
        if firstLine:
            firstLine = False
            k = values.index('UID')
            if Flag == 0:
                export_object.write(line)
        else:
            if values[k] in filterDB:
                counter = [index for index,value in enumerate(filterDB) if value == values[k]]
                for it in range(0,len(counter)):
                    orderlst[counter[it]] = line
    try:
        for i in range(0,len(orderlst)):
            export_object.write(orderlst[i])
    except Exception:
        print i, filterDB[i]
    export_object.close()
    print 'Filtered rows printed to:',output_file

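### A hedged usage sketch for filterRows: the header plus every row whose 'UID'
### column value appears in filterDB is written out, preserving filterDB's order.
### The paths and gene IDs below are made up for illustration.
def exampleFilterRows():
    filterDB = ['ENSG00000124614','ENSG00000100097']
    filterRows('exp.demo-dataset.txt','exp.demo-dataset-filtered.txt',filterDB=filterDB)
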
def matchTranscriptExonIDsToJunctionIDs(species,array_type,gene_junction_db):
    """ Matches junctionIDs to precomputed transcript-level exonID strings - simpler and more accurate than importEnsemblTranscriptSequence"""
    output_file = 'AltDatabase/'+species+'/SequenceData/output/'+array_type+'_coordinte-mRNA_alignments.txt'
    dataw = export.ExportFile(output_file)
    filename = 'AltDatabase/ensembl/'+species+'/mRNA-ExonIDs.txt'
    fn=filepath(filename)
    x = 0
    for line in open(fn,'rU').xreadlines():
        data = line.strip()
        gene,transcript,protein,exonIDs = string.split(data,'\t')
        exonIDs += '|' ### such that the last exon is properly searchable
        if gene in gene_junction_db:
            junctions_data = gene_junction_db[gene]
            for jd in junctions_data:
                junctionIDs = string.split(jd.Probeset()+'|',':')[-1]
                junctionIDs = string.replace(junctionIDs,'-','|') ### this is the format of the transcript ExonID string
                if x==0:
                    x=1
                    print junctionIDs, exonIDs
                if junctionIDs in exonIDs:
                    dataw.write(string.join([jd.Probeset(),'1',transcript],'\t')+'\n')
                else:
                    dataw.write(string.join([jd.Probeset(),'0',transcript],'\t')+'\n')
    dataw.close()

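### Standalone illustration of the substring test used above (no files needed, IDs
### made up): both the junction string and the transcript exonID string are
### '|'-terminated so a trailing exon like 'E5.1' cannot partially match 'E5.12'.
def exampleJunctionSubstringMatch():
    exonIDs = 'E1.1|E2.1|E5.1'+'|'
    junctionIDs = string.replace('E2.1-E5.1'+'|','-','|')
    print junctionIDs in exonIDs ### True - both exons occur contiguously in this transcript
    print junctionIDs in 'E1.1|E2.1|E5.12|' ### False - E5.1 does not match E5.12
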
def exportExpressionData(filename,sample_db):
    export_text = export.ExportFile(filename)
    all_genes_db = {}
    sample_list=[]
    for sample in sample_db:
        sample_list.append(sample)
        gene_db = sample_db[sample]
        for geneid in gene_db:
            all_genes_db[geneid]=[]
    sample_list.sort() ### Organize these alphabetically rather than randomly
    column_header = string.join(['ProbeName']+sample_list,'\t')+'\n' ### format column-names for export
    export_text.write(column_header)
    for geneid in all_genes_db:
        values=[]
        for sample in sample_list:
            try: values.append(sample_db[sample][geneid]) ### protein_expression
            except Exception: values.append(0)
        export_text.write(string.join([geneid]+map(str,values),'\t')+'\n')
    export_text.close()

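### A hedged usage sketch with inline data: sample_db maps sample -> {geneid: value},
### and missing sample/gene combinations are exported as 0. The output path is
### hypothetical.
def exampleExportExpressionData():
    sample_db = {'sampleB':{'gene1':5.1,'gene2':2.0},
                 'sampleA':{'gene1':4.7}} ### gene2 missing for sampleA - exported as 0
    exportExpressionData('demo-expression-matrix.txt',sample_db)
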
def getPathwayAs(pathway_db,species_code,mod):
    begin_time = time.time()
    try: export.deleteFolder('BuildDBs/WPs') ### clear any remaining pathway files
    except Exception: null=[]
    for wpid in pathway_db:
        file_type = 'gpml'
        wp_id_data = client.service.getPathwayAs(fileType=file_type,pwId=wpid,revision=0)
        wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/'+wpid+'.gpml')
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data)
        outfile.close()
        gene_system_list = string.split(wp_id_data,'\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(parent_path,species_code,mod) ### get GPML data back
        os.remove(gpml_path) ### Only store the file temporarily
    end_time = time.time()
    time_diff = float(end_time-begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db

def unzipFiles(filename,dir):
    import zipfile
    output_filepath = filepath(dir+filename)
    try:
        zfile = zipfile.ZipFile(output_filepath)
        for name in zfile.namelist():
            if name.endswith('/'): null=[] ### Don't need to export
            else:
                try: outfile = export.ExportFile(dir+name)
                except Exception: outfile = export.ExportFile(dir+name[1:])
                outfile.write(zfile.read(name)); outfile.close()
        #print 'Zip extracted to:',output_filepath
        status = 'completed'
    except Exception, e:
        print e
        print 'WARNING!!!! The zip file',output_filepath,'does not appear to be a valid zip archive file or is corrupt.'
        status = 'failed'
    return status

def exportANOVAStats(filename,matrix_pvalues_list):
    import export
    export_name = filename[:-4]+'-stats.txt'
    ee=export.ExportFile(export_name)
    ee.write('SplicingEvent\tANOVA rawp\tANOVA adjp\tDriving Group(s)\n')
    for ls in matrix_pvalues_list:
        ee.write(string.join(ls,'\t')+'\n')
    ee.close()

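### A minimal usage sketch for exportANOVAStats: the event IDs and p-values below
### are made-up illustrations, and 'demo-PSI.txt' is a hypothetical path; the output
### lands in 'demo-PSI-stats.txt' alongside it.
def exampleExportANOVAStats():
    matrix_pvalues_list = [
        ['ENSG00000100097:E2.1-E3.1','0.0004','0.0021','Group1'], ### event, rawp, adjp, driving group(s)
        ['ENSG00000100097:E4.1-E5.1','0.0310','0.0712','Group2']]
    exportANOVAStats('demo-PSI.txt',matrix_pvalues_list)
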
def exportEnsemblUniprot(filename):
    import export
    export_data = export.ExportFile(filename)
    export_data.write(string.join(['ensembl','uniprot'],'\t')+'\n')
    for uniprot in uniprot_ensembl_db:
        for ensembl in uniprot_ensembl_db[uniprot]:
            export_data.write(string.join([ensembl,uniprot],'\t')+'\n')
    export_data.close()

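### A hedged usage sketch: exportEnsemblUniprot reads the module-level
### uniprot_ensembl_db (uniprot -> [ensembl,...]); the single FOXA1 mapping and the
### output path below are illustrative only.
def exampleExportEnsemblUniprot():
    global uniprot_ensembl_db
    uniprot_ensembl_db = {'P55317':['ENSG00000129514']}
    exportEnsemblUniprot('Ensembl-UniProt-demo.txt')
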
def createSchemaTextFile(species,platform,schema_text,DBname):
    schema_filename = filepath('AltDatabase/'+species+'/'+platform+'/'+DBname+'_schema.sql')
    export_data = export.ExportFile(schema_filename)
    ### We will need to augment the database with protein feature annotations
    export_data.write(schema_text)
    export_data.close()

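### A hedged usage sketch: schema_text is written verbatim to
### AltDatabase/<species>/<platform>/<DBname>_schema.sql; the table definition and
### DBname below are a made-up example.
def exampleCreateSchemaTextFile():
    schema_text = 'create table uid_gene (\nuid text,\nensembl text\n);\n'
    createSchemaTextFile('Hs','RNASeq',schema_text,'FeatureDemo')
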
def returnANOVAFiltered(original_data,matrix_pvalues):
    import export
    eo = export.ExportFile(filename[:-4]+'-ANOVA.txt') ### 'filename' is not passed in - it must be defined at module level
    eo.write(original_data['header'])
    for key in matrix_pvalues:
        eo.write(original_data[key])
        last_line = original_data[key]
    eo.close()

def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full
    if force == 'yes':
        ### Should only need to be run if UniProt changes its species-to-file associations or new species are supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format Homo sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species-to-UniProt file relationships so this function only needs to be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]:
                        found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()

def exportVersionData(version,version_date,dir,force='application-path'):
    new_file = dir+'version.txt'
    new_file_default = filepath(new_file,force=force) ### can use user directory local or application local
    #print new_file_default;sys.exit() ### debugging only - would terminate the application if left active
    try:
        data = export.ExportFile(new_file_default) ### prefer the resolved default path
        data.write(str(version)+'\t'+str(version_date)+'\n'); data.close()
    except Exception:
        data = export.ExportFile(new_file)
        data.write(str(version)+'\t'+str(version_date)+'\n'); data.close()

def exportSelectedIDs(input_IDs):
    expressionFile = outputDir+'/networks/IDList.txt'
    data = export.ExportFile(expressionFile)
    data.write('UID\tSystemCode\n')
    for id in input_IDs:
        if 'HMDB' in id:
            id = hmdb_symbol_db[id]
        data.write(id+'\tEn\n')
    data.close()
    return expressionFile

def extractFeatures(species,countinp):
    import export
    ExonsPresent=False
    lastgene = None
    lastend = None
    genes_detected={}
    count=0
    first_last_exons = {} ### Make a fake junction comprised of the first and last exon
    if 'counts.' in countinp:
        feature_file = string.replace(countinp,'counts.','features.')
        fe = export.ExportFile(feature_file)
        firstLine = True
        for line in open(countinp,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                feature_info = string.split(line,'\t')[0]
                fe.write(feature_info+'\n')
                junction_annotation = string.split(feature_info,'=')[0]
                if '-' in junction_annotation:
                    geneid = string.split(junction_annotation,':')[0]
                    genes_detected[geneid]=[]
                if ExonsPresent == False:
                    exon = string.split(feature_info,'=')[0]
                    if '-' not in exon:
                        ExonsPresent = True
        ### Add exon-info if necessary
        exons_file = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
        firstLine = True
        for line in open(exons_file,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                line = line.rstrip('\n')
                t = string.split(line,'\t')
                gene,exon,chr,strand,start,end = t[:6]
                if gene!=lastgene:
                    if len(genes_detected)==0 or gene in genes_detected: ### restrict to detected genes
                        first_last_exons[gene] = [(chr,start)]
                    if len(genes_detected)==0 or lastgene in genes_detected: ### restrict to detected genes
                        try: first_last_exons[lastgene].append(lastend)
                        except Exception: pass ### occurs for the first gene
                if ExonsPresent == False:
                    fe.write(gene+':'+exon+'='+chr+':'+start+'-'+end+'\n')
                lastgene = gene; lastend = end
        if len(genes_detected)==0 or lastgene in genes_detected:
            first_last_exons[lastgene].append(lastend)
        ### Add a fake junction for the whole gene
        for gene in first_last_exons:
            (chr,start),end = first_last_exons[gene]
            fe.write(gene+':E1.1-E100.1'+'='+chr+':'+start+'-'+end+'\n')
        fe.close()
    return feature_file

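### A hedged usage sketch (hypothetical path): the counts file name must contain
### 'counts.' so the features file path can be derived from it by name substitution.
def exampleExtractFeatures():
    feature_file = extractFeatures('Hs','ExpressionInput/counts.demo-dataset.txt')
    print 'features written to:', feature_file
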
def exportVersionData(version,version_date,dir):
    ### Used by the module UI
    program_type,database_dir = unique.whatProgramIsThis(); parent_dir = ''
    if program_type == 'AltAnalyze':
        parent_dir = 'AltDatabase/goelite/'
    elif 'OBO' in dir or 'Config' in dir:
        parent_dir = ''
    else:
        parent_dir = database_dir
    dir = parent_dir+dir
    global current_version; current_version = version
    global current_version_date; current_version_date = version_date
    new_file = dir+'version.txt'
    data = export.ExportFile(new_file)
    data.write(str(version)+'\t'+str(version_date)+'\n'); data.close()

def transposeMatrix(input_file):
    arrays=[]
    import export
    eo = export.ExportFile(input_file[:-4]+'-transposed.txt')
    for line in open(input_file,'rU').xreadlines():
        data = cleanUpLine(line)
        values = string.split(data,'\t')
        arrays.append(values)
    t_arrays = zip(*arrays)
    for t in t_arrays:
        eo.write(string.join(t,'\t')+'\n')
    eo.close()

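### A hedged usage sketch (hypothetical path): rows become columns in a new
### '-transposed.txt' file. Note that zip(*arrays) truncates to the shortest row,
### so ragged input lines will silently drop trailing columns.
def exampleTransposeMatrix():
    transposeMatrix('exp.demo-dataset.txt') ### writes exp.demo-dataset-transposed.txt
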
def exportPathwayData(original_pathway_z_score_data,gene_file,headers,resource_name,pathway_type):
    if resource_name == 'GeneOntology': resource_name = 'GO' ### Makes the output filename compatible with GenMAPP-CS plugin filenames
    if resource_name == 'WikiPathways': resource_name = 'local' ### Makes the output filename compatible with GenMAPP-CS plugin filenames
    new_file = mappfinder_output_dir+'/'+gene_file[:-4]+'-'+resource_name+'.txt'
    global sort_results
    data = export.ExportFile(new_file)
    data.write(headers)
    sort_results=[]
    #print "Results for",len(original_pathway_z_score_data),"pathways exported to",new_file
    for pathway in original_pathway_z_score_data:
        zsd = original_pathway_z_score_data[pathway]
        try:
            results = [zsd.Changed(),zsd.Measured(),zsd.InPathway(),zsd.PercentChanged(),zsd.PercentPresent(),zsd.ZScore(),zsd.PermuteP(),zsd.AdjP()]
        except AttributeError:
            print pathway,len(permuted_z_scores[pathway]); kill ### 'kill' is undefined - raises a NameError to halt on malformed data
        try: ### This is unnecessary, unless using the non-nested GO associations (which can have out of sync GOIDs)
            if pathway_type == 'Ontology':
                s = ontology_annotations[pathway]
                annotations = [s.OntologyID(),s.OntologyTerm(),s.OntologyType()]
                results = annotations + results
            else:
                results = [pathway] + results
            results = string.join(results,'\t')+'\n'
            sort_results.append([float(zsd.ZScore()),-1/float(zsd.Measured()),results])
        except KeyError:
            null = []
    sort_results.sort(); sort_results.reverse()
    for values in sort_results:
        results = values[2]
        data.write(results)
    data.close()

def exportSymbolRelationships(pathway_to_symbol,selected_species,pathway_type,type):
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    for species in current_species_dirs:
        if '.' not in species:
            ens_dir = database_dir+'/'+species+'/gene-'+type+'/Ensembl-'+pathway_type+'.txt'
            ens_data = export.ExportFile(ens_dir)
            if 'mapp' in type: ens_data.write('GeneID\tSystem\tGeneSet\n')
            else: ens_data.write('GeneID\tGeneSet\n')
            try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
            except Exception: ens_to_entrez ={}
            if len(ens_to_entrez)>0:
                entrez_dir = database_dir+'/'+species+'/gene-'+type+'/EntrezGene-'+pathway_type+'.txt'
                entrez_data = export.ExportFile(entrez_dir)
                if 'mapp' in type: entrez_data.write('GeneID\tSystem\tGeneSet\n')
                else: entrez_data.write('GeneID\tGeneSet\n')
            #print 'Exporting '+pathway_type+' databases for:',species
            try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
            except Exception: gene_to_source_id={}
            source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            source_to_gene = lowerSymbolDB(source_to_gene)
            for pathway in pathway_to_symbol:
                for symbol in pathway_to_symbol[pathway]:
                    try:
                        genes = source_to_gene[symbol]
                        for gene in genes:
                            if 'mapp' in type: ens_data.write(gene+'\tEn\t'+pathway+'\n')
                            else: ens_data.write(gene+'\t'+pathway+'\n')
                            if gene in ens_to_entrez:
                                for entrez in ens_to_entrez[gene]:
                                    if 'mapp' in type: entrez_data.write(entrez+'\tL\t'+pathway+'\n')
                                    else: entrez_data.write(entrez+'\t'+pathway+'\n')
                    except Exception: null=[]
            ens_data.close()
            try: entrez_data.close()
            except Exception: null=[]

def exportOntologyRelationships(nested_ontology_gene,gene_to_source_id,mod,source_type,ontology_type):
    program_type,database_dir = unique.whatProgramIsThis()
    if ontology_type == 'GeneOntology': ontology_type = 'GO'
    new_file = database_dir+'/'+species_code+'/nested/'+mod+'_to_Nested-'+ontology_type+'.txt'
    data = export.ExportFile(new_file)
    title = [mod,'ontology_id']; title_str = string.join(title,'\t')
    data.write(title_str+'\n')
    for ontology_id in nested_ontology_gene:
        for gene in nested_ontology_gene[ontology_id]:
            output_list = [gene,ontology_id]
            output_str = string.join(output_list,'\t')
            data.write(output_str+'\n')
    data.close()
    print new_file, 'saved to disk'

def filterPSIValues(filename,percentCutoff=0.75,filterStatus=True):
    """ Filter the PSI file to only include events in which >75% of the samples have PSI values """
    firstRow=True
    header = True
    rows=0
    filtered=0
    ### Filtered export file
    new_file = filename[:-4]+'-'+str(int(100*percentCutoff))+'p.txt'
    if filterStatus:
        ea = export.ExportFile(new_file)
    for line in open(filename,'rU').xreadlines():
        data = line.rstrip()
        t = string.split(data,'\t')
        if header:
            header = False
            eventindex = t.index('EventAnnotation') ### This field is the last annotation column before sample PSI values
            t = [t[1]]+t[eventindex+1:]
            header_length = len(t)-1
            minimum_values_present = int(float((header_length)-1.0)*percentCutoff)
            not_detected = header_length-minimum_values_present
            new_line = line
            if filterStatus:
                ea.write(new_line)
        else:
            if filterStatus:
                ### If sufficient PSI detected samples, write to new file
                t = [t[1]]+t[eventindex+1:]
                missing_values_at_the_end = (header_length+1)-len(t)
                missing = missing_values_at_the_end+t.count('')
                if missing<not_detected:
                    new_line = line
                    ea.write(new_line)
                    filtered+=1
        rows+=1
    if filterStatus:
        ea.close()
        return new_file, header_length
    else:
        ### Just return the number of samples in the file
        return header_length

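### A hedged usage sketch (hypothetical path): keep splicing events with PSI values
### present in at least 75% of samples; pass filterStatus=False to only count the
### samples without writing a filtered file.
def exampleFilterPSIValues():
    new_file, sample_number = filterPSIValues('Hs_RNASeq_top_alt_junctions-PSI.txt',percentCutoff=0.75,filterStatus=True)
    print sample_number, 'samples; filtered file:', new_file
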