def importBioMarkerGeneAssociations():
    try: biomarker_files = unique.read_directory('BuildDBs/BioMarkers/')
    except Exception: biomarker_files = unique.read_directory('/BuildDBs/BioMarkers/')
    x=0; marker_symbol_db={}
    for file in biomarker_files:
        if '.txt' in file:
            fn = filepath('BuildDBs/BioMarkers/'+file)
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data,'\t')
                if x==0:
                    ### Determine the marker and symbol column positions from the header row
                    x = 1; y=0
                    for i in t:
                        if 'marker-in' in i: mi = y
                        if 'Symbol' in i: sy = y
                        y+=1
                else:
                    ensembl = t[0]; symbol = string.lower(t[sy]); marker = t[mi]
                    markers = string.split(marker,'|')
                    for marker in markers:
                        try: marker_symbol_db[marker].append(symbol)
                        except Exception: marker_symbol_db[marker]=[symbol]
    marker_symbol_db = gene_associations.eliminate_redundant_dict_values(marker_symbol_db)
    return marker_symbol_db
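### A minimal sketch (not part of the build pipeline) of the accumulate-then-deduplicate pattern
### used throughout these importers. It assumes gene_associations.eliminate_redundant_dict_values
### simply removes duplicate entries from each value list; the marker/symbol pairs are hypothetical.
def _exampleMarkerSymbolAccumulation():
    pairs = [('marker1','gata1'),('marker1','gata1'),('marker2','runx1')] ### hypothetical input
    marker_symbol_db={}
    for marker,symbol in pairs:
        try: marker_symbol_db[marker].append(symbol)
        except KeyError: marker_symbol_db[marker]=[symbol]
    for marker in marker_symbol_db: ### stand-in for eliminate_redundant_dict_values
        deduped=[]
        for symbol in marker_symbol_db[marker]:
            if symbol not in deduped: deduped.append(symbol)
        marker_symbol_db[marker]=deduped
    return marker_symbol_db ### {'marker1': ['gata1'], 'marker2': ['runx1']}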
def importPAZARcompiled():
    """ Skips over the above function when this tf-target file has been downloaded directly """
    tf_dir = 'BuildDBs/PAZAR/symbol/tf-target.txt'
    tf_to_symbol={}
    fn = filepath(tf_dir)
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        tf_id,symbol = string.split(data,'\t')
        try: tf_to_symbol[tf_id].append(symbol)
        except Exception: tf_to_symbol[tf_id] = [symbol]
    tf_to_symbol = gene_associations.eliminate_redundant_dict_values(tf_to_symbol)
    return tf_to_symbol
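### A hypothetical usage sketch: prefer the pre-compiled tf-target.txt when it already exists on
### disk, otherwise rebuild it from the raw PAZAR downloads via importPAZARAssociations (defined
### below). The os.path.isfile gate is an assumption; the real pipeline may decide this differently.
def _loadPAZARTargets():
    import os
    if os.path.isfile(filepath('BuildDBs/PAZAR/symbol/tf-target.txt')):
        return importPAZARcompiled()
    else:
        return importPAZARAssociations()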
def importAmandeusPredictions(force):
    if force == 'yes':
        downloadAmadeusPredictions()
    x=0
    tf_symbol_db={}
    fn = filepath('BuildDBs/Amadeus/symbol-Metazoan-Amadeus.txt')
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            symbol,system,tf_name = string.split(data,'\t')
            symbol = string.lower(symbol)
            try: tf_symbol_db[tf_name].append(symbol)
            except Exception: tf_symbol_db[tf_name]=[symbol]
    tf_symbol_db = gene_associations.eliminate_redundant_dict_values(tf_symbol_db)
    return tf_symbol_db
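### A minimal sketch of the Amadeus row parsing assumed above: a single header row followed by
### tab-delimited (Symbol, System, TF name) rows. The example row values are hypothetical.
def _exampleAmadeusRow():
    row = 'GATA2\tSymbol\tGATA1' ### hypothetical data row
    symbol,system,tf_name = row.split('\t')
    return {tf_name: [symbol.lower()]} ### {'GATA1': ['gata2']}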
def importPhenotypeOntologyGeneAssociations():
    x=0
    pheno_symbol={}; phen=[]
    fn = filepath('BuildDBs/Pheno/HMD_HumanPhenotype.rpt')
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            t = string.split(data,'\t')
            hs_symbol=t[0]; hs_entrez=t[1]; mm_symbol=t[2]; mgi=t[3]; pheno_ids=t[4]
            hs_symbol = string.lower(hs_symbol)
            mm_symbol = string.lower(mm_symbol)
            symbols = [mm_symbol,hs_symbol]
            pheno_ids = string.split(pheno_ids,' '); phen+=pheno_ids
            for pheno_id in pheno_ids:
                if len(pheno_id)>0:
                    for symbol in symbols:
                        try: pheno_symbol[pheno_id].append(symbol)
                        except Exception: pheno_symbol[pheno_id]=[symbol]
    phen = unique.unique(phen)
    pheno_symbol = gene_associations.eliminate_redundant_dict_values(pheno_symbol)
    return pheno_symbol
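### A minimal sketch of how one HMD_HumanPhenotype.rpt row (human symbol, human EntrezGene ID,
### mouse symbol, MGI ID, space-delimited phenotype IDs) maps into pheno_symbol, following the
### parsing above; the row values are hypothetical.
def _examplePhenotypeRow():
    row = 'GENE1\t111\tGene1\tMGI:0000001\tMP:0000001 MP:0000002' ### hypothetical row
    t = row.split('\t')
    hs_symbol = t[0].lower(); mm_symbol = t[2].lower()
    pheno_symbol={}
    for pheno_id in t[4].split(' '):
        pheno_symbol[pheno_id] = [mm_symbol,hs_symbol]
    return pheno_symbol ### {'MP:0000001': ['gene1','gene1'], 'MP:0000002': ['gene1','gene1']}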
def importPAZARAssociations():
    pazar_files = unique.read_directory('/BuildDBs/PAZAR')
    species_db={}
    tf_to_target={}
    for file in pazar_files:
        if '.csv' in file:
            name = string.join(string.split(file,'_')[1:-1],'_')
            fn = filepath('BuildDBs/PAZAR/'+file)
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                try:
                    ### Each line contains the following 11 tab-delim fields:
                    ### <PAZAR TF ID> <TF Name> <PAZAR Gene ID> <Ensembl gene accession> <chromosome> <gene start coordinate> <gene end coordinate> <species> <project name> <PMID> <analysis method>
                    pazar_tf_id,tf_name,pazar_geneid,ens_gene,chr,gene_start,gene_end,species,project,pmid,analysis_method = string.split(data,'\t')
                    species,genus = string.split(species,' ') ### e.g. 'Homo sapiens' -> 'Hs'
                    species = species[0]+genus[0]
                    tft=TFTargetInfo(tf_name,ens_gene,project,pmid,analysis_method)
                    try: tf_to_target[species,tf_name].append(tft)
                    except Exception: tf_to_target[species,tf_name] = [tft]
                    species_db[species]=[]
                except Exception:
                    None ### Occurs due to file formatting issues (during an update?)

    determine_tf_geneids = 'no'
    if determine_tf_geneids == 'yes':
        """ The below code is probably most useful for creation of complex regulatory inference networks in Cytoscape """
        uniprot_ensembl_db = importUniProtAnnotations(species_db)
        missing=[]
        tf_to_target_ens={}
        for (species,tf_name) in tf_to_target:
            original_tf_name = tf_name
            try:
                ens_gene = uniprot_ensembl_db[species,tf_name]
                tf_to_target_ens[ens_gene]=tf_to_target[species,tf_name]
            except Exception:
                try:
                    tf_name = string.split(tf_name,'_')[0]
                    ens_gene = uniprot_ensembl_db[species,tf_name]
                    tf_to_target_ens[ens_gene]=tf_to_target[species,original_tf_name]
                except Exception:
                    try:
                        tf_names=[]
                        if '/' in tf_name: tf_names = string.split(tf_name,'/')
                        elif ' ' in tf_name: tf_names = string.split(tf_name,' ')
                        for tf_name in tf_names:
                            ens_gene = uniprot_ensembl_db[species,tf_name]
                            tf_to_target_ens[ens_gene]=tf_to_target[species,original_tf_name]
                    except Exception:
                        missing.append((tf_name,species))
        print 'Ensembl IDs found for UniProt Transcription factor names:',len(tf_to_target_ens),'and missing:', len(missing)
        #print missing[:20]

    ### Translate all species data to gene symbol to export for all species
    species_tf_targets={}
    for (species,tf_name) in tf_to_target:
        try:
            tf_db = species_tf_targets[species]
            tf_db[tf_name] = tf_to_target[species,tf_name]
        except Exception:
            tf_db = {}
            tf_db[tf_name] = tf_to_target[species,tf_name]
            species_tf_targets[species] = tf_db

    tf_dir = 'BuildDBs/PAZAR/symbol/tf-target.txt'
    tf_data = export.ExportFile(tf_dir)
    tf_to_symbol={}
    #print 'Exporting:',tf_dir
    #print len(species_tf_targets)
    for species in species_tf_targets:
        try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        except Exception: gene_to_source_id={}
        tf_db = species_tf_targets[species]
        for tf_name in tf_db:
            for tft in tf_db[tf_name]:
                try:
                    for symbol in gene_to_source_id[tft.Ensembl()]:
                        symbol = string.lower(symbol)
                        tf_id = tf_name+'(Source:'+tft.Project()+'-PAZAR'+')'
                        tf_data.write(tf_id+'\t'+symbol+'\n')
                        try: tf_to_symbol[tf_id].append(symbol)
                        except Exception: tf_to_symbol[tf_id] = [symbol]
                except Exception:
                    null=[]
    tf_data.close()
    tf_to_symbol = gene_associations.eliminate_redundant_dict_values(tf_to_symbol)
    return tf_to_symbol
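### A minimal sketch of two details from importPAZARAssociations above: how the two-letter species
### code is derived from the binomial species name, and how the exported TF identifier is composed.
### The example values are hypothetical.
def _examplePAZARIdentifiers():
    first,second = 'Homo sapiens'.split(' ')
    species_code = first[0]+second[0] ### -> 'Hs'
    tf_id = 'GATA1'+'(Source:'+'SomeProject'+'-PAZAR'+')' ### -> 'GATA1(Source:SomeProject-PAZAR)'
    return species_code,tf_id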