예제 #1
0
def importBioMarkerGeneAssociations():
    """Build a marker -> gene-symbol mapping from the BioMarkers text files.

    Every file in 'BuildDBs/BioMarkers/' whose name contains '.txt' is read
    as a tab-delimited table. The first row of each file is treated as a
    header: the column whose title contains 'marker-in' holds the marker
    names ('|'-separated when a gene belongs to several markers) and the
    column whose title contains 'Symbol' holds the gene symbol.

    Returns:
        The result of gene_associations.eliminate_redundant_dict_values()
        applied to a dict mapping each marker name to the list of
        lower-cased gene symbols found for it.

    Raises:
        UnboundLocalError: if no header processed so far contained the
            'marker-in' or 'Symbol' column.
        IndexError: if a data row is shorter than the detected columns.
    """
    marker_dir = 'BuildDBs/BioMarkers/'
    try:
        biomarker_files = unique.read_directory(marker_dir)
    except Exception:
        # Retry with the leading-slash form of the same directory.
        biomarker_files = unique.read_directory('/' + marker_dir)

    marker_symbol_db = {}
    for file_name in biomarker_files:
        if '.txt' not in file_name:
            continue
        fn = filepath(marker_dir + file_name)
        with open(fn, 'rU') as handle:
            header_parsed = False
            for line in handle:
                t = cleanUpLine(line).split('\t')
                if not header_parsed:
                    # Locate the columns from this file's own header. Indices
                    # found in an earlier file are kept when a title is absent.
                    header_parsed = True
                    for index, title in enumerate(t):
                        if 'marker-in' in title:
                            mi = index
                        if 'Symbol' in title:
                            sy = index
                    # The header row is not data; do not store it as a marker.
                    continue
                symbol = t[sy].lower()
                for marker in t[mi].split('|'):
                    marker_symbol_db.setdefault(marker, []).append(symbol)
    marker_symbol_db = gene_associations.eliminate_redundant_dict_values(marker_symbol_db)
    return marker_symbol_db
예제 #2
0
def importPAZARcompiled():
    """Load the pre-compiled PAZAR TF -> target-symbol table.

    Reads 'BuildDBs/PAZAR/symbol/tf-target.txt', the same two-column
    tab-delimited file (TF identifier, gene symbol) that
    importPAZARAssociations() writes. Use this when that file was
    downloaded directly and does not need to be rebuilt.

    Returns:
        The result of gene_associations.eliminate_redundant_dict_values()
        applied to a dict mapping each TF identifier to its list of symbols.

    Raises:
        ValueError: if a line does not contain exactly two tab-separated
            fields.
    """
    tf_dir = 'BuildDBs/PAZAR/symbol/tf-target.txt'
    tf_to_symbol = {}
    # The context manager closes the handle even if a malformed line raises.
    with open(filepath(tf_dir), 'rU') as handle:
        for line in handle:
            tf_id, symbol = cleanUpLine(line).split('\t')
            tf_to_symbol.setdefault(tf_id, []).append(symbol)
    tf_to_symbol = gene_associations.eliminate_redundant_dict_values(tf_to_symbol)
    return tf_to_symbol
예제 #3
0
def importAmandeusPredictions(force):
    """Load the Amadeus TF -> gene-symbol predictions.

    Args:
        force: when equal to the string 'yes', downloadAmadeusPredictions()
            is called before the local file is read.

    Reads 'BuildDBs/Amadeus/symbol-Metazoan-Amadeus.txt', skipping its first
    (header) row. Every other row must hold exactly three tab-separated
    fields: gene symbol, system and TF name.

    Returns:
        The result of gene_associations.eliminate_redundant_dict_values()
        applied to a dict mapping each TF name to its list of lower-cased
        gene symbols.

    Raises:
        ValueError: if a data row does not have exactly three fields.
    """
    if force == 'yes':
        downloadAmadeusPredictions()

    tf_symbol_db = {}
    fn = filepath('BuildDBs/Amadeus/symbol-Metazoan-Amadeus.txt')
    with open(fn, 'rU') as handle:
        for row_number, line in enumerate(handle):
            data = cleanUpLine(line)
            if row_number == 0:
                continue  # header row
            # The middle "system" column is read only to validate row shape.
            symbol, _system, tf_name = data.split('\t')
            tf_symbol_db.setdefault(tf_name, []).append(symbol.lower())
    tf_symbol_db = gene_associations.eliminate_redundant_dict_values(tf_symbol_db)
    return tf_symbol_db
예제 #4
0
def importPhenotypeOntologyGeneAssociations():
    """Load phenotype-ID -> gene-symbol associations.

    Reads 'BuildDBs/Pheno/HMD_HumanPhenotype.rpt', skipping its first
    (header) row. Columns used from each tab-delimited row:
        0: human symbol, 2: mouse symbol, 4: space-separated phenotype IDs.
    Both symbols are lower-cased and recorded (mouse first, then human)
    under every non-empty phenotype ID on the row.

    Returns:
        The result of gene_associations.eliminate_redundant_dict_values()
        applied to a dict mapping each phenotype ID to its list of symbols.

    Raises:
        IndexError: if a data row has fewer than five columns.
    """
    pheno_symbol = {}
    fn = filepath('BuildDBs/Pheno/HMD_HumanPhenotype.rpt')
    with open(fn, 'rU') as handle:
        for row_number, line in enumerate(handle):
            data = cleanUpLine(line)
            if row_number == 0:
                continue  # header row
            t = data.split('\t')
            hs_symbol = t[0].lower()
            mm_symbol = t[2].lower()
            pheno_ids = t[4].split(' ')
            for pheno_id in pheno_ids:
                if not pheno_id:
                    # Repeated or trailing spaces produce empty IDs.
                    continue
                pheno_symbol.setdefault(pheno_id, []).extend([mm_symbol, hs_symbol])
    pheno_symbol = gene_associations.eliminate_redundant_dict_values(pheno_symbol)
    return pheno_symbol
예제 #5
0
def importPAZARAssociations():
    """Compile PAZAR TF -> target-gene-symbol associations and export them.

    Steps:
      1. Parse every file in the PAZAR directory whose name contains '.csv'.
         Each usable line has 11 tab-delimited fields; a TFTargetInfo is
         stored per line under the key (species code, TF name), where the
         species code is the first letter of each of the two words in the
         species field. Lines that fail to parse are skipped.
      2. (Disabled by the hard-coded determine_tf_geneids = 'no'.) Map TF
         names to Ensembl genes via UniProt annotations.
      3. Regroup the associations by species, translate each target's
         Ensembl ID to lower-cased gene symbols, and write
         '<tf_id>\\t<symbol>' lines to 'BuildDBs/PAZAR/symbol/tf-target.txt',
         where tf_id is '<TF name>(Source:<project>-PAZAR)'.

    Returns:
        The result of gene_associations.eliminate_redundant_dict_values()
        applied to a dict mapping each tf_id to its list of symbols.
    """
    pazar_files = unique.read_directory('/BuildDBs/PAZAR')
    species_db = {}
    tf_to_target = {}
    for file_name in pazar_files:
        if '.csv' not in file_name:
            continue
        fn = filepath('BuildDBs/PAZAR/' + file_name)
        with open(fn, 'rU') as handle:
            for line in handle:
                data = cleanUpLine(line)
                try:
                    ### Each line contains the following 11 tab-delim fields:
                    ### Fields are: <PAZAR TF ID>  <TF Name>  <PAZAR Gene ID>  <ensembl gene accession>  <chromosome>  <gene start coordinate>  <gene end coordinate>  <species>  <project name>  <PMID>  <analysis method>
                    (pazar_tf_id, tf_name, pazar_geneid, ens_gene, chrom,
                     gene_start, gene_end, species_name, project, pmid,
                     analysis_method) = data.split('\t')
                    first_word, second_word = species_name.split(' ')
                    species = first_word[0] + second_word[0]
                    tft = TFTargetInfo(tf_name, ens_gene, project, pmid, analysis_method)
                    tf_to_target.setdefault((species, tf_name), []).append(tft)
                    species_db[species] = []
                except Exception:
                    ### Occurs due to file formatting issues (during an update?)
                    pass

    determine_tf_geneids = 'no'
    if determine_tf_geneids == 'yes':
        ### The below code is probably most useful for creation of complex
        ### regulatory inference networks in Cytoscape
        uniprot_ensembl_db = importUniProtAnnotations(species_db)
        missing = []
        tf_to_target_ens = {}
        for (species, tf_name) in tf_to_target:
            original_tf_name = tf_name
            try:
                # First attempt: the TF name exactly as stored.
                ens_gene = uniprot_ensembl_db[species, tf_name]
                tf_to_target_ens[ens_gene] = tf_to_target[species, tf_name]
            except Exception:
                try:
                    # Second attempt: drop everything after the first '_'.
                    tf_name = tf_name.split('_')[0]
                    ens_gene = uniprot_ensembl_db[species, tf_name]
                    tf_to_target_ens[ens_gene] = tf_to_target[species, original_tf_name]
                except Exception:
                    try:
                        # Last attempt: treat the name as several TFs joined
                        # by '/' or ' ' and look each one up.
                        tf_names = []
                        if '/' in tf_name:
                            tf_names = tf_name.split('/')
                        elif ' ' in tf_name:
                            tf_names = tf_name.split(' ')
                        for tf_name in tf_names:
                            ens_gene = uniprot_ensembl_db[species, tf_name]
                            tf_to_target_ens[ens_gene] = tf_to_target[species, original_tf_name]
                    except Exception:
                        missing.append((tf_name, species))
        print('Ensembl IDs found for UniProt Transcription factor names: %s and missing: %s'
              % (len(tf_to_target_ens), len(missing)))
        #print missing[:20]

    ### Translate all species data to gene symbol to export for all species
    species_tf_targets = {}
    for (species, tf_name) in tf_to_target:
        species_tf_targets.setdefault(species, {})[tf_name] = tf_to_target[species, tf_name]

    tf_dir = 'BuildDBs/PAZAR/symbol/tf-target.txt'
    tf_data = export.ExportFile(tf_dir)
    tf_to_symbol = {}
    try:
        for species in species_tf_targets:
            # A species without an Ensembl-Symbol table contributes nothing.
            try:
                gene_to_source_id = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
            except Exception:
                gene_to_source_id = {}
            tf_db = species_tf_targets[species]
            for tf_name in tf_db:
                for tft in tf_db[tf_name]:
                    try:
                        for symbol in gene_to_source_id[tft.Ensembl()]:
                            symbol = symbol.lower()
                            tf_id = tf_name + '(Source:' + tft.Project() + '-PAZAR' + ')'
                            tf_data.write(tf_id + '\t' + symbol + '\n')
                            tf_to_symbol.setdefault(tf_id, []).append(symbol)
                    except Exception:
                        # Best effort: targets that cannot be translated or
                        # written are skipped.
                        pass
    finally:
        # Close the export file even if an unexpected error propagates.
        tf_data.close()
    tf_to_symbol = gene_associations.eliminate_redundant_dict_values(tf_to_symbol)
    return tf_to_symbol