Пример #1
0
def findEnrichedAnnotations_file(fgnd_file, bgnd_file, annotation_file, motifs):
    """  File-based front end for findEnrichedAnnotations.

         Loads the foreground genes, the background genes and the
         per-protein annotations (restricted to the given motifs) from
         files, then delegates the enrichment test to
         findEnrichedAnnotations.

         NOTE(review): according to the original documentation the
         delegate performs 1000 background permutations and returns a
         motif {} whose entries are [#of proteins with motif, |fg|,
         # of times bg permutation is as good as fg]; that logic is not
         visible from this wrapper -- confirm against
         findEnrichedAnnotations.

    @param fgnd_file: file of foreground genes
    @param bgnd_file: file of background genes
    @param annotation_file: one annotation per line
    @param motifs: {} of motifs you are checking for enrichment
    @return: whatever findEnrichedAnnotations returns for these inputs
    """

    fg_ls = utils_graph.getNodes(fgnd_file)
    bg_ls = utils_graph.getNodes(bgnd_file)
    annotations = utils_motif.protein2annotation_forMotifs(annotation_file,
                                                           motifs)
    # Pass fg_ls: the original referenced the misspelled name `fs_ls`,
    # which raised NameError on every call.
    return findEnrichedAnnotations(fg_ls, bg_ls, annotations, motifs)
Пример #2
0
def annotate(entrez_gene_ls_file, gene2go_file):
    """Print one tab-separated line per (gene, GO category, GO term).

    Each printed line holds, in order: the gene, the first key of the
    term-id lookup entry for the GO term, the term's entry in the
    max-distance-from-root table, and the category.

    @param entrez_gene_ls_file: file handed to utils_graph.getNodes to
                                obtain the genes
    @param gene2go_file: gene2go file, used to find the tax id and to
                         build the gene -> category -> term mapping
    """
    gene_ls = utils_graph.getNodes(entrez_gene_ls_file)
    tax_id = getTaxID(gene_ls, gene2go_file)
    # The result is not used below; the call is kept exactly as in the
    # original in case it has side effects.
    informative_terms = getInformativeTerms(tax_id, 0)
    gene2go = parseGene2GO(gene2go_file, gene_ls)
    id2name = getTermIDtoGOterm()
    child2parents = getChild2Parents(tax_id, id2name)
    # Return value is ignored; presumably this updates gene2go in place.
    getGOAnnotations(tax_id, gene2go, child2parents)
    termID2term = term_id2term()
    distances = getMaxDistanceFromRoot()
    for protein in gene2go.keys():
        by_category = gene2go[protein]
        for category in by_category.keys():
            for go_term in by_category[category].keys():
                fields = (protein,
                          termID2term[go_term].keys()[0],
                          str(distances[go_term]),
                          category)
                print('\t'.join(fields))
Пример #3
0
def mkFASTAfromFile(geneLsFile):
    """Fetch sequences for the genes in a file, 500 genes per request.

    Genes are accumulated into a comma-terminated query string. Every
    500 genes the query is handed to wget_fasta under the name
    'ncbi.query_<batch number>' and a new query is started; a final,
    partially filled batch is fetched the same way. Every fetched file
    is then passed to parseWget.

    NOTE(review): `wget_files` and `fout` are not defined inside this
    function; they are presumably module-level names -- confirm they
    exist (and that wget_files starts empty) before calling.

    @param geneLsFile: file handed to utils_graph.getNodes to obtain
                       the genes
    """
    batch_size = 500
    genes = utils_graph.getNodes(geneLsFile)
    query = ''
    count = 0
    for gene in genes:
        query = query + gene + ','
        count += 1
        if count % batch_size == 0:
            # `//` keeps the batch number an integer on Python 2 and 3.
            wget_name = 'ncbi.query_' + str(count // batch_size)
            wget_files.append(wget_name)
            wget_fasta(query, wget_name)
            # Start the next batch only after a full one was fetched.
            # The original reset the query (and called wget_fasta with a
            # possibly unbound wget_name) on every single iteration.
            query = ''
    if query:
        # Left-over genes that did not fill a whole batch. Skipped when
        # the gene count is an exact multiple of the batch size, so no
        # empty query is sent.
        wget_name = 'ncbi.query_' + str(count // batch_size + 1)
        wget_fasta(query, wget_name)
        wget_files.append(wget_name)
    for f in wget_files:
        parseWget(f, fout)
Пример #4
0
def prepAndColor(pathway, gene_file, html_color):
    """ Color these genes for this pathway and download the result.

        Every gene from gene_file is sent to the KEGG SOAP service with
        a black foreground and html_color as background; the URL the
        service returns is downloaded with wget.

        NOTE(review): the original documentation states that Cell
        Communication (path:hsa01430) is not colored. Nothing in this
        function filters that pathway, so this is presumably a
        limitation of the service -- confirm.

    @param pathway: KEGG pathway, format path:hsa04010
    @param gene_file: one gene per line
    @param html_color: background color, html format
    @return: name of the downloaded file,
             '<pathway id after the colon>_<html_color>'
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    gene_dict = utils_graph.getNodes(gene_file)
    obj_ls = list(gene_dict.keys())
    # One foreground/background color per gene, in the same order.
    fore_gnd = ['black'] * len(obj_ls)
    back_gnd = [html_color] * len(obj_ls)

    wsdl = 'http://soap.genome.jp/KEGG.wsdl'
    serv = WSDL.Proxy(wsdl)
    url = serv.color_pathway_by_objects(pathway, obj_ls, fore_gnd, back_gnd)
    name = pathway.split(':')[-1] + '_' + html_color
    # Argument list instead of a shell string: the URL comes from a remote
    # service and the color from the caller, so neither may be interpreted
    # by a shell (e.g. '&' or ';' inside the URL).
    # Unlike os.system, this raises OSError if wget cannot be started.
    subprocess.call(['wget', '--output-document=' + name, url])
    return name
Пример #5
0
""" Make a plot of host ELM sequence frequencies
    for uniq influenza ELM sequences."""
from collections import defaultdict
import utils, os, utils_graph

# ELMs accepted by write_file below: a row is only considered when its
# ELM is contained in this collection. Loaded once at import time from a
# hard-coded working file.
good_elms = utils_graph.getNodes('working/Jul7/good_phylogeny_elms')

def write_file(fname, uniq, this_host_freqs, that_host_freqs, this, that):
    used = {}
    with open(fname, 'w') as f:
        f.write('ELM\tSeq\tHost\tFreq\n')
        for elmseq in uniq:
            protein, elm, seq = elmseq.split(':')
            #new_seq = utils.mk_sub(seq)
            #new_seq = seq
            key = elm + ':' + seq
            #seq = new_seq
            if 'LIG' in key and key not in used and elm in good_elms:
                
                used[key] = True
                this_val = float(0)
                that_val = float(0)
                if key in this_host_freqs:
                    this_val = this_host_freqs[key]
                if key in that_host_freqs:
                    that_val = that_host_freqs[key]
                diffpos = max([float(0), this_val - that_val])
                diffneg = max([float(0), that_val - this_val])
                if this_val and that_val:
                    # f.write('%s\t%s\t%s\t%.10f\n' %
                    #         (elm, seq, this, this_val))
Пример #6
0
   to pick an arbitrary cutoff, and look
   at hub enrichment compared to all hits
   in the file.  All hits is not the best
   way to go; really I need the ~1700 genes
   that were tested & could be knocked
   down w/o killing the cell, but these
   are not available.
"""
import utils_stats, utils_graph

# Hard-coded input files: RNAi screen table, hub list, interaction network.
flu_rnai_file = '../Thesis/Data/Network/Flu/cell09/all_rnai'
hubs_file = '../Thesis/Data/Hubs2/HPRD.entrez.expand.hubs20'
net_file = '../Thesis/Data/Network/Human/HPRD/hprd_new.intr.ls.entrez'

network_genes = set(utils_graph.getNodes(net_file))
hubs = set(utils_graph.getNodes(hubs_file))

# Both dicts are used as sets of gene ids (value is always True).
all_rnai = {}
replication_rnai = {}
with open(flu_rnai_file) as f:
    for line in f:
        # Four tab-separated numeric columns per line.
        entrez, delNS1, vRNA, replication = map(float,
                                                line.strip().split('\t'))
        ID = str(int(entrez))
        all_rnai[ID] = True
        # NOTE(review): membership in replication_rnai is decided by the
        # vRNA column; the `replication` column is read but never used --
        # confirm this is intended.
        if vRNA < 0.0:
            replication_rnai[ID] = True

# Background: screened genes that are also present in the network.
bg = network_genes & set(all_rnai)
rep_set = set(replication_rnai)
Пример #7
0
""" For the input ELMs, take the JS
    divergence for chicken/human H5N1
    to human & chicken. For which ELMs
    does the hypothesis hold? Sample equally
    from flus to avoid biases. """
import sys, utils, os, utils_graph
from collections import defaultdict

# First command-line argument: file listing the ELMs to work with.
elm_file = sys.argv[1]
working_elms = utils_graph.getNodes(elm_file)

flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []
elm2seqs = defaultdict(dict)
flus = ("human", "chicken")
for flu in flus:
    # NOTE(review): the module docstring talks about H5N1 for both hosts,
    # but the human flu is read from an H3N2 2008 file -- confirm.
    is_human = "human" in flu
    flu_dir = "working/Jul1_year" if is_human else "working/Jul1_year/"
    flu_suffix = ".H3N2.2008.elms" if is_human else ".H5N1.2006.elms"
    flu_elm_file = os.path.join(flu_dir, flu + flu_suffix)
    # Fills flu_counts and seen_seqs for this flu (both are passed in and
    # read back afterwards).
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False)
    # Group the observed 'ELM:sequence' keys by their ELM.
    for elmseq in seen_seqs[flu]:
        elm, seq = elmseq.split(":")
        elm2seqs[elm][elmseq] = True

counts = utils.count_host_elmSeqs(
    ("Gallus_gallus", "H_sapiens"),
    False,
    {},
    "working/Jun29/",
    working_elms,
    ".init",
)
Пример #8
0
import utils_motif, utils_graph

# ELMs to use, read from the file 'use_elms' in the current directory.
use_elms = utils_graph.getNodes('use_elms')

# ELM annotations for human and swine H1N1, each loaded from a plain file
# and from a '.90' file. Presumably the '.90' files hold the conserved
# subset (hence the *_conserved names) -- TODO confirm.
human = utils_motif.protein2annotation('human.H1N1.elms',
                                       {'ELM':True})
human_conserved = utils_motif.protein2annotation('human.H1N1.elms.90',
                                                 {'ELM':True})
swine = utils_motif.protein2annotation('swine.H1N1.elms',
                                       {'ELM':True})
swine_conserved = utils_motif.protein2annotation('swine.H1N1.elms.90',
                                                 {'ELM':True})

def get_entropy(afile):
    """Read a two-column, tab-separated file of per-ELM entropies.

    Blank lines are skipped. When an ELM occurs on more than one line,
    the last occurrence wins.

    @param afile: path of a file with one 'ELM<TAB>entropy' record per line
    @return: {} mapping each ELM to its entropy as a float
    @raise ValueError: if a non-blank line does not have exactly two
                       tab-separated fields or the entropy is not a number
    """
    entropy = {}
    with open(afile) as f:
        for line in f:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. an extra newline at the end
                # of the file) instead of failing on the unpack below.
                continue
            elm, entropy_st = record.split('\t')
            # The original first stored an empty dict under the key and
            # then immediately overwrote it; only the float is kept.
            entropy[elm] = float(entropy_st)
    return entropy

def get_best_seq(seqs):
    ls = []
    for seq in seqs:
        ls.append([seqs[seq],seq])
    ls.sort()
    #if len(ls) > 1:
    #    print ls[0], ls[1]
    
Пример #9
0
#!/usr/bin/env python

"""For each HCV protein, calcuate the likelyhood
   of the GO BP similarity between predictions
   and gold standard. Do this for H1H2 & H1.
"""
import sys, utils_stats, utils_graph, utils_humanVirus, random, os

# Command line: HHE target pairs, predictions, background genes, output.
# NOTE(review): out_file is read but not used in the visible part of the
# script; results are printed to stdout.
hhe_file = sys.argv[1]
hhp_file = sys.argv[2]
background_file = sys.argv[3]
out_file = sys.argv[4]

# The similarity file read in the loop below is produced by the following
# call, which takes a long time and is therefore left disabled:
# utils_stats.gene_set_go_sim(background_file, 'results/HPRD.ls.entrez.gosim')

hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(hhe_file)
pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(hhp_file)
all_hps = utils_graph.getNodes(background_file)

for pred_type in ('h1', 'h1h2'):
    vp2preds = pred2vp2hp[pred_type]
    for vp in vp2preds.keys():
        # Only viral proteins that also have gold-standard targets.
        if vp not in hhe_vp2hp:
            continue
        # Gold-standard targets restricted to the background genes.
        shared = utils_graph.intersectLists([hhe_vp2hp[vp], all_hps])
        hhe = shared.keys()
        preds = vp2preds[vp].keys()
        go_pval = utils_stats.gene_set_go_sim_pval(
            preds, hhe, 'results/HPRD.ls.entrez.gosim')
        print('%s\t%s\t%.3f' % (vp, pred_type, go_pval))
Пример #10
0
""" Are the 295 genes from the chandra 2010
    nature paper enriched in HPRD hubs?
"""
import utils_graph, utils_stats

def _node_set(path):
    """Return set(utils_graph.getNodes(path)) for one input file."""
    return set(utils_graph.getNodes(path))


# Hard-coded input files.
rnai_file = '../Thesis/Data/Network/Flu/nature2010/rnai_hits'
hubs_file = '../Thesis/Data/Hubs2/HPRD.entrez.expand.hubs20'
net_file = '../Thesis/Data/Network/Human/HPRD/hprd_new.intr.ls.entrez'
party_file = '../Thesis/Data/Hubs2/2.party'
date_file = '../Thesis/Data/Hubs2/2.date'

# Loaded in the same order as the files are listed above.
rnai_genes = _node_set(rnai_file)
network_genes = _node_set(net_file)
hubs = _node_set(hubs_file)
party = _node_set(party_file)
date = _node_set(date_file)

# Number of RNAi hits among the party entries and among the date entries,
# separated by a single space.
party_hits = len(party & rnai_genes)
date_hits = len(date & rnai_genes)
print('%d %d' % (party_hits, date_hits))


Пример #11
0
    "ELM",
    "../../Data/ELM/Human/human.website.elm",
    "ELM",
    "../../Data/ProfileScan/all.ProfileScan.scanHPRD.notNCBI",
    "ProfileScan",
    "../../Data/Network/Human/HPRD/hprd.intr",
    "../../Data/human.hprd.prosite",
    "../../Data/Network/Human/HPRD/version2entrezgeneid",
    "../../Data/Binding_Relations/ELM.ProfileScan.pairs",
    "some out 1",
    "some out 2",
]
# Hand the command line, the argument descriptions and the example values
# to the shared start-up check (presumably it validates the argument
# count and prints usage -- see utils_scripting for the exact behavior).
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

# argv[1], argv[2]: virus annotation file and the annotation type to keep.
virus_elm2protein = utils_motif.annotation2protein(sys.argv[1], {sys.argv[2]: True})
# argv[8]: human proteins in the study. Read before argv[3..7] because the
# two loaders below take it as an argument.
study_hps = utils_graph.getNodes(sys.argv[8])
# argv[3], argv[4]: human ELM annotation file and annotation type.
human_elm2protein = utils_motif.annotation2protein_forProteins(sys.argv[3], {sys.argv[4]: True}, study_hps)
# argv[5], argv[6]: human domain annotation file and annotation type.
human_cd2protein = utils_motif.annotation2protein_forProteins(sys.argv[5], {sys.argv[6]: True}, study_hps)
# argv[7]: interaction network file.
network = utils_graph.getEdges(sys.argv[7])
# argv[9]: version -> Entrez gene id mapping file.
version2geneid = utils_humanVirus.get_version2entrez(sys.argv[9])
# argv[10]: ELM -> binding domain pairs file.
elm2cd = utils_humanVirus.get_elm2prosites(sys.argv[10])
# argv[11], argv[12]: the two output file paths.
outf1 = sys.argv[11]
outf2 = sys.argv[12]

vp_to_h1_to_h2 = {}
with open(outf1, "w") as f:
    for elm in virus_elm2protein.keys():
        if human_elm2protein.has_key(elm):
            h2_noRestrictions = human_elm2protein[elm]
            h2 = {}
            h1 = {}
Пример #12
0
"""Use Jensen-Shannon divergence 
   to make a dendrogram for eukaryotic hosts.
   Choose to cluster the ELM sequences before
   calculating JS divergence.
   To skip clustering, enter NA as the first
   argument. Otherwise, enter a closest flu 
   distance file computed by flu_project_host_flu_closest.py.
"""
import itertools, sys, os, utils, random, global_settings, numpy, utils_plot, utils_graph
from collections import defaultdict

# NOTE(review): the module docstring describes the first argument as NA
# or a closest-flu distance file, but the code below treats it as a
# results directory -- confirm which is current.
results_dir = sys.argv[1] # working/runs/Jun24/
out_file = sys.argv[2]

# Sequences to use, listed in 'test_host_seqs' inside the results directory.
f = os.path.join(results_dir, 'test_host_seqs')
use_seqs = utils_graph.getNodes(f)

counts = utils.count_host_seqs(
    global_settings.PLT_GENOMES, results_dir, use_seqs, '.init')

# Single pass over the hosts: collect each host's count table in `ls`
# and record every ELM sequence seen for any host in `all_elmSeqs`.
ls = []
all_elmSeqs = {}
for host in counts:
    host_counts = counts[host]
    ls.append(host_counts)
    for elmSeq in host_counts:
        all_elmSeqs[elmSeq] = True

host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
host_dists = utils.mk_count_dists(host_vecs)
utils_plot.phylogeny_js(os.path.join(results_dir,
Пример #13
0
    "../../Data/prosite.id2name",
    "../../Data/ProfileScan/MyLists/" "outfile",
]
# NOTE(review): the last two entries of `examples` above are written as
# "../../Data/ProfileScan/MyLists/" "outfile" with no comma between them,
# so Python joins them into one string and the list is one entry short --
# confirm and add the comma.
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

# argv[1]: ELM -> binding domain pairs; argv[2]: id -> name table;
# argv[3]: directory prefix of the gene-list files; argv[4]: output file.
elm2prosite = utils_humanVirus.get_elm2prosites(sys.argv[1])
cd2nameFile = sys.argv[2]
mylistdir = sys.argv[3]
outfile = sys.argv[4]

# id -> name, one tab-separated pair per line. `with` closes the file even
# if a malformed line makes the unpack fail.
cd2name = {}
with open(cd2nameFile) as f:
    for line in f:
        # `cd_id` rather than `id`, to avoid shadowing the builtin.
        cd_id, name = line.strip().split("\t")
        cd2name[cd_id] = name

elms = sorted(elm2prosite.keys())
with open(outfile, "w") as f:
    f.write("ELM\tBinding PROSITE or Entrez Gene IDs\n")
    for elm in elms:
        for cd in elm2prosite[elm].keys():
            if cd in cd2name:
                # Known id: write its name.
                f.write(elm + "\t" + cd2name[cd] + "\n")
            else:
                # Otherwise `cd` names a gene-list file; write its genes
                # separated by ';'.
                # NOTE(review): plain concatenation, so mylistdir must
                # end with a path separator.
                genes = utils_graph.getNodes(mylistdir + cd)
                # strip(";") is kept so the output matches the original
                # concatenate-then-strip form exactly.
                genes_to_print = ";".join(genes.keys()).strip(";")
                f.write(elm + "\t" + genes_to_print + "\n")
Пример #14
0
"""Look up stats for the 35
   ELM sequences unique to mammal
   flu."""
import utils_graph

# Entries of the hard-coded 'mU' working file; per the module docstring
# these are presumably the ELM sequences unique to mammal flu -- confirm.
mU = utils_graph.getNodes("working/Jul1_year/mU")
# Flu host species names; how they are used is outside the visible part
# of this script.
species = ("chicken", "duck", "swine", "human", "equine")
Пример #15
0
    for each vp and all.
"""
import utils_scripting, utils_humanVirus, utils_graph, sys, utils_stats

# Descriptions of the four required command-line arguments, in order.
req_args = ["niaid triplet file", "prediction file", "human proteins in study", "output file"]
# One example value per required argument, in the same order.
examples = [
    "../../Runs/Clustering.domain.s/all_niaid_triplets",
    "../../Runs/Conservation70_Cutoff.2_Window10",
    "../../Data/human.hprd.prosite",
    "some out file",
]
# Shared start-up check (presumably validates the argument count and
# prints usage -- see utils_scripting for the exact behavior).
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

# argv[1]: gold-standard (HHE) virus protein -> human protein targets.
hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(sys.argv[1])
# argv[2]: predictions, keyed by prediction type, then virus protein.
pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(sys.argv[2])
# argv[3]: human proteins in the study; the gold standard is intersected
# with these further down.
all_hps = utils_graph.getNodes(sys.argv[3])

with open(sys.argv[4], "w") as fout:
    fout.write("Prediction Type\tVP\tHHE\tHHP\tMatch\tPrecsion\tRecall\tRandomPrecision\tPval\n")
    for predtype in pred2vp2hp.keys():
        for vp in pred2vp2hp[predtype].keys():
            if hhe_vp2hp.has_key(vp):
                hhe = utils_graph.intersectLists([hhe_vp2hp[vp], all_hps])

                hhe_len = len(hhe.keys())
                preds = pred2vp2hp[predtype][vp]
                preds_len = len(preds.keys())
                match = utils_graph.intersectLists([hhe, preds])
                match_len = len(match.keys())
                precision = int(round(float(100) * float(match_len) / float(preds_len)))
                if hhe_len > 0: