/
statistic.py
109 lines (86 loc) · 3.39 KB
/
statistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
import networkx as nx
from dag import *
import config
import utils
# Compute the fraction of genes in the network that share at least one GO term
# with at least one of their neighbours
def compute_term_in_neighbour_ratio():
    """Print the share of genes that have a GO term in common with a neighbour.

    Reads the module-level ``network`` and ``gene_annotation``. A gene is
    counted once as soon as any of its neighbours has an annotation term in
    common with it. The printed value is count / number of nodes (a fraction,
    not multiplied by 100).
    """
    genes = network.nodes()
    total_gene = len(genes)
    sharing = 0
    for gene in genes:
        own_terms = set(gene_annotation[gene])
        # any() stops at the first neighbour with a common term, so each gene
        # contributes at most one to the tally.
        has_common_term = any(
            own_terms.intersection(gene_annotation[neighbour])
            for neighbour in network.neighbors(gene)
        )
        if has_common_term:
            sharing += 1
    print("Percentage of genes in the network that has at least one GO term in any one of its neighbour: %f" % (float(sharing) / total_gene))
# Compute the average number of annotation terms per gene in the network
def compute_avg_term_num():
    """Print the mean number of annotation terms over all network genes.

    Reads the module-level ``network`` and ``gene_annotation``; the term count
    of a gene is ``len(gene_annotation[gene])``.
    """
    genes = network.nodes()
    gene_count = len(genes)
    term_total = sum(len(gene_annotation[gene]) for gene in genes)
    print("Average term num: %f" % (float(term_total) / gene_count))
# Compute the similarity of two genes from their annotations, using the
# maximum term-to-term similarity
def compute_gene_sim_max(terms1, terms2):
    """Return the highest cached similarity over all term pairs.

    Every pair (t1, t2) with t1 taken from ``terms1`` and t2 from ``terms2``
    is looked up as ``sim_cache[t1][t2]`` in the module-level ``sim_cache``.
    Returns -1.0 when there are no pairs or when no pair scores above -1.0.
    """
    # Seed with the -1.0 floor so that empty inputs still yield a value.
    candidates = [-1.0]
    candidates.extend(sim_cache[t1][t2] for t1 in terms1 for t2 in terms2)
    return max(candidates)
# Compute the similarity of two genes from their annotations, using the sum
# of all term-to-term similarities
def compute_gene_sim_total(terms1, terms2):
    """Return the sum of cached similarities over all term pairs.

    Every pair (t1, t2) with t1 taken from ``terms1`` and t2 from ``terms2``
    is looked up as ``sim_cache[t1][t2]`` in the module-level ``sim_cache``.
    Returns 0.0 when either collection is empty.
    """
    pair_sims = (sim_cache[t1][t2] for t1 in terms1 for t2 in terms2)
    running = 0.0
    # Plain left-to-right accumulation, in pair order.
    for value in pair_sims:
        running += value
    return running
def _mean_gene_sim(terms, others):
    # Average compute_gene_sim_total(annotation of other, terms) over `others`;
    # NaN when `others` is empty, since the mean is undefined there.
    total = 0.0
    count = 0
    for other in others:
        total += compute_gene_sim_total(gene_annotation[other], terms)
        #total += compute_gene_sim_max(gene_annotation[other], terms)
        count += 1
    if count == 0:
        return float("nan")
    return total / count


def compute_avg_sim():
    """Print, per gene, its average similarity to neighbours and non-neighbours.

    Reads the module-level ``network`` and ``gene_annotation``. Output is a
    header line followed by one "gene, neighbour avg, non-neighbour avg" line
    per node. Gene-to-gene similarity is ``compute_gene_sim_total``.

    An average over an empty group (a gene with no neighbours, or a gene
    adjacent to every node) is printed as ``nan`` instead of raising
    ZeroDivisionError.
    """
    print("Gene, Neighbor sim avg, Non-neighbor sim avg")
    # Materialise once: nodes()/neighbors() may be lazy views or iterators
    # depending on the networkx version.
    all_nodes = list(network.nodes())
    for gene in all_nodes:
        terms = gene_annotation[gene]
        # The list keeps the original iteration order for the float sum; the
        # set gives O(1) membership tests in the non-neighbour scan below.
        neighbors = list(network.neighbors(gene))
        neighbor_set = set(neighbors)

        sim_avg = _mean_gene_sim(terms, neighbors)

        # NOTE(review): the gene itself is not in its own neighbour list
        # (unless it has a self-loop), so it is counted here as a
        # "non-neighbour" and its self-similarity enters the average.
        # Kept as in the original; confirm whether that is intended.
        non_neighbors = (node for node in all_nodes if node not in neighbor_set)
        non_sim_avg = _mean_gene_sim(terms, non_neighbors)

        print("%s, %f, %f" % (gene, sim_avg, non_sim_avg))
if __name__ == "__main__":
    # Load the GO DAG, the gene annotations and the gene network. The names
    # assigned here are module-level globals read by the compute_* functions.
    dag = DAG(config.go_fpath)
    gene_annotation = utils.get_annotation(config.annotation_fpath, config.filtered_annotation_fpath, dag.get_root().id)
    # term_ic is not used below.
    # NOTE(review): presumably calculate_ic also writes config.ic_fpath -
    # confirm before removing this call.
    term_ic = utils.calculate_ic(gene_annotation, dag, config.ic_fpath)
    network = utils.create_network(config.network_fpath)

    # Remove unannotated genes from the network. Collect them first so the
    # graph is never modified while its node collection is being iterated.
    unannotated = [node for node in network.nodes() if node not in gene_annotation]
    network.remove_nodes_from(unannotated)

    # Drop isolated nodes and small fragments by keeping only the largest
    # connected component. connected_components() is used instead of
    # connected_component_subgraphs(...)[0], which is not subscriptable where
    # it returns a generator and is absent from newer networkx releases.
    components = list(nx.connected_components(network))
    if not components:
        raise SystemExit("No annotated genes left in the network; nothing to analyse.")
    largest = max(components, key=len)
    # copy() yields an independent graph whether subgraph() returns a view
    # or a new graph.
    network = network.subgraph(largest).copy()

    sim_cache = utils.read_sim(config.simcache_fpath)

    # Uncomment the statistics to run.
    #compute_term_in_neighbour_ratio()
    #compute_avg_term_num()
    compute_avg_sim()