-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_HSPInt_graph.py
executable file
·162 lines (120 loc) · 5.28 KB
/
build_HSPInt_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os, re
from cPickle import dump,load, HIGHEST_PROTOCOL
import networkx as nx
import configurations as conf
import util
import sys
#, query, target, query_start, query_end, target_start, target_end, align_length, identity):
# Reads the relevant information from one line of the all-to-all BLASTp results. The tab-separated format of each result line is shown below:
#[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11]
# query, subject, %id, alignlen, mismatch, gapopen, qst, qend, sst, send, Evalue, bitscore
def read_HSP(line):
    """Parse one tab-separated BLASTp result line into an HSP dictionary.

    Columns 0 and 1 are used as the query and target identifiers, columns
    6-9 as the integer query/target start and end coordinates, and column
    10 as the floating-point E-value.  The two ``*_len`` entries are
    computed as ``end - start``.

    Returns a dict with the keys ``query_id``, ``target_id``,
    ``query_start``, ``query_end``, ``query_len``, ``target_start``,
    ``target_end``, ``target_len`` and ``EValue``.
    """
    fields = line.split("\t")
    query_id, target_id = fields[0], fields[1]
    query_start, query_end = int(fields[6]), int(fields[7])
    target_start, target_end = int(fields[8]), int(fields[9])
    return {
        "query_id": query_id,
        "target_id": target_id,
        "query_start": query_start,
        "query_end": query_end,
        "query_len": query_end - query_start,
        "target_start": target_start,
        "target_end": target_end,
        "target_len": target_end - target_start,
        "EValue": float(fields[10]),
    }
def nodeName(hsp, focus):
    """Return the graph node name for one side of an HSP.

    ``focus`` selects the side: ``"target"`` or ``"query"``.  The node name
    is the tuple ``(protein_id, str(start), str(end))`` built from the
    matching ``<focus>_id``, ``<focus>_start`` and ``<focus>_end`` entries
    of ``hsp``.  Any other ``focus`` value yields the empty string.
    """
    if focus == "target" or focus == "query":
        return (
            hsp[focus + "_id"],
            str(hsp[focus + "_start"]),
            str(hsp[focus + "_end"]),
        )
    return ""
# Compares the intervals of two node names (each of the form (protein, start, end))
# and returns a list of nodeName pairs between which an interval edge should be added.
def findOverlapIntervals(name1, name2, cutoffRatio):
    """Return ``[(name1, name2)]`` if the two intervals overlap enough, else ``[]``.

    ``name1`` and ``name2`` are node names of the form
    ``(protein, start, end)``; start and end are converted with ``int``.
    The overlap is obtained from ``util.overlap(start1, end1, start2, end2)``
    (presumably the length of the shared region -- confirm against ``util``)
    and divided by each interval's length (``end - start``).  The pair is
    returned when the larger of the two ratios is strictly greater than
    ``cutoffRatio``.

    If the two names refer to different proteins an "Error!!!" line is
    written to stdout, but the comparison is still carried out.
    """
    start1, end1 = int(name1[1]), int(name1[2])
    start2, end2 = int(name2[1]), int(name2[2])
    overlap = float(util.overlap(start1, end1, start2, end2))

    # A zero-length interval (start == end) has no defined overlap ratio;
    # count it as 0.0 instead of letting the division raise ZeroDivisionError.
    length1 = end1 - start1
    length2 = end2 - start2
    ratio1 = overlap / length1 if length1 else 0.0
    ratio2 = overlap / length2 if length2 else 0.0

    if name1[0] != name2[0]:
        # Same text the former ``print "Error!!! ", name1, name2`` statement
        # produced, written in a form that is valid on Python 2 and 3.
        sys.stdout.write("Error!!!  " + str(name1) + " " + str(name2) + "\n")

    # add the nodeNamePair only when the best ratio clears the cutoff
    if max(ratio1, ratio2) > cutoffRatio:
        return [(name1, name2)]
    return []
def addToDict(dictionary, key, append):
    """Add ``append`` to the list stored under ``key``, skipping duplicates.

    A new list is created when ``key`` is not yet present.  The element is
    appended only if an equal element is not already in the list, so the
    list keeps insertion order and is updated in place (the previous
    implementation rebuilt it through ``set`` on every call, which left the
    elements in arbitrary order).

    The dictionary is modified in place; nothing is returned.
    """
    values = dictionary.setdefault(key, [])
    if append not in values:
        values.append(append)
def build_graph(blastInfoFilename,blastdir, hspIntGraphdir, cutoffRatio, evalueCutoff):
    """Build the HSP/interval graph for one BLAST result file and pickle it.

    Parameters:
        blastInfoFilename: name of the BLAST result file inside ``blastdir``.
        blastdir: directory the BLAST result file is read from.
        hspIntGraphdir: directory the pickled graph is written to; it is
            passed to ``util.generateDirectories`` before anything else.
        cutoffRatio: overlap-ratio threshold forwarded to
            ``findOverlapIntervals`` when adding interval edges.
        evalueCutoff: only HSPs whose E-value is strictly below this value
            are added to the graph.

    For every non-empty line that passes the E-value filter and whose query
    and target proteins differ, the query and target node names are added
    as nodes joined by an edge carrying the ``eValue`` attribute.  Then,
    for each protein, every pair of its node names returned by
    ``findOverlapIntervals`` is joined by an (attribute-less) edge.

    Returns the file name (not the full path) of the written
    ``*_HSPIntGraph.gpickle`` file.
    """
    #Generate the output folder
    util.generateDirectories(hspIntGraphdir)
    g = nx.Graph()

    #read the file; the context manager closes it even if reading fails
    with open(os.path.join(blastdir, blastInfoFilename), "r") as f:
        lines = f.read().split("\n")

    # Split once and fix the progress step up front.  The step used to be
    # recomputed (and the whole file re-split) on every line, and it was 0
    # for inputs shorter than 10 lines, which made the modulo below raise
    # ZeroDivisionError.
    progressStep = max(1, len(lines) // 10)

    #a dictionary that stores node names by the protein names
    nodeNames = {}

    #add the HSP edges
    for i, line in enumerate(lines):
        if i % progressStep == 0:
            sys.stdout.write("*")
            sys.stdout.flush()
        if not line:
            continue
        hsp = read_HSP(line)
        goodeval = hsp["EValue"] < evalueCutoff
        notsameprotein = hsp["query_id"] != hsp["target_id"]
        if not (goodeval and notsameprotein):
            continue
        #Add the nodes (p_1,s_1,e_1) and (p_2,s_2,e_2) and create an edge between them
        queryNode = nodeName(hsp, "query")
        targetNode = nodeName(hsp, "target")
        g.add_node(queryNode)
        g.add_node(targetNode)
        g.add_edge(queryNode, targetNode, eValue=hsp["EValue"])
        #record both node names under their protein (duplicates are dropped)
        addToDict(nodeNames, queryNode[0], queryNode)
        addToDict(nodeNames, targetNode[0], targetNode)
    sys.stdout.write("\n")
    sys.stdout.flush()

    #add the Interval edges: compare every unordered pair of node names of a protein
    for protein in nodeNames:
        subNodeNames = nodeNames[protein]
        for i, name1 in enumerate(subNodeNames):
            for name2 in subNodeNames[i + 1:]:
                for overlapPair in findOverlapIntervals(name1, name2, cutoffRatio):
                    g.add_edge(overlapPair[0], overlapPair[1])

    #save the HSPIntGraph
    # NOTE(review): replace() removes every occurrence of the extension text
    # from the file name, not only the trailing one; kept unchanged so the
    # generated names stay the same as before.
    fileExt = "." + blastInfoFilename.split(".")[-1]
    outputFile = blastInfoFilename.replace(fileExt, "") + '_HSPIntGraph.gpickle'
    outputPath = os.path.join(hspIntGraphdir, outputFile)
    with open(outputPath, 'wb') as fout:
        dump(g, fout, HIGHEST_PROTOCOL)
    return outputFile
def main(blastInfoFilename):
    """Build the HSP/interval graph for ``blastInfoFilename`` from the module configuration.

    Reads ``hspIntGraphdir``, ``cutoffRatio``, ``evalueCutoff`` and
    ``blastdir`` from the ``conf`` module, forwards them to ``build_graph``
    and returns whatever ``build_graph`` returns.
    """
    return build_graph(
        blastInfoFilename,
        hspIntGraphdir=conf.hspIntGraphdir,
        cutoffRatio=conf.cutoffRatio,
        evalueCutoff=conf.evalueCutoff,
        blastdir=conf.blastdir
    )