-
Notifications
You must be signed in to change notification settings - Fork 41
/
repToHomologSubjectPsiBlast.py
executable file
·99 lines (97 loc) · 3.53 KB
/
repToHomologSubjectPsiBlast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#from __future__ import division, with_statement
'''
Copyright 2010, 陈同 (chentong_biology@163.com).
Please see the license file for legal information.
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = 'chentong_biology@163.com'
#=========================================================
import os
import subprocess
import sys

from ctIO import readRep
def _label_for(rep_path):
    """Return the output-name infix selected by the repetition file name.

    Returns '.LCSs' if *rep_path* contains 'LCSs', else '.HCSs' if it
    contains 'HCSs', else ''.
    """
    # 'LCSs' is tested first so a name holding both markers keeps the
    # label it always had.
    for marker in ('LCSs', 'HCSs'):
        if marker in rep_path:
            return '.' + marker
    # Patched 2011-09-22: a file of neither kind used to leave the label
    # undefined and crash; it now gets an empty label.
    return ''


def _positions_by_sequence(group):
    """Collapse one repetition group to ``{sequence: [position, ...]}``.

    *group* maps position keys to sequences.  Only the first element of
    each key is recorded (as a string), and keys are visited in sorted
    order so the positions of each sequence come out in that order.
    NOTE(review): keys are presumably (start, ...) tuples produced by
    ctIO.readRep -- confirm against that module.
    """
    by_seq = {}
    for pos in sorted(group):
        by_seq.setdefault(group[pos], []).append(str(pos[0]))
    return by_seq


def _write_queries(locus, groups, short_path, long_path, midlen=30):
    """Write the FASTA query files for one locus.

    Each distinct sequence of each group becomes one record headed
    ``>locus.groupNumber.pos1:pos2:...`` (groups are numbered from 1).
    Sequences of at most *midlen* residues go to *short_path*, longer
    ones to *long_path*.  Both files are always created, even if empty.
    """
    # Nested 'with' blocks (rather than one statement with two managers)
    # keep this valid on Python 2.6 while still closing both handles if a
    # write fails.
    with open(short_path, 'w') as fh_short:
        with open(long_path, 'w') as fh_long:
            for number, group in enumerate(groups, 1):
                by_seq = _positions_by_sequence(group)
                for seq in sorted(by_seq):
                    target = fh_short if len(seq) <= midlen else fh_long
                    target.write('>%s.%s.%s\n%s\n' % (
                        locus, number, ':'.join(by_seq[seq]), seq))


def _psiblast_args(query, db, out, short_query, tabular):
    """Build the psiblast argument list for one run.

    *short_query* adds the permissive settings used for short queries
    (-evalue 20000, -matrix PAM30, -comp_based_stats 0, -word_size 2);
    *tabular* adds ``-outfmt 7``.
    """
    args = ['psiblast', '-query', query, '-db', db, '-out', out,
            '-num_iterations', '5']
    if short_query:
        args += ['-evalue', '20000', '-matrix', 'PAM30',
                 '-comp_based_stats', '0', '-word_size', '2']
    if tabular:
        args += ['-outfmt', '7']
    return args


def _run(args):
    """Run *args* without a shell and return its exit status.

    The status is returned, never raised, so one failing run does not
    stop the remaining loci (the previous os.system calls were equally
    best-effort).  A missing executable is reported on stderr and
    yields status 127.
    """
    try:
        return subprocess.call(args)
    except OSError as err:
        sys.stderr.write('Cannot run %s: %s\n' % (args[0], err))
        return 127


def main():
    """Run psiblast for every locus of a repetition file.

    Command line: ``repfile dbpath/``.  The repetition file is loaded
    with ctIO.readRep into a dict keyed by locus.  For each locus whose
    database path (``dbpath`` + locus, plain string concatenation, hence
    the trailing slash in the usage text) exists, two query files
    ``<locus><label>.short`` and ``<locus><label>.long`` are written to
    the current directory and psiblast is run on each twice: once with
    default output (``.out``) and once with ``-outfmt 7`` (``.table``).
    Loci without a database path are skipped, and their count is
    printed to stdout at the end.

    Exits with status 1 after printing the usage line when the number
    of arguments is wrong.
    """
    sys.stderr.write(
        "To detect the conservation among orthologs, "
        "use repetitions as the query and its related orthologs as "
        "db(after makeblastdb). \n")
    if len(sys.argv) != 3:
        sys.stderr.write('Using python %s repfile dbpath/\n' % sys.argv[0])
        # A usage error must not look like success to calling scripts.
        sys.exit(1)
    #-------------------------------------
    rep_path, db_prefix = sys.argv[1], sys.argv[2]
    label = _label_for(rep_path)
    rep_dict = {}
    readRep(rep_path, rep_dict)
    no_ortho = 0
    for locus, groups in rep_dict.items():
        db = db_prefix + locus
        if not os.path.exists(db):
            no_ortho += 1
            continue
        #-------------------------------
        short_query = locus + label + '.short'
        long_query = locus + label + '.long'
        _write_queries(locus, groups, short_query, long_query)
        # Same run order as before: both default reports first, then both
        # tabular reports.
        for suffix, tabular in (('.out', False), ('.table', True)):
            _run(_psiblast_args(short_query, db, short_query + suffix,
                                True, tabular))
            _run(_psiblast_args(long_query, db, long_query + suffix,
                                False, tabular))
        #------------END one locus
    sys.stdout.write('%d\n' % no_ortho)
    #-------------END----all-----------------
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()