/
consensus_search.py
163 lines (119 loc) · 5.32 KB
/
consensus_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
"""
A very simple consensus sequence search with required letters.
------------------------------------------------------------------------------
Information
===========
This script searches a fasta genome for instances of a consensus sequence while
also requiring a particular 3' sequence.
Under the hood, we use pygr (for loading the genome) and motility (for
performing the search).
Download
========
If you're on our `github page <https://github.com/uci-cbcl/consensus_search/>`_,
you can click on the "Download Zip" button. Otherwise, try this link::
https://github.com/uci-cbcl/consensus_search/archive/master.zip
Usage
=====
To perform the search used in the *genesis* paper, first download the xenopus genome::
wget ftp://ftp.xenbase.org/pub/Genomics/JGI/Xentr7.1/xenopus_tropicalis_v7.1.tar.gz
tar xfz xenopus_tropicalis_v7.1.tar.gz
mv 20100930/sequences/Xenopus_tropicalis.main_genome.scaffolds.fasta .
rm -r xenopus_tropicalis_v7.1.tar.gz 20100930
Then install `motility <https://github.com/ctb/motility>`_ and
pygr (`easy_install pygr`).
Finally, you can reproduce the results from the paper by running::
python consensus_search.py --genome Xenopus_tropicalis.main_genome.scaffolds.fasta \
--consensus GGAACTGGCCCCTGCAAACA --required_3p_seq NGG --mismatches 5 \
--outfile results.bed
This command searches for sites that match the tyrosinase target above with up to 5
mismatches, while requiring a degenerate NGG PAM sequence at the 3' end. To search for
mismatch sites to your own target site of interest, pass your sequence to
``--consensus`` in place of the one above.
Note that any of the IUPAC letters can be used in the
consensus and required sequences. Specifically,
========== ==============
IUPAC code Allowed letter
========== ==============
A Adenine
C Cytosine
G Guanine
T Thymine
U Uracil (converted to T for DNA search)
R Purine (A or G)
Y Pyrimidine (C or T)
M C or A
K T or G
W T or A
S C or G
B C, T, or G (not A)
D A, T, or G (not C)
H A, T, or C (not G)
V A, C, or G (not T)
N Any base (A, C, G, or T)
========== ==============
For additional usage instructions, run::
python consensus_search.py --help
This script is also available at https://github.com/uci-cbcl/consensus-search
and any updates can be found there.
Or shoot me an email at jake.biesinger@gmail.com
"""
__author__ = "Jake Biesinger, jake.biesinger@gmail.com"
__license__ = "Apache"
import argparse
import sys
# motility (PWM search) and pygr (genome loading) are third-party packages.
# If either is missing, stop with an install hint. Passing the message to
# sys.exit() writes it to stderr and exits with status 1, so a failed start
# is not mistaken for a successful run.
try:
    import motility
except ImportError:
    sys.exit("Couldn't import motility! please download and install it "
             "from https://github.com/ctb/motility")
try:
    from pygr.seqdb import SequenceFileDB
except ImportError:
    sys.exit("Couldn't import pygr! Please install it by typing "
             "`sudo easy_install pygr`")
# Column order used by every score row below.
DNA_LETTERS = 'ACGT'

# IUPAC nucleotide code -> the concrete DNA bases that code stands for.
IUPAC_LETTERS = {
    'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'U': 'T',
    'R': 'AG', 'Y': 'CT', 'S': 'GC', 'W': 'AT', 'K': 'GT', 'M': 'AC',
    'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG',
    'N': 'ACGT',
}

# One score row per IUPAC code, in DNA_LETTERS order: 0 for a base the code
# allows, -1 for a base it does not.
IUPAC_SCORES = dict(
    (code, [-1 if base not in allowed else 0 for base in DNA_LETTERS])
    for code, allowed in IUPAC_LETTERS.items())

# Required positions reuse the same rows, but a disallowed base costs
# -1000000 instead of -1.
REQUIRED_SCORES = dict(
    (code, [1000000 * score for score in row])
    for code, row in IUPAC_SCORES.items())
def make_parser():
    """Build the command-line parser for the consensus search.

    Returns an ``argparse.ArgumentParser`` with five mandatory options:
    ``--genome``, ``--mismatches`` (parsed as an int), ``--consensus``,
    ``--required_3p_seq`` and ``--outfile``.
    """
    # (flag, extra add_argument keywords); every option is required.
    option_specs = (
        ("--genome", dict(help="fasta file with genome to search")),
        ("--mismatches", dict(type=int,
                              help="The number of mismatches to allow")),
        ("--consensus", dict(help="The consensus sequence to search for")),
        ("--required_3p_seq",
         dict(help="the sequence *required* on the 3' end")),
        ("--outfile", dict(help="where to save the results")),
    )
    cli = argparse.ArgumentParser()
    for flag, extra in option_specs:
        cli.add_argument(flag, required=True, **extra)
    return cli
def main(argv=None):
    """Search every sequence of a fasta genome for the consensus motif.

    A position weight matrix is built from ``--consensus`` (rows taken from
    IUPAC_SCORES) followed by ``--required_3p_seq`` (rows taken from
    REQUIRED_SCORES). Each sequence in the genome is scanned with motility,
    and one tab-separated line per hit is written to ``--outfile``:
    sequence name, start, stop, matched sequence, score and strand.

    Args:
        argv: list of command-line arguments to parse. ``None`` lets
            argparse read ``sys.argv[1:]``.

    The process exits through ``parser.error`` (status 2) when
    ``--mismatches`` is negative, when either sequence contains a letter
    that is not an IUPAC code, or when both sequences are empty.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    # Validate the cheap inputs before opening the genome or output file.
    if args.mismatches < 0:
        parser.error("--mismatches must be zero or greater")

    # The score tables are keyed by upper-case IUPAC codes only, so accept
    # lower-case input by normalising it here.
    consensus = args.consensus.upper()
    required_3p = args.required_3p_seq.upper()
    for option, letters in (("--consensus", consensus),
                            ("--required_3p_seq", required_3p)):
        unknown = sorted(set(letters) - set(IUPAC_SCORES))
        if unknown:
            parser.error("%s contains non-IUPAC letters: %s"
                         % (option, ", ".join(unknown)))
    if not consensus and not required_3p:
        parser.error("--consensus and --required_3p_seq cannot both be empty")

    genome = SequenceFileDB(args.genome)

    rows = [IUPAC_SCORES[letter] for letter in consensus]
    rows.extend(REQUIRED_SCORES[letter] for letter in required_3p)
    pwm = motility.PWM(rows)
    motif_length = len(pwm)
    # A consensus mismatch scores -1, so -mismatches is the score cutoff.
    # NOTE(review): presumably find() keeps hits scoring at or above the
    # cutoff -- confirm against the motility documentation.
    threshold = -args.mismatches

    # find all matches
    with open(args.outfile, 'w') as outfile:
        for chrom in genome.keys():
            chromseq = str(genome[chrom])
            # Single-argument print calls behave the same on Python 2 and 3.
            print("searching  %s of length %s" % (chrom, len(chromseq)))
            if len(chromseq) < motif_length:
                print("chromosome/fragment %s is too short" % (chrom,))
                continue
            for start, stop, strand, seq in pwm.find(chromseq, threshold):
                score = pwm.calc_score(seq)
                outfile.write('\t'.join(
                    [chrom, str(start), str(stop), seq, str(score),
                     '+' if strand == 1 else '-']) + '\n')
# Run the search only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()