-
Notifications
You must be signed in to change notification settings - Fork 0
/
chromosome_simulator.py
261 lines (212 loc) · 14.2 KB
/
chromosome_simulator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import argparse
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import random
import sys
import re
import os.path
import markov_gen
def parse_params(args):
    """Parse the command-line arguments of the chromosome simulator.

    Parameters:
    * args: List of argument strings (normally sys.argv[1:]).

    Return: The argparse.Namespace holding every option plus the three
    positional arguments (seq_file, repeat_file, output).
    """
    parser = argparse.ArgumentParser(description="Generate simulated chromosome")
    # parser.add_argument('-c', '--cutoff', type = int, help = "Limit model to first c non-N bases")
    parser.add_argument('-k', type=int, help="Order of Markov chain", default=5)
    parser.add_argument('-s', '--seed', '-rng_seed', dest='seed', type=int,
                        help="RNG seed", default=None)
    parser.add_argument('-n', '--negative_strand', action="store_true",
                        help="Use repeats on negative string", default=False)
    parser.add_argument('--family_file', help="List of repeat families to use", default=None)
    parser.add_argument('-m', '--mask', action="store_true",
                        help="Turn masking on (all repeats printed as lower case).", default=False)
    parser.add_argument('--mc', '--mc_file', dest='mc_file',
                        help="MC File (by default -- look in local directory; generates if not found).",
                        default=None)
    parser.add_argument('-S', '--suppress_pmck', action="store_true",
                        help="Suppress the generation of a .pmc<k> file to store the markov chain for re-use")
    parser.add_argument('--mi', '--max_interval', dest="max_interval", type=int,
                        help="Maximum allowed length of interval between repeats; -1 value (default) means no maximum",
                        default=-1)
    parser.add_argument('--mi2', '--min_interval', dest="min_interval", type=int,
                        help="Min allowed length of interval between repeats", default=0)
    parser.add_argument('--rn', '--retain_n', dest="retain_n", action='store_true',
                        help="If used, will use the whole chromosome. Otherwise, cuts of Ns at either end.",
                        default=False)
    parser.add_argument('--nr', '--num_repeats', dest='num_repeats', type=int,
                        help="Specify the number of repeats. Simulation will terminate either 1000 bases or max interval bases past the nth instance of a repeat (excluding any other repeats in that range).",
                        default=None)
    parser.add_argument('-l', '--max_length', dest='max_length', type=int,
                        help="Maximum allowed length of simulated sequence.", default=None)
    # The flag defaults to False and the caller tosses low-complexity repeats
    # unless it is given, so they are discarded (not kept) by default.
    parser.add_argument('--lc', '--low_complexity', dest='low_complexity', action='store_true',
                        help="Keep low complexity and simple repeats (discarded by default)",
                        default=False)
    parser.add_argument('--rb', '--rep_base', dest='rep_base',
                        help="Replace each TE with a ful copy of its ancestral seqeunce in the specified RepBase file",
                        default=None)
    parser.add_argument('-f', '--family_min', dest="family_min", type=int,
                        help="Number of elements per family", default=2)
    parser.add_argument('--nf', '--num_family', dest='num_family', type=int,
                        help="Number of families", default=None)
    #parser.add_argument('-o', '--output', help = "Output file (Default: replace chomosome file \".fa\" with \".sim.fa\")")
    parser.add_argument("seq_file", help="Sequence file (must be .fa)")
    parser.add_argument("repeat_file", help="RepeatMasker file (.fa.out)")
    parser.add_argument("output", help="Output file")
    return parser.parse_args(args)
def nextRepeat(rpt_file, use_negative = True, S = None, E = None, I = None):
    """Generator over the repeat annotations of a RepeatMasker .fa.out file.

    The first two lines of the file are skipped as header, and blank lines
    are ignored.  Every remaining line is split on whitespace.

    Parameters:
    * rpt_file: Path of the .fa.out file.
    * use_negative: If false, only repeats whose strand column is '+' are produced.
    * S: Collection of strings.  If not empty, a repeat is skipped when any of
    *    them occurs as a substring of its family name.
    * E: Collection of strings.  If not empty, a repeat is skipped when any of
    *    them occurs as a substring of its class.
    * I: Collection of family names.  If not empty, only repeats whose family
    *    is a member are produced.

    Yields: (chr, start, finish, strand, family, rpt_class, rpt_id), where start
    has been decremented by one relative to the file, finish is taken as is,
    and rpt_id is an int.

    NOTE(review): this function used to combine the substring exclusion on S
    with a trailing "family in S" inclusion test, so a non-empty S produced
    nothing at all.  S is treated as an exclusion filter here, matching the
    "families to be ignored" wording of the callers in this file; the
    --family_file help text still describes an inclusion list -- confirm.
    """
    # The file is closed when the generator is exhausted or discarded.
    with open(rpt_file) as fp:
        fp.readline()
        fp.readline()
        for line in fp:
            fields = line.split()
            if not fields:
                continue
            chrom, start, finish = fields[4], int(fields[5]) - 1, int(fields[6])
            strand, family, rpt_class, rpt_id = fields[8], fields[9], fields[10], fields[14]
            # Anything that is not explicitly '+' counts as negative strand.
            if strand != '+' and not use_negative:
                continue
            if S and any(s in family for s in S):
                continue
            if E and any(e in rpt_class for e in E):
                continue
            if I and family not in I:
                continue
            yield chrom, start, finish, strand, family, rpt_class, int(rpt_id)
# fa_out_header: the two fixed header lines written at the top of the .fa.out file.
fa_out_header = (
    "\tSW\tperc\tperc\tperc\tquery\tposition in query\tmatching\trepeat\tposition in repeat\n"
    "\tscore\tdiv.\tdel.\tins.\tsequence\tbegin\tend\t(left)\trepeat\tclass/family\tbegin\tend (left)\tID\n"
)
# fa_out_template: str.format template for one annotation line of the .fa.out file.
# The score/divergence and position-in-repeat columns are filled with constant zeros.
fa_out_template = (
    "\t0\t0\t0\t0"
    "\t{chr}\t{start}\t{finish}\t({left})"
    "\t{strand}\t{family}\t{rpt_class}"
    "\t0\t0\t(0)\t{rpt_id}\n"
)
def generate_chromosome(seq, markov_list, chr_start, chr_finish, rpt_gen, mask = False, max_interval = -1, min_interval = 0, num_repeats = None, max_length = None, limiting_chr = None, rep_base_hash = None):
    """
    Generate a synthetic sequence with real repeats:
    * seq: The template sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module).
    * chr_start/chr_finish: The coordinates of the part of the template we are using.
    *                       Repeats that are not entirely inside this range are ignored,
    *                       which allows us to cut off a prefix and/or suffix.
    * rpt_gen: A generator returning the repeat information (created by nextRepeat).
    *          Repeats are assumed to be sorted by start coordinate.
    * mask: If true, all repeats will be lower-case. Otherwise, upper case.
    * max_interval: Maximum allowed length of a sequence between repeats. If two repeats
    *               are further apart than this, cut the length. -1 means no maximum.
    * min_interval: Minimum allowed length of a sequence between repeats. If two repeats
    *               are closer than this, extend the length.
    * num_repeats: If not None, stop after this many repeats have been placed.
    * max_length: Accepted for interface compatibility; it is not enforced here.
    * limiting_chr: If not empty, repeats whose chromosome name is not in it are skipped.
    * rep_base_hash: If not empty, maps family name -> sequence; each repeat is then
    *                replaced by the sequence stored for its family rather than by
    *                the corresponding slice of seq.
    Return: A tuple:
    1. The simulated sequence.
    2. The contents of the matching .fa.out file (header included), with 1-based
       inclusive coordinates into the simulated sequence.
    """
    # Tail length after the last repeat of a num_repeats-limited run when no
    # max_interval was given (the behaviour promised by the --nr help text).
    default_tail = 1000

    interval_limited = max_interval != -1
    if not interval_limited:
        max_interval = len(seq)

    last_end = chr_start
    pieces = []                # Chunks of the simulated sequence, joined once at the end
    sim_len = 0                # Total length of the chunks collected so far
    fa_out = []                # One tuple per repeat placed in the simulated sequence
    rpt_count = 0              # Count of repeats (so we can quit when we reach num_repeats, if applicable)
    hit_repeat_limit = False
    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:   # Skip if we are on the wrong chromosome
            continue
        if start >= chr_finish:                          # Quit if we have gone past the allowed range
            break
        if start < chr_start or finish > chr_finish:     # Skip if we are outside the allowed range
            continue
        if start < last_end:                             # Skip if this repeat overlapped the last one
            continue
        rpt_count += 1

        # Add the next inter-TE sequence
        inter_seq_len = max(min_interval, min(start - last_end, max_interval))
        filler = markov_gen.generate_sequence(markov_list, inter_seq_len)
        pieces.append(filler)
        sim_len += len(filler)

        # Add the repeat itself
        rpt_seq = rep_base_hash[family] if rep_base_hash else seq[start:finish]
        # Coords adjusted for biologist notation (1-based, inclusive)
        fa_out.append((chrom, sim_len + 1, sim_len + len(rpt_seq), strand, family, rpt_class, rpt_id))
        pieces.append(rpt_seq.lower() if mask else rpt_seq.upper())
        sim_len += len(rpt_seq)

        # Record where this repeat ended before a possible early exit, so the
        # final stretch is measured from the last repeat actually placed.
        last_end = max(last_end, finish)
        if rpt_count == num_repeats:
            hit_repeat_limit = True
            break

    # Add final sequence on.  This used to re-use the length of the last
    # inter-repeat stretch, which is undefined when no repeat was placed.
    remaining = chr_finish - last_end
    if hit_repeat_limit and not interval_limited:
        remaining = min(remaining, default_tail)
    final_seq_len = max(min_interval, min(remaining, max_interval))
    pieces.append(markov_gen.generate_sequence(markov_list, final_seq_len))

    sim_seq = "".join(pieces)
    sim_seq_len = len(sim_seq)

    fa_out_lines = [fa_out_header]
    for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out:
        fa_out_lines.append(fa_out_template.format(chr=chrom, start=start, finish=finish,
                                                   left=sim_seq_len - finish, strand=strand,
                                                   family=family, rpt_class=rpt_class, rpt_id=rpt_id))
    return sim_seq, "".join(fa_out_lines)
# The characters treated as real (unambiguous) bases when trimming the template.
bases = set("ACGTacgt")


def loadSeqAndChain(seq_file, k, suppress_save = False, mc_file = None, retain_n = False):
    """Load the sequence and the Markov Chain List.
    Load the MC list from a file if it exists.  If not, create the chain
    and save it to the file for the next use (skip the save if suppressed).
    Parameters:
    * seq_file: The sequence file (fasta, holding a single record).
    * k: The order of the markov chain.
    * suppress_save: Boolean.  If true, don't save the generated MC file.
    * mc_file: The name of the mc_file to use.  If not provided it is derived from
    *          seq_file by replacing a trailing .fa/.fasta with .pmc<k>, or by
    *          appending .pmc<k> when seq_file has neither extension.
    * retain_n: If false, we will be cutting off the largest possible prefix and
    *           suffix of characters that are not in `bases`.
    Return: A tuple:
    1. The chromosome sequence (complete, never trimmed).
    2. The markov chain
    3. Where we will start in the template sequence (in case a prefix has been removed).
    4. Where we will end in the template sequence (in case a suffix has been removed).
       For a sequence with no real base at all, start == finish.
    """
    template_seq = str(SeqIO.read(seq_file, 'fasta').seq)

    # Find the maximal prefix and suffix of ambiguity codes.  The bounds checks
    # keep an empty or all-ambiguous sequence from running off either end.
    start, finish = 0, len(template_seq)
    if not retain_n:
        while start < finish and template_seq[start] not in bases:
            start += 1
        while finish > start and template_seq[finish - 1] not in bases:
            finish -= 1

    if mc_file is None:
        mc_file, replaced = re.subn(r"\.(fa|fasta)$", ".pmc%d" % (k), seq_file)
        if not replaced:
            # Without this the derived name would be the sequence file itself,
            # which exists and would then be read as a markov chain file.
            mc_file = "%s.pmc%d" % (seq_file, k)

    if os.path.exists(mc_file):
        # NOTE(review): judging by pickle_markov_list below this is presumably a
        # pickle -- only point mc_file at trusted files; confirm in markov_gen.
        markov_list = markov_gen.read_pmck(mc_file)
    else:
        markov_list = markov_gen.MarkovArray(k, template_seq)
        if not suppress_save:
            markov_gen.pickle_markov_list(markov_list, mc_file)

    return template_seq, markov_list, start, finish
def readRepBase(file):
    """Read a RepBase fasta file into a dictionary.

    Return: A dict mapping each record id to its sequence with every character
    other than A, C, G or T (in either case) removed.  Kept characters retain
    their original case.
    """
    real_bases = {'A', 'C', 'G', 'T'}
    table = {}
    for record in SeqIO.parse(file, 'fasta'):
        kept = [ch for ch in str(record.seq) if ch.upper() in real_bases]
        table[record.id] = "".join(kept)
    return table
# Strings matched (as substrings) against the repeat class to recognise
# low-complexity / simple repeats.
low_complexity = {'Low_complexity', 'Simple', 'Satellite'}


def select_families(repeat_file, f, num_fams, use_3prime, filter_set, toss_low, rep_base_hash):
    """Used to select those families that have at least f members on the template chromosome.
    Parameters:
    * repeat_file: the .fa.out file
    * f: minimum number of allowed instances in a family.
    * num_fams: Number of families to be chosen (None: all that qualify)
    * use_3prime: passed to nextRepeat as use_negative; if false, only '+' strand instances count
    * filter_set: families that should be ignored (passed to nextRepeat as S)
    * toss_low: if true, ignore low-complexity families
    * rep_base_hash: a hash table mapping family names to their rep_base sequences;
    *                if not empty, families missing from it are not counted
    Returns:
    * List of families chosen, in order of first appearance in repeat_file.
    Exits the program with status 1 if fewer than num_fams families qualify.
    """
    counts = {}    # Family name -> count
    excluded_classes = low_complexity if toss_low else {}
    for T in nextRepeat(repeat_file, use_3prime, filter_set, E = excluded_classes):
        family = T[4]
        if rep_base_hash and family not in rep_base_hash:
            continue
        counts[family] = counts.get(family, 0) + 1

    eligible = [family for family, count in counts.items() if count >= f]
    if num_fams is None:
        return eligible

    if num_fams > len(eligible):
        sys.stderr.write("Not enough families for f\n")
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist.
        sys.exit(1)

    return eligible[:num_fams]
def create_chromosome_file(seq_file, repeat_file, output_file, k = 5, use_3prime = True, filter_file = "rpt_list.txt", mask = False, seed = None, suppress = False, max_interval = -1, min_interval = 0, retain_n = False, num_repeats = None, max_length = None, toss_low = False, rep_base = None, f = 1, num_fams = None, mc_file = None):
    """
    Create a simulated chromosome with real repeat sequences from a chromosome file.
    Writes the sequence to output_file and the matching annotations to output_file + ".out".
    Parameters:
    * seq_file: fasta file containing the template sequence.
    * repeat_file: file containing the repeatmasker annotations (.fa.out) for seq_file.
    * output_file: Fasta file to print sequence to (".fa" is appended if missing).
    * k: Use a k-order markov chain.  It is loaded from the markov chain file if that
    *    exists, and built from the template otherwise.
    * use_3prime: If false, only repeats on the '+' strand will be used. Default: True
    * filter_file: A whitespace-separated list of family names, handed to nextRepeat
    *    as its S filter.  If empty/None: no filtering. Default: "rpt_list.txt"
    * mask: If true: copied repeats will be in lower case. Default: False
    * seed: RNG seed
    * suppress: If true, a newly built markov chain is not saved to disk.
    * max_interval / min_interval: Bounds on the length of the sequence between repeats.
    * retain_n: If true, use the whole template; otherwise ignore the leading/trailing Ns.
    * num_repeats: If not None, stop after this many repeats.
    * max_length: Passed through to generate_chromosome.
    * toss_low: If true, low-complexity repeat classes are not used.
    * rep_base: RepBase fasta file; if given, each repeat is replaced by its family sequence.
    * f: Minimum number of instances a family must have to be used.
    * num_fams: Number of families to use (None: all that qualify).
    * mc_file: Markov chain file to use (None: derive the name from seq_file).
    """
    if not output_file.endswith(".fa"):
        output_file += ".fa"

    # Everything below uses the function's own parameters (not the script's
    # global argument namespace), so the function also works when imported.
    random.seed(seed)

    # First: load in the template sequence, markov chain, and the start/end coords of what we are using.
    template_seq, markov_list, chr_start, chr_finish = loadSeqAndChain(seq_file, k, suppress, mc_file, retain_n)

    # Read in the family filter.  str.split() never produces empty names, which
    # (being a substring of everything) would otherwise match every family.
    if filter_file:
        with open(filter_file) as fp:
            filter_set = {name for line in fp for name in line.split()}
    else:
        filter_set = set()

    # Read in the RepBase sequence: maps name -> RepBaseSequence
    rep_base_hash = readRepBase(rep_base) if rep_base else None

    # Pick which families we are using.
    selected = select_families(repeat_file, f, num_fams, use_3prime, filter_set, toss_low, rep_base_hash)

    # Create a sequence generator
    rpt_gen = nextRepeat(repeat_file, use_3prime, filter_set, E = low_complexity if toss_low else {}, I = selected)

    # Create the simulated sequence
    simulated_sequence, fa_out = generate_chromosome(seq = template_seq, markov_list = markov_list,
                                                     chr_start = chr_start, chr_finish = chr_finish,
                                                     rpt_gen = rpt_gen, mask = mask,
                                                     max_interval = max_interval, min_interval = min_interval,
                                                     num_repeats = num_repeats, max_length = max_length,
                                                     rep_base_hash = rep_base_hash)

    # Write output to file
    # NOTE(review): the record id is the literal text "seq_file", not the value
    # of the seq_file parameter -- kept as is; confirm whether that is intended.
    description = "Simulated sequence from %s using order %d markov chain" % (seq_file, len(markov_list)-1)
    SeqIO.write([SeqRecord(seq = Seq(simulated_sequence), id = "seq_file", description = description)], output_file, 'fasta')
    with open(output_file + ".out", "w") as fp:
        fp.write(fa_out)


if __name__ == "__main__":
    args = parse_params(sys.argv[1:])
    create_chromosome_file(seq_file = args.seq_file, k = args.k, output_file = args.output,
                           repeat_file = args.repeat_file, use_3prime = args.negative_strand,
                           filter_file = args.family_file, mask = args.mask, seed = args.seed,
                           suppress = args.suppress_pmck,
                           max_interval = args.max_interval, min_interval = args.min_interval,
                           retain_n = args.retain_n, num_repeats = args.num_repeats,
                           max_length = args.max_length, toss_low = not args.low_complexity,
                           rep_base = args.rep_base, f = args.family_min, num_fams = args.num_family,
                           mc_file = args.mc_file)