def generate_chromosome(seq, markov_list, coord_adjust, rpt_gen, mask=False, max_interval=None,
                        num_repeats=None, max_length=None, limiting_chr=None, sim_type=0):
    """
    Generate a synthetic sequence with real repeats:
    * seq: A sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module).
    * coord_adjust: Size of the prefix that has been cut off the template sequence (requiring
    *               that the .fa.out coordinates be adjusted).
    * rpt_gen: A generating function returning the repeat information (created by nextRepeat).
    *          Each item is unpacked as (chr, start, finish, strand, family, rpt_class, rpt_id,
    *          ancestor_seq, modern_seq).
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * max_interval: Maximum inter-repeat length.  None or -1 means "no limit" (len(seq) is used).
    * num_repeats: If set, stop after this many repeats have been placed; the trailing segment
    *              is then additionally capped at 1000 bases.
    * max_length: If set, caps the template length used when sizing the trailing segment.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.
    * sim_type: Selects where the repeat sequence comes from:
    *           0 - the template slice seq[start:finish];
    *           1 - ancestor_seq with '-' characters removed;
    *           2 - for each aligned column whose ancestor is not '-', the modern base
    *               (or the ancestor base where the modern one is '-');
    *           anything else (3) - for each aligned column whose modern base is not '-',
    *               the ancestor base (or the modern base where the ancestor is '-').

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    # The default (None) previously flowed into min()/subtraction and failed; treat it like -1.
    if max_interval is None or max_interval == -1:
        max_interval = len(seq)

    current_coord = coord_adjust
    chunks = []     # Hold the sequence (in chunks)
    fa_out = []     # Hold the new .fa.out file contents (one record per repeat)
    rpt_count = 0
    length = min(len(seq), max_length) if max_length else len(seq)
    valid_bases = {'A', 'C', 'T', 'G'}

    for chrom, start, finish, strand, family, rpt_class, rpt_id, ancestor_seq, modern_seq in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:
            continue
        if start < current_coord:
            # Overlaps the region already emitted; skip it.
            continue

        rpt_count += 1

        # Random sequence filling the gap before this repeat (truncated to max_interval).
        inter_seq_len = min(start - current_coord, max_interval)
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        assert len(inter_seq) == inter_seq_len
        chunks.append(inter_seq)
        # Any part of the gap that was cut shifts all later output coordinates left.
        coord_adjust += max(0, start - current_coord - max_interval)

        # Exactly one source is chosen per repeat.  (The original used a separate `if` for
        # sim_type == 0, so that case fell through to the final `else` and was overwritten.)
        if sim_type == 0:
            rpt_seq = seq[start:finish]
        elif sim_type == 1:
            rpt_seq = ancestor_seq.replace("-", "")
        elif sim_type == 2:
            rpt_seq = "".join([m if m != '-' else a
                               for a, m in zip(ancestor_seq, modern_seq) if a != '-'])
        else:  # sim_type == 3
            rpt_seq = "".join([a if a != '-' else m
                               for a, m in zip(ancestor_seq, modern_seq) if m != '-'])

        # Anything that is not A/C/G/T (in either case) becomes 'N'.
        rpt_seq = "".join([b if b.upper() in valid_bases else 'N' for b in rpt_seq])
        chunks.append(rpt_seq.lower() if mask else rpt_seq.upper())

        # The emitted repeat may differ in length from the template interval; record the
        # coordinates of what was emitted and carry the difference into later coordinates.
        new_finish = start + len(rpt_seq)
        fa_out.append([chrom, start + 1 - coord_adjust, new_finish - coord_adjust,
                       strand, family, rpt_class, rpt_id])
        coord_adjust += finish - new_finish

        if num_repeats and rpt_count == num_repeats:
            # NOTE(review): current_coord is not advanced past this last repeat, so the tail
            # below is measured from the end of the previous one -- confirm this is intended.
            break
        current_coord = finish

    if num_repeats:
        max_interval = min(1000, max_interval)
    tail_length = min(max_interval, length - current_coord)
    if tail_length > 0:
        chunks.append(markov_gen.generate_sequence(markov_list, tail_length))

    sim_seq = "".join(chunks)
    sim_seq_len = len(sim_seq)
    fa_out_lines = [
        fa_out_template.format(chr=chrom, start=start, finish=finish, left=sim_seq_len - finish,
                               strand=strand, family=family, rpt_class=rpt_class, rpt_id=rpt_id)
        for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out
    ]
    fa_out_str = fa_out_header + "".join(fa_out_lines)
    return sim_seq, fa_out_str
def generate_chromosome(seq, markov_list, chr_start, chr_finish, rpt_gen, mask = False, max_interval = -1,
                        min_interval = 0, num_repeats = None, max_length = None, limiting_chr = None,
                        rep_base_hash = None):
    """
    Generate a synthetic sequence with real repeats:
    * seq: A sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module).
    * chr_start/chr_finish: Define the coordinates of our actual template sequence.  (We are
    *                       ignoring anything that occurs before/after.)  Allows us to cut off
    *                       a prefix and/or suffix.
    * rpt_gen: A generating function returning the repeat information (created by nextRepeat).
    *          Each item is unpacked as (chr, start, finish, strand, family, rpt_class, rpt_id);
    *          repeats are assumed to be sorted by start.
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * min_interval: Minimum allowed length of a sequence between repeats.  If two repeats are
    *               closer than this, extend the length.
    * max_interval: Maximum allowed length of a sequence between repeats.  If two repeats are
    *               further apart than this, cut the length.  -1 means len(seq).
    * num_repeats: If set, stop after this many repeats have been placed.
    * max_length: Accepted but not used by this implementation.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.
    * rep_base_hash: If set, maps a repeat family to the sequence emitted for it (used instead
    *                of the template slice seq[start:finish]).

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    last_end = chr_start
    if max_interval == -1:
        max_interval = len(seq)

    chunks = []      # Simulated sequence, in pieces (joined once at the end)
    sim_len = 0      # Total length of everything in chunks so far
    fa_out = []      # Hold the new .fa.out file contents (one record per repeat)
    rpt_count = 0    # Count of repeats (so we can quit when we reach num_repeats, if applicable)

    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:    # Skip if we are on the wrong chromosome
            continue
        if start >= chr_finish:     # Quit if we have gone past the allowed range
            break
        if start < chr_start or finish > chr_finish:      # Skip if we are outside the allowed range
            continue
        if start < last_end:        # Skip if this repeat overlapped the last one
            continue

        rpt_count += 1

        # Add the next inter-TE sequence, clamped to [min_interval, max_interval]
        # (min_interval wins if the two conflict).
        inter_seq_len = max(min_interval, min(start - last_end, max_interval))
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        chunks.append(inter_seq)
        sim_len += len(inter_seq)

        # Add the repeat itself
        rpt_seq = rep_base_hash[family] if rep_base_hash else seq[start:finish]
        # Coords are 1-based and inclusive (biologist notation), relative to the simulated sequence
        fa_out.append([chrom, sim_len + 1, sim_len + len(rpt_seq), strand, family, rpt_class, rpt_id])
        chunks.append(rpt_seq.lower() if mask else rpt_seq.upper())
        sim_len += len(rpt_seq)

        if rpt_count == num_repeats:
            # NOTE(review): last_end is not advanced past this last repeat, so the final
            # segment below is measured from the end of the previous one -- confirm intended.
            break
        last_end = max(last_end, finish)

    # Add final sequence on.  The original passed inter_seq_len here, which reused the last
    # gap length (and raised NameError when no repeat had been accepted).
    final_seq_len = max(min_interval, min(chr_finish - last_end, max_interval))
    chunks.append(markov_gen.generate_sequence(markov_list, final_seq_len))

    sim_seq = "".join(chunks)
    sim_seq_len = len(sim_seq)
    fa_out_lines = [
        fa_out_template.format(chr=chrom, start=start, finish=finish, left=sim_seq_len - finish,
                               strand=strand, family=family, rpt_class=rpt_class, rpt_id=rpt_id)
        for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out
    ]
    fa_out_str = fa_out_header + "".join(fa_out_lines)
    return sim_seq, fa_out_str
def generate_chromosome(seq, markov_list, coord_adjust, rpt_gen, mask = False, max_interval = None,
                        num_repeats = None, max_length = None, limiting_chr = None):
    """
    Generate a synthetic sequence with real repeats:
    * seq: A sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module).
    * coord_adjust: Size of the prefix that has been cut off the template sequence (requiring
    *               that the .fa.out coordinates be adjusted).
    * rpt_gen: A generating function returning the repeat information (created by nextRepeat).
    *          Each item is unpacked as (chr, start, finish, strand, family, rpt_class, rpt_id).
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * max_interval: Maximum inter-repeat length.  None or -1 means "no limit" (len(seq) is used).
    * num_repeats: If set, stop after this many repeats have been placed; the trailing segment
    *              is then additionally capped at 1000 bases.
    * max_length: If set, caps the template length used when sizing the trailing segment.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    # The default (None) previously flowed into min()/subtraction and failed; treat it like -1.
    if max_interval is None or max_interval == -1:
        max_interval = len(seq)

    current_coord = coord_adjust
    chunks = []     # Hold the sequence (in chunks)
    fa_out = []     # Hold the new .fa.out file contents (one record per repeat)
    rpt_count = 0
    length = min(len(seq), max_length) if max_length else len(seq)

    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:
            continue
        if start < current_coord:
            # Overlaps the region already emitted; skip it.
            continue

        rpt_count += 1

        # Random sequence filling the gap before this repeat (truncated to max_interval).
        inter_seq_len = min(start - current_coord, max_interval)
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        assert len(inter_seq) == inter_seq_len
        chunks.append(inter_seq)
        # Any part of the gap that was cut shifts all later output coordinates left.
        coord_adjust += max(0, start - current_coord - max_interval)

        # The repeat itself, copied from the template.
        rpt_seq = seq[start:finish]
        chunks.append(rpt_seq.lower() if mask else rpt_seq.upper())
        fa_out.append([chrom, start + 1 - coord_adjust, finish - coord_adjust,
                       strand, family, rpt_class, rpt_id])

        if num_repeats and rpt_count == num_repeats:
            # NOTE(review): current_coord is not advanced past this last repeat, so the tail
            # below is measured from the end of the previous one -- confirm this is intended.
            break
        current_coord = finish

    if num_repeats:
        max_interval = min(1000, max_interval)
    tail_length = min(max_interval, length - current_coord)
    if tail_length > 0:
        chunks.append(markov_gen.generate_sequence(markov_list, tail_length))

    sim_seq = "".join(chunks)
    sim_seq_len = len(sim_seq)
    fa_out_lines = [
        fa_out_template.format(chr=chrom, start=start, finish=finish, left = sim_seq_len - finish,
                               strand=strand, family=family, rpt_class=rpt_class, rpt_id=rpt_id)
        for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out
    ]
    fa_out_str = fa_out_header + "".join(fa_out_lines)
    return sim_seq, fa_out_str
def generate_chromosome(seq, markov_list, chr_start, chr_finish, rpt_gen, mask=False, max_interval=-1,
                        min_interval=0, num_repeats=None, max_length=None, limiting_chr=None,
                        rep_base_hash=None):
    """
    Generate a synthetic sequence with real repeats:
    * seq: A sequence (as a string).
    * markov_list: List of the k+1 i-th order markov chains (from the markov_gen module).
    * chr_start/chr_finish: Define the coordinates of our actual template sequence.  (We are
    *                       ignoring anything that occurs before/after.)  Allows us to cut off
    *                       a prefix and/or suffix.
    * rpt_gen: A generating function returning the repeat information (created by nextRepeat).
    *          Each item is unpacked as (chr, start, finish, strand, family, rpt_class, rpt_id);
    *          repeats are assumed to be sorted by start.
    * mask: If true, all repeats will be lower-case.  Otherwise, upper case.
    * min_interval: Minimum allowed length of a sequence between repeats.  If two repeats are
    *               closer than this, extend the length.
    * max_interval: Maximum allowed length of a sequence between repeats.  If two repeats are
    *               further apart than this, cut the length.  -1 means len(seq).
    * num_repeats: If set, stop after this many repeats have been placed.
    * max_length: Accepted but not used by this implementation.
    * limiting_chr: If set, repeats whose chr is not in this container are skipped.
    * rep_base_hash: If set, maps a repeat family to the sequence emitted for it (used instead
    *                of the template slice seq[start:finish]).

    Returns the pair (sim_seq, fa_out_str): the simulated sequence and the text of the matching
    .fa.out file (fa_out_header followed by one fa_out_template line per repeat).
    """
    last_end = chr_start
    if max_interval == -1:
        max_interval = len(seq)

    pieces = []          # Simulated sequence, in pieces (joined once at the end)
    emitted_len = 0      # Total length of everything in pieces so far
    fa_out = []          # Hold the new .fa.out file contents (one record per repeat)
    rpt_count = 0        # Count of repeats (so we can quit when we reach num_repeats)

    for chrom, start, finish, strand, family, rpt_class, rpt_id in rpt_gen:
        if limiting_chr and chrom not in limiting_chr:
            # Skip if we are on the wrong chromosome
            continue
        if start >= chr_finish:
            # Quit if we have gone past the allowed range
            break
        if start < chr_start or finish > chr_finish:
            # Skip if we are outside the allowed range
            continue
        if start < last_end:
            # Skip if this repeat overlapped the last one
            continue

        rpt_count += 1

        # Add the next inter-TE sequence, clamped to [min_interval, max_interval]
        # (min_interval wins if the two conflict).
        inter_seq_len = max(min_interval, min(start - last_end, max_interval))
        inter_seq = markov_gen.generate_sequence(markov_list, inter_seq_len)
        pieces.append(inter_seq)
        emitted_len += len(inter_seq)

        # Add the repeat itself
        if rep_base_hash:
            rpt_seq = rep_base_hash[family]
        else:
            rpt_seq = seq[start:finish]
        # Coords are 1-based and inclusive (biologist notation), relative to the simulated sequence
        fa_out.append([
            chrom, emitted_len + 1, emitted_len + len(rpt_seq), strand, family, rpt_class, rpt_id
        ])
        pieces.append(rpt_seq.lower() if mask else rpt_seq.upper())
        emitted_len += len(rpt_seq)

        if rpt_count == num_repeats:
            # NOTE(review): last_end is not advanced past this last repeat, so the final
            # segment below is measured from the end of the previous one -- confirm intended.
            break
        last_end = max(last_end, finish)

    # Add final sequence on.  The original passed inter_seq_len here, which reused the last
    # gap length (and raised NameError when no repeat had been accepted).
    final_seq_len = max(min_interval, min(chr_finish - last_end, max_interval))
    pieces.append(markov_gen.generate_sequence(markov_list, final_seq_len))

    sim_seq = "".join(pieces)
    sim_seq_len = len(sim_seq)
    fa_out_lines = [
        fa_out_template.format(chr=chrom, start=start, finish=finish, left=sim_seq_len - finish,
                               strand=strand, family=family, rpt_class=rpt_class, rpt_id=rpt_id)
        for chrom, start, finish, strand, family, rpt_class, rpt_id in fa_out
    ]
    fa_out_str = fa_out_header + "".join(fa_out_lines)
    return sim_seq, fa_out_str