def adjust_seq_length(self, mut_seq, orientation, full_sequence, bounds):
    """Truncate or extend a read so that it fits the read length

    Introducing insertions or deletions changes the length of a read.
    This function trims surplus bases from the end of the (mutable) read,
    or appends bases taken from the reference sequence, until the read is
    exactly `self.read_length` long. `mut_seq` is modified in place.

    Args:
        mut_seq (MutableSeq): a mutable sequence
        orientation (string): orientation of the read. Can be 'forward' or
            'reverse'. With any other value a read that is too short is
            returned without being extended.
        full_sequence (Seq): the reference sequence from which mut_seq
            comes from
        bounds (tuple): the (start, end) position of the read in the
            full_sequence. Only the end position is used.

    Returns:
        Seq: a sequence fitting the ErrorModel
    """
    _, read_end = bounds
    target_length = self.read_length

    # Read too long (net insertion): drop bases from its end.
    while len(mut_seq) > target_length:
        mut_seq.pop()

    # Read too short (net deletion): take the missing bases from the
    # reference, starting right after the end of the read.
    # NOTE(review): the reverse orientation also reads from `read_end`
    # onwards - confirm this is the intended side for a reverse read.
    missing = target_length - len(mut_seq)
    if missing > 0 and orientation in ('forward', 'reverse'):
        reference_length = len(full_sequence)
        for position in range(read_end, read_end + missing):
            if position >= reference_length:
                # ran past the end of the reference: pad with 'A'
                base = 'A'
            elif orientation == 'forward':
                base = str(full_sequence[position])
            else:
                base = util.rev_comp(full_sequence[position])
            mut_seq.append(base)

    return mut_seq.toseq()
def test_rev_comp():
    """util.rev_comp reverse-complements lower- and uppercase sequences"""
    cases = (
        ('attgctat', 'atagcaat'),
        ('CCGATTAC', 'GTAATCGG'),
    )
    for sequence, expected in cases:
        assert util.rev_comp(sequence) == expected
def simulate_read(record, ErrorModel, i, cpu_number):
    """Form a read pair from one genome (or sequence) according to an
    ErrorModel

    Each read is a SeqRecord object.
    Returns a tuple containing the forward and reverse read.

    The process is terminated with `sys.exit(1)` when the reference is not
    longer than the read length of the ErrorModel.

    Args:
        record (SeqRecord): sequence or genome of reference
        ErrorModel (ErrorModel): an ErrorModel class
        i (int): a number identifying the read
        cpu_number (int): cpu number. Is added to the read id.

    Returns:
        tuple: tuple containing a forward read and a reverse read
    """
    logger = logging.getLogger(__name__)
    sequence = record.seq
    header = record.id
    seq_length = len(sequence)

    read_length = ErrorModel.read_length
    insert_size = ErrorModel.random_insert_size()

    # Explicit check rather than `assert`: assertions are stripped when
    # Python runs with -O, which would let too-short references through.
    if read_length >= seq_length:
        logger.error(
            '%s shorter than read length for this ErrorModel', header)
        sys.exit(1)

    # generate the forward read
    try:
        # ideally the whole template (2 * read_length + insert_size) fits
        forward_start = random.randrange(
            0, seq_length - (2 * read_length + insert_size))
    except ValueError as e:
        # empty range: the reference is shorter than the template, so only
        # require that the forward read itself fits
        logger.debug(
            '%s shorter than template length for this ErrorModel:%s',
            header, e)
        forward_start = random.randrange(0, seq_length - read_length)

    forward_end = forward_start + read_length
    bounds = (forward_start, forward_end)
    # create a perfect read
    forward = SeqRecord(
        Seq(str(sequence[forward_start:forward_end]),
            IUPAC.unambiguous_dna),
        id='%s_%s_%s/1' % (header, i, cpu_number),
        description='')
    # add the indels, the qual scores and modify the record accordingly
    forward.seq = ErrorModel.introduce_indels(
        forward, 'forward', sequence, bounds)
    forward = ErrorModel.introduce_error_scores(forward, 'forward')
    forward.seq = ErrorModel.mut_sequence(forward, 'forward')

    # generate the reverse read
    reverse_start = forward_end + insert_size
    reverse_end = reverse_start + read_length
    if reverse_end >= seq_length:
        # we use random insert when the modelled template length
        # distribution is too large
        reverse_end = random.randrange(read_length, seq_length)
        reverse_start = reverse_end - read_length

    bounds = (reverse_start, reverse_end)
    # create a perfect read
    reverse = SeqRecord(
        Seq(rev_comp(str(sequence[reverse_start:reverse_end])),
            IUPAC.unambiguous_dna),
        id='%s_%s_%s/2' % (header, i, cpu_number),
        description='')
    # add the indels, the qual scores and modify the record accordingly
    reverse.seq = ErrorModel.introduce_indels(
        reverse, 'reverse', sequence, bounds)
    reverse = ErrorModel.introduce_error_scores(reverse, 'reverse')
    reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse')

    return (forward, reverse)
def simulate_read_with_error_model(cls, genome, ErrorModel, i,
                                   always_forward=True):
    """Form a read from one genome (or sequence) according to an ErrorModel

    Returns a string. The positions are drawn from a numpy RandomState
    seeded with `i`.

    Args:
        genome (string): sequence or genome of reference. When it is not
            longer than the read length it is padded with 'N' so that it
            is one base longer than a read.
        ErrorModel (ErrorModel): an ErrorModel class
        i (int): a number identifying the read. Also used as random seed.
        always_forward (bool): when True (default) a forward read is always
            returned; when False a forward or a reverse read is returned
            depending on a random draw.

    Returns:
        string: a string representing a single read
    """
    np_random = np.random.RandomState(seed=i)
    read_length = ErrorModel.read_length
    if len(genome) <= read_length:
        genome = "".join([genome, "N" * (read_length - len(genome) + 1)])
    record = SeqRecord(
        Seq(genome, IUPAC.unambiguous_dna),
        id=f'genome_{i}',
        description='')
    sequence = record.seq
    header = record.id
    seq_length = len(sequence)

    # pick the forward position (also anchors the reverse read)
    forward_start = np_random.randint(
        low=0, high=max(seq_length - read_length + 1, 1))
    forward_end = forward_start + read_length

    # drawn even when always_forward is set, to keep the random stream
    # identical for a given seed
    generate_forward = np_random.randint(low=0, high=2)

    if generate_forward or always_forward:
        bounds = (forward_start, forward_end)
        # create a perfect read
        forward = SeqRecord(
            Seq(str(sequence[forward_start:forward_end]),
                IUPAC.unambiguous_dna),
            id='%s_%s' % (header, i),
            description='')
        # add the indels, the qual scores and modify the record accordingly
        forward.seq = ErrorModel.introduce_indels(
            forward, 'forward', sequence, bounds)
        forward = ErrorModel.introduce_error_scores(forward, 'forward')
        forward.seq = ErrorModel.mut_sequence(forward, 'forward')
        return str(forward.seq)

    insert_size = ErrorModel.random_insert_size()
    reverse_start = forward_end + insert_size
    reverse_end = reverse_start + read_length
    # Explicit check rather than `assert`/`except AssertionError`:
    # assertions are stripped under -O, which would disable this fallback.
    if reverse_end >= seq_length:
        # the template does not fit: place the reverse read at random
        reverse_end = np_random.randint(low=read_length, high=seq_length)
        reverse_start = reverse_end - read_length

    bounds = (reverse_start, reverse_end)
    # create a perfect read
    reverse = SeqRecord(
        Seq(rev_comp(str(sequence[reverse_start:reverse_end])),
            IUPAC.unambiguous_dna),
        id='%s_%s' % (header, i),
        description='')
    # add the indels, the qual scores and modify the record accordingly
    reverse.seq = ErrorModel.introduce_indels(
        reverse, 'reverse', sequence, bounds)
    reverse = ErrorModel.introduce_error_scores(reverse, 'reverse')
    reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse')
    return str(reverse.seq)