예제 #1
0
    def adjust_seq_length(self, mut_seq, orientation, full_sequence, bounds):
        """Bring a read back to exactly ``self.read_length`` bases.

        Insertions and deletions change the length of a read. A read that
        became too long is trimmed from its end; a read that became too short
        is extended with bases taken from the reference, starting at the end
        position of the read. Once the reference is exhausted, 'A' is used
        as filler.

        Args:
            mut_seq (MutableSeq): the read, modified in place
            orientation (string): orientation of the read. Can be 'forward' or
                'reverse'. With any other value a short read is returned
                without being extended
            full_sequence (Seq): the reference sequence the read was taken
                from
            bounds (tuple): (start, end) position of the read in
                full_sequence. Only the end position is used

        Returns:
            Seq: the result of ``mut_seq.toseq()`` after adjustment
        """
        _, read_end = bounds
        target_length = self.read_length
        current_length = len(mut_seq)

        if current_length > target_length:
            # too long: drop the surplus bases from the end of the read
            for _ in range(current_length - target_length):
                mut_seq.pop()
        elif (current_length < target_length
              and orientation in ('forward', 'reverse')):
            # NOTE(review): reverse reads are also extended from read_end
            # onwards in the reference -- confirm this is the intended side.
            reference_length = len(full_sequence)
            missing = target_length - current_length
            for position in range(read_end, read_end + missing):
                if position >= reference_length:
                    # ran past the end of the reference: pad with 'A'
                    base = 'A'
                elif orientation == 'forward':
                    base = str(full_sequence[position])
                else:
                    base = util.rev_comp(full_sequence[position])
                mut_seq.append(base)

        return mut_seq.toseq()
예제 #2
0
def test_rev_comp():
    """Check util.rev_comp on a lowercase and an uppercase sequence."""
    # (input sequence, expected reverse complement)
    cases = (
        ('attgctat', 'atagcaat'),
        ('CCGATTAC', 'GTAATCGG'),
    )
    for sequence, expected in cases:
        assert util.rev_comp(sequence) == expected
예제 #3
0
def simulate_read(record, ErrorModel, i, cpu_number):
    """Form a read pair from one genome (or sequence) according to an
    ErrorModel

    Each read is a SeqRecord object
    returns a tuple containing the forward and reverse read.

    The forward read is placed at a random position that leaves room for the
    insert and the reverse read. When the reference is too short for that,
    the forward read is placed anywhere it fits and the reverse read falls
    back to a random position as well.

    Args:
        record (SeqRecord): sequence or genome of reference
        ErrorModel (ErrorModel): an ErrorModel class
        i (int): a number identifying the read
        cpu_number (int): cpu number. Is added to the read id.

    Returns:
        tuple: tuple containing a forward read and a reverse read

    Raises:
        SystemExit: with status 1 when the reference sequence is not longer
            than the read length of the ErrorModel
    """
    logger = logging.getLogger(__name__)
    sequence = record.seq
    header = record.id
    sequence_length = len(sequence)

    read_length = ErrorModel.read_length
    insert_size = ErrorModel.random_insert_size()

    # Explicit check instead of `assert`, so the validation still runs when
    # Python is started with -O.
    if read_length >= sequence_length:
        logger.error('%s shorter than read length for this ErrorModel:%s',
                     header, read_length)
        sys.exit(1)

    # generate the forward read
    try:  # a ref sequence has to be longer than 2 * read_length + i_size
        forward_start = random.randrange(
            0,
            sequence_length - (2 * read_length + insert_size))
    except ValueError as e:
        # randrange refuses an empty range: the template does not fit in the
        # reference, so only make sure the forward read itself fits
        logger.debug('%s shorter than template length for this ErrorModel:%s',
                     header, e)
        forward_start = random.randrange(0, sequence_length - read_length)

    forward_end = forward_start + read_length
    bounds = (forward_start, forward_end)
    # create a perfect read
    forward = SeqRecord(Seq(str(sequence[forward_start:forward_end]),
                            IUPAC.unambiguous_dna),
                        id='%s_%s_%s/1' % (header, i, cpu_number),
                        description='')
    # add the indels, the qual scores and modify the record accordingly
    forward.seq = ErrorModel.introduce_indels(forward, 'forward', sequence,
                                              bounds)
    forward = ErrorModel.introduce_error_scores(forward, 'forward')
    forward.seq = ErrorModel.mut_sequence(forward, 'forward')

    # generate the reverse read
    reverse_start = forward_end + insert_size
    reverse_end = reverse_start + read_length
    if reverse_end >= sequence_length:
        # we use random insert when the modelled template length distribution
        # is too large
        reverse_end = random.randrange(read_length, sequence_length)
        reverse_start = reverse_end - read_length
    bounds = (reverse_start, reverse_end)
    # create a perfect read
    reverse = SeqRecord(Seq(rev_comp(str(sequence[reverse_start:reverse_end])),
                            IUPAC.unambiguous_dna),
                        id='%s_%s_%s/2' % (header, i, cpu_number),
                        description='')
    # add the indels, the qual scores and modify the record accordingly
    reverse.seq = ErrorModel.introduce_indels(reverse, 'reverse', sequence,
                                              bounds)
    reverse = ErrorModel.introduce_error_scores(reverse, 'reverse')
    reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse')

    return (forward, reverse)
예제 #4
0
    def simulate_read_with_error_model(cls,
                                       genome,
                                       ErrorModel,
                                       i,
                                       always_forward=True):
        """Form a read from one genome (or sequence) according to an
        ErrorModel
        returns a string

        The start position and the forward/reverse choice are drawn from a
        numpy RandomState seeded with `i`. A genome that is not longer than
        the read length is padded with 'N' until it is one base longer.

        Args:
            genome (string): sequence or genome of reference
            ErrorModel (ErrorModel): an ErrorModel class
            i (int): a number identifying the read, also used as random seed
            always_forward (bool): when True (the default) a forward read is
                always produced; when False a forward or a reverse read is
                produced depending on a random draw
        Returns:
            string: a string representing a single read
        """
        np_random = np.random.RandomState(seed=i)

        read_length = ErrorModel.read_length

        if len(genome) <= read_length:
            genome = "".join([genome, "N" * (read_length - len(genome) + 1)])

        record = SeqRecord(Seq(genome, IUPAC.unambiguous_dna),
                           id=f'genome_{i}',
                           description='')

        sequence = record.seq
        header = record.id
        sequence_length = len(sequence)

        # generate the forward read
        forward_start = np_random.randint(
            low=0, high=max(sequence_length - read_length + 1, 1))

        forward_end = forward_start + read_length

        # drawn even when always_forward is set, so the sequence of draws
        # from np_random does not depend on the flag
        generate_forward = np_random.randint(low=0, high=2)

        if generate_forward or always_forward:
            bounds = (forward_start, forward_end)
            # create a perfect read
            forward = SeqRecord(Seq(str(sequence[forward_start:forward_end]),
                                    IUPAC.unambiguous_dna),
                                id='%s_%s' % (header, i),
                                description='')
            # add the indels, the qual scores and modify the record accordingly
            forward.seq = ErrorModel.introduce_indels(forward, 'forward',
                                                      sequence, bounds)
            forward = ErrorModel.introduce_error_scores(forward, 'forward')
            forward.seq = ErrorModel.mut_sequence(forward, 'forward')

            return str(forward.seq)

        # generate the reverse read
        insert_size = ErrorModel.random_insert_size()
        reverse_start = forward_end + insert_size
        reverse_end = reverse_start + read_length
        # Explicit check instead of `assert`, so the fallback still happens
        # when Python is started with -O.
        if reverse_end >= sequence_length:
            # the modelled insert does not fit: place the read at random
            reverse_end = np_random.randint(low=read_length,
                                            high=sequence_length)
            reverse_start = reverse_end - read_length

        bounds = (reverse_start, reverse_end)
        # create a perfect read
        reverse = SeqRecord(Seq(
            rev_comp(str(sequence[reverse_start:reverse_end])),
            IUPAC.unambiguous_dna),
                            id='%s_%s' % (header, i),
                            description='')
        # add the indels, the qual scores and modify the record accordingly
        reverse.seq = ErrorModel.introduce_indels(reverse, 'reverse',
                                                  sequence, bounds)
        reverse = ErrorModel.introduce_error_scores(reverse, 'reverse')
        reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse')

        return str(reverse.seq)