示例#1
0
def add_single_indel(indelfo, pos, length, gapped_codon_positions, keep_in_frame=False, debug=False):
    """Add one randomly chosen insertion or deletion of <length> at <pos> to <indelfo>, modifying it in place.

    With equal probability this either
      - inserts <length> random bases from utils.nukes into the qr gap seq (and the same number of gap
        characters into the gl gap seq), shifting by <length> any entries of <gapped_codon_positions>
        and any existing indel positions that are to the right of <pos>, or
      - replaces <length> positions of the qr gap seq, starting at <pos>, with gap characters.
    The new indel is then appended to indelfo['indels'], which is re-sorted by position.

    Returns None. If the new indel's sequence contains gaps, prints a message and returns before
    changing anything in <indelfo>.
    NOTE <keep_in_frame> is accepted but isn't used in this function.
    """
    gap_char = utils.gap_chars[0]
    qr_seq, gl_seq = indelfo['qr_gap_seq'], indelfo['gl_gap_seq']

    # first work out what kind of indel we're adding, and the sequence that goes with it
    if numpy.random.uniform(0, 1) < 0.5:  # fifty-fifty chance of insertion and deletion
        indel_type = 'insertion'
        bases = []
        for _ in range(length):
            bases.append(utils.nukes[random.randint(0, len(utils.nukes) - 1)])
        seqstr = ''.join(bases)
    else:
        indel_type = 'deletion'
        # NOTE it's kind of unclear whether this should be the bit in the qr or gl seq. Using the gl probably makes more
        # sense, since it corresponds to what we would infer in s-w (i.e., if we _do_ delete some SHMd positions, we
        # will never know about it, so who cares)
        seqstr = gl_seq[pos : pos + length]

    # this is a backup for the uncommon cases where overlaps() in the calling fcn doesn't catch something
    if utils.gap_len(seqstr) > 0:
        print('  failed adding indel (overlaps with previous one)')
        return

    # then apply it to the gapped sequences
    if indel_type == 'insertion':
        indelfo['qr_gap_seq'] = qr_seq[:pos] + seqstr + qr_seq[pos:]
        indelfo['gl_gap_seq'] = gl_seq[:pos] + length * gap_char + gl_seq[pos:]
        for region in gapped_codon_positions:
            # this isn't right if the indel is actually in the codon, but in that case we just let the messed up codon through below
            if gapped_codon_positions[region] > pos:
                gapped_codon_positions[region] += length
        for otherfo in indelfo['indels']:  # existing indels to the right of this one have to move over
            if otherfo['pos'] > pos:
                otherfo['pos'] += length
    else:
        indelfo['qr_gap_seq'] = qr_seq[:pos] + length * gap_char + qr_seq[pos + length:]

    # the codon check is always run, but its result only affects the debug printout
    cyst_ok = utils.codon_unmutated('cyst', indelfo['qr_gap_seq'], gapped_codon_positions['v'])
    if debug and not cyst_ok:
        print('  adding indel within %s codon' % 'cyst')

    new_ifo = {'type' : indel_type, 'pos' : pos, 'len' : length, 'seqstr' : seqstr}
    indelfo['indels'].append(new_ifo)
    indelfo['indels'] = sorted(indelfo['indels'], key=lambda fo: fo['pos'])

    if debug:
        print(get_dbg_str(indelfo))
示例#2
0
def reconstruct_indelfo_from_gap_seqs_and_naive_seq(qr_gap_seq, gl_gap_seq, indel_genes, line, iseq, debug=False):
    """Build an indelfo from the two gapped sequences plus info from <line>.

    <line> is either one that doesn't already have <indelfos> in it (from a new-style file), or one that does,
    but for which we want to reconstruct the indelfos to make sure we get the same thing.
    NOTE the gap seqs are passed separately on purpose! Don't use any that might be in <line>.
    NOTE <indel_genes> isn't used in this function.

    Returns get_empty_indel() if neither gap seq contains any gaps, otherwise the indelfo from
    get_indelfo_from_cigar_and_line().
    """
    if utils.gap_len(qr_gap_seq) != 0 or utils.gap_len(gl_gap_seq) != 0:
        # make a cigar str from the gapped sequences, then combine it with info from <line>
        cigarstr = get_cigarstr_from_gap_seqs(qr_gap_seq, gl_gap_seq, debug=debug)
        return get_indelfo_from_cigar_and_line(cigarstr, line, iseq, debug=debug)
    return get_empty_indel()
示例#3
0
	def connect_cost(self, connector, line1, line2, pos_offset=None):
		'''
		Cost of connector connecting line1 and line2
		Assumes the lines are approx colinear
		:param connector: line connecting line1/2
		:param line1/2: lines to connect; swapped here if needed so that line1 has the smaller position
		:param pos_offset: pos offset: connector.pos coords = line1/2.pos coords + offset; (0, 0) if None
		:return: gap length between the two lines weighted by the connector count plus the mean line
			count, plus a penalty for each connector end that doesn't line up with the matching line end
		'''
		if pos_offset is None:
			pos_offset = (0, 0)
		o = connector.orien
		if line1.pos[1-o] > line2.pos[1-o]:
			# make sure the line1 has the smaller position
			line1, line2 = line2, line1

		connector_range = connector.length_range()
		line1_range = line1.length_range(offset=pos_offset[1-o])
		line2_range = line2.length_range(offset=pos_offset[1-o])

		gap_len = utils.gap_len(line1_range, line2_range)
		# use 2.0 so the mean isn't truncated for integer counts (same as the end terms below)
		cost = gap_len * (connector.count + ( (line1.count + line2.count) / 2.0)) # double count the gap

		# count the cost if line1 and line2 don't align perfectly with the ends of connector
		start_diff = abs(connector_range[0] - line1_range[0])
		cost += start_diff * ( (connector.count + line1.count) / 2.0)
		end_diff = abs(connector_range[1] - line2_range[1])
		cost += end_diff * ( (connector.count + line2.count) / 2.0)
		return cost
示例#4
0
 def overlaps(pos, length):
     """Return True if an indel of <length> at <pos> would be too close to an existing gap, or too near the end.

     Checks both gap seqs of <indelfo> (which comes from the enclosing scope, not from an argument).
     NOTE in practice this _really_ shouldn't happen much -- there should be only a couple of indels per
     sequence at most -- this just keeps other things (e.g. indelfo consistency checks) from getting confused
     and crashing.
     """
     # clamp at zero: a negative start would be counted from the end of the string, which typically gives an
     # empty slice, so gaps near the start of the sequence would go unnoticed
     window_start = max(0, pos - length)
     for gapseq in (indelfo['qr_gap_seq'], indelfo['gl_gap_seq']):
         if len(gapseq) < pos + length + 1:
             return True
         if utils.gap_len(gapseq[window_start : pos + length]) > 0:  # this leaves a pretty, albeit inexact, large buffer
             return True
     return False
示例#5
0
def combine_indels(regional_indelfos,
                   full_qrseq,
                   qrbounds,
                   uid=None,
                   debug=False):
    """Combine the per-region indelfos in <regional_indelfos> into one indelfo covering all of <full_qrseq>.

    Regions are handled in the order of utils.regions. A region that has an entry in <regional_indelfos>
    contributes its own gap seqs and a deep copy of its indels, with each indel position increased by the
    number of gaps in the qr gap seqs of the earlier regions that also have entries. Any other region, and
    each stretch between two consecutive regions, contributes the corresponding slice of <full_qrseq> on
    the qr side and utils.ambig_base characters on the gl side. The parts of <full_qrseq> to the left of the
    v bounds and to the right of the j bounds are added at the ends after the reversed seq is calculated.

    Returns the combined indelfo, or None if the non-gap length of a region's qr gap seq doesn't match the
    length given by its qrbounds.
    Raises Exception if the qrbounds aren't in increasing order, or if the j bound extends beyond <full_qrseq>.
    NOTE <uid> isn't used by any of the live code here.
    """
    def bounds_str():
        return '   '.join([('%s %s' % (reg, qrbounds[reg])) for reg in utils.regions])

    joint_indelfo = get_empty_indel()
    seq_len = len(full_qrseq)

    # make sure the regional qrbounds consist of a nice orderly progression
    flat_bounds = []
    for reg in utils.regions:
        flat_bounds.extend(qrbounds[reg])
    if flat_bounds != sorted(flat_bounds):
        raise Exception('messed up qrbounds %s for qr sequence with length %d:\n  %s' % (bounds_str(), seq_len, full_qrseq))
    if qrbounds['j'][1] > seq_len:  # only the j bound is checked here
        raise Exception('qrbounds %s extend beyond sequence with len %d:\n  %s' % (qrbounds, seq_len, full_qrseq))

    if debug > 1:
        n_fos = len(regional_indelfos)
        regions_with_fos = ' '.join([reg for reg in utils.regions if reg in regional_indelfos])
        print('combining %d indelfo%s from %s' % (n_fos, utils.plural(n_fos), regions_with_fos))
        print('  qrbounds:   %s' % bounds_str())
        print('     full qr %s' % full_qrseq)

    qr_pieces, gl_pieces = [], []  # consecutive chunks of the two gap seqs, joined together below
    n_regions = len(utils.regions)
    for ireg, region in enumerate(utils.regions):
        if debug > 1:
            print('  %s' % region)

        if region not in regional_indelfos:
            # nothing from this region, so just use the query sequence (and ambiguous bases for the germline)
            qr_pieces.append(full_qrseq[qrbounds[region][0]:qrbounds[region][1]])
            gl_pieces.append(utils.ambig_base * (qrbounds[region][1] - qrbounds[region][0]))
        else:
            rfo = regional_indelfos[region]
            assert has_indels(rfo)  # calling fcn needs to not add it if it doesn't have indels
            joint_indelfo['genes'][region] = rfo['genes'][region]

            # should be fixed by overlapping boundary shifter, but it managed to happen *again* (see issue #310), so give up rather than raise
            qrbound_len = qrbounds[region][1] - qrbounds[region][0]
            if utils.non_gap_len(rfo['qr_gap_seq']) != qrbound_len:
                return None
            qr_pieces.append(rfo['qr_gap_seq'])
            gl_pieces.append(rfo['gl_gap_seq'])

            shifted_indels = copy.deepcopy(rfo['indels'])
            for prev_reg in utils.regions[:ireg]:  # loop over previous regions
                if prev_reg in regional_indelfos:  # don't need to do anything if the previous region didn't have indels
                    n_prev_gaps = utils.gap_len(regional_indelfos[prev_reg]['qr_gap_seq'])  # number of gaps in the previous region's qr gap seq
                    for ifo in shifted_indels:
                        ifo['pos'] += n_prev_gaps
                        if debug > 1:
                            print('    add %d to pos for gaps in %s' % (n_prev_gaps, prev_reg))
            joint_indelfo['indels'] += shifted_indels

        if debug > 1:
            red_gap = utils.color('red', utils.gap_chars[0])
            print('    %s\n    %s' % (qr_pieces[-1].replace(utils.gap_chars[0], red_gap), gl_pieces[-1].replace(utils.gap_chars[0], red_gap)))

        # then the bit between this region and the next one
        if ireg < n_regions - 1:
            next_reg = utils.regions[ireg + 1]
            assert region + next_reg in utils.boundaries
            between_qr = full_qrseq[qrbounds[region][1]:qrbounds[next_reg][0]]
            between_gl = utils.ambig_base * (qrbounds[next_reg][0] - qrbounds[region][1])
            qr_pieces.append(between_qr)
            gl_pieces.append(between_gl)
            if debug > 1:
                print('  %s%s' % (region, next_reg))
                print('    %s\n    %s' % (between_qr, between_gl))

    if debug > 1:
        print('combined gap seqs:')
        print('  qr %s' % '  '.join(qr_pieces))
        print('  gl %s' % '  '.join(gl_pieces))

    joint_indelfo['qr_gap_seq'] = ''.join(qr_pieces)
    joint_indelfo['gl_gap_seq'] = ''.join(gl_pieces)
    assert len(joint_indelfo['qr_gap_seq']) == len(joint_indelfo['gl_gap_seq'])

    # the reversed seq is calculated before the bits outside of v and j get added to the gap seqs
    left_flank = full_qrseq[:qrbounds['v'][0]]
    right_flank = full_qrseq[qrbounds['j'][1]:]
    joint_indelfo['reversed_seq'] = get_reversed_seq(joint_indelfo['qr_gap_seq'], joint_indelfo['gl_gap_seq'], left_flank, right_flank)
    # NOTE the reversed seq can have Ns in it if there's Ns in the initial sequence

    joint_indelfo['qr_gap_seq'] = left_flank + joint_indelfo['qr_gap_seq'] + right_flank
    joint_indelfo['gl_gap_seq'] = utils.ambig_base * qrbounds['v'][0] + joint_indelfo['gl_gap_seq'] + utils.ambig_base * (seq_len - qrbounds['j'][1])

    if debug:
        print('combined')
        print(get_dbg_str(joint_indelfo))

    return joint_indelfo