Пример #1
0
    def get_indel_info(self, query_name, cigarstr, qrseq, glseq, gene):
        """Build the indel info for one query/germline alignment from its cigar string.

        The cigar string is expanded to one code per alignment position, and the query
        (qrseq) and germline (glseq) sequences are then walked in parallel:
          'M': the query base is appended to indelfo['reversed_seq']
          'I': the query base is appended only to that insertion's 'seqstr' (so it is left out of 'reversed_seq')
          'D': the germline base is appended both to 'reversed_seq' and to that deletion's 'seqstr'
          'S': skipped entirely (neither sequence index advances)
        Any other code raises an Exception.

        Each entry in indelfo['indels'] has the keys 'type' ('insertion' or 'deletion'), 'pos' (the summed
        length of all cigar operations that precede the indel), 'len', and 'seqstr'.

        query_name and gene are only used to label the printout that is made when self.debug is set.
        Returns the indelfo dict, which starts out as utils.get_empty_indel().
        """
        # split the cigar string into (code, length) pairs, e.g. '3M2I' --> [('M', 3), ('I', 2)]
        operations = [(part[-1], int(part[:-1])) for part in re.findall('[0-9][0-9]*[A-Z]', cigarstr)]

        indelfo = utils.get_empty_indel()
        expanded_codes = []  # one chunk of repeated code letters per cigar operation
        indel_index_at = []  # for each alignment position: index into indelfo['indels'], or None if not in an indel
        n_preceding = 0  # summed length of the operations handled so far
        for op, n_bases in operations:
            expanded_codes.append(n_bases * op)
            if op in ('I', 'D'):
                indelfo['indels'].append({'type' : 'insertion' if op == 'I' else 'deletion', 'pos' : n_preceding, 'len' : n_bases, 'seqstr' : ''})
                indel_index_at.extend([len(indelfo['indels']) - 1] * n_bases)
            else:
                indel_index_at.extend([None] * n_bases)
            n_preceding += n_bases

        qr_chunks, gl_chunks = [], []  # colored pieces of the two lines in the debug printout
        iqr, igl = 0, 0  # current positions in qrseq and glseq
        for ialign, op in enumerate(''.join(expanded_codes)):
            if op == 'S':  # neither index is advanced for these
                continue
            if op == 'M':
                qr_base = qrseq[iqr]
                gl_base = glseq[igl]
                qr_chunks.append(qr_base if qr_base == gl_base else utils.color('red', qr_base))  # mismatches in red
                gl_chunks.append(gl_base)
                indelfo['reversed_seq'] += qr_base
                iqr += 1
                igl += 1
            elif op == 'I':  # base present in the query only, so only the query index advances
                qr_chunks.append(utils.color('light_blue', qrseq[iqr]))
                gl_chunks.append(utils.color('light_blue', '*'))
                indelfo['indels'][indel_index_at[ialign]]['seqstr'] += qrseq[iqr]
                iqr += 1
            elif op == 'D':  # base present in the germline only, so only the germline index advances
                qr_chunks.append(utils.color('light_blue', '*'))
                gl_chunks.append(utils.color('light_blue', glseq[igl]))
                indelfo['reversed_seq'] += glseq[igl]
                indelfo['indels'][indel_index_at[ialign]]['seqstr'] += glseq[igl]
                igl += 1
            else:
                raise Exception('unhandled code %s' % op)

        if self.debug:
            print('\n      indels in %s' % query_name)
            print('          %20s %s' % (gene, ''.join(gl_chunks)))
            print('          %20s %s' % ('query', ''.join(qr_chunks)))
            for idl in indelfo['indels']:
                print('          %10s: %d bases at %d (%s)' % (idl['type'], idl['len'], idl['pos'], idl['seqstr']))

        return indelfo
Пример #2
0
    def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
        """Store the annotation for query_name in self.info and update the bookkeeping objects.

        Written to self.info[query_name]: the unique id, the v and d k values, a colon-joined string of
        all v, d, and j match names, the cdr3 length and conserved codon positions, the 5p/3p deletions
        and the four insertions for the best v/d/j genes, the indel info (looked up in
        self.info['indels'], defaulting to an empty indelfo), the best gene for each region, and the
        query sequence. utils.add_implicit_info() is then called on the new entry.

        Also appends query_name to self.info['queries'], adds to self.info['all_best_matches'] and
        self.info['all_matches'], increments self.alfinder, self.pcounter, and self.true_pcounter where
        they are set, runs self.perfplotter (unless the query has indels), and finally removes
        query_name from self.remaining_queries.

        Asserts that query_name is not already in self.info.
        """
        assert query_name not in self.info
        self.info['queries'].append(query_name)
        line = {}
        self.info[query_name] = line  # everything below fills in this same dict
        line['unique_id'] = query_name  # redundant, but used somewhere down the line
        for kregion in ('v', 'd'):
            line['k_' + kregion] = kvals[kregion]
        line['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])  # all gene matches for this query

        line['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3
        line['cyst_position'] = codon_positions['v']
        line['tryp_position'] = codon_positions['j']

        # deletions for the best match: 5p is where the germline match starts, 3p is len(germline) - (germline match end)
        for region in ('v', 'd', 'j'):
            best_gene = best[region]
            line[region + '_5p_del'] = all_germline_bounds[best_gene][0]
            line[region + '_3p_del'] = len(self.glfo['seqs'][region][best_gene]) - all_germline_bounds[best_gene][1]

        # insertions are the bits of the query sequence outside of/between the best v, d, and j matches
        v_qr_bounds = all_query_bounds[best['v']]
        d_qr_bounds = all_query_bounds[best['d']]
        j_qr_bounds = all_query_bounds[best['j']]
        line['fv_insertion'] = query_seq[ : v_qr_bounds[0]]
        line['vd_insertion'] = query_seq[v_qr_bounds[1] : d_qr_bounds[0]]
        line['dj_insertion'] = query_seq[d_qr_bounds[1] : j_qr_bounds[0]]
        line['jf_insertion'] = query_seq[j_qr_bounds[1] : ]

        line['indelfo'] = self.info['indels'].get(query_name, utils.get_empty_indel())

        for region in utils.regions:
            line[region + '_gene'] = best[region]
            self.info['all_best_matches'].add(best[region])
            self.info['all_matches'][region] |= set(match_names[region])

        line['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

        # these three were set by hand above, so tell add_implicit_info() that they already exist
        utils.add_implicit_info(self.glfo, line, multi_seq=False, existing_implicit_keys=('cdr3_length', 'cyst_position', 'tryp_position'))

        if self.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str='      ', label='true:')
            utils.print_reco_event(self.glfo['seqs'], line, extra_str='      ', label='inferred:')

        if self.alfinder is not None:
            self.alfinder.increment(line)
        if self.pcounter is not None:
            self.pcounter.increment_all_params(line)
            if self.true_pcounter is not None:
                self.true_pcounter.increment_all_params(self.reco_info[query_name])
        if self.perfplotter is not None:
            if query_name not in self.info['indels']:
                self.perfplotter.evaluate(self.reco_info[query_name], line)
            else:  # no idea how to handle naive hamming fraction when there's indels
                print('    skipping performance evaluation of %s because of indels' % query_name)

        self.remaining_queries.remove(query_name)
Пример #3
0
 def add_shm_indels(self, reco_event):
     """Append one indelfo per final sequence to reco_event.indelfos, adding an indel to some sequences.

     Every sequence in reco_event.final_seqs gets an empty indelfo appended. If
     self.args.indel_frequency is zero nothing else happens. Otherwise a uniform random number is
     drawn for each sequence: if it exceeds the indel frequency the sequence is left alone, and if
     not the current sequence is stored as the indelfo's 'reversed_seq' and
     self.add_single_indel() is applied to it (once, at the moment), with the result written
     back into reco_event.final_seqs.
     """
     indel_freq = self.args.indel_frequency
     debug = self.args.debug
     if debug and indel_freq > 0.:
         print('      indels')
     n_seqs = len(reco_event.final_seqs)
     for iseq in range(n_seqs):
         indelfo = utils.get_empty_indel()
         reco_event.indelfos.append(indelfo)
         if indel_freq == 0.:  # no indels at all
             continue
         if numpy.random.uniform(0, 1) > indel_freq:  # no indels for this sequence
             if debug:
                 print('        0')
             continue
         new_seq = reco_event.final_seqs[iseq]
         indelfo['reversed_seq'] = new_seq  # the original sequence, i.e. with all the indels reversed
         n_indels = 1  # numpy.random.geometric(1. / self.args.mean_n_indels)
         if debug:
             print('        %d' % n_indels)
         for _ in range(n_indels):
             new_seq = self.add_single_indel(new_seq, reco_event)
         reco_event.final_seqs[iseq] = new_seq
Пример #4
0
        def try_scratch_erode_insert(tmpline):
            """Draw new random erosions and insertions for tmpline, then rebuild its sequence and implicit info.

            tmpline is modified in place: its implicit info is removed, a deletion length is set for
            each erosion in utils.real_erosions, an insertion string is set for each boundary in
            utils.boundaries, 'seqs' is set to the single sequence assembled from the eroded germline
            genes and the vd and dj insertions, 'indelfos' is set to a single empty indelfo, and
            utils.add_implicit_info() is called. Returns nothing.

            Uses self from the enclosing scope (self.glfo, self.args.chain, self.insertion_content_probs).
            """
            utils.remove_all_implicit_info(tmpline)
            for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
                region = erosion[0]  # the first character of the erosion name is the region
                gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
                if self.args.chain != 'h' and region == 'd':  # light chains dummy d treatment
                    assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.chain]
                    tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0  # always erode the whole dummy d from the left
                    continue

                # Heuristic cap on the erosion. Floor division is spelled out so that the cap (and hence
                # the deletion length, which is used as a slice index below) stays an integer even
                # where '/' means true division.
                max_erosion = max(0, gene_length // 2 - 2)
                if region in utils.conserved_codons[self.args.chain]:  # also don't erode into the conserved codon
                    codon_pos = self.glfo[utils.conserved_codons[self.args.chain][region] + '-positions'][tmpline[region + '_gene']]
                    # NOTE(review): n_bases_to_codon is only assigned if the erosion name contains '3p' or '5p'
                    if '3p' in erosion:
                        n_bases_to_codon = gene_length - codon_pos - 3
                    elif '5p' in erosion:
                        n_bases_to_codon = codon_pos
                    max_erosion = min(max_erosion, n_bases_to_codon)
                n_eroded = numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1
                tmpline[erosion + '_del'] = min(max_erosion, n_eroded)

            for bound in utils.boundaries:
                mean_length = utils.scratch_mean_insertion_lengths[self.args.chain][bound]
                length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
                probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
                tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

            # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
            gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
            for erosion in utils.real_erosions:
                region = erosion[0]
                e_length = tmpline[erosion + '_del']
                if '5p' in erosion:  # chop bases off the left side
                    gl_seqs[region] = gl_seqs[region][e_length:]
                elif '3p' in erosion:  # chop bases off the right side
                    gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
            tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
            tmpline['indelfos'] = [utils.get_empty_indel(), ]
            utils.add_implicit_info(self.glfo, tmpline)
            assert len(tmpline['in_frames']) == 1
Пример #5
0
 def add_shm_indels(self, reco_event):
     """Reset reco_event.indelfos to one empty indelfo per final sequence, adding an indel to some sequences.

     If self.args.indel_frequency is zero, nothing beyond the reset happens. Otherwise a uniform
     random number is drawn for each sequence: if it exceeds the indel frequency the sequence is
     left alone, and if not the current sequence is stored as its indelfo's 'reversed_seq' and
     self.add_single_indel() is applied to it (once, at the moment), with the result written
     back into reco_event.final_seqs.
     """
     indel_freq = self.args.indel_frequency
     debug = self.args.debug
     if debug and indel_freq > 0.:
         print('      indels')
     reco_event.indelfos = [utils.get_empty_indel() for _ in range(len(reco_event.final_seqs))]
     if indel_freq == 0.:  # no indels at all
         return
     for iseq in range(len(reco_event.final_seqs)):
         if numpy.random.uniform(0, 1) > indel_freq:  # no indels for this sequence
             if debug:
                 print('        0')
             continue
         indelfo = reco_event.indelfos[iseq]
         indelfo['reversed_seq'] = reco_event.final_seqs[iseq]  # the original sequence, i.e. with all the indels reversed
         n_indels = 1  # numpy.random.geometric(1. / self.args.mean_n_indels)
         if debug:
             print('        %d' % n_indels)
         for _ in range(n_indels):
             reco_event.final_seqs[iseq] = self.add_single_indel(reco_event.final_seqs[iseq], indelfo, reco_event.final_codon_positions)
Пример #6
0
    def add_mutants(self, reco_event, irandom):
        """Fill reco_event.final_seqs with mutated versions of the recombined sequence, then add shm indels.

        If self.args.mutation_multiplier is exactly zero, the unmutated reco_event.recombined_seq is
        appended to final_seqs, reco_event.indelfos is set to one empty indelfo per final sequence,
        and the function returns.

        Otherwise one entry is picked at random from self.treeinfo. It looks like e.g.
            (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87
        where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted
        mean of v, d, and j), and the part after the semicolon gives a branch length ratio for each
        region (each multiplied by the mutation multiplier, if one is set). A bppseqgen command is
        prepared for each region (with the vd and dj insertions attached to the d sequence) and the
        commands are run. The per-region output for each leaf is concatenated, passed through
        reco_event.revert_conserved_codons(), and appended to reco_event.final_seqs. Finally
        self.add_shm_indels() is called.

        irandom is passed on to self.prepare_bppseqgen() as the seed.
        """
        mut_mult = self.args.mutation_multiplier
        if mut_mult is not None and mut_mult == 0.:  # some of the stuff below fails if mut mult is actually 0.
            reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
            reco_event.indelfos = [utils.get_empty_indel() for _ in reco_event.final_seqs]
            return

        tree_line = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
        tree_fields = tree_line.split(';')
        chosen_tree = tree_fields[0] + ';'
        # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the trees in the file,
        # there just isn't a better place to put them while passing from TreeGenerator to here than at the end of each line in the file
        branch_length_ratios = {}
        for ratio_str in tree_fields[1].split(','):
            ratio_fields = ratio_str.split(':')
            region = ratio_fields[0]
            assert region in utils.regions
            ratio = float(ratio_fields[1])
            if mut_mult is not None:  # multiply the branch lengths by some factor
                ratio *= mut_mult
            branch_length_ratios[region] = ratio

        if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
            # kind of hackey to just look at t1, but they're all the same anyway and it's just for printing purposes...
            print('  using tree with total depth %f' % treegenerator.get_leaf_node_depths(chosen_tree)['t1'])
            if len(re.findall('t', chosen_tree)) > 1:  # if more than one leaf
                Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
            else:
                print('    one leaf')
            ratio_summary = ', '.join(['%s %f' % (r, branch_length_ratios[r]) for r in utils.regions])
            print(' '.join(['    with branch length ratios ', ratio_summary]))

        scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)
        n_leaf_nodes = len(re.compile('t[0-9][0-9]*').findall(chosen_tree))
        cmdfos = []
        for region in utils.regions:
            template_seq = reco_event.eroded_seqs[region]
            if region == 'd':  # the insertions on either side get mutated along with the d
                template_seq = reco_event.insertions['vd'] + template_seq + reco_event.insertions['dj']
            cmdfos.append(self.prepare_bppseqgen(template_seq, scaled_trees[region], n_leaf_nodes, reco_event.genes[region], reco_event, seed=irandom))

        # shenanigan is to handle zero-length regional seqs (for which there's no cmdfo)
        utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)

        mseqs = {}
        for region, cmdfo in zip(utils.regions, cmdfos):
            if cmdfo is None:
                mseqs[region] = [''] * n_leaf_nodes  # an empty string for each leaf node
            else:
                mseqs[region] = self.read_bppseqgen_output(cmdfo, n_leaf_nodes)

        assert len(reco_event.final_seqs) == 0
        for ileaf in range(n_leaf_nodes):
            joined_seq = mseqs['v'][ileaf] + mseqs['d'][ileaf] + mseqs['j'][ileaf]
            # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            reco_event.final_seqs.append(reco_event.revert_conserved_codons(joined_seq))

        self.add_shm_indels(reco_event)
Пример #7
0
    def try_scratch_erode_insert(self, tmpline, debug=False):
        """Draw new random erosions and insertions for tmpline, then rebuild its sequence and implicit info.

        tmpline is modified in place: its implicit info is removed, a deletion length is set for each
        erosion in utils.real_erosions, an insertion string is set for each boundary in
        utils.boundaries, 'seqs' is set to the single sequence assembled from the eroded germline
        genes and the vd and dj insertions, 'indelfos' is set to a single empty indelfo, and
        utils.add_implicit_info() is called. Returns nothing.

        If debug is set, the chosen erosions and insertions are printed.
        """
        utils.remove_all_implicit_info(tmpline)
        for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
            region = erosion[0]  # the first character of the erosion name is the region
            gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
            if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
                assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
                tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
                continue

            # Heuristic cap on the erosion. Floor division is spelled out so that the cap (and hence the
            # deletion length, which is used as a slice index below) stays an integer even where '/'
            # means true division.
            max_erosion = max(0, gene_length // 2 - 2)
            if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                codon_pos = self.glfo[utils.conserved_codons[self.args.locus][region] + '-positions'][tmpline[region + '_gene']]
                # NOTE(review): n_bases_to_codon is only assigned if the erosion name contains '3p' or '5p'
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                max_erosion = min(max_erosion, n_bases_to_codon)
            n_eroded = numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1
            tmpline[erosion + '_del'] = min(max_erosion, n_eroded)

        for bound in utils.boundaries:
            mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
            length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
            probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
            tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

        if debug:
            print('    erosions:  %s' % '   '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions]))
            print('    insertions:  %s' % '   '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries]))

        # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
        gl_seqs = {r: self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
        for erosion in utils.real_erosions:
            region = erosion[0]
            e_length = tmpline[erosion + '_del']
            if '5p' in erosion:  # chop bases off the left side
                gl_seqs[region] = gl_seqs[region][e_length:]
            elif '3p' in erosion:  # chop bases off the right side
                gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
        tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
        tmpline['indelfos'] = [utils.get_empty_indel(), ]
        utils.add_implicit_info(self.glfo, tmpline)
        assert len(tmpline['in_frames']) == 1