def print_hmm_output(self, line, print_true=False, perfplotter=None):
        out_str_list = []
        ilabel = ''
        if print_true and not self.args.is_data:  # first print true event (if this is simulation)
            for reco_id, uids in self.get_true_clusters(
                    line['unique_ids']).items():
                for iid in range(len(uids)):
                    out_str_list.append(
                        utils.print_reco_event(self.germline_seqs,
                                               self.reco_info[uids[iid]],
                                               extra_str='    ',
                                               return_string=True,
                                               label='true:',
                                               one_line=(iid != 0)))
            ilabel = 'inferred:'

        out_str_list.append(
            utils.print_reco_event(self.germline_seqs,
                                   line,
                                   extra_str='    ',
                                   return_string=True,
                                   label=ilabel))
        for iextra in range(1, len(line['unique_ids'])):
            line['seq'] = line['seqs'][iextra]
            out_str_list.append(
                utils.print_reco_event(self.germline_seqs,
                                       line,
                                       extra_str='    ',
                                       return_string=True,
                                       one_line=True))

        # if not self.args.is_data:
        #     self.print_performance_info(line, perfplotter=perfplotter)

        print ''.join(out_str_list),
Exemplo n.º 2
0
    def harmonize_naive_seq_lengths(self, true_line, line):
        def tpos_to_j_end(tmpline):
            return len(tmpline['naive_seq']) - tmpline['codon_positions'][
                'j']  # not quite sure it's best to use the naive seq, but I think it is

        true_naive_seq = true_line['naive_seq']
        inferred_naive_seq = line['naive_seq']
        if len(line['fv_insertion']) > 0:
            inferred_naive_seq = inferred_naive_seq[len(line['fv_insertion']):]
        if len(true_naive_seq) != len(inferred_naive_seq) and len(
                line['jf_insertion']
        ) > 0:  # some j genes are very similar, except differ by one base in length, so shit is complicated
            inferred_naive_seq = inferred_naive_seq[:len(inferred_naive_seq) -
                                                    len(line['jf_insertion'])]
        if len(true_naive_seq) != len(inferred_naive_seq) and tpos_to_j_end(
                true_line) != tpos_to_j_end(line):
            extra_true_bases = tpos_to_j_end(true_line) - tpos_to_j_end(line)
            if extra_true_bases > 0:  # add Ns to the inferred line if the true line is longer
                inferred_naive_seq += extra_true_bases * 'N'
            else:  # otherwise add 'em to the true line
                true_naive_seq += (-extra_true_bases) * 'N'
        if len(true_naive_seq) != len(inferred_naive_seq):
            utils.print_reco_event(true_line, label='true')
            utils.print_reco_event(line, label='inf')
            raise Exception(
                'different length true and inferred naive seqs for %s\n  %s\n  %s (see above)'
                % (' '.join(
                    line['unique_ids']), true_naive_seq, inferred_naive_seq))

        return true_naive_seq, inferred_naive_seq
Exemplo n.º 3
0
 def print_event(self):
     line = {}  # collect some information into a form that the print fcn understands
     for region in utils.regions:
         line[region + '_gene'] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + '_insertion'] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + '_del'] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + '_del'] = self.effective_erosions[erosion]
     assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert 'jf_insertion' not in line
     line['fv_insertion'] = ''
     line['jf_insertion'] = ''
     line['input_seqs'] = self.final_seqs
     line['indel_reversed_seqs'] = []
     for iseq in range(len(self.indelfos)):
         if self.indelfos[iseq]['reversed_seq'] != '':
             line['indel_reversed_seqs'].append(self.indelfos[iseq]['reversed_seq'])
         else:
             line['indel_reversed_seqs'].append(line['input_seqs'][iseq])
     line['seqs'] = line['indel_reversed_seqs']
     line['indelfos'] = self.indelfos
     line['unique_ids'] = [str(i) for i in range(len(self.final_seqs))]
     line['cdr3_length'] = self.cdr3_length
     line['codon_positions'] = copy.deepcopy(self.final_codon_positions)
     utils.add_implicit_info(self.glfo, line)
     utils.print_reco_event(self.glfo['seqs'], line)
Exemplo n.º 4
0
    def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
        assert query_name not in self.info
        self.info['queries'].append(query_name)
        self.info[query_name] = {}
        self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
        self.info[query_name]['k_v'] = kvals['v']
        self.info[query_name]['k_d'] = kvals['d']
        self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])  # all gene matches for this query

        self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
        self.info[query_name]['cyst_position'] = codon_positions['v']
        self.info[query_name]['tryp_position'] = codon_positions['j']

        # erosion, insertion, mutation info for best match
        self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
        self.info[query_name]['v_3p_del'] = len(self.glfo['seqs']['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
        self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
        self.info[query_name]['d_3p_del'] = len(self.glfo['seqs']['d'][best['d']]) - all_germline_bounds[best['d']][1]
        self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
        self.info[query_name]['j_3p_del'] = len(self.glfo['seqs']['j'][best['j']]) - all_germline_bounds[best['j']][1]

        self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
        self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
        self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
        self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

        self.info[query_name]['indelfo'] = self.info['indels'].get(query_name, utils.get_empty_indel())

        for region in utils.regions:
            self.info[query_name][region + '_gene'] = best[region]
            self.info['all_best_matches'].add(best[region])
            self.info['all_matches'][region] |= set(match_names[region])

        self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

        existing_implicit_keys = tuple(['cdr3_length', 'cyst_position', 'tryp_position'])
        utils.add_implicit_info(self.glfo, self.info[query_name], multi_seq=False, existing_implicit_keys=existing_implicit_keys)

        if self.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str='      ', label='true:')
            utils.print_reco_event(self.glfo['seqs'], self.info[query_name], extra_str='      ', label='inferred:')

        if self.alfinder is not None:
            self.alfinder.increment(self.info[query_name])
        if self.pcounter is not None:
            self.pcounter.increment_all_params(self.info[query_name])
            if self.true_pcounter is not None:
                self.true_pcounter.increment_all_params(self.reco_info[query_name])
        if self.perfplotter is not None:
            if query_name in self.info['indels']:
                print '    skipping performance evaluation of %s because of indels' % query_name  # I just have no idea how to handle naive hamming fraction when there's indels
            else:
                self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])

        self.remaining_queries.remove(query_name)
Exemplo n.º 5
0
    def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
        assert query_name not in self.info
        self.info['queries'].append(query_name)
        self.info[query_name] = {}
        self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
        self.info[query_name]['k_v'] = kvals['v']
        self.info[query_name]['k_d'] = kvals['d']
        self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])

        # assert codon_positions['v'] != -1
        # assert codon_positions['j'] != -1
        self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
        self.info[query_name]['cyst_position'] = codon_positions['v']
        self.info[query_name]['tryp_position'] = codon_positions['j']
        if self.info[query_name]['cyst_position'] < 0 or self.info[query_name]['cyst_position'] >= len(query_seq):
            raise Exception('cpos %d invalid for %s (%s)' % (self.info[query_name]['cyst_position'], query_name, query_seq))
        if self.info[query_name]['tryp_position'] < 0 or self.info[query_name]['tryp_position'] >= len(query_seq):
            raise Exception('tpos %d invalid for %s (%s)' % (self.info[query_name]['tryp_position'], query_name, query_seq))

        # erosion, insertion, mutation info for best match
        self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
        self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
        self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
        self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
        self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
        self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

        self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
        self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
        self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
        self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

        for region in utils.regions:
            self.info[query_name][region + '_gene'] = best[region]
            self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
            self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
            self.info['all_best_matches'].add(best[region])

        self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence
        if self.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str='      ', label='true:', indelfo=self.reco_info[query_name]['indels'])
            utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str='      ', label='inferred:', indelfo=self.info['indels'].get(query_name, None))

        if self.pcounter is not None:
            self.pcounter.increment_reco_params(self.info[query_name])
            self.pcounter.increment_mutation_params(self.info[query_name])
        if self.true_pcounter is not None:
            self.true_pcounter.increment_reco_params(self.reco_info[query_name])
            self.true_pcounter.increment_mutation_params(self.reco_info[query_name])
        if self.perfplotter is not None:
            self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  #, subtract_unphysical_erosions=True)

        self.remaining_queries.remove(query_name)
Exemplo n.º 6
0
    def harmonize_naive_seq_lengths(self, true_line, line):
        def tpos_to_j_end(tmpline):
            return len(tmpline['naive_seq']) - tmpline['codon_positions'][
                'j']  # not quite sure it's best to use the naive seq, but I think it is

        true_naive_seq = true_line['naive_seq']
        inferred_naive_seq = line['naive_seq']
        if len(line['fv_insertion']) > 0:
            inferred_naive_seq = inferred_naive_seq[len(line['fv_insertion']):]
        if len(true_naive_seq) != len(inferred_naive_seq) and len(
                line['jf_insertion']
        ) > 0:  # some j genes are very similar, except differ by one base in length, so shit is complicated
            inferred_naive_seq = inferred_naive_seq[:len(inferred_naive_seq) -
                                                    len(line['jf_insertion'])]
        if len(true_naive_seq) != len(inferred_naive_seq) and tpos_to_j_end(
                true_line) != tpos_to_j_end(line):
            extra_true_bases = tpos_to_j_end(true_line) - tpos_to_j_end(line)
            if extra_true_bases > 0:  # add Ns to the inferred line if the true line is longer
                inferred_naive_seq += extra_true_bases * 'N'
            else:  # otherwise add 'em to the true line
                true_naive_seq += (-extra_true_bases) * 'N'
        if len(true_naive_seq) != len(inferred_naive_seq):
            # all this stuff gets printed four times, since we're calling this fcn for each region. sigh.
            utils.print_reco_event(true_line, label='true')
            utils.print_reco_event(line, label='inf')
            print '%s different length true and inferred naive seqs for %s (see above)\n  %s\n  %s' % (
                utils.color('yellow', 'warning'), ' '.join(
                    line['unique_ids']), true_naive_seq, inferred_naive_seq)

            # I'd rather just give up and skip it at this point, but that involves passing knowledge of the failure through too many functions so it's hard, so... align 'em, which isn't right, but oh well
            aligned_true, aligned_inferred = utils.align_seqs(
                true_naive_seq, inferred_naive_seq)
            true_list, inf_list = [], []
            for ctrue, cinf in zip(
                    aligned_true, aligned_inferred
            ):  # remove bases corresponding to gaps in true, and replace gaps in inf with Ns (the goal is to end up with aligned seqs that are the same length as the true inferred sequence, so the restrict_to_region stuff still works)
                if ctrue in utils.gap_chars:
                    continue
                elif cinf in utils.gap_chars:
                    true_list += [ctrue]
                    inf_list += [utils.ambiguous_bases[0]]
                else:
                    true_list += [ctrue]
                    inf_list += [cinf]
            assert len(true_list) == len(true_naive_seq)
            true_naive_seq = ''.join(true_list)
            inferred_naive_seq = ''.join(inf_list)
            # utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True)

        return true_naive_seq, inferred_naive_seq
Exemplo n.º 7
0
    def finalize(self):
        if self.perfplotter is not None:
            self.perfplotter.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots)
        # print '    sw time: %.3f' % (time.time()-start)
        print '      info for %d' % len(self.info['queries']),
        skipped_unproductive = len(self.unproductive_queries)
        n_remaining = len(self.remaining_queries)
        if skipped_unproductive > 0 or n_remaining > 0:
            print '     (skipped',
            print '%d / %d = %.2f unproductive' % (skipped_unproductive, len(self.input_info), float(skipped_unproductive) / len(self.input_info)),
            if n_remaining > 0:
                print '   %d / %d = %.2f other' % (n_remaining, len(self.input_info), float(n_remaining) / len(self.input_info)),
            print ')',
        print ''
        sys.stdout.flush()
        if n_remaining > 0:
            printstr = '   %s %d missing %s' % (utils.color('red', 'warning'), n_remaining, utils.plural_str('annotation', n_remaining))
            if n_remaining < 15:
                printstr += ' (' + ':'.join(self.remaining_queries) + ')'
            print printstr
        if self.debug and len(self.info['indels']) > 0:
            print '      indels: %s' % ':'.join(self.info['indels'].keys())
        assert len(self.info['queries']) + skipped_unproductive + n_remaining == len(self.input_info)
        if self.debug and not self.args.is_data and n_remaining > 0:
            print 'true annotations for remaining events:'
            for qry in self.remaining_queries:
                utils.print_reco_event(self.glfo['seqs'], self.reco_info[qry], extra_str='      ', label='true:')
        if self.alfinder is not None:
            self.alfinder.finalize(debug=self.args.debug_new_allele_finding)
            self.info['new-alleles'] = self.alfinder.new_allele_info
            if self.args.plotdir is not None:
                self.alfinder.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots)

        # add padded info to self.info (returns if stuff has already been padded)
        self.pad_seqs_to_same_length()  # NOTE this uses *all the gene matches (not just the best ones), so it has to come before we call pcounter.write(), since that fcn rewrites the germlines removing genes that weren't best matches. But NOTE also that I'm not sure what but that the padding actually *needs* all matches (rather than just all *best* matches)

        if self.pcounter is not None:
            if self.args.plotdir is not None:
                self.pcounter.plot(self.args.plotdir + '/sw', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)
                if self.true_pcounter is not None:
                    self.true_pcounter.plot(self.args.plotdir + '/sw-true', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)
            self.pcounter.write(self.parameter_dir, self.my_datadir)
            if self.true_pcounter is not None:
                self.true_pcounter.write(self.parameter_dir + '-true')

        self.info['remaining_queries'] = self.remaining_queries
Exemplo n.º 8
0
    def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions, perfplotter=None):
        assert query_name not in self.info
        self.info[query_name] = {}
        self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
        self.info[query_name]['k_v'] = kvals['v']
        self.info[query_name]['k_d'] = kvals['d']
        self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])

        assert codon_positions['v'] != -1
        assert codon_positions['j'] != -1
        self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
        self.info[query_name]['cyst_position'] = codon_positions['v']
        self.info[query_name]['tryp_position'] = codon_positions['j']

        # erosion, insertion, mutation info for best match
        self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
        self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
        self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
        self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
        self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
        self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

        self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
        self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
        self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
        self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

        for region in utils.regions:
            self.info[query_name][region + '_gene'] = best[region]
            self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
            self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
            self.info['all_best_matches'].add(best[region])

        self.info[query_name]['seq'] = query_seq  # only need to add this so I can pass it to print_reco_event
        if self.args.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str='      ', label='true:')
            utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str='      ', label='inferred:')

        if self.pcounter != None:
            self.pcounter.increment(self.info[query_name])
        if self.true_pcounter != None:
            self.true_pcounter.increment(self.reco_info[query_name])
        if perfplotter != None:
            perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  #, subtract_unphysical_erosions=True)
Exemplo n.º 9
0
    def print_hmm_output(self, line, print_true=False, perfplotter=None):
        out_str_list = []
        ilabel = ''
        if print_true and not self.args.is_data:  # first print true event (if this is simulation)
            for reco_id, uids in self.get_true_clusters(line['unique_ids']).items():
                for iid in range(len(uids)):
                    out_str_list.append(utils.print_reco_event(self.germline_seqs, self.reco_info[uids[iid]], extra_str='    ', return_string=True, label='true:', one_line=(iid!=0)))
            ilabel = 'inferred:'

        out_str_list.append(utils.print_reco_event(self.germline_seqs, line, extra_str='    ', return_string=True, label=ilabel))
        for iextra in range(1, len(line['unique_ids'])):
            line['seq'] = line['seqs'][iextra]
            out_str_list.append(utils.print_reco_event(self.germline_seqs, line, extra_str='    ', return_string=True, one_line=True))

        # if not self.args.is_data:
        #     self.print_performance_info(line, perfplotter=perfplotter)

        print ''.join(out_str_list),
Exemplo n.º 10
0
 def print_event(self):
     line = {}  # collect some information into a form that print_reco_event understands
     line['cdr3_length'] = self.cdr3_length
     for region in utils.regions:
         line[region + '_gene'] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + '_insertion'] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + '_del'] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + '_del'] = self.effective_erosions[erosion]
     line['cyst_position'] = self.final_cyst_position
     line['tryp_position'] = self.final_tryp_position
     assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert 'jf_insertion' not in line
     line['fv_insertion'] = ''
     line['jf_insertion'] = ''
     line['seqs'] = self.final_seqs
     line['unique_ids'] = [i for i in range(len(self.final_seqs))]
     utils.print_reco_event(self.germlines, line, indelfos=self.indelfo)
Exemplo n.º 11
0
 def print_event(self, total_length_from_right=0):
     line = {
     }  # collect some information into a form that print_reco_event understands
     line['cdr3_length'] = self.cdr3_length
     for region in utils.regions:
         line[region + '_gene'] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + '_insertion'] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + '_del'] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + '_del'] = self.effective_erosions[erosion]
     line['cyst_position'] = self.final_cyst_position
     line['tryp_position'] = self.final_tryp_position
     assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert 'jf_insertion' not in line
     line['fv_insertion'] = ''
     line['jf_insertion'] = ''
     line['seqs'] = self.final_seqs
     line['unique_ids'] = [i for i in range(len(self.final_seqs))]
     utils.print_reco_event(self.germlines, line, indelfos=self.indelfo)
Exemplo n.º 12
0
    def finalize(self):
        if self.perfplotter is not None:
            self.perfplotter.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots)
        # print '    sw time: %.3f' % (time.time()-start)
        print '      info for %d' % len(self.info['queries']),
        skipped_unproductive = len(self.unproductive_queries)
        n_remaining = len(self.remaining_queries)
        if skipped_unproductive > 0 or n_remaining > 0:
            print '     (skipped',
            print '%d / %d = %.2f unproductive' % (skipped_unproductive, len(self.input_info), float(skipped_unproductive) / len(self.input_info)),
            if n_remaining > 0:
                print '   %d / %d = %.2f other' % (n_remaining, len(self.input_info), float(n_remaining) / len(self.input_info)),
            print ')',
        print ''
        sys.stdout.flush()
        if n_remaining > 0:
            printstr = '   %s %d missing annotations' % (utils.color('red', 'warning'), n_remaining)
            if n_remaining < 15:
                printstr += ' (' + ':'.join(self.remaining_queries) + ')'
            print printstr
        if self.debug and len(self.info['indels']) > 0:
            print '      indels: %s' % ':'.join(self.info['indels'].keys())
        assert len(self.info['queries']) + skipped_unproductive + n_remaining == len(self.input_info)
        if self.debug and not self.args.is_data and n_remaining > 0:
            print 'true annotations for remaining events:'
            for qry in self.remaining_queries:
                utils.print_reco_event(self.glfo['seqs'], self.reco_info[qry], extra_str='      ', label='true:')
        if self.pcounter is not None:
            self.pcounter.write(self.parameter_dir)
            if self.true_pcounter is not None:
                assert self.parameter_dir[-1] != '/'
                self.true_pcounter.write(self.parameter_dir + '-true')
            if self.args.plotdir is not None:
                self.pcounter.plot(self.args.plotdir + '/sw', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)
                if self.true_pcounter is not None:
                    self.true_pcounter.plot(self.args.plotdir + '/sw-true', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)

        utils.pad_seqs_to_same_length(self.info['queries'], self.info, self.glfo, self.info['indels'])  # adds padded info to self.info (returns if stuff has already been padded)
        self.info['remaining_queries'] = self.remaining_queries
Exemplo n.º 13
0
 def print_event(self, total_length_from_right=0):
     line = {}  # collect some information into a form that print_reco_event understands
     line["cdr3_length"] = self.cdr3_length
     for region in utils.regions:
         line[region + "_gene"] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + "_insertion"] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + "_del"] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + "_del"] = self.effective_erosions[erosion]
     line["cyst_position"] = self.final_cyst_position
     line["tryp_position"] = self.final_tryp_position
     assert (
         "fv_insertion" not in line
     )  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert "jf_insertion" not in line
     line["fv_insertion"] = ""
     line["jf_insertion"] = ""
     line["seqs"] = self.final_seqs
     line["unique_ids"] = [i for i in range(len(self.final_seqs))]
     utils.print_reco_event(self.germlines, line, indelfos=self.indelfo)
Exemplo n.º 14
0
 def print_event(self):
     line = {}  # collect some information into a form that the print fcn understands
     for region in utils.regions:
         line[region + '_gene'] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + '_insertion'] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + '_del'] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + '_del'] = self.effective_erosions[erosion]
     assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert 'jf_insertion' not in line
     line['fv_insertion'] = ''
     line['jf_insertion'] = ''
     line['seqs'] = self.final_seqs
     line['unique_ids'] = [i for i in range(len(self.final_seqs))]
     line['cdr3_length'] = self.cdr3_length
     line['cyst_position'] = self.final_cyst_position
     line['tryp_position'] = self.final_tryp_position
     line['indelfos'] = self.indelfos
     utils.add_implicit_info(self.glfo, line, multi_seq=True, existing_implicit_keys=('cdr3_length', 'cyst_position', 'tryp_position'))
     utils.print_reco_event(self.glfo['seqs'], line)
Exemplo n.º 15
0
 def print_event(self, total_length_from_right=0):
     line = {}  # collect some information into a form that print_reco_event understands
     line['cdr3_length'] = self.cdr3_length
     for region in utils.regions:
         line[region + '_gene'] = self.genes[region]
     for boundary in utils.boundaries:
         line[boundary + '_insertion'] = self.insertions[boundary]
     for erosion in utils.real_erosions:
         line[erosion + '_del'] = self.erosions[erosion]
     for erosion in utils.effective_erosions:
         line[erosion + '_del'] = self.effective_erosions[erosion]
     line['cyst_position'] = self.final_cyst_position
     line['tryp_position'] = self.final_tryp_position
     assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
     assert 'jf_insertion' not in line
     line['fv_insertion'] = ''
     line['jf_insertion'] = ''
     for imute in range(len(self.final_seqs)):
         line['seq'] = self.final_seqs[imute]
         if total_length_from_right > 0:
             line['seq'] = line['seq'][len(line['seq'])-total_length_from_right : ]
         utils.print_reco_event(self.germlines, line, one_line=(imute!=0))
Exemplo n.º 16
0
def print_stuff(line):
    cluster_index = sorted_clusters.index(cluster)
    naive_cdr3, matureiseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3') # returns the CDR3 nt sequence for naive, and the first mutated sequence (iseq0); CDR3 = first base of cysteine through last base of tryptophan

    # mature_cdr3_seqs = []  # trying to translate the consensus cdr3 so I can search these with my seed seqs
    # for iseq in range(len(line['unique_ids'])):
    #     naive_cdr3_seq, mature_cdr3_seq = utils.subset_sequences(line, iseq=iseq, restrict_to_region='cdr3')
    #     mature_cdr3_seqs.append(mature_cdr3_seq)
    # mature_cdr3_seqs
    # translated_cdr3 = mature_cdr3_seqs.translate()

    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    # If a cluster contains one of our seed seqs, color this CDR3 red
    if any('-ig' in s for s in line['unique_ids']):
        cdr3_aa = utils.color('red', cdr3_aa, width=30)
    if args.cdr3 in cdr3_aa: # Only print clusters with naive CDR3 that matches our specified --cdr3 argument
        print 'index    genes                                        size    n muts    SHM     rep frac     CDR3                                FayWuH'
        print '                                                            mean  med                        len  seq'
        print '%4s     %s %s %s %5d %5d %5d %7.3f   %8.4f     %2d   %s %4.2f' % (
                cluster_index,
                utils.color_gene(line['v_gene'], width=15),
                utils.color_gene(line['d_gene'], width=15),
                utils.color_gene(line['j_gene'], width=10),
                len(line['unique_ids']),
                numpy.mean(line['n_mutations']),
                numpy.median(line['n_mutations']),
                numpy.mean(line['mut_freqs']),
                float(len(cluster)) / n_total,
                (line['cdr3_length']/3),
                cdr3_aa,
                utils.fay_wu_h(line, debug=False),
                )
        # print 'number of mutations per sequence in cluster', sorted(line['n_mutations'])
        print len(line['naive_seq']), 'length of naive seq'
        # utils.print_reco_event(utils.synthesize_single_seq_line(line, iseq=0))  # print ascii-art representation of the rearrangement event
        print 'unique_ids: ', getkey(line['unique_ids'])
        print
        print utils.print_reco_event(line)
Exemplo n.º 17
0
    def parse_file(self, infname):
        tree = ET.parse(infname)
        root = tree.getroot()

        for query in root:
            self.n_total += 1
            if self.n_max_queries > 0 and self.n_total > self.n_max_queries:
                break

            unique_id = query.attrib['id'].replace('>', '').replace(' ', '')
            if len(self.queries) > 0 and  unique_id not in self.queries:
                continue
            if self.debug:
                print self.n_total, unique_id
            line = {}
            line['unique_id'] = unique_id
            line['seq'] = self.seqinfo[unique_id]['seq']
            for region in utils.regions:
                if self.debug:
                    print ' ', region
                self.get_region_matches(region, query, line)
            if 'v_gene' not in line or 'd_gene' not in line or 'j_gene' not in line:
                print '  ERROR giving up on %s' % unique_id
                self.n_failed += 1
                continue

            add_insertions(line)
            try:
                resolve_overlapping_matches(line, self.debug)
            except:
                print 'ERROR apportionment failed on %s' % unique_id
                self.n_failed += 1
                continue

            self.perfplotter.evaluate(self.seqinfo[unique_id], line)

            if self.debug:
                utils.print_reco_event(self.germline_seqs, line)
Exemplo n.º 18
0
    def parse_file(self, infname):
        tree = ET.parse(infname)
        root = tree.getroot()

        for query in root:
            self.n_total += 1
            if self.n_max_queries > 0 and self.n_total > self.n_max_queries:
                break

            unique_id = query.attrib['id'].replace('>', '').replace(' ', '')
            if len(self.queries) > 0 and unique_id not in self.queries:
                continue
            if self.debug:
                print self.n_total, unique_id
            line = {}
            line['unique_id'] = unique_id
            line['seq'] = self.seqinfo[unique_id]['seq']
            for region in utils.regions:
                if self.debug:
                    print ' ', region
                self.get_region_matches(region, query, line)
            if 'v_gene' not in line or 'd_gene' not in line or 'j_gene' not in line:
                print '  ERROR giving up on %s' % unique_id
                self.n_failed += 1
                continue

            add_insertions(line)
            try:
                resolve_overlapping_matches(line, self.debug)
            except:
                print 'ERROR apportionment failed on %s' % unique_id
                self.n_failed += 1
                continue

            self.perfplotter.evaluate(self.seqinfo[unique_id], line)

            if self.debug:
                utils.print_reco_event(self.germline_seqs, line)
Exemplo n.º 19
0
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != "Details":
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info["unique_id"] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print "oop", begin_line, fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != "":
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find("_gene") == 1:
                    region = column[0]
                    info[region + "_5p_del"] = (
                        int(fk.line[fk.line.index("start:") + 1]) - 1
                    )  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index("gene:") + 1]) - 1
                    match_end = int(fk.line[fk.line.index("end:") + 1]) - 1
                    assert gl_length >= match_end
                    info[region + "_3p_del"] = gl_length - match_end

            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                fk.increment()
            return

        info["fv_insertion"] = ""
        info["jf_insertion"] = ""
        info["seq"] = (
            info["v_qr_seq"] + info["vd_insertion"] + info["d_qr_seq"] + info["dj_insertion"] + info["j_qr_seq"]
        )

        if "-" in info["seq"]:
            print "ERROR found a dash in %s, returning failure" % unique_id
            while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                fk.increment()
            return

        if (
            info["seq"] not in self.siminfo[unique_id]["seq"]
        ):  # arg. I can't do != because it tacks on v left and j right deletions
            print "ERROR didn't find the right sequence for %s" % unique_id
            print "  ", info["seq"]
            print "  ", self.siminfo[unique_id]["seq"]
            sys.exit()

        if self.args.debug:
            print unique_id
            for region in utils.regions:
                infer_gene = info[region + "_gene"]
                true_gene = self.siminfo[unique_id][region + "_gene"]
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color("bold", utils.color("blue", region))
                    truestr = ""  #'(originally %s)' % match_name
                else:
                    regionstr = utils.color("bold", utils.color("red", region))
                    truestr = "(true: %s)" % utils.color_gene(true_gene).replace(region, "")
                print "  %s %s %s" % (regionstr, utils.color_gene(infer_gene).replace(region, ""), truestr)

            utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label="true:", extra_str="    ")
            utils.print_reco_event(self.germline_seqs, info, label="inferred:", extra_str="    ")

        for region in utils.regions:
            if info[region + "_gene"] not in self.germline_seqs[region]:
                print "ERROR %s not in germlines" % info[region + "_gene"]
                assert False

            gl_seq = info[region + "_gl_seq"]
            if "[" in gl_seq:  # ambiguous
                for nuke in utils.nukes:
                    gl_seq = gl_seq.replace("[", nuke)
                    if gl_seq in self.germline_seqs[region][info[region + "_gene"]]:
                        print "  replaced [ with %s" % nuke
                        break
                info[region + "_gl_seq"] = gl_seq

            if info[region + "_gl_seq"] not in self.germline_seqs[region][info[region + "_gene"]]:
                print "ERROR gl match not found for %s in %s" % (info[region + "_gene"], unique_id)
                print "  ", info[region + "_gl_seq"]
                print "  ", self.germline_seqs[region][info[region + "_gene"]]
                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
            fk.increment()
Exemplo n.º 20
0
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH',
                              line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(
                    self.seqinfo[query_name],
                    qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                                  qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '  ERROR no %s match for %d' % (region, query_name)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                                  qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '    add to v left:', self.seqinfo[query_name][
                    'seq'][:qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name][
                'seq'][:qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '    add to j right:', self.seqinfo[query_name][
                    'seq'][qr_info['match_end'] -
                           len(self.seqinfo[query_name]['seq']):]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][
                qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start:end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start:end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug,
                                        self.germline_seqs)
        except AssertionError:
            print 'ERROR apportionment failed on %s' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                              qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                print '    %s %3d %3d %s %s' % (
                    region, qr_info[region + '_qr_bounds'][0],
                    qr_info[region + '_qr_bounds'][1],
                    utils.color_gene(qr_info[region + '_gene']),
                    qr_info[region + '_gl_seq'])
        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start:end]
            if self.args.debug:
                print '   ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs,
                                   self.seqinfo[query_name],
                                   label='true:',
                                   extra_str='  ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
Exemplo n.º 21
0
    def summarize_query(self, query_name, query_seq, raw_best, all_match_names,
                        all_query_bounds, all_germline_bounds, perfplotter,
                        warnings):
        if self.args.debug:
            print '%s' % str(query_name)

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v': 0, 'd': 0, 'j': 0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region],
                                             reverse=True)
            match_names[region] = []
        codon_positions = {
            'v': -1,
            'd': -1,
            'j': -1
        }  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(
                    query_seq
                )  # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][
                    glbounds[0]:glbounds[1]]

                # only use the best few matches
                if n_used[region] >= int(
                        self.args.n_max_per_region[utils.regions.index(region)]
                ):  # only take the top few from each region
                    break

                # only use a specified set of genes
                if self.args.only_genes != None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region,
                                 gene,
                                 query_seq,
                                 score,
                                 glbounds,
                                 qrbounds,
                                 -1,
                                 warnings,
                                 skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(
                        query_seq[qrbounds[0]:qrbounds[1]]
                ):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %d not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][
                        1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                    # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][1] - all_query_bounds[
                        raw_best['v']][1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][
                        gene][glbounds[0]:glbounds[1]]
                    best[region +
                         '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.args.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)

        for region in utils.regions:
            if region not in best:
                print '    no', region, 'match found for', query_name  # NOTE if no d match found, we should really should just assume entire d was eroded
                if not self.args.is_data:
                    print '    true:'
                    utils.print_reco_event(self.germline_seqs,
                                           self.reco_info[query_name],
                                           extra_str='    ')
                return

        # s-w allows d and j matches to overlap... which makes no sense, so arbitrarily give the disputed territory to j
        try:
            self.shift_overlapping_boundaries(all_query_bounds,
                                              all_germline_bounds, query_name,
                                              query_seq, best)
        except AssertionError:
            print '      ERROR %s apportionment failed' % str(query_name)
            return

        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(
                self.cyst_positions, self.tryp_positions, region, best[region],
                all_germline_bounds,
                all_query_bounds)  # position in the query sequence, that is

        # check for unproductive rearrangements
        try:
            # NOTE it's actually expected that this'll fail with a 'sequence too short' error, since the s-w doesn't know it's supposed to make sure the match contains the conserved codons
            utils.check_both_conserved_codons(query_seq,
                                              codon_positions['v'],
                                              codon_positions['j'],
                                              debug=self.args.debug,
                                              extra_str='      ')
            cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
            if cdr3_length % 3 != 0:  # make sure we've stayed in frame
                if self.args.debug:
                    print '      out of frame cdr3: %d %% 3 = %d' % (
                        cdr3_length, cdr3_length % 3)
                assert False
            utils.check_for_stop_codon(query_seq,
                                       codon_positions['v'],
                                       debug=self.args.debug)
        except AssertionError:
            if self.args.debug:
                print '       unproductive rearrangement in waterer'
            if self.args.skip_unproductive:
                if self.args.debug:
                    print '            ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][
            1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.args.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][
                -5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.args.debug:
                print '  doubly expanding k_d'
            if k_d_max - k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(
            0, k_v_min - self.args.default_v_fuzz
        )  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.args.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region],
                                      n_matches[region]),
            print ''

        kvals = {}
        kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max}
        kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max}
        self.add_to_info(query_name,
                         query_seq,
                         kvals,
                         match_names,
                         best,
                         all_germline_bounds,
                         all_query_bounds,
                         codon_positions=codon_positions,
                         perfplotter=perfplotter)
Exemplo n.º 22
0
    def add_mutants(self, reco_event, irandom):
        if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
            reco_event.final_seqs.append(
                reco_event.recombined_seq)  # set final sequnce in reco_event
            reco_event.indelfos = [
                indelutils.get_empty_indel()
                for _ in range(len(reco_event.final_seqs))
            ]
            return

        # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
        # This chosen depth corresponds to the sequence-wide mutation frequency.
        # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
        # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
        # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
        # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
        treefostr = self.treeinfo[random.randint(
            0,
            len(self.treeinfo) - 1
        )]  # per-region mutation info is tacked on after the tree... sigh. kind of hacky but works ok.
        assert treefostr.count(';') == 1
        isplit = treefostr.find(';') + 1
        chosen_tree = treefostr[:isplit]  # includes semi-colon
        mutefo = [rstr for rstr in treefostr[isplit:].split(',')]
        mean_total_height = treegenerator.get_mean_height(chosen_tree)
        regional_heights = {
        }  # per-region height, including <self.args.mutation_multiplier>
        for tmpstr in mutefo:
            region, ratio = tmpstr.split(':')
            assert region in utils.regions
            ratio = float(ratio)
            if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                ratio *= self.args.mutation_multiplier
            regional_heights[region] = mean_total_height * ratio

        scaled_trees = {
            r: treegenerator.rescale_tree(chosen_tree, regional_heights[r])
            for r in utils.regions
        }

        if self.args.debug:
            print '  chose tree with total height %f' % treegenerator.get_mean_height(
                chosen_tree)
            print '    regional trees rescaled to heights:  %s' % ('   '.join([
                '%s %.3f  (expected %.3f)' %
                (region, treegenerator.get_mean_height(
                    scaled_trees[region]), regional_heights[region])
                for region in utils.regions
            ]))
            print treegenerator.get_ascii_tree(chosen_tree, extra_str='    ')

        n_leaves = treegenerator.get_n_leaves(chosen_tree)
        cmdfos = []
        for region in utils.regions:
            simstr = reco_event.eroded_seqs[region]
            if region == 'd':
                simstr = reco_event.insertions[
                    'vd'] + simstr + reco_event.insertions['dj']
            cmdfos.append(
                self.prepare_bppseqgen(simstr,
                                       scaled_trees[region],
                                       n_leaves,
                                       reco_event.genes[region],
                                       reco_event,
                                       seed=irandom))

        utils.run_cmds(
            [cfo for cfo in cmdfos if cfo is not None],
            sleep=False)  # shenanigan is to handle zero-length regional seqs

        mseqs = {}
        for ireg in range(
                len(utils.regions)
        ):  # NOTE kind of sketchy just using index in <utils.regions> (although it just depends on the loop immediately above a.t.m.)
            if cmdfos[ireg] is None:
                mseqs[utils.regions[ireg]] = [
                    '' for _ in range(n_leaves)
                ]  # return an empty string for each leaf node
            else:
                mseqs[utils.regions[ireg]] = self.read_bppseqgen_output(
                    cmdfos[ireg], n_leaves)

        assert len(reco_event.final_seqs) == 0
        for iseq in range(n_leaves):
            seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
            seq = reco_event.revert_conserved_codons(
                seq, debug=self.args.debug
            )  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            reco_event.final_seqs.append(
                seq)  # set final sequence in reco_event
            reco_event.final_codon_positions.append(
                copy.deepcopy(reco_event.post_erosion_codon_positions)
            )  # separate codon positions for each sequence, because of shm indels

        self.add_shm_indels(reco_event)

        reco_event.setline(
            irandom
        )  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow

        self.check_tree_simulation(mean_total_height, regional_heights,
                                   scaled_trees, mseqs, reco_event)

        if self.args.debug:
            utils.print_reco_event(reco_event.line, extra_str='    ')
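# Hedged sketch of the tree-file line format described in the comments above (the example
# string is copied from those comments; the parsing mirrors the <isplit> logic in
# add_mutants rather than any partis API):
treefostr = '(t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87'
isplit = treefostr.find(';') + 1
chosen_tree = treefostr[:isplit]  # newick string, semi-colon included
mutefo = treefostr[isplit:].split(',')
ratios = {tmpstr.split(':')[0]: float(tmpstr.split(':')[1]) for tmpstr in mutefo}
print chosen_tree
print ratios  # {'v': 0.98, 'd': 1.8, 'j': 0.87}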
Example #23
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != 'Details':
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info['unique_id'] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print 'oop', begin_line, fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != '':
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find('_gene') == 1:
                    region = column[0]
                    info[region + '_5p_del'] = int(
                        fk.line[fk.line.index('start:') +
                                1]) - 1  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                    match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                    assert gl_length >= match_end
                    info[region + '_3p_del'] = gl_length - match_end

            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        info['fv_insertion'] = ''
        info['jf_insertion'] = ''
        info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info[
            'd_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

        if '-' in info['seq']:
            print 'ERROR found a dash in %s, returning failure' % unique_id
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        if info['seq'] not in self.siminfo[unique_id][
                'seq']:  # arg. I can't do != because it tacks on v left and j right deletions
            print 'ERROR didn\'t find the right sequence for %s' % unique_id
            print '  ', info['seq']
            print '  ', self.siminfo[unique_id]['seq']
            sys.exit()

        if self.args.debug:
            print unique_id
            utils.print_reco_event(self.germline_seqs,
                                   self.siminfo[unique_id],
                                   label='true:',
                                   extra_str='    ')
            utils.print_reco_event(self.germline_seqs,
                                   info,
                                   label='inferred:',
                                   extra_str='    ')

        for region in utils.regions:
            if info[region + '_gene'] not in self.germline_seqs[region]:
                print 'ERROR %s not in germlines' % info[region + '_gene']
                assert False

            gl_seq = info[region + '_gl_seq']
            if '[' in gl_seq:  # ambiguous
                for nuke in utils.nukes:
                    trial_seq = gl_seq.replace('[', nuke)  # replace into a temporary: overwriting <gl_seq> itself would leave no '[' for later iterations to try
                    if trial_seq in self.germline_seqs[region][info[region + '_gene']]:
                        print '  replaced [ with %s' % nuke
                        gl_seq = trial_seq
                        break
                info[region + '_gl_seq'] = gl_seq

            if info[region + '_gl_seq'] not in self.germline_seqs[region][info[
                    region + '_gene']]:
                print 'ERROR gl match not found for %s in %s' % (
                    info[region + '_gene'], unique_id)
                print '  ', info[region + '_gl_seq']
                print '  ', self.germline_seqs[region][info[region + '_gene']]
                self.perfplotter.add_partial_fail(self.siminfo[unique_id],
                                                  info)
                while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
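# Hedged arithmetic check (made-up numbers) of the deletion bookkeeping above: the
# parser's 'start:'/'end:'/'gene:' fields are 1-based, so each is shifted down by one
# before computing the eroded lengths.
start, end, gene_len = 4, 290, 296       # hypothetical field values
v_5p_del = start - 1                     # 3 bases eroded from the 5' end
gl_length, match_end = gene_len - 1, end - 1
v_3p_del = gl_length - match_end         # 6 bases eroded from the 3' end
print 'v_5p_del %d  v_3p_del %d' % (v_5p_del, v_3p_del)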
Example #24
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path +
          '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(
            line)  # converts strings in the csv file to floats/ints/dicts/etc.
        utils.add_implicit_info(
            glfo, line
        )  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
        utils.print_reco_event(
            line)  # print ascii-art representation of the rearrangement event
        print '\navailable annotation info for each line (see manual for descriptions):'
        for key, val in line.items():
            print '%20s %s' % (key, val)
        break

print '\n\nthen parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path +
            '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
Example #25
            outline[region + '_gene'] = utils.unsanitize_name(inferred_name)
            true_name = utils.sanitize_name(inline[region + '_gene'])

            inferred_group_str += inferred_name
            true_group_str += true_name
            if inferred_name == 'none':
                print ' none',
            elif inferred_name == true_name:
                print '  -  ',
            else:
                print '  x  ',
        for region in utils.regions:
            print '%3d' % searcher.n_tries[region],
        print ''
        print '  true'
        utils.print_reco_event(germlines, inline, -1, -1)
        if searcher.all_matched():
            print '  inferred'
            try:
                searcher.build_inferred_seq(inline['seq'], germlines, outline)
                utils.print_reco_event(germlines, outline, -1, -1)
            except:
                print '   *something* is wrong!'
                print '    ',searcher.best_matches['v']
                print '    ',searcher.best_matches['d']
                print '    ',searcher.best_matches['j']
                continue
        else:
            print 'no matches!'
            print '    ',searcher.best_matches['v']
            print '    ',searcher.best_matches['d']
Example #26
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both the kd and nwk files, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
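# Hedged mini-example (made-up kd values) of the kd -> affinity conversion used above:
# smaller dissociation constants mean tighter binding, hence affinity = 1/kd.
nodefo = {'leaf-1': {'kd': 2.5}, 'leaf-2': {'kd': 0.5}}
unique_ids = ['leaf-1', 'leaf-2']
affinities = [1. / nodefo[u]['kd'] for u in unique_ids]
print affinities  # [0.4, 2.0]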
Example #27
    def summarize_query(self, query_name, query_seq, raw_best, all_match_names, all_query_bounds, all_germline_bounds, perfplotter, warnings):
        if self.args.debug:
            print '%s' % str(query_name)

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v':0, 'd':0, 'j':0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region], reverse=True)
            match_names[region] = []
        codon_positions = {'v':-1, 'd':-1, 'j':-1}  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(query_seq)  # NOTE I'm putting these up above as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]

                # only use the best few matches
                if n_used[region] >= int(self.args.n_max_per_region[utils.regions.index(region)]):  # only take the top few from each region
                    break

                # only use a specified set of genes
                if self.args.only_genes != None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %s not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                                                          # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][1] - all_query_bounds[raw_best['v']][1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]
                    best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.args.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)
                        
        for region in utils.regions:
            if region not in best:
                print '    no',region,'match found for',query_name  # NOTE if no d match found, we really should just assume the entire d was eroded
                if not self.args.is_data:
                    print '    true:'
                    utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str='    ')
                return

        # s-w allows d and j matches to overlap... which makes no sense, so arbitrarily give the disputed territory to j
        try:
            self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best)
        except AssertionError:
            print '      ERROR %s apportionment failed' % str(query_name)
            return

        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds, all_query_bounds)  # position in the query sequence, that is

        # check for unproductive rearrangements
        try:
            # NOTE it's actually expected that this'll fail with a 'sequence too short' error, since the s-w doesn't know it's supposed to make sure the match contains the conserved codons
            utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.args.debug, extra_str='      ')
            cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
            if cdr3_length % 3 != 0:  # make sure we've stayed in frame
                if self.args.debug:
                    print '      out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3)
                assert False
            utils.check_for_stop_codon(query_seq, codon_positions['v'], debug=self.args.debug)
        except AssertionError:
            if self.args.debug:
                print '       unproductive rearrangement in waterer'
            if self.args.skip_unproductive:
                if self.args.debug:
                    print '            ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.args.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)
            
        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.args.debug:
                print '  doubly expanding k_d'
            if k_d_max-k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(0, k_v_min - self.args.default_v_fuzz)  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.args.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region], n_matches[region]),
            print ''


        kvals = {}
        kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max}
        kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max}
        self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions, perfplotter=perfplotter)
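# Hedged arithmetic check (hypothetical codon positions) of the in-frame test above: the
# cdr3 runs from the first base of the conserved cysteine codon through the last base of
# the conserved tryptophan codon, hence the '+ 3'.
codon_positions = {'v': 285, 'j': 336}
cdr3_length = codon_positions['j'] - codon_positions['v'] + 3  # 54
assert cdr3_length % 3 == 0  # in frame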
Example #28
chfo = {
    uid: {
        k: v
        for k, v in zip(
            ('imax', 'max_abs_diff'),
            utils.get_chimera_max_abs_diff(
                annotations[uid], iseq=0, chunk_len=args.chunk_len))
    }
    for uid in annotations
}
biggest_adiffs = sorted(chfo,
                        key=lambda q: chfo[q]['max_abs_diff'],
                        reverse=True)
for uid in biggest_adiffs[:5]:
    print '%-3d  %6.3f' % (chfo[uid]['imax'], chfo[uid]['max_abs_diff'])
    utils.print_reco_event(annotations[uid])

n_above_cutoff = len(
    [cfo for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff])
chimeric_fraction = n_above_cutoff / float(len(chfo))
print '  %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo),
                                                  chimeric_fraction)

hmaxval = Hist(45, 0., 0.65)
for uid in annotations:
    hmaxval.fill(chfo[uid]['max_abs_diff'])
himax = Hist(75, 0., 400)
for uid in annotations:
    himax.fill(chfo[uid]['imax'])

utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])
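# Hedged mini-example of the zip-into-dict pattern used to build <chfo> above (the tuple
# stands in for the return value of utils.get_chimera_max_abs_diff()):
vals = (112, 0.31)  # hypothetical (imax, max_abs_diff)
print {k: v for k, v in zip(('imax', 'max_abs_diff'), vals)}  # {'imax': 112, 'max_abs_diff': 0.31}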
Example #29
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir,
                                         'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line[
                        'unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(
                        re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[:self.args.indir.rfind(
            '/'
        )]  # one directory up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(
                            re.findall('[123]. Alignment for [VDJ]-GENE',
                                       full_text)) < 3:
                        failregions = re.findall(
                            'No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position + 1)
                        pgraph = full_text[position:full_text.find('\n\n', position + 1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            continue  # NOTE the original 'ir -= 1' here was a no-op (a python for loop ignores changes to its loop variable), so these paragraphs are simply skipped
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(
                    line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs,
                                       self.seqinfo[unique_id],
                                       label='true:')
                utils.print_reco_event(self.germline_seqs,
                                       line,
                                       label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total,
                                        float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total,
                                         float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found +
                                                 n_found, n_not_found /
                                                 float(n_not_found + n_found))
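# Hedged mini-example (made-up gene name) of the j_gene cleanup in the loop above:
# a trailing functionality tag ('_F' or '_P') is stripped before the name is used.
import re
j_gene = 'IGHJ4*02_F'
tags = re.findall('_[FP]', j_gene)
if len(tags) > 0:
    j_gene = j_gene.replace(tags[0], '')
print j_gene  # IGHJ4*02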
Example #30
    def add_to_info(self, query_name, query_seq, kvals, match_names, best,
                    all_germline_bounds, all_query_bounds, codon_positions):
        assert query_name not in self.info
        self.info['queries'].append(query_name)
        self.info[query_name] = {}
        self.info[query_name][
            'unique_id'] = query_name  # redundant, but used somewhere down the line
        self.info[query_name]['k_v'] = kvals['v']
        self.info[query_name]['k_d'] = kvals['d']
        self.info[query_name]['all'] = ':'.join(match_names['v'] +
                                                match_names['d'] +
                                                match_names['j'])

        # assert codon_positions['v'] != -1
        # assert codon_positions['j'] != -1
        self.info[query_name][
            'cdr3_length'] = codon_positions['j'] - codon_positions[
                'v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
        self.info[query_name]['cyst_position'] = codon_positions['v']
        self.info[query_name]['tryp_position'] = codon_positions['j']
        if self.info[query_name]['cyst_position'] < 0 or self.info[query_name][
                'cyst_position'] >= len(query_seq):
            raise Exception('cpos %d invalid for %s (%s)' %
                            (self.info[query_name]['cyst_position'],
                             query_name, query_seq))
        if self.info[query_name]['tryp_position'] < 0 or self.info[query_name][
                'tryp_position'] >= len(query_seq):
            raise Exception('tpos %d invalid for %s (%s)' %
                            (self.info[query_name]['tryp_position'],
                             query_name, query_seq))

        # erosion, insertion, mutation info for best match
        self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
        self.info[query_name]['v_3p_del'] = len(
            self.germline_seqs['v'][best['v']]) - all_germline_bounds[
                best['v']][1]  # len(germline v) - gl_match_end
        self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
        self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][
            best['d']]) - all_germline_bounds[best['d']][1]
        self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
        self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][
            best['j']]) - all_germline_bounds[best['j']][1]

        self.info[query_name][
            'fv_insertion'] = query_seq[:all_query_bounds[best['v']][0]]
        self.info[query_name]['vd_insertion'] = query_seq[
            all_query_bounds[best['v']][1]:all_query_bounds[best['d']][0]]
        self.info[query_name]['dj_insertion'] = query_seq[
            all_query_bounds[best['d']][1]:all_query_bounds[best['j']][0]]
        self.info[query_name]['jf_insertion'] = query_seq[
            all_query_bounds[best['j']][1]:]

        for region in utils.regions:
            self.info[query_name][region + '_gene'] = best[region]
            self.info[query_name][region + '_gl_seq'] = best[region +
                                                             '_gl_seq']
            self.info[query_name][region + '_qr_seq'] = best[region +
                                                             '_qr_seq']
            self.info['all_best_matches'].add(best[region])

        self.info[query_name][
            'seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence
        if self.debug:
            if not self.args.is_data:
                utils.print_reco_event(
                    self.germline_seqs,
                    self.reco_info[query_name],
                    extra_str='      ',
                    label='true:',
                    indelfo=self.reco_info[query_name]['indels'])
            utils.print_reco_event(self.germline_seqs,
                                   self.info[query_name],
                                   extra_str='      ',
                                   label='inferred:',
                                   indelfo=self.info['indels'].get(
                                       query_name, None))

        if self.pcounter is not None:
            self.pcounter.increment_reco_params(self.info[query_name])
            self.pcounter.increment_mutation_params(self.info[query_name])
        if self.true_pcounter is not None:
            self.true_pcounter.increment_reco_params(
                self.reco_info[query_name])
            self.true_pcounter.increment_mutation_params(
                self.reco_info[query_name])
        if self.perfplotter is not None:
            self.perfplotter.evaluate(
                self.reco_info[query_name],
                self.info[query_name])  #, subtract_unphysical_erosions=True)

        self.remaining_queries.remove(query_name)
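# Hedged sketch (made-up bounds, keyed by region instead of gene name for brevity) of
# how the four insertions above are sliced straight out of the query sequence:
query_seq = 'NN' + 'A' * 288 + 'CCCC' + 'G' * 16 + 'TT' + 'C' * 38 + 'AAA'
bounds = {'v': (2, 290), 'd': (294, 310), 'j': (312, 350)}
fv_insertion = query_seq[:bounds['v'][0]]                # before the v match: 'NN'
vd_insertion = query_seq[bounds['v'][1]:bounds['d'][0]]  # between v and d: 'CCCC'
dj_insertion = query_seq[bounds['d'][1]:bounds['j'][0]]  # between d and j: 'TT'
jf_insertion = query_seq[bounds['j'][1]:]                # after the j match: 'AAA'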
Example #31
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'skip_gene' in qr_info:
                self.n_skipped += 1
                return
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '    %s: no %s match' % (query_name, region)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '    add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '    add to j right:', self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start : end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs)
        except AssertionError:
            print '    %s: apportionment failed' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                true_gene = self.seqinfo[query_name][region + '_gene']
                infer_gene = qr_info[region + '_gene']
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = ''  #'(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                # print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)

                print '    %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq'])
        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]
            if self.args.debug:
                print '   ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str='  ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
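# Hedged mini-example (made-up values) of the right-end expansion above: <match_end>
# minus the full sequence length is negative, so the slice picks up the unmatched tail.
full_seq = 'ACGTACGTAC'  # len 10
match_end = 7            # the alignment covered full_seq[:7]
print full_seq[match_end - len(full_seq):]  # 'TAC', which gets appended to qr_info['seq']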
Example #32
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(
        outdir))  # output mutated sequences from bcr-phylo

    assert len(
        naive_line['unique_ids']
    ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(
        naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
Example #33
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(
        '%s/%s.fasta' %
        (outdir, args.extrastr))  # output mutated sequences from bcr-phylo

    assert len(
        naive_line['unique_ids']
    ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(
        naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(
                set(kdvals) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [
            1. / kdvals[u] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [
            utils.hamming_distance(aa_t, aa_mseq, amino_acid=True)
            for aa_t in aa_targets
        ]
        imin = aa_hdists.index(
            min(aa_hdists)
        )  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
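# Hedged mini-example (made-up distances) of the nearest-target selection above:
# list.index() returns the *first* minimum, which is how ties are (arbitrarily) broken.
aa_hdists = [7, 3, 3]
imin = aa_hdists.index(min(aa_hdists))
print imin  # 1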
Example #34
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
Example #35
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directory up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position+1)
                        pgraph = full_text[position : full_text.find('\n\n', position+1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            continue  # NOTE the original 'ir -= 1' here was a no-op (a python for loop ignores changes to its loop variable), so these paragraphs are simply skipped
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
Example #36
                    default=partis_dir +
                    '/test/reference-results/partition-ref-simu.yaml')
parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()

glfo = None
if utils.getsuffix(args.fname) == '.csv':
    print '  reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo)

if cpath is None or len(cpath.partitions) == 0:
    print 'no partitions read from %s, so just printing first annotation:' % args.fname
    utils.print_reco_event(annotation_list[0])
    sys.exit(0)

print utils.color('green', 'list of partitions:')
cpath.print_partitions(abbreviate=True)  # 'abbreviate' prints little 'o's instead of the full sequence ids

# print annotations for the biggest cluster in the most likely partition
annotations = {
    ':'.join(adict['unique_ids']): adict
    for adict in annotation_list
}  # collect the annotations in a dictionary so they're easier to access
most_likely_partition = cpath.partitions[cpath.i_best]  # a partition is represented as a list of lists of strings, with each string a sequence id
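# NOTE the original example is truncated here; a plausible continuation (following the same
# pattern as Example no. 41 below) would sort the clusters by size and print the biggest one:
sorted_clusters = sorted(most_likely_partition, key=len, reverse=True)
utils.print_reco_event(annotations[':'.join(sorted_clusters[0])], extra_str='  ')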
Example no. 37
    def get_mature_line(sfos,
                        naive_line,
                        glfo,
                        nodefo,
                        dtree,
                        target_sfos,
                        locus=None):
        assert len(
            naive_line['unique_ids']
        ) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(
            naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [
                sfo['seq']
            ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except Exception:  # leaving this in shouldn't hurt anything: it catches unequal naive/mature sequence lengths and identifies which event crashed (UPDATE: that turned out to be a crash partway through writing a .fa file)
                print 'implicit info adding failed for ievent %d in %s' % (
                    ievent, outdir)
                lines = traceback.format_exception(*sys.exc_info())
                print utils.pad_lines(
                    ''.join(lines)
                )  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info(
            [sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)
        if locus is not None:

            def ltr(u):
                return u + '-' + locus

            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo
            treeutils.translate_labels(ftree,
                                       [(u, ltr(u))
                                        for u in final_line['unique_ids']])
            final_line['unique_ids'] = [
                ltr(u) for u in final_line['unique_ids']
            ]
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id should be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [
                    p.replace(bstr, sfo['name']) for p in pids
                ]

        if args.debug:
            utils.print_reco_event(final_line)

        # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        ftree.scale_edges(1. / numpy.mean([len(s)
                                           for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree),
                                  padwidth=12)
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(
            glfo
        )  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(
            final_line, irandom=ievent
        )  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line
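    # NOTE on the affinity lists set above: affinity is taken as 1/kd (and 1/relative_kd), so a
    # lower dissociation constant means higher affinity. a toy illustration with made-up values:
    #   nodefo = {'leaf-1': {'kd': 2.0}, 'leaf-2': {'kd': 0.5}}
    #   [1. / nodefo[u]['kd'] for u in ('leaf-1', 'leaf-2')]  # --> [0.5, 2.0]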
Example no. 38
def clean_pair_info(cpaths, antn_lists, n_max_clusters=3, debug=False):
    # ----------------------------------------------------------------------------------------
    def check_droplet_id_groups(tdbg=False):
        # check against the droplet id method (we could just do it this way, but it would only work for 10x, and only until they change their naming convention)
        pgroup_strs = set(':'.join(sorted(pg)) for pg in pid_groups)
        all_uids = list(
            set([
                su for l in cpaths for c in cpaths[l].best() for u in c
                for su in [u] +
                utils.per_seq_val(all_antns[u], 'paired-uids', u)
            ]))
        n_not_found = 0
        for dropid, drop_queries in itertools.groupby(
                sorted(all_uids, key=utils.get_droplet_id),
                key=utils.get_droplet_id):
            dqlist = list(drop_queries)
            found = ':'.join(sorted(dqlist)) in pgroup_strs
            if not found:
                overlaps = [g for g in pgroup_strs if dropid in g]
                overlaps = utils.get_single_entry(overlaps)
                n_not_found += 1
            if tdbg or not found:
                print '  %25s %s  %s  %s' % (
                    utils.color('green', '-') if found else utils.color(
                        'red', 'x'), dropid, ' '.join(
                            sorted(utils.get_contig_id(q) for q in dqlist)),
                    utils.color(
                        'red', ' '.join(
                            sorted(
                                utils.get_contig_id(q)
                                for q in overlaps.split(':')))
                        if not found else ''))
        if n_not_found > 0:
            print '  %s droplet id group check failed for %d groups' % (
                utils.color('red', 'error'), n_not_found)
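    # NOTE the check above relies on 10x-style uid naming, in which everything before the first
    # '_' identifies the droplet. a toy version of the helpers it assumes (hypothetical sketches,
    # not the real utils.get_droplet_id/get_contig_id):
    #   def toy_droplet_id(uid):  # 'AAACCTG-1_contig_2' --> 'AAACCTG-1'
    #       return uid.split('_')[0]
    #   def toy_contig_id(uid):  # 'AAACCTG-1_contig_2' --> '2'
    #       return uid.split('_contig_')[1]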

    # ----------------------------------------------------------------------------------------
    def getloc(uid):
        if uid not in all_antns:
            return '?'
        return utils.per_seq_val(all_antns[uid], 'loci', uid)

    # ----------------------------------------------------------------------------------------
    def gval(uid, key):  # get per-seq val for <uid>
        if uid not in all_antns:
            return None
        return utils.per_seq_val(all_antns[uid], key, uid)

    # ----------------------------------------------------------------------------------------
    def lgstr(lgroup, sort=True):
        return ' '.join(
            utils.locstr(l) for l in (sorted if sort else utils.pass_fcn
                                      )([getloc(u) for u in lgroup]))

    # ----------------------------------------------------------------------------------------
    def choose_seqs_to_remove(
            chain_ids,
            max_hdist=4,
            tdbg=False):  # choose one of <chain_ids> to eliminate
        # look for pairs with the same locus whose sequences are (nearly) identical, and remove the worse one of each such pair
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(
                ids_to_remove
        ) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print '      removed %d with missing annotations' % len(
                ids_to_remove)

        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(
                    utils.color('blue' if hdist == 0 else 'yellow',
                                '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print '      identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(
                    tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs'))
                )  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print '        %d pair%s equivalent with hdists %s' % (
                n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))

        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(
                    all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(
                        utils.is_functional_dbg_str(
                            all_antns[uid],
                            all_antns[uid]['unique_ids'].index(uid),
                            sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print '        %d unproductive  %s' % (len(unproductive_ids),
                                                   ',  '.join(dbgstr))
        ids_to_remove |= set(unproductive_ids)  # (outside the debug block, so unproductive ids get removed whether or not <tdbg> is set)

        return ids_to_remove

    # ----------------------------------------------------------------------------------------
    antn_dicts = {
        l: utils.get_annotation_dict(antn_lists[l])
        for l in antn_lists
    }

    # first make a map from each uid (for all loci) to its annotation
    pid_groups = [
    ]  # list of pid groups, i.e. each element is the uids from a single droplet (for 10x)
    pid_ids = {}  # map from each uid to the index of its pid group
    all_antns = {}
    if debug:
        print '  %s consolidating info for %d loci with cluster/sequence counts: %s' % (
            utils.color('blue', '+'.join(cpaths)), len(cpaths), '  '.join(
                '%s: %d/%d' % (l, len(cpaths[l].best()),
                               sum(len(c) for c in cpaths[l].best()))
                for l in sorted(cpaths)))
    for ltmp in sorted(cpaths):
        for cluster in cpaths[ltmp].best():
            cline = antn_dicts[ltmp][':'.join(cluster)]
            if 'paired-uids' not in cline:
                print '  %s no paired-uids in line' % utils.color(
                    'yellow', 'warning')
                continue  # maybe should still add to all_antns?
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                pset = set([uid] + pids)
                found = False
                for ipg, pgroup in enumerate(pid_groups):
                    if any(
                            p in pgroup for p in pset
                    ):  # TODO should maybe check for consistency if some of them are already in there (i.e. from reciprocal info in another chain)?
                        found = True
                        pgroup |= pset
                        break
                if not found:
                    pid_groups.append(pset)
                    ipg = len(pid_groups) - 1
                assert ipg is not None
                for pid in pset:
                    pid_ids[pid] = ipg

            cline['loci'] = [
                ltmp for _ in cline['unique_ids']
            ]  # TODO maybe should add this somewhere else, like in partitiondriver? (eh, maybe not? the locus is always available in each file from the germline info anyway)
            for uid in cline['unique_ids']:
                all_antns[uid] = cline
    # for ipg, pg in enumerate(pid_groups):
    #     print '  %3d %s' % (ipg, ' '.join(pg))
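    # NOTE the found/break loop above amounts to incremental set merging; the same grouping could
    # be done with a tiny union-find, sketched here purely as an illustration (none of this is partis code):
    #   parent = {}
    #   def find(u):
    #       parent.setdefault(u, u)
    #       while parent[u] != u:
    #           u = parent[u]
    #       return u
    #   def union(u, v):
    #       parent[find(u)] = find(v)
    #   # after calling union(uid, pid) for every uid/pid pair, uids from the same
    #   # droplet share a root: find('z-igh') == find('z-igk')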

    check_droplet_id_groups()
    # TODO handle/keep better track of failures

    # then go through each group and try to figure out which seqs are real
    print '  cleaning %d pid groups:' % len(pid_groups)
    n_ok = {}
    for ipg, pgroup in enumerate(pid_groups):
        pgroup = [u for u in pgroup if getloc(u) != '?'
                  ]  # TODO figure out what to do with missing ones
        # print '    %s' % lgstr(pgroup),
        hids = [u for u in pgroup if utils.has_d_gene(getloc(u))]
        lids = [u for u in pgroup if u not in hids]
        if len(hids) < 2 and len(lids) < 2:
            # print '  both ok'
            if lgstr(pgroup) not in n_ok:
                n_ok[lgstr(pgroup)] = 0
            n_ok[lgstr(pgroup)] += 1
            pid_groups[ipg] = pgroup
            continue
        if debug:
            print '    %s' % lgstr(pgroup),
        for chain, idlist in zip(utils.chains, [hids, lids]):
            if len(idlist) < 2:
                continue
            if debug:
                print '\n      too many %s chains: %s' % (chain, lgstr(idlist))
            ids_to_remove = choose_seqs_to_remove(idlist)
            for rid in ids_to_remove:
                pgroup.remove(rid)
                idlist.remove(rid)
            if debug:
                print '      %s: removed %d, leaving %d' % (utils.color(
                    'green', 'fixed') if len(idlist) == 1 else utils.color(
                        'red', 'nope'), len(ids_to_remove), len(idlist))
                if len(idlist) > 1:
                    for uid in idlist:
                        prutils.print_seq_in_reco_event(
                            all_antns[uid],
                            all_antns[uid]['unique_ids'].index(uid),
                            one_line=True,
                            extra_str='        ',
                            uid_extra_str=utils.locstr(getloc(uid)))

        pid_groups[ipg] = pgroup

    print '    N ok:'
    for lstr, count in sorted(n_ok.items(),
                              key=operator.itemgetter(1),
                              reverse=True):
        print '      %3d  %s' % (count, lstr)

    for ltmp in sorted(cpaths):
        print '%s' % utils.color('green', ltmp)
        cpaths[ltmp].print_partitions()
        for iclust, cluster in enumerate(
                sorted(cpaths[ltmp].best(), key=len, reverse=True)):
            cline = antn_dicts[ltmp][':'.join(cluster)]
            # before_strs = [lgstr(pids) for pids in cline['paired-uids']]
            cline['paired-uids'] = [[
                p for p in pid_groups[pid_ids[u]] if p != u
            ] for u in cline['unique_ids']]

            # see what others in its family are paired with
            pfamilies = {}  # for each family paired with a uid in <cluster>: map from family key (the family's joined uids) to its locus and the number of pairings pointing to it
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                for pid in pids:
                    fline = all_antns[pid]
                    fkey = ':'.join(fline['unique_ids'])
                    floc = gval(pid, 'loci')
                    if fkey not in pfamilies:
                        pfamilies[fkey] = {'locus': floc, 'count': 0}
                    pfamilies[fkey]['count'] += 1
            print '           N  size  cdr3'
            for fkey, fdict in sorted(pfamilies.items(),
                                      key=lambda x: x[1]['count'],
                                      reverse=True):
                print '       %s %3d  %3d   %3d' % (
                    utils.locstr(fdict['locus']), fdict['count'],
                    len(antn_dicts[fdict['locus']][fkey]['unique_ids']),
                    antn_dicts[fdict['locus']][fkey]['cdr3_length'])

            def pfkey(p):
                return ':'.join(all_antns[p]['unique_ids'])

            pfcounts = [[pfamilies[pfkey(p)]['count'] for p in pids]
                        for pids in cline['paired-uids']]

            def lcstr(pids, pfcs):
                if len(pids) == 0:
                    return ''
                spids, spfcs = zip(*sorted(
                    zip(pids, pfcs), key=operator.itemgetter(1), reverse=True))
                return '%s  %s' % (lgstr(spids, sort=False), ' '.join(
                    str(c) for c in spfcs))

            uid_extra_strs = [
                lcstr(pids, pfs)
                for pids, pfs in zip(cline['paired-uids'], pfcounts)
            ]
            utils.print_reco_event(cline,
                                   uid_extra_strs=uid_extra_strs,
                                   extra_str='      ')

            if iclust >= n_max_clusters:
                break
Example no. 39
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != 'Details':
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info['unique_id'] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print 'ERROR expected line starting with', begin_line, 'but found', fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != '':
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find('_gene') == 1:
                    region = column[0]
                    info[region + '_5p_del'] = int(fk.line[fk.line.index('start:') + 1]) - 1  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                    match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                    assert gl_length >= match_end
                    info[region + '_3p_del'] = gl_length - match_end
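                    # e.g. (assuming 'start:'/'end:' are 1-based match positions on the germline gene and
                    # 'gene:' gives the full gene length) a match covering positions 4..297 of a 300-base
                    # V gene yields v_5p_del = 4 - 1 = 3 and v_3p_del = (300 - 1) - (297 - 1) = 3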

            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        info['fv_insertion'] = ''
        info['jf_insertion'] = ''
        info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info['d_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

        if '-' in info['seq']:
            print 'ERROR found a dash in %s, returning failure' % unique_id
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        if info['seq'] not in self.siminfo[unique_id]['seq']:  # can't just use != because the tool tacks the v left and j right deletions onto the sequence
            print 'ERROR didn\'t find the right sequence for %s' % unique_id
            print '  ', info['seq']
            print '  ', self.siminfo[unique_id]['seq']
            sys.exit()

        if self.args.debug:
            print unique_id
            utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str='    ')
            utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str='    ')

        for region in utils.regions:
            if info[region + '_gene'] not in self.germline_seqs[region]:
                print 'ERROR %s not in germlines' % info[region + '_gene']
                assert False

            gl_seq = info[region + '_gl_seq']
            if '[' in gl_seq:  # ambiguous base
                for nuke in utils.nukes:
                    tmp_seq = gl_seq.replace('[', nuke)  # try each nucleotide in a fresh copy (replacing in place, as the original did, means only the first nucleotide ever gets tried)
                    if tmp_seq in self.germline_seqs[region][info[region + '_gene']]:
                        print '  replaced [ with %s' % nuke
                        gl_seq = tmp_seq
                        break
                info[region + '_gl_seq'] = gl_seq

            if info[region + '_gl_seq'] not in self.germline_seqs[region][info[region + '_gene']]:
                print 'ERROR gl match not found for %s in %s' % (info[region + '_gene'], unique_id)
                print '  ', info[region + '_gl_seq']
                print '  ', self.germline_seqs[region][info[region + '_gene']]                
                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
Example no. 40
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print '  should match the above:'
        print '    %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print '    %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
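# a minimal follow-up sketch (not part of the original example): the most likely partition and
# its biggest cluster can be pulled out of the ClusterPath just as in Example no. 36 above
best_partition = cp.partitions[cp.i_best]
biggest_cluster = sorted(best_partition, key=len, reverse=True)[0]
print '  biggest cluster has %d sequences' % len(biggest_cluster)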
Example no. 41
    (args.basedir, args.locus))
lh_info = read_linearham_output()

# print annotations for the biggest cluster in the most likely partition
annotations = {
    ':'.join(adict['unique_ids']): adict
    for adict in annotation_list
}  # collect the annotations in a dictionary so they're easier to access
most_likely_partition = cpath.partitions[cpath.i_best]  # a partition is represented as a list of lists of strings, with each string a sequence id
sorted_clusters = sorted(most_likely_partition, key=len, reverse=True)
for cluster in sorted_clusters:
    line = annotations[':'.join(cluster)]
    print ':'.join(line['unique_ids'])
    utils.print_reco_event(line, extra_str='  ')
    print ''

    lh_clusters = [(uidstr, cfo) for uidstr, cfo in lh_info.items()
                   if set(uidstr.split(':')) & set(line['unique_ids'])]
    lh_naive_seqs = []
    if len(lh_clusters) == 0:
        print '  %s zero linearham clusters with any of these uids' % utils.color(
            'red', 'error')
    elif len(lh_clusters) != 1:
        raise Exception('expected 1 linearham cluster but found %d' %
                        len(lh_clusters))
    else:
        lh_uidstr, lh_naive_seqs = lh_clusters[0]
        if set(lh_uidstr.split(':')) != set(line['unique_ids']):
            print '  %s different uids\n       extra in linearham: %s\n   missing from linearham: %s' % (
                utils.color('red', 'error'),
                ' '.join(set(lh_uidstr.split(':')) - set(line['unique_ids'])),
                ' '.join(set(line['unique_ids']) - set(lh_uidstr.split(':'))))  # (the format arguments were truncated in the original; this is a plausible reconstruction)
Example no. 42
    def add_to_info(self,
                    query_name,
                    query_seq,
                    kvals,
                    match_names,
                    best,
                    all_germline_bounds,
                    all_query_bounds,
                    codon_positions,
                    perfplotter=None):
        assert query_name not in self.info
        self.info[query_name] = {}
        self.info[query_name][
            'unique_id'] = query_name  # redundant, but used somewhere down the line
        self.info[query_name]['k_v'] = kvals['v']
        self.info[query_name]['k_d'] = kvals['d']
        self.info[query_name]['all'] = ':'.join(match_names['v'] +
                                                match_names['d'] +
                                                match_names['j'])

        assert codon_positions['v'] != -1
        assert codon_positions['j'] != -1
        self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  # i.e. tryp position in joined seq minus cyst position, plus 3
        self.info[query_name]['cyst_position'] = codon_positions['v']
        self.info[query_name]['tryp_position'] = codon_positions['j']

        # erosion, insertion, mutation info for best match
        self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
        self.info[query_name]['v_3p_del'] = len(
            self.germline_seqs['v'][best['v']]) - all_germline_bounds[
                best['v']][1]  # len(germline v) - gl_match_end
        self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
        self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][
            best['d']]) - all_germline_bounds[best['d']][1]
        self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
        self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][
            best['j']]) - all_germline_bounds[best['j']][1]

        self.info[query_name][
            'fv_insertion'] = query_seq[:all_query_bounds[best['v']][0]]
        self.info[query_name]['vd_insertion'] = query_seq[
            all_query_bounds[best['v']][1]:all_query_bounds[best['d']][0]]
        self.info[query_name]['dj_insertion'] = query_seq[
            all_query_bounds[best['d']][1]:all_query_bounds[best['j']][0]]
        self.info[query_name]['jf_insertion'] = query_seq[
            all_query_bounds[best['j']][1]:]

        for region in utils.regions:
            self.info[query_name][region + '_gene'] = best[region]
            self.info[query_name][region + '_gl_seq'] = best[region +
                                                             '_gl_seq']
            self.info[query_name][region + '_qr_seq'] = best[region +
                                                             '_qr_seq']
            self.info['all_best_matches'].add(best[region])

        self.info[query_name][
            'seq'] = query_seq  # only need to add this so I can pass it to print_reco_event
        if self.args.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.germline_seqs,
                                       self.reco_info[query_name],
                                       extra_str='      ',
                                       label='true:')
            utils.print_reco_event(self.germline_seqs,
                                   self.info[query_name],
                                   extra_str='      ',
                                   label='inferred:')

        if self.pcounter is not None:
            self.pcounter.increment(self.info[query_name])
        if self.true_pcounter is not None:
            self.true_pcounter.increment(self.reco_info[query_name])
        if perfplotter is not None:
            perfplotter.evaluate(
                self.reco_info[query_name],
                self.info[query_name])  #, subtract_unphysical_erosions=True)