def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None):
    """Store the run configuration and load the cysteine/tryptophan position tables.

    Args:
        args: parsed options object. This method reads `is_data`,
            `apply_choice_probs_in_sw`, `debug`, `datadir` and `outfname`.
        input_info: stored unchanged on the instance.
        reco_info: stored unchanged on the instance.
        germline_seqs: stored on the instance and handed to every
            ParameterCounter created here.
        parameter_dir: stored on the instance; also the location gene choice
            probabilities are read from when `args.apply_choice_probs_in_sw`
            is set.
        write_parameters: when true, create a ParameterCounter (and, if
            `args.is_data` is false, a second one for the true values).
        plotdir: stored unchanged on the instance.
    """
    self.parameter_dir = parameter_dir
    self.plotdir = plotdir
    self.args = args
    self.input_info = input_info
    self.reco_info = reco_info
    self.germline_seqs = germline_seqs

    # Counters stay None unless we were asked to write parameters.
    self.pcounter, self.true_pcounter = None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.germline_seqs)
        if not self.args.is_data:
            self.true_pcounter = ParameterCounter(self.germline_seqs)

    self.info = {}
    self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
    self.info['skipped_unproductive_queries'] = []  # list of unproductive queries

    # NOTE: self.gene_choice_probs only exists when this option is switched on.
    if self.args.apply_choice_probs_in_sw:
        if self.args.debug:
            # Single-argument print(...) emits the same text under Python 2 and 3.
            print('%s %s' % (' reading gene choice probs from', parameter_dir))
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

    # get location of <begin> cysteine in each v region
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:
        self.cyst_positions = json.load(json_file)

    # get location of <end> tryptophan in each j region (TGG)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:
        tryp_reader = csv.reader(csv_file)
        # WARNING: this doesn't filter out the header line.
        # Values are kept exactly as csv.reader yields them (strings).
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}

    # The output file (if any) is opened in append mode and left open for later writes.
    self.outfile = None
    if self.args.outfname is not None:  # identity test, matching the sibling __init__
        self.outfile = open(self.args.outfname, 'a')

    self.n_unproductive = 0
    self.n_total = 0
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False):
    """Initialise run state, optional counters/plotters, and the position tables.

    Args:
        args: parsed options object. This method reads `debug`, `sw_debug`,
            `is_data`, `plot_performance`, `apply_choice_probs_in_sw`,
            `datadir` and `outfname`.
        input_info: mapping whose keys become the initial `remaining_queries`.
        reco_info: stored unchanged on the instance.
        germline_seqs: stored on the instance and passed to the
            ParameterCounter / PerformancePlotter objects created here.
        parameter_dir: stored on the instance; gene choice probabilities are
            read from it when `args.apply_choice_probs_in_sw` is set.
        write_parameters: when true, create a ParameterCounter (and a second
            one for true values if `args.is_data` is false).
    """
    self.parameter_dir = parameter_dir
    self.args = args
    # --sw-debug, when given, takes precedence over the general debug level.
    if self.args.sw_debug is None:
        self.debug = self.args.debug
    else:
        self.debug = self.args.sw_debug

    self.input_info = input_info
    # we remove queries from this list when we're satisfied with the current output
    # (in general we may have to rerun some queries with different match/mismatch scores)
    self.remaining_queries = list(self.input_info.keys())
    self.new_indels = 0  # number of new indels that were kicked up this time through
    self.reco_info = reco_info
    self.germline_seqs = germline_seqs

    self.pcounter = self.true_pcounter = self.perfplotter = None
    if write_parameters:
        self.pcounter = ParameterCounter(self.germline_seqs)
        if not self.args.is_data:
            self.true_pcounter = ParameterCounter(self.germline_seqs)
    if self.args.plot_performance:
        self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')

    self.info = {
        'queries': [],
        'all_best_matches': set(),  # set of all the matches we found (for *all* queries)
        'skipped_unproductive_queries': [],  # list of unproductive queries
        'skipped_unknown_queries': [],
        'indels': {},
    }

    # self.gene_choice_probs is only defined when the option is enabled.
    if self.args.apply_choice_probs_in_sw:
        if self.debug:
            # One-argument print(...) produces identical output on Python 2 and 3.
            print('%s %s' % (' reading gene choice probs from', parameter_dir))
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

    data_dir = self.args.datadir
    # location of the <begin> cysteine in each v region
    with opener('r')(data_dir + '/v-meta.json') as meta_file:
        self.cyst_positions = json.load(meta_file)
    # location of the <end> tryptophan in each j region (TGG)
    with opener('r')(data_dir + '/j_tryp.csv') as tryp_file:
        # WARNING: the header line is not filtered out, so it ends up in the dict too
        self.tryp_positions = dict((row[0], row[1]) for row in csv.reader(tryp_file))

    # Appended to later on; stays None when no output file name was given.
    self.outfile = None
    if self.args.outfname is not None:
        self.outfile = open(self.args.outfname, 'a')

    self.n_unproductive = 0
    self.n_total = 0

    print('smith-waterman')
utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True) args = parser.parse_args() args.extra_columns = utils.get_arg_list(args.extra_columns) assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta'] default_glfo_dir = partis_dir + '/data/germlines/human' if utils.getsuffix(args.infile) == '.csv' and args.glfo_dir is None: print ' note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir args.glfo_dir = default_glfo_dir glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus) if args.plotdir is not None: from parametercounter import ParameterCounter setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions}) # hackity hackity hackity pcounter = ParameterCounter(glfo, args) for line in annotation_list: pcounter.increment(line) pcounter.plot(args.plotdir) #, make_per_base_plots=True) #, only_overall=True, make_per_base_plots=True sys.exit(0) if cpath is None or cpath.i_best is None: clusters_to_use = [l['unique_ids'] for l in annotation_list] print ' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)) else: ipartition = cpath.i_best if args.partition_index is None else args.partition_index print ' found %d clusters in %s' % (len(cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))) if args.cluster_index is None: clusters_to_use = cpath.partitions[ipartition] print ' taking all %d clusters' % len(clusters_to_use) else:
def read_hmm_output(self, algorithm, hmm_csv_outfname, make_clusters=True, count_parameters=False, parameter_out_dir=None, plotdir=None):
    """Read the hmm csv output file and collect per-query (or per-pair) results.

    Args:
        algorithm: 'viterbi' selects the annotation branch; any other value is
            handled as forward output (pair scores).
        hmm_csv_outfname: path of the csv file to read.
        make_clusters: when false, each forward score line is also printed.
        count_parameters: when true, parameter counters are filled and then
            written to `parameter_out_dir` (and plotted to `plotdir` unless
            `args.no_plot` is set). Requires both directories to be given.
        parameter_out_dir: output directory for the parameter counters.
        plotdir: base directory for plots.

    Returns:
        A list. For 'viterbi': one dict per kept line (a copy of the line with
        'unique_id' set). Otherwise: one dict with keys 'id_a', 'id_b' and
        'score' for every line that has exactly two ids.

    Raises:
        AssertionError: if `count_parameters` is set without both directories,
            or if the first line for a set of ids carries a non-empty error
            field that does not include 'boundary'.
    """
    print(' read output')
    if count_parameters:
        assert parameter_out_dir is not None
        assert plotdir is not None

    pcounter = ParameterCounter(self.germline_seqs) if count_parameters else None
    true_pcounter = ParameterCounter(self.germline_seqs) if (count_parameters and not self.args.is_data) else None
    # NOTE: plotdir is concatenated here whenever args.plot_performance is set,
    # independently of count_parameters, so it must not be None in that case.
    perfplotter = PerformancePlotter(self.germline_seqs, plotdir + '/hmm/performance', 'hmm') if self.args.plot_performance else None

    n_processed = 0
    hmminfo = []
    boundary_error_queries = []
    last_key = None
    with opener('r')(hmm_csv_outfname) as hmm_csv_outfile:
        reader = csv.DictReader(hmm_csv_outfile)
        for line in reader:
            utils.intify(line, splitargs=('unique_ids', 'seqs'))
            ids = line['unique_ids']
            this_key = utils.get_key(ids)
            same_event = from_same_event(self.args.is_data, True, self.reco_info, ids)
            id_str = ''.join(['%20s ' % i for i in ids])
            is_first_line = last_key != this_key

            # check for errors
            if is_first_line:  # first line for this set of ids (i.e. the best viterbi path or only forward score)
                errors = line['errors']
                if errors is not None and 'boundary' in errors.split(':'):
                    boundary_error_queries.append(':'.join([str(uid) for uid in ids]))
                else:
                    # A missing (None) error field means "no errors", just like an
                    # empty string; calling len() on it used to raise TypeError.
                    assert not errors

            if algorithm == 'viterbi':
                line['seq'] = line['seqs'][0]  # add info for the best match as 'seq'
                line['unique_id'] = ids[0]
                utils.add_match_info(self.germline_seqs, line, self.cyst_positions, self.tryp_positions, debug=(self.args.debug > 0))

                # if this is the first line (i.e. the best viterbi path) for this
                # query (or query pair), print the true event
                if is_first_line or self.args.plot_all_best_events:
                    n_processed += 1
                    if self.args.debug:
                        print('%s %d' % (id_str, same_event))
                    # if it's productive, or if we're not skipping unproductive rearrangements
                    if line['cdr3_length'] != -1 or not self.args.skip_unproductive:
                        # Shallow copy taken before 'seq'/'unique_id' are reset below.
                        # Keys already in `line` win, as with the original
                        # dict([...] + line.items()) construction.
                        entry = {'unique_id': line['unique_ids'][0]}
                        entry.update(line)
                        hmminfo.append(entry)
                        if pcounter is not None:  # increment counters (but only for the best [first] match)
                            pcounter.increment(line)
                        if true_pcounter is not None:  # increment true counters
                            true_pcounter.increment(self.reco_info[ids[0]])
                        if perfplotter is not None:
                            perfplotter.evaluate(self.reco_info[ids[0]], line)

                if self.args.debug:
                    self.print_hmm_output(line, print_true=is_first_line, perfplotter=perfplotter)
                line['seq'] = None
                line['unique_id'] = None
            else:  # for forward, collect the pair scores to be read by the clusterer
                if not make_clusters:  # self.args.debug or
                    print('%3d %10.3f %s' % (same_event, float(line['score']), id_str))
                if line['score'] == '-nan':
                    print(' WARNING encountered -nan, setting to -999999.0')
                    score = -999999.0
                else:
                    score = float(line['score'])
                if len(ids) == 2:
                    hmminfo.append({
                        'id_a': line['unique_ids'][0],
                        'id_b': line['unique_ids'][1],
                        'score': score,
                    })
                n_processed += 1

            last_key = utils.get_key(ids)

    if pcounter is not None:
        pcounter.write(parameter_out_dir)
        if not self.args.no_plot:
            pcounter.plot(plotdir, subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
    if true_pcounter is not None:
        true_pcounter.write(parameter_out_dir + '/true')
        if not self.args.no_plot:
            true_pcounter.plot(plotdir + '/true', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
    if perfplotter is not None:
        perfplotter.plot()

    print(' processed %d queries' % n_processed)
    if len(boundary_error_queries) > 0:
        print(' %d boundary errors (%s)' % (len(boundary_error_queries), ', '.join(boundary_error_queries)))

    return hmminfo