import csv
from collections import OrderedDict

from Bio import SeqIO

import utils  # repo-local helpers (intify, add_match_info, get_key, ...)
from opener import opener  # repo-local file opener
# NOTE ParameterCounter, PerformancePlotter, and from_same_event (used by
# read_hmm_output() below) are assumed importable from their own repo-local
# modules.


def get_seqfile_info(fname, is_data, germline_seqs=None, cyst_positions=None, tryp_positions=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ Return input sequence info (and, for simulation, true rearrangement info) read from csv, tsv, fasta, or fastq files. """
    if not is_data:  # simulation input needs germline info in order to annotate the true rearrangements
        assert germline_seqs is not None
        assert cyst_positions is not None
        assert tryp_positions is not None

    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []  # plain list of dicts, so the loop below can treat every file type the same way
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        print 'ERROR unrecognized file format %s' % fname
        assert False

    input_info, reco_info = OrderedDict(), OrderedDict()
    n_queries = 0
    for line in reader:
        utils.intify(line)
        # if the command line specified query or reco ids, skip the other ones
        if queries is not None and line[name_column] not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[line[name_column]] = {'unique_id': line[name_column], 'seq': line[seq_column]}
        if not is_data:
            reco_info[line['unique_id']] = line
            utils.add_match_info(germline_seqs, line, cyst_positions, tryp_positions)
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        print 'ERROR didn\'t end up pulling any input info out of %s' % fname
        assert False

    return (input_info, reco_info)
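# Hedged usage sketch for get_seqfile_info() above -- 'input-seqs.fa' is a
# made-up placeholder path, and the repo-local utils/opener modules must be
# importable for the call to work.
if __name__ == '__main__':
    # data mode: no simulation truth needed, so reco_info comes back empty
    input_info, reco_info = get_seqfile_info('input-seqs.fa', is_data=True, n_max_queries=100)
    for unique_id, info in input_info.items():
        print '%20s   %s' % (unique_id, info['seq'])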
def read_hmm_output(self, algorithm, hmm_csv_outfname, make_clusters=True, count_parameters=False, parameter_out_dir=None, plotdir=None):
    print '    read output'
    if count_parameters:
        assert parameter_out_dir is not None
        assert plotdir is not None
    pcounter = ParameterCounter(self.germline_seqs) if count_parameters else None
    true_pcounter = ParameterCounter(self.germline_seqs) if (count_parameters and not self.args.is_data) else None
    perfplotter = PerformancePlotter(self.germline_seqs, plotdir + '/hmm/performance', 'hmm') if self.args.plot_performance else None

    n_processed = 0
    hmminfo = []
    with opener('r')(hmm_csv_outfname) as hmm_csv_outfile:
        reader = csv.DictReader(hmm_csv_outfile)
        last_key = None
        boundary_error_queries = []
        for line in reader:
            utils.intify(line, splitargs=('unique_ids', 'seqs'))
            ids = line['unique_ids']
            this_key = utils.get_key(ids)
            same_event = from_same_event(self.args.is_data, True, self.reco_info, ids)
            id_str = ''.join(['%20s ' % i for i in ids])

            # check for errors
            if last_key != this_key:  # if this is the first line for this set of ids (i.e. the best viterbi path or only forward score)
                if line['errors'] is not None and 'boundary' in line['errors'].split(':'):
                    boundary_error_queries.append(':'.join([str(uid) for uid in ids]))
                else:
                    assert len(line['errors']) == 0

            if algorithm == 'viterbi':
                line['seq'] = line['seqs'][0]  # add info for the best match as 'seq'
                line['unique_id'] = ids[0]
                utils.add_match_info(self.germline_seqs, line, self.cyst_positions, self.tryp_positions, debug=(self.args.debug > 0))

                if last_key != this_key or self.args.plot_all_best_events:  # if this is the first line (i.e. the best viterbi path) for this query (or query pair), print the true event
                    n_processed += 1
                    if self.args.debug:
                        print '%s   %d' % (id_str, same_event)
                    if line['cdr3_length'] != -1 or not self.args.skip_unproductive:  # if it's productive, or if we're not skipping unproductive rearrangements
                        hmminfo.append(dict([('unique_id', line['unique_ids'][0]), ] + line.items()))
                        if pcounter is not None:  # increment counters (but only for the best [first] match)
                            pcounter.increment(line)
                        if true_pcounter is not None:  # increment true counters
                            true_pcounter.increment(self.reco_info[ids[0]])
                        if perfplotter is not None:
                            perfplotter.evaluate(self.reco_info[ids[0]], line)

                if self.args.debug:
                    self.print_hmm_output(line, print_true=(last_key != this_key), perfplotter=perfplotter)
                line['seq'] = None
                line['unique_id'] = None
            else:  # for forward, write the pair scores to file to be read by the clusterer
                if not make_clusters:  # self.args.debug or
                    print '%3d %10.3f   %s' % (same_event, float(line['score']), id_str)
                if line['score'] == '-nan':
                    print '    WARNING encountered -nan, setting to -999999.0'
                    score = -999999.0
                else:
                    score = float(line['score'])
                if len(ids) == 2:
                    hmminfo.append({'id_a': line['unique_ids'][0], 'id_b': line['unique_ids'][1], 'score': score})
                n_processed += 1

            last_key = utils.get_key(ids)

    if pcounter is not None:
        pcounter.write(parameter_out_dir)
        if not self.args.no_plot:
            pcounter.plot(plotdir, subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
    if true_pcounter is not None:
        true_pcounter.write(parameter_out_dir + '/true')
        if not self.args.no_plot:
            true_pcounter.plot(plotdir + '/true', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
    if perfplotter is not None:
        perfplotter.plot()

    print '  processed %d queries' % n_processed
    if len(boundary_error_queries) > 0:
        print '    %d boundary errors (%s)' % (len(boundary_error_queries), ', '.join(boundary_error_queries))

    return hmminfo
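# Self-contained sketch of the grouping trick read_hmm_output() relies on:
# hmm csv lines for the same query set arrive consecutively (best viterbi
# path first), so comparing each line's key to the previous line's flags the
# first/best line of every group. The toy rows below are made up, and the
# tuple key stands in for utils.get_key().
def first_lines_of_each_group(rows):
    first_lines = []
    last_key = None
    for row in rows:
        this_key = tuple(row['unique_ids'])  # stands in for utils.get_key(ids)
        if this_key != last_key:  # first (i.e. best) line for this set of ids
            first_lines.append(row)
        last_key = this_key
    return first_lines

toy_rows = [{'unique_ids': ['a'], 'score': -12.3},
            {'unique_ids': ['a'], 'score': -14.9},  # second-best path for 'a'
            {'unique_ids': ['b'], 'score': -8.7}]
print first_lines_of_each_group(toy_rows)  # keeps only the best lines for 'a' and 'b'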
# later variant of get_seqfile_info(): germline info arrives bundled in a
# single 'glfo' argument, and hard failures raise exceptions instead of
# print-and-assert
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ Return input sequence info (and, for simulation, true rearrangement info) read from csv, tsv, fasta, or fastq files. """
    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        raise Exception('unrecognized file format %s' % fname)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    for line in reader:
        if '.csv' in fname and name_column not in line:  # hackey hackey hackey
            name_column = 'name'
            seq_column = 'nucleotide'
        utils.process_input_line(line,
                                 int_columns=('v_5p_del', 'd_5p_del', 'cdr3_length', 'j_5p_del', 'j_3p_del', 'd_3p_del', 'v_3p_del'),
                                 literal_columns=('indels', ))  # trailing comma so this is a one-element tuple -- ('indels') is just the bare string
        unique_id = line[name_column]
        # if the command line specified query or reco ids, skip the other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[unique_id] = {'unique_id': unique_id, 'seq': line[seq_column]}
        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
            reco_info[unique_id] = dict(line)
            if 'indels' in line and line['indels']['reversed_seq'] != '':  # TODO unhackify this
                reco_info[unique_id]['seq'] = line['indels']['reversed_seq']
            if 'indels' not in line:  # TODO unhackify this
                reco_info[unique_id]['indels'] = None
            if glfo is not None:
                utils.add_match_info(glfo, reco_info[unique_id])
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))

    return (input_info, reco_info)
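# Side note on the literal_columns fix above (runnable on its own): a
# one-element tuple needs the trailing comma -- ('indels') is just the string
# 'indels', so iterating it yields characters instead of column names.
for col in ('indels'):  # iterates 'i', 'n', 'd', 'e', 'l', 's'
    assert len(col) == 1
for col in ('indels', ):  # iterates the single column name 'indels'
    assert col == 'indels'

# Hedged call sketch for the glfo-based variant ('simu.csv' and glfo are
# hypothetical placeholders; glfo would be loaded elsewhere in the repo):
# input_info, reco_info = get_seqfile_info('simu.csv', is_data=False, glfo=glfo)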