import csv
import glob
import re
import sys
from collections import OrderedDict

from bs4 import BeautifulSoup

# NOTE the project-local helpers used below (opener, utils, joinparser, FileKeeper,
# PerformancePlotter, get_genes_to_skip, line_order, clean_value, clean_alignment_crap,
# find_qr_bounds, resolve_overlapping_matches, genes_to_skip, genes_actually_skipped,
# equivalent_genes, just_always_friggin_skip) are assumed to be imported/defined elsewhere
# in the surrounding project; their import lines are not part of this section.


# ----------------------------------------------------------------------------------------
class ImgtParser(object):  # NOTE the enclosing class statement was missing here; the name is assumed from the 'imgt' label below
    # Parses IMGT output (either a single html results page or the per-sequence text files
    # in <indir>) and scores the inferred annotations against the simulated truth in <simfname>.
    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir)
        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:  # strip any _F/_P suffix from the j gene name
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directory up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print ' ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position + 1)
                        pgraph = full_text[position : full_text.find('\n\n', position + 1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            ir -= 1  # NOTE decrementing the loop variable has no effect in a python for loop, so this paragraph is simply skipped
                            continue
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print ' giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '  perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print ' ',
        for g, n in genes_actually_skipped.items():
            print ' %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
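
# ----------------------------------------------------------------------------------------
# NOTE every input above and below is opened through a project-local helper used as
# opener('r')(fname).  Its real definition isn't part of this section; the function below
# is only a minimal sketch of a compatible helper (assuming all it has to do is pick
# gzip.open for gzipped files), not the project's actual implementation.
def _opener_sketch(mode):
    # return a function that opens <fname> in <mode>, transparently handling .gz files
    def open_file(fname):
        if fname.endswith('.gz'):
            import gzip
            return gzip.open(fname, mode)
        return open(fname, mode)
    return open_file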
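
# ----------------------------------------------------------------------------------------
# NOTE parse_detail() in the IhhhmmmParser below walks a module-level table <line_order>
# (defined elsewhere in the project) describing the lines expected in each fostream
# 'Details' block.  Each entry is a (begin_line, column, index, required, default) tuple:
# if the first token of the current line starts with <begin_line>, whitespace-split token
# <index> is stored (via clean_value()) under <column>; otherwise <default> is used, or we
# give up if <required> is set.  The entry below is made up purely to illustrate that
# shape -- it is not the project's real table.
_line_order_sketch = [
    # begin_line   column     index  required  default
    ('VH gene:',   'v_gene',  2,     True,     ''),
]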

# ----------------------------------------------------------------------------------------
class IhhhmmmParser(object):
    # Parses the .fostream output in <indir> and scores the inferred annotations against
    # the simulated truth in <simfname>.
    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'ihhhmmm')
        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener('r')(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                self.siminfo[line['unique_id']] = line
                self.sim_need.append(line['unique_id'])
                iline += 1
                if args.n_queries > 0 and iline >= args.n_queries:
                    break

        fostream_names = glob.glob(self.args.indir + '/*.fostream')
        if len(fostream_names) == 0:
            raise Exception('no fostreams found in %s' % args.indir)
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

            with opener('r')(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print '%-20s no info' % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ''
        print 'partially failed: %d / %d = %.2f' % (self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo))
        print 'failed: %d / %d = %.2f' % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo))
        print ''

        self.perfplotter.plot()

    # ----------------------------------------------------------------------------------------
    def parse_file(self, infile, unique_ids):
        fk = FileKeeper(infile.readlines())
        i_id = 0
        while not fk.eof and len(self.sim_need) > 0:
            self.parse_detail(fk, unique_ids[i_id])
            i_id += 1

    # ----------------------------------------------------------------------------------------
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != 'Details':
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info['unique_id'] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print 'oop', begin_line, fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != '':
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find('_gene') == 1:
                    region = column[0]
                    info[region + '_5p_del'] = int(fk.line[fk.line.index('start:') + 1]) - 1  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                    match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                    assert gl_length >= match_end
                    info[region + '_3p_del'] = gl_length - match_end
            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        info['fv_insertion'] = ''
        info['jf_insertion'] = ''
        info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info['d_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

        if '-' in info['seq']:
            print 'ERROR found a dash in %s, returning failure' % unique_id
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        if info['seq'] not in self.siminfo[unique_id]['seq']:  # arg. I can't do != because it tacks on v left and j right deletions
            print 'ERROR didn\'t find the right sequence for %s' % unique_id
            print '  ', info['seq']
            print '  ', self.siminfo[unique_id]['seq']
            sys.exit()

        if self.args.debug:
            print unique_id
            for region in utils.regions:
                infer_gene = info[region + '_gene']
                true_gene = self.siminfo[unique_id][region + '_gene']
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = ''  # '(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)
            utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str=' ')
            utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str=' ')

        for region in utils.regions:
            if info[region + '_gene'] not in self.germline_seqs[region]:
                print 'ERROR %s not in germlines' % info[region + '_gene']
                assert False

            gl_seq = info[region + '_gl_seq']
            if '[' in gl_seq:  # ambiguous
                for nuke in utils.nukes:
                    gl_seq = gl_seq.replace('[', nuke)
                    if gl_seq in self.germline_seqs[region][info[region + '_gene']]:
                        print '  replaced [ with %s' % nuke
                        break
                info[region + '_gl_seq'] = gl_seq

            if info[region + '_gl_seq'] not in self.germline_seqs[region][info[region + '_gene']]:
                print 'ERROR gl match not found for %s in %s' % (info[region + '_gene'], unique_id)
                print '  ', info[region + '_gl_seq']
                print '  ', self.germline_seqs[region][info[region + '_gene']]
                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()

    # ----------------------------------------------------------------------------------------
    def find_partial_failures(self, fostream_name):
        unique_ids = []
        for line in open(fostream_name.replace('.fostream', '')).readlines():
            if len(self.sim_need) == 0:
                return unique_ids  # nothing left to look for
            if len(line.strip()) == 0:  # skip blank lines
                continue

            line = line.replace('"', '')
            line = line.split(';')
            unique_id = line[0]

            if 'NA' not in line:  # skip lines that were ok
                unique_ids.append(unique_id)
                continue
            if unique_id not in self.sim_need:
                continue
            if unique_id not in self.siminfo:
                continue  # not looking for this <unique_id> a.t.m.

            info = {}
            info['unique_id'] = unique_id
            for stuff in line:
                for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                    if 'IGH' + region.upper() in stuff and region + '_gene' not in info:
                        genes = re.findall('IGH' + region.upper() + '[^ ][^ ]*', stuff)
                        if len(genes) == 0:
                            print 'ERROR no %s genes in %s' % (region, stuff)
                        gene = genes[0]
                        if gene not in self.germline_seqs[region]:
                            print 'ERROR bad gene %s for %s' % (gene, unique_id)
                            sys.exit()
                        info[region + '_gene'] = gene
            self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
            if self.args.debug:
                print '%-20s partial fail %s %s %s' % (unique_id,
                                                       utils.color_gene(info['v_gene']) if 'v_gene' in info else '',
                                                       utils.color_gene(info['d_gene']) if 'd_gene' in info else '',
                                                       utils.color_gene(info['j_gene']) if 'j_gene' in info else ''),
                print ' (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions])
            self.failtails[unique_id] = info
            self.n_partially_failed += 1
            self.sim_need.remove(unique_id)

        return unique_ids
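
# ----------------------------------------------------------------------------------------
# NOTE parse_file()/parse_detail() above step through the fostream with a project-local
# FileKeeper object: fk.lines is the list of raw lines, fk.iline the current index,
# fk.line the whitespace-split current line, fk.eof an end-of-file flag, and
# fk.increment() advances one line.  The real class lives elsewhere in the project; the
# sketch below only illustrates the interface that parse_detail() assumes.
class _FileKeeperSketch(object):
    def __init__(self, lines):
        self.lines = lines
        self.iline = 0
        self.eof = len(self.lines) == 0
        self.line = self.lines[self.iline].split() if not self.eof else []
    def increment(self):
        # advance to the next line, setting <eof> (and leaving <line> alone) when we run out
        if self.iline + 1 >= len(self.lines):
            self.eof = True
            return
        self.iline += 1
        self.line = self.lines[self.iline].split()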
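
# ----------------------------------------------------------------------------------------
# NOTE the parsers in this file (most heavily the IgblastParser below) lean on module-level
# gene-skipping state that get_genes_to_skip() is presumably responsible for filling in.
# Those objects are defined elsewhere in the project; the assignments below only record the
# shapes the code assumes they have (names are suffixed with _sketch so as not to shadow the
# real ones).
_genes_to_skip_sketch = set()              # gene names to be skipped (e.g. ones missing from <germline_seqs>)
_genes_actually_skipped_sketch = {}        # gene name --> number of queries that were skipped because of it
_equivalent_genes_sketch = [set(), ]       # sets of gene names that are treated as interchangeable when scoring
_just_always_friggin_skip_sketch = set()   # gene names whose matches are always ignored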

# ----------------------------------------------------------------------------------------
class IgblastParser(object):
    # Parses igblast html output (<infname>) and scores the inferred annotations against
    # the simulated truth in <simfname>.
    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast')
        self.n_total, self.n_partially_failed, self.n_skipped = 0, 0, 0

        # get sequence info that was passed to igblast
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break
                iline += 1
                if self.args.queries != None and int(line['unique_id']) not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:  # strip any _F/_P suffix from the j gene name
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[int(line['unique_id'])] = line

        print 'reading', self.args.infname
        get_genes_to_skip(self.args.infname, self.germline_seqs, method='igblast', debug=False)

        paragraphs = None
        info = {}
        with opener('r')(self.args.infname) as infile:
            line = infile.readline()
            # first find the start of the next query's section
            while line.find('<b>Query=') != 0:
                line = infile.readline()

            # then keep going till eof
            iquery = 0
            while line != '':
                if self.args.n_queries > 0 and iquery >= self.args.n_queries:
                    break
                # first find the query name
                query_name = int(line.split()[1])

                # and collect the lines for this query
                query_lines = []
                line = infile.readline()
                while line.find('<b>Query=') != 0:
                    query_lines.append(line.strip())
                    line = infile.readline()
                    if line == '':
                        break

                iquery += 1
                # then see if we want this query
                if self.args.queries != None and query_name not in self.args.queries:
                    continue
                if query_name not in self.seqinfo:
                    print 'ERROR %d not in reco info' % query_name
                    sys.exit()
                if self.args.debug:
                    print query_name

                # and finally add the query to <info[query_name]>
                info[query_name] = {'unique_id' : query_name}
                self.n_total += 1
                self.process_query(info[query_name], query_name, query_lines)

        self.perfplotter.plot()
        print 'partially failed: %d / %d = %f' % (self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total)
        print 'skipped: %d / %d = %f' % (self.n_skipped, self.n_total, float(self.n_skipped) / self.n_total)
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))

    # ----------------------------------------------------------------------------------------
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'skip_gene' in qr_info:
                self.n_skipped += 1
                return
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '  %d: no %s match' % (query_name, region)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '  add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '  add to j right:', self.seqinfo[query_name]['seq'][qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start : end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs)
        except AssertionError:
            print '  %s: apportionment failed' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                true_gene = self.seqinfo[query_name][region + '_gene']
                infer_gene = qr_info[region + '_gene']
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = ''  # '(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                # print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)
                print '  %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq'])

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]
            if self.args.debug:
                print '  ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')

    # ----------------------------------------------------------------------------------------
    def process_single_block(self, block, query_name, qr_info):
        assert block[0].find('Query_') == 0
        vals = block[0].split()
        qr_start = int(vals[1]) - 1  # converting from one-indexed to zero-indexed
        qr_seq = vals[2]
        qr_end = int(vals[3])  # ...and from inclusive of both bounds to normal programming conventions
        if qr_seq not in self.seqinfo[query_name]['seq']:
            if '-' in qr_seq:
                print '  %s: insertion inside query seq, treating as partial failure' % query_name
                qr_info['fail'] = True
                return
            else:
                print '  ERROR query seq from igblast info not found in original query seq for %d' % query_name
                print '    %s' % qr_seq
                print '    %s' % self.seqinfo[query_name]['seq']
                sys.exit()
        if 'seq' in qr_info:
            qr_info['seq'] += qr_seq
        else:
            qr_info['seq'] = qr_seq

        # keep track of the absolute first and absolute last bases matched so we can later work out the fv and jf insertions
        if 'match_start' not in qr_info or qr_start < qr_info['match_start']:
            qr_info['match_start'] = qr_start
        if 'match_end' not in qr_info or qr_end > qr_info['match_end']:
            qr_info['match_end'] = qr_end

        # ----------------------------------------------------------------------------------------
        # skipping bullshit
        def skip_gene(gene):
            if self.args.debug:
                print '  %s in list of genes to skip' % utils.color_gene(gene)
            if gene not in genes_actually_skipped:
                genes_actually_skipped[gene] = 0
            genes_actually_skipped[gene] += 1
            qr_info['skip_gene'] = True

        if self.args.debug:
            print '  query: %3d %3d %s' % (qr_start, qr_end, qr_seq)
        for line in block[1:]:
            gene = line[line.rfind('IGH') : line.rfind('</a>')]
            region = utils.get_region(gene)
            true_gene = self.seqinfo[query_name][region + '_gene']

            for gset in equivalent_genes:
                if gene in gset and true_gene in gset and gene != true_gene:  # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. just pretend it inferred the right name
                    if self.args.debug:
                        print '  %s: replacing name %s with true name %s' % (query_name, gene, true_gene)
                    gene = true_gene

            if gene in just_always_friggin_skip:
                continue  # go on to the next match

            if not self.args.dont_skip_or15_genes and '/OR1' in true_gene:
                skip_gene(true_gene)
                return

            if self.args.skip_missing_genes:
                if gene in genes_to_skip:
                    continue  # go on to the next match
                    # skip_gene(gene)
                    # return
                if true_gene in genes_to_skip:
                    skip_gene(true_gene)
                    return

            if gene not in self.germline_seqs[region]:
                print '  %s: %s not in germlines (skipping)' % (query_name, gene)
                skip_gene(gene)
                return

            vals = line.split()
            gl_start = int(vals[-3]) - 1  # converting from one-indexed to zero-indexed
            gl_seq = vals[-2]
            gl_end = int(vals[-1])  # ...and from inclusive of both bounds to normal programming conventions

            if region + '_gene' in qr_info:
                if qr_info[region + '_gene'] == gene:
                    if self.args.debug:
                        print '  %s match: %s' % (region, clean_alignment_crap(qr_seq, gl_seq))
                    qr_info[region + '_gl_seq'] = qr_info[region + '_gl_seq'] + clean_alignment_crap(qr_seq, gl_seq)
                    if gl_end > len(self.germline_seqs[region][gene]):  # not really sure what's wrong... but it seems to be rare
                        qr_info['fail'] = True
                        return
                    qr_info[region + '_3p_del'] = len(self.germline_seqs[region][gene]) - gl_end
                    qr_info[region + '_qr_bounds'] = (qr_info[region + '_qr_bounds'][0], find_qr_bounds(qr_start, qr_end, gl_seq)[1])
                else:
                    continue
            else:
                qr_info[region + '_gene'] = gene
                qr_info[region + '_gl_seq'] = clean_alignment_crap(qr_seq, gl_seq)
                # deletions
                qr_info[region + '_5p_del'] = gl_start
                assert gl_end <= len(self.germline_seqs[region][gene])
                qr_info[region + '_3p_del'] = len(self.germline_seqs[region][gene]) - gl_end
                # bounds
                qr_info[region + '_qr_bounds'] = find_qr_bounds(qr_start, qr_end, gl_seq)
                if self.args.debug:
                    print '  %s match: %s' % (region, clean_alignment_crap(qr_seq, gl_seq))
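
# ----------------------------------------------------------------------------------------
# NOTE each parser above only needs an <args> namespace with the attributes it reads
# (datadir, plotdir, simfname, indir, infname, queries, n_queries, debug, ...).  The real
# driver script isn't part of this section; the block below is only a sketch of how such a
# namespace might be built and one parser run -- the option names are assumptions based on
# the attribute accesses above, not the project's actual command line.
if __name__ == '__main__':
    import argparse
    example_argparser = argparse.ArgumentParser()
    example_argparser.add_argument('--datadir', required=True)  # directory with germline gene sequences
    example_argparser.add_argument('--plotdir', required=True)  # where PerformancePlotter writes its output
    example_argparser.add_argument('--simfname', required=True)  # csv with the true (simulated) rearrangement events
    example_argparser.add_argument('--indir')  # directory with per-sequence results (imgt txt files or .fostream files)
    example_argparser.add_argument('--infname')  # single results file (imgt or igblast html)
    example_argparser.add_argument('--queries', nargs='+')  # restrict to these unique ids (IgblastParser expects ints)
    example_argparser.add_argument('--n-queries', type=int, default=-1, dest='n_queries')
    example_argparser.add_argument('--debug', action='store_true')
    example_argparser.add_argument('--skip-missing-genes', action='store_true', dest='skip_missing_genes')
    example_argparser.add_argument('--dont-skip-or15-genes', action='store_true', dest='dont_skip_or15_genes')
    example_args = example_argparser.parse_args()
    IgblastParser(example_args)  # or ImgtParser(example_args) / IhhhmmmParser(example_args)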