예제 #1
0
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directoy up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position+1)
                        pgraph = full_text[position : full_text.find('\n\n', position+1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            ir -= 1
                            continue
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
예제 #2
0
class IhhhmmmParser(object):
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'ihhhmmm')

        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener('r')(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                self.siminfo[line['unique_id']] = line
                self.sim_need.append(line['unique_id'])
                iline += 1
                if args.n_max_queries > 0 and iline >= args.n_max_queries:
                    break

        fostream_names = glob.glob(self.args.indir + '/*.fostream')
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

            with opener('r')(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print '%-20s  no info' % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ''
        print 'partially failed: %d / %d = %.2f' % (self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo))
        print 'failed:           %d / %d = %.2f' % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo))
        print ''

        self.perfplotter.plot()

    # ----------------------------------------------------------------------------------------
    def parse_file(self, infile, unique_ids):
        fk = FileKeeper(infile.readlines())
        i_id = 0
        while not fk.eof and len(self.sim_need) > 0:
            self.parse_detail(fk, unique_ids[i_id])
            i_id += 1
        
    # ----------------------------------------------------------------------------------------
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != 'Details':
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info['unique_id'] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print 'oop', begin_line, fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != '':
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find('_gene') == 1:
                    region = column[0]
                    info[region + '_5p_del'] = int(fk.line[fk.line.index('start:') + 1]) - 1  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                    match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                    assert gl_length >= match_end
                    info[region + '_3p_del'] = gl_length - match_end

            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        info['fv_insertion'] = ''
        info['jf_insertion'] = ''
        info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info['d_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

        if '-' in info['seq']:
            print 'ERROR found a dash in %s, returning failure' % unique_id
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        if info['seq'] not in self.siminfo[unique_id]['seq']:  # arg. I can't do != because it tacks on v left and j right deletions
            print 'ERROR didn\'t find the right sequence for %s' % unique_id
            print '  ', info['seq']
            print '  ', self.siminfo[unique_id]['seq']
            sys.exit()

        if self.args.debug:
            print unique_id
            utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str='    ')
            utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str='    ')

        for region in utils.regions:
            if info[region + '_gene'] not in self.germline_seqs[region]:
                print 'ERROR %s not in germlines' % info[region + '_gene']
                assert False

            gl_seq = info[region + '_gl_seq']
            if '[' in gl_seq:  # ambiguous
                for nuke in utils.nukes:
                    gl_seq = gl_seq.replace('[', nuke)
                    if gl_seq in self.germline_seqs[region][info[region + '_gene']]:
                        print '  replaced [ with %s' % nuke
                        break
                info[region + '_gl_seq'] = gl_seq

            if info[region + '_gl_seq'] not in self.germline_seqs[region][info[region + '_gene']]:
                print 'ERROR gl match not found for %s in %s' % (info[region + '_gene'], unique_id)
                print '  ', info[region + '_gl_seq']
                print '  ', self.germline_seqs[region][info[region + '_gene']]                
                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
        
    # ----------------------------------------------------------------------------------------
    def find_partial_failures(self, fostream_name):
        unique_ids = []
        for line in open(fostream_name.replace('.fostream', '')).readlines():
            if len(self.sim_need) == 0:
                return
            if len(line.strip()) == 0:  # skip blank lines
                continue

            line = line.replace('"', '')
            line = line.split(';')

            unique_id = line[0]
            
            if 'NA' not in line:  # skip lines that were ok
                unique_ids.append(unique_id)
                continue
            if unique_id not in self.sim_need:
                continue
            if unique_id not in self.siminfo:
                continue  # not looking for this <unique_id> a.t.m.

            info = {}
            info['unique_id'] = unique_id
            for stuff in line:
                for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                    if 'IGH'+region.upper() in stuff and region+'_gene' not in info:
                        genes = re.findall('IGH' + region.upper() + '[^ ][^ ]*', stuff)
                        if len(genes) == 0:
                            print 'ERROR no %s genes in %s' % (region, stuff)
                        gene = genes[0]
                        if gene not in self.germline_seqs[region]:
                            print 'ERROR bad gene %s for %s' % (gene, unique_id)
                            sys.exit()
                        info[region + '_gene'] = gene
            self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
            if self.args.debug:
                print '%-20s  partial fail %s %s %s' % (unique_id,
                                                     utils.color_gene(info['v_gene']) if 'v_gene' in info else '',
                                                     utils.color_gene(info['d_gene']) if 'd_gene' in info else '',
                                                     utils.color_gene(info['j_gene']) if 'j_gene' in info else ''),
                print '  (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions])
            self.failtails[unique_id] = info
            self.n_partially_failed += 1
            self.sim_need.remove(unique_id)

        return unique_ids
예제 #3
0
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir,
                                         'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line[
                        'unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(
                        re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[:self.args.indir.rfind(
            '/'
        )]  # one directoy up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(
                            re.findall('[123]. Alignment for [VDJ]-GENE',
                                       full_text)) < 3:
                        failregions = re.findall(
                            'No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    for ir in range(4):
                        position = full_text.find(unique_id, position + 1)
                        pgraph = full_text[position:full_text.
                                           find('\n\n', position + 1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            ir -= 1
                            continue
                        imgtinfo.append(pgraph)  # query seq paragraph

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(
                    line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs,
                                       self.seqinfo[unique_id],
                                       label='true:')
                utils.print_reco_event(self.germline_seqs,
                                       line,
                                       label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total,
                                        float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total,
                                         float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found +
                                                 n_found, n_not_found /
                                                 float(n_not_found + n_found))
예제 #4
0
class IhhhmmmParser(object):
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, "ihhhmmm")

        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener("r")(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line["unique_id"] not in self.args.queries:
                    continue
                self.siminfo[line["unique_id"]] = line
                self.sim_need.append(line["unique_id"])
                iline += 1
                if args.n_queries > 0 and iline >= args.n_queries:
                    break

        fostream_names = glob.glob(self.args.indir + "/*.fostream")
        if len(fostream_names) == 0:
            raise Exception("no fostreams found in %s" % args.indir)
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

            with opener("r")(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print "%-20s  no info" % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ""
        print "partially failed: %d / %d = %.2f" % (
            self.n_partially_failed,
            len(self.siminfo),
            float(self.n_partially_failed) / len(self.siminfo),
        )
        print "failed:           %d / %d = %.2f" % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo))
        print ""

        self.perfplotter.plot()

    # ----------------------------------------------------------------------------------------
    def parse_file(self, infile, unique_ids):
        fk = FileKeeper(infile.readlines())
        i_id = 0
        while not fk.eof and len(self.sim_need) > 0:
            self.parse_detail(fk, unique_ids[i_id])
            i_id += 1

    # ----------------------------------------------------------------------------------------
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != "Details":
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info["unique_id"] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print "oop", begin_line, fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != "":
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find("_gene") == 1:
                    region = column[0]
                    info[region + "_5p_del"] = (
                        int(fk.line[fk.line.index("start:") + 1]) - 1
                    )  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index("gene:") + 1]) - 1
                    match_end = int(fk.line[fk.line.index("end:") + 1]) - 1
                    assert gl_length >= match_end
                    info[region + "_3p_del"] = gl_length - match_end

            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                fk.increment()
            return

        info["fv_insertion"] = ""
        info["jf_insertion"] = ""
        info["seq"] = (
            info["v_qr_seq"] + info["vd_insertion"] + info["d_qr_seq"] + info["dj_insertion"] + info["j_qr_seq"]
        )

        if "-" in info["seq"]:
            print "ERROR found a dash in %s, returning failure" % unique_id
            while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                fk.increment()
            return

        if (
            info["seq"] not in self.siminfo[unique_id]["seq"]
        ):  # arg. I can't do != because it tacks on v left and j right deletions
            print "ERROR didn't find the right sequence for %s" % unique_id
            print "  ", info["seq"]
            print "  ", self.siminfo[unique_id]["seq"]
            sys.exit()

        if self.args.debug:
            print unique_id
            for region in utils.regions:
                infer_gene = info[region + "_gene"]
                true_gene = self.siminfo[unique_id][region + "_gene"]
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color("bold", utils.color("blue", region))
                    truestr = ""  #'(originally %s)' % match_name
                else:
                    regionstr = utils.color("bold", utils.color("red", region))
                    truestr = "(true: %s)" % utils.color_gene(true_gene).replace(region, "")
                print "  %s %s %s" % (regionstr, utils.color_gene(infer_gene).replace(region, ""), truestr)

            utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label="true:", extra_str="    ")
            utils.print_reco_event(self.germline_seqs, info, label="inferred:", extra_str="    ")

        for region in utils.regions:
            if info[region + "_gene"] not in self.germline_seqs[region]:
                print "ERROR %s not in germlines" % info[region + "_gene"]
                assert False

            gl_seq = info[region + "_gl_seq"]
            if "[" in gl_seq:  # ambiguous
                for nuke in utils.nukes:
                    gl_seq = gl_seq.replace("[", nuke)
                    if gl_seq in self.germline_seqs[region][info[region + "_gene"]]:
                        print "  replaced [ with %s" % nuke
                        break
                info[region + "_gl_seq"] = gl_seq

            if info[region + "_gl_seq"] not in self.germline_seqs[region][info[region + "_gene"]]:
                print "ERROR gl match not found for %s in %s" % (info[region + "_gene"], unique_id)
                print "  ", info[region + "_gl_seq"]
                print "  ", self.germline_seqs[region][info[region + "_gene"]]
                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
            fk.increment()

    # ----------------------------------------------------------------------------------------
    def find_partial_failures(self, fostream_name):
        unique_ids = []
        for line in open(fostream_name.replace(".fostream", "")).readlines():
            if len(self.sim_need) == 0:
                return
            if len(line.strip()) == 0:  # skip blank lines
                continue

            line = line.replace('"', "")
            line = line.split(";")

            unique_id = line[0]

            if "NA" not in line:  # skip lines that were ok
                unique_ids.append(unique_id)
                continue
            if unique_id not in self.sim_need:
                continue
            if unique_id not in self.siminfo:
                continue  # not looking for this <unique_id> a.t.m.

            info = {}
            info["unique_id"] = unique_id
            for stuff in line:
                for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                    if "IGH" + region.upper() in stuff and region + "_gene" not in info:
                        genes = re.findall("IGH" + region.upper() + "[^ ][^ ]*", stuff)
                        if len(genes) == 0:
                            print "ERROR no %s genes in %s" % (region, stuff)
                        gene = genes[0]
                        if gene not in self.germline_seqs[region]:
                            print "ERROR bad gene %s for %s" % (gene, unique_id)
                            sys.exit()
                        info[region + "_gene"] = gene
            self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
            if self.args.debug:
                print "%-20s  partial fail %s %s %s" % (
                    unique_id,
                    utils.color_gene(info["v_gene"]) if "v_gene" in info else "",
                    utils.color_gene(info["d_gene"]) if "d_gene" in info else "",
                    utils.color_gene(info["j_gene"]) if "j_gene" in info else "",
                ),
                print "  (true %s %s %s)" % tuple(
                    [self.siminfo[unique_id][region + "_gene"] for region in utils.regions]
                )
            self.failtails[unique_id] = info
            self.n_partially_failed += 1
            self.sim_need.remove(unique_id)

        return unique_ids
예제 #5
0
class IgblastParser(object):
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir,
                                                  remove_N_nukes=True)

        self.perfplotter = PerformancePlotter(self.germline_seqs,
                                              self.args.plotdir, 'igblast')
        self.n_total, self.n_partially_failed = 0, 0

        # get sequence info that was passed to igblast
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.n_max_queries > 0 and iline >= self.args.n_max_queries:
                    break
                iline += 1
                if self.args.queries != None and int(
                        line['unique_id']) not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(
                        re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[int(line['unique_id'])] = line

        paragraphs = None
        print 'reading', self.args.infname
        info = {}
        with opener('r')(self.args.infname) as infile:
            line = infile.readline()
            # first find the start of the next query's section
            while line.find('<b>Query=') != 0:
                line = infile.readline()
            # then keep going till eof
            iquery = 0
            while line != '':
                if self.args.n_max_queries > 0 and iquery >= self.args.n_max_queries:
                    break
                # first find the query name
                query_name = int(line.split()[1])
                # and collect the lines for this query
                query_lines = []
                line = infile.readline()
                while line.find('<b>Query=') != 0:
                    query_lines.append(line.strip())
                    line = infile.readline()
                    if line == '':
                        break
                iquery += 1
                # then see if we want this query
                if self.args.queries != None and query_name not in self.args.queries:
                    continue
                if query_name not in self.seqinfo:
                    print 'ERROR %d not in reco info' % query_name
                    sys.exit()
                if self.args.debug:
                    print query_name
                # and finally add the query to <info[query_name]>
                info[query_name] = {'unique_id': query_name}
                self.n_total += 1
                self.process_query(info[query_name], query_name, query_lines)

        self.perfplotter.plot()
        print 'partially failed: %d / %d = %f' % (
            self.n_partially_failed, self.n_total,
            float(self.n_partially_failed) / self.n_total)

    # ----------------------------------------------------------------------------------------
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH',
                              line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(
                    self.seqinfo[query_name],
                    qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                                  qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '  ERROR no %s match for %d' % (region, query_name)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                                  qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '    add to v left:', self.seqinfo[query_name][
                    'seq'][:qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name][
                'seq'][:qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '    add to j right:', self.seqinfo[query_name][
                    'seq'][qr_info['match_end'] -
                           len(self.seqinfo[query_name]['seq']):]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][
                qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start:end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start:end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug,
                                        self.germline_seqs)
        except AssertionError:
            print 'ERROR apportionment failed on %s' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name],
                                              qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                print '    %s %3d %3d %s %s' % (
                    region, qr_info[region + '_qr_bounds'][0],
                    qr_info[region + '_qr_bounds'][1],
                    utils.color_gene(qr_info[region + '_gene']),
                    qr_info[region + '_gl_seq'])
        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start:end]
            if self.args.debug:
                print '   ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs,
                                   self.seqinfo[query_name],
                                   label='true:',
                                   extra_str='  ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')

    # ----------------------------------------------------------------------------------------
    def process_single_block(self, block, query_name, qr_info):
        assert block[0].find('Query_') == 0
        vals = block[0].split()
        qr_start = int(
            vals[1]) - 1  # converting from one-indexed to zero-indexed
        qr_seq = vals[2]
        qr_end = int(
            vals[3]
        )  # ...and from inclusive of both bounds to normal programming conventions
        if qr_seq not in self.seqinfo[query_name]['seq']:
            if '-' in qr_seq:
                print '  WARNING insertion inside query seq for %s, treating as partial failure' % query_name
                qr_info['fail'] = True
                return
            else:
                print '  ERROR query seq from igblast info not found in original query seq for %d' % query_name
                print '    %s' % qr_seq
                print '    %s' % self.seqinfo[query_name]['seq']
                sys.exit()

        if 'seq' in qr_info:
            qr_info['seq'] += qr_seq
        else:
            qr_info['seq'] = qr_seq

        # keep track of the absolute first and absolute last bases matched so we can later work out the fv and jf insertions
        if 'match_start' not in qr_info or qr_start < qr_info['match_start']:
            qr_info['match_start'] = qr_start
        if 'match_end' not in qr_info or qr_end > qr_info['match_end']:
            qr_info['match_end'] = qr_end

        if self.args.debug:
            print '      query: %3d %3d %s' % (qr_start, qr_end, qr_seq)
        for line in block[1:]:
            gene = line[line.rfind('IGH'):line.rfind('</a>')]
            region = utils.get_region(gene)
            if gene not in self.germline_seqs[region]:
                print '  ERROR %s not found in germlines' % gene
                qr_info['fail'] = True
                return

            vals = line.split()
            gl_start = int(
                vals[-3]) - 1  # converting from one-indexed to zero-indexed
            gl_seq = vals[-2]
            gl_end = int(
                vals[-1]
            )  # ...and from inclusive of both bounds to normal programming conventions

            if region + '_gene' in qr_info:
                if qr_info[region + '_gene'] == gene:
                    if self.args.debug:
                        print '        %s match: %s' % (
                            region, clean_alignment_crap(qr_seq, gl_seq))
                    qr_info[region + '_gl_seq'] = qr_info[
                        region + '_gl_seq'] + clean_alignment_crap(
                            qr_seq, gl_seq)
                    assert gl_end <= len(self.germline_seqs[region][gene])
                    qr_info[region + '_3p_del'] = len(
                        self.germline_seqs[region][gene]) - gl_end
                    qr_info[region +
                            '_qr_bounds'] = (qr_info[region + '_qr_bounds'][0],
                                             find_qr_bounds(
                                                 qr_start, qr_end, gl_seq)[1])
                else:
                    continue
            else:
                qr_info[region + '_gene'] = gene
                qr_info[region + '_gl_seq'] = clean_alignment_crap(
                    qr_seq, gl_seq)
                # deletions
                qr_info[region + '_5p_del'] = gl_start
                assert gl_end <= len(self.germline_seqs[region][gene])
                qr_info[region + '_3p_del'] = len(
                    self.germline_seqs[region][gene]) - gl_end
                # bounds
                qr_info[region + '_qr_bounds'] = find_qr_bounds(
                    qr_start, qr_end, gl_seq)
                if self.args.debug:
                    print '        %s match: %s' % (
                        region, clean_alignment_crap(qr_seq, gl_seq))
예제 #6
0
class IgblastParser(object):
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)

        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast')
        self.n_total, self.n_partially_failed, self.n_skipped = 0, 0, 0

        # get sequence info that was passed to igblast
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break
                iline += 1
                if self.args.queries != None and int(line['unique_id']) not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[int(line['unique_id'])] = line

        print 'reading', self.args.infname

        get_genes_to_skip(self.args.infname, self.germline_seqs, method='igblast', debug=False)

        paragraphs = None
        info = {}
        with opener('r')(self.args.infname) as infile:
            line = infile.readline()
            # first find the start of the next query's section
            while line.find('<b>Query=') != 0:
                line = infile.readline()
            # then keep going till eof
            iquery = 0
            while line != '':
                if self.args.n_queries > 0 and iquery >= self.args.n_queries:
                    break
                # first find the query name
                query_name = int(line.split()[1])
                # and collect the lines for this query
                query_lines = []
                line = infile.readline()
                while line.find('<b>Query=') != 0:
                    query_lines.append(line.strip())
                    line = infile.readline()
                    if line == '':
                        break
                iquery += 1
                # then see if we want this query
                if self.args.queries != None and query_name not in self.args.queries:
                    continue
                if query_name not in self.seqinfo:
                    print 'ERROR %d not in reco info' % query_name
                    sys.exit()
                if self.args.debug:
                    print query_name
                # and finally add the query to <info[query_name]>
                info[query_name] = {'unique_id':query_name}
                self.n_total += 1
                self.process_query(info[query_name], query_name, query_lines)

        self.perfplotter.plot()
        print 'partially failed: %d / %d = %f' % (self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total)
        print 'skipped: %d / %d = %f' % (self.n_skipped, self.n_total, float(self.n_skipped) / self.n_total)
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))

    # ----------------------------------------------------------------------------------------
    def process_query(self, qr_info, query_name, query_lines):
        # split query_lines up into blocks
        blocks = []
        for line in query_lines:
            if line.find('Query_') == 0:
                blocks.append([])
            if len(line) == 0:
                continue
            if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0:
                continue
            if len(blocks) == 0:
                print 'wtf? %s' % query_name  # it's probably kicking a reverse match
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)  # NOTE that's really a total failure
                self.n_partially_failed += 1
                return
            blocks[-1].append(line)

        # then process each block
        for block in blocks:
            self.process_single_block(block, query_name, qr_info)
            if 'skip_gene' in qr_info:
                self.n_skipped += 1
                return
            if 'fail' in qr_info:
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        for region in utils.regions:
            if region + '_gene' not in qr_info:
                print '    %d: no %s match' % (query_name, region)
                self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
                self.n_partially_failed += 1
                return

        # expand v match to left end and j match to right end
        qr_info['v_5p_del'] = 0
        qr_info['fv_insertion'] = ''
        if qr_info['match_start'] > 0:
            if self.args.debug:
                print '    add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']]
            qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq']

        qr_info['j_3p_del'] = 0
        qr_info['jf_insertion'] = ''
        if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']:
            if self.args.debug:
                print '    add to j right:', self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]
            qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ]

        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]

        for region in utils.regions:
            start = qr_info[region + '_qr_bounds'][0]
            end = qr_info[region + '_qr_bounds'][1]
            qr_info[region + '_qr_seq'] = qr_info['seq'][start : end]

        try:
            resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs)
        except AssertionError:
            print '    %s: apportionment failed' % query_name
            self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info)
            self.n_partially_failed += 1
            return

        if self.args.debug:
            print '  query seq:', qr_info['seq']
            for region in utils.regions:
                true_gene = self.seqinfo[query_name][region + '_gene']
                infer_gene = qr_info[region + '_gene']
                if utils.are_alleles(infer_gene, true_gene):
                    regionstr = utils.color('bold', utils.color('blue', region))
                    truestr = ''  #'(originally %s)' % match_name
                else:
                    regionstr = utils.color('bold', utils.color('red', region))
                    truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '')
                # print '  %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr)

                print '    %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq'])
        for boundary in utils.boundaries:
            start = qr_info[boundary[0] + '_qr_bounds'][1]
            end = qr_info[boundary[1] + '_qr_bounds'][0]
            qr_info[boundary + '_insertion'] = qr_info['seq'][start : end]
            if self.args.debug:
                print '   ', boundary, qr_info[boundary + '_insertion']

        self.perfplotter.evaluate(self.seqinfo[query_name], qr_info)
        # for key, val in qr_info.items():
        #     print key, val
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str='  ')
            utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
            
    # ----------------------------------------------------------------------------------------
    def process_single_block(self, block, query_name, qr_info):
        assert block[0].find('Query_') == 0
        vals = block[0].split()
        qr_start = int(vals[1]) - 1  # converting from one-indexed to zero-indexed
        qr_seq = vals[2]
        qr_end = int(vals[3])  # ...and from inclusive of both bounds to normal programming conventions
        if qr_seq not in self.seqinfo[query_name]['seq']:
            if '-' in qr_seq:
                print '    %s: insertion inside query seq, treating as partial failure' % query_name
                qr_info['fail'] = True
                return
            else:
                print '  ERROR query seq from igblast info not found in original query seq for %d' % query_name
                print '    %s' % qr_seq
                print '    %s' % self.seqinfo[query_name]['seq']
                sys.exit()

        if 'seq' in qr_info:
            qr_info['seq'] += qr_seq
        else:
            qr_info['seq'] = qr_seq


        # keep track of the absolute first and absolute last bases matched so we can later work out the fv and jf insertions
        if 'match_start' not in qr_info or qr_start < qr_info['match_start']:
            qr_info['match_start'] = qr_start
        if 'match_end' not in qr_info or qr_end > qr_info['match_end']:
            qr_info['match_end'] = qr_end

        # ----------------------------------------------------------------------------------------
        # skipping bullshit
        def skip_gene(gene):
            if self.args.debug:
                print '    %s in list of genes to skip' % utils.color_gene(gene)
            if gene not in genes_actually_skipped:
                genes_actually_skipped[gene] = 0
            genes_actually_skipped[gene] += 1
            qr_info['skip_gene'] = True

        if self.args.debug:
            print '      query: %3d %3d %s' % (qr_start, qr_end, qr_seq)
        for line in block[1:]:
            gene = line[line.rfind('IGH') : line.rfind('</a>')]
            region = utils.get_region(gene)
            true_gene = self.seqinfo[query_name][region + '_gene']

            for gset in equivalent_genes:
                if gene in gset and true_gene in gset and gene != true_gene:  # if the true gene and the inferred gene are in the same equivalence set, treat it as correct, i.e. just pretend it inferred the right name
                    if self.args.debug:
                        print '   %s: replacing name %s with true name %s' % (query_name, gene, true_gene)
                    gene = true_gene

            if gene in just_always_friggin_skip:
                continue  # go on to the next match

            if not self.args.dont_skip_or15_genes and '/OR1' in true_gene:
                skip_gene(true_gene)
                return

            if self.args.skip_missing_genes:
                if gene in genes_to_skip:
                    continue  # go on to the next match
                    # skip_gene(gene)
                    # return
                if true_gene in genes_to_skip:
                    skip_gene(true_gene)
                    return

            if gene not in self.germline_seqs[region]:
                print '    %s: %s not in germlines (skipping)' % (query_name, gene)
                skip_gene(gene)
                return
                
            vals = line.split()
            gl_start = int(vals[-3]) - 1  # converting from one-indexed to zero-indexed
            gl_seq = vals[-2]
            gl_end = int(vals[-1])  # ...and from inclusive of both bounds to normal programming conventions

            if region + '_gene' in qr_info:
                if qr_info[region + '_gene'] == gene:
                    if self.args.debug:
                        print '        %s match: %s' % (region, clean_alignment_crap(qr_seq, gl_seq))
                    qr_info[region + '_gl_seq'] = qr_info[region + '_gl_seq'] + clean_alignment_crap(qr_seq, gl_seq)
                    if gl_end > len(self.germline_seqs[region][gene]):  # not really sure what's wrong... but it seems to be rare
                        qr_info['fail'] = True
                        return
                    qr_info[region + '_3p_del'] = len(self.germline_seqs[region][gene]) - gl_end
                    qr_info[region + '_qr_bounds'] = (qr_info[region + '_qr_bounds'][0], find_qr_bounds(qr_start, qr_end, gl_seq)[1])
                else:
                    continue
            else:
                qr_info[region + '_gene'] = gene
                qr_info[region + '_gl_seq'] = clean_alignment_crap(qr_seq, gl_seq)
                # deletions
                qr_info[region + '_5p_del'] = gl_start
                assert gl_end <= len(self.germline_seqs[region][gene])
                qr_info[region + '_3p_del'] = len(self.germline_seqs[region][gene]) - gl_end
                # bounds
                qr_info[region + '_qr_bounds'] = find_qr_bounds(qr_start, qr_end, gl_seq)
                if self.args.debug:
                    print '        %s match: %s' % (region, clean_alignment_crap(qr_seq, gl_seq))
예제 #7
0
class IhhhmmmParser(object):
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir,
                                                  remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs,
                                              self.args.plotdir, 'ihhhmmm')

        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener('r')(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line[
                        'unique_id'] not in self.args.queries:
                    continue
                self.siminfo[line['unique_id']] = line
                self.sim_need.append(line['unique_id'])
                iline += 1
                if args.n_max_queries > 0 and iline >= args.n_max_queries:
                    break

        fostream_names = glob.glob(self.args.indir + '/*.fostream')
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(
                infname)  # returns list of unique ids in this file

            with opener('r')(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print '%-20s  no info' % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ''
        print 'partially failed: %d / %d = %.2f' % (
            self.n_partially_failed, len(self.siminfo),
            float(self.n_partially_failed) / len(self.siminfo))
        print 'failed:           %d / %d = %.2f' % (n_failed, len(
            self.siminfo), float(n_failed) / len(self.siminfo))
        print ''

        self.perfplotter.plot()

    # ----------------------------------------------------------------------------------------
    def parse_file(self, infile, unique_ids):
        """Parse successive detail blocks from <infile>, pairing the i-th block with unique_ids[i].

        Stops as soon as the FileKeeper reports eof or no queries are left in self.sim_need.
        """
        keeper = FileKeeper(infile.readlines())
        n_parsed = 0
        while True:
            if keeper.eof or len(self.sim_need) == 0:
                break
            self.parse_detail(keeper, unique_ids[n_parsed])
            n_parsed += 1

    # ----------------------------------------------------------------------------------------
    def parse_detail(self, fk, unique_id):
        assert fk.iline < len(fk.lines)

        while fk.line[1] != 'Details':
            fk.increment()
            if fk.eof:
                return

        fk.increment()
        info = {}
        info['unique_id'] = unique_id
        for begin_line, column, index, required, default in line_order:
            if fk.line[0].find(begin_line) != 0:
                if required:
                    print 'oop', begin_line, fk.line
                    sys.exit()
                else:
                    info[column] = default
                    continue
            if column != '':
                info[column] = clean_value(column, fk.line[index])
                # if '[' in info[column]:
                #     print 'added', column, clean_value(column, fk.line[index])
                if column.find('_gene') == 1:
                    region = column[0]
                    info[region + '_5p_del'] = int(
                        fk.line[fk.line.index('start:') +
                                1]) - 1  # NOTE their indices are 1-based
                    gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                    match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                    assert gl_length >= match_end
                    info[region + '_3p_del'] = gl_length - match_end

            fk.increment()

        if unique_id not in self.sim_need:
            while not fk.eof and fk.line[
                    1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        info['fv_insertion'] = ''
        info['jf_insertion'] = ''
        info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info[
            'd_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

        if '-' in info['seq']:
            print 'ERROR found a dash in %s, returning failure' % unique_id
            while not fk.eof and fk.line[
                    1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

        if info['seq'] not in self.siminfo[unique_id][
                'seq']:  # arg. I can't do != because it tacks on v left and j right deletions
            print 'ERROR didn\'t find the right sequence for %s' % unique_id
            print '  ', info['seq']
            print '  ', self.siminfo[unique_id]['seq']
            sys.exit()

        if self.args.debug:
            print unique_id
            utils.print_reco_event(self.germline_seqs,
                                   self.siminfo[unique_id],
                                   label='true:',
                                   extra_str='    ')
            utils.print_reco_event(self.germline_seqs,
                                   info,
                                   label='inferred:',
                                   extra_str='    ')

        for region in utils.regions:
            if info[region + '_gene'] not in self.germline_seqs[region]:
                print 'ERROR %s not in germlines' % info[region + '_gene']
                assert False

            gl_seq = info[region + '_gl_seq']
            if '[' in gl_seq:  # ambiguous
                for nuke in utils.nukes:
                    gl_seq = gl_seq.replace('[', nuke)
                    if gl_seq in self.germline_seqs[region][info[region +
                                                                 '_gene']]:
                        print '  replaced [ with %s' % nuke
                        break
                info[region + '_gl_seq'] = gl_seq

            if info[region + '_gl_seq'] not in self.germline_seqs[region][info[
                    region + '_gene']]:
                print 'ERROR gl match not found for %s in %s' % (
                    info[region + '_gene'], unique_id)
                print '  ', info[region + '_gl_seq']
                print '  ', self.germline_seqs[region][info[region + '_gene']]
                self.perfplotter.add_partial_fail(self.siminfo[unique_id],
                                                  info)
                while not fk.eof and fk.line[
                        1] != 'Details':  # skip stuff until start of next Detail block
                    fk.increment()
                return

        self.perfplotter.evaluate(self.siminfo[unique_id], info)
        self.details[unique_id] = info
        self.sim_need.remove(unique_id)

        while not fk.eof and fk.line[
                1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()

    # ----------------------------------------------------------------------------------------
    def find_partial_failures(self, fostream_name):
        """Record the partial failures listed in the file that goes with <fostream_name>.

        The file that is read is <fostream_name> with '.fostream' removed from its name. Each
        non-blank line has its double quotes removed and is split on ';'; the first field is
        used as the sequence's unique id.

          - Lines with no 'NA' field are treated as ok: their ids are collected for the return value.
          - Lines with an 'NA' field whose id is in both <self.sim_need> and <self.siminfo> are
            partial failures. The first IGH[VDJ] gene found for each region (if any) is stored,
            then the info is passed to <self.perfplotter.add_partial_fail>, saved in
            <self.failtails>, counted in <self.n_partially_failed>, and the id is removed from
            <self.sim_need>.

        Returns the list of unique ids from the ok lines.
        NOTE if <self.sim_need> is empty when a line is about to be processed, reading stops and
        None is returned instead of the list (long-standing behavior, kept as-is for callers).

        Exits the program if a gene name can't be extracted from a field that mentions IGH[VDJ],
        or if the extracted gene isn't in <self.germline_seqs>.
        """
        unique_ids = []
        # context manager so the handle is closed on every way out (normal, early return, exit)
        with open(fostream_name.replace('.fostream', '')) as summary_file:
            for line in summary_file:
                if not self.sim_need:  # nothing left to look for
                    return
                if not line.strip():  # skip blank lines
                    continue

                line = line.replace('"', '')
                line = line.split(';')

                unique_id = line[0]

                if 'NA' not in line:  # skip lines that were ok
                    unique_ids.append(unique_id)
                    continue
                if unique_id not in self.sim_need:
                    continue
                if unique_id not in self.siminfo:
                    continue  # not looking for this <unique_id> a.t.m.

                info = {'unique_id': unique_id}
                for stuff in line:
                    for region in utils.regions:  # add the first instance of IGH[VDJ] (if it's there at all)
                        prefix = 'IGH' + region.upper()
                        if prefix not in stuff or region + '_gene' in info:
                            continue
                        genes = re.findall(prefix + '[^ ][^ ]*', stuff)
                        if len(genes) == 0:
                            print 'ERROR no %s genes in %s' % (region, stuff)
                            # stop here rather than hitting an IndexError on genes[0] below
                            # (non-zero status, like the uncaught exception this replaces)
                            sys.exit(1)
                        gene = genes[0]
                        if gene not in self.germline_seqs[region]:
                            print 'ERROR bad gene %s for %s' % (gene, unique_id)
                            sys.exit()
                        info[region + '_gene'] = gene

                self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
                if self.args.debug:
                    # trailing comma: the true genes are printed on the same output line
                    print '%-20s  partial fail %s %s %s' % (
                        unique_id,
                        utils.color_gene(info['v_gene']) if 'v_gene' in info else '',
                        utils.color_gene(info['d_gene']) if 'd_gene' in info else '',
                        utils.color_gene(info['j_gene']) if 'j_gene' in info else ''),
                    print '  (true %s %s %s)' % tuple([
                        self.siminfo[unique_id][region + '_gene']
                        for region in utils.regions
                    ])
                self.failtails[unique_id] = info
                self.n_partially_failed += 1
                self.sim_need.remove(unique_id)

        return unique_ids