def write_cyst_file(known_cyst_positions):
    unaligned_genes = utils.read_germlines(args.dirname, only_region='v')['v']
    aligned_genes = utils.read_germlines(args.dirname, only_region='v', aligned=True)['v']

    common_gene = None  # we need to find at least one gene that's in the old and the new sets, so we know how to convert cyst positions
    for gene, info in known_cyst_positions.items():
        if gene in aligned_genes:
            common_gene = gene
            break
    if common_gene is None:
        raise Exception('couldn\'t find any genes in common between %s and %s, so can\'t write new cyst position file' % (args.reference, args.dirname + '/' + aligned_fname))

    aligned_seq = aligned_genes[common_gene]
    seq = unaligned_genes[common_gene]
    cpos = known_cyst_positions[common_gene]['cysteine-position']
    utils.check_conserved_cysteine(seq, cpos)
    cpos_in_alignment = cpos
    ipos = 0  # position in unaligned sequence
    n_dots_passed = 0  # number of gapped positions in the aligned sequences that we pass before getting to cpos (i.e. while ipos < cpos)
    while ipos < cpos:
        if aligned_seq[ipos + n_dots_passed] in utils.gap_chars:
            cpos_in_alignment += 1
            n_dots_passed += 1
        else:
            ipos += 1
    utils.check_conserved_cysteine(aligned_seq, cpos_in_alignment)
    displacement = cpos_in_alignment - cpos
    print '  cpos displacement: %d' % displacement
    cyst_positions = []
    for gene, seq in unaligned_genes.items():
        n_gaps = sum(1 for ch in aligned_genes[gene][ : cpos_in_alignment] if ch in utils.gap_chars)  # gaps in this gene's alignment before the (shared) aligned cysteine position
        gene_cpos = cpos_in_alignment - n_gaps  # convert back to this gene's unaligned coordinates
        utils.check_conserved_cysteine(seq, gene_cpos)
        cyst_positions.append({'gene' : gene, 'cyst_start' : gene_cpos})
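
# A minimal standalone sketch of the gap-walking conversion above, on toy
# sequences rather than real germline data: bump the position once for every
# gap character that precedes it in the aligned sequence.
def to_alignment_coords(unaligned_pos, aligned_seq, gap_chars='.-'):
    ipos, n_gaps_passed = 0, 0  # position in the unaligned seq, gaps passed so far
    while ipos < unaligned_pos:
        if aligned_seq[ipos + n_gaps_passed] in gap_chars:
            n_gaps_passed += 1
        else:
            ipos += 1
    return unaligned_pos + n_gaps_passed

print to_alignment_coords(3, 'GC..ATGTC')  # the base at unaligned position 3 sits at aligned position 5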
Example #2
    def __init__(
        self, seqfname, joinfnames, datadir
    ):  # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output)
        self.debug = 0
        self.n_max_queries = -1
        self.queries = []

        self.germline_seqs = utils.read_germlines(datadir, remove_N_nukes=False)
        assert os.path.exists(os.getenv("www"))
        self.perfplotter = PerformancePlotter(
            self.germline_seqs, os.getenv("www") + "/partis/joinsolver_performance", "js"
        )

        # get info that was passed to joinsolver
        self.seqinfo = {}
        with opener("r")(seqfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if len(self.queries) > 0 and line["unique_id"] not in self.queries:
                    continue
                self.seqinfo[line["unique_id"]] = line
                iline += 1
                if self.n_max_queries > 0 and iline >= self.n_max_queries:
                    break

        self.n_failed, self.n_total = 0, 0
        for joinfname in joinfnames:
            self.parse_file(joinfname)

        self.perfplotter.plot()
        print "failed: %d / %d = %f" % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir)  #, add_fp=True)

        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0] : row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        self.precluster_info = {}

        if self.args.seqfile is not None:
            self.input_info, self.reco_info = get_seqfile_info(
                self.args.seqfile, self.args.is_data, self.germline_seqs,
                self.cyst_positions, self.tryp_positions,
                self.args.n_max_queries, self.args.queries, self.args.reco_ids)

        self.outfile = None
        if self.args.outfname != None:
            if os.path.exists(self.args.outfname):
                os.remove(self.args.outfname)
            self.outfile = open(self.args.outfname, 'a')
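
# The {row[0] : row[1]} comprehension above keeps j_tryp.csv's header row as a
# bogus entry (hence the repeated WARNING comments). A header-aware sketch,
# assuming the file's first row is a header along the lines of 'gene,position':
import csv
with open('data/imgt/j_tryp.csv') as csv_file:
    tryp_reader = csv.reader(csv_file)
    tryp_reader.next()  # skip the header line
    tryp_positions = {row[0] : row[1] for row in tryp_reader}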
def align_new_genes(old_aligned_genes, genes_without_alignments, all_new_genes):
    print 'missing alignments for %d genes' % len(genes_without_alignments)
    old_aligned_fname = args.dirname + '/old-aligned.fasta'
    missing_fname = args.dirname + '/missing-alignments.fasta'
    msa_table_fname = args.dirname + '/msa-table.txt'
    all_fname = args.dirname + '/all.fa'
    with open(old_aligned_fname, 'w') as tmpfile:
        for gene, seq in old_aligned_genes.items():
            tmpfile.write('>%s\n%s\n' % (gene, seq.replace('.', '-')))
    with open(missing_fname, 'w') as tmpfile:
        for gene, seq in genes_without_alignments.items():
            tmpfile.write('>%s\n%s\n' % (gene, seq.replace('.', '-')))
    check_call('ruby bin/makemergetable.rb ' + old_aligned_fname + ' 1>' + msa_table_fname, shell=True)
    check_call('cat ' + old_aligned_fname + ' ' + missing_fname + ' >' + all_fname, shell=True)
    check_call('mafft --merge ' + msa_table_fname + ' ' + all_fname + ' >' + args.dirname + '/' + aligned_fname, shell=True)  # options=  # "--localpair --maxiterate 1000"

    # then rewrite aligned file with only new genes, converting to upper case and dots for gaps
    all_aligned_germlines = utils.read_germlines(args.dirname, only_region='v', aligned=True)
    with open(args.dirname + '/' + aligned_fname, 'w') as tmpfile:
        for gene, seq in all_aligned_germlines['v'].items():
            if gene not in all_new_genes:
                continue
            tmpfile.write('>%s\n%s\n' % (gene, seq.replace('-', '.').upper()))

    os.remove(old_aligned_fname)
    os.remove(missing_fname)
    os.remove(msa_table_fname)
    os.remove(all_fname)
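
# A sanity-check sketch that could be tacked onto the end of align_new_genes
# (assumption: stripping the gap characters from each merged alignment should
# give back the corresponding input sequence, modulo case):
merged = utils.read_germlines(args.dirname, only_region='v', aligned=True)['v']
for gene, aligned_seq in merged.items():
    degapped = aligned_seq
    for gap_char in utils.gap_chars:
        degapped = degapped.replace(gap_char, '')
    assert degapped == all_new_genes[gene].upper()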
Example #5
    def __init__(
        self, seqfname, joinfnames, datadir
    ):  # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output)
        self.debug = 0
        self.n_max_queries = -1
        self.queries = []

        self.germline_seqs = utils.read_germlines(datadir,
                                                  remove_N_nukes=False)
        assert os.path.exists(os.getenv('www'))
        self.perfplotter = PerformancePlotter(
            self.germline_seqs,
            os.getenv('www') + '/partis/joinsolver_performance', 'js')

        # get info that was passed to joinsolver
        self.seqinfo = {}
        with opener('r')(seqfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if len(self.queries) > 0 and line['unique_id'] not in self.queries:
                    continue
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.n_max_queries > 0 and iline >= self.n_max_queries:
                    break

        self.n_failed, self.n_total = 0, 0
        for joinfname in joinfnames:
            self.parse_file(joinfname)

        self.perfplotter.plot()
        print 'failed: %d / %d = %f' % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir,
                                                  remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs,
                                              self.args.plotdir, 'ihhhmmm')

        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener('r')(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                self.siminfo[line['unique_id']] = line
                self.sim_need.append(line['unique_id'])
                iline += 1
                if args.n_queries > 0 and iline >= args.n_queries:
                    break

        fostream_names = glob.glob(self.args.indir + '/*.fostream')
        if len(fostream_names) == 0:
            raise Exception('no fostreams found in %s' % args.indir)
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

            with opener('r')(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print '%-20s  no info' % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ''
        print 'partially failed: %d / %d = %.2f' % (self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo))
        print 'failed:           %d / %d = %.2f' % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo))
        print ''

        self.perfplotter.plot()
Example #7
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, "ihhhmmm")

        self.details = OrderedDict()
        self.failtails = {}
        self.n_partially_failed = 0

        # get sequence info that was passed to ihhhmmm
        self.siminfo = OrderedDict()
        self.sim_need = []  # list of queries that we still need to find
        with opener("r")(self.args.simfname) as seqfile:
            reader = csv.DictReader(seqfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line["unique_id"] not in self.args.queries:
                    continue
                self.siminfo[line["unique_id"]] = line
                self.sim_need.append(line["unique_id"])
                iline += 1
                if args.n_queries > 0 and iline >= args.n_queries:
                    break

        fostream_names = glob.glob(self.args.indir + "/*.fostream")
        if len(fostream_names) == 0:
            raise Exception("no fostreams found in %s" % args.indir)
        fostream_names.sort()  # maybe already sorted?
        for infname in fostream_names:
            if len(self.sim_need) == 0:
                break

            # try to get whatever you can for the failures
            unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

            with opener("r")(infname) as infile:
                self.parse_file(infile, unique_ids)

        # now check that we got results for all the queries we wanted
        n_failed = 0
        for unique_id in self.siminfo:
            if unique_id not in self.details and unique_id not in self.failtails:
                print "%-20s  no info" % unique_id
                self.perfplotter.add_fail()
                n_failed += 1

        print ""
        print "partially failed: %d / %d = %.2f" % (
            self.n_partially_failed,
            len(self.siminfo),
            float(self.n_partially_failed) / len(self.siminfo),
        )
        print "failed:           %d / %d = %.2f" % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo))
        print ""

        self.perfplotter.plot()
Example #8
    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir)  #, add_fp=True)

        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        self.precluster_info = {}

        if self.args.seqfile is not None:
            self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data,
                                                               self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                               self.args.n_max_queries, self.args.queries, self.args.reco_ids)

        self.outfile = None
        if self.args.outfname != None:
            if os.path.exists(self.args.outfname):
                os.remove(self.args.outfname)
            self.outfile = open(self.args.outfname, 'a')
    def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
        self.args = args

        if sublabel == None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot
        self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

        self.all_seqs = {}  # all the Vs, all the Ds...
        self.index_keys = {}  # this is kind of hacky, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {}  # probabilities with which each VDJ combo appears in data
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.all_seqs = utils.read_germlines(self.args.datadir)
        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0] : row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
        self.read_insertion_content()
        if self.args.naivety == 'M':  # read shm info if non-naive is requested
            # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
            with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
                reader = csv.DictReader(gtrfile)
                for line in reader:
                    parameters = line['parameter'].split('.')
                    region = parameters[0][3].lower()
                    assert region == 'v' or region == 'd' or region == 'j'
                    model = parameters[1].lower()
                    parameter_name = parameters[2]
                    assert model in self.mute_models[region]
                    self.mute_models[region][model][parameter_name] = line['value']
            treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
            self.treefname = self.workdir + '/trees.tre'
            treegen.generate_trees(seed, self.treefname)
            with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
                self.treeinfo = treefile.readlines()
            if not self.args.no_clean:
                os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
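
# Toy illustration of the 'parameter' name parsing in the gtr-reading loop
# above (the exact field format is an assumption inferred from the indexing:
# something like '<IGHV|IGHD|IGHJ>.<model>.<parameter name>'):
example = 'IGHV.GTR.ac'
parts = example.split('.')
print parts[0][3].lower(), parts[1].lower(), parts[2]  # --> v gtr ac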
Example #10
from performanceplotter import PerformancePlotter
import csv
import utils
#----------------------------
#Get user input
germlineDirectory = raw_input('Enter the path of the germline sequences: ') or 'data/imgt'
originalInputFile = raw_input('Enter the path of the original input file into mixcr: ') or 'simu-10-leaves-1-mutate.csv'
mixcrOutput = raw_input('Enter the path of the output from mixcr: ') or 'edited_output_file.txt'
mixcrPlotDir = 'mixcrPlotDir'
#----------------------------
#hardcoded default germline sequences
germline_seqs = utils.read_germlines(germlineDirectory)

#create an instance of the performance plotter class
perfplotter = PerformancePlotter(germline_seqs, mixcrPlotDir, 'mixcr')

#The true dictionary contains the correct locations taken from the original simulated data file
#The inferred dictionary (iDictionary) will contain the inferences of those locations from Mixcr
trueDictionary = {}
iDictionary = {}
with open(originalInputFile) as inFile1:
    with open(mixcrOutput) as inFile2:
        reader1 = csv.DictReader(inFile1)
        reader2 = csv.DictReader(inFile2, delimiter='\t')
        for row1, row2 in zip(reader1, reader2):
            unique_id = row1['unique_id']
            #print unique_id
Example #11
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directory up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    ir = 0
                    while ir < 4:  # a while loop, since decrementing the loop variable of a python for loop has no effect
                        position = full_text.find(unique_id, position+1)
                        if position == -1:  # ran out of occurrences
                            break
                        pgraph = full_text[position : full_text.find('\n\n', position+1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            continue  # skip this paragraph without counting it
                        imgtinfo.append(pgraph)  # query seq paragraph
                        ir += 1

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
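
# Toy illustration of the j_gene cleanup above: the simulation file's gene
# names apparently carry IMGT-style functionality tags ('_F' for functional,
# '_P' for pseudogene), which get stripped before lookup. The example name
# here is made up:
import re
j_gene = 'IGHJ4*02_F'
tags = re.findall('_[FP]', j_gene)
if len(tags) > 0:
    j_gene = j_gene.replace(tags[0], '')
print j_gene  # --> IGHJ4*02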
Example #12
import sys
sys.path.insert(1, './python')
import csv
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

germline_seqs = utils.read_germlines(args.datadir)
cyst_positions = utils.read_cyst_positions(args.datadir)
with open(args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region
    tryp_reader = csv.reader(csv_file)
    tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, germline_seqs, cyst_positions, tryp_positions)

cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(abbreviate=(not args.dont_abbreviate), n_to_print=args.n_to_print, reco_info=reco_info)
Example #13
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir)

        perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir,
                                         'imgt')

        # get sequence info that was passed to imgt
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.queries != None and line['unique_id'] not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[line['unique_id']] = line
                iline += 1
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break

        paragraphs, csv_info = None, None
        if self.args.infname != None and '.html' in self.args.infname:
            print 'reading', self.args.infname
            with opener('r')(self.args.infname) as infile:
                soup = BeautifulSoup(infile)
                paragraphs = soup.find_all('pre')

        summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directory up from <indir>, which has the detailed per-sequence files
        summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
        assert len(summary_fname) == 1
        summary_fname = summary_fname[0]
        get_genes_to_skip(summary_fname, self.germline_seqs)

        n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
        for unique_id in self.seqinfo:
            if self.args.debug:
                print unique_id,
            imgtinfo = []
            # print 'true'
            # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
            if self.args.infname != None and '.html' in self.args.infname:
                for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                    if unique_id in pre.text:
                        imgtinfo.append(pre.text)
            else:
                n_total += 1
                assert self.args.infname == None
                infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
                assert len(infnames) <= 1
                if len(infnames) != 1:
                    if self.args.debug:
                        print ' couldn\'t find it'
                    n_not_found += 1
                    continue
                n_found += 1
                with opener('r')(infnames[0]) as infile:
                    full_text = infile.read()
                    if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                        failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                        if self.args.debug and len(failregions) > 0:
                            print '    ', failregions
                        n_failed += 1
                        continue

                    # loop over the paragraphs I want
                    position = full_text.find(unique_id)  # don't need this one
                    ir = 0
                    while ir < 4:  # a while loop, since decrementing the loop variable of a python for loop has no effect
                        position = full_text.find(unique_id, position + 1)
                        if position == -1:  # ran out of occurrences
                            break
                        pgraph = full_text[position : full_text.find('\n\n', position + 1)]
                        if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                            continue  # skip this paragraph without counting it
                        imgtinfo.append(pgraph)  # query seq paragraph
                        ir += 1

            if len(imgtinfo) == 0:
                print '%s no info' % unique_id
                continue
            else:
                if self.args.debug:
                    print ''
            line = self.parse_query_text(unique_id, imgtinfo)
            if 'skip_gene' in line:
                # assert self.args.skip_missing_genes
                n_skipped += 1
                continue
            try:
                assert 'failed' not in line
                joinparser.add_insertions(line, debug=self.args.debug)
                joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
            except (AssertionError, KeyError):
                print '    giving up'
                n_failed += 1
                perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
                # print '    perfplotter: not sure what to do with a fail'
                continue
            perfplotter.evaluate(self.seqinfo[unique_id], line)
            if self.args.debug:
                utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
                utils.print_reco_event(self.germline_seqs, line, label='inferred:')

        perfplotter.plot()
        print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
        print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
        print '    ',
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
        print ''
        if n_not_found > 0:
            print '  not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
Example #14
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir,
                                                  remove_N_nukes=True)

        self.perfplotter = PerformancePlotter(self.germline_seqs,
                                              self.args.plotdir, 'igblast')
        self.n_total, self.n_partially_failed = 0, 0

        # get sequence info that was passed to igblast
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.n_max_queries > 0 and iline >= self.args.n_max_queries:
                    break
                iline += 1
                if self.args.queries != None and int(line['unique_id']) not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[int(line['unique_id'])] = line

        paragraphs = None
        print 'reading', self.args.infname
        info = {}
        with opener('r')(self.args.infname) as infile:
            line = infile.readline()
            # first find the start of the next query's section
            while line.find('<b>Query=') != 0:
                line = infile.readline()
            # then keep going till eof
            iquery = 0
            while line != '':
                if self.args.n_max_queries > 0 and iquery >= self.args.n_max_queries:
                    break
                # first find the query name
                query_name = int(line.split()[1])
                # and collect the lines for this query
                query_lines = []
                line = infile.readline()
                while line.find('<b>Query=') != 0:
                    query_lines.append(line.strip())
                    line = infile.readline()
                    if line == '':
                        break
                iquery += 1
                # then see if we want this query
                if self.args.queries != None and query_name not in self.args.queries:
                    continue
                if query_name not in self.seqinfo:
                    print 'ERROR %d not in reco info' % query_name
                    sys.exit()
                if self.args.debug:
                    print query_name
                # and finally add the query to <info[query_name]>
                info[query_name] = {'unique_id': query_name}
                self.n_total += 1
                self.process_query(info[query_name], query_name, query_lines)

        self.perfplotter.plot()
        print 'partially failed: %d / %d = %f' % (self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total)
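
# A minimal sketch of the '<b>Query=' section-splitting loop above, run on a
# toy two-query stream instead of a real igblast html file:
from StringIO import StringIO
toyfile = StringIO('preamble\n<b>Query= 1\nline a\n<b>Query= 2\nline b\n')
line = toyfile.readline()
while line.find('<b>Query=') != 0:  # skip ahead to the first query's section
    line = toyfile.readline()
while line != '':
    query_name = int(line.split()[1])
    query_lines = []
    line = toyfile.readline()
    while line != '' and line.find('<b>Query=') != 0:
        query_lines.append(line.strip())
        line = toyfile.readline()
    print query_name, query_lines  # --> 1 ['line a'], then 2 ['line b']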
Example #15
# ----------------------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('ighv_fname', help='input germline v set (presumably a new one), in fasta')
parser.add_argument('--dirname', help='directory name for output (if not specified, we use <infname> with suffix removed)')
parser.add_argument('--reference-dir', default='data/imgt', help='directory with reference/old germline sets')
args = parser.parse_args()
if args.dirname is None:
    args.dirname = os.path.splitext(args.ighv_fname)[0]

files_to_copy = ['ighd.fasta', 'ighj.fasta', 'j_tryp.csv']
unaligned_fname = 'ighv.fasta'
aligned_fname = 'ighv-aligned.fasta'

# ----------------------------------------------------------------------------------------
# figure out which v genes we need to align
old_aligned_genes = utils.read_germlines(args.reference_dir, only_region='v', aligned=True)
all_new_genes = utils.read_germlines(args.dirname, only_region='v')  # all genes in ighv_fname, not just the new ones
genes_without_alignments = {}
for gene in all_new_genes['v']:
    if gene not in old_aligned_genes['v']:
        genes_without_alignments[gene] = all_new_genes['v'][gene]

# clean_dir()
# shutil.copyfile(args.ighv_fname, args.dirname + '/' + unaligned_fname)
# if len(genes_without_alignments) > 0:
#     align_new_genes(old_aligned_genes['v'], genes_without_alignments, all_new_genes['v'])
# for fname in files_to_copy:
#     shutil.copyfile(args.reference_dir + '/' + fname, args.dirname + '/' + fname)

known_cyst_positions = utils.read_cyst_positions(args.reference_dir)
write_cyst_file(known_cyst_positions)
Example #16
# This script takes the gene-location inferences from mixcr (as a text file) and writes a directory with the results in both table and histogram form.
#----------------------------
#Import relevant packages
from performanceplotter import PerformancePlotter
import csv
import utils
#----------------------------
#Get user input
germlineDirectory = raw_input('Enter the path of the germline sequences: ') or 'data/imgt'
originalInputFile = raw_input('Enter the path of the original input file into mixcr: ') or 'simu-10-leaves-1-mutate.csv'
mixcrOutput = raw_input('Enter the path of the output from mixcr: ') or 'edited_output_file.txt'
mixcrPlotDir = 'mixcrPlotDir'
#----------------------------
#hardcoded default germline sequences
germline_seqs = utils.read_germlines(germlineDirectory) 

#create an instance of the performance plotter class
perfplotter = PerformancePlotter(germline_seqs, mixcrPlotDir, 'mixcr')

#The true dictionary contains the correct locations taken from the original simulated data file
#The inferred dictionary (iDictionary) will contain the inferences of those locations from Mixcr
trueDictionary = {}
iDictionary = {}
with open(originalInputFile) as inFile1:
	with open(mixcrOutput) as inFile2:
		reader1 = csv.DictReader(inFile1)
		reader2 = csv.DictReader(inFile2, delimiter='\t')
		for row1, row2 in zip(reader1, reader2):
			unique_id = row1['unique_id']
			#print unique_id
			trueDictionary[unique_id] = {}
Example #17
    def __init__(self, args):
        self.args = args

        self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)

        self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast')
        self.n_total, self.n_partially_failed, self.n_skipped = 0, 0, 0

        # get sequence info that was passed to igblast
        self.seqinfo = {}
        with opener('r')(self.args.simfname) as simfile:
            reader = csv.DictReader(simfile)
            iline = 0
            for line in reader:
                if self.args.n_queries > 0 and iline >= self.args.n_queries:
                    break
                iline += 1
                if self.args.queries != None and int(line['unique_id']) not in self.args.queries:
                    continue
                if len(re.findall('_[FP]', line['j_gene'])) > 0:
                    line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
                self.seqinfo[int(line['unique_id'])] = line

        print 'reading', self.args.infname

        get_genes_to_skip(self.args.infname, self.germline_seqs, method='igblast', debug=False)

        paragraphs = None
        info = {}
        with opener('r')(self.args.infname) as infile:
            line = infile.readline()
            # first find the start of the next query's section
            while line.find('<b>Query=') != 0:
                line = infile.readline()
            # then keep going till eof
            iquery = 0
            while line != '':
                if self.args.n_queries > 0 and iquery >= self.args.n_queries:
                    break
                # first find the query name
                query_name = int(line.split()[1])
                # and collect the lines for this query
                query_lines = []
                line = infile.readline()
                while line.find('<b>Query=') != 0:
                    query_lines.append(line.strip())
                    line = infile.readline()
                    if line == '':
                        break
                iquery += 1
                # then see if we want this query
                if self.args.queries != None and query_name not in self.args.queries:
                    continue
                if query_name not in self.seqinfo:
                    print 'ERROR %d not in reco info' % query_name
                    sys.exit()
                if self.args.debug:
                    print query_name
                # and finally add the query to <info[query_name]>
                info[query_name] = {'unique_id':query_name}
                self.n_total += 1
                self.process_query(info[query_name], query_name, query_lines)

        self.perfplotter.plot()
        print 'partially failed: %d / %d = %f' % (self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total)
        print 'skipped: %d / %d = %f' % (self.n_skipped, self.n_total, float(self.n_skipped) / self.n_total)
        for g, n in genes_actually_skipped.items():
            print '  %d %s' % (n, utils.color_gene(g))
Example #18
# for human in A B C; do
#     datadir=data/human-beings/$human/M/data
#     bzgrep -m100 . $datadir/data.tsv.bz2 | sed 's/[ \t][ \t]*/,/g'|cut -f2 -d, |sed 's/nucleotide/seq/'> $datadir/head-data.csv
# done
naivety = 'M'
infname = ''
if data_type == 'simu':
    infname = '/home/dralph/Dropbox/work/recombinator/output/' + human + '/' + naivety + '/simu.csv'
else:
    infname = 'data/human-beings/' + human + '/' + naivety + '/' + data_type + '/head-data.csv'
baseoutdir = 'data/human-beings/' + human + '/' + naivety + '/' + data_type

print 'opening', infname
print '  output', baseoutdir
with opener('r')(infname) as infile:
    germlines = utils.read_germlines('../../../recombinator')
    reader = csv.DictReader(infile)
    il = 0
    for inline in reader:
        il += 1
        print inline['seq'][-100:]
        # if len(inline['seq']) != 130:
        #     assert 'simulated' in infname
        searcher = Searcher(inline['seq'][-100:], debug=False, n_matches_max=5)
        found_str = searcher.search()
        values['found_strings']['v'][0].append(found_str)  # toss them in ['v'][0] -- doesn't really make sense, but they're fine anywhere
        if found_str != 'vjd':  # skip the ones where we didn't find matches in this order (see freqs above).
            continue
        for region in utils.regions:
            for imatch in range(len(searcher.matches[region])):
                if imatch > 2:
Example #19
    def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
        self.args = args

        if sublabel == None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot
        self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)
    
        self.all_seqs = {}  # all the Vs, all the Ds...
        self.index_keys = {}  # this is kind of hacky, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {}  # probabilities with which each VDJ combo appears in data
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.all_seqs = utils.read_germlines(self.args.datadir)
        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
        self.read_insertion_content()
        if self.args.naivety == 'M':  # read shm info if non-naive is requested
            # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
            with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
                reader = csv.DictReader(gtrfile)
                for line in reader:
                    parameters = line['parameter'].split('.')
                    region = parameters[0][3].lower()
                    assert region == 'v' or region == 'd' or region == 'j'
                    model = parameters[1].lower()
                    parameter_name = parameters[2]
                    assert model in self.mute_models[region]
                    self.mute_models[region][model][parameter_name] = line['value']
            treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
            self.treefname = self.workdir + '/trees.tre'
            treegen.generate_trees(seed, self.treefname)
            with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
                self.treeinfo = treefile.readlines()
            if not self.args.no_clean:
                os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Example #20
import numpy
from subprocess import check_call
import itertools
from collections import OrderedDict

import utils
import plotting

# ----------------------------------------------------------------------------------------
datadir = 'data/imgt'
xtitles = {
    'indels' : 'fraction of positions indel\'d',
    'subs' : 'substitution fraction'
}
glfo = {}
glfo['seqs'] = utils.read_germlines(datadir)
glfo['aligned-v-genes'] = utils.read_germlines(datadir, only_region='v', aligned=True)
vgenes = glfo['aligned-v-genes']['v'].keys()
pversions = OrderedDict()
for vg in vgenes:
    pv = utils.primary_version(vg)
    if pv not in pversions:
        pversions[pv] = []
    pversions[pv].append(vg)

# remove primary versions that only have one gene
for pv in pversions.keys():  # .keys() returns a copy in python 2, so it's safe to delete entries while looping
    if len(pversions[pv]) == 1:
        print 'removing single-gene pv %s' % pv
        del pversions[pv]
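
# Toy illustration of the grouping above (assuming utils.primary_version pulls
# the family number out of names like 'IGHV3-23*01', i.e. returns '3' here):
toy_genes = ['IGHV3-23*01', 'IGHV3-30*02', 'IGHV1-2*01']
toy_pversions = OrderedDict()
for vg in toy_genes:
    pv = vg[4 : vg.find('-')]  # stand-in for utils.primary_version(vg)
    toy_pversions.setdefault(pv, []).append(vg)
print toy_pversions.items()  # --> [('3', ['IGHV3-23*01', 'IGHV3-30*02']), ('1', ['IGHV1-2*01'])]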