def __init__(self, args):
    """Load germline info and (optionally) the input sequences, and open the output file.

    Args:
        args: parsed command-line namespace.  This method reads
            args.datadir, args.seqfile, args.is_data, args.n_max_queries,
            args.queries, args.reco_ids and args.outfname.

    Attributes set here:
        germline_seqs: return value of utils.read_germlines(args.datadir).
        cyst_positions: contents of <datadir>/v-meta.json.
        tryp_positions: dict mapping column 0 to column 1 of <datadir>/j_tryp.csv
            (values are left as the strings the csv reader yields).
        precluster_info: empty dict.
        input_info, reco_info: only set when args.seqfile is not None.
        outfile: None, or a file object opened on args.outfname.
    """
    self.args = args
    self.germline_seqs = utils.read_germlines(self.args.datadir)  #, add_fp=True)

    # get location of <begin> cysteine in each v region
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:
        self.cyst_positions = json.load(json_file)

    # get location of <end> tryptophan in each j region (TGG)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:
        tryp_reader = csv.reader(csv_file)
        # WARNING: this doesn't filter out the header line
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}

    self.precluster_info = {}

    # NOTE: input_info and reco_info are left undefined when no seqfile is given
    if self.args.seqfile is not None:
        self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data,
                                                           self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                           self.args.n_max_queries, self.args.queries, self.args.reco_ids)

    self.outfile = None
    if self.args.outfname is not None:  # identity test: None is a singleton
        # remove any existing file first, so the append-mode handle starts from an empty file
        if os.path.exists(self.args.outfname):
            os.remove(self.args.outfname)
        self.outfile = open(self.args.outfname, 'a')
def peruse_forward_scores():
    """Plot how the forward log probs vary with the number of simultaneous sequences.

    For each n_set in the module-level n_set_list this reads
    <outputdir>/<n_set>-forward.csv and stores, keyed by reco_id, the raw
    'logprob' value plus two rescaled versions (a plain 1/n_set factor and
    an empirical correction factor).  The three tables are handed to
    get_deviations() with baseline index -1, and the results are drawn with
    the plotting module into baseplotdir as 'forwards',
    'corrected-forwards' and 'signed-corrected-forwards'.

    Raises:
        AssertionError: if utils.from_same_event() is false for a row's unique ids.
        Exception: if the same reco_id appears twice for one n_set.
    """
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)

    logprobs = OrderedDict()
    partialcorr_logprobs = OrderedDict()
    corr_logprobs = OrderedDict()
    for n_set in n_set_list:
        print(n_set)
        # if n_set != 5:
        #     continue
        raw_table, partial_table, corr_table = OrderedDict(), OrderedDict(), OrderedDict()
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = raw_table, partial_table, corr_table
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            for row in csv.DictReader(csvfile):
                uidlist = row['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']  # all uids are from one event, so the first one is enough
                if reco_id in raw_table:
                    raise Exception('already had %s' % reco_id)

                logprob = float(row['logprob'])
                raw_table[reco_id] = logprob

                # plain 1/n scaling
                partial_factor = 1. / n_set
                partial_table[reco_id] = partial_factor * logprob

                # empirical correction on top of the 1/n scaling
                corr_factor = (1. - 0.24 / float(n_set) ** 0.9) / n_set
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_table[reco_id] = corr_factor * logprob

    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting

    def finish(axis, plotname):
        # every plot shares the same directory and axis labels
        plotting.mpl_finish(axis, baseplotdir, plotname, xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # uncorrected deviations
    _fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    finish(ax, 'forwards')

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # both corrections, absolute and signed, on one set of axes
    _fig, ax = plotting.mpl_init()
    labelled_series = (
        (partialcorr_deviations, '1/n (abs)'),
        (signed_partialcorr_deviations, '1/n'),
        (corr_deviations, '1/crap (abs)'),
        (signed_corr_deviations, '1/crap'),
    )
    for series, label in labelled_series:
        ax.plot(n_set_list, series, marker='.', label=label)
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])  # horizontal reference line at zero
    finish(ax, 'corrected-forwards')

    # signed, fully corrected deviations on their own
    _fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    finish(ax, 'signed-corrected-forwards')
def __init__(self, args):
    """Load germline info and (optionally) the input sequences, and open the output file.

    Args:
        args: parsed command-line namespace.  This method reads
            args.datadir, args.seqfile, args.is_data, args.n_max_queries,
            args.queries, args.reco_ids and args.outfname.

    Attributes set here:
        germline_seqs: return value of utils.read_germlines(args.datadir).
        cyst_positions: contents of <datadir>/v-meta.json.
        tryp_positions: dict mapping column 0 to column 1 of <datadir>/j_tryp.csv
            (values are left as the strings the csv reader yields).
        precluster_info: empty dict.
        input_info, reco_info: only set when args.seqfile is not None.
        outfile: None, or a file object opened on args.outfname.
    """
    self.args = args
    datadir = self.args.datadir
    self.germline_seqs = utils.read_germlines(datadir)  #, add_fp=True)

    # get location of <begin> cysteine in each v region
    with opener('r')(datadir + '/v-meta.json') as json_file:
        self.cyst_positions = json.load(json_file)

    # get location of <end> tryptophan in each j region (TGG)
    with opener('r')(datadir + '/j_tryp.csv') as csv_file:
        tryp_reader = csv.reader(csv_file)
        # WARNING: this doesn't filter out the header line
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}

    self.precluster_info = {}

    # NOTE: input_info and reco_info are left undefined when no seqfile is given
    if self.args.seqfile is not None:
        self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data,
                                                           self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                           self.args.n_max_queries, self.args.queries, self.args.reco_ids)

    self.outfile = None
    if self.args.outfname is not None:  # identity test: None is a singleton
        # remove any existing file first, so the append-mode handle starts from an empty file
        if os.path.exists(self.args.outfname):
            os.remove(self.args.outfname)
        self.outfile = open(self.args.outfname, 'a')
def peruse_forward_scores():
    """Plot how the forward log probs vary with the number of simultaneous sequences.

    For each n_set in the module-level n_set_list this reads
    <outputdir>/<n_set>-forward.csv and stores, keyed by reco_id, the raw
    'logprob' value plus two rescaled versions (a plain 1/n_set factor and
    an empirical correction factor).  The three tables are handed to
    get_deviations() with baseline index -1, and the results are drawn with
    the plotting module into baseplotdir as 'forwards',
    'corrected-forwards' and 'signed-corrected-forwards'.

    Raises:
        AssertionError: if utils.from_same_event() is false for a row's unique ids.
        Exception: if the same reco_id appears twice for one n_set.
    """
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)

    logprobs = OrderedDict()
    partialcorr_logprobs = OrderedDict()
    corr_logprobs = OrderedDict()
    for n_set in n_set_list:
        print(n_set)
        # if n_set != 5:
        #     continue
        raw_table, partial_table, corr_table = OrderedDict(), OrderedDict(), OrderedDict()
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = raw_table, partial_table, corr_table
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            for row in csv.DictReader(csvfile):
                uidlist = row['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']  # all uids are from one event, so the first one is enough
                if reco_id in raw_table:
                    raise Exception('already had %s' % reco_id)

                logprob = float(row['logprob'])
                raw_table[reco_id] = logprob

                # plain 1/n scaling
                partial_factor = 1. / n_set
                partial_table[reco_id] = partial_factor * logprob

                # empirical correction on top of the 1/n scaling
                corr_factor = (1. - 0.24 / float(n_set) ** 0.9) / n_set
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_table[reco_id] = corr_factor * logprob

    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting

    def finish(axis, plotname):
        # every plot shares the same directory and axis labels
        plotting.mpl_finish(axis, baseplotdir, plotname, xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # uncorrected deviations
    _fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    finish(ax, 'forwards')

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # both corrections, absolute and signed, on one set of axes
    _fig, ax = plotting.mpl_init()
    labelled_series = (
        (partialcorr_deviations, '1/n (abs)'),
        (signed_partialcorr_deviations, '1/n'),
        (corr_deviations, '1/crap (abs)'),
        (signed_corr_deviations, '1/crap'),
    )
    for series, label in labelled_series:
        ax.plot(n_set_list, series, marker='.', label=label)
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])  # horizontal reference line at zero
    finish(ax, 'corrected-forwards')

    # signed, fully corrected deviations on their own
    _fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    finish(ax, 'signed-corrected-forwards')
#!/usr/bin/env python
# Read a cluster path from --infname and print its partitions, optionally
# annotated with the simulation truth read from --simfname.
import sys
sys.path.insert(1, './python')  # must come before the project imports below
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

# ----------------------------------------------------------------------------------------
# command line
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument(
    '--dont-abbreviate',
    action='store_true',
    help='Print full seq IDs (otherwise just prints an \'o\')',
)
parser.add_argument(
    '--n-to-print',
    type=int,
    help='How many partitions to print (centered on the best partition)',
)
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

# ----------------------------------------------------------------------------------------
# germline info, plus true reco info when a simulation file was given
glfo = utils.read_germline_set(args.datadir)

reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

# ----------------------------------------------------------------------------------------
# read and print the partitions
cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(
    abbreviate=(not args.dont_abbreviate),
    n_to_print=args.n_to_print,
    reco_info=reco_info,
)
sys.path.insert(1, './python')  # must come before the project imports below
import csv
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

# ----------------------------------------------------------------------------------------
# command line
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument(
    '--dont-abbreviate',
    action='store_true',
    help='Print full seq IDs (otherwise just prints an \'o\')',
)
parser.add_argument(
    '--n-to-print',
    type=int,
    help='How many partitions to print (centered on the best partition)',
)
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

# ----------------------------------------------------------------------------------------
# germline info
germline_seqs = utils.read_germlines(args.datadir)
cyst_positions = utils.read_cyst_positions(args.datadir)
# get location of <end> tryptophan in each j region
with open(args.datadir + '/j_tryp.csv') as csv_file:
    tryp_reader = csv.reader(csv_file)
    # WARNING: this doesn't filter out the header line
    tryp_positions = dict((row[0], row[1]) for row in tryp_reader)

# true reco info is only available when a simulation file was given
reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, germline_seqs, cyst_positions, tryp_positions)

# ----------------------------------------------------------------------------------------
# read and print the partitions
cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(
    abbreviate=(not args.dont_abbreviate),
    n_to_print=args.n_to_print,
    reco_info=reco_info,
)
# Work out the project directory from this script's location (dropping '/bin'),
# so that the project's python/ directory can be put on the import path.
partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')
if not os.path.exists(partis_dir):
    print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir
sys.path.insert(1, partis_dir + '/python')

# project imports: these rely on the sys.path entry added just above
import utils
import seqfileopener

# command line: positional input and output files, plus chimera options
parser = argparse.ArgumentParser()
parser.add_argument('infile')
parser.add_argument('outfile')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--chimera-freq', default=1., type=float, help='fraction of sequences to make chimeric')
parser.add_argument('--min-chunk-len', default=15, type=int, help='require that each bit of the chimera is at least this long')
args = parser.parse_args()

# read the input sequences; the second return value is not needed here
input_info, _ = seqfileopener.get_seqfile_info(args.infile, is_data=False)
if len(input_info) < 50:
    print '%s making chimeras with only %d sequences, and since we choose from among the existing sequence for templates this won\'t be very effective' % (utils.color('yellow', 'warning'), len(input_info))

n_chimeric = 0
outfo = collections.OrderedDict()
for uid, seqfo in input_info.items():
    if args.debug:
        print uid
    # draw a uniform number in [0, 1) and skip this sequence if it exceeds the requested chimera fraction
    if numpy.random.uniform(0, 1) > args.chimera_freq:  # no chimeras for this sequence
        if args.debug:
            print ' non-chimeric'
        continue
    # choose the break point so that each side is at least --min-chunk-len long
    # NOTE(review): random.randint raises ValueError when the sequence is shorter than 2 * min_chunk_len -- confirm inputs are always long enough
    break_point = random.randint(args.min_chunk_len, len(seqfo['seqs'][0]) - args.min_chunk_len)
import utils
from seqfileopener import get_seqfile_info
from opener import opener

# command line
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--outdir', required=True)
parser.add_argument('--start-indices', required=True)  # colon-separated list of start indices. E.g. with '0:1:2' we will write three output files. The first seq line in <infname> goes to 0, the next to 1, the third to 2, and then we skip 97 seqs, then yadda yadda
parser.add_argument('--modulo', type=int, default=100)
args = parser.parse_args()
# turn the --start-indices string into a list via utils.get_arg_list (intify=True)
args.start_indices = utils.get_arg_list(args.start_indices, intify=True)

print 'subsetting %s: every %d th sequence' % (args.infname, args.modulo)
# NOTE(review): infile is opened here but not used in the code that follows in this section -- confirm it is needed (and closed) later
infile = opener('r')(args.infname)
input_info, _ = get_seqfile_info(args.infname, is_data=True)  #, n_max_queries=1000)
for key, d in input_info.items():  # get field names (they should be the same for each row, this just grabs the first one)
    fieldnames = d.keys()
    break  # NOTE: fieldnames stays undefined if input_info is empty

utils.prep_dir(args.outdir)  #, '*.bz2')
# one output file and one csv writer per start index, both keyed by that index
outfiles, writers = {}, {}
for iout in args.start_indices:
    # '%' binds tighter than '+', so only the final '-subset-%d.csv.bz2' piece is formatted with iout
    outfname = args.outdir + ('/every-' + str(args.modulo) + '-subset-%d.csv.bz2' % iout)
    outfiles[iout] = opener('w')(outfname)
    writers[iout] = csv.DictWriter(outfiles[iout], fieldnames, delimiter=',')
    writers[iout].writeheader()

# counters, both starting at zero
iline = 0
n_written = 0
for line in input_info.values():
from opener import opener

# command line
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--outdir', required=True)
parser.add_argument('--start-indices', required=True)  # colon-separated list of start indices. E.g. with '0:1:2' we will write three output files. The first seq line in <infname> goes to 0, the next to 1, the third to 2, and then we skip 97 seqs, then yadda yadda
parser.add_argument('--modulo', type=int, default=100)
args = parser.parse_args()
# turn the --start-indices string into a list via utils.get_arg_list (intify=True)
args.start_indices = utils.get_arg_list(args.start_indices, intify=True)

print 'subsetting %s: every %d th sequence' % (args.infname, args.modulo)
# NOTE(review): infile is opened here but not used in the code that follows in this section -- confirm it is needed (and closed) later
infile = opener('r')(args.infname)
input_info, _ = get_seqfile_info(args.infname, is_data=True)  #, n_max_queries=1000)
for key, d in input_info.items():  # get field names (they should be the same for each row, this just grabs the first one)
    fieldnames = d.keys()
    break  # NOTE: fieldnames stays undefined if input_info is empty

utils.prep_dir(args.outdir)  #, '*.bz2')
# one output file and one csv writer per start index, both keyed by that index
outfiles, writers = {}, {}
for iout in args.start_indices:
    # '%' binds tighter than '+', so only the final '-subset-%d.csv.bz2' piece is formatted with iout
    outfname = args.outdir + ('/every-' + str(args.modulo) + '-subset-%d.csv.bz2' % iout)
    outfiles[iout] = opener('w')(outfname)
    writers[iout] = csv.DictWriter(outfiles[iout], fieldnames, delimiter=',')
    writers[iout].writeheader()

# line counter, starting at zero
iline = 0