def read_partition_performance(self, version_stype, input_stype, debug=False): """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """ def do_this_test(pt): if 'partition' not in pt: return False if input_stype not in pt: return False if args.quick and pt not in self.quick_tests: return False return True ptest_list = [k for k in self.tests.keys() if do_this_test(k)] if len(ptest_list) == 0: return if debug: print ' version %s input %s partitioning' % (version_stype, input_stype) print ' precision sensitivity test description' for ptest in ptest_list: cp = ClusterPath(-1) cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv') ccfs = cp.ccfs[cp.i_best] if None in ccfs: raise Exception('none type ccf read from %s' % self.dirs[version_stype] + '/' + ptest + '.csv') self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = ccfs if debug: print ' %5.2f %5.2f %-28s to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
def read_partition_performance(self, version_stype, input_stype, debug=False): """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """ ptest = "partition-" + input_stype + "-simu" if args.quick and ptest not in self.quick_tests: return if debug: print " version %s input %s partitioning" % (version_stype, input_stype) print " adj mi ccf under/over test description" for ptest in [k for k in self.tests.keys() if "partition" in k and input_stype in k]: if args.quick and ptest not in self.quick_tests: continue cp = ClusterPath(-1) cp.readfile(self.dirs[version_stype] + "/" + ptest + ".csv") if "data" in ptest: raise Exception("needs fixing") ref_cp = ClusterPath(-1) ref_cp.readfile(self.dirs["xxxref"] + "/" + ptest + ".csv") self.perf_info["xxx"][ptest] = utils.adjusted_mutual_information( cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best] ) # adj mi between the reference and the new data partitions if debug: print " %5.2f %-28s to reference partition" % (self.perf_info["xxx"][ptest], ptest) else: self.perf_info[version_stype][ptest + "-adj_mi"] = cp.adj_mis[cp.i_best] # adj mi to true partition self.perf_info[version_stype][ptest + "-ccf_under"], self.perf_info[version_stype][ ptest + "-ccf_over" ] = cp.ccfs[cp.i_best] if debug: print " %5.2f %5.2f %5.2f %-28s to true partition" % ( self.perf_info[version_stype][ptest + "-adj_mi"], self.perf_info[version_stype][ptest + "-ccf_under"], self.perf_info[version_stype][ptest + "-ccf_over"], ptest, )
def make_cluster_size_distribution(self, base_plotdir, partition=None, infiles=None):
    """
    Plot the cluster size histogram for either one partition or the mean over several files.

    If <partition> is set, its cluster size hist is plotted. Otherwise, if <infiles> is set, the
    best partition is read from each file, each file's hist is written to
    subset-<i>-cluster-sizes.csv in the plot dir, and the mean hist is plotted. At least one of
    the two must be set (asserted).

    Returns [[<subd>/cluster-sizes.svg]], where <subd> and the plot dir come from
    self.init_subd('sizes', base_plotdir).
    """
    def best_partition_hist(fname):
        # cluster size hist for the best partition in <fname>
        cpath = ClusterPath()
        cpath.readfile(fname)
        return self.plotting.get_cluster_size_hist(cpath.partitions[cpath.i_best])

    subd, plotdir = self.init_subd('sizes', base_plotdir)

    if partition is not None:  # one partition
        csize_hists = {'best': self.plotting.get_cluster_size_hist(partition)}
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = [best_partition_hist(fname) for fname in infiles]
        csize_hists = {'best': self.plotting.make_mean_hist(subset_hists)}
        for ihist, hist in enumerate(subset_hists):
            hist.write(plotdir + ('/subset-%d-cluster-sizes.csv' % ihist))
    else:
        assert False

    svg_name = '/cluster-sizes.svg'
    self.plotting.plot_cluster_size_hists(plotdir + svg_name, csize_hists, title='', log='x')
    return [[subd + svg_name]]
def read_partition_performance(self, version_stype, input_stype, debug=False): """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """ def do_this_test(pt): if 'partition' not in pt: return False if input_stype not in pt: return False if args.quick and pt not in self.quick_tests: return False return True ptest_list = [k for k in self.tests.keys() if do_this_test(k)] if len(ptest_list) == 0: return if debug: print ' version %s input %s partitioning' % (version_stype, input_stype) print ' purity completeness test description' for ptest in ptest_list: cp = ClusterPath(-1) cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv') ccfs = cp.ccfs[cp.i_best] if None in ccfs: raise Exception('none type ccf read from %s' % self.dirs[version_stype] + '/' + ptest + '.csv') self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'] = ccfs if debug: print ' %5.2f %5.2f %-28s to true partition' % (self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'], ptest)
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
    """
    Make partition plots in <plotdir>, from either one partition or a list of partition files.

    Each subdir in self.subplotdirs is first prepared with utils.prep_dir(). Then:
      - if <partition> is set (<infiles> must be None and <annotations> must be set), its cluster
        size hist is used, and self.plot_within_vs_between_hists() is called
      - otherwise, if <infiles> is set, the best partition is read from each file, each file's hist
        is written to <plotdir>/subset-<i>-cluster-sizes.csv, and the mean hist is used
    The resulting hist is plotted to <plotdir>/overall/cluster-sizes.svg, and unless <only_csv> is
    set, html is made for each subdir. Prints the elapsed time at the end.
    """
    def best_partition_hist(fname):
        # cluster size hist for the best partition in <fname>
        cpath = ClusterPath()
        cpath.readfile(fname)
        return plotting.get_cluster_size_hist(cpath.partitions[cpath.i_best])

    print(' plotting partitions')
    sys.stdout.flush()
    t_start = time.time()

    for subdir in self.subplotdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

    if partition is not None:  # one partition
        assert infiles is None
        assert annotations is not None
        csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
        self.plot_within_vs_between_hists(partition, annotations, plotdir)
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = [best_partition_hist(fname) for fname in infiles]
        csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
        for ihist, hist in enumerate(subset_hists):
            hist.write(plotdir + ('/subset-%d-cluster-sizes.csv' % ihist))
    else:
        assert False

    plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')

    if not only_csv:
        for subdir in self.subplotdirs:
            plotting.make_html(plotdir + '/' + subdir)

    elapsed = time.time() - t_start
    print('(%.1f sec)' % elapsed)
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
    """
    Make partition plots in <plotdir>, from either one partition or a list of partition files.

    Each subdir in self.subplotdirs is first prepared with utils.prep_dir(). Then:
      - if <partition> is set (<infiles> must be None and <annotations> must be set), its cluster
        size hist is used, and the file names returned by self.plot_size_vs_shm() are collected
      - otherwise, if <infiles> is set, the best partition is read from each file, each file's hist
        is written to <plotdir>/subset-<i>-cluster-sizes.csv, and the mean hist is used
    The resulting hist is plotted to <plotdir>/overall/cluster-sizes.svg, and unless <only_csv> is
    set, html listing the collected file names is made for each subdir. Prints the elapsed time at
    the end.
    """
    import plotting

    def best_partition_hist(fname):
        # cluster size hist for the best partition in <fname>
        cpath = ClusterPath()
        cpath.readfile(fname)
        return plotting.get_cluster_size_hist(cpath.partitions[cpath.i_best])

    print(' plotting partitions')
    sys.stdout.flush()
    t_start = time.time()

    for subdir in self.subplotdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

    fnames = []  # rows of plot file names that get passed to make_html()
    if partition is not None:  # one partition
        assert infiles is None
        assert annotations is not None
        csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
        # NOTE self.plot_within_vs_between_hists(partition, annotations, plotdir) is not currently called here
        fnames += self.plot_size_vs_shm(partition, annotations, plotdir)
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = [best_partition_hist(fname) for fname in infiles]
        csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
        for ihist, hist in enumerate(subset_hists):
            hist.write(plotdir + ('/subset-%d-cluster-sizes.csv' % ihist))
    else:
        assert False

    plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')
    fnames.append(['cluster-sizes.svg'])

    if not only_csv:
        for subdir in self.subplotdirs:
            plotting.make_html(plotdir + '/' + subdir, fnames=fnames, new_table_each_row=True)

    elapsed = time.time() - t_start
    print('(%.1f sec)' % elapsed)
def read_partition_performance(self, version_stype, input_stype, debug=False): """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """ ptest = 'partition-' + input_stype + '-simu' if args.quick and ptest not in self.quick_tests: return if debug: print ' version %s input %s partitioning' % (version_stype, input_stype) print ' precision sensitivity test description' for ptest in [k for k in self.tests.keys() if 'partition' in k and input_stype in k]: if args.quick and ptest not in self.quick_tests: continue cp = ClusterPath(-1) cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv') if 'data' in ptest: raise Exception('needs fixing') # ref_cp = ClusterPath(-1) # ref_cp.readfile(self.dirs['xxxref'] + '/' + ptest + '.csv') # self.perf_info['xxx'][ptest] = utils.adjusted_mutual_information(cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best]) # adj mi between the reference and the new data partitions # if debug: # print ' %5.2f %-28s to reference partition' % (self.perf_info['xxx'][ptest], ptest) else: self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = cp.ccfs[cp.i_best] if debug: print ' %5.2f %5.2f %-28s to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
#!/usr/bin/env python
# Example script: parse one annotation from an annotation csv, then print the partitions in a partition csv.
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')  # has to happen before the partis imports below
import utils
import glutils
from clusterpath import ClusterPath

reference_dir = partis_path + '/test/reference-results'

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print('first parse an annotation csv file:')
with open(reference_dir + '/annotate-new-simu.csv') as annotation_file:
    for line in csv.DictReader(annotation_file):
        if line['v_gene'] != '':  # an empty v gene means it failed (i.e. couldn't find an annotation), so keep looking
            utils.process_input_line(line)
            utils.add_implicit_info(glfo, line)
            utils.print_reco_event(line)
            break  # only print the first successful annotation

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(reference_dir + '/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
#!/usr/bin/env python
# Print the partitions in a partition csv file, passing along simulation info if a simulation file is given.
import argparse
import csv
import sys

sys.path.insert(1, './python')  # has to happen before the partis imports below
csv.field_size_limit(sys.maxsize)  # raise the csv field size limit so very large fields are ok

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true',
                    help="Print full seq IDs (otherwise just prints an 'o')")
parser.add_argument('--n-to-print', type=int,
                    help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

# reco info is only available if we were given a simulation file
reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

abbreviate = not args.dont_abbreviate
cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(abbreviate=abbreviate, n_to_print=args.n_to_print, reco_info=reco_info)
parser = argparse.ArgumentParser() parser.add_argument('--infile') parser.add_argument('--locus') parser.add_argument('--param') parser.add_argument('--nclust') args = parser.parse_args() glfo = glutils.read_glfo(args.param + '/hmm/germline-sets', locus=args.locus) print(sys.argv) print 'infile =', args.infile print 'param =', args.param cp = ClusterPath() cp.readfile(args.infile) best_partition = cp.partitions[cp.i_best] # sorted_clusters = sorted(best_partition, key=len, reverse=True) # sort by size # clonal family attributes to print print ''' score = interest score, indicating interesting attributes: size, SHM, SFS, bnAb VH usage Size & SHM: 4 points for rank in top 25 3 points for rank 25-50 2 points for rank 50-75 1 point for rank 75-100 SFS (Fay Wu H) scores earning 4-1 points: < -20, -10, 0, 10
# Example script: parse one annotation from an annotation csv (and check its cdr3), then print the partitions in a partition csv.
import csv  # needed for csv.DictReader below
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')  # has to happen before the partis imports below
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print('first parse an annotation csv file:')
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)

        # slice the cdr3 out of the naive and (first) indel-reversed sequences, using the v and j codon positions
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)  # presumably +3 so the j codon is included -- confirm
        print('')
        print(' should match the above:')
        print(' %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]])
        print(' %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]])
        print('')
        break  # only look at the first line

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)