def make_cluster_size_distribution(self, base_plotdir, partition=None, infiles=None):
    subd, plotdir = self.init_subd('sizes', base_plotdir)

    if partition is not None:  # one partition
        csize_hists = {'best': self.plotting.get_cluster_size_hist(partition)}
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = []
        for fname in infiles:
            cp = ClusterPath()
            cp.readfile(fname)
            subset_hists.append(self.plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
        csize_hists = {'best': self.plotting.make_mean_hist(subset_hists)}
        for ih in range(len(subset_hists)):
            subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
    else:
        assert False

    self.plotting.plot_cluster_size_hists(plotdir + '/cluster-sizes.svg', csize_hists, title='', log='x')
    return [[subd + '/cluster-sizes.svg']]
def read_partition_performance(self, version_stype, input_stype, debug=False):
    """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
    def do_this_test(pt):
        if 'partition' not in pt:
            return False
        if input_stype not in pt:
            return False
        if args.quick and pt not in self.quick_tests:
            return False
        return True

    ptest_list = [k for k in self.tests.keys() if do_this_test(k)]
    if len(ptest_list) == 0:
        return
    if debug:
        print ' version %s input %s partitioning' % (version_stype, input_stype)
        print ' precision sensitivity test description'
    for ptest in ptest_list:
        cp = ClusterPath(-1)
        cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
        ccfs = cp.ccfs[cp.i_best]
        if None in ccfs:
            raise Exception('none type ccf read from %s' % self.dirs[version_stype] + '/' + ptest + '.csv')
        self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = ccfs
        if debug:
            print ' %5.2f %5.2f %-28s to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
def read_partition_performance(self, version_stype, input_stype, debug=False):
    """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
    def do_this_test(pt):
        if 'partition' not in pt:
            return False
        if input_stype not in pt:
            return False
        if args.quick and pt not in self.quick_tests:
            return False
        return True

    ptest_list = [k for k in self.tests.keys() if do_this_test(k)]
    if len(ptest_list) == 0:
        return
    if debug:
        print ' version %s input %s partitioning' % (version_stype, input_stype)
        print ' purity completeness test description'
    for ptest in ptest_list:
        cp = ClusterPath(-1)
        cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
        ccfs = cp.ccfs[cp.i_best]
        if None in ccfs:
            raise Exception('none type ccf read from %s' % self.dirs[version_stype] + '/' + ptest + '.csv')
        self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'] = ccfs
        if debug:
            print ' %5.2f %5.2f %-28s to true partition' % (self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'], ptest)
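# The ccf pair read above ('purity', 'completeness') summarizes how well an inferred partition matches
# the true one. As a rough, hedged illustration only (partis's actual ccf calculation is not shown in
# this section and may use a per-sequence rather than per-pair definition), here is a simple pair-based
# version of the two quantities; the function name is made up for this sketch:
import itertools

def pair_purity_completeness(inferred_partition, true_partition):
    """ purity: fraction of same-cluster pairs in the inferred partition that are truly clonal;
        completeness: fraction of truly clonal pairs that the inferred partition puts in the same cluster """
    def pair_set(partition):
        pairs = set()
        for cluster in partition:
            for a, b in itertools.combinations(sorted(cluster), 2):
                pairs.add((a, b))
        return pairs
    inferred_pairs, true_pairs = pair_set(inferred_partition), pair_set(true_partition)
    purity = len(inferred_pairs & true_pairs) / float(len(inferred_pairs)) if len(inferred_pairs) > 0 else 1.
    completeness = len(inferred_pairs & true_pairs) / float(len(true_pairs)) if len(true_pairs) > 0 else 1.
    return purity, completeness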
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
    print ' plotting partitions'
    sys.stdout.flush()
    start = time.time()
    for subdir in self.subplotdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

    if partition is not None:  # one partition
        assert infiles is None
        assert annotations is not None
        csize_hists = {'best' : plotting.get_cluster_size_hist(partition)}
        self.plot_within_vs_between_hists(partition, annotations, plotdir)
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = []
        for fname in infiles:
            cp = ClusterPath()
            cp.readfile(fname)
            subset_hists.append(plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
        csize_hists = {'best' : plotting.make_mean_hist(subset_hists)}
        for ih in range(len(subset_hists)):
            subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
    else:
        assert False

    plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')

    if not only_csv:
        for subdir in self.subplotdirs:
            plotting.make_html(plotdir + '/' + subdir)

    print '(%.1f sec)' % (time.time()-start)
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
    import plotting
    print ' plotting partitions'
    sys.stdout.flush()
    start = time.time()
    for subdir in self.subplotdirs:
        utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

    fnames = []
    if partition is not None:  # one partition
        assert infiles is None
        assert annotations is not None
        csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
        # self.plot_within_vs_between_hists(partition, annotations, plotdir)
        fnames += self.plot_size_vs_shm(partition, annotations, plotdir)
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = []
        for fname in infiles:
            cp = ClusterPath()
            cp.readfile(fname)
            subset_hists.append(plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
        csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
        for ih in range(len(subset_hists)):
            subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
    else:
        assert False

    plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')
    fnames.append(['cluster-sizes.svg'])

    if not only_csv:
        for subdir in self.subplotdirs:
            plotting.make_html(plotdir + '/' + subdir, fnames=fnames, new_table_each_row=True)

    print '(%.1f sec)' % (time.time() - start)
def read_file_info(self, infname, n_paths, calc_adj_mi):
    paths = [None for _ in range(n_paths)]
    with opener('r')(infname) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            if line['partition'] == '':
                raise Exception('ERROR null partition (one of the processes probably got passed zero sequences)')  # shouldn't happen any more FLW
            uids = []
            for cluster in line['partition'].split(';'):
                uids.append([unique_id for unique_id in cluster.split(':')])
            path_index = int(line['path_index'])
            if paths[path_index] is None:
                paths[path_index] = ClusterPath(int(line['initial_path_index']))
            else:
                assert paths[path_index].initial_path_index == int(line['initial_path_index'])
            n_procs = int(line['n_procs']) if 'n_procs' in line else 1
            logweight = float(line['logweight']) if 'logweight' in line else None
            adj_mi = -1
            if calc_adj_mi:
                adj_mi = utils.mutual_information(uids, self.reco_info, debug=False) if self.reco_info is not None else -1
            paths[path_index].add_partition(uids, float(line['logprob']), n_procs=n_procs, logweight=logweight, adj_mi=adj_mi)

    for cp in paths:
        if cp is None:
            raise Exception('None type path read from %s' % infname)
        for ptn in cp.partitions:
            if len(ptn) == 0:
                raise Exception('zero length partition read from %s' % infname)

    return paths
def make_cluster_size_distribution(self, base_plotdir, partition=None, infiles=None):
    subd, plotdir = self.init_subd('sizes', base_plotdir)

    if partition is not None:  # one partition
        csize_hists = {'best': self.plotting.get_cluster_size_hist(partition)}
    elif infiles is not None:  # plot the mean of a partition from each file
        subset_hists = []
        for fname in infiles:
            cp = ClusterPath(fname=fname)
            subset_hists.append(self.plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
        csize_hists = {'best': self.plotting.make_mean_hist(subset_hists)}
        for ih in range(len(subset_hists)):
            subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
    else:
        assert False

    fname = 'cluster-sizes'
    if infiles is not None:
        print '%s should probably rewrite this to integrate with the below' % utils.color('red', 'note')
        self.plotting.plot_cluster_size_hists(plotdir + '/' + fname + '.svg', csize_hists, title='', log='x')
    else:
        fig, ax = self.plotting.mpl_init()
        for label, hist in csize_hists.items():
            hist.mpl_plot(ax, remove_empty_bins=True, label=label if len(csize_hists) > 1 else None)
        csizes = sorted([len(c) for c in partition])
        xticks = [x for x in numpy.logspace(math.log(csizes[0], 10), math.log(csizes[-1], 10), num=5)]
        def tstr(xt):
            return ('%.0f' % xt) if xt < 500 else '%.0e' % xt
        self.plotting.mpl_finish(ax, plotdir, fname, xlabel='cluster size', ylabel='number of clusters', log='xy', xticks=xticks, xticklabels=[tstr(x) for x in xticks])

    return [[subd + '/cluster-sizes.svg']]
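# Both versions above lean on self.plotting.get_cluster_size_hist(), which isn't shown in this section.
# As a minimal, illustrative stand-in (not the Hist class the code actually uses): a partition is just a
# list of clusters, so the cluster size distribution amounts to a one-line count.
from collections import Counter

def cluster_size_counts(partition):  # hypothetical helper, for illustration only
    """ map from cluster size to the number of clusters of that size """
    return Counter(len(cluster) for cluster in partition)

# e.g. cluster_size_counts([['a', 'b'], ['c'], ['d', 'e']]) --> Counter({2: 2, 1: 1})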
def read_partition_performance(self, version_stype, input_stype, debug=False):
    """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
    ptest = "partition-" + input_stype + "-simu"
    if args.quick and ptest not in self.quick_tests:
        return
    if debug:
        print " version %s input %s partitioning" % (version_stype, input_stype)
        print " adj mi ccf under/over test description"
    for ptest in [k for k in self.tests.keys() if "partition" in k and input_stype in k]:
        if args.quick and ptest not in self.quick_tests:
            continue
        cp = ClusterPath(-1)
        cp.readfile(self.dirs[version_stype] + "/" + ptest + ".csv")
        if "data" in ptest:
            raise Exception("needs fixing")
            ref_cp = ClusterPath(-1)
            ref_cp.readfile(self.dirs["xxxref"] + "/" + ptest + ".csv")
            self.perf_info["xxx"][ptest] = utils.adjusted_mutual_information(cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best])  # adj mi between the reference and the new data partitions
            if debug:
                print " %5.2f %-28s to reference partition" % (self.perf_info["xxx"][ptest], ptest)
        else:
            self.perf_info[version_stype][ptest + "-adj_mi"] = cp.adj_mis[cp.i_best]  # adj mi to true partition
            self.perf_info[version_stype][ptest + "-ccf_under"], self.perf_info[version_stype][ptest + "-ccf_over"] = cp.ccfs[cp.i_best]
            if debug:
                print " %5.2f %5.2f %5.2f %-28s to true partition" % (self.perf_info[version_stype][ptest + "-adj_mi"], self.perf_info[version_stype][ptest + "-ccf_under"], self.perf_info[version_stype][ptest + "-ccf_over"], ptest)
def read_partition_performance(self, version_stype, input_stype, debug=False):
    """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
    ptest = 'partition-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return
    if debug:
        print ' version %s input %s partitioning' % (version_stype, input_stype)
        print ' precision sensitivity test description'
    for ptest in [k for k in self.tests.keys() if 'partition' in k and input_stype in k]:
        if args.quick and ptest not in self.quick_tests:
            continue
        cp = ClusterPath(-1)
        cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
        if 'data' in ptest:
            raise Exception('needs fixing')
            # ref_cp = ClusterPath(-1)
            # ref_cp.readfile(self.dirs['xxxref'] + '/' + ptest + '.csv')
            # self.perf_info['xxx'][ptest] = utils.adjusted_mutual_information(cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best])  # adj mi between the reference and the new data partitions
            # if debug:
            #     print ' %5.2f %-28s to reference partition' % (self.perf_info['xxx'][ptest], ptest)
        else:
            self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = cp.ccfs[cp.i_best]
            if debug:
                print ' %5.2f %5.2f %-28s to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
def read_file_info(self, infname, n_paths):
    paths = [None for _ in range(n_paths)]
    lines_list = [[] for _ in range(n_paths)]
    with open(infname, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            if line['partition'] == '':
                print ' %s null partition (one of the processes probably got passed zero sequences)' % utils.color('red', 'warning')
                return paths
            path_index = int(line['path_index']) if 'path_index' in line else 0
            initial_path_index = int(line['initial_path_index']) if 'initial_path_index' in line else 0
            if paths[path_index] is None:  # is this the first line for this path?
                paths[path_index] = ClusterPath(initial_path_index, seed_unique_id=self.seed_unique_id)  # NOTE I may have screwed up the initial_path_index/path_index distinction here... it's been too long since I wrote the smc stuff and I'm not sure
            else:
                assert paths[path_index].initial_path_index == initial_path_index
            lines_list[path_index].append(line)

    if paths.count(None) > 0:
        raise Exception('couldn\'t find the required number of paths in file %s' % infname)

    for path_index in range(n_paths):
        paths[path_index].readlines(lines_list[path_index], process_csv=True)

    for cp in paths:
        if cp is None:
            raise Exception('None type path read from %s' % infname)
        for ptn in cp.partitions:
            if len(ptn) == 0:
                raise Exception('zero length partition read from %s' % infname)

    return paths
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
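# Once the ClusterPath has been read as above, the best partition is just a list of clusters (each a
# list of sequence ids), so it can be inspected directly. A small follow-on sketch using only the
# attributes that already appear in these snippets (cp.partitions and cp.i_best):
best_partition = cp.partitions[cp.i_best]
print '%d clusters in the best partition, largest has %d sequences' % (len(best_partition), max(len(c) for c in best_partition))
for cluster in sorted(best_partition, key=len, reverse=True)[:3]:  # look at the three largest clusters
    print '  %3d  %s' % (len(cluster), ':'.join(cluster))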
def merge_fileinfos(self, fileinfos, smc_particles, previous_info=None, debug=False):
    self.paths = [ClusterPath(None, seed_unique_id=self.seed_unique_id) for _ in range(smc_particles)]  # each path's initial_path_index is None since we're merging paths that, in general, have different initial path indices
    # DEAR FUTURE SELF this won't make any sense until you find that picture you took of the white board
    if previous_info is not None and smc_particles > 1:  # if we're doing smc, this has to happen *beforehand*, since the previous paths are separate for each process (cont'd at XX)
        assert len(previous_info) == len(fileinfos)  # both are the number of processes we're merging into one
        # it would be nice to prevent this from adding duplicate adjacent partitions (well... not that important)
        if debug:
            print 'prepend previous history'
        for ifile in range(len(fileinfos)):
            if debug:
                print 'ifile', ifile
            for ipath in range(smc_particles):
                if debug:
                    print ' ipath', ipath
                    print ' before'
                    fileinfos[ifile][ipath].print_partitions(self.reco_info)
                initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
                previous_path = previous_info[ifile][initial_path_index]
                current_path = fileinfos[ifile][ipath]
                # first_new_logprob = current_path.logprobs[0]
                extended_path = ClusterPath(None, seed_unique_id=self.seed_unique_id)
                for ip in range(len(previous_path.partitions)):
                    # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                    #     continue
                    extended_path.add_partition(list(previous_path.partitions[ip]), previous_path.logprobs[ip], previous_path.n_procs[ip], logweight=previous_path.logweights[ip])
                for ip in range(len(current_path.partitions)):
                    extended_path.add_partition(list(current_path.partitions[ip]), current_path.logprobs[ip], current_path.n_procs[ip], logweight=current_path.logweights[ip])
                fileinfos[ifile][ipath] = extended_path
                fileinfos[ifile][ipath].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorical factors in the later partitions by the factors from the earlier partitions
                if debug:
                    print ' after'
                    fileinfos[ifile][ipath].print_partitions(self.reco_info)

    # do the actual process-merging
    for ipath in range(smc_particles):
        if debug and len(fileinfos) > 1:
            print 'merge path %d from %d processes:' % (ipath, len(fileinfos))
            for ifile in range(len(fileinfos)):
                fileinfos[ifile][ipath].print_partitions(self.reco_info, extrastr=('%d' % (ifile)))
                print ''

        # merge all the steps in each path
        def last_one():
            last = True
            for ifile in range(len(fileinfos)):  # we're finished when all the files are out of glomeration steps (i.e. they all only have one [the last] line left)
                last &= len(fileinfos[ifile][ipath].partitions) == 1
            return last

        def remove_one_of_the_first_partitions():
            maxdelta, ibestfile = None, None
            for ifile in range(len(fileinfos)):
                if len(fileinfos[ifile][ipath].partitions) == 1:  # if this is the last line (i.e. there aren't any more glomeration steps in this file), leave it alone
                    continue
                thisdelta = fileinfos[ifile][ipath].logprobs[1] - fileinfos[ifile][ipath].logprobs[0]  # logprob difference between the next partition and this one
                if maxdelta is None or thisdelta > maxdelta:
                    maxdelta = thisdelta
                    ibestfile = ifile
            # print ' ibest %d with %f - %f = %f' % (ibestfile, fileinfos[ibestfile][ipath].logprobs[1], fileinfos[ibestfile][ipath].logprobs[0], fileinfos[ibestfile][ipath].logprobs[1] - fileinfos[ibestfile][ipath].logprobs[0])
            fileinfos[ibestfile][ipath].remove_partition(0)

        def add_next_global_partition():
            global_partition = []
            global_logprob = 0.
            for ifile in range(len(fileinfos)):  # combine the first line in each file to make a global partition
                for cluster in fileinfos[ifile][ipath].partitions[0]:
                    global_partition.append(list(cluster))
                global_logprob += fileinfos[ifile][ipath].logprobs[0]
            self.paths[ipath].add_partition(global_partition, global_logprob, n_procs=len(fileinfos), logweight=0.)  # don't know the logweight yet (or maybe at all!)

        while not last_one():
            add_next_global_partition()
            remove_one_of_the_first_partitions()
        add_next_global_partition()

        if smc_particles > 1:
            self.paths[ipath].set_synthetic_logweight_history(self.reco_info)
        if debug:
            print ' merged path %d with %d glomeration steps and %d final clusters' % (ipath, len(self.paths[ipath].partitions), len(self.paths[ipath].partitions[-1]))
            self.paths[ipath].print_partitions(self.reco_info)

    if smc_particles == 1:  # XX: ...whereas if we're *not* doing smc, we have to add the previous histories *afterward*, since the previous histories are all in one piece
        if previous_info is None:
            if debug:
                print ' no previous history'
        else:
            # it would be nice to prevent this from adding duplicate adjacent partitions
            if debug:
                print 'prepend previous history'
            if debug:
                print ' before'
                assert len(self.paths) == 1  # in case gremlins sneak in and add some between lines of code
                self.paths[0].print_partitions(self.reco_info)
            # initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
            previous_path = previous_info
            current_path = self.paths[0]
            # first_new_logprob = UPDATEME current_path.logprobs[0]
            extended_path = ClusterPath(None, seed_unique_id=self.seed_unique_id)
            for ip in range(len(previous_path.partitions)):
                # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                #     continue
                extended_path.add_partition(list(previous_path.partitions[ip]), previous_path.logprobs[ip], previous_path.n_procs[ip], logweight=previous_path.logweights[ip])
            for ip in range(len(current_path.partitions)):
                extended_path.add_partition(list(current_path.partitions[ip]), current_path.logprobs[ip], current_path.n_procs[ip], logweight=current_path.logweights[ip])
            self.paths[0] = extended_path
            # self.paths[0].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorical factors in the later partitions by the factors from the earlier partitions
            if debug:
                print ' after'
                self.paths[0].print_partitions(self.reco_info)
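# The process-merging loop above works, in outline, like this: at each step it concatenates the current
# first partition from every file into one global partition, then drops the first partition from whichever
# file gains the most log probability by advancing to its next glomeration step. A standalone sketch of
# that idea, with plain dicts standing in for ClusterPath (illustrative only -- the function and dict keys
# here are made up, not the class used above):
def merge_glomeration_paths(paths):
    """ each <path> is {'partitions': [...], 'logprobs': [...]}, ordered from most clusters to fewest """
    merged = []
    def all_finished():  # every path is down to its last partition
        return all(len(p['partitions']) == 1 for p in paths)
    def add_global_partition():  # concatenate the first partition (and sum the first logprob) from each path
        global_partition, global_logprob = [], 0.
        for p in paths:
            global_partition += [list(c) for c in p['partitions'][0]]
            global_logprob += p['logprobs'][0]
        merged.append((global_partition, global_logprob))
    def advance_best_path():  # discard the first partition from the path with the largest logprob gain
        deltas = [p['logprobs'][1] - p['logprobs'][0] if len(p['partitions']) > 1 else None for p in paths]
        ibest = max((i for i in range(len(paths)) if deltas[i] is not None), key=lambda i: deltas[i])
        paths[ibest]['partitions'].pop(0)
        paths[ibest]['logprobs'].pop(0)
    while not all_finished():
        add_global_partition()
        advance_best_path()
    add_global_partition()
    return merged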
#!/usr/bin/env python
import sys
sys.path.insert(1, './python')
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
import argparse
from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(abbreviate=(not args.dont_abbreviate), n_to_print=args.n_to_print, reco_info=reco_info)
import argparse
import sys
import glutils
from clusterpath import ClusterPath

parser = argparse.ArgumentParser()
parser.add_argument('--infile')
parser.add_argument('--locus')
parser.add_argument('--param')
parser.add_argument('--nclust')
args = parser.parse_args()

glfo = glutils.read_glfo(args.param + '/hmm/germline-sets', locus=args.locus)
print(sys.argv)
print 'infile =', args.infile
print 'param =', args.param

cp = ClusterPath()
cp.readfile(args.infile)
best_partition = cp.partitions[cp.i_best]
# sorted_clusters = sorted(best_partition, key=len, reverse=True)  # sort by size

# clonal family attributes to print
print '''
    score = interest score, indicating interesting attributes: size, SHM, SFS, bnAb VH usage
        Size & SHM:
            4 points for rank in top 25
            3 points for rank 25-50
            2 points for rank 50-75
            1 point for rank 75-100
def run_bios2mds(n_components, n_clusters, seqfos, base_workdir, seed, aligned=False, reco_info=None, region=None, max_runs=100, max_iterations=1000, method='euclidean', plotdir=None, plotname='mds', queries_to_include=None, color_scale_vals=None, labels=None, title=None, remove_duplicates=False, debug=False):
    workdir = base_workdir + '/mds'
    msafname = workdir + '/msa.fa'
    mdsfname = workdir + '/components.txt'
    clusterfname = workdir + '/clusters.txt'
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    if len(set([sfo['seq'] for sfo in seqfos])) < len(seqfos):  # it'll just crash when it's running mds later, but this is faster
        if remove_duplicates:
            seq_groups = [list(group) for _, group in itertools.groupby(sorted(seqfos, key=lambda x: x['seq']), key=lambda x: x['seq'])]
            seqs_to_remove = []
            for sgroup in seq_groups:
                seqs_to_remove += [sfo['name'] for sfo in sgroup[1:]]  # remove any after the first one
            seqfos = [sfo for sfo in seqfos if sfo['name'] not in seqs_to_remove]
        else:
            raise Exception('duplicate sequences in seqfos')

    if aligned:  # NOTE unlike the sklearn version below, this doesn't modify <seqfos>
        with open(msafname, 'w') as fastafile:
            for sfo in seqfos:
                fastafile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))
    else:
        utils.align_many_seqs(seqfos, outfname=msafname)

    # build the R cmd file
    cmdlines = [
        'options(rgl.useNULL=TRUE)',
        'require(bios2mds, quietly=TRUE)',
        'set.seed(%d)' % seed,
        'human <- import.fasta("%s")' % msafname,
        'active <- mat.dif(human, human)',  # mat.dif or mat.dis?
    ]

    if n_components is not None:
        cmdlines += ['mmds_active <- mmds(active, pc=%d)' % n_components]
        cmdlines += ['capture.output(mmds_active$coord, file="%s")' % mdsfname]
    else:
        raise Exception('need to implement')

    if n_clusters is not None:
        cmdlines += [
            'kmeans.run1 <- kmeans.run(mmds_active$coord, nb.clus=%d, nb.run=%d, iter.max=%d, method="%s")' % (n_clusters, max_runs, max_iterations, method),
            # 'kmeans.run1$clusters',
            # 'kmeans.run1$elements',
            'options(width=10000)',
            'capture.output(kmeans.run1$clusters, file="%s")' % clusterfname,
            # sil.score(mat, nb.clus = c(2:13), nb.run = 100, iter.max = 1000,  # run for every possible number of clusters (?)
            #           method = "euclidean")
            # random.msa  # builds a random [...]
        ]

    rstart = time.time()
    try:
        utils.run_r(cmdlines, workdir)  # , print_time='kmeans')
    except subprocess.CalledProcessError as e:  # typically happens because of complex eigenvalues
        print e
        print ' mds failed on cluster'  # NOTE will still crash in read_kmeans_clusterfile(), but I'm not using that a.t.m.
        title = (title if title is not None else '') + ' mds failed'
    pcvals = read_component_file(mdsfname, n_components, seqfos)
    partition = read_kmeans_clusterfile(clusterfname, seqfos) if n_clusters is not None else None
    rstop = time.time()

    if debug and partition is not None:
        print ' kmeans partition:'
        cp = ClusterPath(partition=partition)
        cp.print_partitions(abbreviate=False)

    os.remove(msafname)
    os.rmdir(workdir)

    plotstart = time.time()
    if plotdir is not None:
        # utils.prep_dir(plotdir, wildlings=['*.svg'])
        plot_mds(n_components, pcvals, plotdir, plotname, partition=partition if n_clusters is not None else None, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, labels=labels, title=title)
        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, title=title)

    if not debug:  # this isn't a great way to do this, but I don't want to deal with finding all the calling functions, I just want to add some debug printing to this fcn
        print ' %5.1f %5.1f' % (rstop - rstart, time.time() - plotstart),

    return partition
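# run_bios2mds() above shells out to R/bios2mds, and its comments mention an sklearn-based alternative
# that isn't shown in this section. A rough sketch of the same MDS-plus-kmeans idea with scikit-learn
# (illustrative only -- the function name is made up, and it assumes the input sequences are already
# aligned to equal length):
import numpy
from sklearn.manifold import MDS
from sklearn.cluster import KMeans

def sklearn_mds_kmeans(seqfos, n_components, n_clusters, seed):
    """ embed sequences with metric MDS on pairwise hamming distances, then kmeans-cluster the embedding """
    seqs = [sfo['seq'] for sfo in seqfos]
    dmat = numpy.array([[sum(c1 != c2 for c1, c2 in zip(s1, s2)) for s2 in seqs] for s1 in seqs], dtype=float)
    coords = MDS(n_components=n_components, dissimilarity='precomputed', random_state=seed).fit_transform(dmat)
    labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(coords)
    partition = [[] for _ in range(n_clusters)]
    for sfo, ilabel in zip(seqfos, labels):
        partition[ilabel].append(sfo['name'])
    return coords, partition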
        return None
    return hdist

# ----------------------------------------------------------------------------------------
def cdr3_translation(info):
    naive_cdr3_seq = naive_cdr3(info)
    naive_cdr3_seq = naive_cdr3_seq[3 : len(naive_cdr3_seq) - 3]
    if len(naive_cdr3_seq) % 3 != 0:
        # print ' out of frame: adding %s' % ((3 - len(naive_cdr3_seq) % 3) * 'N')
        naive_cdr3_seq += (3 - len(naive_cdr3_seq) % 3) * 'N'
    return utils.ltranslate(naive_cdr3_seq)

# ----------------------------------------------------------------------------------------
cpaths = [ClusterPath() for _ in range(len(args.infiles))]
for ifile in range(len(args.infiles)):
    cpaths[ifile].readfile(args.infiles[ifile])
partitions = [sorted(cp.partitions[cp.i_best], key=len, reverse=True) for cp in cpaths]
repertoire_sizes = [sum([len(c) for c in partition]) for partition in partitions]
min_inner_sizes = [args.min_inner_size if args.min_inner_rep_frac is None else args.min_inner_rep_frac * repertoire_sizes[isample] for isample in range(len(args.infiles))]
min_outer_sizes = [
def merge_fileinfos(self, fileinfos, smc_particles, previous_info=None, debug=False):
    self.paths = [ClusterPath(None, seed_unique_id=self.seed_unique_id) for _ in range(smc_particles)]  # each path's initial_path_index is None since we're merging paths that, in general, have different initial path indices
    # DEAR FUTURE SELF this won't make any sense until you find that picture you took of the white board
    if previous_info is not None and smc_particles > 1:  # if we're doing smc, this has to happen *beforehand*, since the previous paths are separate for each process (cont'd at XX)
        assert len(previous_info) == len(fileinfos)  # both are the number of processes we're merging into one
        # TODO prevent this from adding duplicate adjacent partitions (well... not that important)
        if debug:
            print 'prepend previous history'
        for ifile in range(len(fileinfos)):
            if debug:
                print 'ifile', ifile
            for ipath in range(smc_particles):
                if debug:
                    print ' ipath', ipath
                    print ' before'
                    fileinfos[ifile][ipath].print_partitions(self.reco_info)
                initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
                previous_path = previous_info[ifile][initial_path_index]
                current_path = fileinfos[ifile][ipath]
                # first_new_logprob = current_path.logprobs[0]
                extended_path = ClusterPath(None, seed_unique_id=self.seed_unique_id)
                for ip in range(len(previous_path.partitions)):
                    # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                    #     continue
                    extended_path.add_partition(list(previous_path.partitions[ip]), previous_path.logprobs[ip], previous_path.n_procs[ip], logweight=previous_path.logweights[ip], adj_mi=previous_path.adj_mis[ip])
                for ip in range(len(current_path.partitions)):
                    extended_path.add_partition(list(current_path.partitions[ip]), current_path.logprobs[ip], current_path.n_procs[ip], logweight=current_path.logweights[ip], adj_mi=current_path.adj_mis[ip])
                fileinfos[ifile][ipath] = extended_path
                fileinfos[ifile][ipath].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorical factors in the later partitions by the factors from the earlier partitions
                if debug:
                    print ' after'
                    fileinfos[ifile][ipath].print_partitions(self.reco_info)

    # do the actual process-merging
    for ipath in range(smc_particles):
        if debug and len(fileinfos) > 1:
            print 'merge path %d from %d processes:' % (ipath, len(fileinfos))
            for ifile in range(len(fileinfos)):
                fileinfos[ifile][ipath].print_partitions(self.reco_info, extrastr=('%d' % (ifile)))
                print ''

        # merge all the steps in each path
        def last_one():
            last = True
            for ifile in range(len(fileinfos)):  # we're finished when all the files are out of glomeration steps (i.e. they all only have one [the last] line left)
                last &= len(fileinfos[ifile][ipath].partitions) == 1
            return last

        def remove_one_of_the_first_partitions():
            maxdelta, ibestfile = None, None
            for ifile in range(len(fileinfos)):
                if len(fileinfos[ifile][ipath].partitions) == 1:  # if this is the last line (i.e. there aren't any more glomeration steps in this file), leave it alone
                    continue
                thisdelta = fileinfos[ifile][ipath].logprobs[1] - fileinfos[ifile][ipath].logprobs[0]  # logprob difference between the next partition and this one
                if maxdelta is None or thisdelta > maxdelta:
                    maxdelta = thisdelta
                    ibestfile = ifile
            # print ' ibest %d with %f - %f = %f' % (ibestfile, fileinfos[ibestfile][ipath].logprobs[1], fileinfos[ibestfile][ipath].logprobs[0], fileinfos[ibestfile][ipath].logprobs[1] - fileinfos[ibestfile][ipath].logprobs[0])
            fileinfos[ibestfile][ipath].remove_first_partition()

        def add_next_global_partition():
            global_partition = []
            global_logprob = 0.
            for ifile in range(len(fileinfos)):  # combine the first line in each file to make a global partition
                for cluster in fileinfos[ifile][ipath].partitions[0]:
                    global_partition.append(list(cluster))
                global_logprob += fileinfos[ifile][ipath].logprobs[0]
            self.paths[ipath].add_partition(global_partition, global_logprob, n_procs=len(fileinfos), logweight=0.)  # don't know the logweight yet (or maybe at all!)

        while not last_one():
            add_next_global_partition()
            remove_one_of_the_first_partitions()
        add_next_global_partition()

        if smc_particles > 1:
            self.paths[ipath].set_synthetic_logweight_history(self.reco_info)
        if debug:
            print ' merged path %d with %d glomeration steps and %d final clusters' % (ipath, len(self.paths[ipath].partitions), len(self.paths[ipath].partitions[-1]))
            self.paths[ipath].print_partitions(self.reco_info)

    if smc_particles == 1:  # XX: ...whereas if we're *not* doing smc, we have to add the previous histories *afterward*, since the previous histories are all in one piece
        if previous_info is None:
            if debug:
                print ' no previous history'
        else:
            # TODO prevent this from adding duplicate adjacent partitions
            if debug:
                print 'prepend previous history'
            if debug:
                print ' before'
                assert len(self.paths) == 1  # in case gremlins sneak in and add some between lines of code
                self.paths[0].print_partitions(self.reco_info)
            # initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
            previous_path = previous_info
            current_path = self.paths[0]
            # first_new_logprob = UPDATEME current_path.logprobs[0]
            extended_path = ClusterPath(None, seed_unique_id=self.seed_unique_id)
            for ip in range(len(previous_path.partitions)):
                # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                #     continue
                extended_path.add_partition(list(previous_path.partitions[ip]), previous_path.logprobs[ip], previous_path.n_procs[ip], logweight=previous_path.logweights[ip], adj_mi=previous_path.adj_mis[ip])
            for ip in range(len(current_path.partitions)):
                extended_path.add_partition(list(current_path.partitions[ip]), current_path.logprobs[ip], current_path.n_procs[ip], logweight=current_path.logweights[ip], adj_mi=current_path.adj_mis[ip])
            self.paths[0] = extended_path
            # self.paths[0].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorical factors in the later partitions by the factors from the earlier partitions
            if debug:
                print ' after'
                self.paths[0].print_partitions(self.reco_info)
import csv
import sys
partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print ' should match the above:'
        print ' %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print ' %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)