def print_true_partition(self):
    print '  true partition'
    print '   clonal?   ids'
    true_partition = utils.get_true_partition(self.reco_info)
    for cluster in true_partition:
        print '     %d    %s' % (utils.from_same_event(self.reco_info, cluster), ':'.join([str(uid) for uid in cluster]))
def print_partition(self, ip, reco_info=None, extrastr='', abbreviate=True, highlight_cluster_indices=None, print_cluster_indices=False, right_extrastr=''):
    # NOTE <highlight_cluster_indices> and <print_cluster_indices> are quite different despite sounding similar, but I can't think of a better name for the latter
    # NOTE it's nicer to *not* sort by cluster size here, since preserving the order tends to make it obvious which clusters are merging as your eye scans down through the output
    if ip > 0:  # delta between this logprob and the previous one
        delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip-1])
    else:
        delta_str = ''
    print '      %s %-12.2f%-7s   %s%-5d  %4d' % (extrastr, self.logprobs[ip], delta_str, ('%-5d  ' % ip) if print_cluster_indices else '', len(self.partitions[ip]), self.n_procs[ip]),
    print '    ' + self.get_ccf_str(ip),

    # clusters
    sorted_clusters = sorted(self.partitions[ip], key=lambda c: len(c), reverse=True)
    for iclust in range(len(sorted_clusters)):
        cluster = sorted_clusters[iclust]
        if abbreviate:
            cluster_str = ':'.join(['o' if len(uid) > 3 else uid for uid in cluster])
        else:
            # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
            cluster_str = ':'.join([str(uid) for uid in cluster])
        if reco_info is not None and not utils.from_same_event(reco_info, cluster):
            cluster_str = utils.color('red', cluster_str)
        if self.seed_unique_id is not None and self.seed_unique_id in cluster:
            cluster_str = utils.color('reverse_video', cluster_str)
        if highlight_cluster_indices is not None and iclust in highlight_cluster_indices:
            cluster_str = utils.color('red', cluster_str)
        if abbreviate:
            print ' %s' % cluster_str,
        else:
            print '   %s' % cluster_str,
    print '%s' % right_extrastr,
    print ''
def get_bad_clusters(self, partition, reco_info, true_partition):
    bad_clusters = []  # inferred clusters that aren't really all from the same event
    for ic in range(len(partition)):
        same_event = utils.from_same_event(reco_info, partition[ic])  # are all the sequences from the same event?
        entire_cluster = True  # ... and if so, are they the entire true cluster?
        if same_event:
            reco_id = reco_info[partition[ic][0]]['reco_id']  # they've all got the same reco_id, so pick an arbitrary one
            true_clusters = [cluster for cluster in true_partition if reco_info[cluster[0]]['reco_id'] == reco_id]  # NOTE I think this doesn't work right with shm indels in the cdr3
            assert len(true_clusters) == 1
            true_cluster = true_clusters[0]
            for uid in true_cluster:
                if uid not in partition[ic]:
                    entire_cluster = False
                    break
        else:
            entire_cluster = False
        if not same_event or not entire_cluster:
            bad_clusters.append(':'.join(partition[ic]))

    if len(bad_clusters) > 25:
        bad_clusters = ['too', 'long']

    return bad_clusters
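# A minimal standalone sketch of the same bad-cluster check, with toy data in
# place of the utils/reco_info machinery above (everything here -- the uids,
# the reco ids, and the from-same-event stand-in -- is hypothetical):
def toy_bad_clusters(partition, reco_ids):
    # reco_ids: uid --> rearrangement event id, a toy analog of reco_info[uid]['reco_id']
    true_clusters = {}
    for uid, rid in reco_ids.items():
        true_clusters.setdefault(rid, set()).add(uid)
    bad_clusters = []
    for cluster in partition:
        same_event = len(set(reco_ids[uid] for uid in cluster)) == 1  # stand-in for utils.from_same_event()
        entire_cluster = same_event and true_clusters[reco_ids[cluster[0]]] <= set(cluster)
        if not same_event or not entire_cluster:
            bad_clusters.append(':'.join(cluster))
    return bad_clusters

print toy_bad_clusters([['a', 'b'], ['c']], {'a' : 1, 'b' : 1, 'c' : 2})  # --> [] (each cluster pure and complete)
print toy_bad_clusters([['a'], ['b', 'c']], {'a' : 1, 'b' : 1, 'c' : 2})  # --> ['a', 'b:c'] (split event, then mixed events)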
def print_partition(self, ip, reco_info=None, extrastr='', one_line=True, abbreviate=True):
    if one_line:
        if ip > 0:  # delta between this logprob and the previous one
            delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip-1])
        else:
            delta_str = ''
        print '      %5s %-12.2f%-7s %-5d  %5d' % (extrastr, self.logprobs[ip], delta_str, len(self.partitions[ip]), self.n_procs[ip]),

        # logweight (and inverse of number of potential parents)
        if self.logweights[ip] is not None:
            way_str, logweight_str = '', ''
            expon = math.exp(self.logweights[ip])
            n_ways = 0 if expon == 0. else 1. / expon
            way_str = ('%.1f' % n_ways) if n_ways < 1e7 else ('%8.1e' % n_ways)
            logweight_str = '%8.3f' % self.logweights[ip]

        # adj mi
        if reco_info is not None:
            adj_mi_str = ''
            if self.adj_mis[ip] is None:
                adj_mi_str = 'na'
            else:
                if self.adj_mis[ip] > 1e-3:
                    adj_mi_str = '%-8.3f' % self.adj_mis[ip]
                else:
                    adj_mi_str = '%-8.0e' % self.adj_mis[ip]
            print '    %8s   ' % adj_mi_str,

        if self.logweights[ip] is not None:
            print '   %10s    %8s   ' % (way_str, logweight_str),
    else:
        print '  %5s partition   %-15.2f' % (extrastr, self.logprobs[ip]),
        if reco_info is not None:
            print '   %-8.2f' % self.adj_mis[ip],
        print ''
        print '   clonal?   ids'

    # clusters
    for cluster in self.partitions[ip]:
        same_event = utils.from_same_event(reco_info is None, reco_info, cluster)
        if same_event is None:
            same_event = -1

        if abbreviate:
            cluster_str = ':'.join(['o' for uid in cluster])
        else:
            cluster_str = ':'.join([str(uid) for uid in cluster])
        if not same_event:
            cluster_str = utils.color('red', cluster_str)

        if one_line:
            if abbreviate:
                print ' %s' % cluster_str,
            else:
                print '   %s' % cluster_str,
        else:
            print '     %d    %s' % (int(same_event), cluster_str)

    if one_line:
        print ''
def print_true_partition(self):
    print '  true partition'
    print '   clonal?   ids'
    true_partition = utils.get_true_partition(self.reco_info)
    for cluster in true_partition.values():
        same_event = utils.from_same_event(self.reco_info is None, self.reco_info, cluster)
        if same_event is None:
            same_event = -1
        print '     %d    %s' % (int(same_event), ':'.join([str(uid) for uid in cluster]))
def peruse_forward_scores():
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)
    logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict(), OrderedDict()
    for n_set in n_set_list:
        print n_set
        # if n_set != 5:
        #     continue
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = OrderedDict(), OrderedDict(), OrderedDict()
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in logprobs[n_set]:
                    raise Exception('already had %s' % reco_id)
                logprobs[n_set][reco_id] = float(line['logprob'])

                factor = 1. / n_set
                partialcorr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

                factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting
    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    plotting.mpl_finish(ax, baseplotdir, 'forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)')
    ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n')
    ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)')
    ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'signed-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))
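# Quick numerical look at the two correction factors used above (this just
# evaluates the same formulas, nothing more): the naive 1/n rescaling vs the
# fitted (1 - 0.24/n^0.9)/n version, which approach each other for large n.
for n in [1, 2, 5, 10, 50]:
    naive_factor = 1. / n
    fitted_factor = (1. - 0.24 / pow(float(n), 0.9)) / n
    print '  n %3d   1/n %.4f   fitted %.4f' % (n, naive_factor, fitted_factor)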
def single_link(self, input_scores=None, infname=None, debug=False, reco_info=None, outfile=None):
    if infname is None:
        assert input_scores is not None
    else:
        assert input_scores is None  # should only specify <input_scores> *or* <infname>
        input_scores = []
        with opener('r')(infname) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                input_scores.append(line)

    sorted_lines = sorted(input_scores, key=lambda k: float(k['logprob']))
    for line in sorted_lines:
        a_name = line['id_a']
        b_name = line['id_b']
        score = float(line['logprob'])
        dbg_str_list = ['%22s %22s   %8.3f' % (a_name, b_name, score), ]
        if reco_info is None:
            dbg_str_list[-1] += '   %s' % '-'
        else:
            from_same_event = utils.from_same_event(reco_info, [a_name, b_name])
            dbg_str_list[-1] += '   %d' % from_same_event

        self.incorporate_into_clusters(a_name, b_name, score, dbg_str_list)
        self.pairscores[utils.get_key((a_name, b_name))] = score
        self.plotscores['all'].append(score)
        if reco_info is not None:
            if from_same_event:
                self.plotscores['same'].append(score)
            else:
                self.plotscores['diff'].append(score)
        if debug:
            outstr = ''.join(dbg_str_list)
            if outfile is None:
                print outstr
            else:
                outfile.write(outstr + '\n')

    for query, cluster_id in self.query_clusters.iteritems():
        if cluster_id not in self.id_clusters:
            self.id_clusters[cluster_id] = []
        self.id_clusters[cluster_id].append(query)
    for cluster_id, queries in self.id_clusters.items():
        if len(queries) == 1:
            self.singletons.append(queries[0])
    # print 'nearest', self.nearest_true_mate

    out_str_list = ['  %d clusters:\n' % len(self.id_clusters), ]
    for cluster_id in self.id_clusters:
        out_str_list.append('   ' + ' '.join([str(x) for x in self.id_clusters[cluster_id]]) + '\n')
    if outfile is None:
        print ''.join(out_str_list)
    else:
        outfile.write(''.join(out_str_list))
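# single_link() above leans on incorporate_into_clusters() (defined elsewhere);
# here's a minimal self-contained sketch of the same single-linkage idea, where
# a hypothetical score threshold stands in for the merge decision:
def toy_single_link(pair_scores, threshold):
    # pair_scores: list of (id_a, id_b, logprob); merge any pair scoring above <threshold>
    clusters = {}  # uid --> the (shared) set of uids it's currently clustered with
    for a_name, b_name, score in sorted(pair_scores, key=lambda p: p[2]):
        clusters.setdefault(a_name, set([a_name]))
        clusters.setdefault(b_name, set([b_name]))
        if score > threshold and clusters[a_name] is not clusters[b_name]:
            merged = clusters[a_name] | clusters[b_name]
            for uid in merged:
                clusters[uid] = merged  # every member points at the one merged set
    return set(frozenset(c) for c in clusters.values())

print toy_single_link([('x', 'y', 5.2), ('y', 'z', -3.1)], threshold=0.)  # --> x:y together, z a singleton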
def print_partition(self, ip, reco_info=None, extrastr='', abbreviate=True, smc_print=False):
    # NOTE it's nicer to *not* sort by cluster size here, since preserving the order tends to make it obvious which clusters are merging as your eye scans down through the output
    if ip > 0:  # delta between this logprob and the previous one
        delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip - 1])
    else:
        delta_str = ''
    print '      %s %-12.2f%-7s %-5d  %4d' % (extrastr, self.logprobs[ip], delta_str, len(self.partitions[ip]), self.n_procs[ip]),

    # logweight (and inverse of number of potential parents)
    if self.logweights[ip] is not None and smc_print:
        way_str, logweight_str = '', ''
        expon = math.exp(self.logweights[ip])
        n_ways = 0 if expon == 0. else 1. / expon
        way_str = ('%.1f' % n_ways) if n_ways < 1e7 else ('%8.1e' % n_ways)
        logweight_str = '%8.3f' % self.logweights[ip]

    print '    ' + self.get_ccf_str(ip),

    if self.logweights[ip] is not None and smc_print:
        print '   %10s    %8s   ' % (way_str, logweight_str),

    # clusters
    for cluster in sorted(self.partitions[ip], key=lambda c: len(c), reverse=True):
        if abbreviate:
            cluster_str = ':'.join(['o' if len(uid) > 3 else uid for uid in cluster])
        else:
            # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
            cluster_str = ':'.join([str(uid) for uid in cluster])
        if reco_info is not None and not utils.from_same_event(reco_info, cluster):
            cluster_str = utils.color('red', cluster_str)
        if self.seed_unique_id is not None and self.seed_unique_id in cluster:
            cluster_str = utils.color('reverse_video', cluster_str)
        if abbreviate:
            print ' %s' % cluster_str,
        else:
            print '   %s' % cluster_str,
    print ''
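# Quick check of the logweight --> n_ways conversion used above: n_ways is just
# exp(-logweight), written as 1/exp(logweight) with a guard for when exp()
# underflows to zero (these numbers only evaluate the expressions above):
import math
for logweight in [0., -2.3, -20.]:
    expon = math.exp(logweight)
    n_ways = 0 if expon == 0. else 1. / expon
    print '  logweight %8.1f   n_ways %.1f' % (logweight, n_ways)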
def print_partition(self, ip, reco_info=None, extrastr="", abbreviate=True, smc_print=False): # NOTE it's nicer to *not* sort by cluster size here, since preserving the order tends to frequently make it obvious which clusters are merging as your eye scans downwards through the output if ip > 0: # delta between this logprob and the previous one delta_str = "%.1f" % (self.logprobs[ip] - self.logprobs[ip - 1]) else: delta_str = "" print " %s %-12.2f%-7s %-5d %4d" % ( extrastr, self.logprobs[ip], delta_str, len(self.partitions[ip]), self.n_procs[ip], ), # logweight (and inverse of number of potential parents) if self.logweights[ip] is not None and smc_print: way_str, logweight_str = "", "" expon = math.exp(self.logweights[ip]) n_ways = 0 if expon == 0.0 else 1.0 / expon way_str = ("%.1f" % n_ways) if n_ways < 1e7 else ("%8.1e" % n_ways) logweight_str = "%8.3f" % self.logweights[ip] print " " + self.get_ccf_str(ip), if self.logweights[ip] is not None and smc_print: print " %10s %8s " % (way_str, logweight_str), # clusters for cluster in self.partitions[ip]: if abbreviate: cluster_str = ":".join(["o" if len(uid) > 3 else uid for uid in cluster]) else: # cluster_str = ':'.join(sorted([str(uid) for uid in cluster])) cluster_str = ":".join([str(uid) for uid in cluster]) if reco_info is not None and not utils.from_same_event(reco_info, cluster): cluster_str = utils.color("red", cluster_str) if self.seed_unique_id is not None and self.seed_unique_id in cluster: cluster_str = utils.color("reverse_video", cluster_str) if abbreviate: print " %s" % cluster_str, else: print " %s" % cluster_str, print ""
def write_partitions(self, writer, is_data, reco_info, true_partition, smc_particles, path_index, n_to_write=None, calc_adj_mi=None):
    for ipart in self.get_partition_subset(n_partitions=n_to_write):
        part = self.partitions[ipart]
        cluster_str = ''
        bad_clusters = []  # inferred clusters that aren't really all from the same event
        for ic in range(len(part)):
            if ic > 0:
                cluster_str += ';'
            cluster_str += ':'.join(part[ic])

            if not is_data:
                same_event = utils.from_same_event(is_data, reco_info, part[ic])  # are all the sequences from the same event?
                entire_cluster = True  # ... and if so, are they the entire true cluster?
                if same_event:
                    reco_id = reco_info[part[ic][0]]['reco_id']  # they've all got the same reco_id, so pick an arbitrary one
                    true_cluster = true_partition[reco_id]
                    for uid in true_cluster:
                        if uid not in part[ic]:
                            entire_cluster = False
                            break
                else:
                    entire_cluster = False
                if not same_event or not entire_cluster:
                    bad_clusters.append(':'.join(part[ic]))

        if len(bad_clusters) > 25:
            bad_clusters = ['too', 'long']
        row = {'logprob' : self.logprobs[ipart],
               'n_clusters' : len(part),
               'n_procs' : self.n_procs[ipart],
               'clusters' : cluster_str}
        if smc_particles > 1:
            row['path_index'] = path_index
            row['logweight'] = self.logweights[ipart]
        if not is_data:
            if calc_adj_mi is None or self.adj_mis[ipart] != -1:  # if we don't want to write any adj mis, or if we already calculated it
                row['adj_mi'] = self.adj_mis[ipart]
            else:
                if calc_adj_mi == 'best' and ipart == self.i_best:  # only calculate adj_mi for the best partition
                    row['adj_mi'] = utils.mutual_information(part, reco_info)
                else:
                    row['adj_mi'] = self.adj_mis[ipart]
            row['n_true_clusters'] = len(true_partition)
            row['bad_clusters'] = ';'.join(bad_clusters)
        writer.writerow(row)
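# The 'clusters' column written above encodes each partition with ';' between
# clusters and ':' between uids within a cluster, so reading it back is just
# two splits; a minimal sketch (the file name here is hypothetical):
import csv
with open('partitions.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        partition = [cl.split(':') for cl in row['clusters'].split(';')]
        print '  %12s  %4d clusters' % (row['logprob'], len(partition))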
def get_bad_clusters(part):
    bad_clusters = []  # inferred clusters that aren't really all from the same event
    for ic in range(len(part)):
        same_event = utils.from_same_event(reco_info, part[ic])  # are all the sequences from the same event?
        entire_cluster = True  # ... and if so, are they the entire true cluster?
        if same_event:
            reco_id = reco_info[part[ic][0]]['reco_id']  # they've all got the same reco_id, so pick an arbitrary one
            true_cluster = true_partition[reco_id]
            for uid in true_cluster:
                if uid not in part[ic]:
                    entire_cluster = False
                    break
        else:
            entire_cluster = False
        if not same_event or not entire_cluster:
            bad_clusters.append(':'.join(part[ic]))

    if len(bad_clusters) > 25:
        bad_clusters = ['too', 'long']

    return bad_clusters
def print_partition(self, ip, reco_info=None, extrastr='', abbreviate=True, smc_print=False):
    if ip > 0:  # delta between this logprob and the previous one
        delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip-1])
    else:
        delta_str = ''
    print '      %s %-12.2f%-7s %-5d  %4d' % (extrastr, self.logprobs[ip], delta_str, len(self.partitions[ip]), self.n_procs[ip]),

    # logweight (and inverse of number of potential parents)
    if self.logweights[ip] is not None and smc_print:
        way_str, logweight_str = '', ''
        expon = math.exp(self.logweights[ip])
        n_ways = 0 if expon == 0. else 1. / expon
        way_str = ('%.1f' % n_ways) if n_ways < 1e7 else ('%8.1e' % n_ways)
        logweight_str = '%8.3f' % self.logweights[ip]

    print '    ' + self.get_ccf_str(ip),

    if self.logweights[ip] is not None and smc_print:
        print '   %10s    %8s   ' % (way_str, logweight_str),

    # clusters
    for cluster in self.partitions[ip]:
        if abbreviate:
            cluster_str = ':'.join(['o' if len(uid) > 3 else uid for uid in cluster])
        else:
            # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
            cluster_str = ':'.join([str(uid) for uid in cluster])
        if reco_info is not None and not utils.from_same_event(reco_info, cluster):
            cluster_str = utils.color('red', cluster_str)
        if self.seed_unique_id is not None and self.seed_unique_id in cluster:
            cluster_str = utils.color('reverse_video', cluster_str)
        if abbreviate:
            print ' %s' % cluster_str,
        else:
            print '   %s' % cluster_str,
    print ''