def kmeans_cluster_v_seqs(self, qr_seqs, swfo, plotdir=None, debug=False): if plotdir is not None: utils.prep_dir(plotdir, wildlings=['*.svg'], subdirs=[d for d in os.listdir(plotdir) if os.path.isdir(plotdir + '/' + d)], rm_subdirs=True) clusterfos = [] if debug: print 'kmeans clustering' print ' seqs family' for family, seqfos in self.get_family_groups(qr_seqs, swfo).items(): if debug: print ' %5d %s' % (len(seqfos), family) partition = mds.run_bios2mds(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.workdir + '/mds', self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=plotdir + '/' + family if plotdir is not None else None) # partition = mds.run_sklearn_mds(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=plotdir + '/' + family if plotdir is not None else None) clusterfos += self.get_clusterfos_from_partition(partition, qr_seqs) clusterfos = sorted(clusterfos, key=lambda c: len(c['seqfos']), reverse=True) return clusterfos
def make_mds_plots(self, sorted_clusters, annotations, base_plotdir, max_cluster_size=10000, reco_info=None, color_rule=None, run_in_parallel=False, debug=False): debug = True # ---------------------------------------------------------------------------------------- def get_fname(ic): return 'icluster-%d' % ic # ---------------------------------------------------------------------------------------- def get_cluster_info(full_cluster): full_info = annotations[':'.join(full_cluster)] title = '%s (size: %d)' % (self.get_cdr3_title(full_info), len(full_cluster)) all_seqs = set() kept_indices = [] for iseq in range(len(full_cluster)): if full_info['seqs'][ iseq] in all_seqs: # duplicates are from shm indels (I think I did this on purpose in sw) continue if full_info['n_mutations'][ iseq] == 0: # remove unmutated sequences since a) they'll crash mds after we add the naive seq below and b) they'd show up in the same spot anyway (note that the only way there can be more than one is if there's Ns either within the sequences or on either end) continue kept_indices.append(iseq) all_seqs.add(full_info['seqs'][iseq]) if len(kept_indices) > max_cluster_size: uids_to_choose_from = set([ full_cluster[i] for i in kept_indices ]) # note similarity to code in seqfileopener.post_process() if self.args.queries_to_include is not None: uids_to_choose_from -= set(self.args.queries_to_include) n_to_remove = len(kept_indices) - max_cluster_size if n_to_remove >= len( uids_to_choose_from ): # i.e. 
if we'd have to start removing queries that are in <queries_to_include> removed_uids = uids_to_choose_from else: removed_uids = numpy.random.choice( list(uids_to_choose_from), n_to_remove, replace=False ) # i think this'll still crash if len(uids_to_choose_from) is zero, but, meh kept_indices = sorted( set(kept_indices) - set([full_cluster.index(uid) for uid in removed_uids])) title += ' (subset: %d / %d)' % (len(kept_indices), len(full_cluster)) seqfos = [{ 'name': full_info['unique_ids'][iseq], 'seq': full_info['seqs'][iseq] } for iseq in kept_indices] color_scale_vals = { full_cluster[iseq]: full_info['n_mutations'][iseq] for iseq in kept_indices } seqfos.append( { 'name': '_naive', 'seq': full_info['naive_seq'] } ) # note that if any naive sequences that were removed above are in self.args.queries_to_include, they won't be labeled in the plot (but, screw it, who's going to ask to specifically label a sequence that's already specifically labeled?) color_scale_vals[ '_naive'] = 0 # leading underscore is 'cause the mds will crash if there's another sequence with the same name, and e.g. christian's simulation spits out the naive sequence with name 'naive'. 
No, this is not a good long term fix queries_to_include = ['_naive'] if self.args.queries_to_include is not None: queries_to_include += self.args.queries_to_include return seqfos, color_scale_vals, queries_to_include, title # ---------------------------------------------------------------------------------------- def get_labels_for_coloring(full_cluster, color_rule): full_info = annotations[':'.join(full_cluster)] if color_rule == 'nearest-target': # color by the index of the nearest cluster index (bcr-phylo simulation only) if 'target_seqs' not in reco_info[full_cluster[0]]: return labels = { uid: str(reco_info[uid]['nearest_target_indices'][0]) for uid in full_cluster } labels['_naive'] = 'foop' elif color_rule == 'wtf': labels = {uid: uid.split('@')[1] for uid in full_cluster} labels['_naive'] = 'foop' else: assert False return labels # ---------------------------------------------------------------------------------------- def prep_cmdfo(iclust, seqfos, queries_to_include, color_scale_vals, title): subworkdir = '%s/mds-%d' % (self.args.workdir, iclust) utils.prep_dir(subworkdir) tmpfname = '%s/seqs.fa' % subworkdir with open(tmpfname, 'w') as tmpfile: for sfo in seqfos: csval = None if sfo['name'] in color_scale_vals: csval = color_scale_vals[sfo['name']] tmpfile.write( '>%s%s\n%s\n' % (sfo['name'], (' %d' % csval) if csval is not None else '', sfo['seq'])) cmdstr = './bin/mds-run.py %s --aligned --plotdir %s --plotname %s --workdir %s --seed %d' % ( tmpfname, plotdir, get_fname(iclust), subworkdir, self.args.seed) if queries_to_include is not None: cmdstr += ' --queries-to-include %s' % ':'.join( queries_to_include) if title is not None: cmdstr += ' --title=%s' % title.replace(' ', '@') return { 'cmd_str': cmdstr, 'workdir': subworkdir, 'outfname': '%s/%s.svg' % (plotdir, get_fname(iclust)), 'workfnames': [tmpfname] } # ---------------------------------------------------------------------------------------- subd, plotdir = self.init_subd('mds', base_plotdir) 
start = time.time() if debug: if not run_in_parallel: print ' making mds plots starting with %d clusters' % len( sorted_clusters) print ' size (+naive) mds plot total' skipped_cluster_lengths = [] fnames = [[]] cmdfos = [] for iclust in range(len(sorted_clusters)): if not self.plot_this_cluster(sorted_clusters, iclust): skipped_cluster_lengths.append(len(sorted_clusters[iclust])) continue seqfos, color_scale_vals, queries_to_include, title = get_cluster_info( sorted_clusters[iclust]) labels = None if color_rule is not None: labels = get_labels_for_coloring(sorted_clusters[iclust], color_rule) # print ' %s setting color_scale_vals to None so we can use colors for nearest target seq index' % utils.color('red', 'note') color_scale_vals = None # not sure this is really the best way to do this if debug and not run_in_parallel: substart = time.time() subset_str = '' if len( sorted_clusters[iclust] ) <= max_cluster_size else utils.color( 'red', '/%d' % len(sorted_clusters[iclust]), width=6, padside='right') # -1 is for the added naive seq tmpfo = annotations[':'.join(sorted_clusters[iclust])] # n_naive_in_cluster = len([iseq for iseq in range(len(sorted_clusters[iclust])) if tmpfo['n_mutations'][iseq] == 0]) # work out if there was a sequence already in the cluster that was the same as the naive sequence # print ' %4d%6s' % (len(seqfos) - 1 + n_naive_in_cluster, subset_str), print ' %4d%6s' % (len(seqfos), subset_str), if run_in_parallel: assert labels is None # would need to implement this (or just switch to non-parallel version if you need to run with labels set) cmdfos.append( prep_cmdfo(iclust, seqfos, queries_to_include, color_scale_vals, title)) else: mds.run_bios2mds(self.n_mds_components, None, seqfos, self.args.workdir, self.args.seed, aligned=True, plotdir=plotdir, plotname=get_fname(iclust), queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, labels=labels, title=title) if debug: print ' %5.1f' % (time.time() - substart) 
self.addfname(fnames, '%s' % get_fname(iclust)) if run_in_parallel: utils.run_cmds(cmdfos, clean_on_success=True) #, debug='print') if debug and len(skipped_cluster_lengths) > 0: print ' skipped %d clusters with lengths: %s (+%d singletons)' % ( len(skipped_cluster_lengths), ' '.join([ '%d' % l for l in skipped_cluster_lengths if l > 1 ]), skipped_cluster_lengths.count(1)) if not self.args.only_csv_plots: self.plotting.make_html(plotdir, fnames=fnames) print ' made %d mds plots (%.1fs)' % (sum( len(x) for x in fnames), time.time() - start) return [[subd + '/' + fn for fn in fnames[0]]]
# by default, work in a randomly-numbered subdir
parser.add_argument('--workdir', default='/tmp/dralph/mds/%d' % random.randint(0, 999999))
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--aligned', action='store_true')

args = parser.parse_args()
args.queries_to_include = utils.get_arg_list(args.queries_to_include)
if args.title is not None:
    args.title = args.title.replace('@', ' ')  # this is kind of hackey

seqfos = utils.read_fastx(args.infname)

# if a sequence has exactly two info strings, the second one is its (integer) color scale value
color_scale_vals = {}
for sfo in seqfos:
    if len(sfo['infostrs']) != 2:
        continue
    color_scale_vals[sfo['name']] = int(sfo['infostrs'][1])
if not color_scale_vals:  # none of the sequences had a value, so turn it off entirely
    color_scale_vals = None

# mds.run_sklearn_mds(args.n_components, args.n_clusters, seqfos, args.seed, plotdir=args.plotdir)
mds.run_bios2mds(
    args.n_components,
    args.n_clusters,
    seqfos,
    args.workdir,
    args.seed,
    aligned=args.aligned,
    plotdir=args.plotdir,
    plotname=args.plotname,
    queries_to_include=args.queries_to_include,
    color_scale_vals=color_scale_vals,
    title=args.title,
)