def run_igdiscover(infname, outfname, outdir):
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        sub_infname = outdir + '/' + os.path.basename(infname.replace(utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print ' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries
        else:
            print ' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # if args.glfo_dir is not None else 'data/germlines/ XXX human'  # can probably delete this now that --glfo-dir is required (but leaving for now, to show how it used to be in case it comes up)
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)
    out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus)
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
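# For orientation, the run.sh handed to utils.simplerun() above is just the joined cmds
# list, i.e. roughly the following shell script (a sketch; the bracketed values and the
# getpathcmd() line are placeholders, not literal output):
#   <shell setup from getpathcmd()>
#   conda activate <args.env_label>
#   cd <outdir>
#   igdiscover init --db db --single-reads <infname> work
#   cp <basename of args.yamlfname> work/
#   cd work
#   igdiscover run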
def run_other_method(args, method):
    if method not in ['tigger-default', 'tigger-tuned', 'igdiscover']:  # really just to make it easier to search for this fcn
        assert False
    assert args.n_max_queries is None
    if utils.output_exists(args, get_outfname(args, method)):
        return
    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)
    cmd = './test/%s-run.py' % method.split('-')[0]
    if method == 'tigger-tuned':
        cmd += ' --tuned-tigger-params'
    cmd += ' --infname ' + simfasta
    cmd += ' --outfname ' + get_outfname(args, method)
    if args.species != 'human':
        cmd += ' --species %s' % args.species
    if args.overwrite:
        cmd += ' --overwrite'
    if args.gls_gen:
        cmd += ' --gls-gen'
        cmd += ' --glfo-dir ' + partis_dir + '/' + args.default_germline_dir  # the partis methods have this as the default internally, but we want/have to set it explicitly here
    else:
        cmd += ' --glfo-dir ' + args.inf_glfo_dir
    cmd += ' --simulation-germline-dir ' + args.outdir + '/germlines/simulation'  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
    if method != 'igdiscover':  # for now we're saving all the igdiscover output/intermediate files, so we write them to an output dir
        cmd += ' --workdir ' + args.workdir + '/' + method
    cmd += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd += ' --slurm'
    utils.simplerun(cmd, dryrun=args.dry_run)
def simulate():
    rearrange()
    glfo, naive_event_list, cpath = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events

    outdirs = ['%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list))]

    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        run_bcr_phylo(naive_line, outdir, ievent)

    if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    mutated_events = []
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))

    print ' writing annotations to %s' % simfname()
    utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers)

    import plotting
    for outdir, event in zip(outdirs, mutated_events):
        plotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, args.metric_for_target_distance)
def run_other_method(args, method):
    if method not in ['tigger', 'igdiscover']:  # really just to make it easier to search for this fcn
        assert False
    if utils.output_exists(args, get_outfname(args, method)):
        return
    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)
    cmd = './test/%s-run.py' % method
    cmd += ' --infname ' + simfasta
    cmd += ' --outfname ' + get_outfname(args, method)
    if args.overwrite:
        cmd += ' --overwrite'
    if args.gls_gen:
        cmd += ' --gls-gen'
        cmd += ' --glfo-dir ' + partis_dir + '/data/germlines/human'  # the partis methods have this as the default internally, but we want/have to set it explicitly here
    else:
        cmd += ' --glfo-dir ' + args.inf_glfo_dir
    if method != 'igdiscover':  # for now we're saving all the igdiscover output/intermediate files, so we write them to an output dir
        cmd += ' --workdir ' + args.workdir + '/' + method
    cmd += ' --n-procs ' + str(args.n_procs)
    utils.simplerun(cmd, dryrun=args.dry_run)
def run_partis(infname, outfname):
    if utils.output_exists(args, outfname, offset=8):
        return

    aligned_gl_seqs = {}  # keyed by seq so it's easy to check for duplicates
    for r in utils.regions:  # deduplicate before passing to partis
        for seqfo in utils.read_fastx(get_glfname(r, aligned=True)):
            if seqfo['seq'] in aligned_gl_seqs:
                continue
            aligned_gl_seqs[seqfo['seq']] = '|'.join(seqfo['infostrs'])
    aligned_germline_fname = args.workdir + '/all-aligned-gl-seqs.fa'
    with open(aligned_germline_fname, 'w') as merged_file:
        for seq, gene in aligned_gl_seqs.items():
            merged_file.write('>%s\n%s\n' % (gene, seq))

    cmd = './bin/partis cache-parameters'
    cmd += ' --infname ' + infname
    cmd += ' --leave-default-germline'
    cmd += ' --presto-output --only-smith-waterman'
    cmd += ' --outfname ' + outfname
    if args.glfo_dir is not None:
        cmd += ' --initial-germline-dir ' + args.glfo_dir
    cmd += ' --aligned-germline-fname ' + aligned_germline_fname
    cmd += ' --n-procs ' + str(args.n_procs)

    utils.simplerun(cmd, print_time='partis annotation')
    os.remove(aligned_germline_fname)
def run_igdiscover(infname, outfname, outdir):
    if utils.output_exists(args, outfname):
        return

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % args.infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']

    cmdfname = outdir + '/run.sh'
    with open(cmdfname, 'w') as cmdfile:
        for cmd in cmds:
            cmdfile.write(cmd + '\n')
    subprocess.check_call(['chmod', '+x', cmdfname])
    cmdfos = [{'cmd_str': cmdfname, 'workdir': outdir, 'outfname': outdir + '/work/final/%s_usage.tab' % 'v'.upper()}]
    utils.simplerun(cmdfname, shell=True, print_time='igdiscover')
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events, uid_str_len=None):
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return None

    cmd = '%s/bin/simulator.py' % bcr_phylo_path
    if args.run_help:
        cmd += ' --help'
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        cmd += ' --lambda %f --lambda0 %f' % (1.5, 0.365)
        cmd += ' --n_final_seqs %d' % args.n_sim_seqs_per_generation
    elif args.stype == 'selection':
        cmd += ' --selection'
        cmd += ' --lambda %f' % args.branching_parameter
        cmd += ' --lambda0 %f' % args.base_mutation_rate
        cmd += ' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength)
        cmd += ' --obs_times %s' % ' '.join(['%d' % get_vpar_val('obs-times', t) for t in args.obs_times])
        cmd += ' --n_to_sample %s' % ' '.join('%d' % get_vpar_val('n-sim-seqs-per-generation', n) for n in args.n_sim_seqs_per_generation)
        cmd += ' --metric_for_target_dist %s' % args.metric_for_target_distance
        if args.paratope_positions is not None:
            cmd += ' --paratope_positions %s' % args.paratope_positions
        cmd += ' --target_dist %d' % args.target_distance
        cmd += ' --target_count %d' % args.target_count
        cmd += ' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap)
        if not args.dont_observe_common_ancestors:
            cmd += ' --observe_common_ancestors'
        if args.leaf_sampling_scheme is not None:
            cmd += ' --leaf_sampling_scheme %s' % args.leaf_sampling_scheme
        if args.n_target_clusters is not None:
            cmd += ' --n_target_clusters %d' % args.n_target_clusters
            # cmd += ' --target_cluster_distance 1'
        if args.min_target_distance is not None:
            cmd += ' --min_target_distance %d' % args.min_target_distance
    else:
        assert False
    cmd += ' --debug %d' % args.debug
    cmd += ' --n_tries 1000'
    if args.context_depend == 0:
        cmd += ' --no_context'
    cmd += ' --no_plot'
    if args.only_csv_plots:
        cmd += ' --dont_write_hists'
    cmd += ' --outbase %s/%s' % (outdir, args.extrastr)
    cmd += ' --random_seed %d' % (args.seed + ievent)
    if uid_str_len is not None:
        cmd += ' --uid_str_len %d' % uid_str_len
    cmd += ' --naive_seq %s' % naive_line['naive_seq']

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    cfo = None
    if args.n_procs == 1:
        utils.run_ete_script(cmd, ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
    else:
        cmd, _ = utils.run_ete_script(cmd, ete_path, return_for_cmdfos=True, tmpdir=outdir)
        cfo = {'cmd_str': cmd, 'workdir': outdir, 'outfname': bcr_phylo_fasta_fname(outdir)}
    return cfo
def run_changeo(infname, igblast_outfname, outfname):
    if utils.output_exists(args, outfname, offset=8):
        return

    glfnames = [get_glfname(r, aligned=True) for r in utils.regions]
    cmd = args.changeo_path + '/bin/MakeDb.py igblast'
    cmd += ' -i %s -s %s -r %s --regions --scores' % (igblast_outfname, infname, ' '.join(glfnames))
    utils.simplerun(cmd, print_time='changeo')
def cache_parameters():
    if utils.output_exists(args, param_dir() + '/hmm/hmms', outlabel='parameters', offset=4):
        return
    cmd = './bin/partis cache-parameters --infname %s --parameter-dir %s --n-procs %d --seed %d' % (simfname(), param_dir(), args.n_procs, args.seed)
    utils.simplerun(cmd, debug=True)  # , dryrun=True)
def run_partis_parameter_cache(args, method):
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = args.outdir + '/' + method + '/plots'

    # remove any old sw cache files
    sw_cachefiles = glob.glob(paramdir + '/sw-cache-*.csv')
    if len(sw_cachefiles) > 0:
        for cachefname in sw_cachefiles:
            check_call(['rm', '-v', cachefname])
            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
            if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
                glutils.remove_glfo_files(sw_cache_gldir, args.locus)
                # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman'
    cmd_str += ' --initial-germline-dir %s' % args.default_germline_dir
    if method == 'partis':
        cmd_str += ' --debug-allele-finding'  # --always-find-new-alleles'
        cmd_str += ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation'  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
        if args.allele_cluster:
            cmd_str += ' --allele-cluster'
        if args.kmeans_allele_cluster:
            cmd_str += ' --kmeans-allele-cluster'
    elif method == 'full':
        cmd_str += ' --leave-default-germline'
    else:
        assert False
    if args.species != 'human':
        cmd_str += ' --species %s' % args.species

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    if not args.gls_gen:  # otherwise it uses the default (full) germline dir
        cmd_str += ' --initial-germline-dir ' + args.inf_glfo_dir  # --dont-remove-unlikely-alleles
    cmd_str += ' --parameter-dir ' + paramdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    if args.plot_and_fit_absolutely_everything is not None:
        cmd_str += ' --plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything)
    utils.simplerun(cmd_str, dryrun=args.dryrun)
def rearrange():
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return
    cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves'  # tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
    cmd += ' --debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_fname(), args.n_sim_events)
    utils.simplerun(cmd, debug=True)
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events):
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return

    cmd = '%s/bin/simulator.py' % bcr_phylo_path
    if args.run_help:
        cmd += ' --help'
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        cmd += ' --lambda %f --lambda0 %f' % (1.5, 0.365)
        cmd += ' --n_final_seqs %d' % args.n_sim_seqs_per_generation
    elif args.stype == 'selection':
        cmd += ' --selection'
        cmd += ' --lambda %f' % args.branching_parameter
        cmd += ' --lambda0 %f' % args.base_mutation_rate
        cmd += ' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength)
        cmd += ' --obs_times %s' % ' '.join(['%d' % get_vpar_val('obs-times', t) for t in args.obs_times])
        cmd += ' --n_to_sample %s' % ' '.join('%d' % get_vpar_val('n-sim-seqs-per-generation', n) for n in args.n_sim_seqs_per_generation)
        cmd += ' --metric_for_target_dist %s' % args.metric_for_target_distance
        cmd += ' --target_dist %d' % args.target_distance
        cmd += ' --target_count %d' % args.target_count
        cmd += ' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap)
        if not args.dont_observe_common_ancestors:
            cmd += ' --observe_common_ancestors'
        # cmd += ' --n_target_clusters 1'
        # cmd += ' --target_cluster_distance 1'
        # cmd += ' --observe_based_on_affinity'  # implementation in bcr-phylo needs some work
    else:
        assert False
    cmd += ' --debug %d' % args.debug
    cmd += ' --n_tries 30'
    cmd += ' --no_context'
    cmd += ' --no_plot'
    cmd += ' --outbase %s/%s' % (outdir, args.extrastr)
    cmd += ' --random_seed %d' % (args.seed + ievent)
    if n_total_events > 1:  # if the final sample's going to contain many trees, it's worth making the uids longer so there are fewer collisions/duplicates
        cmd += ' --uid_str_len 7'
    cmd += ' --naive_seq %s' % naive_line['naive_seq']

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    utils.run_ete_script(cmd, ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
def cache_parameters():
    if utils.output_exists(args, param_dir() + '/hmm/hmms', outlabel='parameters', offset=4):
        return
    cmd = './bin/partis cache-parameters --infname %s --parameter-dir %s --seed %d --no-indels' % (simfname(), param_dir(), args.seed)  # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
    if args.n_procs > 1:
        cmd += ' --n-procs %d' % args.n_procs
    if args.slurm:
        cmd += ' --batch-system slurm'
    if args.n_max_queries is not None:
        cmd += ' --n-max-queries %d' % args.n_max_queries
    utils.simplerun(cmd, debug=True)  # , dryrun=True)
def partition():
    if utils.output_exists(args, partition_fname(), outlabel='partition', offset=4):
        return
    cmd = './bin/partis partition --n-final-clusters 1 --write-additional-cluster-annotations 0:5 --is-simu --get-tree-metrics --infname %s --parameter-dir %s --plotdir %s --n-procs %d --outfname %s --seed %d' % (simfname(), param_dir(), infdir() + '/plots', args.n_procs, partition_fname(), args.seed)
    if args.lb_tau is not None:
        cmd += ' --lb-tau %f' % args.lb_tau
    utils.simplerun(cmd, debug=True)  # , dryrun=True)
def rearrange():
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return
    cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves'  # tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
    cmd += ' --debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_fname(), args.n_sim_events)
    if args.restrict_available_genes:
        cmd += ' --only-genes IGHV1-18*01:IGHJ1*01'
    if args.n_procs > 1:
        cmd += ' --n-procs %d' % args.n_procs
    if args.slurm:
        cmd += ' --batch-system slurm'
    utils.simplerun(cmd, debug=True)
def run_igdiscover(infname, outfname, outdir):
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        sub_infname = outdir + '/' + os.path.basename(infname.replace(utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print ' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries
        else:
            print ' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)
    out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus)
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def write_locus_file(locus, ofos, lpair=None, extra_str=' '):
    ofn = utils.paired_fn(args.outdir, locus=locus, lpair=lpair)
    if utils.output_exists(args, ofn, leave_zero_len=len(ofos)==0, offset=4):  # NOTE not really sure this does anything (or if i want it) now that I'm cleaning/looking for the whole dir at the start of this script
        return
    if not os.path.exists(os.path.dirname(ofn)):
        os.makedirs(os.path.dirname(ofn))
    if len(ofos) == 0:
        # print '%s%s: nothing to write' % (extra_str, locus)
        open(ofn, 'w').close()
        return
    print '%s%s: %d to %s/%s' % (extra_str, locus, len(ofos), os.path.basename(os.path.dirname(ofn)), os.path.basename(ofn))
    with open(ofn, 'w') as lfile:
        for sfo in ofos:
            lfile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))
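# Hypothetical usage sketch for write_locus_file() (the locus pair and seqfo values below
# are illustrative only, not taken from the repo): write the heavy-chain sequences of an
# igh/igk pair to the fasta path returned by utils.paired_fn(args.outdir, locus='igh', lpair=['igh', 'igk']):
#   write_locus_file('igh', [{'name': 'seq-1', 'seq': 'ACGT'}], lpair=['igh', 'igk'])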
def rearrange():
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return
    cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves'  # tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
    cmd += ' --debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_fname(), args.n_sim_events)
    if args.n_procs > 1 and args.n_sim_events % args.n_procs == 0:  # if --n-sim-events isn't divisible by --n-procs, partis simulate doesn't give you exactly the number you asked for
        cmd += ' --n-procs %d' % args.n_procs
    if args.slurm:
        cmd += ' --batch-system slurm'
    utils.simplerun(cmd, debug=True)
def partition():
    if utils.output_exists(args, partition_fname(), outlabel='partition', offset=4):
        return
    cmd = './bin/partis partition --simultaneous-true-clonal-seqs --is-simu --infname %s --parameter-dir %s --n-procs %d --outfname %s --seed %d' % (simfname(), param_dir(), args.n_procs, partition_fname(), args.seed)
    # --write-additional-cluster-annotations 0:5  # I don't think there was really a good reason for having this
    if not args.dont_get_tree_metrics:
        cmd += ' --get-tree-metrics --plotdir %s' % (infdir() + '/plots')
    if args.lb_tau is not None:
        cmd += ' --lb-tau %f' % args.lb_tau
    utils.simplerun(cmd, debug=True)  # , dryrun=True)
def simulate(args):
    if utils.output_exists(args, args.simfname):
        return

    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
    if args.n_leaf_distribution is None:
        cmd_str += ' --constant-number-of-leaves'
    else:
        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
    if args.mut_mult is not None:
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
    if args.root_mrca_weibull_parameter is not None:
        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm --subsimproc'

    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname, new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))  # NOTE template gene removal is the default for glutils.generate_germline_set

        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
            gene_list = sorted(sglfo['seqs']['v']) if len(added_snp_names) == 0 else list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v': {g: f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd': {}, 'j': {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname

    glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
    cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)
def run_tigger(infname, outfname, outdir):
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene': seqfo['name'], 'seq': seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print '%s different sequences in glfo and tigger output for %s:\n  %s\n  %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq'])
    for gene in glfo['seqs'][args.region].keys():  # remove them afterwards so we can use existing ones to get codon info (iterate over a copy of the keys, since remove_gene() modifies the dict)
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus)
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
def cache_parameters():
    if utils.output_exists(args, ifname('params'), outlabel='parameters', offset=4):
        return
    cmd = './bin/partis cache-parameters --seed %d --no-indels' % args.seed  # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
    fstr = ' --paired-loci --paired-indir %s --paired-outdir %s' if args.paired_loci else ' --infname %s --parameter-dir %s'
    cmd += fstr % (spath('mutated'), ipath('params'))
    if args.n_procs > 1:
        cmd += ' --n-procs %d' % args.n_procs
    if args.slurm:
        cmd += ' --batch-system slurm'
    if args.n_max_queries is not None:
        cmd += ' --n-max-queries %d' % args.n_max_queries
    utils.simplerun(cmd, debug=True, dryrun=args.dry_run)
def run_igblast(infname, outfname):
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print '%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color('red', 'warning')

    cmd = './igblastn'
    cmd += ' -germline_db_V human_gl_V -germline_db_D human_gl_D -germline_db_J human_gl_J'
    cmd += ' -auxiliary_data optional_file/human_gl.aux'
    cmd += ' -domain_system imgt -ig_seqtype Ig -organism human -outfmt \'7 std qseq sseq btop\''
    cmd += ' -num_threads %d' % args.n_procs
    cmd += ' -query ' + infname + ' -out ' + outfname
    cmd = 'cd %s; %s' % (args.igbdir, cmd)
    utils.simplerun(cmd, shell=True, print_time='igblast')
def run_performance_plot(args, method):
    perf_outdir = get_outfname(args, method, annotation_performance_plots=True)
    if utils.output_exists(args, perf_outdir):
        return

    cmd_str = args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --plot-annotation-performance'
    cmd_str += ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation'
    cmd_str += ' --initial-germline-dir ' + get_outfname(args, method, return_parent_gl_dir=True)  # i.e. use the inferred glfo from <method>
    cmd_str += ' --parameter-dir ' + perf_outdir + '/dummy-parameter-dir'
    cmd_str += ' --only-overall-plots --plotdir ' + perf_outdir
    cmd_str += ' --only-smith-waterman --leave-default-germline --dont-write-parameters'  # i.e. we really want to annotate, not cache parameters, but then it'd look for a parameter dir
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)
def rearrange():
    if utils.output_exists(args, naive_fname('igh'), outlabel='naive simu', offset=4):  # just look for the merged igh file, since it's about the last to be written (and both paired subdirs may not be there)
        return
    cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves'  # tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
    cmd += ' --debug %d --seed %d --n-sim-events %d' % (int(args.debug), args.seed, args.n_sim_events)
    if args.paired_loci:
        cmd += ' --paired-loci --paired-outdir %s' % spath('naive')
    else:
        cmd += ' --outfname %s' % spath('naive')
    if args.restrict_available_genes:
        assert not args.paired_loci
        cmd += ' --only-genes IGHV1-18*01:IGHJ1*01'
    if args.n_procs > 1:
        cmd += ' --n-procs %d' % args.n_procs
    if args.slurm:
        cmd += ' --batch-system slurm'
    utils.simplerun(cmd, dryrun=args.dry_run, debug=True)
def simulate():
    rearrange()
    glfo, naive_event_list, cpath = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events

    outdirs = ['%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list))]

    start = time.time()
    cmdfos = []
    if args.n_procs > 1:
        print ' starting %d events' % len(naive_event_list)
    uid_str_len = 6 + int(math.log(len(naive_event_list), 10))  # if the final sample's going to contain many trees, it's worth making the uids longer so there are fewer collisions/duplicates
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        if args.n_sim_events > 1 and args.n_procs == 1:
            print ' %s %d' % (utils.color('blue', 'ievent'), ievent)
        cfo = run_bcr_phylo(naive_line, outdir, ievent, len(naive_event_list), uid_str_len=uid_str_len)  # if n_procs > 1, doesn't run, just returns cfo
        if cfo is not None:
            print ' %s %s' % (utils.color('red', 'run'), cfo['cmd_str'])
            cmdfos.append(cfo)
    if args.n_procs > 1 and len(cmdfos) > 0:
        utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print')
    print ' bcr-phylo run time: %.1fs' % (time.time() - start)

    if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    start = time.time()
    mutated_events = []
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))
    print ' parsing time: %.1fs' % (time.time() - start)

    print ' writing annotations to %s' % simfname()
    utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers)

    if not args.only_csv_plots:
        import lbplotting
        for outdir, event in zip(outdirs, mutated_events):
            lbplotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance])
def run_igblast(infname, outfname):
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print '%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color('red', 'warning')

    if args.n_random_queries is not None:
        sub_infname = os.path.dirname(outfname) + '/' + os.path.basename(infname.replace(utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print ' --n-random-queries: leaving existing fasta for igblast (hopefully it has %d queries)' % args.n_random_queries
        else:
            print ' --n-random-queries: writing new fasta for igblast (%d queries)' % args.n_random_queries
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    cmds = ['#!/bin/bash']
    cmds += ['cd %s/%s' % (args.igbdir, args.locus)]
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['igblastn']
    for tmpreg in utils.regions:
        cmds[-1] += ' -germline_db_%s %s%s-unaligned.fasta' % (tmpreg.upper(), args.locus, tmpreg)
    cmds[-1] += ' -auxiliary_data optional_file/%s_gl.aux' % args.species
    cmds[-1] += ' -domain_system imgt -ig_seqtype Ig -organism %s -outfmt \'7 std qseq sseq btop\'' % args.species
    cmds[-1] += ' -num_threads %d' % utils.auto_n_procs()
    cmds[-1] += ' -query ' + infname + ' -out ' + outfname
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.workdir + '/run.sh')
def partition():
    if utils.output_exists(args, ifname('partition'), outlabel='partition', offset=4):
        return
    cmd = './bin/partis partition --simultaneous-true-clonal-seqs --is-simu --seed %d' % args.seed
    fstr = ' --paired-loci --paired-indir %s --paired-outdir %s' if args.paired_loci else (' --infname %%s --parameter-dir %s --outfname %%s' % ipath('params'))
    cmd += fstr % (spath('mutated'), ipath('partition'))
    # --write-additional-cluster-annotations 0:5  # I don't think there was really a good reason for having this
    if not args.dont_get_tree_metrics:
        cmd += ' --get-selection-metrics --plotdir %s' % ('paired-outdir' if args.paired_loci else ipath('plots'))
    if args.lb_tau is not None:
        cmd += ' --lb-tau %f' % args.lb_tau
    if args.n_procs > 1:
        cmd += ' --n-procs %d' % args.n_procs
    if args.slurm:
        cmd += ' --batch-system slurm'
    if args.n_max_queries is not None:
        cmd += ' --n-max-queries %d' % args.n_max_queries
    utils.simplerun(cmd, debug=True, dryrun=args.dry_run)
def run_tigger(infname, outfname, outdir):
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(ggplot2)', 'library(tigger, warn.conflicts=FALSE)', 'library(dplyr, warn.conflicts=FALSE)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    find_novel_argstr = '%s, %s, nproc=%d' % (db_name, gls_name, utils.auto_n_procs())
    if args.tuned_tigger_params:
        germline_min = 5  # only analyze genes which correspond to at least this many V calls (default 200)
        min_seqs = 5  # minimum number of total sequences
        j_max = 0.95  # of sequences which align perfectly (i.e. zero mutation?) to a new allele, no more than this fraction can correspond to each junction length + j gene combination (default 0.15)
        find_novel_argstr += ', germline_min=%d, min_seqs=%d, j_max=%f' % (germline_min, min_seqs, j_max)
    rcmds += ['novel_df = findNovelAlleles(%s)' % find_novel_argstr]
    # rcmds += ['sessionInfo()']
    rcmds += ['print(novel_df)']
    rcmds += ['geno = inferGenotype(%s, find_unmutated = TRUE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname

    cmdfo = {'cmd_str': cmdstr, 'logdir': args.workdir, 'env': os.environ}
    proc = utils.run_cmd(cmdfo)
    while proc.poll() is None:
        time.sleep(0.01)
    if proc.returncode != 0:  # damn thing crashes if it thinks the sample size is small
        with open(args.workdir + '/err') as ferr:
            errstr = ''.join(ferr.readlines())
        if 'Not enough sample sequences were assigned to any germline' in errstr:
            with open(tigger_outfname, 'w') as dummy_outfasta:
                dummy_outfasta.write('')
        else:
            subprocess.check_call(['cat', args.workdir + '/out'])
            subprocess.check_call(['cat', args.workdir + '/err'])
            sys.exit(proc.returncode)
    for oe in ['err', 'out']:
        with open(args.workdir + '/' + oe) as oefile:
            print ''.join(oefile.readlines())
        os.remove(args.workdir + '/' + oe)

    # post-process tigger .fa
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(tigger_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)
    out_gldir = os.path.dirname(outfname).rstrip('/' + args.locus)
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
'--droplet-id-separator', default='_', help='everything in the sequence id before this character is treated as the droplet id, e.g. for the default, the uid AAACGGGCAAGCGAGT-1_contig_2 has a droplet id of AAACGGGCAAGCGAGT-1')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--n-max-queries', type=int, default=-1, help='Maximum number of query sequences to read from input file, starting from beginning of file')
args = parser.parse_args()

if utils.output_exists(args, args.outfname, offset=4, debug=False):
    print ' extract-pairing-info.py output exists and --overwrite was not set, so not doing anything: %s' % args.outfname
    sys.exit(0)

seqfos = utils.read_fastx(args.infname, n_max_queries=args.n_max_queries)

droplet_ids = {}
for sfo in seqfos:
    did = utils.get_droplet_id(sfo['name'])
    if did not in droplet_ids:
        droplet_ids[did] = []
    droplet_ids[did].append(sfo['name'])
print ' read %d sequences with %d droplet ids' % (len(seqfos), len(droplet_ids))

count_info = {}
for dlist in droplet_ids.values():