def make_gls_tree_plot(args, region, plotdir, plotname, glsfnames, glslabels, locus, ref_label=None, title=None, title_color=None, legends=None, legend_title=None, pie_chart_faces=False, param_dirs=None):
    """Assemble and run the ./bin/plot-gl-set-trees.py command line.

    The plotting script is run as a shell subprocess (under xvfb-run, with
    <args.ete_path> prepended to PATH) since ete3 requires its own python version.
    List-valued arguments (<glsfnames>, <glslabels>, <legends>, <param_dirs>) are
    passed as colon-separated strings, and optional arguments are only forwarded
    when they are set. <args.dryrun> is used for both the debug and the dryrun
    setting of utils.simplerun().
    """
    pieces = ['export PATH=%s:$PATH && xvfb-run -a ./bin/plot-gl-set-trees.py' % args.ete_path]
    pieces.append('--plotdir ' + plotdir)
    pieces.append('--plotname ' + plotname)
    pieces.append('--glsfnames ' + ':'.join(glsfnames))
    pieces.append('--glslabels ' + ':'.join(glslabels))
    pieces.append('--region ' + region)

    # optional arguments, in the same order the script has always received them
    if ref_label is not None:
        pieces.append('--ref-label ' + ref_label)
    if title is not None:
        pieces.append('--title="%s"' % title)
    if title_color is not None:
        pieces.append('--title-color %s' % title_color)
    if legends is not None:
        quoted_legends = ['"%s"' % legend for legend in legends]
        pieces.append('--legends=' + ':'.join(quoted_legends))
    if legend_title is not None:
        pieces.append('--legend-title="%s"' % legend_title)
    if pie_chart_faces:
        pieces.append('--pie-chart-faces')
    if param_dirs is not None:
        pieces.append('--param-dirs %s' % ':'.join(param_dirs))

    pieces.append('--locus ' + locus)
    if args.plotcache:
        pieces.append('--use-cache')
    if args.only_print:
        pieces.append('--only-print')

    utils.simplerun(' '.join(pieces), shell=True, debug=args.dryrun, dryrun=args.dryrun)
def run_other_method(args, method):
    """Run the ./test/<method>-run.py wrapper script for 'tigger' or 'igdiscover'.

    Returns immediately if the method's output already exists. Otherwise converts
    the simulation csv to fasta and runs the wrapper on it (honoring
    <args.dry_run>).
    """
    assert method in ['tigger', 'igdiscover']  # really just to make it easier to search for this fcn
    outfname = get_outfname(args, method)
    if utils.output_exists(args, outfname):
        return

    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    pieces = ['./test/%s-run.py' % method]
    pieces.append('--infname ' + simfasta)
    pieces.append('--outfname ' + outfname)
    if args.overwrite:
        pieces.append('--overwrite')
    if args.gls_gen:
        pieces.append('--gls-gen')
        # the partis methods have this as the default internally, but we want/have to set it explicitly here
        pieces.append('--glfo-dir ' + partis_dir + '/data/germlines/human')
    else:
        pieces.append('--glfo-dir ' + args.inf_glfo_dir)
    if method != 'igdiscover':
        # for now we're saving all the igdiscover output/intermediate files, so it doesn't get a work dir
        pieces.append('--workdir ' + args.workdir + '/' + method)
    pieces.append('--n-procs ' + str(args.n_procs))

    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the inferred germline set so that it ends up at <outfname>.

    Steps:
      - return immediately if <outfname> already exists
      - prepare <outdir> (prepare_igdiscover_outdir())
      - if --n-random-queries is set, write (or reuse) a subsampled fasta in <outdir> and use that as input
      - write and run a shell script (<outdir>/run.sh) that activates the conda env and runs igdiscover
      - convert igdiscover's final database fasta to a glfo, and write it to the germline dir implied by <outfname>

    Raises AssertionError if <outfname> isn't the path that glutils would use for the
    germline dir that we work out from it.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print(' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print(' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # --glfo-dir is required now, so there's no default to fall back to
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Remove the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() treats its argument as a *set of characters*, so it can also eat
    # the end of the parent dir name (e.g. '.../weigh/igh' --> '.../we'); strip the suffix explicitly instead.
    locus_suffix = '/' + args.locus
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def run_partis(infname, outfname):
    """Annotate <infname> with partis (smith-waterman only), writing presto-style output to <outfname>.

    The aligned germline sequences for all regions are first merged into one
    temporary fasta in <args.workdir> (keeping only the first entry for any
    duplicated sequence); the file is removed again after partis has run.
    Does nothing if <outfname> already exists.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # deduplicate before passing to partis (keyed by seq so it's easy to check for duplicates)
    name_by_seq = {}
    for region in utils.regions:
        for seqfo in utils.read_fastx(get_glfname(region, aligned=True)):
            if seqfo['seq'] not in name_by_seq:
                name_by_seq[seqfo['seq']] = '|'.join(seqfo['infostrs'])

    aligned_germline_fname = args.workdir + '/all-aligned-gl-seqs.fa'
    with open(aligned_germline_fname, 'w') as merged_file:
        for seq, gene in name_by_seq.items():
            merged_file.write('>%s\n%s\n' % (gene, seq))

    pieces = ['./bin/partis cache-parameters']
    pieces.append('--infname ' + infname)
    pieces.append('--leave-default-germline')
    pieces.append('--presto-output --only-smith-waterman')
    pieces.append('--outfname ' + outfname)
    if args.glfo_dir is not None:
        pieces.append('--initial-germline-dir ' + args.glfo_dir)
    pieces.append('--aligned-germline-fname ' + aligned_germline_fname)
    pieces.append('--n-procs ' + str(args.n_procs))
    utils.simplerun(' '.join(pieces), print_time='partis annotation')

    os.remove(aligned_germline_fname)
def run_igdiscover(infname, outfname, outdir):
    """Write an executable shell script <outdir>/run.sh that runs igdiscover, then run it.

    Does nothing if <outfname> already exists.

    NOTE(review): the <infname> parameter is not used -- the script is handed
    <args.infname> instead. Left as-is since callers may rely on it; confirm
    whether that's intentional.
    """
    if utils.output_exists(args, outfname):
        return

    cmds = [
        '#!/bin/bash',
        'export PATH=%s:$PATH' % args.condapath,
        'export PYTHONNOUSERSITE=True',  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
        'cd %s' % outdir,
        'igdiscover init --db db --single-reads %s work' % args.infname,  # prepares to run, putting files into <outdir>
        'cp %s work/' % os.path.basename(args.yamlfname),
        'cd work',
        'igdiscover run',
    ]

    cmdfname = outdir + '/run.sh'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write(''.join(cmd + '\n' for cmd in cmds))
    subprocess.check_call(['chmod', '+x', cmdfname])

    # (an unused <cmdfos> list used to be built here; it was never passed to anything, so it's been removed)
    utils.simplerun(cmdfname, shell=True, print_time='igdiscover')
def run_other_method(args, method):
    """Run the ./test/<tool>-run.py wrapper script for 'tigger-default', 'tigger-tuned', or 'igdiscover'.

    <tool> is the part of <method> before the first '-'. Requires that
    --n-max-queries isn't set, and returns immediately if the method's output
    already exists. Otherwise converts the simulation csv to fasta and runs the
    wrapper on it (honoring <args.dry_run>).
    """
    assert method in ['tigger-default', 'tigger-tuned', 'igdiscover']  # really just to make it easier to search for this fcn
    assert args.n_max_queries is None
    outfname = get_outfname(args, method)
    if utils.output_exists(args, outfname):
        return

    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    pieces = ['./test/%s-run.py' % method.split('-')[0]]
    if method == 'tigger-tuned':
        pieces.append('--tuned-tigger-params')
    pieces.append('--infname ' + simfasta)
    pieces.append('--outfname ' + outfname)
    if args.species != 'human':
        pieces.append('--species %s' % args.species)
    if args.overwrite:
        pieces.append('--overwrite')
    if args.gls_gen:
        pieces.append('--gls-gen')
        # the partis methods have this as the default internally, but we want/have to set it explicitly here
        pieces.append('--glfo-dir ' + partis_dir + '/' + args.default_germline_dir)
    else:
        pieces.append('--glfo-dir ' + args.inf_glfo_dir)
    # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
    pieces.append('--simulation-germline-dir ' + args.outdir + '/germlines/simulation')
    if method != 'igdiscover':
        # for now we're saving all the igdiscover output/intermediate files, so it doesn't get a work dir
        pieces.append('--workdir ' + args.workdir + '/' + method)
    pieces.append('--n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append('--slurm')

    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
def run_changeo(infname, igblast_outfname, outfname):
    """Run changeo's MakeDb.py on the igblast output <igblast_outfname> for the sequences in <infname>.

    The aligned germline files for all regions are passed as references.
    <outfname> is only used to check whether the output already exists (in
    which case nothing is run).
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    reference_str = ' '.join(get_glfname(region, aligned=True) for region in utils.regions)
    makedb_args = ' -i %s -s %s -r %s --regions --scores' % (igblast_outfname, infname, reference_str)
    utils.simplerun(args.changeo_path + '/bin/MakeDb.py igblast' + makedb_args, print_time='changeo')
def partition():
    """Cache parameters on the simulation file for <args.stype>, then partition it (both with a single process).

    Parameters go in <infdir>/params, plots in <infdir>/plots, and the partition
    in <infdir>/partition.yaml.
    """
    n_procs = 1
    sim_file = simfname(args.stype)
    inf_dir = infdir(args.stype)

    cache_pieces = [
        './bin/partis cache-parameters',
        '--infname %s' % sim_file,
        '--parameter-dir %s/params' % inf_dir,
        '--n-procs %d' % n_procs,
        '--seed %d' % args.seed,
    ]
    utils.simplerun(' '.join(cache_pieces), debug=True)

    partition_pieces = [
        './bin/partis partition --n-final-clusters 1 --write-additional-cluster-annotations 0:5',
        '--lb-tau %f' % args.lb_tau,
        '--is-simu --get-tree-metrics',
        '--infname %s' % sim_file,
        '--parameter-dir %s/params' % inf_dir,
        '--plotdir %s' % (inf_dir + '/plots'),
        '--n-procs %d' % n_procs,
        '--outfname %s/partition.yaml' % inf_dir,
        '--seed %d' % args.seed,
    ]
    utils.simplerun(' '.join(partition_pieces), debug=True)
def cache_parameters():
    """Run partis cache-parameters on the simulation file, unless the hmm files in the parameter dir already exist."""
    hmm_dir = param_dir() + '/hmm/hmms'
    if utils.output_exists(args, hmm_dir, outlabel='parameters', offset=4):
        return

    pieces = [
        './bin/partis cache-parameters',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--n-procs %d' % args.n_procs,
        '--seed %d' % args.seed,
    ]
    utils.simplerun(' '.join(pieces), debug=True)
def run_partis_parameter_cache(args, method):
    """Run partis cache-parameters (smith-waterman only) on the simulation file for <method>.

    <method> must be 'partis' (allele finding, with debug output and simulation
    info) or 'full' (leave the default germline set untouched); anything else
    hits an assertion. Parameters are written to <args.outdir>/<method> and plots
    to its plots/ subdir. Any old sw cache files (and their glfo dirs) in the
    parameter dir are removed first. Does nothing if the output already exists.

    NOTE(review): this reads <args.dryrun>, whereas the other functions here that
    take <args> read <args.dry_run> -- confirm which attribute the arg parser
    actually defines.
    """
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = paramdir + '/plots'

    # remove any old sw cache files
    for cachefname in glob.glob(paramdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, args.locus)

    # generate germline set and cache parameters
    pieces = [args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman']
    pieces.append('--initial-germline-dir %s' % args.default_germline_dir)
    if method == 'partis':
        pieces.append('--debug-allele-finding')
        # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
        pieces.append('--is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation')
        if args.allele_cluster:
            pieces.append('--allele-cluster')
        if args.kmeans_allele_cluster:
            pieces.append('--kmeans-allele-cluster')
    elif method == 'full':
        pieces.append('--leave-default-germline')
    else:
        assert False

    if args.species != 'human':
        pieces.append('--species %s' % args.species)
    pieces.append('--n-procs ' + str(args.n_procs))
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        pieces.append('--n-max-queries ' + str(args.n_max_queries))
    if args.slurm:
        pieces.append('--batch-system slurm')
    if not args.gls_gen:
        # otherwise it uses the default (full) germline dir (this is a second --initial-germline-dir on the command line)
        pieces.append('--initial-germline-dir ' + args.inf_glfo_dir)
    pieces.append('--parameter-dir ' + paramdir)
    pieces.append('--plotdir ' + plotdir)
    if args.seed is not None:
        pieces.append('--seed ' + str(args.seed))
    if args.plot_and_fit_absolutely_everything is not None:
        pieces.append('--plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything))

    utils.simplerun(' '.join(pieces), dryrun=args.dryrun)
def make_gls_tree_plot(args, plotdir, plotname, glsfnames, glslabels):
    """Run ./bin/plot-gl-set-trees.py on the germline sets in <glsfnames> (labeled by <glslabels>).

    The script is run as a shell subprocess (under xvfb-run, with <args.ete_path>
    prepended to PATH) since ete3 requires its own python version.
    """
    pieces = [
        'export PATH=%s:$PATH && xvfb-run -a ./bin/plot-gl-set-trees.py' % args.ete_path,
        '--plotdir ' + plotdir,
        '--plotname ' + plotname,
        '--glsfnames ' + ':'.join(glsfnames),
        '--glslabels ' + ':'.join(glslabels),
    ]
    if args.plotcache:
        pieces.append('--use-cache')
    utils.simplerun(' '.join(pieces), shell=True)
def rearrange():
    """Simulate <args.n_sim_events> (essentially) unmutated single-leaf rearrangement events into naive_fname().

    Does nothing if the output file already exists.
    """
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return

    pieces = [
        # the mutation multiplier is tiny rather than zero since it tends to get in an infinite loop if you actually pass 0.
        './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves',
        '--debug %d' % int(args.debug),
        '--seed %d' % args.seed,
        '--outfname %s' % naive_fname(),
        '--n-sim-events %d' % args.n_sim_events,
    ]
    utils.simplerun(' '.join(pieces), debug=True)
def cache_parameters():
    """Run partis cache-parameters (with indels forbidden) on the simulation file.

    Does nothing if the hmm files in the parameter dir already exist. --n-procs,
    the slurm batch system, and --n-max-queries are only passed along when set.
    """
    if utils.output_exists(args, param_dir() + '/hmm/hmms', outlabel='parameters', offset=4):
        return

    # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
    pieces = [
        './bin/partis cache-parameters',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--seed %d' % args.seed,
        '--no-indels',
    ]
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append('--n-max-queries %d' % args.n_max_queries)
    utils.simplerun(' '.join(pieces), debug=True)
def partition():
    """Partition the simulation file into a single final cluster, with tree metrics and plots.

    Does nothing if the partition file already exists. --lb-tau is only passed
    if <args.lb_tau> is set.
    """
    if utils.output_exists(args, partition_fname(), outlabel='partition', offset=4):
        return

    pieces = [
        './bin/partis partition --n-final-clusters 1 --write-additional-cluster-annotations 0:5 --is-simu --get-tree-metrics',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--plotdir %s' % (infdir() + '/plots'),
        '--n-procs %d' % args.n_procs,
        '--outfname %s' % partition_fname(),
        '--seed %d' % args.seed,
    ]
    if args.lb_tau is not None:
        pieces.append('--lb-tau %f' % args.lb_tau)
    utils.simplerun(' '.join(pieces), debug=True)
def rearrange():
    """Simulate <args.n_sim_events> (essentially) unmutated single-leaf rearrangement events into naive_fname().

    Does nothing if the output file already exists. Optionally restricts the
    simulation to one V and one J gene, and only passes --n-procs and the slurm
    batch system along when they're set.
    """
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return

    pieces = [
        # the mutation multiplier is tiny rather than zero since it tends to get in an infinite loop if you actually pass 0.
        './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves',
        '--debug %d' % int(args.debug),
        '--seed %d' % args.seed,
        '--outfname %s' % naive_fname(),
        '--n-sim-events %d' % args.n_sim_events,
    ]
    if args.restrict_available_genes:
        pieces.append('--only-genes IGHV1-18*01:IGHJ1*01')
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    utils.simplerun(' '.join(pieces), debug=True)
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the inferred germline set so that it ends up at <outfname>.

    Steps:
      - return immediately if <outfname> already exists
      - prepare <outdir> (prepare_igdiscover_outdir())
      - if --n-random-queries is set, write (or reuse) a subsampled fasta in <outdir> and use that as input
      - write and run a bash script (<outdir>/run.sh) that puts conda in PATH and runs igdiscover
      - convert igdiscover's final database fasta to a glfo (using --glfo-dir, or else data/germlines/human, as
        the template), and write it to the germline dir implied by <outfname>

    Raises AssertionError if <outfname> isn't the path that glutils would use for the
    germline dir that we work out from it.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print(' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print(' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Remove the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() treats its argument as a *set of characters*, so it can also eat
    # the end of the parent dir name (e.g. '.../weigh/igh' --> '.../we'); strip the suffix explicitly instead.
    locus_suffix = '/' + args.locus
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def partition():
    """Partition the simulation file with --simultaneous-true-clonal-seqs.

    Does nothing if the partition file already exists. Tree metrics (and their
    plots) are calculated unless <args.dont_get_tree_metrics> is set, and --lb-tau
    is only passed if <args.lb_tau> is set.
    """
    if utils.output_exists(args, partition_fname(), outlabel='partition', offset=4):
        return

    # (--write-additional-cluster-annotations 0:5 used to be set here, but there probably wasn't really a good reason for it)
    pieces = [
        './bin/partis partition --simultaneous-true-clonal-seqs --is-simu',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--n-procs %d' % args.n_procs,
        '--outfname %s' % partition_fname(),
        '--seed %d' % args.seed,
    ]
    if not args.dont_get_tree_metrics:
        pieces.append('--get-tree-metrics --plotdir %s' % (infdir() + '/plots'))
    if args.lb_tau is not None:
        pieces.append('--lb-tau %f' % args.lb_tau)
    utils.simplerun(' '.join(pieces), debug=True)
def run_single_test(args, baseoutdir, val, n_events, method):
    """Run one test for the scan given by <args.action>, with scan value <val> and <n_events> events.

    Starts from get_base_cmd() and adds the action-specific arguments. How <val>
    is used depends on the action: a mutation multiplier ('mfreq'), a number of
    snps ('nsnp'), a list of them ('multi-nsnp'), the new allele's prevalence
    ('prevalence'), a number of leaves ('n-leaves'), or a dict with colon-separated
    'snp' and 'indel' strings ('alcluster'). Unknown actions hit an assertion.
    The command is run with utils.simplerun(), honoring <args.dry_run>.
    """
    cmd = get_base_cmd(args, n_events, method)
    outdir = get_outdir(args, baseoutdir, args.action, val, n_events=n_events)
    sim_v_genes = [args.v_genes[0]]
    nsnpstr = '1'
    nindelstr = ''

    action = args.action
    if action == 'mfreq':
        cmd += ' --mut-mult ' + str(val)
    elif action == 'nsnp':
        nsnpstr = str(val)
    elif action == 'multi-nsnp':
        nsnpstr = ':'.join(str(n) for n in val)
        sim_v_genes = sim_v_genes * len(val)
    elif action == 'prevalence':
        # i.e. previously-known allele has 1 - p, and new allele has p
        cmd += ' --allele-prevalence-freqs ' + str(1. - val) + ':' + str(val)
    elif action in ('n-leaves', 'weibull'):
        # NOTE default of 1 leaf (for other tests) is set in test-allele-finding.py
        n_leaves_str = str(val) if action == 'n-leaves' else '5'
        cmd += ' --n-leaves ' + n_leaves_str
        cmd += ' --n-leaf-distribution geometric'
        # i.e. we simulate <n_events> rearrangement events, but then only use <n_events> sequences for inference
        cmd += ' --n-max-queries ' + str(n_events)
    elif action == 'alcluster':
        nsnpstr = val['snp']
        nindelstr = val['indel']
        sim_v_genes = sim_v_genes * len(val['snp'].split(':'))
    elif action == 'gls-gen':
        nsnpstr = '1:1:2:3:100:100'
        nindelstr = '0:0:0:0:3:3'
        cmd += ' --gls-gen'
    else:
        assert False

    if action != 'gls-gen':
        cmd += ' --sim-v-genes ' + ':'.join(sim_v_genes)
    if '--nosim' not in cmd:
        for flag, liststr in ((' --nsnp-list ', nsnpstr), (' --nindel-list ', nindelstr)):
            if liststr != '':
                cmd += flag + liststr
    cmd += ' --outdir ' + outdir

    utils.simplerun(cmd, dryrun=args.dry_run)
def rearrange():
    """Simulate <args.n_sim_events> (essentially) unmutated single-leaf rearrangement events into naive_fname().

    Does nothing if the output file already exists. Multiple processes are only
    used when the number of events is an exact multiple of the number of processes.
    """
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return

    pieces = [
        # the mutation multiplier is tiny rather than zero since it tends to get in an infinite loop if you actually pass 0.
        './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves',
        '--debug %d' % int(args.debug),
        '--seed %d' % args.seed,
        '--outfname %s' % naive_fname(),
        '--n-sim-events %d' % args.n_sim_events,
    ]
    # if --n-sim-events isn't divisible by --n-procs, partis simulate doesn't give you exactly the number you asked for
    evenly_divisible = args.n_sim_events % args.n_procs == 0 if args.n_procs > 1 else False
    if evenly_divisible:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    utils.simplerun(' '.join(pieces), debug=True)
def simulate(args):
    """Build the germline set to simulate from, write it to <args.outdir>/germlines/simulation, and run partis simulate.

    With --gls-gen a whole germline set is generated from the default germline
    dir; otherwise we start from <args.sim_v_genes> + <args.dj_genes> and add the
    new alleles in <args.new_allele_info>. Allele prevalence frequencies (if any)
    are written to a csv in <args.workdir> and handed to the simulation.

    Does nothing if <args.simfname> already exists. Raises Exception if
    --allele-prevalence-freqs isn't normalized, or isn't the same length as the
    V gene set.
    """
    if utils.output_exists(args, args.simfname):
        return

    sim_gldir = args.outdir + '/germlines/simulation'
    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    pieces = [args.partis_path + ' simulate']
    pieces.append('--n-sim-events ' + str(args.n_sim_events))
    pieces.append('--outfname ' + args.simfname)
    pieces.append('--n-leaves ' + str(args.n_leaves))
    pieces.append('--rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters')
    if args.n_leaf_distribution is None:
        pieces.append('--constant-number-of-leaves')
    else:
        pieces.append('--n-leaf-distribution ' + args.n_leaf_distribution)
    if args.mut_mult is not None:
        pieces.append('--mutation-multiplier ' + str(args.mut_mult))
    if args.root_mrca_weibull_parameter is not None:
        pieces.append('--root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter))
    pieces.append('--n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append('--batch-system slurm --subsimproc')

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname,
                                      new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        pieces.append('--allele-prevalence-fname ' + allele_prevalence_fname)
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        # NOTE template gene removal is the default for glutils.generate_germline_set
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))

        freqs = args.allele_prevalence_freqs
        if freqs is not None:
            if not utils.is_normed(freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % freqs)
            n_v_genes = len(sglfo['seqs']['v'])
            if len(freqs) != n_v_genes:  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(freqs), n_v_genes))
            if len(added_snp_names) == 0:
                gene_list = sorted(sglfo['seqs']['v'])
            else:
                gene_list = list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v' : dict(zip(gene_list, freqs)), 'd' : {}, 'j' : {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            pieces.append('--allele-prevalence-fname ' + allele_prevalence_fname)

    glutils.write_glfo(sim_gldir, sglfo)
    pieces.append('--initial-germline-dir ' + sim_gldir)

    # run simulation
    if args.seed is not None:
        pieces.append('--seed ' + str(args.seed))
    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
def run_tigger(infname, outfname, outdir):
    """Run tigger (in R) on the annotations in <infname>, and write the resulting germline set so it ends up at <outfname>.

    Steps:
      - return immediately if <outfname> already exists
      - write an R script to <args.workdir>/tigger-in.cmd that finds novel alleles, infers the genotype, and
        writes the genotype sequences to <outdir>/tigger.fasta, then run it
      - read the template glfo (--glfo-dir, or else data/germlines/human), add any alleles from the tigger fasta
        that aren't in it, and remove any genes (for <args.region>) that tigger didn't report
      - write the result to the germline dir implied by <outfname>, and remove the R script

    Raises AssertionError if <outfname> isn't the path that glutils would use for the
    germline dir that we work out from it.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n %s\n %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))

    # remove them afterwards so we can use existing ones to get codon info
    # NOTE iterate over a snapshot of the genes: remove_gene() presumably deletes from the dict that we'd
    # otherwise be iterating over, which raises a RuntimeError
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # Remove the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() treats its argument as a *set of characters*, so it can also eat
    # the end of the parent dir name (e.g. '.../weigh/igh' --> '.../we'); strip the suffix explicitly instead.
    locus_suffix = '/' + args.locus
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
def run_igblast(infname, outfname):
    """Run igblastn (from within <args.igbdir>) on <infname>, writing its output to <outfname>.

    Uses the human V/D/J germline databases and auxiliary file that are expected
    to sit in <args.igbdir>; --glfo-dir is *not* used (a warning is printed if
    it's set). Does nothing if <outfname> already exists.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print('%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color('red', 'warning'))

    cmd = './igblastn'
    # NOTE the D database used to be set to human_gl_V (presumably a copy/paste slip), which meant D calls were made against V genes
    cmd += ' -germline_db_V human_gl_V -germline_db_D human_gl_D -germline_db_J human_gl_J'
    cmd += ' -auxiliary_data optional_file/human_gl.aux'
    cmd += ' -domain_system imgt -ig_seqtype Ig -organism human -outfmt \'7 std qseq sseq btop\''
    cmd += ' -num_threads %d' % args.n_procs
    cmd += ' -query ' + infname + ' -out ' + outfname

    # igblastn is run from its own dir (the db and aux file paths above are relative to it)
    cmd = 'cd %s; %s' % (args.igbdir, cmd)
    utils.simplerun(cmd, shell=True, print_time='igblast')
def cache_parameters():
    """Run partis cache-parameters (with indels forbidden) on the mutated simulation.

    With --paired-loci the input and output are passed as paired dirs, otherwise
    as --infname and --parameter-dir. Does nothing if the parameter output
    already exists. Honors <args.dry_run>.
    """
    if utils.output_exists(args, ifname('params'), outlabel='parameters', offset=4):
        return

    # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
    pieces = ['./bin/partis cache-parameters --seed %d --no-indels' % args.seed]
    if args.paired_loci:
        io_fmt = '--paired-loci --paired-indir %s --paired-outdir %s'
    else:
        io_fmt = '--infname %s --parameter-dir %s'
    pieces.append(io_fmt % (spath('mutated'), ipath('params')))
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append('--n-max-queries %d' % args.n_max_queries)
    utils.simplerun(' '.join(pieces), debug=True, dryrun=args.dry_run)
def multiple_tests(args):
    """Re-run this script's command line once for each test index from <args.iteststart> up to <args.n_tests>.

    Each sub-command is the current sys.argv with --n-tests and --iteststart
    removed, --outdir set to <args.outdir>/<iproc>, and --seed set to
    <args.seed> + <iproc>. With --dry-run the commands are only passed to
    utils.simplerun() with dryrun=True; otherwise any existing log file in a
    test's log dir is first moved aside (to log.<N>, for the first unused N) so
    that it isn't overwritten, and then the commands are handed to utils.run_cmds().
    """
    def getlogdir(iproc):
        logdir = args.outdir + '/' + str(iproc) + '/logs'
        if args.plot_annotation_performance:
            logdir += '/annotation-performance-plots'
        return logdir + '/' + '-'.join(args.methods)

    def cmd_str(iproc):
        clist = copy.deepcopy(sys.argv)
        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
        utils.remove_from_arglist(clist, '--iteststart', has_arg=True)
        utils.replace_in_arglist(clist, '--outdir', args.outdir + '/' + str(iproc))
        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
        # clist.append('--slurm')
        return ' '.join(clist)

    # (a loop that did nothing except repeatedly redefine an unused nested function used to be here)
    test_indices = range(args.iteststart, args.n_tests)
    cmdfos = [{
        'cmd_str': cmd_str(iproc),
        'workdir': args.workdir + '/' + str(iproc),
        'logdir': getlogdir(iproc),
        'outfname': args.outdir + '/' + str(iproc),
    } for iproc in test_indices]

    if args.dry_run:
        for cmdfo in cmdfos:
            utils.simplerun(cmdfo['cmd_str'], dryrun=True)
        return

    # don't overwrite old log files... need to eventually fix this so it isn't necessary
    for iproc in test_indices:
        logd = getlogdir(iproc)
        if os.path.exists(logd + '/log'):
            ilog = 0
            while os.path.exists(logd + '/log.' + str(ilog)):
                ilog += 1
            check_call(['mv', '-v', logd + '/log', logd + '/log.' + str(ilog)])

    print(' look for logs in %s' % args.outdir)
    utils.run_cmds(cmdfos, debug='write')
def update_igdiscover():
    """Update the dev version of igdiscover: git pull in <partis_dir>/packages/IgDiscover, then print its version.

    Sets <args.env_label> to 'igdiscover-dev' as a side effect, creates the
    packages dir if it doesn't exist, and runs the commands via a script written
    to /tmp/tmprun.sh.

    For reference, these are the commands that aren't currently run:
      non-dev version (args.env_label = 'igdiscover'):
        install:
          conda config --add channels defaults
          conda config --add channels conda-forge
          conda config --add channels bioconda
          conda create -n <env_label> igdiscover
          conda activate <env_label>
        update:
          conda activate <env_label>
          igdiscover --version
          conda update igdiscover
          igdiscover --version
      dev version install (run from the packages dir):
          git clone https://github.com/NBISweden/IgDiscover.git
          cd IgDiscover
          conda env create -n <env_label> -f environment.yml
          source activate <env_label>
          python3 -m pip install -e .
          igdiscover --version
    """
    cmds = getpathcmd()

    # dev version:
    args.env_label = 'igdiscover-dev'
    install_dir = partis_dir + '/packages'
    if not os.path.exists(install_dir):
        os.makedirs(install_dir)

    # update dev version:
    cmds.extend([
        'cd %s' % install_dir,
        'cd IgDiscover',
        'git pull',
        'source activate %s' % args.env_label,
        'igdiscover --version',
    ])
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
def read_input_tree_file(self, outfname):
    """Copy the --input-simulation-treefname file to <outfname> and read one tree per non-empty line from it.

    Each tree's leaves are renamed to t1, t2, etc., its root edge length is set
    to zero if it's unset, and it's rescaled to a depth chosen by
    self.choose_full_sequence_branch_length(). If the number of trees read
    differs from --n-trees, <self.args.n_trees> is reset to match.

    Returns the tuple (ages, treestrs): the chosen depths, and the corresponding
    rescaled newick strings.
    Raises Exception if any of the chosen depths is greater than 1.
    """
    input_fname = self.args.input_simulation_treefname
    if self.args.debug:
        print(' reading trees from %s' % input_fname)
    utils.simplerun('cp %s %s' % (input_fname, outfname), debug=False)

    ages, treestrs = [], []
    with open(outfname) as treefile:
        for line in treefile:
            tstr = line.strip()
            if tstr == '':  # skip empty lines
                continue
            dtree = treeutils.get_dendro_tree(treestr=tstr, suppress_internal_node_taxa=True)
            if dtree.seed_node.edge_length is None:  # make sure root edge length is set (otherwise bppseqgen barfs)
                dtree.seed_node.edge_length = 0.

            # rename the leaves to t1, t2, etc. (it would be nice to not have to do this, but a bunch of stuff in recombinator uses this to check that e.g. bppseqgen didn't screw up the ordering)
            old_new_label_pairs = []
            for ileaf, leaf in enumerate(dtree.leaf_node_iter()):
                old_new_label_pairs.append((leaf.taxon.label, 't%d' % (ileaf + 1)))
            treeutils.translate_labels(dtree, old_new_label_pairs)

            age = self.choose_full_sequence_branch_length()
            if self.args.debug > 1:  # it's easier to keep this debug line separate up here than make a tmp variable to keep track of the old height
                print(' input tree %d (rescaled depth %.3f --> %.3f):' % (len(ages), treeutils.get_mean_leaf_height(tree=dtree), age))
            # I think this gets rescaled again for each event, so we could probably in principle avoid this rescaling, but if the input depth is greater than one stuff starts breaking, so may as well do it now
            treeutils.rescale_tree(age, dtree=dtree)
            ages.append(age)
            treestrs.append(dtree.as_string(schema='newick').strip())
            if self.args.debug > 1:
                print(utils.pad_lines(treeutils.get_ascii_tree(dtree)))

    if any(a > 1. for a in ages):
        raise Exception('tree depths must be less than 1., but trees read from %s don\'t satisfy this: %s' % (self.args.input_simulation_treefname, ages))

    n_read = len(ages)
    if n_read != self.args.n_trees:
        print(' resetting --n-trees from %d to %d to match trees read from %s' % (self.args.n_trees, n_read, self.args.input_simulation_treefname))
        self.args.n_trees = n_read

    return ages, treestrs
def run_data(args, baseoutdir, study, dset, method):
    """Run ./datascripts/run.py cache-parameters on data set <dset> of <study>, using <method>.

    Methods other than 'partis' are passed along with --other-method;
    --n-random-queries is only allowed for 'partis'. <baseoutdir> isn't used
    here. Honors <args.dry_run>.
    """
    assert args.label is not None  # it's got a default now, so it shouldn't anymore be None

    pieces = [
        './datascripts/run.py cache-parameters',
        '--study ' + study,
        '--dsets ' + dset,
        '--extra-str gls-gen-paper-' + args.label,
    ]
    if args.no_slurm:
        pieces.append('--no-slurm')
    pieces.append('--n-procs ' + str(args.n_procs_per_test))
    if args.n_random_queries is not None:
        assert method == 'partis'  # I don't think it works for any others a.t.m.
        pieces.append('--n-random-queries ' + str(args.n_random_queries))
    if args.check:
        pieces.append('--check')
    if method != 'partis':
        pieces.append('--other-method ' + method)

    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
def run_performance_plot(args, method):
    """Make annotation performance plots for the germline set that <method> inferred.

    Runs partis cache-parameters (smith-waterman only, without writing
    parameters) on the simulation file, with <method>'s inferred germline dir as
    the initial germline set. Does nothing if the plot dir already exists.
    Honors <args.dry_run>.
    """
    perf_outdir = get_outfname(args, method, annotation_performance_plots=True)
    if utils.output_exists(args, perf_outdir):
        return

    pieces = [args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --plot-annotation-performance']
    pieces.append('--is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation')
    # i.e. use the inferred glfo from <method>
    pieces.append('--initial-germline-dir ' + get_outfname(args, method, return_parent_gl_dir=True))
    pieces.append('--parameter-dir ' + perf_outdir + '/dummy-parameter-dir')
    pieces.append('--only-overall-plots --plotdir ' + perf_outdir)
    # i.e. we really want to annotate, not cache parameters, but then it'd look for a parameter dir
    pieces.append('--only-smith-waterman --leave-default-germline --dont-write-parameters')
    pieces.append('--n-procs ' + str(args.n_procs))
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        pieces.append('--n-max-queries ' + str(args.n_max_queries))
    if args.slurm:
        pieces.append('--batch-system slurm')
    if args.seed is not None:
        pieces.append('--seed ' + str(args.seed))

    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
def run_bcr_phylo(naive_line, outdir, ievent):
    """Run bcr-phylo's simulator.py for event <ievent>, starting from the naive sequence in <naive_line>.

    Output goes to <outdir>/<args.extrastr>* (the dir is created if necessary),
    and the random seed is <args.seed> + <ievent>. A random temporary dir is
    created for TMPDIR, and removed again afterwards. Only the 'selection'
    simulation type is currently supported ('neutral' hits an assertion), unless
    --run-help is set, in which case --help is passed instead.
    """
    tmpdir = utils.choose_random_subdir('/tmp/%s' % os.getenv('USER'))  # this is I think just for xvfb-run
    os.makedirs(tmpdir)
    prof_cmds = ''  # '-m cProfile -s tottime -o prof.out'

    pieces = ['export TMPDIR=%s && export PATH=%s:$PATH && xvfb-run -a python %s %s/bin/simulator.py' % (tmpdir, ete_path, prof_cmds, bcr_phylo_path)]
    if args.run_help:
        pieces.append('--help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        pieces.append('--lambda %f --lambda0 %f' % (1.5, 0.365))
        pieces.append('--n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        pieces.append('--selection')
        pieces.append('--lambda %f' % args.branching_parameter)
        pieces.append('--lambda0 %f' % args.base_mutation_rate)
        pieces.append('--obs_times %s' % ' '.join('%d' % t for t in args.obs_times))
        pieces.append('--n_to_sample %d' % args.n_sim_seqs_per_generation)
        pieces.append('--metric_for_target_dist %s' % args.metric_for_target_distance)
        pieces.append('--target_dist %d' % args.target_distance)
        pieces.append('--target_count %d' % args.target_count)
        pieces.append('--carry_cap %d' % args.carry_cap)
        pieces.append('--observe_common_ancestors')
        # not currently used:
        #   --n_target_clusters 1
        #   --target_cluster_distance 1
        #   --observe_based_on_affinity  (implementation in bcr-phylo needs some work)
    else:
        assert False

    pieces.append('--debug 1')
    pieces.append('--no_context')
    pieces.append('--no_plot')
    pieces.append('--outbase %s/%s' % (outdir, args.extrastr))
    pieces.append('--naive_seq %s' % naive_line['naive_seq'])
    pieces.append('--random_seed %d' % (args.seed + ievent))

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    utils.simplerun(' '.join(pieces), shell=True, extra_str=' ', debug=True)

    os.rmdir(tmpdir)
def rearrange():
    """Simulate <args.n_sim_events> (essentially) unmutated single-leaf rearrangement events.

    With --paired-loci the output goes to a paired output dir, otherwise to a
    single output file (both from spath('naive')). Gene restriction isn't allowed
    together with --paired-loci. Does nothing if the (igh) naive file already
    exists. Honors <args.dry_run>.
    """
    # just look for the merged igh file, since it's about the last to be written (and both paired subdirs may not be there)
    if utils.output_exists(args, naive_fname('igh'), outlabel='naive simu', offset=4):
        return

    pieces = [
        # the mutation multiplier is tiny rather than zero since it tends to get in an infinite loop if you actually pass 0.
        './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves',
        '--debug %d' % int(args.debug),
        '--seed %d' % args.seed,
        '--n-sim-events %d' % args.n_sim_events,
    ]
    if args.paired_loci:
        pieces.append('--paired-loci --paired-outdir %s' % spath('naive'))
    else:
        pieces.append('--outfname %s' % spath('naive'))
    if args.restrict_available_genes:
        assert not args.paired_loci
        pieces.append('--only-genes IGHV1-18*01:IGHJ1*01')
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    utils.simplerun(' '.join(pieces), dryrun=args.dry_run, debug=True)