# Shared imports for the utilities below. Each `main()` is the entry point of
# a separate script; `inputparser` and `common` are Pairtree's own modules.
import argparse
import json
import random
import sys

import numpy as np

import common
import inputparser


# Convert a PhyloWGS-format dataset (plus a hand-built tree) into Pairtree's
# SSM and params formats.
def main():
  parser = argparse.ArgumentParser(
    description='LOL HI THERE',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--discard-garbage', dest='discard_garbage', action='store_true')
  parser.add_argument('--handbuilt', dest='handbuilt_fn', required=True)
  parser.add_argument('pwgs_ssm_fn')
  parser.add_argument('pwgs_params_fn')
  parser.add_argument('pairtree_ssm_fn')
  parser.add_argument('pairtree_params_fn')
  args = parser.parse_args()

  tree_type = 'handbuilt.xeno'
  hb = load_handbuilt(args.handbuilt_fn, tree_type)
  clusters = convert_clusters(hb['clusters'])
  garbage = hb['garbage']
  # Since we remove the empty first cluster, the indexing on `structure` is now
  # a little weird -- cluster `i` is now represented by `i + 1` in `structure`.
  # That's okay.
  adjl = hb['structure']
  parents = convert_adjl_to_parents(adjl)

  pwgs_params = inputparser.load_params(args.pwgs_params_fn)
  variants = load_phylowgs(args.pwgs_ssm_fn)
  if args.discard_garbage:
    remove_garbage(variants, garbage)
    variants, clusters = make_varids_contiguous(variants, garbage, clusters)
    garbage = []

  inputparser.write_ssms(variants, args.pairtree_ssm_fn)
  write_pairtree_params(pwgs_params['samples'], garbage, clusters, parents, args.pairtree_params_fn)
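
# `convert_adjl_to_parents` isn't defined in this excerpt. A minimal sketch of
# what it might do, assuming `adjl` maps each parent node index to a list of
# its children (with node 0 the root) and that the parents vector stores the
# parent of node `i + 1` at position `i` -- both assumptions, since only the
# call site appears above.
def convert_adjl_to_parents_sketch(adjl):
  # JSON object keys arrive as strings, so normalize them to ints.
  children_of = {int(P): C for P, C in adjl.items()}
  max_node = max(max(C) for C in children_of.values())
  parents = np.zeros(max_node, dtype=np.int32)
  for parent, children in children_of.items():
    for child in children:
      # Entry `i` gives the parent of non-root node `i + 1`.
      parents[child - 1] = parent
  return parents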

# Write subsampled copies of a multi-sample dataset, one per requested
# sample-subset size.
def main():
  parser = argparse.ArgumentParser(
    description='LOL HI THERE',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
  )
  parser.add_argument('--counts', required=True)
  parser.add_argument('in_ssm_fn')
  parser.add_argument('in_params_fn')
  parser.add_argument('out_base')
  args = parser.parse_args()

  random.seed(1337)

  # Subset sizes must be distinct, since the output filenames are keyed on
  # subset size alone.
  counts = [int(C) for C in args.counts.split(',')]
  assert len(counts) == len(set(counts))

  ssms = inputparser.load_ssms(args.in_ssm_fn)
  params = inputparser.load_params(args.in_params_fn)
  sampnames = params['samples']

  # Always include diagnosis sample, on assumption we're working with
  # SJbALL022609 from Steph for the paper congraph figure.
  subsets = _select_samp_subsets(sampnames, counts, all_must_include=['D'])

  for subset in subsets:
    idxs = _find_idxs(sampnames, subset)
    new_ssms = _filter_ssms(ssms, idxs)
    new_params = dict(params)
    new_params['samples'] = subset

    out_base = '%s_S%s' % (args.out_base, len(subset))
    inputparser.write_ssms(new_ssms, out_base + '.ssm')
    with open(out_base + '.params.json', 'w') as F:
      json.dump(new_params, F)
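
# Hypothetical sketches of the `_find_idxs` and `_filter_ssms` helpers,
# reconstructed from their call sites above; the real implementations may
# differ.
def _find_idxs_sketch(sampnames, subset):
  # Map each retained sample name back to its column index in the full data.
  return [sampnames.index(S) for S in subset]

def _filter_ssms_sketch(ssms, idxs):
  # Keep only the chosen sample columns in every per-variant array.
  new_ssms = {}
  for vid, V in ssms.items():
    V = dict(V)
    for K in ('var_reads', 'ref_reads', 'total_reads', 'vaf', 'omega_v'):
      V[K] = V[K][idxs]
    new_ssms[vid] = V
  return new_ssms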

# Repair bad omega_v (variant read probability) values in an SSM file and
# report the proportion of variants that needed fixing.
def main():
  parser = argparse.ArgumentParser(
    description='LOL HI THERE',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
  )
  parser.add_argument('in_ssm_fn')
  parser.add_argument('out_ssm_fn')
  args = parser.parse_args()

  np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize, suppress=True)
  np.seterr(divide='raise', invalid='raise', over='raise')

  ssms = inputparser.load_ssms(args.in_ssm_fn)
  # `_fix_omegas` modifies `ssms` in place and returns the proportion fixed.
  fixed_prop = _fix_omegas(ssms, print_bad=False)
  print('fixed_omegas=%s' % fixed_prop)
  inputparser.write_ssms(ssms, args.out_ssm_fn)
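
# `_fix_omegas` isn't defined in this excerpt. One plausible sketch, assuming
# "fixing" means clamping out-of-range variant read probabilities into (0, 1]
# in place -- an assumption, since only the call site appears above:
def _fix_omegas_sketch(ssms, print_bad=False, eps=1e-6):
  fixed = 0
  for vid, V in ssms.items():
    omega = V['omega_v']
    bad = (omega <= 0) | (omega > 1)
    if np.any(bad):
      if print_bad:
        print(vid, omega[bad])
      V['omega_v'] = np.clip(omega, eps, 1.)
      fixed += 1
  return fixed / len(ssms)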

def _process(ssmfn, jsonfn, order):
  params = inputparser.load_params(jsonfn)
  ssms = inputparser.load_ssms(ssmfn)

  order = [int(idx) for idx in order.split(',')]
  N = len(params['samples'])
  assert set(range(N)) == set(order)
  assert len(list(ssms.values())[0]['var_reads']) == N

  params['samples'] = [params['samples'][idx] for idx in order]
  for vid in ssms.keys():
    for K in ('var_reads', 'ref_reads', 'total_reads', 'vaf', 'omega_v'):
      ssms[vid][K] = ssms[vid][K][order]

  with open(jsonfn, 'w') as F:
    json.dump(params, F)
  inputparser.write_ssms(ssms, ssmfn)
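
# Worked example of the fancy-indexed reorder in `_process` above: position i
# of the result takes its value from old index order[i], so permuting the
# sample indices permutes every per-variant array the same way.
def _demo_reorder():
  vals = np.array([10., 20., 30.])
  order = [2, 0, 1]
  assert list(vals[order]) == [30., 10., 20.]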

def _process(ssmfn, jsonfn, to_remove):
  params = inputparser.load_params(jsonfn)
  ssms = inputparser.load_ssms(ssmfn)

  to_remove = set([int(idx) for idx in to_remove.split(',')])
  N = len(params['samples'])
  all_samps = set(range(N))
  assert to_remove.issubset(all_samps)
  to_keep = sorted(all_samps - to_remove)
  assert len(to_keep) > 0

  params['samples'] = [params['samples'][idx] for idx in to_keep]
  for vid in ssms.keys():
    for K in ('var_reads', 'ref_reads', 'total_reads', 'vaf', 'omega_v'):
      ssms[vid][K] = ssms[vid][K][to_keep]

  with open(jsonfn, 'w') as F:
    json.dump(params, F)
  inputparser.write_ssms(ssms, ssmfn)
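
# The two `_process` variants above (reorder samples, remove samples) rewrite
# both input files in place. A hypothetical driver for the sample-removal
# variant, with argument names that are assumptions:
def _process_main_sketch():
  parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('to_remove', help='Comma-separated sample indices to drop')
  parser.add_argument('ssm_fn')
  parser.add_argument('params_fn')
  args = parser.parse_args()
  _process(args.ssm_fn, args.params_fn, args.to_remove)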

# Extract clusters already encoded in an SSM file and write them, sorted by
# VAF, as Pairtree params.
def main():
  parser = argparse.ArgumentParser(
    description='LOL HI THERE',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--remove-small-clusters', action='store_true')
  parser.add_argument('inssm_fn')
  parser.add_argument('outssm_fn')
  parser.add_argument('outparams_fn')
  args = parser.parse_args()

  variants = load_ssms(args.inssm_fn)
  clusters = extract_clusters(variants)
  clusters = sort_clusters_by_vaf(clusters, variants)
  inputparser.write_ssms(variants, args.outssm_fn)

  if args.remove_small_clusters:
    clusters, garbage = remove_small_clusters(clusters)
  else:
    garbage = []
  sampnames = make_sampnames(variants)
  write_params(clusters, garbage, sampnames, args.outparams_fn)
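
# `remove_small_clusters` isn't shown. A minimal sketch under the assumption
# that it moves clusters below a fixed size threshold into the garbage list;
# the threshold value here is a guess.
def remove_small_clusters_sketch(clusters, min_size=2):
  kept, garbage = [], []
  for cluster in clusters:
    if len(cluster) >= min_size:
      kept.append(cluster)
    else:
      # Variants from discarded clusters become garbage mutations.
      garbage += list(cluster)
  return kept, garbage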

def main():
  parser = argparse.ArgumentParser(
    description='Find variants with likely incorrect var_read_prob by comparing the model with the provided var_read_prob to a haploid (LOH) model using Bayes factors',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--logbf-threshold', type=float, default=10.,
    help='Logarithm of the Bayes factor threshold at which the haploid model is accepted as the more likely model than the one using the provided var_read_prob')
  parser.add_argument('--verbose', action='store_true', help='Print debugging messages')
  parser.add_argument('--ignore-existing-garbage', action='store_true',
    help='Ignore any existing garbage variants listed in in_params_fn and test all variants. If not specified, any existing garbage variants will be kept as garbage and not tested again.')
  parser.add_argument('--action', choices=('add_to_garbage', 'modify_var_read_prob'), default='add_to_garbage')
  parser.add_argument('--var-read-prob-alt', type=float, default=1.)
  parser.add_argument('in_ssm_fn', help='Input SSM file with mutations')
  parser.add_argument('in_params_fn', help='Input params file listing sample names and any existing garbage mutations')
  parser.add_argument('out_ssm_fn', help='Output SSM file with modified list of garbage mutations')
  parser.add_argument('out_params_fn', help='Output params file with modified list of garbage mutations')
  args = parser.parse_args()

  np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize, suppress=True)
  np.seterr(divide='raise', invalid='raise', over='raise')

  if args.ignore_existing_garbage:
    variants, params = inputparser.load_ssms_and_params(args.in_ssm_fn, args.in_params_fn, remove_garb=False)
    params['garbage'] = []
  else:
    variants, params = inputparser.load_ssms_and_params(args.in_ssm_fn, args.in_params_fn)

  bad_vids, bad_samp_prop = _remove_bad(variants, args.logbf_threshold, args.var_read_prob_alt, args.verbose)
  bad_ssm_prop = len(bad_vids) / len(variants)

  if args.action == 'add_to_garbage':
    params['garbage'] = common.sort_vids(set(bad_vids) | set(params['garbage']))
  elif args.action == 'modify_var_read_prob':
    for vid in bad_vids:
      variants[vid]['omega_v'][:] = args.var_read_prob_alt
  else:
    raise Exception('Unknown action: %s' % args.action)

  inputparser.write_ssms(variants, args.out_ssm_fn)
  with open(args.out_params_fn, 'w') as F:
    json.dump(params, F)

  stats = {
    'num_bad_ssms': len(bad_vids),
    'bad_ssms': common.sort_vids(bad_vids),
    'bad_samp_prop': '%.3f' % bad_samp_prop,
    'bad_ssm_prop': '%.3f' % bad_ssm_prop,
  }
  for K, V in stats.items():
    print('%s=%s' % (K, V))
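
# `_remove_bad` isn't shown. A sketch of the core comparison it presumably
# performs, assuming a binomial read-count model: for each variant, compare
# the likelihood of the observed variant reads under the provided omega_v
# against the alternative (by default haploid, omega = 1) var_read_prob, and
# flag the variant when the log Bayes factor clears the threshold in any
# sample. Using point estimates of the subclone proportion rather than
# integrating over it makes this only an approximation to a true Bayes
# factor; the real implementation may differ.
import scipy.stats

def _remove_bad_sketch(variants, logbf_threshold, omega_alt, verbose=False):
  bad_vids = []
  bad_samps = 0
  total_samps = 0
  for vid, V in variants.items():
    T, X = V['total_reads'], V['var_reads']
    # Point estimates of the subclone proportion under each model, capped at 1.
    phi_orig = np.minimum(1., V['vaf'] / V['omega_v'])
    phi_alt = np.minimum(1., V['vaf'] / omega_alt)
    logp_orig = scipy.stats.binom.logpmf(X, T, phi_orig * V['omega_v'])
    logp_alt = scipy.stats.binom.logpmf(X, T, phi_alt * omega_alt)
    logbf = logp_alt - logp_orig
    bad = logbf > logbf_threshold
    total_samps += len(T)
    if np.any(bad):
      bad_vids.append(vid)
      bad_samps += int(np.sum(bad))
      if verbose:
        print(vid, logbf)
  return bad_vids, bad_samps / total_samps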