def callback(self, args):
    """Dispatch a benchmark action selected by ``args.mode``.

    Modes: ``pair*`` (pairwise 2-signature-combination benchmarks),
    ``multiple*`` (multi-signature benchmarks), ``aggregate`` (collect
    results). Unknown modes print usage and exit with status 1.
    """
    if args.mode.startswith("pair"):
        # Run the pairwise benchmark once per requested signature-set size.
        for i in args.signatures:
            i = int(i)
            W, signature_names = read_signatures(i)
            if args.mode.endswith('gen'):
                gen_benchmark_2combinations(args.root, signature_names, W)
            elif args.mode.endswith('run'):
                run_benchmark_2combinations(args.root, i, signature_names, W, force=True)
            elif args.mode.endswith('run_ds'):
                run_benchmark_2combinations_deconstruct_sigs(args.root, i, signature_names, W, force=True)
    elif args.mode.startswith('multiple'):
        if args.mode.endswith('gen'):
            pass
        elif args.mode.endswith('run'):
            # NOTE(review): i, signature_names and W are only bound inside the
            # "pair" branch above; reaching this call directly would raise
            # NameError — confirm the intended arguments for multiple*run.
            multiple_benchmark_run(i, signature_names, W, force=True)
        elif args.mode.endswith('run_ds'):
            pass
        # NOTE(review): these run for every 'multiple*' mode regardless of the
        # gen/run/run_ds suffix — confirm that is intentional.
        multiple_benchmark()
        aggregate_multiple_benchmarks()
    elif args.mode == 'aggregate':
        aggregate_benchmarks(args.root)
    else:
        # Unrecognized mode: report, show usage, and exit non-zero.
        print("Unknown benchmark action mode")
        self.parser.print_usage()
        sys.exit(1)
def __init__(self, profile, sig_set, method='MLEZ', others_threshold=0, bootstrap=True, dummy_sigs=True, global_optimization=None):
    """Set up and immediately run a signature decomposition of ``profile``.

    Arguments:
    `profile`: Profile to decompose. Must have length 96.
    `sig_set`: signature set to use for the decomposition (5, 10, 30 or 49).
    `method`: solver method; must be a key of IDENTIFY_MIN_FUNCTIONS
        (matched case-insensitively).
    `others_threshold`: minimum threshold for acceptable results.
    `bootstrap`: Use the bootstrap to calculate confidence intervals.
    `dummy_sigs`: Account for unexplained variance (non-context dependent
        mutational processes and unknown signatures).
    `global_optimization`: stored and presumably consumed by ``_main`` to
        enable a global-optimization solver pass — TODO confirm semantics.
    """
    # NOTE: validation uses assert, which is stripped under `python -O`.
    assert len(
        profile) == 96, "Invalid sample. Must be vector of length 96"
    assert sig_set in [5, 10, 30, 49], "Invalid sig_set choice. Must be 5,10,30 or 49"
    assert method.lower(
    ) in IDENTIFY_MIN_FUNCTIONS, "Unknown method provided"
    self.profile = profile
    self.sig_set = sig_set
    self.method = method
    self.bootstrap = bootstrap
    self.enable_dummy = dummy_sigs
    self.global_optimization = global_optimization
    self.others_threshold = others_threshold
    # Signature matrix and matching signature labels for the chosen set.
    self.W_and_labels = read_signatures(self.sig_set)
    # The decomposition runs as a side effect of construction.
    self._main()
def multiple_benchmark_helper(j): dirname = "data/benchmark/multiple" # for i in [5, 10, 30]: for i in [ 30, ]: W, signature_names = read_signatures(i) N = W.shape[1] # r = random.randrange(2, i // 3 + 2) r = random.randrange(2, min(i + 1, 15)) # print(np.random.choice(N, r), .05 + np.random.dirichlet(np.ones(r), 1)) while True: h0 = np.zeros(N) h0[np.random.choice( N, r)] = 0.05 + np.random.dirichlet(np.ones(r), 1) if np.greater(h0, 0.05).sum() == r: break h0 /= h0.sum() v0 = W.dot(h0) # print(h0) n_mutations = random.randrange(10, 50) v0_counts = np.random.multinomial(n_mutations, v0 / v0.sum()) # print(v0_counts) random_name = str(uuid.uuid4())[:4] fname = dirname + "/{:02d}_{}_{}_{}".format(i, r, n_mutations, random_name) print(fname) profile_fname = fname + ".profile" info_fname = fname + ".info" mle_info = fname + ".MLE.info" mlez_info = fname + ".MLEZ.info" ds_info = fname + ".ds.info" write_profile(profile_fname, v0_counts) write_decomposition(info_fname, h0, signature_names) ################################################## results = deconstruct_sigs_custom(profile_fname, signatures=i) write_decomposition(ds_info, results, signature_names) ################################################## profile = read_profile_file(profile_fname) for method, method_fname in [("MLE", mle_info), ("MLEZ", mlez_info)]: _, _, results = decompose_mutational_profile_counts( profile, (W, signature_names), method, debug=False, others_threshold=0.0) write_decomposition(method_fname, results, signature_names)
def _gen(self):
    """Generate a synthetic mutational profile from reference signatures.

    Loads the reference signature set ``self.ref_sig``; uses either the
    predefined signatures and exposures (``self.predef_sig_names`` /
    ``self.predef_h``, when set on the instance) or ``self.complexity``
    randomly chosen signatures with random normalized exposures. Scales the
    resulting profile to ``self.N_mut`` mutations, adds uniform integer
    noise in [-self.noise, self.noise], clamps negatives to zero, records
    generation metadata in ``self.info``, and forwards the profile to the
    parent-class initializer.
    """
    W_og, signature_names = read_signatures(self.ref_sig)
    self.ref_w_labels = (W_og, signature_names)
    # Pair each signature column with its label.
    sig_dict = [{'w': w, 'name': name} for w, name in zip(W_og.T, signature_names)]
    if hasattr(self, 'predef_sig_names'):  # idiomatic replacement for `'x' in dir(self)`
        sig_names = self.predef_sig_names
        sample_sigs = [item for item in sig_dict if item['name'] in sig_names]
        assert len(sample_sigs) == len(
            sig_names), "sig_names provided were not found"
        h = self.predef_h
    else:
        sample_sigs = np.random.choice(
            sig_dict, self.complexity, replace=False)  # synthetic sample sigs
        sig_names = [sig["name"] for sig in sample_sigs]
        h = np.random.rand(self.complexity)  # init exposure
        h = h / h.sum()  # normalize exposures
    W = np.array([sig["w"] for sig in sample_sigs]).T  # sample specific W
    v = W.dot(h)  # calculate mutational profile
    v = np.rint(
        v * (self.N_mut / v.sum()))  # make v the desired number of mutations
    # FIX: np.random.random_integers was deprecated in NumPy 1.11 and removed
    # in NumPy 1.25. randint's upper bound is exclusive, so add 1 to keep the
    # original inclusive [-noise, noise] range.
    v_noise = np.random.randint(-self.noise, self.noise + 1, 96)  # generate noise
    v += v_noise  # add noise to v
    v[v < 0] = 0  # make any negative counts 0
    info = {
        'sig': {
            'group': self.ref_sig,
            'used': sig_names
        },
        'W': W,
        'h': h,
        'noise': v_noise,
        'total_mut': v.sum()
    }
    self.info = info
    super().__init__(v)
def identify(self, args):
    """CLI handler: decompose mutational profiles from TCGI/MAF/VCF input
    into signature exposures and write the results to ``args.outfile``."""
    # --- early validation: warn and return on missing/invalid arguments ---
    if not args.infile:
        logger.warning("Provide input file in VCF or MAF format (-i) and a corresponding genome assembly (-g)")
        return
    if not args.genome:
        logger.warning(genome_error_message)
        return
    if not args.signatures:
        logger.warning("Set of signatures required. Use 5 and 10 for MUTAGENE-5 and MUTAGENE-10. Use 30 for COSMIC-30")
        return
    if args.method.lower() not in IDENTIFY_MIN_FUNCTIONS:
        logger.warning('Unknown method provided')
        return
    method = args.method
    # mutations, processing_stats = read_VCF_profile(args.infile, asm=args.genome)
    # mutations, processing_stats = read_auto_profile(args.infile, fmt=args.input_format, asm=args.genome)
    W, signature_names = read_signatures(int(args.signatures))
    # NOTE(review): the TCGI and MAF branches below are duplicates except for
    # the reader function, and TCGI/MAF use independent `if`s while VCF is
    # chained with `elif` to MAF — consider consolidating.
    if args.input_format == 'TCGI':
        mutations, mutations_with_context, processing_stats = read_TCGI_with_context_window(args.infile, args.genome, window_size=1)
        samples_profiles = get_multisample_mutational_profile(mutations, counts=True)
        samples_results = {}
        # Decompose each sample's profile independently.
        for sample, profile in samples_profiles.items():
            _, _, results = decompose_mutational_profile_counts(
                profile,
                (W, signature_names),
                method,
                others_threshold=0.0)
            samples_results[sample] = results
        write_multisample_decomposition(args.outfile, samples_results, signature_names)
    if args.input_format == 'MAF':
        mutations, mutations_with_context, processing_stats = read_MAF_with_context_window(args.infile, args.genome, window_size=1)
        samples_profiles = get_multisample_mutational_profile(mutations, counts=True)
        samples_results = {}
        for sample, profile in samples_profiles.items():
            _, _, results = decompose_mutational_profile_counts(
                profile,
                (W, signature_names),
                method,
                others_threshold=0.0)
            samples_results[sample] = results
        write_multisample_decomposition(args.outfile, samples_results, signature_names)
    elif args.input_format == 'VCF':
        # VCF input is treated as a single sample.
        mutations, processing_stats = read_auto_profile(args.infile, fmt=args.input_format, asm=args.genome)
        profile = get_mutational_profile(mutations, counts=True)
        if not args.bootstrap:
            # Single decomposition of the observed profile.
            _, _, results = decompose_mutational_profile_counts(
                profile,
                (W, signature_names),
                method,
                others_threshold=0.0,
                enable_dummy=args.no_unexplained_variance)
            write_decomposition(args.outfile, results, signature_names, 'VCF')
        else:
            # Bootstrap: decompose 100 resampled profiles and write them all.
            bootstrap_results = []
            for resampled_profile in generate_resampled_profiles(profile, 100):
                _, _, results = decompose_mutational_profile_counts(
                    resampled_profile,
                    (W, signature_names),
                    method,
                    others_threshold=0.0,
                    enable_dummy=args.no_unexplained_variance)
                bootstrap_results.append(results)
            write_bootstrap_decomposition(args.outfile, bootstrap_results, signature_names, 'VCF')
def identify(self, args):
    """CLI handler: decompose per-sample mutational profiles into signature
    exposures, optionally estimating confidence via bootstrap resampling."""
    # --- early validation: warn and return on missing/invalid arguments ---
    if not args.infile:
        logger.warning("Provide input file in VCF or MAF format (-i) and a corresponding genome assembly (-g)")
        return
    if not args.genome:
        logger.warning(genome_error_message)
        return
    if not args.signatures:
        logger.warning("Set of signatures required. Use 5 and 10 for MUTAGENE-5 and MUTAGENE-10. Use 30 for COSMIC-30")
        return
    if args.method.lower() not in IDENTIFY_MIN_FUNCTIONS:
        logger.warning('Unknown method provided')
        return
    if args.bootstrap_replicates < 10:
        logger.warning("Number of bootstrap replicates too low. Specify at least 10 replicates")
        return
    if args.bootstrap_confidence_level < 70:
        logger.warning("Specify confidence level of at least 70% and less than 99%")
        return
    # Optional comma-separated whitelist of signatures to keep.
    only = None
    if args.keep_only is not None:
        only = args.keep_only.split(",")
        if len(only) < 1:
            logger.warning("List of signatures to keep for the analysis is empty")
            return
        logger.warning("We will only analyze signatures in this list: {}".format(", ".join(only)))
    if 'input_format' not in args:
        # guess format from file name
        name = args.infile.name.upper()
        if name.endswith("MAF"):
            args.input_format = "MAF"
        elif name.endswith("VCF"):
            args.input_format = "VCF"
        else:
            logger.warning("Input format was not specified. Assuming it is MAF")
            args.input_format = "MAF"
    W, signature_names = read_signatures(args.signatures, only=only)
    try:
        mutations, _, processing_stats = read_mutations(args.input_format, args.infile, args.genome, window_size=1)
    except Exception as e:
        # Surface a friendly parse-failure message; only re-raise in debug.
        e_message = getattr(e, 'message', repr(e))
        logger.warning(
            "Parsing {0} failed. "
            "Check that the input file is in {0} format "
            "or specify a different format using option -f \n"
            "{1}".format(args.input_format, e_message))
        if logger.root.level == logging.DEBUG:
            raise
        return
    samples_profiles = get_multisample_mutational_profile(mutations, counts=True)
    # Point estimate for every sample (also reported in the bootstrap case).
    samples_results = {}
    for sample, profile in samples_profiles.items():
        _, _, results = decompose_mutational_profile_counts(
            profile,
            (W, signature_names),
            args.method,
            others_threshold=0.0,
            enable_dummy=args.no_unexplained_variance)
        samples_results[sample] = results
    if not args.bootstrap:
        write_decomposition(args.outfile, samples_results, signature_names, mutations_threshold=args.mutations_threshold)
    else:
        # For each sample, decompose args.bootstrap_replicates resampled
        # profiles to characterize the variability of the exposures.
        bootstrap_samples_results = {}
        for sample, profile in samples_profiles.items():
            bootstrap_results = []
            for resampled_profile in tqdm(generate_resampled_profiles(profile, args.bootstrap_replicates), total=args.bootstrap_replicates):
                _, _, results = decompose_mutational_profile_counts(
                    resampled_profile,
                    (W, signature_names),
                    args.method,
                    others_threshold=0.0,
                    enable_dummy=args.no_unexplained_variance)
                bootstrap_results.append(results)
            bootstrap_samples_results[sample] = bootstrap_results
        write_decomposition(
            args.outfile,
            samples_results,
            signature_names,
            mutations_threshold=args.mutations_threshold,
            bootstrap_method=args.bootstrap_method,
            profile=samples_profiles,
            bootstrap_results=bootstrap_samples_results,
            bootstrap_level=args.bootstrap_confidence_level)
def aggregate_multiple_benchmarks():
    """Aggregate per-sample benchmark decompositions into a metrics TSV.

    Scans data/benchmark/multiple/*.profile, compares each solver's
    decomposition (per the ``methods`` suffix map) with the ground-truth
    exposures, and writes RMSE, log-likelihood, and binary-classification
    metrics (precision/recall/accuracy/F1) per file and method to
    data/benchmark/multiple/res1.txt.
    """
    # Map of method label -> result-file suffix produced by the benchmark runs.
    methods = {
        "mle": ".MLE.info",
        "mlez": ".MLEZ.info",
        "ds": ".ds.info",
        'aicc': '.AICc.info',
        'bic': '.BIC.info',
        'aiccz': '.AICcz.info',
        'bicz': '.BICz.info',
    }
    # signatures_thresholds = {
    #     5: 0.06,
    #     10: 0.03,
    #     30: 0.01,
    # }
    # NOTE(review): signatures_thresholds is built but never read below —
    # the threshold is hard-coded to 0.06 instead. Confirm which is intended.
    signatures_thresholds = {
        5: 0.06,
        10: 0.06,
        30: 0.06,
    }
    # signatures_thresholds = {
    #     5: 0.0001,
    #     10: 0.0001,
    #     30: 0.0001,
    # }
    # only report the signature 2 value (as in DeconstructSigs benchmark)
    with open("data/benchmark/multiple/res1.txt", 'w') as o:
        o.write(
            "file_id\tsigtype\tnsig\tnmut\tmethod\tSRMSE\tPRMSE\tSTRMSE\tLLIK\tLLIK0\tTLLIK\tTLLIK0\tprecision\trecall\taccuracy\tf1\n"
        )
        for fname in glob.glob("data/benchmark/multiple/*.profile", recursive=True):
            # File names encode sigtype_r_nmut_replica (see the generator).
            # NOTE(review): splitting on "/" assumes POSIX path separators.
            file_id = fname.split("/")[-1].split(".")[0]
            sigtype, r, nmut, replica = fname.split("/")[-1].split(
                ".")[0].split("_")
            sigtype = int(sigtype)
            if sigtype != 30:
                continue  # only aggregate the 30-signature benchmark
            W, signature_names = read_signatures(sigtype)
            info_fname = fname.split(".")[0] + '.info'
            orig_profile = read_profile_file(fname)
            h0, names = read_decomposition(info_fname)  # ground-truth exposures
            # threshold = 0.06
            threshold = 0.06
            # threshold = 1.0 / np.sqrt(int(nmut)) if method != "ds" else 0.06
            h0_threshold = np.where(h0 > threshold, h0, 0.0)  # zero below threshold
            h0_binary = np.array(
                h0_threshold) > 0.0  # true / false for threshold
            nsig = np.count_nonzero(h0_binary)
            # Diagnostics when thresholding changes the active-signature count.
            if nsig < int(r):
                print("LESS", sigtype, nsig, r)
            if nsig > int(r):
                print("MORE", sigtype, nsig, r)
            if nsig <= 1:
                continue
            if nsig > 10:
                continue
            for method in methods:
                method_fname = fname.split(".")[0] + methods[method]
                values, names = read_decomposition(method_fname)
                # print(method_fname)
                if values is None:
                    continue  # skip methods without results for this file
                h = np.array(values)
                if h.sum() == 0:
                    continue  # degenerate all-zero decomposition
                h_threshold = np.where(h > threshold, h, 0.0)  # zero below threshold
                reconstructed_profile = W.dot(h)
                # print(h)
                # print(reconstructed_profile)
                # Profile RMSE between normalized original and reconstruction.
                PRMSE = np.sqrt(
                    mean_squared_error(
                        np.array(orig_profile) / np.array(orig_profile).sum(),
                        np.array(reconstructed_profile) / np.array(reconstructed_profile).sum()))
                SRMSE = np.sqrt(mean_squared_error(h0, h))  # exposure RMSE
                STRMSE = np.sqrt(mean_squared_error(h0_threshold, h_threshold))  # thresholded exposure RMSE
                # Log-likelihoods of true/estimated exposures (raw and thresholded).
                LLIK0 = -NegLogLik(h0, W, orig_profile)
                TLLIK0 = -NegLogLik(h0_threshold, W, orig_profile)
                LLIK = -NegLogLik(h, W, orig_profile)
                TLLIK = -NegLogLik(h_threshold, W, orig_profile)
                # print(h0.sum())
                # print(h.sum())
                h_binary = np.array(
                    h_threshold) > 0.0  # true / false for threshold
                # Signature detection treated as binary classification.
                precision = precision_score(h0_binary, h_binary)
                recall = recall_score(h0_binary, h_binary)
                accuracy = accuracy_score(h0_binary, h_binary)
                f1 = f1_score(h0_binary, h_binary)
                o.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
                    .format(file_id, sigtype, nsig, nmut, method, SRMSE, PRMSE,
                            STRMSE, LLIK, LLIK0, TLLIK, TLLIK0, precision,
                            recall, accuracy, f1))