def batch_multi_dendrix(args):
    """Run Multi-Dendrix once per (t, k_max) parameter pair on one data set.

    The mutation data is loaded a single time, using the white/blacklist
    files and the cutoff given in *args*, and is reused for every run.

    **Args:**
      * **args** - object exposing the attributes read here:
        ``patient_whitelist``, ``patient_blacklist``, ``gene_whitelist``,
        ``gene_blacklist``, ``mutation_matrix``, ``cutoff``, ``verbose``,
        ``min_num_gene_sets``, ``max_num_gene_sets``, ``min_gene_set_size``,
        ``max_gene_set_size``, ``alpha``, ``delta`` and ``lmbda``.

    **Returns:**
      A tuple containing the following:
        * **collections** (*dictionary*) - mapping of t -> k_max -> the
          output of ``Multi.ILP`` for that pair, transposed with
          ``zip(*...)``.
        * **mutation_data** (*tuple*) - mutation data tuple (see
          :func:`multi_dendrix.multi_dendrix` for details).
        * **runtime** (*float*) - seconds elapsed between entering this
          function and returning from it; loading the mutation data is
          included in the measurement.
    """
    from time import time as now

    # The clock starts before the data is loaded.
    started = now()

    # Load the mutation data shared by every run
    gene2include, sample2include = Multi.white_and_blacklisting(
        args.patient_whitelist, args.patient_blacklist,
        args.gene_whitelist, args.gene_blacklist)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)

    # Only the two counts are used below; the full six-way unpack is kept so
    # that a tuple of unexpected length still fails at this point.
    num_genes, num_patients, _genes, _patients, _m2p, _p2m = mutation_data
    if args.verbose:
        print("* Mutation data: %s genes x %s patients"
              % (num_genes, num_patients))

    # Inclusive parameter ranges: number of gene sets (t) and the maximum
    # gene set size (k_max). The minimum size is fixed for every run.
    num_sets_range = range(args.min_num_gene_sets, args.max_num_gene_sets + 1)
    max_size_range = range(args.min_gene_set_size, args.max_gene_set_size + 1)

    collections = {}
    for t in num_sets_range:
        # Every t gets an entry, even if the size range turns out empty.
        by_max_size = collections[t] = {}
        for k_max in max_size_range:
            collection_w_weights = Multi.ILP(
                mutation_data, t, args.min_gene_set_size, k_max,
                args.alpha, args.delta, args.lmbda, verbose=args.verbose)
            by_max_size[k_max] = zip(*collection_w_weights)

    return collections, mutation_data, now() - started
def run(): """ This script performs the matrix permutation test on a collection of gene sets. It gives an example of how one could use various functions from different modules in order to evaluate the collection without obtaining the collection from the Multi-Dendrix pipeline. The program first loads the GBM(2008) mutation data, permutes it the given number of times, and then performs the matrix permutation test. The default parameters permute the GBM(2008) data *ten times*. On my machine, this script runs in 41.2 seconds. """ # Load required modules import sys, os sys.path.insert(1, os.path.abspath('../../')) import multi_dendrix.permute.mutation_data as P from multi_dendrix.evaluate.matrix import matrix_permutation_test # Default parameters for this tutorial (CHANGE AT YOUR OWN RISK) COLLECTION = [["CDKN2B", "RB1", "CDK4"], ["CDKN2A", "TP53", "DTX3"]] MUTATION_MATRIX = "../mutation_data/GBM_2008/GBM_2008.m2" GENE_BLACKLIST = "../mutation_data/fishy_genes.glst" CUTOFF = 2 ALPHA = 1.0 NUM_PERMUTATIONS = 10 T, K_MIN, K_MAX, OVERLAPS, SETS_PER_GENE = 2, 3, 3, 0, 1 # Load the mutation data import multi_dendrix as Multi gene2include, patient2include = Multi.white_and_blacklisting( None, None, None, GENE_BLACKLIST) mutation_data = Multi.load_mutation_data_w_cutoff(MUTATION_MATRIX, patient2include, gene2include, CUTOFF) m, n, genes, patients, mutation2patients, patient2mutations = mutation_data # Permute the mutation data G = P.construct_mutation_graph(mutation2patients, patient2mutations) Hs = [ P.permute_mutation_data(G.copy(), genes, patients) for i in range(NUM_PERMUTATIONS) ] # Calculate the p-value and output the results W_prime = sum( [Multi.W(mutation2patients, module, ALPHA) for module in COLLECTION]) print "W' of collection:", W_prime print 'P=', matrix_permutation_test(W_prime, Hs, T, K_MIN, K_MAX, ALPHA, OVERLAPS, SETS_PER_GENE)
def run(): """ This script performs the matrix permutation test on a collection of gene sets. It gives an example of how one could use various functions from different modules in order to evaluate the collection without obtaining the collection from the Multi-Dendrix pipeline. The program first loads the GBM(2008) mutation data, permutes it the given number of times, and then performs the matrix permutation test. The default parameters permute the GBM(2008) data *ten times*. On my machine, this script runs in 41.2 seconds. """ # Load required modules import sys, os sys.path.insert(1, os.path.abspath('../../')) import multi_dendrix.permute.mutation_data as P from multi_dendrix.evaluate.matrix import matrix_permutation_test # Default parameters for this tutorial (CHANGE AT YOUR OWN RISK) COLLECTION = [["CDKN2B", "RB1", "CDK4"], ["CDKN2A", "TP53", "DTX3"]] MUTATION_MATRIX="../mutation_data/GBM_2008/GBM_2008.m2" GENE_BLACKLIST="../mutation_data/fishy_genes.glst" CUTOFF=2 ALPHA=1.0 NUM_PERMUTATIONS=10 T, K_MIN, K_MAX, OVERLAPS, SETS_PER_GENE = 2, 3, 3, 0, 1 # Load the mutation data import multi_dendrix as Multi gene2include, patient2include = Multi.white_and_blacklisting(None, None, None, GENE_BLACKLIST) mutation_data = Multi.load_mutation_data_w_cutoff(MUTATION_MATRIX, patient2include, gene2include, CUTOFF) m, n, genes, patients, mutation2patients, patient2mutations = mutation_data # Permute the mutation data G = P.construct_mutation_graph(mutation2patients, patient2mutations) Hs = [ P.permute_mutation_data(G.copy(), genes, patients) for i in range(NUM_PERMUTATIONS) ] # Calculate the p-value and output the results W_prime = sum([ Multi.W(mutation2patients, module, ALPHA) for module in COLLECTION]) print "W' of collection:", W_prime print 'P=', matrix_permutation_test(W_prime, Hs, T, K_MIN, K_MAX, ALPHA, OVERLAPS, SETS_PER_GENE)
def batch_multi_dendrix(args):
    """Run Multi-Dendrix once per (t, k_max) parameter pair on one data set.

    The mutation data is loaded a single time, using the white/blacklist
    files and the cutoff given in *args*, and is reused for every run.

    **Args:**
      * **args** - object exposing the attributes read here:
        ``patient_whitelist``, ``patient_blacklist``, ``gene_whitelist``,
        ``gene_blacklist``, ``mutation_matrix``, ``cutoff``, ``verbose``,
        ``min_num_gene_sets``, ``max_num_gene_sets``, ``min_gene_set_size``,
        ``max_gene_set_size``, ``alpha``, ``delta`` and ``lmbda``.

    **Returns:**
      A tuple containing the following:
        * **collections** (*dictionary*) - mapping of t -> k_max -> the
          output of ``Multi.ILP`` for that pair, transposed with
          ``zip(*...)``.
        * **mutation_data** (*tuple*) - mutation data tuple (see
          :func:`multi_dendrix.multi_dendrix` for details).
        * **runtime** (*float*) - seconds elapsed between entering this
          function and returning from it; loading the mutation data is
          included in the measurement.
    """
    from time import time as now

    # The clock starts before the data is loaded.
    started = now()

    # Load the mutation data shared by every run
    gene2include, sample2include = Multi.white_and_blacklisting(
        args.patient_whitelist, args.patient_blacklist,
        args.gene_whitelist, args.gene_blacklist)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)

    # Only the two counts are used below; the full six-way unpack is kept so
    # that a tuple of unexpected length still fails at this point.
    num_genes, num_patients, _genes, _patients, _m2p, _p2m = mutation_data
    if args.verbose:
        print("* Mutation data: %s genes x %s patients"
              % (num_genes, num_patients))

    # Inclusive parameter ranges: number of gene sets (t) and the maximum
    # gene set size (k_max). The minimum size is fixed for every run.
    num_sets_range = range(args.min_num_gene_sets, args.max_num_gene_sets + 1)
    max_size_range = range(args.min_gene_set_size, args.max_gene_set_size + 1)

    collections = {}
    for t in num_sets_range:
        # Every t gets an entry, even if the size range turns out empty.
        by_max_size = collections[t] = {}
        for k_max in max_size_range:
            collection_w_weights = Multi.ILP(
                mutation_data, t, args.min_gene_set_size, k_max,
                args.alpha, args.delta, args.lmbda, verbose=args.verbose)
            by_max_size[k_max] = zip(*collection_w_weights)

    return collections, mutation_data, now() - started
def run(args):
    """Run the MCMC algorithm on the input mutation data and write a table.

    Steps, in order:

      1. Load the mutation data with ``Multi.load_mutation_data_w_cutoff``,
         using the white/blacklists and cutoff from *args*.
      2. Convert it with ``convert_mutation_data`` and run ``mcmc``.
      3. Collate the sampled solutions and sort them by sampling frequency,
         highest first.
      4. If ``args.num_permutations > 0``, run the matrix permutation test.
      5. Write a tab-separated table to ``args.output_file``.

    Output layout: one header line starting with ``#``, then one line per
    gene set. The first gene set of each collection carries the sampling
    frequency, the weight W' and (when permutations were requested) the
    P-value; the remaining gene sets of that collection leave those cells
    blank. Every line has the same number of columns as the header. The file
    is written without a trailing newline.

    **Args:**
      * **args** - object exposing the attributes read here:
        ``sample_whitelist``, ``sample_blacklist``, ``gene_whitelist``,
        ``gene_blacklist``, ``mutation_matrix``, ``cutoff``,
        ``gene_set_size``, ``num_gene_sets``, ``num_iterations``,
        ``step_length``, ``num_permutations``, ``permuted_matrices_dir``,
        ``parallel``, ``quiet`` and ``output_file``.

    **Returns:**
      Nothing; the results are written to ``args.output_file``.
    """
    # Parse args into shorter variable handles
    k, t = args.gene_set_size, args.num_gene_sets
    N, s = args.num_iterations, args.step_length
    npermutations = args.num_permutations
    with_pvals = npermutations > 0

    # Load mutation data
    print("* Loading mutation data...")
    gene2include, sample2include = Multi.white_and_blacklisting(
        args.sample_whitelist, args.sample_blacklist,
        args.gene_whitelist, args.gene_blacklist)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)
    # Only the two leading counts of the tuple are needed here.
    m, n = mutation_data[:2]

    # Convert the mutation data to integers for easy C parsing
    _, index2gene, gene2case_index = convert_mutation_data(mutation_data)

    # Run the MCMC algorithm; the last argument is 1 unless --quiet was given
    print("* Running MCMC algorithm...")
    solns = mcmc(k, t, m, n, gene2case_index, N, s, int(not args.quiet))

    # Collate the results and sort them descending by sampling frequency
    solns_w_weights = convert_solns(index2gene, solns)
    results = collate_results(solns_w_weights)

    # Flatten the results into [collection, freq, W', Ws] records
    print("* Flattening the results...")
    flat_results = [[S, data["freq"], data["W_prime"], data["Ws"]]
                    for S, data in results.items()]
    flat_results.sort(key=lambda record: record[1], reverse=True)

    # Perform the permutation test (if necessary)
    permute_pval = None
    if with_pvals:
        permute_pval = run_matrix_permutation_test(
            npermutations, args.permuted_matrices_dir, mutation_data,
            k, t, N, s, flat_results, args.parallel)

    # Output results
    print("* Outputting results...")
    if with_pvals:
        output = ["#Sampling Frequency\tWeight W'\tP-value ({} permutations)"
                  "\tGene set\tWeight W".format(npermutations)]
    else:
        output = ["#Sampling Frequency\tWeight W'\tGene set\tWeight W"]

    for S, freq, W_prime, Ws in flat_results:
        for i, M in enumerate(S):
            is_first = i == 0
            # Collection-level values appear only on a collection's first
            # line; later lines get blank cells so columns stay aligned
            # with the header whether or not a P-value column exists.
            row = [freq if is_first else "", W_prime if is_first else ""]
            if with_pvals:
                row.append(permute_pval[S] if is_first else "")
            row.append(", ".join(sorted(M)))
            row.append(Ws[i])
            output.append("\t".join(map(str, row)))

    # Context manager guarantees the file is flushed and closed.
    with open(args.output_file, "w") as out_file:
        out_file.write("\n".join(output))
def run(args):
    """Run the MCMC algorithm on the input mutation data and write a table.

    Steps, in order:

      1. Load the mutation data with ``Multi.load_mutation_data_w_cutoff``,
         using the white/blacklists and cutoff from *args*.
      2. Convert it with ``convert_mutation_data`` and run ``mcmc``.
      3. Collate the sampled solutions and sort them by sampling frequency,
         highest first.
      4. If ``args.num_permutations > 0``, run the matrix permutation test.
      5. Write a tab-separated table to ``args.output_file``.

    Output layout: one header line starting with ``#``, then one line per
    gene set. The first gene set of each collection carries the sampling
    frequency, the weight W' and (when permutations were requested) the
    P-value; the remaining gene sets of that collection leave those cells
    blank. Every line has the same number of columns as the header. The file
    is written without a trailing newline.

    **Args:**
      * **args** - object exposing the attributes read here:
        ``sample_whitelist``, ``sample_blacklist``, ``gene_whitelist``,
        ``gene_blacklist``, ``mutation_matrix``, ``cutoff``,
        ``gene_set_size``, ``num_gene_sets``, ``num_iterations``,
        ``step_length``, ``num_permutations``, ``permuted_matrices_dir``,
        ``parallel``, ``quiet`` and ``output_file``.

    **Returns:**
      Nothing; the results are written to ``args.output_file``.
    """
    # Parse args into shorter variable handles
    k, t = args.gene_set_size, args.num_gene_sets
    N, s = args.num_iterations, args.step_length
    npermutations = args.num_permutations
    with_pvals = npermutations > 0

    # Load mutation data
    print("* Loading mutation data...")
    gene2include, sample2include = Multi.white_and_blacklisting(
        args.sample_whitelist, args.sample_blacklist,
        args.gene_whitelist, args.gene_blacklist)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)
    # Only the two leading counts of the tuple are needed here.
    m, n = mutation_data[:2]

    # Convert the mutation data to integers for easy C parsing
    _, index2gene, gene2case_index = convert_mutation_data(mutation_data)

    # Run the MCMC algorithm; the last argument is 1 unless --quiet was given
    print("* Running MCMC algorithm...")
    solns = mcmc(k, t, m, n, gene2case_index, N, s, int(not args.quiet))

    # Collate the results and sort them descending by sampling frequency
    solns_w_weights = convert_solns(index2gene, solns)
    results = collate_results(solns_w_weights)

    # Flatten the results into [collection, freq, W', Ws] records
    print("* Flattening the results...")
    flat_results = [[S, data["freq"], data["W_prime"], data["Ws"]]
                    for S, data in results.items()]
    flat_results.sort(key=lambda record: record[1], reverse=True)

    # Perform the permutation test (if necessary)
    permute_pval = None
    if with_pvals:
        permute_pval = run_matrix_permutation_test(
            npermutations, args.permuted_matrices_dir, mutation_data,
            k, t, N, s, flat_results, args.parallel)

    # Output results
    print("* Outputting results...")
    if with_pvals:
        output = ["#Sampling Frequency\tWeight W'\tP-value ({} permutations)"
                  "\tGene set\tWeight W".format(npermutations)]
    else:
        output = ["#Sampling Frequency\tWeight W'\tGene set\tWeight W"]

    for S, freq, W_prime, Ws in flat_results:
        for i, M in enumerate(S):
            is_first = i == 0
            # Collection-level values appear only on a collection's first
            # line; later lines get blank cells so columns stay aligned
            # with the header whether or not a P-value column exists.
            row = [freq if is_first else "", W_prime if is_first else ""]
            if with_pvals:
                row.append(permute_pval[S] if is_first else "")
            row.append(", ".join(sorted(M)))
            row.append(Ws[i])
            output.append("\t".join(map(str, row)))

    # Context manager guarantees the file is flushed and closed.
    with open(args.output_file, "w") as out_file:
        out_file.write("\n".join(output))