def batch_multi_dendrix(args):
    """Run Multi-Dendrix once per (t, k_max) parameter pair.

    The mutation data is loaded a single time and shared by every run.
    ``t`` ranges over ``args.min_num_gene_sets..args.max_num_gene_sets`` and
    ``k_max`` over ``args.min_gene_set_size..args.max_gene_set_size`` (both
    inclusive); each run is given ``args.min_gene_set_size`` and ``k_max``
    as its gene set size arguments.

    **Returns:**
      A tuple containing the following:
        * **collections** (*dictionary*) - mapping of t -> k_max -> the
          transposed (``zip(*...)``) output of ``Multi.ILP``
        * **mutation_data** (*tuple*) - mutation data tuple (see :func:`multi_dendrix.multi_dendrix` for details).
        * **runtime** (*float*) - seconds elapsed from just before the data
          is loaded until all runs have finished
    """
    from time import time

    # The clock starts before loading, so the runtime includes data loading.
    start = time()

    gene2include, sample2include = Multi.white_and_blacklisting(
        args.patient_whitelist, args.patient_blacklist,
        args.gene_whitelist, args.gene_blacklist)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)

    # Only m and n are used below; unpacking all six fields also checks that
    # the loader returned a tuple of the expected length.
    m, n, _genes, _patients, _mut2patients, _patient2muts = mutation_data
    if args.verbose:
        print("* Mutation data: %s genes x %s patients" % (m, n))

    # Build the parameter grid and one (initially empty) result map per t.
    ts = range(args.min_num_gene_sets, args.max_num_gene_sets + 1)
    ks = range(args.min_gene_set_size, args.max_gene_set_size + 1)
    collections = dict((t, {}) for t in ts)

    for t in ts:
        for k_max in ks:
            collection_w_weights = Multi.ILP(
                mutation_data, t, args.min_gene_set_size, k_max,
                args.alpha, args.delta, args.lmbda, verbose=args.verbose)
            collections[t][k_max] = zip(*collection_w_weights)

    return collections, mutation_data, time() - start
# Example no. 2
# 0
def run():
    """
	This script performs the matrix permutation test on a collection of gene 
	sets. It gives an example of how one could use various functions from 
	different modules in order to evaluate the collection without obtaining
	the collection from the Multi-Dendrix pipeline. The program first loads
	the GBM(2008) mutation data, permutes it the given number of times, and
	then performs the matrix permutation test.

	The default parameters permute the GBM(2008) data *ten times*.
	On my machine, this script runs in 41.2 seconds.
	"""
    # Load required modules
    import sys, os
    sys.path.insert(1, os.path.abspath('../../'))
    import multi_dendrix.permute.mutation_data as P
    from multi_dendrix.evaluate.matrix import matrix_permutation_test

    # Default parameters for this tutorial (CHANGE AT YOUR OWN RISK)
    COLLECTION = [["CDKN2B", "RB1", "CDK4"], ["CDKN2A", "TP53", "DTX3"]]
    MUTATION_MATRIX = "../mutation_data/GBM_2008/GBM_2008.m2"
    GENE_BLACKLIST = "../mutation_data/fishy_genes.glst"
    CUTOFF = 2
    ALPHA = 1.0
    NUM_PERMUTATIONS = 10
    T, K_MIN, K_MAX, OVERLAPS, SETS_PER_GENE = 2, 3, 3, 0, 1

    # Load the mutation data
    import multi_dendrix as Multi
    gene2include, patient2include = Multi.white_and_blacklisting(
        None, None, None, GENE_BLACKLIST)

    mutation_data = Multi.load_mutation_data_w_cutoff(MUTATION_MATRIX,
                                                      patient2include,
                                                      gene2include, CUTOFF)
    m, n, genes, patients, mutation2patients, patient2mutations = mutation_data

    # Permute the mutation data
    G = P.construct_mutation_graph(mutation2patients, patient2mutations)
    Hs = [
        P.permute_mutation_data(G.copy(), genes, patients)
        for i in range(NUM_PERMUTATIONS)
    ]

    # Calculate the p-value and output the results
    W_prime = sum(
        [Multi.W(mutation2patients, module, ALPHA) for module in COLLECTION])
    print "W' of collection:", W_prime
    print 'P=', matrix_permutation_test(W_prime, Hs, T, K_MIN, K_MAX, ALPHA,
                                        OVERLAPS, SETS_PER_GENE)
def run():
	"""
	This script performs the matrix permutation test on a collection of gene 
	sets. It gives an example of how one could use various functions from 
	different modules in order to evaluate the collection without obtaining
	the collection from the Multi-Dendrix pipeline. The program first loads
	the GBM(2008) mutation data, permutes it the given number of times, and
	then performs the matrix permutation test.

	The default parameters permute the GBM(2008) data *ten times*.
	On my machine, this script runs in 41.2 seconds.
	"""
	# Load required modules
	import sys, os
	sys.path.insert(1, os.path.abspath('../../'))
	import multi_dendrix.permute.mutation_data as P
	from multi_dendrix.evaluate.matrix import matrix_permutation_test

	# Default parameters for this tutorial (CHANGE AT YOUR OWN RISK)
	COLLECTION = [["CDKN2B", "RB1", "CDK4"], ["CDKN2A", "TP53", "DTX3"]]
	MUTATION_MATRIX="../mutation_data/GBM_2008/GBM_2008.m2"
	GENE_BLACKLIST="../mutation_data/fishy_genes.glst"
	CUTOFF=2
	ALPHA=1.0
	NUM_PERMUTATIONS=10
	T, K_MIN, K_MAX, OVERLAPS, SETS_PER_GENE  = 2, 3, 3, 0, 1

	# Load the mutation data
	import multi_dendrix as Multi
	gene2include, patient2include = Multi.white_and_blacklisting(None, None,
		                            None, GENE_BLACKLIST)

	mutation_data = Multi.load_mutation_data_w_cutoff(MUTATION_MATRIX,
		patient2include, gene2include, CUTOFF)
	m, n, genes, patients, mutation2patients, patient2mutations = mutation_data

	# Permute the mutation data
	G  = P.construct_mutation_graph(mutation2patients, patient2mutations)
	Hs = [ P.permute_mutation_data(G.copy(), genes, patients)
	       for i in range(NUM_PERMUTATIONS) ]

	# Calculate the p-value and output the results
	W_prime = sum([ Multi.W(mutation2patients, module, ALPHA)
 		           for module in COLLECTION])
	print  "W' of collection:", W_prime
	print 'P=', matrix_permutation_test(W_prime, Hs, T, K_MIN, K_MAX, ALPHA,
		                                OVERLAPS, SETS_PER_GENE)
# Example no. 4
# 0
def batch_multi_dendrix(args):
    """Run Multi-Dendrix for every parameter setting on one data set.

    Mutation data is loaded once, then ``Multi.ILP`` is invoked for each
    number of gene sets ``t`` in
    ``[args.min_num_gene_sets, args.max_num_gene_sets]`` combined with each
    maximum gene set size ``k_max`` in
    ``[args.min_gene_set_size, args.max_gene_set_size]``.  The minimum gene
    set size passed to every run is ``args.min_gene_set_size``.

    **Returns:**
      A tuple containing the following:
        * **collections** (*dictionary*) - mapping of t -> k_max ->
          ``zip(*output)`` of the corresponding ``Multi.ILP`` call
        * **mutation_data** (*tuple*) - mutation data tuple (see :func:`multi_dendrix.multi_dendrix` for details).
        * **runtime** (*float*) - total elapsed time in seconds, measured
          from before the mutation data is loaded
    """
    from time import time

    start = time()

    # Load mutation data used in each run.
    whitelists_and_blacklists = (args.patient_whitelist,
                                 args.patient_blacklist,
                                 args.gene_whitelist,
                                 args.gene_blacklist)
    gene2include, sample2include = Multi.white_and_blacklisting(
        *whitelists_and_blacklists)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)

    # The full six-way unpack doubles as a shape check on the loaded tuple;
    # only the two dimensions are reported.
    num_genes, num_patients, _, _, _, _ = mutation_data
    if args.verbose:
        print("* Mutation data: %s genes x %s patients"
              % (num_genes, num_patients))

    # Enumerate the parameter grid.
    ts = range(args.min_num_gene_sets, args.max_num_gene_sets + 1)
    ks = range(args.min_gene_set_size, args.max_gene_set_size + 1)

    collections = {}
    for t in ts:
        collections[t] = {}

    for t in ts:
        per_t = collections[t]
        for k_max in ks:
            collection_w_weights = Multi.ILP(
                mutation_data, t, args.min_gene_set_size, k_max,
                args.alpha, args.delta, args.lmbda,
                verbose=args.verbose)
            per_t[k_max] = zip(*collection_w_weights)

    elapsed = time() - start
    return collections, mutation_data, elapsed
def run(args):
    """Run the MCMC sampler and write the collated results to a file.

    Steps performed:
      1. Load the mutation data, applying the sample/gene white- and
         blacklists and ``args.cutoff``.
      2. Convert the data to integer indices and run ``mcmc``.
      3. Collate the sampled collections and sort them by sampling
         frequency, highest first.
      4. If ``args.num_permutations > 0``, run the matrix permutation test.
      5. Write a tab-separated table to ``args.output_file``.

    **Args:**
      * **args** - parsed arguments.  Attributes read here:
        ``gene_set_size``, ``num_gene_sets``, ``num_iterations``,
        ``step_length``, ``num_permutations``, ``sample_whitelist``,
        ``sample_blacklist``, ``gene_whitelist``, ``gene_blacklist``,
        ``mutation_matrix``, ``cutoff``, ``quiet``,
        ``permuted_matrices_dir``, ``parallel`` and ``output_file``.

    **Output format:**
      One row per gene set.  The first gene set of each collection carries
      the collection's sampling frequency, weight W' and (when permutations
      were requested) p-value; the remaining gene sets of the collection
      leave those cells empty.

    **Returns:**
      None.
    """
    # Parse args into shorter variable handles
    k, t = args.gene_set_size, args.num_gene_sets
    N, s = args.num_iterations, args.step_length
    npermutations = args.num_permutations
    with_pvalues = npermutations > 0

    # Load mutation data
    print("* Loading mutation data...")
    gene2include, sample2include = Multi.white_and_blacklisting(
        args.sample_whitelist, args.sample_blacklist,
        args.gene_whitelist, args.gene_blacklist)
    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff)
    # Only the dimensions are needed here; the remaining fields are consumed
    # through ``mutation_data`` itself.
    m, n, _genes, _patients, _gene2cases, _sample2genes = mutation_data

    # Convert the mutation data to integers for easy C parsing
    _gene2index, index2gene, gene2case_index = convert_mutation_data(
        mutation_data)

    # Run the MCMC algorithm
    print("* Running MCMC algorithm...")
    solns = mcmc(k, t, m, n, gene2case_index, N, s, int(not args.quiet))

    # Collate the results
    solns_w_weights = convert_solns(index2gene, solns)
    results = collate_results(solns_w_weights)

    # Flatten the results and sort them descending by sampling frequency
    # (index 1 of each flattened entry).
    print("* Flattening the results...")
    flat_results = [[S, data["freq"], data["W_prime"], data["Ws"]]
                    for S, data in results.items()]
    flat_results.sort(key=lambda result: result[1], reverse=True)

    # Perform the permutation test (if necessary)
    permute_pval = None
    if with_pvalues:
        permute_pval = run_matrix_permutation_test(
            npermutations, args.permuted_matrices_dir, mutation_data,
            k, t, N, s, flat_results, args.parallel)

    # Output results
    print("* Outputting results...")
    if with_pvalues:
        output = ["#Sampling Frequency\tWeight W'\tP-value ({} permutations)"
                  "\tGene set\tWeight W".format(npermutations)]
        num_leading_columns = 3  # frequency, W', p-value
    else:
        output = ["#Sampling Frequency\tWeight W'\tGene set\tWeight W"]
        num_leading_columns = 2  # frequency, W'

    for S, freq, W_prime, Ws in flat_results:
        for i, M in enumerate(S):
            if i == 0:
                row = [freq, W_prime]
                if with_pvalues:
                    row.append(permute_pval[S])
            else:
                # Pad with exactly as many blanks as there are leading
                # columns so the gene set stays under the "Gene set" header
                # whether or not a p-value column is present.
                row = [""] * num_leading_columns
            row.extend([", ".join(sorted(M)), Ws[i]])
            output.append("\t".join(map(str, row)))

    # Close the file deterministically instead of relying on garbage
    # collection to flush it.
    with open(args.output_file, "w") as out:
        out.write("\n".join(output))
def run(args):
    """Sample collections of gene sets with MCMC and write a results table.

    The function loads the mutation data (after white/blacklisting, with
    ``args.cutoff`` passed to the loader), converts it to integer indices,
    runs ``mcmc``, collates the sampled collections, optionally runs the
    matrix permutation test (when ``args.num_permutations > 0``), and writes
    a tab-separated table to ``args.output_file``.

    **Args:**
      * **args** - parsed arguments.  Attributes read here:
        ``gene_set_size``, ``num_gene_sets``, ``num_iterations``,
        ``step_length``, ``num_permutations``, ``sample_whitelist``,
        ``sample_blacklist``, ``gene_whitelist``, ``gene_blacklist``,
        ``mutation_matrix``, ``cutoff``, ``quiet``,
        ``permuted_matrices_dir``, ``parallel`` and ``output_file``.

    **Output format:**
      Collections are listed in descending order of sampling frequency, one
      row per gene set.  Only the first row of a collection shows the
      frequency, weight W' and (if computed) p-value; later rows of the same
      collection leave those cells blank.

    **Returns:**
      None.
    """
    # Parse args into shorter variable handles
    k, t = args.gene_set_size, args.num_gene_sets
    N, s = args.num_iterations, args.step_length
    npermutations = args.num_permutations
    do_permutation_test = npermutations > 0

    # Load mutation data
    print("* Loading mutation data...")
    include = Multi.white_and_blacklisting(
        args.sample_whitelist, args.sample_blacklist,
        args.gene_whitelist, args.gene_blacklist
    )
    gene2include, sample2include = include

    mutation_data = Multi.load_mutation_data_w_cutoff(
        args.mutation_matrix, sample2include, gene2include, args.cutoff
    )
    # Only m and n are used directly; the other fields travel inside
    # ``mutation_data``.
    m, n, _genes, _patients, _gene2cases, _sample2genes = mutation_data

    # Convert the mutation data to integers for easy C parsing
    _gene2index, index2gene, gene2case_index = convert_mutation_data(
        mutation_data
    )

    # Run the MCMC algorithm
    print("* Running MCMC algorithm...")
    solns = mcmc(k, t, m, n, gene2case_index, N, s, int(not args.quiet))

    # Collate the results
    solns_w_weights = convert_solns(index2gene, solns)
    results = collate_results(solns_w_weights)

    # Flatten the results and sort them descending by sampling frequency,
    # which is stored at index 1 of every flattened entry.
    print("* Flattening the results...")
    flat_results = [
        [S, data["freq"], data["W_prime"], data["Ws"]]
        for S, data in results.items()
    ]
    flat_results.sort(key=lambda entry: entry[1], reverse=True)

    # Perform the permutation test (if necessary)
    permute_pval = None
    if do_permutation_test:
        permute_pval = run_matrix_permutation_test(
            npermutations, args.permuted_matrices_dir, mutation_data,
            k, t, N, s, flat_results, args.parallel
        )

    # Output results
    print("* Outputting results...")
    if do_permutation_test:
        output = [
            "#Sampling Frequency\tWeight W'\tP-value ({} permutations)"
            "\tGene set\tWeight W".format(npermutations)
        ]
        blank_prefix = ["", "", ""]  # frequency, W', p-value
    else:
        output = ["#Sampling Frequency\tWeight W'\tGene set\tWeight W"]
        blank_prefix = ["", ""]  # frequency, W'

    for S, freq, W_prime, Ws in flat_results:
        for i, M in enumerate(S):
            if i == 0:
                prefix = [freq, W_prime]
                if do_permutation_test:
                    prefix.append(permute_pval[S])
            else:
                # The number of blanks must equal the number of leading
                # columns in the header; a fixed three blanks would shift
                # the gene set one column right when no p-value is printed.
                prefix = blank_prefix
            row = prefix + [", ".join(sorted(M)), Ws[i]]
            output.append("\t".join(map(str, row)))

    # Use a context manager so the file is flushed and closed on exit.
    with open(args.output_file, "w") as out:
        out.write("\n".join(output))