Exemplo n.º 1
0
def run(args):
    """Run the exhaustive CoMEt search and write the ranked gene sets as TSV.

    Reads ``mutation_matrix``, ``gene_file``, ``patient_file``, ``min_freq``,
    ``gene_set_size``, ``weight_func``, ``verbose`` and ``output_prefix``
    from ``args``.

    The results are written to
    ``<output_prefix>-k<gene_set_size>-<weight_func>-exhaustive.tsv`` with one
    row per gene set: the comma-separated genes, the value from ``probs`` and
    the weight.

    Returns:
        A list of ``(genes, prob, weight)`` tuples ordered by decreasing
        weight, where ``genes`` is the sorted list of genes in the set.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    wf = args.weight_func
    # NOTE(review): presumably chosen above 1 so that no gene set is dropped
    # by the p-value cut-off inside C.exhaustive -- confirm against the C code.
    pvalThresh = 1.1

    # Load the mutation data; the first two entries are the gene and patient
    # counts, the whole tuple is forwarded to the C-format converter.
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Only three of the five converted structures are needed here.
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # Position i of `genes` holds the gene whose index is i.
    genes = sorted(geneToIndex, key=geneToIndex.get)

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Release the factorial table even when the search raises.
        C.free_factorials()

    # Rank by weight, decreasing, and translate gene indices back to genes.
    solns, weights, tables, probs = results
    ranked = sorted(zip(solns, weights, tables, probs),
                    key=lambda entry: entry[1], reverse=True)
    summary = [(sorted(genes[g] for g in geneset), p, w)
               for geneset, w, _, p in ranked]

    # The header names exactly the three columns each row contains (the
    # previous header also announced a "Freq" column that was never written).
    lines = ["#Gene set\tP-value\tWeight"]
    lines.extend("\t".join([", ".join(s), str(p), str(w)])
                 for s, p, w in summary)
    outputFile = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(lines))

    return summary
Exemplo n.º 2
0
def run(args):
    """Run the exhaustive CoMEt search and write the ranked gene sets as TSV.

    Written to behave the same under Python 2 and Python 3: printing goes
    through a single pre-formatted string and the zipped results are sorted
    with ``sorted`` instead of relying on ``zip`` returning a list.

    Reads ``mutation_matrix``, ``gene_file``, ``patient_file``, ``min_freq``,
    ``gene_set_size``, ``weight_func``, ``verbose`` and ``output_prefix``
    from ``args`` and writes
    ``<output_prefix>-k<gene_set_size>-<weight_func>-exhaustive.tsv``.

    Returns:
        A list of ``(genes, prob, weight)`` tuples ordered by decreasing
        weight, where ``genes`` is the sorted list of genes in the set.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    wf = args.weight_func
    # NOTE(review): presumably chosen above 1 so that no gene set is dropped
    # by the p-value cut-off inside C.exhaustive -- confirm against the C code.
    pvalThresh = 1.1

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        # One formatted string in parentheses prints identically on 2 and 3.
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the CoMEt run; two of the five converted structures are unused.
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, _, geneToNumCases, geneToIndex, _ = cMutations
    # Position i of `genes` holds the gene whose index is i.
    genes = sorted(geneToIndex, key=geneToIndex.get)

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Release the factorial table even when the search raises.
        C.free_factorials()

    # Parse the output: sort by weight, decreasing.
    solns, weights, tables, probs = results
    ranked = sorted(zip(solns, weights, tables, probs),
                    key=lambda entry: entry[1], reverse=True)
    summary = [(sorted(genes[g] for g in geneset), p, w)
               for geneset, w, _, p in ranked]

    # The header names exactly the three columns each row contains (the
    # previous header also announced a "Freq" column that was never written).
    lines = ["#Gene set\tP-value\tWeight"]
    lines.extend("\t".join([", ".join(s), str(p), str(w)])
                 for s, p, w in summary)
    outputFile = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(lines))

    return summary
Exemplo n.º 3
0
def run(args):
    """Run CoMEt, write the summary TSV and return the ranked collections.

    With ``args.num_initial > 1`` one ``comet`` run is started per initial
    solution; the runs are then extended (iteration budget multiplied by 1.5
    each round) until ``C.discrete_convergence`` drops below
    ``args.total_distance_cutoff`` or the next budget would exceed
    ``args.n_stop``. Otherwise a single ``comet`` run with an empty initial
    solution is performed.

    The print calls use one pre-formatted string each so the output is the
    same under Python 2 and Python 3.

    Returns:
        A list of ``(S, freq, total_weight)`` tuples, one per key ``S`` of the
        results, ordered by decreasing ``total_weight``.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    ks = args.gene_set_sizes  # size of each pathway
    t = len(ks)  # number of pathways
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of the iteration budget for non-converged runs

    # Load the mutation data. Only m and n are used directly below; the full
    # unpack is kept because it documents (and checks) the tuple layout.
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n, genes, patients, geneToCases, patientToGenes = mutations

    if args.subtype:
        with open(args.subtype) as f:
            subSet = [line.rstrip() for line in f]
    else:
        subSet = []

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Precompute factorials; everything that needs them runs inside the try
    # so the table is released even if a run raises.
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C
        if args.precomputed_scores:
            load_precomputed_scores(args.precomputed_scores, mutations,
                                    subSet)

        if args.num_initial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet)
            runN = N
            while True:
                lastSolns = []
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(mutations, n, t, ks, runN, s,
                                                 init, acc, subSet, nt,
                                                 hybridCutoff, args.exact_cut,
                                                 True)
                    # Floor division keeps the integer result the Python 2
                    # "/" produced for an integer ru_maxrss.
                    maxRss = resource.getrusage(
                        resource.RUSAGE_SELF).ru_maxrss // 1000
                    print("Mem usage:  %s" % (maxRss,))
                    merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print("%s %s" % (finalTv, N))

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Continue every run from where it stopped, for only the
                # additional iterations.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = merge_results(totalOut)
            # store and output parameters into .json
            printParameters(args, ks, finalTv)
        else:
            results, _ = comet(mutations, n, t, ks, N, s, [], acc, subSet, nt,
                               hybridCutoff, args.exact_cut, True)
            runNum = 1
            printParameters(args, ks, 1)
    finally:
        C.free_factorials()

    # Output CoMEt results to TSV, heaviest collection first
    collections = sorted(results,
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    header = "#Freq\tTotal Weight\tTarget Weight\t"
    header += "\t".join(
        "Gene set %s (k=%s)\tProb %s\tWeight function %s" % (i, size, i, i)
        for i, size in enumerate(ks, 1))
    tbl = [header]
    for S in collections:
        data = results[S]
        row = [
            data["freq"], data["total_weight"],
            format(data["target_weight"], 'g')
        ]
        # Three columns per gene set, sets ordered by their "W" value.
        for d in sorted(data["sets"], key=lambda d: d["W"]):
            row += [", ".join(sorted(d["genes"])), d["prob"], d["num_tbls"]]
        tbl.append("\t".join(map(str, row)))

    outputFile = "%s.tsv" % iter_num(args.output_prefix + '.sum', N * runNum,
                                     ks, args.accelerator)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(tbl))

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
Exemplo n.º 4
0
def run(args):
    """Run CoMEt, write the summary TSV and return the ranked collections.

    With ``args.num_initial > 1`` one ``comet`` run is started per initial
    solution; the runs are then extended (iteration budget multiplied by 1.5
    each round) until ``C.discrete_convergence`` drops below
    ``args.total_distance_cutoff`` or the next budget would exceed
    ``args.n_stop``. Otherwise a single ``comet`` run with an empty initial
    solution is performed.

    In the TSV the third column of every gene set is the letter that
    ``num_tbls`` maps to (0 and 1 -> 'E', 2 -> 'B', 3 -> 'P'); any other code
    raises ``KeyError``.

    The print calls use one pre-formatted string each so the output is the
    same under Python 2 and Python 3.

    Returns:
        A list of ``(S, freq, total_weight)`` tuples, one per key ``S`` of the
        results, ordered by decreasing ``total_weight``.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    ks = args.gene_set_sizes  # size of each pathway
    t = len(ks)  # number of pathways
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of the iteration budget for non-converged runs

    # Load the mutation data. Only m and n are used directly below; the full
    # unpack is kept because it documents (and checks) the tuple layout.
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n, genes, patients, geneToCases, patientToGenes = mutations

    if args.subtype:
        with open(args.subtype) as f:
            subSet = [line.rstrip() for line in f]
    else:
        subSet = []

    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    # Precompute factorials; everything that needs them runs inside the try
    # so the table is released even if a run raises.
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C
        if args.precomputed_scores:
            load_precomputed_scores(args.precomputed_scores, mutations,
                                    subSet)

        if args.num_initial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet)
            runN = N
            while True:
                lastSolns = []
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(mutations, n, t, ks, runN, s,
                                                 init, acc, subSet, nt,
                                                 hybridCutoff, args.exact_cut,
                                                 True)
                    # Floor division keeps the integer result the Python 2
                    # "/" produced for an integer ru_maxrss.
                    maxRss = resource.getrusage(
                        resource.RUSAGE_SELF).ru_maxrss // 1000
                    print("Mem usage:  %s" % (maxRss,))
                    merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print("%s %s" % (finalTv, N))

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Continue every run from where it stopped, for only the
                # additional iterations.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = merge_results(totalOut)
            # store and output parameters into .json
            printParameters(args, ks, finalTv)
        else:
            results, _ = comet(mutations, n, t, ks, N, s, [], acc, subSet, nt,
                               hybridCutoff, args.exact_cut, True)
            runNum = 1
            printParameters(args, ks, 1)
    finally:
        C.free_factorials()

    # Output CoMEt results to TSV, heaviest collection first
    collections = sorted(results,
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    # NOTE(review): the meaning of the letters is not visible in this file --
    # presumably exact / binomial / permutation; confirm with the C code.
    weight_func_mapping = {0: 'E', 1: 'E', 2: 'B', 3: 'P'}
    header = "#Freq\tTotal Weight\tTarget Weight\t"
    header += "\t".join(
        "Gene set %s (k=%s)\tPhi %s\tWeight function %s" % (i, size, i, i)
        for i, size in enumerate(ks, 1))
    tbl = [header]
    for S in collections:
        data = results[S]
        row = [
            data["freq"], data["total_weight"],
            format(data["target_weight"], 'g')
        ]
        # Three columns per gene set, sets ordered by their "W" value.
        for d in sorted(data["sets"], key=lambda d: d["W"]):
            row += [
                ", ".join(sorted(d["genes"])), d["prob"],
                weight_func_mapping[d["num_tbls"]]
            ]
        tbl.append("\t".join(map(str, row)))

    outputFile = "%s.tsv" % iter_num(args.output_prefix + '.sum', N * runNum,
                                     ks, args.accelerator)
    with open(outputFile, "w") as outfile:
        outfile.write("\n".join(tbl))

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
Exemplo n.º 5
0
def run(args):
    """Run CoMEt on the given mutation data and emit the results.

    The subtype values returned by ``C.load_mutation_data`` (plus the lines
    of ``args.core_events``, when given) are passed to every run as
    ``subSet``.

    With ``args.num_initial > 1`` one ``comet`` run is started per initial
    solution; the runs are then extended (iteration budget multiplied by 1.5
    each round) until ``C.discrete_convergence`` drops below
    ``args.total_distance_cutoff`` or the next budget would exceed
    ``args.n_stop``. Otherwise a single ``comet`` run with an empty initial
    solution is performed. Output is delegated to ``C.output_comet``.

    Returns:
        A list of ``(S, freq, total_weight)`` tuples, one per key ``S`` of the
        results, ordered by decreasing ``total_weight``.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    subtypeFile = args.subtype
    ks = args.gene_set_sizes  # size of each pathway
    t = len(ks)  # number of pathways
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of the iteration budget for non-converged runs

    # Load the mutation data, then drop the trailing subtypes entry so that
    # `mutations` is the six-element tuple handed to the helpers below.
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = mutations
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    ###########################################################################
    if args.verbose:
        print(f'Mutation data: {m} genes x {n} patients')

    if args.core_events:
        with open(args.core_events) as f:
            subSet = list(subtypes.union({line.rstrip() for line in f}))
    else:
        subSet = list(subtypes)

    # Precompute factorials; everything that needs them runs inside the try
    # so the table is released even if a run raises.
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C
        if args.precomputed_scores:
            C.load_precomputed_scores(args.precomputed_scores, mutations,
                                      subSet)

        if args.num_initial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = C.initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet,
                importMultidendrix, multi_dendrix)
            runN = N
            while True:
                lastSolns = []
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, args.verbose)
                    C.merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print(finalTv, N)

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Continue every run from where it stopped, for only the
                # additional iterations.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = C.merge_results(totalOut)
        else:
            results, _ = comet(mutations, n, t, ks, N, s, [], acc, subSet, nt,
                               hybridCutoff, args.exact_cut, args.verbose)
            runNum = 1
    finally:
        C.free_factorials()

    # Output comet results to TSV and website, heaviest collection first
    collections = sorted(results,
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N * runNum, 0,
                   0)

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]
Exemplo n.º 6
0
def run(args):
    """Run CoMEt on the given mutation data and emit the results.

    The subtype values returned by ``C.load_mutation_data`` (plus the lines
    of ``args.core_events``, when given) are passed to every run as
    ``subSet``.

    With ``args.num_initial > 1`` one ``comet`` run is started per initial
    solution; the runs are then extended (iteration budget multiplied by 1.5
    each round) until ``C.discrete_convergence`` drops below
    ``args.total_distance_cutoff`` or the next budget would exceed
    ``args.n_stop``. Otherwise a single ``comet`` run with an empty initial
    solution is performed. Output is delegated to ``C.output_comet``.

    Returns:
        A list of ``(S, freq, total_weight)`` tuples, one per key ``S`` of the
        results, ordered by decreasing ``total_weight``.
    """
    ###########################################################################
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    subtypeFile = args.subtype
    ks = args.gene_set_sizes  # size of each pathway
    t = len(ks)  # number of pathways
    N = args.num_iterations  # number of iterations
    s = args.step_length  # step
    NStop = args.n_stop
    acc = args.accelerator
    nt = args.nt
    hybridCutoff = args.binom_cut
    NInc = 1.5  # growth factor of the iteration budget for non-converged runs

    # Load the mutation data, then drop the trailing subtypes entry so that
    # `mutations` is the six-element tuple handed to the helpers below.
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq, subtypeFile)
    m, n, genes, patients, geneToCases, patientToGenes, subtypes = mutations
    mutations = (m, n, genes, patients, geneToCases, patientToGenes)

    ###########################################################################
    if args.verbose:
        print('Mutation data: %s genes x %s patients' % (m, n))

    if args.core_events:
        with open(args.core_events) as f:
            subSet = list(subtypes.union({line.rstrip() for line in f}))
    else:
        subSet = list(subtypes)

    # Precompute factorials; everything that needs them runs inside the try
    # so the table is released even if a run raises.
    C.precompute_factorials(max(m, n))
    try:
        C.set_random_seed(args.seed)

        # Store the scores of pre-computed collections into C
        if args.precomputed_scores:
            C.load_precomputed_scores(args.precomputed_scores, mutations,
                                      subSet)

        if args.num_initial > 1:
            # Collect initial solutions from users, multidendrix and random.
            initialSolns, totalOut = C.initial_solns_generator(
                args.num_initial, mutations, ks, args.initial_soln, subSet,
                importMultidendrix, multi_dendrix)
            runN = N
            while True:
                lastSolns = []
                for i, init in enumerate(initialSolns):
                    outresults, lastSoln = comet(
                        mutations, n, t, ks, runN, s, init, acc, subSet, nt,
                        hybridCutoff, args.exact_cut, args.verbose)
                    C.merge_runs(totalOut[i], outresults)
                    lastSolns.append(lastSoln)

                finalTv = C.discrete_convergence(totalOut, int(N / s))
                print(finalTv, N)

                newN = int(N * NInc)
                if newN > NStop or finalTv < args.total_distance_cutoff:
                    break
                # Continue every run from where it stopped, for only the
                # additional iterations.
                runN = newN - N
                N = newN
                initialSolns = lastSolns

            runNum = len(totalOut)
            results = C.merge_results(totalOut)
        else:
            results, _ = comet(mutations, n, t, ks, N, s, [], acc, subSet, nt,
                               hybridCutoff, args.exact_cut, args.verbose)
            runNum = 1
    finally:
        C.free_factorials()

    # Output comet results to TSV and website, heaviest collection first
    collections = sorted(results,
                         key=lambda S: results[S]["total_weight"],
                         reverse=True)
    C.output_comet(args, mutations, results, collections, ks, N * runNum, 0,
                   0)

    return [(S, results[S]["freq"], results[S]["total_weight"])
            for S in collections]