Пример #1
0
def comet(mutations, n, t, ks, numIters, stepLen, initialSoln, amp, subt, nt,
          hybridPvalThreshold, pvalThresh, verbose):
    """Run the sampler in the C extension and collate the sampled collections.

    Args:
        mutations: Tuple of mutation data. It is unpacked into
            ``C.convert_mutations_to_C_format`` (with ``subt`` appended when
            ``subt`` is non-empty); its first two items are also passed
            directly to ``C.comet``.
        n: Not used by this function; kept so existing callers keep working.
        t, ks, numIters, stepLen, nt, hybridPvalThreshold, pvalThresh,
        verbose: Forwarded unchanged to ``C.comet``.
        initialSoln: Iterable of gene names; each is translated to its index
            through the gene-to-index mapping before being passed on.
        amp: Forwarded to ``C.comet`` and also used as the root taken when
            turning a set weight back into a probability.
        subt: Optional collection of subtypes. ``None`` or an empty
            collection means "no subtypes" (a count of 0 is passed on).

    Returns:
        A ``(results, lastSoln)`` pair. ``results`` maps a canonical string
        key for each distinct collection to a dict with ``freq`` (number of
        times it was sampled), ``sets`` (one dict per gene set with ``genes``,
        ``W``, ``num_tbls`` and ``prob``), ``total_weight`` and
        ``target_weight``. ``lastSoln`` is the flat list of genes in the last
        sampled collection.

    Raises:
        KeyError: If a gene in ``initialSoln`` is not in the gene index.
        IndexError: If the sampler returned no solutions.
    """
    # Convert mutation data to C-ready format
    if subt:
        mutations = mutations + (subt, )
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, iGeneToCases, geneToNumCases, geneToIndex, indexToGene = cMutations
    initialSolnIndex = [geneToIndex[g] for g in initialSoln]

    # ``subt`` may legitimately be None/empty (see the guard above), so do
    # not call len() on it unconditionally.
    numSubtypes = len(subt) if subt else 0
    solns = C.comet(t, mutations[0], mutations[1], iPatientToGenes,
                    geneToNumCases, ks, numIters,
                    stepLen, amp, nt, hybridPvalThreshold, initialSolnIndex,
                    numSubtypes, pvalThresh, verbose)

    # Group the sampled collections by gene content and count how often each
    # one was sampled (no sorting is done here).
    solnsWithWeights = C.convert_solns(indexToGene, solns)

    def collection_key(collection):
        # Order-independent key: genes sorted within a set, sets sorted
        # within the collection.
        return " ".join(sorted([",".join(sorted(M)) for M in collection]))

    # Store the last solution of the sampling so more iterations can resume
    # from it.
    lastSoln = [g for gset in solnsWithWeights[-1][0] for g in gset]

    results = dict()
    for collection, Ws, Cs in solnsWithWeights:
        key = collection_key(collection)
        if key in results:
            results[key]["freq"] += 1
            continue

        # Extract the probability from the weight, which can also include
        # the accelerator (``amp``).
        sets = [dict(genes=M, W=W, num_tbls=F, prob=pow(exp(-W), 1. / amp))
                for M, W, F in zip(collection, Ws, Cs)]

        totalWeight = sum(S["W"] for S in sets)
        # exp() overflows a double a little above 709, so saturate to
        # infinity beyond the 700 cut-off.
        targetWeight = exp(totalWeight) if totalWeight < 700 else float("inf")

        results[key] = dict(freq=1,
                            sets=sets,
                            total_weight=totalWeight,
                            target_weight=targetWeight)

    return results, lastSoln
Пример #2
0
def run(args):
    """Run the exhaustive search and write the scored gene sets to a TSV.

    Args:
        args: Object exposing ``mutation_matrix``, ``gene_file``,
            ``patient_file``, ``min_freq``, ``gene_set_size``,
            ``weight_func``, ``verbose`` and ``output_prefix`` attributes.

    Returns:
        A list of ``(gene_set, prob, weight)`` tuples ordered by decreasing
        weight, where ``gene_set`` is a sorted list of gene names. The same
        rows are written to
        ``<output_prefix>-k<gene_set_size>-<weight_func>-exhaustive.tsv``.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    pvalThresh = 1.1
    wf = args.weight_func

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile,
                                     minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the run: genes[i] is the name of the gene with index i
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, iGeneToCases, geneToNumCases, geneToIndex, indexToGene = cMutations
    genes = sorted(geneToIndex, key=geneToIndex.__getitem__)

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases,
                               pvalThresh)
    finally:
        # Always pair precompute_factorials() with free_factorials(), even
        # when the search raises (presumably this releases native memory).
        C.free_factorials()

    # Parse the output, sorting by decreasing weight
    solns, weights, tables, probs = results
    res = sorted(zip(solns, weights, tables, probs),
                 key=lambda row: row[1], reverse=True)
    rows = [(sorted(genes[g] for g in geneset), p, w)
            for geneset, w, _, p in res]

    # Output only sets, probs, and weights as TSV. The header names exactly
    # the three columns that each row contains.
    outPath = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outPath, "w") as outfile:
        output = ["#Gene set\tP-value\tWeight"]
        output.extend("\t".join([", ".join(s), str(p), str(w)])
                      for s, p, w in rows)
        outfile.write("\n".join(output))

    return rows
Пример #3
0
def comet(mutations, n, t, ks, numIters, stepLen, initialSoln,
          amp, subt, nt, hybridPvalThreshold, pvalThresh, verbose):
    """Run the sampler in the C extension and collate the sampled collections.

    Args:
        mutations: Tuple of mutation data. It is unpacked into
            ``C.convert_mutations_to_C_format`` (with ``subt`` appended when
            ``subt`` is non-empty); its first two items are also passed
            directly to ``C.comet``.
        n: Not used by this function; kept so existing callers keep working.
        t, ks, numIters, stepLen, nt, hybridPvalThreshold, pvalThresh,
        verbose: Forwarded unchanged to ``C.comet``.
        initialSoln: Iterable of gene names; each is translated to its index
            through the gene-to-index mapping before being passed on.
        amp: Forwarded to ``C.comet`` and also used as the root taken when
            turning a set weight back into a probability.
        subt: Optional collection of subtypes. ``None`` or an empty
            collection means "no subtypes" (a count of 0 is passed on).

    Returns:
        A ``(results, lastSoln)`` pair. ``results`` maps a canonical string
        key for each distinct collection to a dict with ``freq`` (number of
        times it was sampled), ``sets`` (one dict per gene set with ``genes``,
        ``W``, ``num_tbls`` and ``prob``), ``total_weight`` and
        ``target_weight``. ``lastSoln`` is the flat list of genes in the last
        sampled collection.

    Raises:
        KeyError: If a gene in ``initialSoln`` is not in the gene index.
        IndexError: If the sampler returned no solutions.
    """
    # Convert mutation data to C-ready format
    if subt:
        mutations = mutations + (subt, )
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, iGeneToCases, geneToNumCases, geneToIndex, indexToGene = cMutations
    initialSolnIndex = [geneToIndex[g] for g in initialSoln]

    # ``subt`` may legitimately be None/empty (see the guard above), so do
    # not call len() on it unconditionally.
    numSubtypes = len(subt) if subt else 0
    solns = C.comet(t, mutations[0], mutations[1], iPatientToGenes, geneToNumCases,
                    ks, numIters, stepLen, amp, nt, hybridPvalThreshold,
                    initialSolnIndex, numSubtypes, pvalThresh, verbose)

    # Group the sampled collections by gene content and count how often each
    # one was sampled (no sorting is done here).
    solnsWithWeights = convert_solns(indexToGene, solns)

    def collection_key(collection):
        # Order-independent key: genes sorted within a set, sets sorted
        # within the collection.
        return " ".join(sorted([",".join(sorted(M)) for M in collection]))

    # Store the last solution of the sampling so more iterations can resume
    # from it.
    lastSoln = [g for gset in solnsWithWeights[-1][0] for g in gset]

    results = dict()
    for collection, Ws, Cs in solnsWithWeights:
        key = collection_key(collection)
        if key in results:
            results[key]["freq"] += 1
            continue

        # Extract the probability from the weight, which can also include
        # the accelerator (``amp``).
        sets = [dict(genes=M, W=W, num_tbls=F, prob=pow(exp(-W), 1. / amp))
                for M, W, F in zip(collection, Ws, Cs)]

        totalWeight = sum(S["W"] for S in sets)
        # exp() overflows a double a little above 709, so saturate to
        # infinity beyond the 700 cut-off.
        targetWeight = exp(totalWeight) if totalWeight < 700 else float("inf")

        results[key] = dict(freq=1, sets=sets, total_weight=totalWeight,
                            target_weight=targetWeight)

    return results, lastSoln
Пример #4
0
def run(args):
    """Run the exhaustive search and write the scored gene sets to a TSV.

    Written so that it behaves the same under Python 2 and Python 3.

    Args:
        args: Object exposing ``mutation_matrix``, ``gene_file``,
            ``patient_file``, ``min_freq``, ``gene_set_size``,
            ``weight_func``, ``verbose`` and ``output_prefix`` attributes.

    Returns:
        A list of ``(gene_set, prob, weight)`` tuples ordered by decreasing
        weight, where ``gene_set`` is a sorted list of gene names. The same
        rows are written to
        ``<output_prefix>-k<gene_set_size>-<weight_func>-exhaustive.tsv``.
    """
    # Parse the arguments into shorter variable handles
    mutationMatrix = args.mutation_matrix
    geneFile = args.gene_file
    patientFile = args.patient_file
    minFreq = args.min_freq
    k = args.gene_set_size
    pvalThresh = 1.1
    wf = args.weight_func

    # Load the mutation data
    mutations = C.load_mutation_data(mutationMatrix, patientFile, geneFile, minFreq)
    m, n = mutations[0], mutations[1]
    if args.verbose:
        # Single parenthesised argument: prints the same text whether
        # ``print`` is a statement (Python 2) or a function (Python 3).
        print('- Mutation data: %s genes x %s patients' % (m, n))

    # Set up the run: genes[i] is the name of the gene with index i
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, iGeneToCases, geneToNumCases, geneToIndex, indexToGene = cMutations
    genes = sorted(geneToIndex, key=geneToIndex.__getitem__)

    C.precompute_factorials(max(m, n))
    try:
        C.set_weight(C.weightFunctionChars[wf])
        results = C.exhaustive(k, m, n, iPatientToGenes, geneToNumCases, pvalThresh)
    finally:
        # Always pair precompute_factorials() with free_factorials(), even
        # when the search raises (presumably this releases native memory).
        C.free_factorials()

    # Parse the output, sorting by decreasing weight. sorted() is used
    # because zip() only returns a sortable list on Python 2.
    solns, weights, tables, probs = results
    res = sorted(zip(solns, weights, tables, probs),
                 key=lambda row: row[1], reverse=True)
    rows = [(sorted(genes[g] for g in geneset), p, w)
            for geneset, w, _, p in res]

    # Output only sets, probs, and weights as TSV. The header names exactly
    # the three columns that each row contains.
    outPath = "%s-k%s-%s-exhaustive.tsv" % (args.output_prefix, k, wf)
    with open(outPath, "w") as outfile:
        output = ["#Gene set\tP-value\tWeight"]
        output.extend("\t".join([", ".join(s), str(p), str(w)])
                      for s, p, w in rows)
        outfile.write("\n".join(output))

    return rows
Пример #5
0
def load_precomputed_scores(infile, mutations, subt):
    """Feed the gene-set scores stored in ``infile`` to the C extension.

    Each non-comment line of ``infile`` is tab-separated. The first three
    fields (sampling frequency, total weight, target weight) are skipped;
    after them come groups of three fields per gene set: the ", "-separated
    gene names, the score, and the weight function. Every group is passed to
    ``C.load_precomputed_scores``.

    Args:
        infile: Path of the score file. Its name must contain ``.k<digits>.``;
            the number of digit characters determines how many gene-set
            groups are read from each line.
        mutations: Tuple of mutation data, unpacked into
            ``C.convert_mutations_to_C_format`` (with ``subt`` appended when
            ``subt`` is non-empty).
        subt: Optional collection of subtypes.

    Raises:
        AttributeError: If ``infile`` does not match the expected name
            pattern (the regex match is ``None``).
        KeyError: If a gene in the file is not in the gene index.
        IndexError: If a line has fewer fields than expected.
    """
    if subt:
        mutations = mutations + (subt,)
    cMutations = C.convert_mutations_to_C_format(*mutations)
    iPatientToGenes, iGeneToCases, geneToNumCases, geneToIndex, indexToGene = cMutations

    baseI = 3  # sampling freq., total weight, target weight
    setI = 3  # gene set, score, weight function

    matchObj = re.match(r'.+\.k(\d+)\..+?', infile)

    # Determine t, the number of gene sets: one per digit after ".k"
    loadingT = len(matchObj.group(1))

    # Use a context manager so the file handle is closed deterministically.
    with open(infile) as scoreFile:
        for l in scoreFile:
            if l.startswith("#"):
                continue
            v = l.rstrip().split("\t")
            for i in range(loadingT):
                offset = baseI + i * setI
                geneNames = v[offset].split(", ")
                gSet = [geneToIndex[g] for g in geneNames]
                C.load_precomputed_scores(float(v[offset + 1]), len(geneNames),
                                          int(v[offset + 2]), gSet)