예제 #1
0
    def load_matrices(self, load_directory, geneFile=None, patientFile=None, minFreq=0):
        """Load previously written permutation matrices from *load_directory*.

        Every file in the directory is parsed with mex.load_mutation_data and
        its gene->cases / patient->genes mappings are appended to the
        per-permutation lists (self.geneToCases_perm / self.patientToGenes_perm).
        Exits the process if any loaded matrix disagrees with the original
        gene or patient universe.

        load_directory -- directory holding one matrix file per permutation.
        geneFile, patientFile, minFreq -- passed through to mex.load_mutation_data.
        """
        matrices = os.listdir(load_directory)
        for file in matrices:
            _, _, newGenes, newPatients, newGeneToCases, newPatientToGenes = mex.load_mutation_data(load_directory + '/' + file, geneFile=geneFile,
                                                                                   patientFile=patientFile, minFreq=minFreq)

            # check for differences in loaded genes/patients
            if set.difference(set(newGenes), set(self.genes)):
                print "Error: loaded genes different from original matrix"
                print "in file ", file, " in matrix directory ", load_directory
                exit(1)
            if set.difference(set(newPatients), set(self.patients)):
                # BUG FIX: this branch checks patients, but the message used to
                # say "genes" (copy-paste from the branch above).
                print "Error: loaded patients different from original matrix"
                print "in file ", file, " in matrix directory ", load_directory
                exit(1)

            # load the new ones in
            for gene in newGeneToCases:
                self.geneToCases_perm[gene].append(newGeneToCases[gene])

            for patient in newPatientToGenes:
                self.patientToGenes_perm[patient].append(newPatientToGenes[patient])

        print "Number of loaded matrices: ", len(matrices)
        self.num_permutations = len(matrices)

        return
예제 #2
0
def main():
    """Compute per-gene frequency-association p-values for BRCA and dump them to CSV."""

    # Input locations / loading parameters.
    mutationmatrix = '/Users/jlu96/maf/new/BRCA_wustl/BRCA_wustl-som-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/BRCA_wustl/shared_patients.plst'
    geneFile = None
    minFreq = 0
    outpatientfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Heatmaps/BRCA_patientmatrix_log.csv'

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(
        mutationmatrix, patientFile, geneFile, minFreq)

    # ---- All Gene Frequency Association -------------------------------------------------------------------------
    genePvalue_file = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/geneMakeBin/BRCA_genemakebin.csv'
    genePValue = all_gene_freq_association(geneToCases, patientToGenes)

    # One row per gene: header first, then every association record.
    with open(genePvalue_file, 'w') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(['Gene', 'Pvalue', 'Frequency', 'InAvg', 'OutAvg'])
        for record in genePValue:
            out.writerow(record)
예제 #3
0
def main():
    """Run the all-gene frequency association for BRCA and write the results as CSV."""

    # Fixed analysis inputs.
    mutationmatrix = '/Users/jlu96/maf/new/BRCA_wustl/BRCA_wustl-som-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/BRCA_wustl/shared_patients.plst'
    geneFile = None
    minFreq = 0
    outpatientfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Heatmaps/BRCA_patientmatrix_log.csv'

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)

    # ---- All Gene Frequency Association -------------------------------------------------------------------------
    genePvalue_file = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/geneMakeBin/BRCA_genemakebin.csv'
    genePValue = all_gene_freq_association(geneToCases, patientToGenes)

    with open(genePvalue_file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Gene', 'Pvalue', 'Frequency', 'InAvg', 'OutAvg'])
        # writerows iterates the result set exactly like the per-row loop did.
        writer.writerows(genePValue)
예제 #4
0
def run(args):
    """Bin genes into segments and write a collapsed mutation matrix.

    Loads a mutation matrix, strips blacklisted genes/patients, then either
    loads precomputed gene->segment bins (``args.load_gene_segments``) or
    derives bins from co-occurring gene pairs, and finally writes a new
    mutation matrix keyed by the resulting bins.

    args -- parsed argparse namespace supplying all file paths and thresholds.
    """
    mdictfile = args.mdictfile
    cdictfile = args.cdictfile

    mprob = args.mprob
    cprob = args.cprob
    cooccur_distance_threshold = args.cooccur_distance_threshold
    bin_distance_threshold = args.bin_distance_threshold

    mutationmatrix = args.mutation_matrix
    newmutationmatrix = args.newmutationmatrix

    # Derived output file names default to the new matrix path as their prefix.
    file_prefix = args.output_prefix
    if not file_prefix:
        file_prefix = newmutationmatrix

    geneFile = args.gene_file
    patientFile = args.patient_file
    gene_blacklist = args.gene_blacklist_file
    patient_blacklist = args.patient_blacklist_file
    minFreq = args.min_freq
    minCooccur = args.min_cooccur
    min_cooccurrence_ratio = args.min_cooccurrence_ratio
    top_percentile = args.top_percentile
    top_number = args.top_number

    parallel_compute_number = args.parallel_compute_number


    # "fcss" = filter co-occurrence on same segment thresholds.
    filter_cooccur_same_segment = args.filter_cooccur_same_segment
    fcss_cratiothresh = args.fcss_cratiothresh
    fcss_mutfreqdiffthresh = args.fcss_mutfreqdiffthresh
    fcss_mutfreqdiffratiothresh = args.fcss_mutfreqdiffratiothresh
    fcss_coveragethresh = args.fcss_coveragethresh
    fcss_probabilitythresh = args.fcss_probabilitythresh



    gene_segment_file = args.gene_segment_file
    load_gene_segments = args.load_gene_segments
    is_gene2seg = args.is_gene2seg
    gene_bin_entries_file = args.gene_bin_entries_file
    no_throw_out_extras = args.no_throw_out_extras
    segment_info_file = args.segment_info_file


    if not gene_bin_entries_file:
        gene_bin_entries_file = file_prefix + '_binnedgenes.tsv'

    if not segment_info_file:
        segment_info_file = file_prefix + '_SEGMENTINFO.tsv'

    #-----------------------------------------------------





    # Load the matrix and drop blacklisted genes/patients in one step.
    mutations = mex.remove_blacklists(gene_blacklist, patient_blacklist,
                                  *mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq))
    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mutations
    print 'Filtered Mutation data: %s genes x %s patients' % (numGenes, numCases)


    # Load segment info
    if load_gene_segments:

        # extra_genes is the genes not found in the segment file.
        # If throw_out_extras is False, extra_genes will be empty.
        geneToBin, extra_genes = load_gene_to_bin(gene_segment_file, geneToCases, no_throw_out_extras=no_throw_out_extras, is_gene2seg=is_gene2seg)

        numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.remove_extra_genes(extra_genes, numGenes, numCases, genes, patients, geneToCases, patientToGenes)


    else:
        print "Beginning bin genes by co-occurring pairs. "
        genepairs = getgenepairs(geneToCases, genes, closer_than_distance=bin_distance_threshold)
        print "Pairs retrieved. Calculating cooccurring pairs to make bins."

        # NOTE(review): fcss_cratiothresh and fcss_probabilitythresh are each
        # passed twice here — looks intentional given the callee's signature,
        # but worth confirming against met.complete_cooccurpairs.
        cpairsdict, cgenedict = met.complete_cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, fcss_probabilitythresh, minCooccur,
                      cooccur_distance_threshold, fcss_cratiothresh, parallel_compute_number,
                      filter_cooccur_same_segment, fcss_cratiothresh, fcss_mutfreqdiffratiothresh,
                      fcss_coveragethresh, fcss_probabilitythresh)

        print "Cooccurring pairs calculated."
        geneToBin = get_gene_bins_cooccur_same_segment(cpairsdict, geneToCases, fcss_cratiothresh, fcss_mutfreqdiffthresh,
                           fcss_mutfreqdiffratiothresh, fcss_coveragethresh, fcss_probabilitythresh, bin_distance_threshold=bin_distance_threshold)
        # Write these new bins out
        new_bins = convert_genes_to_bins(genes, geneToBin)
        write_segment_infos(new_bins, filename=segment_info_file)
        print "New SEGMENTINFO written to ", segment_info_file

        write_gene_positions(new_bins)
        print "New segment positions appended to gene_positions.txt"


    # Update to the new mutation matrix.
    geneToBinSet, bin_setToBin = bin_sets_from_geneToBin(genes, geneToBin)

    newGeneToCases, newPatientToGenes = update_geneToCases_patientToGenes(geneToCases, patientToGenes, bin_setToBin, at_least_half=True)

    gene_bin_entries = get_gene_bin_entries(geneToCases, newGeneToCases, geneToBinSet, bin_setToBin)

    if gene_bin_entries_file:
        met.writeanydict(gene_bin_entries, gene_bin_entries_file)
        print "Gene bin entries written to ", gene_bin_entries_file


    # Write the new mutation matrix out.
    writemutationmatrix(newPatientToGenes, filename=newmutationmatrix)
예제 #5
0
def main():
    """Compute co-occurring gene pairs for OV CNA data with binomial cohort p-values.

    Two modes: ``do_min_cohort`` restricts pairs to genes mutated in the
    minimum cohort (from the partition file) and adds min-cohort binomial
    p-values; otherwise pairs come from all genes and cohort p-values are
    added either from a loaded partition or generated cohorts.
    Results are written to ``cpairfile``.
    """


    mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst'
    cpairfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Binomial/OV_broad-cna-jl-cpairs-min_cohort.txt'
    partitionfile = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf'
    load_partitions = True
    do_min_cohort = True

    geneFile = None
    minFreq = 0
    test_minFreq = 100
    compute_mutex = True



    include_cohort_info = False
    num_cohorts_list = [1,3, 5, 7]


    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)

    print "number of genes is ", numGenes


    if do_min_cohort:
        cohort_dict, clusterToProp, min_cohort = partition.load_patient_cohorts(partitionfile, patientToGenes)
        # Union of the gene sets of all patients in the minimum cohort.
        min_cohort_genes = set.union(*(patientToGenes[p] for p in min_cohort))

        print "getting pairs"
        genepairs = met.getgenepairs(geneToCases, min_cohort_genes, test_minFreq=test_minFreq)

        print "Number of pairs ", len(genepairs)


        print "Normal cooccur test"
        t = time.time()
        cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex)
        print "Normal cooccur done in ", time.time() - t

        print "Beginning cohorts"
        t = time.time()
        cpairsdict = add_BinomP_min_cohort_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict, min_cohort)
        print "Cohorts done in ", time.time() - t

    else:
        genepairs = met.getgenepairs(geneToCases, genes, test_minFreq=test_minFreq)
        print "Number of pairs ", len(genepairs)


        print "Normal cooccur test"
        cpairsdict, cgenedict = met.cooccurpairs(numCases, geneToCases, patientToGenes, genepairs, compute_mutex=compute_mutex)

        # print "Add binomial probability"
        # cpairsdict = add_BinomP_all_pairs(cpairsdict, geneToCases, patientToGenes)

        # undo
        print "Beginning cohorts"




        # NOTE(review): here load_patient_cohorts is called with one argument
        # and a single return value, unlike the 3-tuple unpacking above —
        # verify the callee supports both call shapes.
        if load_partitions:
            cohort_dict = partition.load_patient_cohorts(partitionfile)
            cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)

        else:
            for num_cohorts in num_cohorts_list:
                # get cohorts
                cohort_dict = generate_patient_cohorts(patientToGenes, num_cohorts)

                cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)

                if include_cohort_info:
                    cpairsdict = add_cohorts_all_pairs(cpairsdict, geneToCases, patientToGenes, cohort_dict)

    print "Writing to file..."
    met.writeanydict(cpairsdict, cpairfile)
예제 #6
0
def main():
    # INDEX BY LOSSES
    mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst'
    out_file = '/Users/jlu96/conte/jlu/Analyses/CancerMutationDistributions/OV_broad-cna-jl-PMM.csv'
    partition_file = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf'
    min_cluster_size = 30
    num_init = 9
    minComp = 4
    maxComp = 6
    do_plot = True
    do_gmm = False
    do_dna = True
    num_integrated = 4
    do_kmeans = False
    do_pmm = False
    geneFile = None
    minFreq = 0
    dna_gene_file = '/Users/jlu96/conte/jlu/Analyses/CancerGeneAnalysis/DNADamageRepair_loss.txt'


    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)

    p_gene_list = []

    with open(dna_gene_file, 'rU') as row_file:
        reader = csv.reader(row_file, delimiter='\t')
        for row in reader:
            p_gene_list.append(row[0])


    if do_kmeans:
        datas = []
        for i in np.arange(minComp, maxComp, 1):
            datas.append(partition_gene_kmeans(geneToCases, patientToGenes, p_gene_list, i, num_bins=50, title=None, do_plot=True))

        with open(out_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys())
            writer.writeheader()
            for row in datas:
                writer.writerow(row)


    if do_dna:
        cohort_dict = partition_gene_list(patientToGenes, p_gene_list, binary=not bool(num_integrated))
        # Make new cohorts over this
        if num_integrated:
            cohort_dict = integrate_cohorts(cohort_dict, numCases, num_integrated)


        cohort_pairings = [(key, cohort_dict[key]) for key in cohort_dict]
        draw_partitions_cohorts(geneToCases, patientToGenes, cohort_pairings, title='DNADamageGenes',
                        num_bins=100 if mutationmatrix[-9:] == 'cna-jl.m2' else 50)


    if do_gmm:
        datas = []
        for i in np.arange(minComp, maxComp, 1):
            datas.append(partition_GMM(patientToGenes, i, num_bins=50, title='GMM size ' + str(i), do_plot=do_plot))

        with open(out_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys())
            writer.writeheader()
            for row in datas:
                writer.writerow(row)


    if do_pmm:
        datas = []
        clusters = []
        for num_components in np.arange(minComp, maxComp, 1):
            best_data, clusterToPatient = best_pmm(patientToGenes, num_components, rand_num=5, far_rand_num=5,
                                                   min_cluster_size=min_cluster_size)
            datas.append(best_data)
            clusters.append(clusterToPatient)
            # data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20, rand_init=False, do_plot=True)
            # datas.append(data)
            # all_lls.append(lls)
            # for j in range(num_init):
            #     data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20)
            #     datas.append(data)
            #     all_lls.append(lls)

        # os.system('say "Jonathan your program has finished"')


        # get the best BIC
        combined = zip(datas, clusters)
        combined = sorted(combined, key=lambda entry: (entry['MoreThanMin'], entry['BIC']))
        datas, clusters = zip(*combined)


        with open(out_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys())
            print datas
            writer.writeheader()
            for row in datas:
                writer.writerow(row)


        best_data = datas[0]
        clusterToPatient = clusters[0]

        # code to parition by best clusters
        with open(partition_file, 'w') as csvfile:
            writer = csv.writer(csvfile)

            writer.writerow(['Likelihood', best_data['Likelihood']])
            writer.writerow(['BIC', best_data['BIC']])
            writer.writerow(['NumComponents', best_data['Number']])
            writer.writerow(['Cluster', 'Lambda', 'Probability', 'Patients'])
            for k in clusterToPatient:
                if k != -1:
                    lam = best_data['Means'][k]
                    p_k = best_data['Probabilities'][k]
                else:
                    lam = None
                    p_k = None
                writer.writerow([k, lam, p_k] + list(clusterToPatient[k]))

        load_patient_cohorts(partition_file)
예제 #7
0
def run(args):
    """Bin genes into segments and write a collapsed mutation matrix.

    Loads a mutation matrix, strips blacklisted genes/patients, then either
    loads precomputed gene->segment bins (``args.load_gene_segments``) or
    derives bins from co-occurring gene pairs, and finally writes a new
    mutation matrix keyed by the resulting bins.

    args -- parsed argparse namespace supplying all file paths and thresholds.
    """
    mdictfile = args.mdictfile
    cdictfile = args.cdictfile

    mprob = args.mprob
    cprob = args.cprob
    cooccur_distance_threshold = args.cooccur_distance_threshold
    bin_distance_threshold = args.bin_distance_threshold

    mutationmatrix = args.mutation_matrix
    newmutationmatrix = args.newmutationmatrix

    # Derived output file names default to the new matrix path as their prefix.
    file_prefix = args.output_prefix
    if not file_prefix:
        file_prefix = newmutationmatrix

    geneFile = args.gene_file
    patientFile = args.patient_file
    gene_blacklist = args.gene_blacklist_file
    patient_blacklist = args.patient_blacklist_file
    minFreq = args.min_freq
    minCooccur = args.min_cooccur
    min_cooccurrence_ratio = args.min_cooccurrence_ratio
    top_percentile = args.top_percentile
    top_number = args.top_number

    parallel_compute_number = args.parallel_compute_number

    # "fcss" = filter co-occurrence on same segment thresholds.
    filter_cooccur_same_segment = args.filter_cooccur_same_segment
    fcss_cratiothresh = args.fcss_cratiothresh
    fcss_mutfreqdiffthresh = args.fcss_mutfreqdiffthresh
    fcss_mutfreqdiffratiothresh = args.fcss_mutfreqdiffratiothresh
    fcss_coveragethresh = args.fcss_coveragethresh
    fcss_probabilitythresh = args.fcss_probabilitythresh

    gene_segment_file = args.gene_segment_file
    load_gene_segments = args.load_gene_segments
    is_gene2seg = args.is_gene2seg
    gene_bin_entries_file = args.gene_bin_entries_file
    no_throw_out_extras = args.no_throw_out_extras
    segment_info_file = args.segment_info_file

    if not gene_bin_entries_file:
        gene_bin_entries_file = file_prefix + '_binnedgenes.tsv'

    if not segment_info_file:
        segment_info_file = file_prefix + '_SEGMENTINFO.tsv'

    #-----------------------------------------------------

    # Load the matrix and drop blacklisted genes/patients in one step.
    mutations = mex.remove_blacklists(
        gene_blacklist, patient_blacklist,
        *mex.load_mutation_data(mutationmatrix, patientFile, geneFile,
                                minFreq))
    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mutations
    print 'Filtered Mutation data: %s genes x %s patients' % (numGenes,
                                                              numCases)

    # Load segment info
    if load_gene_segments:

        # extra_genes is the genes not found in the segment file.
        # If throw_out_extras is False, extra_genes will be empty.
        geneToBin, extra_genes = load_gene_to_bin(
            gene_segment_file,
            geneToCases,
            no_throw_out_extras=no_throw_out_extras,
            is_gene2seg=is_gene2seg)

        numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.remove_extra_genes(
            extra_genes, numGenes, numCases, genes, patients, geneToCases,
            patientToGenes)

    else:
        print "Beginning bin genes by co-occurring pairs. "
        genepairs = getgenepairs(geneToCases,
                                 genes,
                                 closer_than_distance=bin_distance_threshold)
        print "Pairs retrieved. Calculating cooccurring pairs to make bins."

        # NOTE(review): fcss_cratiothresh and fcss_probabilitythresh are each
        # passed twice here — looks intentional given the callee's signature,
        # but worth confirming against met.complete_cooccurpairs.
        cpairsdict, cgenedict = met.complete_cooccurpairs(
            numCases, geneToCases, patientToGenes, genepairs,
            fcss_probabilitythresh, minCooccur, cooccur_distance_threshold,
            fcss_cratiothresh, parallel_compute_number,
            filter_cooccur_same_segment, fcss_cratiothresh,
            fcss_mutfreqdiffratiothresh, fcss_coveragethresh,
            fcss_probabilitythresh)

        print "Cooccurring pairs calculated."
        geneToBin = get_gene_bins_cooccur_same_segment(
            cpairsdict,
            geneToCases,
            fcss_cratiothresh,
            fcss_mutfreqdiffthresh,
            fcss_mutfreqdiffratiothresh,
            fcss_coveragethresh,
            fcss_probabilitythresh,
            bin_distance_threshold=bin_distance_threshold)
        # Write these new bins out
        new_bins = convert_genes_to_bins(genes, geneToBin)
        write_segment_infos(new_bins, filename=segment_info_file)
        print "New SEGMENTINFO written to ", segment_info_file

        write_gene_positions(new_bins)
        print "New segment positions appended to gene_positions.txt"

    # Update to the new mutation matrix.
    geneToBinSet, bin_setToBin = bin_sets_from_geneToBin(genes, geneToBin)

    newGeneToCases, newPatientToGenes = update_geneToCases_patientToGenes(
        geneToCases, patientToGenes, bin_setToBin, at_least_half=True)

    gene_bin_entries = get_gene_bin_entries(geneToCases, newGeneToCases,
                                            geneToBinSet, bin_setToBin)

    if gene_bin_entries_file:
        met.writeanydict(gene_bin_entries, gene_bin_entries_file)
        print "Gene bin entries written to ", gene_bin_entries_file

    # Write the new mutation matrix out.
    writemutationmatrix(newPatientToGenes, filename=newmutationmatrix)
예제 #8
0
def main():
    # INDEX BY LOSSES
    mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst'
    out_file = '/Users/jlu96/conte/jlu/Analyses/CancerMutationDistributions/OV_broad-cna-jl-PMM.csv'
    partition_file = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf'
    min_cluster_size = 30
    num_init = 9
    minComp = 4
    maxComp = 6
    do_plot = True
    do_gmm = False
    do_dna = True
    num_integrated = 4
    do_kmeans = False
    do_pmm = False
    geneFile = None
    minFreq = 0
    dna_gene_file = '/Users/jlu96/conte/jlu/Analyses/CancerGeneAnalysis/DNADamageRepair_loss.txt'

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(
        mutationmatrix, patientFile, geneFile, minFreq)

    p_gene_list = []

    with open(dna_gene_file, 'rU') as row_file:
        reader = csv.reader(row_file, delimiter='\t')
        for row in reader:
            p_gene_list.append(row[0])

    if do_kmeans:
        datas = []
        for i in np.arange(minComp, maxComp, 1):
            datas.append(
                partition_gene_kmeans(geneToCases,
                                      patientToGenes,
                                      p_gene_list,
                                      i,
                                      num_bins=50,
                                      title=None,
                                      do_plot=True))

        with open(out_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys())
            writer.writeheader()
            for row in datas:
                writer.writerow(row)

    if do_dna:
        cohort_dict = partition_gene_list(patientToGenes,
                                          p_gene_list,
                                          binary=not bool(num_integrated))
        # Make new cohorts over this
        if num_integrated:
            cohort_dict = integrate_cohorts(cohort_dict, numCases,
                                            num_integrated)

        cohort_pairings = [(key, cohort_dict[key]) for key in cohort_dict]
        draw_partitions_cohorts(
            geneToCases,
            patientToGenes,
            cohort_pairings,
            title='DNADamageGenes',
            num_bins=100 if mutationmatrix[-9:] == 'cna-jl.m2' else 50)

    if do_gmm:
        datas = []
        for i in np.arange(minComp, maxComp, 1):
            datas.append(
                partition_GMM(patientToGenes,
                              i,
                              num_bins=50,
                              title='GMM size ' + str(i),
                              do_plot=do_plot))

        with open(out_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys())
            writer.writeheader()
            for row in datas:
                writer.writerow(row)

    if do_pmm:
        datas = []
        clusters = []
        for num_components in np.arange(minComp, maxComp, 1):
            best_data, clusterToPatient = best_pmm(
                patientToGenes,
                num_components,
                rand_num=5,
                far_rand_num=5,
                min_cluster_size=min_cluster_size)
            datas.append(best_data)
            clusters.append(clusterToPatient)
            # data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20, rand_init=False, do_plot=True)
            # datas.append(data)
            # all_lls.append(lls)
            # for j in range(num_init):
            #     data, lls = partition_pmm(patientToGenes, i, num_bins=50, max_iter=20)
            #     datas.append(data)
            #     all_lls.append(lls)

        # os.system('say "Jonathan your program has finished"')

        # get the best BIC
        combined = zip(datas, clusters)
        combined = sorted(combined,
                          key=lambda entry:
                          (entry['MoreThanMin'], entry['BIC']))
        datas, clusters = zip(*combined)

        with open(out_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=datas[0].keys())
            print datas
            writer.writeheader()
            for row in datas:
                writer.writerow(row)

        best_data = datas[0]
        clusterToPatient = clusters[0]

        # code to parition by best clusters
        with open(partition_file, 'w') as csvfile:
            writer = csv.writer(csvfile)

            writer.writerow(['Likelihood', best_data['Likelihood']])
            writer.writerow(['BIC', best_data['BIC']])
            writer.writerow(['NumComponents', best_data['Number']])
            writer.writerow(['Cluster', 'Lambda', 'Probability', 'Patients'])
            for k in clusterToPatient:
                if k != -1:
                    lam = best_data['Means'][k]
                    p_k = best_data['Probabilities'][k]
                else:
                    lam = None
                    p_k = None
                writer.writerow([k, lam, p_k] + list(clusterToPatient[k]))

        load_patient_cohorts(partition_file)
예제 #9
0
def main():
    """Run permutation-based mutex and co-occurrence tests for PRAD.

    Loads the mutation matrix, builds (or loads) permutation matrices, then for
    every pair of frequently mutated genes computes a network-permutation
    p-value plus the standard mutex / co-occurrence statistics, writing the
    mutex table, the co-occurrence table, and the permutation seeds to
    timestamped TSV files under ``matrixdirectory``.
    """

    mutationmatrix = '/Users/jlu96/maf/new/PRAD_broad/PRAD_broad-som.m2'
    patientFile = None #'/Users/jlu96/maf/new/PRAD_broad/shared_patients.plst'
    geneFile = None #'/Users/jlu96/conte/jlu/REQUIREDFILES_OnlyLoss2/COSMICGenes_OnlyLoss.txt'
    load_directory = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LoadMatrices'
    minFreq = 0
    num_permutations = 20
    binary_perm_method = False
    Q = 100
    write_matrices = True
    matrixdirectory = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LoadMatrices'
        #'/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/SARC_broad-som-jl-' + ('matrix' if binary_perm_method else 'network')
    # time.time() in the names keeps reruns from clobbering earlier outputs.
    outmutexfile = matrixdirectory + '/mutex' + str(num_permutations) + str(time.time()) + '.tsv'
    outcooccurfile = matrixdirectory + '/cooccur' + str(num_permutations)  + str(time.time()) + '.tsv'
    outseedsfile = matrixdirectory + '/seeds' + str(time.time()) + '.tsv'


    # NOTE(review): dirname() of a path without a trailing slash is the parent
    # directory, so this creates the parent of matrixdirectory, not
    # matrixdirectory itself — confirm that is intended.
    if not os.path.exists(os.path.dirname(matrixdirectory)):
        os.makedirs(os.path.dirname(matrixdirectory))


    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)

    print "numGenes ", numGenes, " and numCases ", numCases

    # Drop patients with no mutations so permutations don't see empty rows.
    for patient in patients:
        if not patientToGenes[patient]:
            patientToGenes.pop(patient)
            print patient, "popped"

    # Generate Permutation Matrices
    pm = PermutationMatrices(geneToCases, patientToGenes, num_permutations, Q=Q, matrixdirectory=matrixdirectory,
                             binary_perm_method=binary_perm_method, write_matrices=write_matrices, load_directory=load_directory,
                             geneFile=geneFile, patientFile=patientFile, minFreq=minFreq)

    # Make list of pairs from highly mutated genes (mutated in > 5 cases)
    test_genes = [gene for gene in genes if len(geneToCases[gene]) > 5]
    # for test_gene in test_genes:
    #     print test_gene
    genepairs = met.getgenepairs(geneToCases, test_genes)
    print "Number of pairs to test ", len(genepairs)





    # CALCULATE MUTEX

    # Create a list of ConditionFunctions that you must later initialize...
    # (Python 2: range() returns a list, used here as preallocated slots.)
    ConditionFunctions = range(len(genepairs))
    mutex_set_condition_function_list = []

    # Generate set_condition_function_list
    for i in range(len(genepairs)):
        genepair = genepairs[i]

        # One condition per pair: its genes, observed overlap, mutex mode.
        condition_dict = {}
        condition_dict['Genes'] = tuple(genepair)
        condition_dict['Overlap'] = len(set.intersection(*[geneToCases[gene] for gene in condition_dict['Genes']]))
        condition_dict['Mutex'] = True

        ConditionFunctions[i] = Condition([condition_dict])

        # Sanity check: the Condition must store exactly what was passed in.
        if [condition_dict] != ConditionFunctions[i].conditions:
            print condition_dict, ConditionFunctions[i].conditions


        mutex_set_condition_function_list.append((genepair, ConditionFunctions[i]))

    print "Finished mutex condition function list"

    t= time.time()
    # Calculate pvalues for mutual exclusivity
    pair_to_mutex = {}

    pair_to_mutex_network_pvalue = pm.set_to_pvalue(mutex_set_condition_function_list)
    print "mutex pair network pvalues finished in ", time.time() - t

    for genepair in genepairs:
        pair_to_mutex[genepair] = mex.analyze_mutex_set_new(numCases, geneToCases, patientToGenes, genepair)
        pair_to_mutex[genepair]['NetworkProbability'] = pair_to_mutex_network_pvalue[genepair]




    # Write to output
    with open(outmutexfile, 'w') as csvfile:
        fieldnames = pair_to_mutex[genepairs[0]].keys()
        writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)
        writer.writeheader()
        for genepair in pair_to_mutex:
            writer.writerow(pair_to_mutex[genepair])



    # CALCULATE COOCCUR

    cooccur_set_condition_function_list = []

    # Generate set_condition_function_list
    for genepair in genepairs:
        ConditionFunction = Condition(None)

        # Same condition shape as mutex, but with Mutex=False.
        condition_dict = {}
        condition_dict['Genes'] = tuple(genepair)
        condition_dict['Overlap'] = len(set.intersection(*[geneToCases[gene] for gene in condition_dict['Genes']]))
        condition_dict['Mutex'] = False

        ConditionFunction.set_params([condition_dict])

        cooccur_set_condition_function_list.append((genepair, ConditionFunction))



    t= time.time()
    # Calculate pvalues for co-occurrence
    pair_to_cooccur = {}

    pair_to_cooccur_network_pvalue = pm.set_to_pvalue(cooccur_set_condition_function_list)
    print "cooccur pair network pvalues finished in ", time.time() - t

    for genepair in genepairs:
        pair_to_cooccur[genepair] = mex.analyze_cooccur_set_new(numCases, geneToCases, patientToGenes, genepair)
        pair_to_cooccur[genepair]['NetworkProbability'] = pair_to_cooccur_network_pvalue[genepair]




    # Write to output
    with open(outcooccurfile, 'w') as csvfile:
        fieldnames = pair_to_cooccur[genepairs[0]].keys()
        writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)
        writer.writeheader()
        for genepair in pair_to_cooccur:
            writer.writerow(pair_to_cooccur[genepair])


    # Write seeds to output (one RNG seed per permutation, for reproducibility)
    with open(outseedsfile, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        for seed in pm.seeds:
            writer.writerow([seed])
예제 #10
0
def main():
    """Compute co-occurring gene pairs for OV CNA data with binomial cohort p-values.

    Two modes: ``do_min_cohort`` restricts pairs to genes mutated in the
    minimum cohort (from the partition file) and adds min-cohort binomial
    p-values; otherwise pairs come from all genes and cohort p-values are
    added either from a loaded partition or generated cohorts.
    Results are written to ``cpairfile``.
    """

    mutationmatrix = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.m2'
    patientFile = '/Users/jlu96/maf/new/OV_broad/shared_patients.plst'
    cpairfile = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LorenzoModel/Binomial/OV_broad-cna-jl-cpairs-min_cohort.txt'
    partitionfile = '/Users/jlu96/maf/new/OV_broad/OV_broad-cna-jl.ppf'
    load_partitions = True
    do_min_cohort = True

    geneFile = None
    minFreq = 0
    test_minFreq = 100
    compute_mutex = True

    include_cohort_info = False
    num_cohorts_list = [1, 3, 5, 7]

    numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(
        mutationmatrix, patientFile, geneFile, minFreq)

    print "number of genes is ", numGenes

    if do_min_cohort:
        cohort_dict, clusterToProp, min_cohort = partition.load_patient_cohorts(
            partitionfile, patientToGenes)
        # Union of the gene sets of all patients in the minimum cohort.
        min_cohort_genes = set.union(*(patientToGenes[p] for p in min_cohort))

        print "getting pairs"
        genepairs = met.getgenepairs(geneToCases,
                                     min_cohort_genes,
                                     test_minFreq=test_minFreq)

        print "Number of pairs ", len(genepairs)

        print "Normal cooccur test"
        t = time.time()
        cpairsdict, cgenedict = met.cooccurpairs(numCases,
                                                 geneToCases,
                                                 patientToGenes,
                                                 genepairs,
                                                 compute_mutex=compute_mutex)
        print "Normal cooccur done in ", time.time() - t

        print "Beginning cohorts"
        t = time.time()
        cpairsdict = add_BinomP_min_cohort_all_pairs(cpairsdict, geneToCases,
                                                     patientToGenes,
                                                     cohort_dict, min_cohort)
        print "Cohorts done in ", time.time() - t

    else:
        genepairs = met.getgenepairs(geneToCases,
                                     genes,
                                     test_minFreq=test_minFreq)
        print "Number of pairs ", len(genepairs)

        print "Normal cooccur test"
        cpairsdict, cgenedict = met.cooccurpairs(numCases,
                                                 geneToCases,
                                                 patientToGenes,
                                                 genepairs,
                                                 compute_mutex=compute_mutex)

        # print "Add binomial probability"
        # cpairsdict = add_BinomP_all_pairs(cpairsdict, geneToCases, patientToGenes)

        # undo
        print "Beginning cohorts"

        # NOTE(review): here load_patient_cohorts is called with one argument
        # and a single return value, unlike the 3-tuple unpacking above —
        # verify the callee supports both call shapes.
        if load_partitions:
            cohort_dict = partition.load_patient_cohorts(partitionfile)
            cpairsdict = add_BinomP_cohorts_all_pairs(cpairsdict, geneToCases,
                                                      patientToGenes,
                                                      cohort_dict)

        else:
            for num_cohorts in num_cohorts_list:
                # get cohorts
                cohort_dict = generate_patient_cohorts(patientToGenes,
                                                       num_cohorts)

                cpairsdict = add_BinomP_cohorts_all_pairs(
                    cpairsdict, geneToCases, patientToGenes, cohort_dict)

                if include_cohort_info:
                    cpairsdict = add_cohorts_all_pairs(cpairsdict, geneToCases,
                                                       patientToGenes,
                                                       cohort_dict)

    print "Writing to file..."
    met.writeanydict(cpairsdict, cpairfile)