예제 #1
0
def find_clusters_and_survival(tested_gene_list_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean",
                               plot=True):
    """Partition the loaded expression data and run ``km_curve`` per partition.

    The data set comes from ``load_integrated_ge_data``.  In unsupervised
    mode ``find_clusters`` is called once for the range ``start_k..end_k``
    and ``km_curve`` is run on the result stored under every k.  Otherwise
    each entry of ``meta_groups`` is turned into patient groups with
    ``divided_patient_ids_by_label``, a heatmap is plotted, and ``km_curve``
    is run on those groups.

    The ``*_path`` arguments, ``gene_filter_file_name`` and
    ``gene_filter_file_path`` are accepted but not used by this function.

    Returns:
        ``None`` when ``load_integrated_ge_data`` returns ``None``; otherwise
        the tuple ``(results, clfs_results)`` where ``results`` holds the
        first element of every ``km_curve`` return value and
        ``clfs_results`` is the ``find_clusters`` output (``None`` in the
        supervised branch).
    """
    loaded = load_integrated_ge_data(
        tested_gene_list_file_name=tested_gene_list_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if loaded is None:
        print("insufficient data")
        return
    (expression, row_headers, column_headers,
     labels_assignment, survival_dataset) = loaded

    # File name without its extension; handed to km_curve with every call.
    output_prefix = tested_gene_list_file_name.split(".")[0]

    if not is_unsupervised:
        results = []
        for group_index, cur_groups in enumerate(meta_groups):
            labeled_patients = divided_patient_ids_by_label(
                phenotype_file_name, groups=cur_groups)
            # The labelling under inspection goes first, the others follow.
            reordered_labels = ([labels_assignment[group_index]]
                                + labels_assignment[:group_index]
                                + labels_assignment[group_index + 1:])
            plot_heatmap(expression,
                         e2g_convertor(column_headers),
                         reordered_labels,
                         row_headers,
                         tested_gene_list_file_name,
                         label_index=group_index)
            curve = km_curve(labeled_patients,
                             survival_dataset[1:],
                             row_headers,
                             output_prefix,
                             label_index=group_index)
            results.append(curve[0])
        return results, None

    clfs_results = find_clusters(end_k,
                                 expression,
                                 row_headers,
                                 start_k,
                                 e2g_convertor(column_headers),
                                 tested_gene_list_file_name,
                                 labels_assignment,
                                 clustering_algorithm=clustering_algorithm,
                                 plot=plot)
    results = [
        km_curve(clfs_results[k], survival_dataset[1:], row_headers,
                 output_prefix, k)[0]
        for k in range(start_k, end_k + 1)
    ]
    return results, clfs_results
예제 #2
0
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster the rows of an expression matrix and report enrichment per cluster.

    Steps, in order:

    1. Load the expression profile of the tested gene list and keep only the
       columns of the patients selected by ``filter_expression`` (all
       columns when it is ``None``).
    2. Keep the rows whose variance is at least the ``var_th_index``-th
       largest row variance (all rows when ``var_th_index`` is ``None``).
    3. When ``calc_go`` is set, build a ``GOEnrichmentStudy`` against
       ``total_gene_list`` (downloading the GO file if it is missing) and
       print the terms with FDR <= 0.05 for the retained rows.
    4. ``cluster_algorithm == "kmeans"``: for every k in ``start_k..end_k``
       cluster the rows, and for every cluster collect a description, the
       ``check_enrichment`` URL, the significant GO terms and one
       hypergeometric p-value per file in ``enrichment_list_file_names``.
       Afterwards heatmaps are plotted for the last k.
    5. ``cluster_algorithm == "hierarchical"``: draw a seaborn clustermap of
       the transposed matrix and save it under
       ``constants.BASE_PROFILE/output``.
    6. Plot one heatmap per patient labelling derived from ``meta_groups``
       and write the collected rows with ``print_to_excel``.

    ``total_gene_list_path`` and ``phenotype_path`` are accepted but not
    used by this function.

    Returns:
        ``None`` in every case; an early ``None`` is returned when the
        filtered matrix has exactly one column.
    """
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    (tested_gene_expression_headers_rows,
     tested_gene_expression_headers_columns,
     tested_gene_expression) = separate_headers(tested_gene_expression)

    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print("number of filtered patients from phenotypes: {}".format(
            len(filtered_patients)))
    else:
        print("no filter applied")
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = filter_genes_dataset_by_patients(
        filtered_patients, tested_gene_expression_headers_columns,
        tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print("no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression))
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print("clustering patients by groups")
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        enrichment_lists = [
            load_gene_list(cur) for cur in enrichment_list_file_names
        ]

    # Variance filter: the default index points at the smallest variance, so
    # no row falls below the threshold and every row is kept.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    go_study = None
    if calc_go:
        go_file_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
        if not os.path.exists(go_file_path):
            wget.download(constants.GO_OBO_URL, go_file_path)
        obo_dag = GODag(go_file_path)

        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        go_study = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        g_res = go_study.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res if cur.p_fdr_bh <= 0.05]
        print(GO_results)

    if cluster_algorithm == "kmeans":

        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")

            # Relabel the clusters from the rank of their average expression.
            # This depends only on km_clf, so it is computed once per k
            # rather than once per cluster.
            ranks = rankdata([
                np.average(
                    np.delete(gene_expression_top_var,
                              np.where(km_clf != j)[0],
                              axis=0)) for j in range(n_clusters)
            ])
            cluster_labels = np.array(km_clf)
            for j in range(n_clusters):
                cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
            labels_assignment = [cluster_labels + 1]

            for i in range(n_clusters):
                # Indices of the rows that do NOT belong to cluster i.
                cluster_indices = np.where(km_clf != i)[0]
                # Slice the expression matrix itself (not the header array)
                # so that clfs_results stores (expression, headers) pairs.
                gene_expression_cluster = np.delete(gene_expression_top_var,
                                                    cluster_indices,
                                                    axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_expression_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = go_study.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # zip() yields tuples; convert to lists because the
                        # enrichment-list results are appended below.
                        (go_ns, go_terms, go_names, uncorrectd_pvals,
                         FDRs) = [list(column) for column in zip(*GO_results)]

                for j, cur in enumerate(enrichment_lists):
                    go_terms.append(
                        enrichment_list_file_names[j].split(".")[0])
                    uncorrectd_pvals.append(
                        calc_HG_test(
                            [x.split(".")[0] for x in tested_gene_list],
                            [x.split(".")[0] for x in cur],
                            [x.split(".")[0]
                             for x in gene_headers_row_cluster]))
                    # Only a p-value is available for these rows, so the
                    # remaining columns get a placeholder.
                    FDRs.append(".")
                    go_names.append(".")
                    go_ns.append(".")

                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns), "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

        # cluster_labels / labels_assignment now hold the values of the last
        # k that was processed.
        gene_sorted_heatmap = np.rot90(np.flip(
            gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                       k=-1,
                                       axes=(1, 0))
        find_clusters(end_k,
                      gene_sorted_heatmap,
                      gene_expression_top_var_header_columns,
                      start_k,
                      e2g_convertor(gene_expression_top_var_header_rows),
                      tested_gene_list_file_name,
                      labels_assignment=labels_assignment_patients)

        plot_heatmap(gene_expression_top_var,
                     gene_expression_top_var_header_columns,
                     labels_assignment,
                     gene_expression_top_var_header_rows,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    # flip + rot90 with these arguments is a transpose of a 2-D matrix, so
    # the original columns become the rows of the heatmap.
    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)

        # Colour the rows by the first patient labelling when there is one;
        # without meta_groups the clustermap is drawn without row colours.
        row_colors = None
        if (labels_assignment_patients is not None
                and len(labels_assignment_patients) > 0):
            first_labelling = labels_assignment_patients[0]
            palette = dict(zip(np.unique(first_labelling), "rbg"))
            row_colors = [palette.get(label) for label in first_labelling]

        cluster_grid = sns.clustermap(df,
                                      row_colors=row_colors,
                                      metric="euclidean",
                                      robust=True,
                                      method="single")
        den_genes = scipy.cluster.hierarchy.dendrogram(
            cluster_grid.dendrogram_col.linkage,
            labels=df.columns,
            color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)

        cluster_grid.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

    # labels_assignment_patients stays None when no meta_groups were given;
    # there is nothing to plot per labelling in that case.
    if labels_assignment_patients is not None:
        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
def patient_sets_distribution_differences(tested_gene_list_file_name, total_gene_list_file_name, gene_expression_file_name, phenotype_file_name, survival_file_name, gene_filter_file_name=None, tested_gene_list_path=None, total_gene_list_path=None, gene_expression_path=None, phenotype_path=None, gene_filter_file_path=None, var_th_index=None, is_unsupervised=True, start_k=2, end_k=2, meta_groups=None, filter_expression=None, clustering_algorithm="euclidean", average_patients=True, compute_hotelling=False):
    """Compare the expression distributions of labelled patient groups.

    For every entry of ``meta_groups`` the expression rows are sorted by
    that entry's label assignment, PCA plots and a heatmap are produced, and
    (optionally) a table of pairwise Hotelling T2 p-values is written to
    ``constants.BASE_PROFILE/output/hotelling_<cancer>_<gene list>_<time>.txt``.

    Args:
        average_patients: when true the heatmap shows one row per label (the
            mean of that label's rows); otherwise the unsorted expression
            matrix is passed to the heatmap.
        compute_hotelling: when true the lower triangle (diagonal included)
            of the pairwise ``spm1d.stats.hotellings2`` p-value table is
            written to disk.

    The ``*_path`` arguments, ``gene_filter_file_name``,
    ``gene_filter_file_path``, ``is_unsupervised``, ``start_k``, ``end_k``
    and ``clustering_algorithm`` are accepted but not used by this function.

    Note:
        ``labels_assignment[i]`` is sorted in place.

    Returns:
        ``None``.
    """
    data = load_integrated_ge_data(
        tested_gene_list_file_name=tested_gene_list_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if data is None:
        print("insufficient data")
        return
    gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = data

    for i, cur_groups in enumerate(meta_groups):
        # Not consumed below (its only reader, a km_curve call, is disabled);
        # the lookup is kept so the helper still runs for these groups.
        labeled_patients = divided_patient_ids_by_label(phenotype_file_name, groups=cur_groups)

        # Sort the rows by label; the label array is sorted in place so that
        # it stays aligned with the reordered rows.
        ordered_gene_expression = gene_expression_top_var[labels_assignment[i].argsort(), :]
        labels_assignment[i].sort()
        sorted_labels = labels_assignment[i]
        unique_labels = np.unique(sorted_labels)

        heatmap_values = gene_expression_top_var
        if average_patients:
            label_means = [
                np.average(ordered_gene_expression[sorted_labels == label], axis=0)
                for label in unique_labels
            ]
            # One row per label; None when there are no labels at all.
            heatmap_values = np.vstack(label_means) if label_means else None

        plot_pca(ordered_gene_expression, [sorted_labels], [meta_groups[i]], tested_gene_list_file_name=tested_gene_list_file_name)
        plot_pca_by_samples(ordered_gene_expression, [sorted_labels], [meta_groups[i]], tested_gene_list_file_name=tested_gene_list_file_name)

        plot_heatmap(heatmap_values, e2g_convertor(gene_expression_top_var_headers_columns),
                     [unique_labels],
                     gene_expression_top_var_headers_rows,
                     tested_gene_list_file_name, label_index=i)

        if not compute_hotelling:
            continue

        # Build the whole table before touching the disk so that a failing
        # test does not leave an empty file behind.
        table_lines = ["\t" + "".join("group {}\t".format(label) for label in unique_labels)]
        for cur_label_1 in unique_labels:
            # Boolean masks select each group's rows directly, so the labels
            # need not be consecutive integers.
            group_1 = ordered_gene_expression[sorted_labels == cur_label_1]
            cells = ["group {}\t".format(cur_label_1)]
            for cur_label_2 in unique_labels:
                # Lower triangle only, diagonal included.
                if cur_label_2 > cur_label_1:
                    continue
                group_2 = ordered_gene_expression[sorted_labels == cur_label_2]
                T2 = spm1d.stats.hotellings2(group_1, group_2)
                T2i = T2.inference(0.05)
                cells.append("{}\t".format(T2i.p))
            table_lines.append("".join(cells))

        output_path = os.path.join(
            constants.BASE_PROFILE, "output",
            "hotelling_{}_{}_{}.txt".format(constants.CANCER_TYPE, tested_gene_list_file_name.split(".")[0], time.time()))
        with open(output_path, "w+") as f:
            f.write("\n".join(table_lines) + "\n")
예제 #4
0
def find_clusters_and_survival(reduced_dim_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean"):
    """Run clustering / grouping plus ``km_curve`` on a transposed data set.

    The data is loaded with ``load_integrated_ge_data`` from
    ``constants.OUTPUT_GLOBAL_DIR/<gene_expression_file_name>``, then the
    matrix is transposed and its row and column headers are swapped before
    any further processing.

    In unsupervised mode ``find_clusters`` is called for ``start_k..end_k``
    and ``km_curve`` is run on the result of every k; otherwise every entry
    of ``meta_groups`` is plotted as a heatmap and passed to ``km_curve``.

    The ``*_path`` arguments, ``gene_filter_file_name`` and
    ``gene_filter_file_path`` are accepted but not used (the expression path
    is always built from ``constants.OUTPUT_GLOBAL_DIR``).

    Returns:
        ``None`` when ``load_integrated_ge_data`` returns ``None``; otherwise
        the list of complete ``km_curve`` return values.
    """
    loaded = load_integrated_ge_data(
        tested_gene_list_file_name=reduced_dim_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        gene_expression_path=os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                          gene_expression_file_name),
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if loaded is None:
        print("insufficient data")
        return
    (loaded_matrix, loaded_row_headers, loaded_column_headers,
     labels_assignment, survival_dataset) = loaded

    # Swap the axes: headers trade places and the matrix is turned with the
    # flip + rot90 combination (a transpose for a 2-D matrix).
    row_headers = loaded_column_headers
    column_headers = loaded_row_headers
    matrix = np.rot90(np.flip(loaded_matrix, 1), k=-1, axes=(1, 0))

    # File name without its extension; handed to km_curve with every call.
    output_prefix = reduced_dim_file_name.split(".")[0]

    if is_unsupervised:
        clfs_results = find_clusters(
            end_k,
            matrix,
            row_headers,
            start_k,
            e2g_convertor(column_headers),
            reduced_dim_file_name,
            labels_assignment,
            clustering_algorithm=clustering_algorithm)
        return [
            km_curve(clfs_results[k], survival_dataset[1:], row_headers,
                     output_prefix, k)
            for k in range(start_k, end_k + 1)
        ]

    lr_results_global = []
    for group_index, cur_groups in enumerate(meta_groups):
        labeled_patients = divided_patient_ids_by_label(phenotype_file_name,
                                                        groups=cur_groups)
        # The labelling under inspection goes first, the others follow.
        reordered_labels = ([labels_assignment[group_index]]
                            + labels_assignment[:group_index]
                            + labels_assignment[group_index + 1:])
        plot_heatmap(matrix,
                     e2g_convertor(column_headers),
                     reordered_labels,
                     row_headers,
                     reduced_dim_file_name,
                     label_index=group_index)
        lr_results_global.append(
            km_curve(labeled_patients,
                     survival_dataset[1:],
                     row_headers,
                     output_prefix,
                     label_index=group_index))
    return lr_results_global
예제 #5
0
def predict_ge_by_mutation(tested_gene_list_file_name, total_gene_list_file_name, gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, gene_filter_file_name=None, tested_gene_list_path=None, total_gene_list_path=None, gene_expression_path=None, phenotype_path=None, gene_filter_file_path=None, var_th_index=None, is_unsupervised=True, start_k=2, end_k=2, meta_groups=None, phenotype_labels_heatmap = None, filter_expression=None, integ=False, min_ratio=0.1 , included_mutation_gene_list=None, excluded_mutation_gene_list=None):
    """Cluster or group patients by their per-gene mutation counts.

    A patients x genes count matrix is built from the mutation entries
    (counts are capped at 5), optionally restricted with include/exclude
    gene lists, and reduced to the genes mutated in more than ``min_ratio``
    of the patients.  With ``integ`` set, the matrix is z-scored and joined
    column-wise with the z-scored expression data of the patients present
    in both data sets.  The resulting matrix is then either clustered
    (``find_clusters`` + ``km_curve`` for every k in ``start_k..end_k``) or,
    in supervised mode, plotted and passed to ``km_curve`` once per entry of
    ``meta_groups``.

    Args:
        integ: join the mutation matrix with the expression data.
        min_ratio: a gene is kept only when the number of patients carrying
            at least one mutation in it is strictly greater than
            ``min_ratio`` times the number of patients.
        included_mutation_gene_list: file name passed to ``load_gene_list``;
            only the listed genes are kept.
        excluded_mutation_gene_list: file name passed to ``load_gene_list``;
            the listed genes are dropped.

    The ``*_path`` arguments, ``gene_filter_file_name`` and
    ``gene_filter_file_path`` are accepted but not used by this function.

    Returns:
        ``None`` in every case (including the early exits for missing data
        and for an empty gene set after filtering).
    """
    integ_data = load_integrated_mutation_data(
        mutation_file_name=mutation_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name, var_th_index=var_th_index,
        meta_groups=meta_groups, phenotype_labels_heatmap=phenotype_labels_heatmap,
        filter_expression=filter_expression)
    if integ_data is None:
        print("insufficient data")
        return
    mu_data, mu_data_headers_rows, mu_data_headers_columns, labels_assignment, survival_dataset, phenotype_heatmap = integ_data

    all_patients = np.unique(mu_data_headers_rows).flatten()
    all_mutated_genes = np.unique(mu_data[:, 0]).flatten()
    all_mutated_vectors = np.zeros((len(all_patients), len(all_mutated_genes)))

    print("build vectors from {} entries".format(len(mu_data)))

    stopwatch = Stopwatch()
    stopwatch.start()
    # Position lookups; both arrays hold unique values, so a dict gives the
    # same index as a linear list search at constant cost per entry.
    patient_position = {patient: pos for pos, patient in enumerate(all_patients)}
    gene_position = {gene: pos for pos, gene in enumerate(all_mutated_genes)}
    for entry_index, entry in enumerate(mu_data):
        row = patient_position[mu_data_headers_rows[entry_index]]
        column = gene_position[entry[0]]
        all_mutated_vectors[row, column] += 1
    print(stopwatch.stop("end mut"))

    # Cap the counts.  The column filters below only drop columns, so one
    # pass is enough.
    all_mutated_vectors[all_mutated_vectors > 5] = 5

    if included_mutation_gene_list is not None:
        included_mutation_gene = load_gene_list(included_mutation_gene_list)
        keep = np.in1d(all_mutated_genes, included_mutation_gene)
        all_mutated_vectors = all_mutated_vectors[:, keep]
        all_mutated_genes = all_mutated_genes[keep]

    if excluded_mutation_gene_list is not None:
        excluded_mutation_gene = load_gene_list(excluded_mutation_gene_list)
        keep = np.in1d(all_mutated_genes, excluded_mutation_gene, invert=True)
        all_mutated_vectors = all_mutated_vectors[:, keep]
        all_mutated_genes = all_mutated_genes[keep]

    # Drop sparse genes: keep a column only when strictly more than
    # min_ratio of the patients have a non-zero count in it.
    mutated_patient_count = (all_mutated_vectors != 0).sum(axis=0)
    frequent = mutated_patient_count > np.shape(all_mutated_vectors)[0] * min_ratio
    all_mutated_genes = all_mutated_genes[frequent]
    all_mutated_vectors = all_mutated_vectors[:, frequent]
    print("all_mutated_vectors after filter sparse: {}".format(np.shape(all_mutated_vectors)))

    if np.size(all_mutated_genes) == 0:
        return

    mutation_expression_integ = all_mutated_vectors
    mutual_patients = all_patients
    mutation_expression_integ_headers_columns = all_mutated_genes
    mutual_phenotype_heatmap = phenotype_heatmap
    if integ:
        ge_data = load_integrated_ge_data(tested_gene_list_file_name=tested_gene_list_file_name,
                                          total_gene_list_file_name=total_gene_list_file_name,
                                          gene_expression_file_name=gene_expression_file_name,
                                          phenotype_file_name=phenotype_file_name,
                                          survival_file_name=survival_file_name, var_th_index=var_th_index,
                                          meta_groups=meta_groups, filter_expression=filter_expression)
        if ge_data is None:
            print("insufficient data")
            return
        gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, _, survival_dataset = ge_data

        # Column-wise z-scores; nan_to_num replaces the NaNs that zscore
        # yields (e.g. for constant columns) with 0.
        all_mutated_vectors = np.nan_to_num(zscore(all_mutated_vectors, axis=0))
        gene_expression_top_var = np.nan_to_num(zscore(gene_expression_top_var, axis=0))

        # Mutation rows of the shared patients, ordered by patient id.
        mutual_patients = np.array([x for x in all_patients if x in gene_expression_top_var_headers_rows])
        in_both = np.in1d(all_patients, mutual_patients)
        mutual_mutations = all_mutated_vectors[in_both]
        mutual_mutations = mutual_mutations[mutual_patients.argsort()]

        if phenotype_labels_heatmap is not None:
            # NOTE(review): assumes phenotype_heatmap rows follow the order
            # of all_patients - confirm against load_integrated_mutation_data.
            mutual_phenotype_heatmap = phenotype_heatmap[in_both]
            mutual_phenotype_heatmap = mutual_phenotype_heatmap[mutual_patients.argsort()]

        # Expression rows of the same patients, brought into the same
        # (sorted by patient id) order so both halves line up row by row.
        mutual_patients = np.array([x for x in gene_expression_top_var_headers_rows if x in all_patients])
        mutual_expressions = gene_expression_top_var[np.in1d(gene_expression_top_var_headers_rows, mutual_patients)]
        mutual_expressions = mutual_expressions[mutual_patients.argsort()]

        mutual_patients.sort()
        mutation_expression_integ = np.c_[mutual_mutations, mutual_expressions]
        mutation_expression_integ_headers_columns = np.r_[all_mutated_genes, gene_expression_top_var_headers_columns]
    else:
        survival_dataset = np.array(load_survival_data(survival_file_name))

    output_prefix = tested_gene_list_file_name.split(".")[0]
    if is_unsupervised:
        print("find clusters")
        clfs_results = find_clusters(end_k, mutation_expression_integ, mutual_patients,
                                     start_k, mutation_expression_integ_headers_columns,
                                     tested_gene_list_file_name, labels_assignment=labels_assignment,
                                     phenotype_heatmap=mutual_phenotype_heatmap)
        for cur_k in range(start_k, end_k + 1):
            # km_curve receives the k being processed as its index argument.
            km_curve(clfs_results[cur_k], survival_dataset[1:], mutual_patients,
                     output_prefix, cur_k)
    else:
        for i, cur_groups in enumerate(meta_groups):
            labeled_patients = divided_patient_ids_by_label(phenotype_file_name, groups=cur_groups)
            # The labelling under inspection goes first, the others follow.
            plot_heatmap(mutation_expression_integ, mutation_expression_integ_headers_columns,
                         [labels_assignment[i]] + labels_assignment[:i] + labels_assignment[i + 1:],
                         mutual_patients,
                         tested_gene_list_file_name, label_index=i)
            km_curve(labeled_patients, survival_dataset[1:], mutual_patients, output_prefix, label_index=i)