예제 #1
0
def find_clusters_and_survival(tested_gene_list_file_name, total_gene_list_file_name, gene_expression_file_name, phenotype_file_name, survival_file_name, gene_filter_file_name=None, tested_gene_list_path=None, total_gene_list_path=None, gene_expression_path=None, phenotype_path=None, gene_filter_file_path=None, var_th_index=None, is_unsupervised=True, start_k=2, end_k=2, meta_groups=None, filter_expression=None, clustering_algorithm="euclidean" ,plot=True):
    """Cluster patients by gene expression and plot Kaplan-Meier survival curves.

    When ``is_unsupervised`` is True, patients are clustered for each k in
    [start_k, end_k] via ``find_clusters`` and a KM curve is computed per k.
    Otherwise patients are grouped by each phenotype definition in
    ``meta_groups`` and a heatmap + KM curve is produced per group set.

    Returns a (results, clfs_results) tuple, where ``results`` holds the first
    element of each ``km_curve`` call and ``clfs_results`` is the cluster
    assignment dict (None in the supervised branch), or None when the
    integrated data could not be loaded.
    """
    # load_integrated_ge_data bundles expression, labels and survival data;
    # it returns None when the filtered dataset is too small to proceed.
    data = load_integrated_ge_data(tested_gene_list_file_name=tested_gene_list_file_name, total_gene_list_file_name=total_gene_list_file_name, gene_expression_file_name=gene_expression_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, meta_groups=meta_groups, filter_expression=filter_expression)
    if data is None:
        print "insufficient data"
        return
    gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = data

    # plot_genes_statistic(gene_expression_top_var, gene_expression_top_var_headers_columns, tested_gene_list_file_name)

    clfs_results = None
    results = []
    if is_unsupervised:
        # cluster patients over the whole k range in one call
        clfs_results = find_clusters(end_k, gene_expression_top_var, gene_expression_top_var_headers_rows,
                                    start_k, e2g_convertor(gene_expression_top_var_headers_columns),
                                   tested_gene_list_file_name, labels_assignment, clustering_algorithm=clustering_algorithm, plot=plot)

        # survival_dataset[1:] skips the header row of the survival table
        for i in range(start_k,end_k+1):
            results.append(km_curve(clfs_results[i], survival_dataset[1:], gene_expression_top_var_headers_rows, tested_gene_list_file_name.split(".")[0],i)[0])

    else:
        # supervised: use phenotype-defined patient groups instead of clusters
        for i, cur_groups in enumerate(meta_groups):
            labeled_patients = divided_patient_ids_by_label(phenotype_file_name, groups=cur_groups)
            # put the current group assignment first so it drives the heatmap ordering
            plot_heatmap(gene_expression_top_var, e2g_convertor(gene_expression_top_var_headers_columns),
                         [labels_assignment[i]] + labels_assignment[:i] + labels_assignment[i + 1:],
                         gene_expression_top_var_headers_rows,
                         tested_gene_list_file_name, label_index=i)
            results.append(km_curve(labeled_patients, survival_dataset[1:], gene_expression_top_var_headers_rows, tested_gene_list_file_name.split(".")[0],label_index=i)[0])

    return results ,clfs_results
예제 #2
0
def main(mrna_list_file_names, mir_list_file_names):
    output_files = []
    mirna_clusters = load_mirna_clusters("mir_clusters_by_targets.txt")
    associated_mirna = []
    for cur_mrna_list in mrna_list_file_names:
        mrna_list = load_gene_list(cur_mrna_list)
        for cur_mir_list in mir_list_file_names:
            mir_list = load_gene_list(cur_mir_list)
            for cur in mirna_clusters:
                if cur[0].split(".")[0] in mrna_list and len(
                        set(cur[1:]).intersection(mir_list)) != 0:
                    associated_mirna = associated_mirna + list(
                        set(cur[1:]).intersection(mir_list))

    associated_mirna = list(set(associated_mirna))
    associated_mirna = e2g_convertor(associated_mirna)
    f = file(
        os.path.join(
            constants.LIST_DIR, "mir_{}.txt".format("_".join(
                [x.split(".")[0] for x in mrna_list_file_names]))), "w+")
    f.write("\r\n".join(associated_mirna))
    f.close()
    print associated_mirna

    return associated_mirna
예제 #3
0
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment study of the tested gene list against the total list.

    Downloads the GO OBO DAG and the NCBI gene2go association file on first
    use, runs goatools' GOEnrichmentStudy with Bonferroni and
    Benjamini-Hochberg FDR corrections, keeps terms with FDR <= 0.05 and
    writes a single summary row to an Excel file via ``print_to_excel``.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    # fetch the GO ontology file once and cache it under GO_DIR
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
        download(constants.GO_OBO_URL, constants.GO_DIR)

    obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

    # fetch + gunzip the gene->GO association file on first use
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        with gzip.open(os.path.join(constants.GO_DIR, os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL)), 'rb') as f_in:
            with open(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    assoc = read_ncbi_gene2go(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME), no_top=True)

    # background population: the total gene list, converted to Entrez ids
    g = GOEnrichmentStudy([int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
                          assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
    g_res = g.run_study([int(cur) for cur in ensembl2entrez_convertor(tested_gene)])

    # keep only terms that survive the BH-FDR threshold
    GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected, cur.p_fdr_bh) for cur in g_res if
                  cur.p_fdr_bh <= 0.05]
    if len(GO_results) > 0:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*GO_results)
    else:
        go_terms = []
        uncorrectd_pvals = []
        FDRs = []
        go_names = []
        go_ns = []
    # one row; each cell packs its column's values as a CRLF-joined string
    output_rows = [("\r\n".join(e2g_convertor(tested_gene)),  "\r\n".join(go_ns),
                        "\r\n".join(go_terms), "\r\n".join(go_names), "\r\n".join(map(str, uncorrectd_pvals)),
                        "\r\n".join(map(str, FDRs)))]
    print_to_excel(output_rows, tested_gene_file_name, total_gene_file_name)
예제 #4
0
def init_specific_params(ge_file_name=os.path.join(constants.DATA_DIR,
                                                   "ge.tsv"),
                         network_file_name=os.path.join(
                             constants.NETWORKS_DIR, NETWORK_NAME + ".sif")):
    """Prepare the MATISSE-format expression file and return the run's paths.

    Loads the expression profile, prepends "gene ID" and "GeneName" columns
    (NaN when no symbol conversion exists), drops duplicate gene ids, and
    writes the result as <ge_file_name>_mts.tsv.

    Returns (mts expression file path, network file path, output file path).

    Fix vs. original: ``e2g_convertor([cur])`` was invoked twice per gene
    (once for the emptiness check, once for the value); it is now called once.
    """
    h_rows, h_columns, values = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=ge_file_name))
    df_ge = pd.DataFrame(columns=h_columns, index=h_rows, data=values)
    df_ge_cond_col = df_ge.columns
    df_ge["gene ID"] = df_ge.index
    # convert each gene id once; fall back to NaN when no symbol is known
    gene_names = []
    for cur in df_ge.index:
        converted = e2g_convertor([cur])
        gene_names.append(converted[0] if len(converted) > 0 else np.NAN)
    df_ge["GeneName"] = gene_names
    # reorder so the two id columns lead, then the original condition columns
    df_ge = df_ge[["gene ID", "GeneName"] + list(df_ge_cond_col)]
    df_ge = df_ge[~df_ge['gene ID'].duplicated(keep='first')]
    ge_file_name_mts = os.path.splitext(ge_file_name)[0] + "_mts.tsv"
    df_ge.to_csv(ge_file_name_mts, index=False, sep="\t")

    output_file_name = os.path.join(constants.OUTPUT_DIR, "matisse_output.txt")
    return ge_file_name_mts, network_file_name, output_file_name
예제 #5
0
def gene_correlation_scores(tested_gene_list_file_name,
                            total_gene_list_file_name,
                            gene_expression_file_name,
                            gene_filter_file_name=None,
                            top_n=2000):
    print "about ot analyse: {}".format(tested_gene_list_file_name)
    # fetch gene expression by gene_id, divided by tumor type
    total_gene_expression = np.array(
        load_gene_expression_profile_by_genes(total_gene_list_file_name,
                                              gene_expression_file_name,
                                              gene_filter_file_name))
    gene_ids = load_gene_list(tested_gene_list_file_name)
    ranks_dict = {}
    ranks_score = []
    ranks = []
    for cur in gene_ids:
        ranks.append([])
        cur_expression = total_gene_expression[np.where(
            total_gene_expression[:, 0] == cur)][0]
        for cur_prot_expression in total_gene_expression[1:]:
            prs = pearsonr(cur_prot_expression[1:].astype(np.float32),
                           cur_expression[1:].astype(np.float32))[0]
            if not math.isnan(prs) and cur_prot_expression[0] not in gene_ids:
                ranks[-1].append((cur_prot_expression[0], abs(prs), prs > 0))
        ranks[-1] = sorted(ranks[-1], key=lambda x: x[1], reverse=True)[:top_n]

        for cur in ranks[-1]:
            if not ranks_dict.has_key(cur[0]):
                ranks_dict[cur[0]] = []
            ranks_dict[cur[0]].append(cur[1])

    for k, v in ranks_dict.iteritems():
        ranks_score.append((k, sum(v), e2g_convertor([k])[0]))
    ranks_score = sorted(ranks_score, key=lambda x: x[1], reverse=True)[:top_n]
    print ranks_score

    f = file(
        os.path.join(
            constants.LIST_DIR, "corr_{}_top_{}.txt".format(
                tested_gene_list_file_name.split(".")[0], top_n)), 'w+')
    f.write("\r\n".join([x[0] for x in ranks_score]))
    f.close()
예제 #6
0
def plot_genes_statistic(gene_expression_top_var,
                         gene_expression_top_var_headers_columns,
                         tested_gene_list_file_name):
    """Draw a per-gene boxplot (means shown) of the expression matrix as a PNG.

    One box per gene column, labelled by converted gene symbol; the title
    reports the average per-column variance. The figure is saved under
    BASE_PROFILE/output with a timestamped name.

    Fix vs. original: the unused ``bp`` local holding the boxplot return
    value was removed.
    """
    ax = plt.subplot(111)
    positions = np.arange(len(gene_expression_top_var_headers_columns)) + 1
    ax.boxplot(
        gene_expression_top_var,
        positions=positions,
        showmeans=True,
        labels=e2g_convertor(gene_expression_top_var_headers_columns))
    ax.set_title("genes_statistic_{}_{}_averaged var:{}".format(
        constants.CANCER_TYPE,
        tested_gene_list_file_name.split(".")[0],
        '%.3f' % np.average(np.var(gene_expression_top_var, axis=0))))
    # shrink and rotate the tick labels so many gene symbols remain legible
    for label in ax.xaxis.get_ticklabels():
        label.set_fontsize(7)
        label.set_rotation(90)
    plt.savefig(
        os.path.join(
            constants.BASE_PROFILE, "output",
            "genes_statistic_{}_{}_{}.png".format(
                constants.CANCER_TYPE,
                tested_gene_list_file_name.split(".")[0], time.time())))
예제 #7
0
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster a tested gene set's expression and report per-cluster enrichment.

    Loads the tested genes' expression matrix, optionally filters patients by
    phenotype (``filter_expression``), keeps the top-variance genes
    (``var_th_index``), then clusters genes with k-means (one run per k in
    [start_k, end_k]) or hierarchical clustering depending on
    ``cluster_algorithm``. For each k-means cluster it collects GO enrichment
    (when ``calc_go``) and hypergeometric enrichment against the optional
    ``enrichment_list_file_names`` gene lists, plots heatmaps, and finally
    passes the accumulated rows to ``print_to_excel``.

    Returns None when phenotype filtering leaves no expression columns;
    otherwise has no return value (side effects: plots + Excel output).
    """
    # fetch gene expression by gene_id, divided by tumor type
    gene_sets = []
    expression_sets = []
    averaged_expression_sets = []
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    tested_gene_expression_headers_rows, tested_gene_expression_headers_columns, tested_gene_expression = separate_headers(
        tested_gene_expression)

    # restrict patients (columns) to those matching the phenotype filter
    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print "number of filtered patients from phenotypes: {}".format(
            len(filtered_patients))
    else:
        print "no filter applied"
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = filter_genes_dataset_by_patients(
        filtered_patients, tested_gene_expression_headers_columns,
        tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print "no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression)
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    # per-gene variance, sorted descending, drives the top-variance cutoff
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print "clustering patients by groups"
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        for cur in enrichment_list_file_names:
            enrichment_lists.append(load_gene_list(cur))

    # keep only genes whose variance reaches the var_th_index-th largest value
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    if calc_go:
        # lazily fetch the GO ontology; association file is assumed present
        if not os.path.exists(
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
            wget.download(
                constants.GO_OBO_URL,
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        # if not os.path.exists(os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf')):
        #     wget.download(go_obo_url, os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf'))
        obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        # background population: the total gene list as Entrez ids
        g = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        g_res = g.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res if cur.p_fdr_bh <= 0.05]
        print GO_results

    if cluster_algorithm == "kmeans":

        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")
            for i in range(n_clusters):

                # rank clusters by their mean expression so label numbering
                # is consistent across runs
                ranks = []
                for j in range(n_clusters):
                    ranks.append(
                        np.average(
                            np.delete(gene_expression_top_var,
                                      np.where(km_clf != j)[0],
                                      axis=0)))
                ranks = rankdata(ranks)
                cluster_labels = np.array(km_clf)
                for j in range(n_clusters):
                    cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
                labels_assignment = [cluster_labels + 1]

                # genes belonging to cluster i (delete every other cluster)
                cluster_indices = np.where(km_clf != i)[0]
                gene_expression_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    # re-run the GO study restricted to this cluster's genes
                    g_res = g.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # NOTE(review): zip(*...) yields tuples here; the
                        # .append calls below would fail on non-empty
                        # GO_results when enrichment_lists is also non-empty
                        # — confirm intended usage.
                        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(
                            *GO_results)

                if len(enrichment_lists) != 0:
                    # hypergeometric enrichment against each extra gene list
                    for j, cur in enumerate(enrichment_lists):
                        go_terms.append(
                            enrichment_list_file_names[j].split(".")[0])
                        uncorrectd_pvals.append(
                            calc_HG_test(
                                [x.split(".")[0] for x in tested_gene_list],
                                [x.split(".")[0] for x in cur], [
                                    x.split(".")[0]
                                    for x in gene_headers_row_cluster
                                ]))
                        FDRs.append(".")
                        go_names.append(".")
                        go_ns.append(".")

                # one Excel row per cluster; multi-valued cells are CRLF-joined
                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns), "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

        # NOTE(review): cluster_labels / labels_assignment are bound inside the
        # loop above and hold the values from its last iteration; if
        # start_k > end_k this raises NameError — confirm callers never do that.
        gene_sorted_heatmap = np.rot90(np.flip(
            gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                       k=-1,
                                       axes=(1, 0))
        find_clusters(end_k,
                      gene_sorted_heatmap,
                      gene_expression_top_var_header_columns,
                      start_k,
                      e2g_convertor(gene_expression_top_var_header_rows),
                      tested_gene_list_file_name,
                      labels_assignment=labels_assignment_patients)

        plot_heatmap(gene_expression_top_var,
                     gene_expression_top_var_header_columns,
                     labels_assignment,
                     gene_expression_top_var_header_rows,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    # rotate so patients become rows and genes become columns
    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)

        # correlations = df.corr()
        # correlations_array = np.asarray(df.corr())
        #
        # row_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array), method='average')
        #
        # col_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array.T), method='average')

        # enrichment_gene_list = load_gene_list("uvm_mito_part.txt")
        # colour patient rows by their first meta-group label (up to 3 groups)
        dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
        row_colors = map(dct.get, labels_assignment_patients[0])
        dct = {1: 'b', 2: 'r'}
        gene_expression_top_var_header_rows_trimmed = [
            x.split(".")[0] for x in gene_expression_top_var_header_rows
        ]
        # col_colors = map(dct.get, [2 if x in enrichment_gene_list else 1 for x in gene_expression_top_var_header_rows_trimmed])
        g = sns.clustermap(df,
                           row_colors=row_colors,
                           metric="euclidean",
                           robust=True,
                           method="single")
        # den_patients = scipy.cluster.hierarchy.dendrogram(g.dendrogram_row.linkage,
        #                                          labels=df.index,
        #                                          color_threshold=0.60)
        # cut the gene dendrogram at 0.7 to derive gene clusters
        den_genes = scipy.cluster.hierarchy.dendrogram(
            g.dendrogram_col.linkage, labels=df.columns, color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)

        g.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

    # NOTE(review): labels_assignment_patients is None when meta_groups is
    # None, which would make this loop raise TypeError — confirm callers
    # always pass meta_groups.
    for cur_labels_assignment_patient in labels_assignment_patients:
        plot_heatmap(gene_sorted_heatmap,
                     gene_expression_top_var_header_rows,
                     [cur_labels_assignment_patient],
                     gene_expression_top_var_header_columns,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
def patient_sets_distribution_differences(tested_gene_list_file_name, total_gene_list_file_name, gene_expression_file_name, phenotype_file_name, survival_file_name, gene_filter_file_name=None, tested_gene_list_path=None, total_gene_list_path=None, gene_expression_path=None, phenotype_path=None, gene_filter_file_path=None, var_th_index=None, is_unsupervised=True, start_k=2, end_k=2, meta_groups=None, filter_expression=None, clustering_algorithm="euclidean", average_patients=True, compute_hotelling=False):

    data = load_integrated_ge_data(tested_gene_list_file_name=tested_gene_list_file_name, total_gene_list_file_name=total_gene_list_file_name, gene_expression_file_name=gene_expression_file_name,                                                                                                                                                    phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, meta_groups=meta_groups, filter_expression=filter_expression)
    if data is None:
        print "insufficient data"
        return
    gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = data

    for i, cur_groups in enumerate(meta_groups):
        labeled_patients = divided_patient_ids_by_label(phenotype_file_name, groups=cur_groups)

        ordered_gene_expression = gene_expression_top_var[labels_assignment[i].argsort(), :]
        labels_assignment[i].sort()

        heatmap_values =  gene_expression_top_var


        if average_patients:
            avgs = None
            for cur_label_1 in np.unique(labels_assignment[i]):
                avg = np.average(ordered_gene_expression[np.where(labels_assignment[i] == cur_label_1)[0],:], axis=0)
                if avgs is None:
                    avgs = avg.reshape(1,len(avg))
                else:
                    avgs = np.r_[avgs, avg.reshape(1,len(avg))]
            heatmap_values=avgs

        plot_pca(ordered_gene_expression, [labels_assignment[i]], [meta_groups[i]], tested_gene_list_file_name = tested_gene_list_file_name)
        plot_pca_by_samples(ordered_gene_expression, [labels_assignment[i]], [meta_groups[i]], tested_gene_list_file_name = tested_gene_list_file_name)




        plot_heatmap(heatmap_values, e2g_convertor(gene_expression_top_var_headers_columns),
                     [np.unique(labels_assignment[i])],
                     gene_expression_top_var_headers_rows,
                     tested_gene_list_file_name, label_index=i)


        if compute_hotelling:
            with file(os.path.join(constants.BASE_PROFILE,"output", "hotelling_{}_{}_{}.txt".format(constants.CANCER_TYPE, tested_gene_list_file_name.split(".")[0], time.time())),"w+") as f:
                output = "\t"
                for cur_label_1 in np.unique(labels_assignment[i]):
                    output+= "group {}\t".format(cur_label_1)
                output += "\n"
                for cur_label_1 in np.unique(labels_assignment[i]):
                    output += "group {}\t".format(cur_label_1)
                    for cur_label_2 in np.unique(labels_assignment[i]):
                        if cur_label_2 > cur_label_1: continue

                        cur_label_2_start = np.where(labels_assignment[i]==cur_label_2)[0][0]
                        cur_label_1_end = len(labels_assignment[i])
                        if cur_label_1 != labels_assignment[i][-1]:
                            cur_label_1_end = np.where(labels_assignment[i]==cur_label_1+1)[0][0]
                        cur_label_2_end = len(labels_assignment[i])
                        if cur_label_2 != labels_assignment[i][-1]:
                            cur_label_2_end = np.where(labels_assignment[i] == cur_label_2 + 1)[0][0]

                        T2 = spm1d.stats.hotellings2(ordered_gene_expression[cur_label_1_start:cur_label_1_end], ordered_gene_expression[cur_label_2_start:cur_label_2_end])
                        T2i = T2.inference(0.05)
                        output+="{}\t".format(T2i.p)
                        # km_curve(labeled_patients, survival_dataset[1:], gene_expression_top_var_headers_rows, tested_gene_list_file_name.split(".")[0],label_index=i)
                    output+="\n"
                f.write(output)
예제 #9
0
파일: network.py 프로젝트: hag007/nn_sb
def create_modules_output(modules, score_file_name):
    """Build one record per module gene: id, module memberships, symbol, score.

    Scores come from the tab-separated ``score_file_name`` (indexed by "id");
    when IS_PVAL_SCORES is set, p-values are converted to -log10 scores.
    Genes without a score (or when no score file is given) get score 0.
    Records for the same gene across modules are merged via ``reduce_to_dict``.
    """
    scores = None
    if score_file_name is not None:
        scores = pd.read_csv(score_file_name, sep="\t").set_index("id")

        if constants.IS_PVAL_SCORES:
            scores["score"] = scores["pval"].apply(lambda x: -np.log10(x))

    # assign a zero score to every module gene missing from the score table
    zero_scores = []
    for module in modules:
        for gene in module:
            if scores is None or gene not in scores.index:
                zero_scores.append({"score": 0, "id": gene})
    if len(zero_scores) != 0:
        zero_scores = pd.DataFrame(zero_scores).set_index("id")
        zero_scores = zero_scores[~zero_scores.index.duplicated(keep='first')]
        scores = pd.concat([scores, zero_scores], axis=0)

    # one raw record per (module, gene) pair; reduce_to_dict merges records
    # that share the same gene id (accumulating their module indices)
    records = [{"eid": gene,
                "modules": [i],
                "id": gene,
                "gene_symbol": e2g_convertor([gene])[0],
                "score": scores.loc[gene, "score"]}
               for i, module in enumerate(modules) for gene in module]
    merged = reduce(reduce_to_dict, records, {})
    return [merge_two_dicts({"id": k}, v) for k, v in merged.iteritems()]
예제 #10
0
def find_clusters_and_survival(reduced_dim_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean"):
    """Cluster a reduced-dimension expression matrix and run survival analysis.

    Variant of find_clusters_and_survival that takes a dimensionality-reduced
    expression file (loaded from OUTPUT_GLOBAL_DIR) and rotates the matrix so
    samples and features swap roles before clustering. In unsupervised mode it
    clusters for each k in [start_k, end_k] and computes a KM curve per k;
    otherwise it uses phenotype-defined groups from ``meta_groups``.

    Returns a list of km_curve results, or None when the integrated data
    could not be loaded.
    """
    data = load_integrated_ge_data(
        tested_gene_list_file_name=reduced_dim_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        gene_expression_path=os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                          gene_expression_file_name),
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if data is None:
        print "insufficient data"
        return
    gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = data

    # swap row/column headers and rotate the matrix accordingly, so the
    # reduced dimensions become columns and samples become rows
    tmp = gene_expression_top_var_headers_rows
    gene_expression_top_var_headers_rows = gene_expression_top_var_headers_columns
    gene_expression_top_var_headers_columns = tmp
    gene_expression_top_var = np.rot90(np.flip(gene_expression_top_var, 1),
                                       k=-1,
                                       axes=(1, 0))

    # plot_genes_statistic(gene_expression_top_var, gene_expression_top_var_headers_columns, tested_gene_list_file_name)
    lr_results_global = []
    if is_unsupervised:
        # cluster samples for the whole k range in one call
        clfs_results = find_clusters(
            end_k,
            gene_expression_top_var,
            gene_expression_top_var_headers_rows,
            start_k,
            e2g_convertor(gene_expression_top_var_headers_columns),
            reduced_dim_file_name,
            labels_assignment,
            clustering_algorithm=clustering_algorithm)
        # survival_dataset[1:] skips the header row of the survival table
        for i in range(start_k, end_k + 1):
            lr_results_global.append(
                km_curve(clfs_results[i], survival_dataset[1:],
                         gene_expression_top_var_headers_rows,
                         reduced_dim_file_name.split(".")[0], i))
            # (removed commented-out debug snippet that ran calc_HG_test
            # against two hard-coded TCGA patient-id groups)

    else:
        # supervised: use phenotype-defined patient groups instead of clusters
        for i, cur_groups in enumerate(meta_groups):
            labeled_patients = divided_patient_ids_by_label(
                phenotype_file_name, groups=cur_groups)
            # put the current group assignment first so it drives the heatmap
            plot_heatmap(
                gene_expression_top_var,
                e2g_convertor(gene_expression_top_var_headers_columns),
                [labels_assignment[i]] + labels_assignment[:i] +
                labels_assignment[i + 1:],
                gene_expression_top_var_headers_rows,
                reduced_dim_file_name,
                label_index=i)
            lr_results_global.append(
                km_curve(labeled_patients,
                         survival_dataset[1:],
                         gene_expression_top_var_headers_rows,
                         reduced_dim_file_name.split(".")[0],
                         label_index=i))

    return lr_results_global
예제 #11
0
def mutation_pca(tested_gene_list_file_name, total_gene_list_file_name, gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, gene_filter_file_name=None, tested_gene_list_path=None, total_gene_list_path=None, gene_expression_path=None, phenotype_path=None, gene_filter_file_path=None, var_th_index=None, is_unsupervised=True, start_k=2, end_k=2, meta_groups=None, filter_expression=None, is_ge_integ=False):

    integ_data = load_integrated_mutation_data(
                                      mutation_file_name=mutation_file_name,
                                      phenotype_file_name=phenotype_file_name,
                                      survival_file_name=survival_file_name, var_th_index=var_th_index,
                                      meta_groups=meta_groups, filter_expression=filter_expression)
    if integ_data is None:
        print "insufficient data"
        return
    mu_data, mu_data_headers_rows, mu_data_headers_columns, labels_assignment, survival_dataset = integ_data



    all_patients = np.unique(mu_data_headers_rows).flatten()
    all_mutated_genes = np.unique(mu_data[:,0]).flatten()
    # mis_mutated_genes = np.unique(mu_data[np.where(np.core.defchararray.find(mu_data[1:, 8], "missense")!=-1), 1]).flatten()

    all_mutated_vectors = np.zeros((len(all_patients), len(all_mutated_genes)))
    # mis_mutated_vectors = np.array([[0 for y in mis_mutated_genes] for x in range(len(all_patients))])

    print "build vectors from {} entries".format(len(mu_data))

    stopwatch = Stopwatch()
    stopwatch.start()
    a = list(all_patients)
    b = list(all_mutated_genes)
    for i, x in enumerate(mu_data):
        all_mutated_vectors[a.index(mu_data_headers_rows[i])][b.index(x[0])] += 1
    print stopwatch.stop("end mut")
    all_mutated_vectors[all_mutated_vectors>5] =5

    all_mutated_vectors = all_mutated_vectors[:,all_mutated_genes!="TTN" ]
    all_mutated_genes = all_mutated_genes[all_mutated_genes != "TTN"]

    all_mutated_vectors[all_mutated_vectors > 5] = 5
    all_mutated_genes = all_mutated_genes[(all_mutated_vectors != 0).sum(axis=0) > np.shape(all_mutated_vectors)[0] * 0.1]
    all_mutated_vectors = all_mutated_vectors[:,(all_mutated_vectors != 0).sum(axis=0) > np.shape(all_mutated_vectors)[0] * 0.1]
    print "all_mutated_vectors after filter sparse: {}".format(np.shape(all_mutated_vectors))

    if np.size(all_mutated_genes) == 0:
        return


    mutation_expression_integ = all_mutated_vectors
    mutual_patients = all_patients
    mutation_expression_integ_headers_columns = all_mutated_genes

    if is_ge_integ:
        ge_data = load_integrated_ge_data(tested_gene_list_file_name=tested_gene_list_file_name,
                                          total_gene_list_file_name=total_gene_list_file_name,
                                          gene_expression_file_name=gene_expression_file_name,
                                          phenotype_file_name=phenotype_file_name,
                                          survival_file_name=survival_file_name, var_th_index=var_th_index,
                                          meta_groups=meta_groups, filter_expression=filter_expression)
        if ge_data is None:
            print "insufficient data"
            return
        gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = ge_data

        all_mutated_vectors = zscore(all_mutated_vectors, axis=0)
        gene_expression_top_var = zscore(gene_expression_top_var, axis=0)
        mutual_patients = np.array([x for x in all_patients if x in gene_expression_top_var_headers_rows])
        mutual_mutations = all_mutated_vectors[np.in1d(all_patients, mutual_patients)]
        mutual_mutations = mutual_mutations[mutual_patients.argsort()]
        mutual_patients = np.array([x for x in gene_expression_top_var_headers_rows if x in all_patients])
        mutual_expressions = gene_expression_top_var[np.in1d(gene_expression_top_var_headers_rows, mutual_patients)]
        mutual_expressions = mutual_expressions[mutual_patients.argsort()]

        mutual_patients.sort()
        mutation_expression_integ = np.c_[mutual_mutations, mutual_expressions]
        mutation_expression_integ_headers_columns = np.r_[all_mutated_genes, e2g_convertor(gene_expression_top_var_headers_columns)]
    else:
        survival_dataset = np.array(load_survival_data(survival_file_name))

    plot_pca(mutation_expression_integ, labels_assignment, meta_groups)
예제 #12
0
def find_genes_correlations(tested_gene_list_file_names,
                            total_gene_list_file_name,
                            gene_expression_file_names,
                            intersection_gene_file_names,
                            phenotype_file_name=None,
                            filter_expression=None,
                            var_th_index=None,
                            list_mode="ON_THE_FLY"):
    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
    print "about ot analyse: {}".format(str(tested_gene_list_file_names)[:20])
    # fetch gene expression by gene_id, divided by tumor type
    gene_sets = []
    expression_sets = []

    if list_mode == "ON_THE_FLY":
        total_gene_list = total_gene_list_file_name
        intersection_gene_sets = intersection_gene_file_names
    else:
        total_gene_list = load_gene_list(total_gene_list_file_name)
        intersection_gene_sets = []
        if intersection_gene_file_names is not None:
            intersection_gene_sets = [
                np.array([y.split(".")[0] for y in load_gene_list(x)])
                if type(x) == str else [y.split(".")[0] for y in x]
                for x in intersection_gene_file_names
            ]

    all_gene_expressions = [
        np.array(
            load_gene_expression_profile_by_genes(
                x, gene_expression_file_names[i], list_mode=list_mode))
        for i, x in enumerate(tested_gene_list_file_names)
    ]
    if filter_expression is None:
        filtered_patients = np.append(all_gene_expressions[1:],
                                      all_gene_expressions[1:])
    mutual_patients = np.array([
        x for x in all_gene_expressions[0][0][1:]
        if x in all_gene_expressions[1][0][1:] and x in filtered_patients
    ])
    all_gene_expressions[0] = np.c_[
        all_gene_expressions[0][:, 0], all_gene_expressions[0]
        [:, np.in1d(all_gene_expressions[0][0], mutual_patients)]]
    mutual_patients = np.array([
        x for x in all_gene_expressions[1][0][1:]
        if x in all_gene_expressions[0][0][1:] and x in filtered_patients
    ])
    all_gene_expressions[1] = np.c_[
        all_gene_expressions[1][:, 0], all_gene_expressions[1]
        [:, np.in1d(all_gene_expressions[1][0], mutual_patients)]]

    dataset_headers_rows, dataset_headers_columns, dataset = separate_headers(
        all_gene_expressions[0])
    row_var = np.var(dataset, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    all_gene_expressions[0] = np.delete(all_gene_expressions[0],
                                        row_var_masked_indices,
                                        axis=0)

    all_gene_expressions_1 = [[y[0], np.array(y[1:]).astype(np.float)]
                              for x in [all_gene_expressions[0]]
                              for y in x[1:]]
    all_gene_expressions_2 = [[y[0], np.array(y[1:]).astype(np.float)]
                              for x in [all_gene_expressions[1]]
                              for y in x[1:]]

    output = []
    header_columns = []
    for i, cur_1 in enumerate(all_gene_expressions_2):
        header_columns.append(e2g_convertor([all_gene_expressions_2[i][0]])[0])
    for i, cur_1 in enumerate(all_gene_expressions_1):
        for j, cur_2 in enumerate(all_gene_expressions_2):
            prsn = pearsonr(cur_1[1], cur_2[1])
            if not math.isnan(pearsonr(cur_1[1], cur_2[1])[0]):
                output.append([
                    e2g_convertor([all_gene_expressions_1[i][0]])[0], prsn[0],
                    prsn[1]
                ])

    if len(output) == 0:
        return ([], [
            "{}\t({} {} {} {})".format(1.0, 0, 0, 0, 0)
            for x in intersection_gene_file_names
        ])
    output = np.array(output)
    fdr_results = fdrcorrection0(output[:, 2].astype(np.float32),
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    output = np.c_[output, fdr_results[1]]
    output = output[output[:, 3].astype(np.float64).argsort(), :]

    hg_scores = []
    for cur_set in intersection_gene_sets:
        # hg_score = calc_HG_test(total_gene_list_N=[x[0].split(".")[0] for x in all_gene_expressions_1], tests_gene_list_B=cur_set, total_gene_list_n=g2e_convertor(output[np.logical_and(output[:, 3].astype(np.float)  < 0.05, output[:, 1].astype(np.float)  < 0)  , 0]))
        hg_score = calc_HG_test(
            total_gene_list_N=[x.split(".")[0] for x in total_gene_list],
            tests_gene_list_B=cur_set,
            total_gene_list_n=g2e_convertor(
                output[np.logical_and(output[:, 3].astype(np.float) < 0.05,
                                      output[:, 1].astype(np.float) < 0), 0]))
        print hg_score
        hg_scores.append(hg_score)
    file_names = ""
    if tested_gene_list_file_names[0] is str:
        file_names = "_".join(
            [x.split(".")[0] for x in tested_gene_list_file_names])
    print_to_excel(header_columns, output, intersection_gene_sets,
                   intersection_gene_file_names, file_names)
    return (output, hg_scores)
예제 #13
0
def check_group_enrichment_tango(tested_gene_file_name,
                                 total_gene_file_name,
                                 algo="",
                                 module=""):
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if type(total_gene_file_name) == str:
        total_gene_list = [
            x.split("\t")[0] for x in load_gene_list(total_gene_file_name)
        ]
    else:
        total_gene_list = total_gene_file_name

    if type(tested_gene_file_name) == str:
        tested_gene_list = [
            x.split("\t")[0] for x in load_gene_list(tested_gene_file_name)
        ]
    else:
        tested_gene_list = tested_gene_file_name

    df_tested = pd.DataFrame(index=ensembl2entrez_convertor(tested_gene_list))
    df_tested["set"] = 0
    df_tested_file_name = os.path.join(constants.OUTPUT_DIR,
                                       "_".join(["tested", algo, module]))
    df_bg_file_name = os.path.join(constants.OUTPUT_DIR,
                                   "_".join(["bg", algo, module]))
    df_tested.to_csv(df_tested_file_name, header=False, sep="\t")
    pd.DataFrame(index=ensembl2entrez_convertor(total_gene_list)).to_csv(
        df_bg_file_name, header=False, sep="\t")
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "output_{}_{}".format(algo, module))

    conf = file(
        os.path.join(constants.ALGO_BASE_DIR, "tango",
                     "parameter_file.format")).read().format(
                         SET=df_tested_file_name,
                         BACKGROUND=df_bg_file_name,
                         OUTPUT_FILE_NAME=output_file_name)
    conf_file_name = os.path.join(
        constants.OUTPUT_DIR,
        "parameter_file_{}_{}_{}".format(algo, module, time.time()))
    file(conf_file_name, 'w+').write(conf)

    print subprocess.Popen("wine win/annot_sets.exe {}".format(conf_file_name),
                           shell=True,
                           stdout=subprocess.PIPE,
                           cwd=os.path.join(constants.ALGO_BASE_DIR,
                                            "tango")).stdout.read()

    df_results = pd.DataFrame()
    if os.path.isfile(
            output_file_name) and os.path.getsize(output_file_name) > 1:
        df_results = pd.read_csv(output_file_name,
                                 sep="\t",
                                 index_col=False,
                                 header=None)

    hg_report = []
    go_terms = []
    uncorrectd_pvals = []
    FDRs = []
    go_names = []
    go_ns = []
    if len(df_results.index) > 0:
        # go_ns, go_terms, go_names, go_hg_value, uncorrectd_pvals, FDRs = zip(*[("NA", cur[1]["Category"].split(" - ")[1], cur[1]["Category"].split(" - ")[0], cur[1]["Gene IDs"].count(',')+1, cur[1]["Raw Pvalue"], cur[1]["p-value"]) for cur in df_results.iterrows()])
        # hg_report = [{HG_GO_ROOT: "NA", HG_GO_ID: cur[1]["Category"].split(" - ")[1], HG_GO_NAME: cur[1]["Category"].split(" - ")[0], HG_VALUE: cur[1]["Gene IDs"].count(',')+1, HG_PVAL: cur[1]["Raw Pvalue"],
        #               HG_QVAL: cur[1]["p-value"]} for cur in df_results.iterrows()]
        go_ns, go_terms, go_names, go_hg_value, uncorrectd_pvals, FDRs = zip(
            *[("NA", cur[1][6], cur[1][1], cur[1][4], 10**float(cur[1][2]),
               10**float(cur[1][3])) for cur in df_results.iterrows()])
        hg_report = [{
            HG_GO_ROOT: "NA",
            HG_GO_ID: cur[1][6],
            HG_GO_NAME: cur[1][1],
            HG_VALUE: cur[1][5],
            HG_PVAL: 10**float(cur[1][2]),
            HG_QVAL: 10**float(cur[1][3])
        } for cur in df_results.iterrows()]
        hg_report.sort(key=lambda x: x[HG_QVAL])
        hg_report = filter(lambda x: x[HG_QVAL] <= 0.05, hg_report)

    output_rows = [
        ("\r\n".join(e2g_convertor(tested_gene_list)), "\r\n".join(go_ns),
         "\r\n".join(go_terms), "\r\n".join(go_names),
         "\r\n".join(map(str, uncorrectd_pvals)), "\r\n".join(map(str, FDRs)))
    ]
    print_to_excel(output_rows,
                   str(tested_gene_file_name)[:10],
                   str(total_gene_file_name)[:10])
    return hg_report