def find_clusters_and_survival(tested_gene_list_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean",
                               plot=True):
    """Group patients and evaluate ``km_curve`` for every grouping.

    The integrated expression data is fetched with
    ``load_integrated_ge_data``.  In unsupervised mode the patients are
    clustered by ``find_clusters`` and ``km_curve`` is evaluated once per k
    in ``[start_k, end_k]``; otherwise one heatmap and one ``km_curve`` call
    are produced per entry of ``meta_groups``.

    ``gene_filter_file_name`` and the ``*_path`` parameters are accepted but
    not used by this function.

    Returns ``None`` when no data could be loaded, otherwise the tuple
    ``(results, clfs_results)``: ``results`` holds the first element of each
    ``km_curve`` return value and ``clfs_results`` is the ``find_clusters``
    output (``None`` in the supervised mode).
    """
    loaded = load_integrated_ge_data(
        tested_gene_list_file_name=tested_gene_list_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if loaded is None:
        print("insufficient data")
        return None

    (expression, row_headers, column_headers,
     labels_assignment, survival_dataset) = loaded

    if not is_unsupervised:
        # Supervised mode: patients are split by the phenotype groups.
        supervised_results = []
        for group_index, group_definition in enumerate(meta_groups):
            patients_by_label = divided_patient_ids_by_label(
                phenotype_file_name, groups=group_definition)
            gene_symbols = e2g_convertor(column_headers)
            # Put the labels of the current grouping first, then the rest.
            reordered_labels = ([labels_assignment[group_index]]
                                + labels_assignment[:group_index]
                                + labels_assignment[group_index + 1:])
            plot_heatmap(expression, gene_symbols, reordered_labels,
                         row_headers, tested_gene_list_file_name,
                         label_index=group_index)
            curve = km_curve(patients_by_label, survival_dataset[1:],
                             row_headers,
                             tested_gene_list_file_name.split(".")[0],
                             label_index=group_index)
            supervised_results.append(curve[0])
        return supervised_results, None

    # Unsupervised mode: cluster first, then one survival curve per k.
    clusterings = find_clusters(end_k, expression, row_headers, start_k,
                                e2g_convertor(column_headers),
                                tested_gene_list_file_name, labels_assignment,
                                clustering_algorithm=clustering_algorithm,
                                plot=plot)
    unsupervised_results = [
        km_curve(clusterings[k], survival_dataset[1:], row_headers,
                 tested_gene_list_file_name.split(".")[0], k)[0]
        for k in range(start_k, end_k + 1)
    ]
    return unsupervised_results, clusterings
def main(mrna_list_file_names, mir_list_file_names):
    """Collect the miRNAs associated with the given mRNA lists and save them.

    Every row of ``mir_clusters_by_targets.txt`` (as returned by
    ``load_mirna_clusters``) is read as ``[target, member, member, ...]``.
    A row contributes its members that also appear in one of the miRNA lists
    whenever its target (with everything after the first ``"."`` dropped) is
    found in one of the mRNA lists.

    The de-duplicated result is passed through ``e2g_convertor``, written
    (CRLF separated) to ``mir_<mrna list names>.txt`` under
    ``constants.LIST_DIR``, printed and returned.
    """
    mirna_clusters = load_mirna_clusters("mir_clusters_by_targets.txt")
    associated_mirna = []
    for cur_mrna_list in mrna_list_file_names:
        # A set gives O(1) membership tests inside the innermost loop.
        mrna_ids = frozenset(load_gene_list(cur_mrna_list))
        for cur_mir_list in mir_list_file_names:
            mir_list = load_gene_list(cur_mir_list)
            for cluster in mirna_clusters:
                if cluster[0].split(".")[0] not in mrna_ids:
                    continue
                # Extending with an empty intersection is a no-op, so the
                # original "non-empty" check is implied.
                associated_mirna.extend(
                    set(cluster[1:]).intersection(mir_list))

    associated_mirna = list(set(associated_mirna))
    associated_mirna = e2g_convertor(associated_mirna)

    output_name = "mir_{}.txt".format(
        "_".join([x.split(".")[0] for x in mrna_list_file_names]))
    # The context manager closes the file even when the write fails.
    with open(os.path.join(constants.LIST_DIR, output_name), "w+") as f:
        f.write("\r\n".join(associated_mirna))

    print(associated_mirna)
    return associated_mirna
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment study for a gene list and export the result.

    The tested list is the study set and the total list is the population.
    The GO OBO file and the gene2go association file are downloaded into
    ``constants.GO_DIR`` when they are missing (the association archive is
    gunzipped after the download).  Terms with ``p_fdr_bh <= 0.05`` are kept
    and handed, as a single CRLF-joined row, to ``print_to_excel``.
    """
    background_genes = load_gene_list(total_gene_file_name)
    study_genes = load_gene_list(tested_gene_file_name)

    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(obo_path)

    association_path = os.path.join(constants.GO_DIR,
                                    constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as packed, \
                open(association_path, 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)

    associations = read_ncbi_gene2go(association_path, no_top=True)
    population_ids = [int(gene)
                      for gene in ensembl2entrez_convertor(background_genes)]
    study = GOEnrichmentStudy(population_ids, associations, obo_dag,
                              methods=["bonferroni", "fdr_bh"])
    study_ids = [int(gene) for gene in ensembl2entrez_convertor(study_genes)]
    records = study.run_study(study_ids)

    significant = []
    for record in records:
        if record.p_fdr_bh <= 0.05:
            significant.append((record.NS, record.GO, record.goterm.name,
                                record.p_uncorrected, record.p_fdr_bh))

    if significant:
        go_ns, go_terms, go_names, raw_pvals, fdrs = zip(*significant)
    else:
        go_ns, go_terms, go_names, raw_pvals, fdrs = [], [], [], [], []

    row = ("\r\n".join(e2g_convertor(study_genes)),
           "\r\n".join(go_ns),
           "\r\n".join(go_terms),
           "\r\n".join(go_names),
           "\r\n".join(map(str, raw_pvals)),
           "\r\n".join(map(str, fdrs)))
    print_to_excel([row], tested_gene_file_name, total_gene_file_name)
def init_specific_params(ge_file_name=os.path.join(constants.DATA_DIR, "ge.tsv"),
                         network_file_name=os.path.join(
                             constants.NETWORKS_DIR, NETWORK_NAME + ".sif")):
    """Write the expression table with gene ID / name columns and return paths.

    The expression profile at ``ge_file_name`` is loaded, prefixed with a
    ``"gene ID"`` column (the row index) and a ``"GeneName"`` column (first
    value returned by ``e2g_convertor``, NaN when it returns nothing),
    de-duplicated on ``"gene ID"`` (first occurrence wins) and written as a
    tab-separated file next to the input with an ``_mts.tsv`` suffix.

    Returns the tuple ``(ge_file_name_mts, network_file_name,
    output_file_name)`` where ``output_file_name`` is ``matisse_output.txt``
    under ``constants.OUTPUT_DIR``.
    """
    h_rows, h_columns, values = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=ge_file_name))
    df_ge = pd.DataFrame(columns=h_columns, index=h_rows, data=values)
    # Remember the condition columns before the two id columns are added.
    df_ge_cond_col = df_ge.columns
    df_ge["gene ID"] = df_ge.index

    def _gene_name(gene_id):
        # Convert once per gene; ``np.nan`` replaces the ``np.NAN`` alias,
        # which NumPy 2.0 removed.
        names = e2g_convertor([gene_id])
        return names[0] if len(names) > 0 else np.nan

    df_ge["GeneName"] = [_gene_name(cur) for cur in df_ge.index]
    df_ge = df_ge[["gene ID", "GeneName"] + list(df_ge_cond_col)]
    df_ge = df_ge[~df_ge['gene ID'].duplicated(keep='first')]

    ge_file_name_mts = os.path.splitext(ge_file_name)[0] + "_mts.tsv"
    df_ge.to_csv(ge_file_name_mts, index=False, sep="\t")
    output_file_name = os.path.join(constants.OUTPUT_DIR, "matisse_output.txt")
    return ge_file_name_mts, network_file_name, output_file_name
def gene_correlation_scores(tested_gene_list_file_name,
                            total_gene_list_file_name,
                            gene_expression_file_name,
                            gene_filter_file_name=None,
                            top_n=2000):
    """Rank genes by their summed absolute correlation with the tested genes.

    For every tested gene the Pearson correlation against every expression
    row (the first row of the loaded table is skipped) is computed; rows
    whose id is itself a tested gene, or whose correlation is NaN, are
    ignored.  The ``top_n`` strongest partners per tested gene are kept and
    their absolute correlations are summed per partner.  The ``top_n``
    partners with the largest sums are printed, and their ids are written
    (CRLF separated) to ``corr_<tested list name>_top_<top_n>.txt`` under
    ``constants.LIST_DIR``.

    Raises ``IndexError`` when a tested gene has no row in the expression
    table.
    """
    print("about ot analyse: {}".format(tested_gene_list_file_name))
    # fetch gene expression by gene_id, divided by tumor type
    total_gene_expression = np.array(
        load_gene_expression_profile_by_genes(total_gene_list_file_name,
                                              gene_expression_file_name,
                                              gene_filter_file_name))
    gene_ids = load_gene_list(tested_gene_list_file_name)
    tested_ids = frozenset(gene_ids)  # O(1) membership in the inner loop

    ranks_dict = {}
    for gene_id in gene_ids:
        gene_row = total_gene_expression[np.where(
            total_gene_expression[:, 0] == gene_id)][0]
        # Loop invariant: convert the tested gene's profile only once.
        gene_values = gene_row[1:].astype(np.float32)

        partners = []
        for other_row in total_gene_expression[1:]:
            prs = pearsonr(other_row[1:].astype(np.float32), gene_values)[0]
            if not math.isnan(prs) and other_row[0] not in tested_ids:
                partners.append((other_row[0], abs(prs), prs > 0))
        partners = sorted(partners, key=lambda x: x[1], reverse=True)[:top_n]

        for partner_id, abs_correlation, _ in partners:
            ranks_dict.setdefault(partner_id, []).append(abs_correlation)

    ranks_score = [(k, sum(v), e2g_convertor([k])[0])
                   for k, v in ranks_dict.items()]
    ranks_score = sorted(ranks_score, key=lambda x: x[1],
                         reverse=True)[:top_n]
    print(ranks_score)

    output_path = os.path.join(
        constants.LIST_DIR,
        "corr_{}_top_{}.txt".format(
            tested_gene_list_file_name.split(".")[0], top_n))
    # The context manager closes the file even when the write fails.
    with open(output_path, 'w+') as f:
        f.write("\r\n".join([x[0] for x in ranks_score]))
def plot_genes_statistic(gene_expression_top_var,
                         gene_expression_top_var_headers_columns,
                         tested_gene_list_file_name):
    """Save a box plot (with means) of the expression matrix columns.

    Tick labels are the ``e2g_convertor`` output for the column headers; the
    title reports the average of the per-column variances.  The figure is
    saved as a time-stamped PNG under ``<constants.BASE_PROFILE>/output``.
    """
    axes = plt.subplot(111)
    column_count = len(gene_expression_top_var_headers_columns)
    box_positions = np.arange(column_count) + 1
    tick_labels = e2g_convertor(gene_expression_top_var_headers_columns)
    axes.boxplot(gene_expression_top_var,
                 positions=box_positions,
                 showmeans=True,
                 labels=tick_labels)

    list_name = tested_gene_list_file_name.split(".")[0]
    mean_variance = np.average(np.var(gene_expression_top_var, axis=0))
    axes.set_title("genes_statistic_{}_{}_averaged var:{}".format(
        constants.CANCER_TYPE, list_name, '%.3f' % mean_variance))

    for tick in axes.xaxis.get_ticklabels():
        tick.set_fontsize(7)
        tick.set_rotation(90)

    figure_name = "genes_statistic_{}_{}_{}.png".format(
        constants.CANCER_TYPE,
        tested_gene_list_file_name.split(".")[0],
        time.time())
    plt.savefig(os.path.join(constants.BASE_PROFILE, "output", figure_name))
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster genes and report GO / gene-list enrichment per cluster.

    Steps, in order:

    1. Load the expression of the tested genes, optionally keep only the
       patients selected by ``filter_expression`` and drop the rows whose
       variance is below the ``var_th_index``-th largest variance.
    2. When ``calc_go`` is set, build a ``GOEnrichmentStudy`` against the
       total gene list and print the significant terms of the kept genes.
    3. ``cluster_algorithm == "kmeans"``: for each k in ``[start_k, end_k]``
       cluster the genes, and for each cluster collect the ``check_enrichment``
       URL, the significant GO terms (``p_fdr_bh <= 0.05``) and one
       ``calc_HG_test`` value per enrichment list into an output row.
    4. ``cluster_algorithm == "hierarchical"``: draw a seaborn clustermap of
       patients x genes and one heatmap per patient label assignment
       (requires ``meta_groups``).
    5. Export the collected rows with ``print_to_excel``.

    Returns ``None`` (also on the early exit taken when filtering leaves a
    single expression column).

    NOTE(review): the source this was restored from had lost its indentation;
    the nesting of the statements that follow the per-cluster loop and of the
    final per-label heatmap loop is inferred -- confirm against the intended
    behaviour.
    """
    # fetch gene expression by gene_id, divided by tumor type
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    (tested_gene_expression_headers_rows,
     tested_gene_expression_headers_columns,
     tested_gene_expression) = separate_headers(tested_gene_expression)

    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print("number of filtered patients from phenotypes: {}".format(
            len(filtered_patients)))
    else:
        print("no filter applied")
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = \
        filter_genes_dataset_by_patients(
            filtered_patients, tested_gene_expression_headers_columns,
            tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print("no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression))
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print("clustering patients by groups")
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        for cur in enrichment_list_file_names:
            enrichment_lists.append(load_gene_list(cur))

    # Without an explicit threshold the smallest variance is used, so no
    # row is strictly below it and nothing is dropped.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = \
        tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    if calc_go:
        if not os.path.exists(
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
            wget.download(
                constants.GO_OBO_URL,
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        g = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        g_res = g.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res
                      if cur.p_fdr_bh <= 0.05]
        print(GO_results)

    if cluster_algorithm == "kmeans":
        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")
            for i in range(n_clusters):
                # Relabel the clusters by the rank of their average
                # expression.
                ranks = []
                for j in range(n_clusters):
                    ranks.append(
                        np.average(
                            np.delete(gene_expression_top_var,
                                      np.where(km_clf != j)[0],
                                      axis=0)))
                ranks = rankdata(ranks)
                cluster_labels = np.array(km_clf)
                for j in range(n_clusters):
                    cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
                labels_assignment = [cluster_labels + 1]

                # Keep only the gene ids that belong to cluster i.
                cluster_indices = np.where(km_clf != i)[0]
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_headers_row_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = g.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # zip() yields tuples; convert to lists so the
                        # enrichment-list entries below can be appended.
                        (go_ns, go_terms, go_names, uncorrectd_pvals,
                         FDRs) = [list(column)
                                  for column in zip(*GO_results)]
                if len(enrichment_lists) != 0:
                    for j, cur in enumerate(enrichment_lists):
                        go_terms.append(
                            enrichment_list_file_names[j].split(".")[0])
                        uncorrectd_pvals.append(
                            calc_HG_test(
                                [x.split(".")[0] for x in tested_gene_list],
                                [x.split(".")[0] for x in cur], [
                                    x.split(".")[0]
                                    for x in gene_headers_row_cluster
                                ]))
                        FDRs.append(".")
                        go_names.append(".")
                        go_ns.append(".")
                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns), "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

            # NOTE(review): assumed to run once per k, after the per-cluster
            # loop -- confirm.
            gene_sorted_heatmap = np.rot90(np.flip(
                gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                           k=-1,
                                           axes=(1, 0))
            find_clusters(end_k,
                          gene_sorted_heatmap,
                          gene_expression_top_var_header_columns,
                          start_k,
                          e2g_convertor(gene_expression_top_var_header_rows),
                          tested_gene_list_file_name,
                          labels_assignment=labels_assignment_patients)
            plot_heatmap(gene_expression_top_var,
                         gene_expression_top_var_header_columns,
                         labels_assignment,
                         gene_expression_top_var_header_rows,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)
        # One colour per distinct label of the first patient grouping.
        dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
        row_colors = [dct.get(label)
                      for label in labels_assignment_patients[0]]
        dct = {1: 'b', 2: 'r'}
        gene_expression_top_var_header_rows_trimmed = [
            x.split(".")[0] for x in gene_expression_top_var_header_rows
        ]
        cluster_grid = sns.clustermap(df,
                                      row_colors=row_colors,
                                      metric="euclidean",
                                      robust=True,
                                      method="single")
        den_genes = scipy.cluster.hierarchy.dendrogram(
            cluster_grid.dendrogram_col.linkage,
            labels=df.columns,
            color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)
        cluster_grid.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))
        # NOTE(review): assumed to belong to the hierarchical branch, which
        # already requires labels_assignment_patients -- confirm.
        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
def patient_sets_distribution_differences(tested_gene_list_file_name,
                                          total_gene_list_file_name,
                                          gene_expression_file_name,
                                          phenotype_file_name,
                                          survival_file_name,
                                          gene_filter_file_name=None,
                                          tested_gene_list_path=None,
                                          total_gene_list_path=None,
                                          gene_expression_path=None,
                                          phenotype_path=None,
                                          gene_filter_file_path=None,
                                          var_th_index=None,
                                          is_unsupervised=True,
                                          start_k=2,
                                          end_k=2,
                                          meta_groups=None,
                                          filter_expression=None,
                                          clustering_algorithm="euclidean",
                                          average_patients=True,
                                          compute_hotelling=False):
    """Plot, per phenotype grouping, how the patient groups differ.

    For every entry of ``meta_groups`` the expression rows are ordered by
    their label (the label array is sorted in place), PCA plots and a
    heatmap are drawn (of the per-label averages when ``average_patients``
    is set, otherwise of the unordered matrix) and, when
    ``compute_hotelling`` is set, a lower-triangular table of Hotelling T2
    p-values (``inference(0.05)``) between the groups is written to a
    time-stamped ``hotelling_*.txt`` file under
    ``<constants.BASE_PROFILE>/output``.

    Returns ``None``; prints ``"insufficient data"`` when nothing could be
    loaded.
    """
    data = load_integrated_ge_data(
        tested_gene_list_file_name=tested_gene_list_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if data is None:
        print("insufficient data")
        return
    (gene_expression_top_var, gene_expression_top_var_headers_rows,
     gene_expression_top_var_headers_columns, labels_assignment,
     survival_dataset) = data

    for i, cur_groups in enumerate(meta_groups):
        # Currently unused; kept because the call was part of the original
        # flow.
        labeled_patients = divided_patient_ids_by_label(phenotype_file_name,
                                                        groups=cur_groups)
        # Order the rows by label, then sort the labels themselves so that
        # both describe the same grouping.
        ordered_gene_expression = gene_expression_top_var[
            labels_assignment[i].argsort(), :]
        labels_assignment[i].sort()
        sorted_labels = labels_assignment[i]

        heatmap_values = gene_expression_top_var
        if average_patients:
            group_means = [
                np.average(ordered_gene_expression[
                    np.where(sorted_labels == label)[0], :], axis=0)
                for label in np.unique(sorted_labels)
            ]
            heatmap_values = np.vstack(group_means) if group_means else None

        plot_pca(ordered_gene_expression, [labels_assignment[i]],
                 [meta_groups[i]],
                 tested_gene_list_file_name=tested_gene_list_file_name)
        plot_pca_by_samples(
            ordered_gene_expression, [labels_assignment[i]],
            [meta_groups[i]],
            tested_gene_list_file_name=tested_gene_list_file_name)
        plot_heatmap(heatmap_values,
                     e2g_convertor(gene_expression_top_var_headers_columns),
                     [np.unique(labels_assignment[i])],
                     gene_expression_top_var_headers_rows,
                     tested_gene_list_file_name,
                     label_index=i)

        if compute_hotelling:
            report_path = os.path.join(
                constants.BASE_PROFILE, "output",
                "hotelling_{}_{}_{}.txt".format(
                    constants.CANCER_TYPE,
                    tested_gene_list_file_name.split(".")[0], time.time()))
            group_labels = np.unique(sorted_labels)
            lines = ["\t" + "".join("group {}\t".format(label)
                                    for label in group_labels)]
            for cur_label_1 in group_labels:
                cells = ["group {}\t".format(cur_label_1)]
                # Boolean masks select each group's rows directly.  This
                # replaces the start/end slicing, whose start index for
                # group 1 was never defined and whose end index assumed
                # consecutive integer labels.
                group_1 = ordered_gene_expression[sorted_labels == cur_label_1]
                for cur_label_2 in group_labels:
                    if cur_label_2 > cur_label_1:
                        continue  # only the lower triangle is reported
                    group_2 = ordered_gene_expression[
                        sorted_labels == cur_label_2]
                    T2 = spm1d.stats.hotellings2(group_1, group_2)
                    T2i = T2.inference(0.05)
                    cells.append("{}\t".format(T2i.p))
                lines.append("".join(cells))
            with open(report_path, "w+") as f:
                f.write("\n".join(lines) + "\n")
def create_modules_output(modules, score_file_name):
    """Build one output record per gene found in ``modules``.

    Scores are read from the tab-separated ``score_file_name`` (indexed by
    its ``id`` column) when a file name is given; with
    ``constants.IS_PVAL_SCORES`` the score becomes ``-log10(pval)``.  Genes
    without a score row get a score of 0.  One dict per (module, gene) pair
    is folded with ``reduce_to_dict`` and every resulting ``key -> value``
    entry is returned as ``merge_two_dicts({"id": key}, value)``.
    """
    scores = None
    if score_file_name is not None:
        scores = pd.read_csv(score_file_name, sep="\t").set_index("id")
        if constants.IS_PVAL_SCORES:
            scores["score"] = scores["pval"].apply(lambda x: -np.log10(x))

    # Genes that have no score row are given a zero score.
    unscored = []
    for module in modules:
        for gene in module:
            if scores is None or gene not in scores.index:
                unscored.append({"score": 0, "id": gene})

    if unscored:
        unscored_frame = pd.DataFrame(unscored).set_index("id")
        first_occurrence = ~unscored_frame.index.duplicated(keep='first')
        unscored_frame = unscored_frame[first_occurrence]
        scores = pd.concat([scores, unscored_frame], axis=0)

    records = []
    for module_index, module in enumerate(modules):
        for gene in module:
            records.append({
                "eid": gene,
                "modules": [module_index],
                "id": gene,
                "gene_symbol": e2g_convertor([gene])[0],
                "score": scores.loc[gene, "score"],
            })

    merged = reduce(reduce_to_dict, records, {})
    output = []
    for key, value in merged.items():
        output.append(merge_two_dicts({"id": key}, value))
    return output
def find_clusters_and_survival(reduced_dim_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean"):
    """Group patients on the reduced-dimension data and run ``km_curve``.

    The expression file is read from ``constants.OUTPUT_GLOBAL_DIR`` (the
    ``gene_expression_path`` argument is not used).  The row and column
    headers returned by ``load_integrated_ge_data`` are swapped and the
    matrix is re-oriented with ``np.rot90(np.flip(...))`` before grouping.

    In unsupervised mode ``find_clusters`` is followed by one ``km_curve``
    call per k in ``[start_k, end_k]``; otherwise a heatmap and a
    ``km_curve`` call are produced per entry of ``meta_groups``.

    Returns the list of ``km_curve`` results, or ``None`` when no data could
    be loaded.
    """
    loaded = load_integrated_ge_data(
        tested_gene_list_file_name=reduced_dim_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        gene_expression_path=os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                          gene_expression_file_name),
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if loaded is None:
        print("insufficient data")
        return None

    # Unpacking in this order performs the row/column header swap.
    (expression, column_headers, row_headers,
     labels_assignment, survival_dataset) = loaded
    expression = np.rot90(np.flip(expression, 1), k=-1, axes=(1, 0))

    if is_unsupervised:
        clusterings = find_clusters(
            end_k,
            expression,
            row_headers,
            start_k,
            e2g_convertor(column_headers),
            reduced_dim_file_name,
            labels_assignment,
            clustering_algorithm=clustering_algorithm)
        return [
            km_curve(clusterings[k], survival_dataset[1:], row_headers,
                     reduced_dim_file_name.split(".")[0], k)
            for k in range(start_k, end_k + 1)
        ]

    survival_results = []
    for group_index, group_definition in enumerate(meta_groups):
        patients_by_label = divided_patient_ids_by_label(
            phenotype_file_name, groups=group_definition)
        gene_symbols = e2g_convertor(column_headers)
        # Put the labels of the current grouping first, then the rest.
        reordered_labels = ([labels_assignment[group_index]]
                            + labels_assignment[:group_index]
                            + labels_assignment[group_index + 1:])
        plot_heatmap(expression, gene_symbols, reordered_labels, row_headers,
                     reduced_dim_file_name, label_index=group_index)
        survival_results.append(
            km_curve(patients_by_label,
                     survival_dataset[1:],
                     row_headers,
                     reduced_dim_file_name.split(".")[0],
                     label_index=group_index))
    return survival_results
def mutation_pca(tested_gene_list_file_name,
                 total_gene_list_file_name,
                 gene_expression_file_name,
                 phenotype_file_name,
                 survival_file_name,
                 mutation_file_name,
                 gene_filter_file_name=None,
                 tested_gene_list_path=None,
                 total_gene_list_path=None,
                 gene_expression_path=None,
                 phenotype_path=None,
                 gene_filter_file_path=None,
                 var_th_index=None,
                 is_unsupervised=True,
                 start_k=2,
                 end_k=2,
                 meta_groups=None,
                 filter_expression=None,
                 is_ge_integ=False):
    """Plot a PCA of per-patient mutation counts.

    A patients x genes count matrix is built from the mutation records
    (first column of each record is the gene), counts are capped at 5, the
    ``"TTN"`` column is dropped and only genes mutated in more than 10% of
    the patients are kept.  With ``is_ge_integ`` the z-scored mutation
    matrix is joined column-wise with the z-scored expression matrix of the
    patients present in both data sets.  The result is passed to
    ``plot_pca`` together with the label assignment and ``meta_groups``.

    Returns ``None``; exits early when the data could not be loaded or no
    gene survives the sparsity filter.
    """
    integ_data = load_integrated_mutation_data(
        mutation_file_name=mutation_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if integ_data is None:
        print("insufficient data")
        return
    (mu_data, mu_data_headers_rows, mu_data_headers_columns,
     labels_assignment, survival_dataset) = integ_data

    all_patients = np.unique(mu_data_headers_rows).flatten()
    all_mutated_genes = np.unique(mu_data[:, 0]).flatten()
    all_mutated_vectors = np.zeros((len(all_patients), len(all_mutated_genes)))

    print("build vectors from {} entries".format(len(mu_data)))
    stopwatch = Stopwatch()
    stopwatch.start()
    # Dict lookups replace list.index(), which rescanned the whole list for
    # every mutation record.
    patient_position = {patient: idx
                        for idx, patient in enumerate(all_patients)}
    gene_position = {gene: idx for idx, gene in enumerate(all_mutated_genes)}
    for i, x in enumerate(mu_data):
        all_mutated_vectors[patient_position[mu_data_headers_rows[i]],
                            gene_position[x[0]]] += 1
    print(stopwatch.stop("end mut"))

    # Cap the counts, then drop the TTN column.
    all_mutated_vectors[all_mutated_vectors > 5] = 5
    not_ttn = all_mutated_genes != "TTN"
    all_mutated_vectors = all_mutated_vectors[:, not_ttn]
    all_mutated_genes = all_mutated_genes[not_ttn]

    # Keep genes mutated in more than 10% of the patients.
    frequent = ((all_mutated_vectors != 0).sum(axis=0)
                > np.shape(all_mutated_vectors)[0] * 0.1)
    all_mutated_genes = all_mutated_genes[frequent]
    all_mutated_vectors = all_mutated_vectors[:, frequent]
    print("all_mutated_vectors after filter sparse: {}".format(
        np.shape(all_mutated_vectors)))
    if np.size(all_mutated_genes) == 0:
        return

    mutation_expression_integ = all_mutated_vectors
    mutual_patients = all_patients
    mutation_expression_integ_headers_columns = all_mutated_genes
    if is_ge_integ:
        ge_data = load_integrated_ge_data(
            tested_gene_list_file_name=tested_gene_list_file_name,
            total_gene_list_file_name=total_gene_list_file_name,
            gene_expression_file_name=gene_expression_file_name,
            phenotype_file_name=phenotype_file_name,
            survival_file_name=survival_file_name,
            var_th_index=var_th_index,
            meta_groups=meta_groups,
            filter_expression=filter_expression)
        if ge_data is None:
            print("insufficient data")
            return
        (gene_expression_top_var, gene_expression_top_var_headers_rows,
         gene_expression_top_var_headers_columns, labels_assignment,
         survival_dataset) = ge_data

        all_mutated_vectors = zscore(all_mutated_vectors, axis=0)
        gene_expression_top_var = zscore(gene_expression_top_var, axis=0)

        # Mutation rows of the shared patients, ordered by patient id.
        mutual_patients = np.array([
            x for x in all_patients
            if x in gene_expression_top_var_headers_rows
        ])
        mutual_mutations = all_mutated_vectors[np.in1d(all_patients,
                                                       mutual_patients)]
        mutual_mutations = mutual_mutations[mutual_patients.argsort()]

        # Expression rows of the shared patients, ordered the same way.
        mutual_patients = np.array([
            x for x in gene_expression_top_var_headers_rows
            if x in all_patients
        ])
        mutual_expressions = gene_expression_top_var[np.in1d(
            gene_expression_top_var_headers_rows, mutual_patients)]
        mutual_expressions = mutual_expressions[mutual_patients.argsort()]
        mutual_patients.sort()

        mutation_expression_integ = np.c_[mutual_mutations,
                                          mutual_expressions]
        mutation_expression_integ_headers_columns = np.r_[
            all_mutated_genes,
            e2g_convertor(gene_expression_top_var_headers_columns)]
    else:
        survival_dataset = np.array(load_survival_data(survival_file_name))

    plot_pca(mutation_expression_integ, labels_assignment, meta_groups)
def find_genes_correlations(tested_gene_list_file_names,
                            total_gene_list_file_name,
                            gene_expression_file_names,
                            intersection_gene_file_names,
                            phenotype_file_name=None,
                            filter_expression=None,
                            var_th_index=None,
                            list_mode="ON_THE_FLY"):
    """Correlate the genes of two expression data sets and test overlaps.

    The first two entries of ``tested_gene_list_file_names`` /
    ``gene_expression_file_names`` are loaded, restricted to the patients
    they share (and, with ``filter_expression``, to the selected patients)
    and every gene of the first set is Pearson-correlated with every gene of
    the second set.  The p-values are FDR-corrected with ``fdrcorrection0``
    and the rows are ordered by the corrected value.  For each intersection
    gene set ``calc_HG_test`` is evaluated against the genes having a
    corrected value below 0.05 and a negative correlation.  The table is
    exported with ``print_to_excel``.

    With ``list_mode == "ON_THE_FLY"`` the gene-list arguments are used as
    given instead of being loaded from files.

    Returns ``(output, hg_scores)``.  When no correlation could be computed
    the return value is ``([], [...])`` with one placeholder string per
    entry of ``intersection_gene_file_names``.
    """
    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
    print("about ot analyse: {}".format(str(tested_gene_list_file_names)[:20]))

    # fetch gene expression by gene_id, divided by tumor type
    if list_mode == "ON_THE_FLY":
        total_gene_list = total_gene_list_file_name
        intersection_gene_sets = intersection_gene_file_names
    else:
        total_gene_list = load_gene_list(total_gene_list_file_name)
        intersection_gene_sets = []
        # NOTE(review): nesting inferred -- this may have been meant to run
        # for both list modes; confirm.
        if intersection_gene_file_names is not None:
            intersection_gene_sets = [
                np.array([y.split(".")[0] for y in load_gene_list(x)])
                if type(x) == str else [y.split(".")[0] for y in x]
                for x in intersection_gene_file_names
            ]

    all_gene_expressions = [
        np.array(
            load_gene_expression_profile_by_genes(
                x, gene_expression_file_names[i], list_mode=list_mode))
        for i, x in enumerate(tested_gene_list_file_names)
    ]
    if filter_expression is None:
        filtered_patients = np.append(all_gene_expressions[1:],
                                      all_gene_expressions[1:])

    # Keep, in both data sets, the id column plus the shared patients.
    mutual_patients = np.array([
        x for x in all_gene_expressions[0][0][1:]
        if x in all_gene_expressions[1][0][1:] and x in filtered_patients
    ])
    all_gene_expressions[0] = np.c_[
        all_gene_expressions[0][:, 0], all_gene_expressions[0]
        [:, np.in1d(all_gene_expressions[0][0], mutual_patients)]]
    mutual_patients = np.array([
        x for x in all_gene_expressions[1][0][1:]
        if x in all_gene_expressions[0][0][1:] and x in filtered_patients
    ])
    all_gene_expressions[1] = np.c_[
        all_gene_expressions[1][:, 0], all_gene_expressions[1]
        [:, np.in1d(all_gene_expressions[1][0], mutual_patients)]]

    dataset_headers_rows, dataset_headers_columns, dataset = separate_headers(
        all_gene_expressions[0])
    row_var = np.var(dataset, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    # NOTE(review): the indices come from the header-less ``dataset`` but are
    # applied to the array that still has its header row -- verify there is
    # no off-by-one when an explicit var_th_index is given.
    all_gene_expressions[0] = np.delete(all_gene_expressions[0],
                                        row_var_masked_indices,
                                        axis=0)

    # ``float`` replaces the ``np.float`` alias removed in NumPy 1.24.
    all_gene_expressions_1 = [[y[0], np.array(y[1:]).astype(float)]
                              for x in [all_gene_expressions[0]]
                              for y in x[1:]]
    all_gene_expressions_2 = [[y[0], np.array(y[1:]).astype(float)]
                              for x in [all_gene_expressions[1]]
                              for y in x[1:]]

    output = []
    header_columns = [e2g_convertor([gene[0]])[0]
                      for gene in all_gene_expressions_2]
    for cur_1 in all_gene_expressions_1:
        for cur_2 in all_gene_expressions_2:
            # Compute the correlation once and reuse it for the NaN check.
            prsn = pearsonr(cur_1[1], cur_2[1])
            if not math.isnan(prsn[0]):
                output.append(
                    [e2g_convertor([cur_1[0]])[0], prsn[0], prsn[1]])
    if len(output) == 0:
        return ([], [
            "{}\t({} {} {} {})".format(1.0, 0, 0, 0, 0)
            for x in intersection_gene_file_names
        ])

    output = np.array(output)
    fdr_results = fdrcorrection0(output[:, 2].astype(np.float32),
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    output = np.c_[output, fdr_results[1]]
    output = output[output[:, 3].astype(np.float64).argsort(), :]

    hg_scores = []
    for cur_set in intersection_gene_sets:
        hg_score = calc_HG_test(
            total_gene_list_N=[x.split(".")[0] for x in total_gene_list],
            tests_gene_list_B=cur_set,
            total_gene_list_n=g2e_convertor(
                output[np.logical_and(output[:, 3].astype(float) < 0.05,
                                      output[:, 1].astype(float) < 0), 0]))
        print(hg_score)
        hg_scores.append(hg_score)

    file_names = ""
    # ``x is str`` compared the value with the type object and was always
    # False, so the joined name was never produced.
    if isinstance(tested_gene_list_file_names[0], str):
        file_names = "_".join(
            [x.split(".")[0] for x in tested_gene_list_file_names])
    print_to_excel(header_columns, output, intersection_gene_sets,
                   intersection_gene_file_names, file_names)
    return (output, hg_scores)
def check_group_enrichment_tango(tested_gene_file_name, total_gene_file_name, algo="", module=""):
    """Run the TANGO enrichment tool for a tested gene set against a background.

    The tested and background gene ids are converted with
    ``ensembl2entrez_convertor`` and written as tab-separated files under
    ``constants.OUTPUT_DIR``. The TANGO parameter template is filled in,
    ``win/annot_sets.exe`` is executed through wine, and its output file is
    parsed. A summary row is handed to ``print_to_excel``.

    Args:
        tested_gene_file_name: either a ``str`` (passed to ``load_gene_list``,
            first tab-separated field of every line is used) or an already
            loaded sequence of gene ids.
        total_gene_file_name: background genes, same conventions as
            ``tested_gene_file_name``.
        algo: label embedded in the names of the generated files.
        module: label embedded in the names of the generated files.

    Returns:
        A list of dicts keyed by ``HG_GO_ROOT``, ``HG_GO_ID``, ``HG_GO_NAME``,
        ``HG_VALUE``, ``HG_PVAL`` and ``HG_QVAL``, sorted by ascending
        ``HG_QVAL`` and restricted to entries with ``HG_QVAL <= 0.05``.
        An empty list is returned when either input is empty.
    """
    # len() rather than truthiness: the inputs may be array-likes.
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if isinstance(total_gene_file_name, str):
        total_gene_list = [
            line.split("\t")[0] for line in load_gene_list(total_gene_file_name)
        ]
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = [
            line.split("\t")[0] for line in load_gene_list(tested_gene_file_name)
        ]
    else:
        tested_gene_list = tested_gene_file_name

    # Write the tested set (with a constant "set" column) and the background.
    df_tested = pd.DataFrame(index=ensembl2entrez_convertor(tested_gene_list))
    df_tested["set"] = 0
    df_tested_file_name = os.path.join(
        constants.OUTPUT_DIR, "_".join(["tested", algo, module]))
    df_bg_file_name = os.path.join(
        constants.OUTPUT_DIR, "_".join(["bg", algo, module]))
    df_tested.to_csv(df_tested_file_name, header=False, sep="\t")
    pd.DataFrame(index=ensembl2entrez_convertor(total_gene_list)).to_csv(
        df_bg_file_name, header=False, sep="\t")

    output_file_name = os.path.join(
        constants.OUTPUT_DIR, "output_{}_{}".format(algo, module))

    # Fill in the parameter template; context managers make sure the handles
    # are closed (and the parameter file flushed) before the tool starts.
    tango_dir = os.path.join(constants.ALGO_BASE_DIR, "tango")
    with open(os.path.join(tango_dir, "parameter_file.format")) as template_file:
        conf = template_file.read().format(
            SET=df_tested_file_name,
            BACKGROUND=df_bg_file_name,
            OUTPUT_FILE_NAME=output_file_name)
    conf_file_name = os.path.join(
        constants.OUTPUT_DIR,
        "parameter_file_{}_{}_{}".format(algo, module, time.time()))
    with open(conf_file_name, "w") as conf_file:
        conf_file.write(conf)

    # The output name is not unique per run: drop any leftover from a previous
    # run so that a failed execution cannot be mistaken for fresh results.
    if os.path.isfile(output_file_name):
        os.remove(output_file_name)

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    command = ["wine", "win/annot_sets.exe", conf_file_name]
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, cwd=tango_dir)
    except OSError as error:
        # Keep the best-effort behaviour: a tool that cannot be launched
        # simply yields no results.
        print("failed to launch TANGO: {}".format(error))
    else:
        # communicate() drains stdout and reaps the child process.
        tool_stdout, _ = process.communicate()
        print(tool_stdout)

    df_results = pd.DataFrame()
    if os.path.isfile(output_file_name) and os.path.getsize(output_file_name) > 1:
        df_results = pd.read_csv(
            output_file_name, sep="\t", index_col=False, header=None)

    # The output has no header row, so columns are addressed by position.
    # Columns 2 and 3 are exponentiated with base 10 to obtain the raw and
    # corrected p-values.
    # NOTE(review): HG_VALUE is read from column 5; the previous version of
    # this function also read column 4 into a variable it never used.
    # Confirm which column is intended.
    go_ns = []
    go_terms = []
    go_names = []
    raw_pvals = []
    fdrs = []
    hg_report = []
    for _, row in df_results.iterrows():
        raw_pval = 10 ** float(row[2])
        fdr = 10 ** float(row[3])
        go_ns.append("NA")
        go_terms.append(row[6])
        go_names.append(row[1])
        raw_pvals.append(raw_pval)
        fdrs.append(fdr)
        hg_report.append({
            HG_GO_ROOT: "NA",
            HG_GO_ID: row[6],
            HG_GO_NAME: row[1],
            HG_VALUE: row[5],
            HG_PVAL: raw_pval,
            HG_QVAL: fdr,
        })

    hg_report.sort(key=lambda entry: entry[HG_QVAL])
    hg_report = [entry for entry in hg_report if entry[HG_QVAL] <= 0.05]

    # The Excel summary lists every parsed term, not only the significant ones.
    output_rows = [
        ("\r\n".join(e2g_convertor(tested_gene_list)),
         "\r\n".join(go_ns),
         "\r\n".join(go_terms),
         "\r\n".join(go_names),
         "\r\n".join(map(str, raw_pvals)),
         "\r\n".join(map(str, fdrs)))
    ]
    print_to_excel(output_rows,
                   str(tested_gene_file_name)[:10],
                   str(total_gene_file_name)[:10])
    return hg_report