def find_clusters_and_survival(tested_gene_list_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean",
                               plot=True):
    """Partition the loaded expression data and run ``km_curve`` per partition.

    The data comes from ``load_integrated_ge_data``.  When ``is_unsupervised``
    is set, ``find_clusters`` is called once for ``start_k..end_k`` and
    ``km_curve`` is run on its result for every k in that range.  Otherwise,
    for every entry of ``meta_groups``, patients are split with
    ``divided_patient_ids_by_label``, a heatmap is plotted and ``km_curve``
    is run on that split.

    ``gene_filter_file_name`` and the ``*_path`` arguments are accepted but
    are not used by this function.

    NOTE(review): this source contains a second definition named
    ``find_clusters_and_survival`` further down; if both live in the same
    module the later one replaces this one -- confirm which is intended.

    Returns:
        ``None`` (after printing "insufficient data") when the loader returns
        ``None``; otherwise the tuple ``(results, clfs_results)`` where
        ``results`` collects element ``[0]`` of every ``km_curve`` return
        value and ``clfs_results`` is the ``find_clusters`` result (``None``
        in the supervised branch).
    """
    loaded = load_integrated_ge_data(
        tested_gene_list_file_name=tested_gene_list_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if loaded is None:
        print("insufficient data")
        return

    (expression, row_headers, column_headers,
     labels_assignment, survival_dataset) = loaded

    if is_unsupervised:
        clfs_results = find_clusters(
            end_k, expression, row_headers, start_k,
            e2g_convertor(column_headers), tested_gene_list_file_name,
            labels_assignment, clustering_algorithm=clustering_algorithm,
            plot=plot)
        # One survival analysis per requested number of clusters.
        results = [
            km_curve(clfs_results[k], survival_dataset[1:], row_headers,
                     tested_gene_list_file_name.split(".")[0], k)[0]
            for k in range(start_k, end_k + 1)
        ]
        return results, clfs_results

    results = []
    for group_index, cur_groups in enumerate(meta_groups):
        labeled_patients = divided_patient_ids_by_label(
            phenotype_file_name, groups=cur_groups)
        # The labelling under test is moved to the front of the label list.
        plot_heatmap(
            expression,
            e2g_convertor(column_headers),
            ([labels_assignment[group_index]]
             + labels_assignment[:group_index]
             + labels_assignment[group_index + 1:]),
            row_headers,
            tested_gene_list_file_name,
            label_index=group_index)
        curve = km_curve(labeled_patients, survival_dataset[1:], row_headers,
                         tested_gene_list_file_name.split(".")[0],
                         label_index=group_index)
        results.append(curve[0])
    return results, None
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster genes by expression and report enrichment for every cluster.

    Steps, in order:

    1. Load the expression rows of the tested genes, optionally keep only the
       patients selected by ``filter_expression``, and drop rows whose
       variance is below the ``var_th_index``-th largest row variance
       (no row is dropped when ``var_th_index`` is ``None``).
    2. If ``calc_go`` is set, download the GO ontology file when it is
       missing, build a ``GOEnrichmentStudy`` over ``total_gene_list`` and
       print the terms with FDR <= 0.05 for the retained genes.
    3. ``cluster_algorithm == "kmeans"``: for every k in ``start_k..end_k``
       run ``kmeanssample`` and, per cluster, collect the
       ``check_enrichment`` URL, significant GO terms and one
       hyper-geometric test per file in ``enrichment_list_file_names``.
    4. ``cluster_algorithm == "hierarchical"``: draw and save a seaborn
       clustermap of the transposed matrix.
    5. Plot one heatmap per patient labelling derived from ``meta_groups``
       and pass the collected rows to ``print_to_excel``.

    ``total_gene_list_path`` and ``phenotype_path`` are accepted but not used
    by this function.

    Returns:
        ``None``.  The function also returns early (after printing a message)
        when the second dimension of the filtered expression matrix is 1.
    """
    # fetch gene expression by gene_id, divided by tumor type
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    (tested_gene_expression_headers_rows,
     tested_gene_expression_headers_columns,
     tested_gene_expression) = separate_headers(tested_gene_expression)

    if filter_expression is not None:
        filtered_patients = [
            y
            for x in divided_patient_ids_by_label(phenotype_file_name,
                                                  groups=filter_expression)
            for y in x
        ]
        print("number of filtered patients from phenotypes: {}".format(
            len(filtered_patients)))
    else:
        print("no filter applied")
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = \
        filter_genes_dataset_by_patients(
            filtered_patients, tested_gene_expression_headers_columns,
            tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print("no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression))
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print("clustering patients by groups")
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)
    # Stays False for the default meta_groups=None so that the plotting
    # steps below are skipped instead of failing on None.
    has_patient_labels = (labels_assignment_patients is not None
                          and len(labels_assignment_patients) > 0)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        enrichment_lists = [load_gene_list(cur)
                            for cur in enrichment_list_file_names]

    # Variance threshold: rows strictly below the threshold are removed.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(
        tested_gene_expression, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = \
        tested_gene_expression_headers_columns

    # NOTE(review): clfs_results is filled below but never returned or read.
    clfs_results = {}
    output_rows = []
    go_study = None
    if calc_go:
        go_file_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
        if not os.path.exists(go_file_path):
            wget.download(constants.GO_OBO_URL, go_file_path)
        obo_dag = GODag(go_file_path)
        assoc = read_ncbi_gene2go(
            os.path.join(constants.GO_DIR,
                         constants.GO_ASSOCIATION_FILE_NAME),
            no_top=True)
        go_study = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
        g_res = go_study.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res
                      if cur.p_fdr_bh <= 0.05]
        print(GO_results)

    if cluster_algorithm == "kmeans":
        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            _centres, km_clf, _dist = kmeanssample(
                X=gene_expression_top_var, k=n_clusters, metric="euclidean")

            # Relabel clusters using the rank of their mean expression.
            # This does not depend on the cluster being reported, so it is
            # computed once per k rather than once per cluster.
            ranks = rankdata([
                np.average(
                    np.delete(gene_expression_top_var,
                              np.where(km_clf != j)[0], axis=0))
                for j in range(n_clusters)
            ])
            cluster_labels = np.array(km_clf)
            for j in range(n_clusters):
                cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
            labels_assignment = [cluster_labels + 1]

            for i in range(n_clusters):
                cluster_indices = np.where(km_clf != i)[0]
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows, cluster_indices,
                    axis=0)
                # NOTE(review): both tuple slots hold the gene headers; the
                # second was possibly meant to be the expression rows.
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_headers_row_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = go_study.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # zip() yields tuples; they must be lists because
                        # the enrichment-list results are appended below.
                        (go_ns, go_terms, go_names, uncorrectd_pvals,
                         FDRs) = [list(column) for column in zip(*GO_results)]

                for j, cur in enumerate(enrichment_lists):
                    go_terms.append(
                        enrichment_list_file_names[j].split(".")[0])
                    uncorrectd_pvals.append(
                        calc_HG_test(
                            [x.split(".")[0] for x in tested_gene_list],
                            [x.split(".")[0] for x in cur],
                            [x.split(".")[0]
                             for x in gene_headers_row_cluster]))
                    FDRs.append(".")
                    go_names.append(".")
                    go_ns.append(".")

                output_rows.append((
                    desc,
                    "\r\n".join([x.split(".")[0]
                                 for x in gene_headers_row_cluster]),
                    url,
                    "\r\n".join(go_ns),
                    "\r\n".join(go_terms),
                    "\r\n".join(go_names),
                    "\r\n".join(map(str, uncorrectd_pvals)),
                    "\r\n".join(map(str, FDRs))))

            # NOTE(review): placement of the two calls below (once per k)
            # is reconstructed from data flow -- confirm against the
            # original layout.
            gene_sorted_heatmap = np.rot90(
                np.flip(gene_expression_top_var[cluster_labels.argsort(), :],
                        1),
                k=-1, axes=(1, 0))
            find_clusters(end_k, gene_sorted_heatmap,
                          gene_expression_top_var_header_columns, start_k,
                          e2g_convertor(gene_expression_top_var_header_rows),
                          tested_gene_list_file_name,
                          labels_assignment=labels_assignment_patients)
            plot_heatmap(gene_expression_top_var,
                         gene_expression_top_var_header_columns,
                         labels_assignment,
                         gene_expression_top_var_header_rows,
                         tested_gene_list_file_name, n_clusters=None,
                         label_index=None, phenotype_heatmap=None)

    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1, axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)
        row_colors = None
        if has_patient_labels:
            # Only three colours are defined; labels beyond the third
            # unique value get no colour.
            dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
            row_colors = [dct.get(label)
                          for label in labels_assignment_patients[0]]
        cluster_grid = sns.clustermap(df, row_colors=row_colors,
                                      metric="euclidean", robust=True,
                                      method="single")
        den_genes = scipy.cluster.hierarchy.dendrogram(
            cluster_grid.dendrogram_col.linkage, labels=df.columns,
            color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)
        cluster_grid.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

    if has_patient_labels:
        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name, n_clusters=None,
                         label_index=None, phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
def patient_sets_distribution_differences(tested_gene_list_file_name,
                                          total_gene_list_file_name,
                                          gene_expression_file_name,
                                          phenotype_file_name,
                                          survival_file_name,
                                          gene_filter_file_name=None,
                                          tested_gene_list_path=None,
                                          total_gene_list_path=None,
                                          gene_expression_path=None,
                                          phenotype_path=None,
                                          gene_filter_file_path=None,
                                          var_th_index=None,
                                          is_unsupervised=True,
                                          start_k=2,
                                          end_k=2,
                                          meta_groups=None,
                                          filter_expression=None,
                                          clustering_algorithm="euclidean",
                                          average_patients=True,
                                          compute_hotelling=False):
    """Plot, and optionally test, expression differences between groups.

    Data is loaded with ``load_integrated_ge_data``.  For every entry of
    ``meta_groups`` the expression rows are reordered by that entry's label
    vector (which is sorted in place), PCA plots and a heatmap are drawn,
    and -- when ``compute_hotelling`` is set -- a tab-separated
    lower-triangular table of Hotelling T2 p-values (alpha 0.05) between
    every pair of label groups is written to
    ``<BASE_PROFILE>/output/hotelling_<cancer>_<gene list>_<time>.txt``.

    Args:
        average_patients: When true the heatmap shows one row per label
            holding the column-wise mean of that label's rows; otherwise it
            receives the expression matrix as loaded.
        compute_hotelling: Write the Hotelling table described above.
        meta_groups: Iterable of group definitions; must not be ``None``.

    The remaining file-name arguments are forwarded to the loader.
    ``gene_filter_file_name``, the ``*_path`` arguments, ``is_unsupervised``,
    ``start_k``, ``end_k`` and ``clustering_algorithm`` are accepted but not
    used by this function.

    Returns:
        ``None``; prints "insufficient data" and stops when the loader
        returns ``None``.
    """
    data = load_integrated_ge_data(
        tested_gene_list_file_name=tested_gene_list_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if data is None:
        print("insufficient data")
        return
    (gene_expression_top_var, gene_expression_top_var_headers_rows,
     gene_expression_top_var_headers_columns, labels_assignment,
     survival_dataset) = data

    for i, cur_groups in enumerate(meta_groups):
        labels = labels_assignment[i]
        # Reorder rows by label, then sort the labels in place so both line
        # up.
        ordered_gene_expression = gene_expression_top_var[labels.argsort(), :]
        labels.sort()

        unique_labels = np.unique(labels)
        # Rows of each group, selected by mask so that labels need not be
        # consecutive integers.
        rows_by_label = [ordered_gene_expression[labels == cur_label]
                         for cur_label in unique_labels]

        # NOTE(review): without averaging, the heatmap gets the matrix in
        # its loaded (unsorted) row order together with the unique labels --
        # confirm this is what plot_heatmap expects.
        heatmap_values = gene_expression_top_var
        if average_patients:
            group_means = [np.average(rows, axis=0) for rows in rows_by_label]
            heatmap_values = np.vstack(group_means) if group_means else None

        plot_pca(ordered_gene_expression, [labels], [cur_groups],
                 tested_gene_list_file_name=tested_gene_list_file_name)
        plot_pca_by_samples(
            ordered_gene_expression, [labels], [cur_groups],
            tested_gene_list_file_name=tested_gene_list_file_name)
        plot_heatmap(heatmap_values,
                     e2g_convertor(gene_expression_top_var_headers_columns),
                     [unique_labels], gene_expression_top_var_headers_rows,
                     tested_gene_list_file_name, label_index=i)

        if not compute_hotelling:
            continue

        # Build the whole table first so a failing test does not leave an
        # empty file behind.
        header = "\t" + "".join("group {}\t".format(cur_label)
                                for cur_label in unique_labels)
        table_lines = [header]
        for idx_1, cur_label_1 in enumerate(unique_labels):
            cells = ["group {}\t".format(cur_label_1)]
            # np.unique returns sorted values, so the first idx_1 + 1 groups
            # are exactly those with label <= cur_label_1 (lower triangle,
            # diagonal included).
            for rows_2 in rows_by_label[:idx_1 + 1]:
                T2 = spm1d.stats.hotellings2(rows_by_label[idx_1], rows_2)
                T2i = T2.inference(0.05)
                cells.append("{}\t".format(T2i.p))
            table_lines.append("".join(cells))
        output = "\n".join(table_lines) + "\n"

        output_path = os.path.join(
            constants.BASE_PROFILE, "output",
            "hotelling_{}_{}_{}.txt".format(
                constants.CANCER_TYPE,
                tested_gene_list_file_name.split(".")[0],
                time.time()))
        with open(output_path, "w+") as f:
            f.write(output)
def find_clusters_and_survival(reduced_dim_file_name,
                               total_gene_list_file_name,
                               gene_expression_file_name,
                               phenotype_file_name,
                               survival_file_name,
                               gene_filter_file_name=None,
                               tested_gene_list_path=None,
                               total_gene_list_path=None,
                               gene_expression_path=None,
                               phenotype_path=None,
                               gene_filter_file_path=None,
                               var_th_index=None,
                               is_unsupervised=True,
                               start_k=2,
                               end_k=2,
                               meta_groups=None,
                               filter_expression=None,
                               clustering_algorithm="euclidean"):
    """Run clustering or grouping plus ``km_curve`` on a reduced-dimension file.

    The data is loaded with ``load_integrated_ge_data`` using
    ``reduced_dim_file_name`` as the tested gene list and
    ``<OUTPUT_GLOBAL_DIR>/<gene_expression_file_name>`` as the expression
    path (the ``gene_expression_path`` argument itself is ignored).  The
    loaded matrix and its two header vectors are then swapped/transposed
    before use.

    With ``is_unsupervised`` set, ``find_clusters`` is called once and
    ``km_curve`` is run for every k in ``start_k..end_k``; otherwise a
    heatmap and a ``km_curve`` are produced for every entry of
    ``meta_groups``.

    NOTE(review): this source contains an earlier definition that is also
    named ``find_clusters_and_survival``; if both live in the same module
    this one replaces it -- confirm that is intended.

    Returns:
        ``None`` (after printing "insufficient data") when the loader
        returns ``None``; otherwise the list of ``km_curve`` return values.
    """
    loaded = load_integrated_ge_data(
        tested_gene_list_file_name=reduced_dim_file_name,
        total_gene_list_file_name=total_gene_list_file_name,
        gene_expression_file_name=gene_expression_file_name,
        gene_expression_path=os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                          gene_expression_file_name),
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        filter_expression=filter_expression)
    if loaded is None:
        print("insufficient data")
        return

    (expression, loaded_row_headers, loaded_column_headers,
     labels_assignment, survival_dataset) = loaded

    # Swap the roles of rows and columns, for the headers and the matrix.
    row_headers, column_headers = loaded_column_headers, loaded_row_headers
    expression = np.rot90(np.flip(expression, 1), k=-1, axes=(1, 0))

    if is_unsupervised:
        clfs_results = find_clusters(
            end_k, expression, row_headers, start_k,
            e2g_convertor(column_headers), reduced_dim_file_name,
            labels_assignment, clustering_algorithm=clustering_algorithm)
        return [
            km_curve(clfs_results[k], survival_dataset[1:], row_headers,
                     reduced_dim_file_name.split(".")[0], k)
            for k in range(start_k, end_k + 1)
        ]

    curves = []
    for group_index, cur_groups in enumerate(meta_groups):
        labeled_patients = divided_patient_ids_by_label(
            phenotype_file_name, groups=cur_groups)
        # The labelling under test is moved to the front of the label list.
        plot_heatmap(
            expression,
            e2g_convertor(column_headers),
            ([labels_assignment[group_index]]
             + labels_assignment[:group_index]
             + labels_assignment[group_index + 1:]),
            row_headers,
            reduced_dim_file_name,
            label_index=group_index)
        curves.append(
            km_curve(labeled_patients, survival_dataset[1:], row_headers,
                     reduced_dim_file_name.split(".")[0],
                     label_index=group_index))
    return curves
def predict_ge_by_mutation(tested_gene_list_file_name,
                           total_gene_list_file_name,
                           gene_expression_file_name,
                           phenotype_file_name,
                           survival_file_name,
                           mutation_file_name,
                           gene_filter_file_name=None,
                           tested_gene_list_path=None,
                           total_gene_list_path=None,
                           gene_expression_path=None,
                           phenotype_path=None,
                           gene_filter_file_path=None,
                           var_th_index=None,
                           is_unsupervised=True,
                           start_k=2,
                           end_k=2,
                           meta_groups=None,
                           phenotype_labels_heatmap=None,
                           filter_expression=None,
                           integ=False,
                           min_ratio=0.1,
                           included_mutation_gene_list=None,
                           excluded_mutation_gene_list=None):
    """Cluster or group patients by mutation counts, optionally with expression.

    A patients x mutated-genes count matrix is built from the data returned
    by ``load_integrated_mutation_data`` (counts are capped at 5).  Gene
    columns are then restricted by the optional include/exclude gene lists
    and columns mutated in at most ``min_ratio`` of the patients are
    dropped.  With ``integ`` set, the matrix is z-scored and joined
    column-wise with the z-scored expression matrix from
    ``load_integrated_ge_data`` for the patients present in both.

    With ``is_unsupervised`` set, ``find_clusters`` and one ``km_curve`` per
    k in ``start_k..end_k`` are run; otherwise a heatmap and a ``km_curve``
    are produced for every entry of ``meta_groups``.

    ``gene_filter_file_name`` and the ``*_path`` arguments are accepted but
    not used by this function.

    Returns:
        ``None``.  The function stops early when either loader returns
        ``None`` (printing "insufficient data") or when no gene column
        survives the sparsity filter.
    """
    integ_data = load_integrated_mutation_data(
        mutation_file_name=mutation_file_name,
        phenotype_file_name=phenotype_file_name,
        survival_file_name=survival_file_name,
        var_th_index=var_th_index,
        meta_groups=meta_groups,
        phenotype_labels_heatmap=phenotype_labels_heatmap,
        filter_expression=filter_expression)
    if integ_data is None:
        print("insufficient data")
        return
    (mu_data, mu_data_headers_rows, mu_data_headers_columns,
     labels_assignment, survival_dataset, phenotype_heatmap) = integ_data

    all_patients = np.unique(mu_data_headers_rows).flatten()
    all_mutated_genes = np.unique(mu_data[:, 0]).flatten()
    all_mutated_vectors = np.zeros((len(all_patients),
                                    len(all_mutated_genes)))

    print("build vectors from {} entries".format(len(mu_data)))
    stopwatch = Stopwatch()
    stopwatch.start()
    # Dictionary lookups instead of list.index() for every mutation entry.
    patient_row = {patient: row for row, patient in enumerate(all_patients)}
    gene_column = {gene: col for col, gene in enumerate(all_mutated_genes)}
    for patient, entry in zip(np.ravel(mu_data_headers_rows), mu_data):
        all_mutated_vectors[patient_row[patient],
                            gene_column[entry[0]]] += 1
    print(stopwatch.stop("end mut"))

    # Cap the per-gene mutation count of a patient at 5.
    all_mutated_vectors[all_mutated_vectors > 5] = 5

    if included_mutation_gene_list is not None:
        included_mutation_gene = load_gene_list(included_mutation_gene_list)
        included_mask = np.in1d(all_mutated_genes, included_mutation_gene)
        all_mutated_vectors = all_mutated_vectors[:, included_mask]
        all_mutated_genes = all_mutated_genes[included_mask]

    if excluded_mutation_gene_list is not None:
        excluded_mutation_gene = load_gene_list(excluded_mutation_gene_list)
        kept_mask = ~np.in1d(all_mutated_genes, excluded_mutation_gene)
        all_mutated_vectors = all_mutated_vectors[:, kept_mask]
        all_mutated_genes = all_mutated_genes[kept_mask]

    # Keep genes mutated in more than min_ratio of the patients.
    dense_mask = ((all_mutated_vectors != 0).sum(axis=0)
                  > np.shape(all_mutated_vectors)[0] * min_ratio)
    all_mutated_genes = all_mutated_genes[dense_mask]
    all_mutated_vectors = all_mutated_vectors[:, dense_mask]
    print("all_mutated_vectors after filter sparse: {}".format(
        np.shape(all_mutated_vectors)))
    if np.size(all_mutated_genes) == 0:
        return

    mutation_expression_integ = all_mutated_vectors
    mutual_patients = all_patients
    mutation_expression_integ_headers_columns = all_mutated_genes
    mutual_phenotype_heatmap = phenotype_heatmap

    if integ:
        ge_data = load_integrated_ge_data(
            tested_gene_list_file_name=tested_gene_list_file_name,
            total_gene_list_file_name=total_gene_list_file_name,
            gene_expression_file_name=gene_expression_file_name,
            phenotype_file_name=phenotype_file_name,
            survival_file_name=survival_file_name,
            var_th_index=var_th_index,
            meta_groups=meta_groups,
            filter_expression=filter_expression)
        if ge_data is None:
            print("insufficient data")
            return
        (gene_expression_top_var, gene_expression_top_var_headers_rows,
         gene_expression_top_var_headers_columns, _,
         survival_dataset) = ge_data

        all_mutated_vectors = np.nan_to_num(
            zscore(all_mutated_vectors, axis=0))
        gene_expression_top_var = np.nan_to_num(
            zscore(gene_expression_top_var, axis=0))

        # Mutation rows of the shared patients, put into sorted-id order.
        mutual_patients = np.array([
            x for x in all_patients
            if x in gene_expression_top_var_headers_rows
        ])
        mutual_mutations = all_mutated_vectors[
            np.in1d(all_patients, mutual_patients)]
        mutual_mutations = mutual_mutations[mutual_patients.argsort()]
        if phenotype_labels_heatmap is not None:
            mutual_phenotype_heatmap = phenotype_heatmap[
                np.in1d(all_patients, mutual_patients)]
            mutual_phenotype_heatmap = mutual_phenotype_heatmap[
                mutual_patients.argsort()]

        # Expression rows of the same patients, also in sorted-id order so
        # that both halves line up row by row.
        mutual_patients = np.array([
            x for x in gene_expression_top_var_headers_rows
            if x in all_patients
        ])
        mutual_expressions = gene_expression_top_var[
            np.in1d(gene_expression_top_var_headers_rows, mutual_patients)]
        mutual_expressions = mutual_expressions[mutual_patients.argsort()]
        mutual_patients.sort()

        mutation_expression_integ = np.c_[mutual_mutations,
                                          mutual_expressions]
        mutation_expression_integ_headers_columns = np.r_[
            all_mutated_genes, gene_expression_top_var_headers_columns]
    else:
        survival_dataset = np.array(load_survival_data(survival_file_name))

    if is_unsupervised:
        print("find clusters")
        clfs_results = find_clusters(
            end_k, mutation_expression_integ, mutual_patients, start_k,
            mutation_expression_integ_headers_columns,
            tested_gene_list_file_name,
            labels_assignment=labels_assignment,
            phenotype_heatmap=mutual_phenotype_heatmap)
        for cur_k in range(start_k, end_k + 1):
            # Pass the current k, not an index left over from the
            # mutation-counting loop.
            km_curve(clfs_results[cur_k], survival_dataset[1:],
                     mutual_patients,
                     tested_gene_list_file_name.split(".")[0], cur_k)
    else:
        for i, cur_groups in enumerate(meta_groups):
            labeled_patients = divided_patient_ids_by_label(
                phenotype_file_name, groups=cur_groups)
            # The labelling under test is moved to the front of the list.
            plot_heatmap(
                mutation_expression_integ,
                mutation_expression_integ_headers_columns,
                ([labels_assignment[i]] + labels_assignment[:i]
                 + labels_assignment[i + 1:]),
                mutual_patients,
                tested_gene_list_file_name,
                label_index=i)
            km_curve(labeled_patients, survival_dataset[1:], mutual_patients,
                     tested_gene_list_file_name.split(".")[0], label_index=i)