def load_top_gene_linreg_dict(config, num_top):
    """Load the leading rows of the ``top.txt`` result table into a dict.

    Each line of the file is split on single spaces. Column 0 is read as the
    gene name, column 1 as an integer cluster label, column 3 as a float
    metric and column 5 as a float slope.

    Args:
        config: Passed to ``get_result_path`` to locate ``top.txt``.
        num_top: Upper slice bound applied to the parsed rows.

    Returns:
        dict mapping gene name to ``[rank, metric, cluster, slope]`` where
        ``rank`` is the zero-based row position in the file.
    """
    fn = get_result_path(config, 'top.txt')

    rows = []
    # The context manager guarantees the handle is closed even if a line
    # fails to parse.
    with open(fn) as f:
        for line in f:
            cols = line.split(' ')
            gene = cols[0].rstrip()
            slope = float(cols[5].rstrip())
            metric = float(cols[3].rstrip())
            cluster = int(cols[1].rstrip())
            rows.append((gene, metric, cluster, slope))

    # The whole file is parsed before slicing, so a malformed line anywhere
    # in the file is still reported.
    top_dict = {}
    for rank, (gene, metric, cluster, slope) in enumerate(rows[0:num_top]):
        top_dict[gene] = [rank, metric, cluster, slope]
    return top_dict
def save_top_anova(config, num_top=500):
    """Rank CpGs by one-way ANOVA p-value and save CpG and gene tables.

    For every CpG the values are split into the groups given by
    ``get_attributes_dict(config)`` (each dict value is used as an index into
    the CpG's value vector) and ``scipy.stats.f_oneway`` is run on the groups.
    CpGs are ordered by ascending p-value. Genes are collected in that order
    through ``get_dict_cpg_gene(config)``, keeping the first (smallest)
    p-value seen for each gene.

    Args:
        config: Run configuration. ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is switched to
            ``DataType.gene`` for the second save, then set to
            ``DataType.cpg`` before returning.
        num_top: Maximum number of rows kept in each saved table.
    """
    attributes_dict = get_attributes_dict(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    pvals = []
    for cpg_id in range(len(cpgs)):
        # Convert once per CpG instead of once per group.
        curr_vals = np.asarray(vals[cpg_id])
        groups = [
            list(curr_vals[attributes_dict[key_age]])
            for key_age in attributes_dict
        ]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])

    genes_sorted = []
    pvals_genes = []
    # A set gives O(1) "already recorded" checks; the list keeps the order.
    seen_genes = set()
    for cpg, pval in zip(cpgs_sorted, pvals_sorted):
        for gene in dict_cpg_gene.get(cpg):
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)

    cpgs_sorted = cpgs_sorted[0:num_top]
    pvals_sorted = pvals_sorted[0:num_top]
    genes_sorted = genes_sorted[0:num_top]
    pvals_genes = pvals_genes[0:num_top]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, pvals_sorted])

    # The result path is recomputed after switching the config to gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted, pvals_genes])
    config.dt = DataType.cpg
def load_top_gene_names_by_cpg(config, method, num_top):
    """Read the first column of ``genes_from_cpg.txt`` and return its head.

    Args:
        config: Passed to ``get_result_path`` to locate the file.
        method: Not used by this function.
        num_top: Upper slice bound applied to the collected names.

    Returns:
        list of str: first space-separated field of each line, right-stripped,
        truncated to ``num_top`` entries.
    """
    fn = get_result_path(config, 'genes_from_cpg.txt')
    # Close the file deterministically instead of leaking the handle.
    with open(fn) as f:
        gene_names = [line.split(' ')[0].rstrip() for line in f]
    return gene_names[0:num_top]
def load_top_data(config, num_top, index):
    """Read one column of the ``top.txt`` result table and return its head.

    Args:
        config: Passed to ``get_result_path`` to locate ``top.txt``.
        num_top: Upper slice bound applied to the collected values.
        index: Position of the space-separated field to take from each line.

    Returns:
        list of str: the selected field of each line, right-stripped,
        truncated to ``num_top`` entries.

    Raises:
        IndexError: if a line has no field at ``index``.
    """
    fn = get_result_path(config, 'top.txt')
    # Close the file deterministically instead of leaking the handle.
    with open(fn) as f:
        gene_names = [line.split(' ')[index].rstrip() for line in f]
    return gene_names[0:num_top]
def save_top_spearman(config, num_top=500):
    """Rank CpGs by absolute Spearman correlation and save two tables.

    ``scipy.stats.spearmanr`` is run between the attributes returned by
    ``get_attributes(config)`` and every CpG value vector. CpGs are ordered
    by descending ``|rho|``; the signed rho is what gets saved.

    Args:
        config: Run configuration. ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is switched to
            ``DataType.gene`` for the second save, then set to
            ``DataType.cpg`` before returning.
        num_top: Maximum number of rows kept in each saved table.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for row in range(len(cpgs)):
        rho, _ = stats.spearmanr(attributes, vals[row])
        rhos.append(rho)

    # Strongest correlation (by magnitude) first.
    order = np.argsort([abs(rho) for rho in rhos])[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    rhos_sorted = list(np.array(rhos)[order])

    # NOTE(review): genes are appended without de-duplication here, so a gene
    # linked to several CpGs appears several times -- confirm this is intended.
    genes_sorted = []
    rhos_genes = []
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        for gene in dict_cpg_gene.get(cpg):
            genes_sorted.append(gene)
            rhos_genes.append(rho)

    cpg_table = [cpgs_sorted[0:num_top], rhos_sorted[0:num_top]]
    gene_table = [genes_sorted[0:num_top], rhos_genes[0:num_top]]

    save_features(get_result_path(config, 'top.txt'), cpg_table)

    # The result path is recomputed after switching the config to gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    save_features(get_result_path(config, 'top.txt'), gene_table)
    config.dt = DataType.cpg
def load_top_cpg_data(config, method, num_top):
    """Load the values of the top-ranked CpGs from the average-beta table.

    The CpG names come from the first column of ``top.txt`` (truncated to
    ``num_top``). The data file ``<db_type.value>_average_beta.txt`` is then
    scanned line by line; each line is parsed with ``line_proc`` and its
    numeric columns are restricted to ``config.indexes``.

    Args:
        config: Supplies ``indexes``, ``db_type``, ``print_rate`` and
            ``num_skip_lines`` and is passed to the path/parsing helpers.
        method: Not used by this function.
        num_top: Upper slice bound applied to the CpG names.

    Returns:
        tuple ``(cpgs_top, vals_top)``: the CpG names and, in the same order,
        their value lists. A CpG that never occurs in the data file gets
        ``None`` in ``vals_top``.
    """
    indexes = config.indexes
    db_type = config.db_type
    print_rate = config.print_rate

    fn = get_result_path(config, 'top.txt')
    with open(fn) as f:
        cpgs_top = [line.split(' ')[0].rstrip() for line in f]
    cpgs_top = cpgs_top[0:num_top]
    # Set for O(1) membership tests while scanning the (large) data file.
    wanted = set(cpgs_top)

    path = get_path(config, db_type.value + '_average_beta.txt')
    dict_top = {}
    with open(path) as f:
        # Skip the header lines.
        for _ in range(config.num_skip_lines):
            f.readline()

        num_lines = 0
        for line in f:
            col_vals = line_proc(config, line)
            cpg = col_vals[0]
            vals = list(map(float, col_vals[1::]))
            vals = list(np.array(vals)[indexes])
            if cpg in wanted:
                dict_top[cpg] = vals
            num_lines += 1
            if num_lines % print_rate == 0:
                print('num_lines: ' + str(num_lines))

    vals_top = [dict_top.get(cpg) for cpg in cpgs_top]
    return cpgs_top, vals_top
def save_top_linreg(config):
    """Rank genes by ``|r|`` of a linear regression and save them clustered.

    ``scipy.stats.linregress(attributes, values)`` is run for every gene.
    Genes are ordered by descending absolute r-value, and the absolute
    r-values are clustered twice (MeanShift and AffinityPropagation). The
    labels of each clustering are passed through ``clustering_order`` before
    being saved.

    Saved columns, in order: gene, mean-shift cluster, affinity-propagation
    cluster, r-value, p-value, slope, intercept.

    Args:
        config: Run configuration passed to the data and path helpers.
    """
    attributes = get_attributes(config)
    genes, vals = load_gene_data(config)

    slopes, intercepts, r_values, p_values = [], [], [], []
    for gene_id in range(len(genes)):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            attributes, vals[gene_id])
        r_values.append(r_value)
        p_values.append(p_value)
        slopes.append(slope)
        intercepts.append(intercept)

    # Largest |r| first.
    order_mean = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        """Return *column* rearranged into the |r| ranking."""
        return list(np.array(column)[order_mean])

    p_values_sorted = ranked(p_values)
    r_values_sorted = ranked(r_values)
    slopes_sorted = ranked(slopes)
    intercepts_sorted = ranked(intercepts)
    genes_sorted = ranked(genes)

    # The clusterers are fed a single-feature column vector of |r|.
    abs_r = np.asarray([abs(r) for r in r_values_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(abs_r),
                           bin_seeding=True)
    mean_shift.fit(abs_r)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(abs_r)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        r_values_sorted,
        p_values_sorted,
        slopes_sorted,
        intercepts_sorted,
    ])
def save_top_anova(config):
    """Rank genes by one-way ANOVA p-value and save them clustered.

    For every gene the values are split into the groups given by
    ``get_attributes_dict(config)`` and ``scipy.stats.f_oneway`` is run on
    them. Genes are ordered by ascending p-value, and ``log10(p)`` is
    clustered twice (MeanShift and AffinityPropagation). The labels of each
    clustering are passed through ``clustering_order`` before being saved.

    Saved columns, in order: gene, mean-shift cluster, affinity-propagation
    cluster, p-value.

    Args:
        config: Run configuration passed to the data and path helpers.
    """
    gene_names, gene_vals = load_gene_data(config)
    attributes_dict = get_attributes_dict(config)

    pvals = []
    for gene_id in range(len(gene_names)):
        row = gene_vals[gene_id]
        groups = {}
        for key_age in attributes_dict:
            groups[key_age] = list(np.asarray(row)[attributes_dict[key_age]])
        pvals.append(stats.f_oneway(*groups.values()).pvalue)

    order = np.argsort(pvals)
    genes_sorted = list(np.array(gene_names)[order])
    pvals_sorted = list(np.array(pvals)[order])

    # The clusterers are fed a single-feature column vector of log10(p).
    # NOTE(review): a p-value of exactly 0 would give -inf here -- confirm
    # whether that can occur for the data this runs on.
    log_pvals = np.asarray(
        [np.log10(p) for p in pvals_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(log_pvals),
                           bin_seeding=True)
    mean_shift.fit(log_pvals)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(log_pvals)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        pvals_sorted,
    ])
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often each CpG is a top ElasticNet feature across splits.

    For every ``ShuffleSplit`` split an ``ElasticNet`` (``alpha`` and
    ``l1_ratio`` taken from ``load_params_dict(config)``) is fitted on the
    training part. The ``num_top`` CpGs with the largest absolute
    coefficients each get their counter incremented. CpGs are then ordered by
    descending count and mapped to genes, each gene keeping the count of the
    first CpG that mentions it.

    Args:
        config: Run configuration. ``config.test_part`` sets the test
            fraction. ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is switched to
            ``DataType.gene`` for the second save, then set to
            ``DataType.cpg`` before returning.
        num_bootstrap_runs: Number of random train/test splits.
        num_top: Number of leading features counted per split. Fewer are
            counted when fewer features exist.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')
    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: test_size/train_size are keyword-only in current
    # scikit-learn releases; older releases accept keywords as well.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Build the arrays once instead of on every split. Rows are samples and
    # columns are CpGs.
    features = np.ascontiguousarray(np.array(vals_passed).T)
    targets = np.array(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(features[train_index], targets[train_index])
        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Iterating the slice (not range(num_top)) avoids an IndexError when
        # there are fewer than num_top features.
        for cpg in cpg_names[order][0:num_top]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(counts)[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    # A set gives O(1) "already recorded" checks; the list keeps the order.
    seen_genes = set()
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        for gene in dict_cpg_gene.get(cpg):
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                counts_genes.append(count)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, counts_sorted])

    # The result path is recomputed after switching the config to gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted, counts_genes])
    config.dt = DataType.cpg
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save CpG and gene tables.

    ``scipy.stats.linregress(cpg_values, attributes)`` is run for every CpG,
    so the CpG values are the x variable. CpGs are ordered by ascending
    p-value. Genes are collected in that order through
    ``get_dict_cpg_gene(config)``, each gene keeping the statistics of the
    first CpG that mentions it.

    Saved columns, in order: name, p-value, r-value, slope, intercept.

    Args:
        config: Run configuration. ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is switched to
            ``DataType.gene`` for the second save, then set to
            ``DataType.cpg`` before returning.
        num_top: Maximum number of rows kept in each saved table.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes = []
    intercepts = []
    rvals = []
    pvals = []
    for cpg_id in range(len(cpgs)):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            vals[cpg_id], attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])
    slopes_sorted = list(np.array(slopes)[order])
    intercepts_sorted = list(np.array(intercepts)[order])
    rvals_sorted = list(np.array(rvals)[order])

    genes_sorted = []
    pvals_genes = []
    slopes_genes = []
    intercepts_genes = []
    rvals_genes = []
    # A set gives O(1) "already recorded" checks; the lists keep the order.
    seen_genes = set()
    per_cpg = zip(cpgs_sorted, pvals_sorted, slopes_sorted,
                  intercepts_sorted, rvals_sorted)
    for cpg, pval, slope, intercept, rval in per_cpg:
        for gene in dict_cpg_gene.get(cpg):
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)
                slopes_genes.append(slope)
                intercepts_genes.append(intercept)
                rvals_genes.append(rval)

    cpgs_sorted = cpgs_sorted[0:num_top]
    pvals_sorted = pvals_sorted[0:num_top]
    slopes_sorted = slopes_sorted[0:num_top]
    intercepts_sorted = intercepts_sorted[0:num_top]
    rvals_sorted = rvals_sorted[0:num_top]
    genes_sorted = genes_sorted[0:num_top]
    pvals_genes = pvals_genes[0:num_top]
    slopes_genes = slopes_genes[0:num_top]
    intercepts_genes = intercepts_genes[0:num_top]
    rvals_genes = rvals_genes[0:num_top]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        cpgs_sorted, pvals_sorted, rvals_sorted, slopes_sorted,
        intercepts_sorted
    ])

    # The result path is recomputed after switching the config to gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, pvals_genes, rvals_genes, slopes_genes,
        intercepts_genes
    ])
    config.dt = DataType.cpg
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Find CpGs whose regression slope changes most across an age limit.

    Two deep copies of ``config`` are restricted with ``age_less`` and
    ``age_more`` at ``limit``. For every CpG of the "less" data set a linear
    regression of values on attributes is fitted in both subsets. A CpG is
    kept when both p-values are below ``pval``; kept CpGs are ordered by
    descending ``|slope_less - slope_more|`` and the first ``num_opt`` are
    written to ``bend_<limit>.txt``. The values of those CpGs, loaded with a
    ``Gender.any`` configuration, are written to ``bend_data_<limit>.txt``.

    Args:
        config: Base run configuration (not modified; copies are used).
        limit: Threshold passed to ``age_less`` / ``age_more``; also part of
            the output file names.
        pval: Exclusive upper bound for both regression p-values.
        num_opt: Maximum number of CpGs written.

    Raises:
        ValueError: if a CpG of the "less" data set is missing from the
            "more" data set, or a selected CpG is missing from the raw data.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)

    # Name -> first row index, replacing a linear list.index() per CpG.
    row_in_m = {}
    for row, name in enumerate(cpg_names_m):
        row_in_m.setdefault(name, row)

    cpgs_passed = []
    genes_passed = []
    angles = []
    slope_ls = []
    intercept_ls = []
    r_value_ls = []
    p_value_ls = []
    std_err_ls = []
    slope_ms = []
    intercept_ms = []
    r_value_ms = []
    p_value_ms = []
    std_err_ms = []

    num_cpgs = 0
    for cpg_id_l in range(len(cpg_names_l)):
        cpg_name = cpg_names_l[cpg_id_l]
        if cpg_name not in row_in_m:
            # Same exception type list.index() raises.
            raise ValueError(repr(cpg_name) + ' is not in list')
        vals_l = cpg_vals_l[cpg_id_l]
        vals_m = cpg_vals_m[row_in_m[cpg_name]]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)
        angle = abs(slope_l - slope_m)

        # Both fits have to be significant at the requested level.
        if max(p_value_l, p_value_m) < pval:
            cpgs_passed.append(cpg_name)
            genes = cpg_gene_dict.get(cpg_name)
            # A missing annotation (empty list or empty first entry) is
            # written as the literal 'nan'.
            if len(genes) > 0 and genes[0] != '':
                genes_passed.append(";".join(genes))
            else:
                genes_passed.append('nan')
            angles.append(angle)
            slope_ls.append(slope_l)
            intercept_ls.append(intercept_l)
            r_value_ls.append(r_value_l)
            p_value_ls.append(p_value_l)
            std_err_ls.append(std_err_l)
            slope_ms.append(slope_m)
            intercept_ms.append(intercept_m)
            r_value_ms.append(r_value_m)
            p_value_ms.append(p_value_m)
            std_err_ms.append(std_err_m)

        num_cpgs += 1
        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    # Largest slope difference first, capped at num_opt rows.
    order = np.argsort(angles)[::-1][0:num_opt]

    def ranked(column):
        """Return *column* rearranged into the slope-difference ranking."""
        return list(np.array(column)[order])

    cpgs_opt = ranked(cpgs_passed)
    genes_opt = ranked(genes_passed)
    angles_opt = ranked(angles)
    slope_ls_opt = ranked(slope_ls)
    intercept_ls_opt = ranked(intercept_ls)
    r_value_ls_opt = ranked(r_value_ls)
    p_value_ls_opt = ranked(p_value_ls)
    std_err_ls_opt = ranked(std_err_ls)
    slope_ms_opt = ranked(slope_ms)
    intercept_ms_opt = ranked(intercept_ms)
    r_value_ms_opt = ranked(r_value_ms)
    p_value_ms_opt = ranked(p_value_ms)
    std_err_ms_opt = ranked(std_err_ms)

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, [
        cpgs_opt, genes_opt, angles_opt, slope_ls_opt, intercept_ls_opt,
        r_value_ls_opt, p_value_ls_opt, std_err_ls_opt, slope_ms_opt,
        intercept_ms_opt, r_value_ms_opt, p_value_ms_opt, std_err_ms_opt
    ])

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)
    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)

    # Name -> first row index in the raw data.
    row_in_raw = {}
    for row, name in enumerate(cpg_name_raw):
        row_in_raw.setdefault(name, row)

    cpg_str_list = []
    for cpg in cpgs_opt:
        if cpg not in row_in_raw:
            raise ValueError(repr(cpg) + ' is not in list')
        cpg_vals = cpg_vals_raw[row_in_raw[cpg]]
        # One output line: the CpG name followed by its values in
        # scientific notation.
        fields = [cpg] + [format(value, '0.8e') for value in cpg_vals]
        cpg_str_list.append(' '.join(fields))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")
def save_bend_linreg(config, limit, pval):
    """Find genes whose regression slope changes most across an age limit.

    Two deep copies of ``config`` are restricted with ``age_less`` and
    ``age_more`` at ``limit``. For every gene of the "less" data set a linear
    regression of values on attributes is fitted in both subsets. A gene is
    kept when both p-values are below ``pval``; kept genes are ordered by
    descending ``|slope_less - slope_more|`` and written to
    ``bend_<limit>.txt``.

    Args:
        config: Base run configuration (not modified; copies are used).
        limit: Threshold passed to ``age_less`` / ``age_more``; also part of
            the output file name.
        pval: Exclusive upper bound for both regression p-values.

    Raises:
        ValueError: if a gene of the "less" data set is missing from the
            "more" data set.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # Name -> first row index, replacing a linear list.index() per gene.
    row_in_m = {}
    for row, name in enumerate(g_names_m):
        row_in_m.setdefault(name, row)

    genes_passed = []
    angles = []
    slope_ls = []
    intercept_ls = []
    r_value_ls = []
    p_value_ls = []
    std_err_ls = []
    slope_ms = []
    intercept_ms = []
    r_value_ms = []
    p_value_ms = []
    std_err_ms = []

    for g_id_l in range(len(g_names_l)):
        gene_name = g_names_l[g_id_l]
        if gene_name not in row_in_m:
            # Same exception type list.index() raises.
            raise ValueError(repr(gene_name) + ' is not in list')
        vals_l = g_vals_l[g_id_l]
        vals_m = g_vals_m[row_in_m[gene_name]]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)
        angle = abs(slope_l - slope_m)

        # Both fits have to be significant at the requested level.
        if max(p_value_l, p_value_m) < pval:
            genes_passed.append(gene_name)
            angles.append(angle)
            slope_ls.append(slope_l)
            intercept_ls.append(intercept_l)
            r_value_ls.append(r_value_l)
            p_value_ls.append(p_value_l)
            std_err_ls.append(std_err_l)
            slope_ms.append(slope_m)
            intercept_ms.append(intercept_m)
            r_value_ms.append(r_value_m)
            p_value_ms.append(p_value_m)
            std_err_ms.append(std_err_m)

    # Largest slope difference first.
    order = np.argsort(angles)[::-1]

    def ranked(column):
        """Return *column* rearranged into the slope-difference ranking."""
        return list(np.array(column)[order])

    genes_opt = ranked(genes_passed)
    angles_opt = ranked(angles)
    slope_ls_opt = ranked(slope_ls)
    intercept_ls_opt = ranked(intercept_ls)
    r_value_ls_opt = ranked(r_value_ls)
    p_value_ls_opt = ranked(p_value_ls)
    std_err_ls_opt = ranked(std_err_ls)
    slope_ms_opt = ranked(slope_ms)
    intercept_ms_opt = ranked(intercept_ms)
    r_value_ms_opt = ranked(r_value_ms)
    p_value_ms_opt = ranked(p_value_ms)
    std_err_ms_opt = ranked(std_err_ms)

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, [
        genes_opt, angles_opt, slope_ls_opt, intercept_ls_opt,
        r_value_ls_opt, p_value_ls_opt, std_err_ls_opt, slope_ms_opt,
        intercept_ms_opt, r_value_ms_opt, p_value_ms_opt, std_err_ms_opt
    ])
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene is a top ElasticNet feature across splits.

    For every ``ShuffleSplit`` split an ``ElasticNet`` (``alpha`` and
    ``l1_ratio`` taken from ``load_params_dict(config)``) is fitted on the
    training part. The ``num_top`` genes with the largest absolute
    coefficients each get their counter incremented. Genes are ordered by
    descending count, and the counts are clustered twice (MeanShift and
    AffinityPropagation). The labels of each clustering are passed through
    ``clustering_order`` before being saved.

    Saved columns, in order: gene, mean-shift cluster, affinity-propagation
    cluster, count.

    Args:
        config: Run configuration; ``config.test_part`` sets the test
            fraction.
        num_bootstrap_runs: Number of random train/test splits.
        num_top: Number of leading features counted per split. Fewer are
            counted when fewer features exist.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')
    attributes = get_attributes(config)
    genes_passed, vals_passed = load_gene_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: test_size/train_size are keyword-only in current
    # scikit-learn releases; older releases accept keywords as well.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Build the arrays once instead of on every split. Rows are samples and
    # columns are genes.
    features = np.ascontiguousarray(np.array(vals_passed).T)
    targets = np.array(attributes)
    gene_names = np.array(genes_passed)

    gene_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(features[train_index], targets[train_index])
        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Iterating the slice (not range(num_top)) avoids an IndexError when
        # there are fewer than num_top features.
        for gene in gene_names[order][0:num_top]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())
    order = np.argsort(counts)[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    # The clusterers are fed a single-feature column vector of counts.
    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop,
        counts_sorted
    ])
geo=geo, cpg_condition=cpg_condition) attributes = get_attributes(config) cpgs, vals = load_cpg_data(config) num_int = 200 int_begin = 0 int_end = 1 int_shift = (int_end - int_begin) / num_int ints = [] pdf = np.zeros(num_int) for int_id in range(0, num_int): ints.append(int_begin + int_id * int_shift + 0.5 * int_shift) for curr_cpg_vals in vals: for beta in curr_cpg_vals: int_id = math.floor((beta - int_begin) * num_int / (int_end - int_begin + 1.0e-8)) pdf[int_id] += 1 pdf = np.asarray(pdf) sum_pdf = np.sum(pdf) pdf = pdf / (sum_pdf * int_shift) print('pdf norm: ' + str(np.sum(pdf) * int_shift)) fn = 'top.txt' fn = get_result_path(config, fn) save_features(fn, [ints, pdf]) config.dt = DataType.cpg