def save_plane_linreg(config, num_top=100, gd_type_x=GeneDataType.mean, gd_type_y=GeneDataType.mean):
    """Save, per gene, the r-values of two linear regressions against the attributes.

    Gene names come from ``load_top_gene_names`` under ``Scenario.approach``.
    Values are then loaded twice under ``Scenario.validation``: once with
    ``config.approach_gd = gd_type_x`` and once with ``gd_type_y``.  Each value
    series is regressed against the attributes and ``[names, r_x, r_y]`` is
    written to ``plane.txt`` in the result path.

    Side effects: ``config.scenario`` is left as ``Scenario.validation`` and
    ``config.approach_gd`` as ``gd_type_y``.
    """
    target = get_attributes(config)

    config.scenario = Scenario.approach
    gene_names = load_top_gene_names(config, num_top)
    config.scenario = Scenario.validation

    config.approach_gd = gd_type_x
    series_x = load_top_gene_vals(config, gene_names)
    config.approach_gd = gd_type_y
    series_y = load_top_gene_vals(config, gene_names)

    r_values_x = []
    r_values_y = []
    for pos in range(len(gene_names)):
        # Only the correlation coefficient of each fit is kept.
        _, _, r_x, _, _ = stats.linregress(series_x[pos], target)
        r_values_x.append(r_x)
        _, _, r_y, _, _ = stats.linregress(series_y[pos], target)
        r_values_y.append(r_y)

    out_path = get_result_path(config, 'plane.txt')
    save_features(out_path, [gene_names, r_values_x, r_values_y])
def save_simple_linreg(config, num_top=100):
    """Regress the attributes on every loaded gene series and save the fit metrics.

    Gene data is loaded under ``Scenario.approach``; afterwards
    ``config.scenario`` is set to ``Scenario.validation``, so the result path
    is resolved for that scenario.  Rows are ordered by decreasing ``|r|`` and
    written to ``metrics.txt`` as
    ``[names, p_values, r_values, slopes, intercepts]``.
    """
    target = get_attributes(config)

    config.scenario = Scenario.approach
    gene_names, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    slopes, intercepts, r_values, p_values = [], [], [], []
    for pos in range(len(gene_names)):
        slope, intercept, r_value, p_value, _ = stats.linregress(gene_vals[pos], target)
        slopes.append(slope)
        intercepts.append(intercept)
        r_values.append(r_value)
        p_values.append(p_value)

    # Descending by absolute correlation.
    ranking = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        return list(np.array(column)[ranking])

    out_path = get_result_path(config, 'metrics.txt')
    save_features(out_path, [
        ranked(gene_names),
        ranked(p_values),
        ranked(r_values),
        ranked(slopes),
        ranked(intercepts),
    ])
def save_params_enet(config, num_folds=10):
    """Compute elastic-net parameters from the CpG data and save them to ``params.txt``.

    The parameter names and values returned by ``get_enet_params`` are written
    to the path given by ``get_param_path``.
    """
    target = get_attributes(config)
    _, cpg_vals = load_cpg_data(config)

    # NOTE(review): arguments are passed as (attributes, values); another
    # save_params_enet variant in this file passes (values, attributes) --
    # confirm against the get_enet_params signature in scope here.
    names, values = get_enet_params(target, cpg_vals, num_folds)

    save_features(get_param_path(config, 'params.txt'), [names, values])
def save_error_from_age(config, num_top=100):
    """Save per-age prediction errors of a multiple linear regression.

    A model is built with ``linreg_mult(attributes, vals)`` from the gene data
    loaded under ``Scenario.approach``.  For every integer age in ``[0, 150)``
    that occurs among the attributes, the samples with that exact age are
    predicted and:

    * ``error_from_age.txt`` receives ``[ages, mean absolute errors]``;
    * ``errors.txt`` receives one line per age: the age followed by each
      prediction formatted as ``0.8e``.

    Finally the signed errors are regressed on age and the fit is printed.
    ``config.scenario`` is left as ``Scenario.validation``.
    """
    target = get_attributes(config)

    config.scenario = Scenario.approach
    _, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    model = linreg_mult(target, gene_vals)

    ages, maes, error_lines = [], [], []
    reg_x, reg_y = [], []
    for age in range(150):
        rows = [pos for pos, value in enumerate(target) if value == age]
        if not rows:
            continue

        ages.append(age)
        # Samples are columns of gene_vals; pick the ones with this age.
        samples = np.array(gene_vals).T[rows].tolist()
        predicted = model.get_prediction(samples).predicted_mean

        abs_error_sum = 0
        fields = [str(age)]
        for pred_age in predicted:
            abs_error_sum += abs(pred_age - age)
            fields.append(format(pred_age, '0.8e'))
            reg_x.append(age)
            reg_y.append(pred_age - age)

        error_lines.append(' '.join(fields))
        maes.append(abs_error_sum / len(rows))

    save_features(get_result_path(config, 'error_from_age.txt'), [ages, maes])
    np.savetxt(get_result_path(config, 'errors.txt'), error_lines, fmt="%s")

    slope, intercept, r_value, p_value, std_err = stats.linregress(reg_x, reg_y)
    print('slope: ' + str(slope))
    print('intercept: ' + str(intercept))
    print('r_value: ' + str(r_value))
    print('p_value: ' + str(p_value))
    print('std_err: ' + str(std_err))
def save_params_enet(config, num_folds=10):
    """Compute elastic-net parameters from the gene data and save them to ``params.txt``.

    The parameter names and values returned by ``get_enet_params`` are written
    to the path given by ``get_param_path``.
    """
    target = get_attributes(config)
    _, gene_vals = load_gene_data(config)

    # NOTE(review): arguments are passed as (values, attributes); another
    # save_params_enet variant in this file passes (attributes, values) --
    # confirm against the get_enet_params signature in scope here.
    names, values = get_enet_params(gene_vals, target, num_folds)

    save_features(get_param_path(config, 'params.txt'), [names, values])
def save_gene_by_cpg(config, fn):
    """Map the CpGs listed in ``fn + '.txt'`` to genes and save the unique genes.

    The input file is read as one CpG name per line.  CpGs that are not keys
    of the CpG-to-gene dictionary are ignored.  Genes are written to
    ``fn + '_genes.txt'`` in first-seen order, each gene once.

    Args:
        config: passed to ``get_dict_cpg_gene``.
        fn: path prefix shared by the input and output files (no extension).
    """
    cpg_gene_dict = get_dict_cpg_gene(config)

    # Context manager so the handle is closed even if reading fails.
    with open(fn + '.txt') as cpg_file:
        target_cpgs = cpg_file.read().splitlines()

    genes = []
    seen = set()  # O(1) membership test; `genes` keeps first-seen order
    for cpg in target_cpgs:
        for gene in cpg_gene_dict.get(cpg, ()):
            if gene not in seen:
                seen.add(gene)
                genes.append(gene)

    save_features(fn + '_genes.txt', genes)
def save_top_anova(config, num_top=500):
    """Rank CpGs by one-way ANOVA p-value and save the top CpGs and genes.

    For every CpG the values are split into groups using the index lists in
    ``get_attributes_dict(config)`` and tested with ``scipy.stats.f_oneway``.
    CpGs are sorted by ascending p-value.  Genes are collected by walking the
    sorted CpGs; each gene is kept once, with the p-value of the first CpG
    that introduced it.

    Two files named ``top.txt`` are written: ``[cpgs, pvals]`` under the
    configuration as passed in, and ``[genes, pvals]`` after switching
    ``config`` to gene mode.  Both are truncated to ``num_top`` rows.

    Side effects: ``config.approach_gd`` is set to ``GeneDataType.from_cpg``
    and ``config.dt`` is left as ``DataType.cpg``.
    """
    attributes_dict = get_attributes_dict(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    pvals = []
    for row in range(len(cpgs)):
        betas = np.asarray(vals[row])
        groups = [list(betas[attributes_dict[key]]) for key in attributes_dict]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])

    genes_sorted = []
    pvals_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for cpg, pval in zip(cpgs_sorted, pvals_sorted):
        # A CpG without an entry in the dictionary contributes no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted[0:num_top], pvals_sorted[0:num_top]])

    # The gene-level file is resolved with the configuration in gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted[0:num_top], pvals_genes[0:num_top]])
    config.dt = DataType.cpg
def save_top_spearman(config, num_top=500):
    """Rank CpGs by absolute Spearman correlation with the attributes.

    CpGs are ordered by decreasing ``|rho|``.  CpGs whose correlation is
    undefined (NaN, e.g. a constant value series) are placed last; a plain
    ``argsort()[::-1]`` would put them first.

    Two files named ``top.txt`` are written: ``[cpgs, rhos]`` under the
    configuration as passed in, and ``[genes, rhos]`` after switching
    ``config`` to gene mode.  Both are truncated to ``num_top`` rows.

    NOTE(review): the gene list is not de-duplicated here, unlike the other
    CpG-level ``save_top_*`` writers in this file -- confirm that repeated
    genes are intended.

    Side effects: ``config.approach_gd`` is set to ``GeneDataType.from_cpg``
    and ``config.dt`` is left as ``DataType.cpg``.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for row in range(len(cpgs)):
        rho, _ = stats.spearmanr(attributes, vals[row])
        rhos.append(rho)

    abs_rhos = np.abs(np.asarray(rhos, dtype=float))
    order = np.argsort(abs_rhos)[::-1]
    # argsort sorts NaN to the end, so the reversal moves it to the front;
    # push those entries back to the tail, keeping the finite ordering intact.
    undefined = np.isnan(abs_rhos[order])
    order = np.concatenate((order[~undefined], order[undefined]))

    cpgs_sorted = list(np.array(cpgs)[order])
    rhos_sorted = list(np.array(rhos)[order])

    genes_sorted = []
    rhos_genes = []
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        # A CpG without an entry in the dictionary contributes no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            genes_sorted.append(gene)
            rhos_genes.append(rho)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted[0:num_top], rhos_sorted[0:num_top]])

    # The gene-level file is resolved with the configuration in gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted[0:num_top], rhos_genes[0:num_top]])
    config.dt = DataType.cpg
def save_simple_linreg_mult(config, num_bootstrap_runs=500, num_top=100):
    """Save R2-by-count and validation metrics, then print the model summary.

    Gene data is loaded under ``Scenario.approach``; results are written
    after ``config.scenario`` has been set to ``Scenario.validation``:

    * ``R2s_<num_top>.txt`` -- the two lists returned by ``R2_from_count``;
    * ``metrics_<num_top>.txt`` -- names and values returned by
      ``validation_metrics``, called with a test size of
      ``int(len(attributes) * config.test_part)`` and the remainder as
      train size.

    The ``summary()`` of ``linreg_mult_with_const`` is printed last.
    """
    target = get_attributes(config)

    config.scenario = Scenario.approach
    _, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    counts, r2_scores = R2_from_count(gene_vals, target)
    save_features(get_result_path(config, f'R2s_{num_top}.txt'), [counts, r2_scores])

    n_test = int(len(target) * config.test_part)
    n_train = len(target) - n_test
    metric_names, metric_vals = validation_metrics(
        gene_vals, target, n_test, n_train, num_bootstrap_runs)
    save_features(get_result_path(config, f'metrics_{num_top}.txt'),
                  [metric_names, metric_vals])

    print(linreg_mult_with_const(target, gene_vals).summary())
def save_top_linreg(config):
    """Rank genes by ``|r|`` of a linear fit on the attributes and cluster them.

    Each gene series is regressed on the attributes (attributes are the
    independent variable).  Genes are ordered by decreasing ``|r|``; the
    ordered ``|r|`` column is clustered with MeanShift and with
    AffinityPropagation, and both label lists are passed through
    ``clustering_order``.  ``top.txt`` receives
    ``[genes, mean-shift clusters, affinity clusters, r, p, slopes, intercepts]``.
    """
    target = get_attributes(config)
    genes, vals = load_gene_data(config)

    slopes, intercepts, r_values, p_values = [], [], [], []
    for pos in range(len(genes)):
        slope, intercept, r_value, p_value, _ = stats.linregress(target, vals[pos])
        slopes.append(slope)
        intercepts.append(intercept)
        r_values.append(r_value)
        p_values.append(p_value)

    # Descending by absolute correlation.
    ranking = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        return list(np.array(column)[ranking])

    r_ranked = ranked(r_values)
    # Single-feature matrix, one row per gene, for the clusterers.
    abs_r_column = np.asarray([abs(r) for r in r_ranked]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(abs_r_column), bin_seeding=True)
    mean_shift.fit(abs_r_column)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(abs_r_column)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        ranked(genes),
        clusters_mean_shift,
        clusters_affinity_prop,
        r_ranked,
        ranked(p_values),
        ranked(slopes),
        ranked(intercepts),
    ])
def save_top_anova(config):
    """Rank genes by one-way ANOVA p-value and cluster them on ``log10(p)``.

    For every gene the values are split into groups using the index lists in
    ``get_attributes_dict(config)`` and tested with ``scipy.stats.f_oneway``.
    Genes are ordered by ascending p-value; the ordered ``log10(p)`` column is
    clustered with MeanShift and with AffinityPropagation, and both label
    lists are passed through ``clustering_order``.  ``top.txt`` receives
    ``[genes, mean-shift clusters, affinity clusters, p-values]``.
    """
    gene_names, gene_vals = load_gene_data(config)
    attributes_dict = get_attributes_dict(config)

    pvals = []
    for pos in range(len(gene_names)):
        series = np.asarray(gene_vals[pos])
        groups = [list(series[attributes_dict[key]]) for key in attributes_dict]
        pvals.append(stats.f_oneway(*groups).pvalue)

    ranking = np.argsort(pvals)
    genes_sorted = list(np.array(gene_names)[ranking])
    pvals_sorted = list(np.array(pvals)[ranking])

    # Single-feature matrix, one row per gene, for the clusterers.
    log_p_column = np.asarray([np.log10(p) for p in pvals_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(log_p_column), bin_seeding=True)
    mean_shift.fit(log_p_column)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(log_p_column)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        pvals_sorted,
    ])
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often each CpG is among the strongest elastic-net coefficients.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    For each of ``num_bootstrap_runs`` random train/test splits an
    ``ElasticNet`` is fitted on the training part and the ``num_top`` CpGs
    with the largest absolute coefficients get their counter incremented.
    If there are fewer than ``num_top`` CpGs, all of them are counted.

    Two files named ``top.txt`` are written, both ordered by decreasing
    count: ``[cpgs, counts]`` under the configuration as passed in, and
    ``[genes, counts]`` after switching ``config`` to gene mode.  Each gene is
    listed once, with the count of the first CpG that introduced it.

    Side effects: prints the split index for every run; sets
    ``config.approach_gd`` to ``GeneDataType.from_cpg`` and leaves
    ``config.dt`` as ``DataType.cpg``.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: only n_splits may be passed positionally in
    # current scikit-learn releases.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)

    sample_ids = np.arange(len(attributes))
    # Transpose so that rows are samples and columns are CpGs; converted
    # once here instead of on every split.
    enet_X = np.asarray(vals_passed, dtype=np.float64).T
    enet_y = np.asarray(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(sample_ids)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (not indexing up to num_top) tolerates fewer features.
        for cpg in cpg_names[order[:num_top]]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(counts)[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        # A CpG without an entry in the dictionary contributes no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                counts_genes.append(count)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, counts_sorted])

    # The gene-level file is resolved with the configuration in gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted, counts_genes])
    config.dt = DataType.cpg
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save the top CpGs and genes.

    The attributes are regressed on each CpG value series (the CpG values are
    the independent variable).  CpGs are sorted by ascending p-value.  Genes
    are collected by walking the sorted CpGs; each gene is kept once, with
    the fit metrics of the first CpG that introduced it.

    Two files named ``top.txt`` are written, each with the columns
    ``[names, p, r, slope, intercept]`` truncated to ``num_top`` rows: the
    CpG-level one under the configuration as passed in, and the gene-level
    one after switching ``config`` to gene mode.

    Side effects: ``config.approach_gd`` is set to ``GeneDataType.from_cpg``
    and ``config.dt`` is left as ``DataType.cpg``.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes, intercepts, rvals, pvals = [], [], [], []
    for row in range(len(cpgs)):
        slope, intercept, r_value, p_value, _ = stats.linregress(vals[row], attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])
    slopes_sorted = list(np.array(slopes)[order])
    intercepts_sorted = list(np.array(intercepts)[order])
    rvals_sorted = list(np.array(rvals)[order])

    genes_sorted = []
    pvals_genes = []
    slopes_genes = []
    intercepts_genes = []
    rvals_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for pos, cpg in enumerate(cpgs_sorted):
        # A CpG without an entry in the dictionary contributes no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            pvals_genes.append(pvals_sorted[pos])
            slopes_genes.append(slopes_sorted[pos])
            intercepts_genes.append(intercepts_sorted[pos])
            rvals_genes.append(rvals_sorted[pos])

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        cpgs_sorted[0:num_top],
        pvals_sorted[0:num_top],
        rvals_sorted[0:num_top],
        slopes_sorted[0:num_top],
        intercepts_sorted[0:num_top],
    ])

    # The gene-level file is resolved with the configuration in gene mode.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted[0:num_top],
        pvals_genes[0:num_top],
        rvals_genes[0:num_top],
        slopes_genes[0:num_top],
        intercepts_genes[0:num_top],
    ])
    config.dt = DataType.cpg
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene is among the strongest elastic-net coefficients.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    For each of ``num_bootstrap_runs`` random train/test splits an
    ``ElasticNet`` is fitted on the training part and the ``num_top`` genes
    with the largest absolute coefficients get their counter incremented.
    If there are fewer than ``num_top`` genes, all of them are counted.

    Genes are ordered by decreasing count; the count column is clustered with
    MeanShift and with AffinityPropagation, and both label lists are passed
    through ``clustering_order``.  ``top.txt`` receives
    ``[genes, mean-shift clusters, affinity clusters, counts]``.

    Side effect: prints the split index for every run.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    genes_passed, vals_passed = load_gene_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: only n_splits may be passed positionally in
    # current scikit-learn releases.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)

    sample_ids = np.arange(len(attributes))
    # Transpose so that rows are samples and columns are genes; converted
    # once here instead of on every split.
    enet_X = np.asarray(vals_passed, dtype=np.float64).T
    enet_y = np.asarray(attributes)
    gene_names = np.array(genes_passed)

    gene_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(sample_ids)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (not indexing up to num_top) tolerates fewer features.
        for gene in gene_names[order[:num_top]]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())
    order = np.argsort(counts)[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    # Single-feature matrix, one row per gene, for the clusterers.
    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        counts_sorted,
    ])
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Find CpGs whose regression slope differs most between two age groups.

    Two deep copies of ``config`` are restricted with ``age_less`` and
    ``age_more`` at ``limit``.  For every CpG of the "less" group a linear
    regression of the CpG values on the attributes is fitted in both groups.
    A CpG passes when the larger of the two p-values is below ``pval``; its
    "angle" is ``|slope_less - slope_more|``.

    The ``num_opt`` passed CpGs with the largest angle are written to
    ``bend_<limit>.txt`` with the columns
    ``[cpg, genes, angle, slope, intercept, r, p, std_err (less group),
    slope, intercept, r, p, std_err (more group)]``.  ``genes`` is the
    ``;``-joined gene list of the CpG, or ``'nan'`` when the CpG has no gene
    entry, an empty list, or a list starting with an empty string.

    The values of the selected CpGs, loaded with a fresh ``Config`` using
    ``Gender.any``, are written to ``bend_data_<limit>.txt``, one CpG per
    line formatted as ``0.8e``.

    Raises:
        ValueError: if a CpG of the "less" group is missing from the "more"
            group, or a selected CpG is missing from the raw data.
    """

    def first_rows(names):
        # name -> position of its first occurrence (what list.index returns),
        # built once so each lookup is O(1) instead of a linear scan.
        rows = {}
        for row, name in enumerate(names):
            rows.setdefault(name, row)
        return rows

    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)
    rows_m = first_rows(cpg_names_m)

    n_columns = 13
    angle_column = 2
    passed = []  # one tuple of n_columns values per CpG that passed
    for row_l, name in enumerate(cpg_names_l):
        if name not in rows_m:
            raise ValueError(repr(name) + ' is not in list')
        row_m = rows_m[name]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = stats.linregress(
            atr_l, cpg_vals_l[row_l])
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = stats.linregress(
            atr_m, cpg_vals_m[row_m])

        if max(p_value_l, p_value_m) < pval:
            genes = cpg_gene_dict.get(name) or []
            if genes and genes[0] != '':
                genes_str = ";".join(genes)
            else:
                genes_str = 'nan'
            passed.append((
                name, genes_str, abs(slope_l - slope_m),
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            ))

        num_cpgs = row_l + 1
        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    angles = [entry[angle_column] for entry in passed]
    order = np.argsort(angles)[::-1][0:num_opt]
    columns_opt = [
        list(np.array([entry[col] for entry in passed])[order])
        for col in range(n_columns)
    ]
    cpgs_opt = columns_opt[0]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns_opt)

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)
    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)
    rows_raw = first_rows(cpg_name_raw)

    cpg_str_list = []
    for cpg in cpgs_opt:
        if cpg not in rows_raw:
            raise ValueError(repr(cpg) + ' is not in list')
        fields = [str(cpg)]
        fields.extend(format(value, '0.8e') for value in cpg_vals_raw[rows_raw[cpg]])
        cpg_str_list.append(' '.join(fields))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")
geo=geo, cpg_condition=cpg_condition) attributes = get_attributes(config) cpgs, vals = load_cpg_data(config) num_int = 200 int_begin = 0 int_end = 1 int_shift = (int_end - int_begin) / num_int ints = [] pdf = np.zeros(num_int) for int_id in range(0, num_int): ints.append(int_begin + int_id * int_shift + 0.5 * int_shift) for curr_cpg_vals in vals: for beta in curr_cpg_vals: int_id = math.floor((beta - int_begin) * num_int / (int_end - int_begin + 1.0e-8)) pdf[int_id] += 1 pdf = np.asarray(pdf) sum_pdf = np.sum(pdf) pdf = pdf / (sum_pdf * int_shift) print('pdf norm: ' + str(np.sum(pdf) * int_shift)) fn = 'top.txt' fn = get_result_path(config, fn) save_features(fn, [ints, pdf]) config.dt = DataType.cpg
def save_bend_linreg(config, limit, pval):
    """Find genes whose regression slope differs most between two age groups.

    Two deep copies of ``config`` are restricted with ``age_less`` and
    ``age_more`` at ``limit``.  For every gene of the "less" group a linear
    regression of the gene values on the attributes is fitted in both groups.
    A gene passes when the larger of the two p-values is below ``pval``; its
    "angle" is ``|slope_less - slope_more|``.

    Passed genes are written to ``bend_<limit>.txt`` in order of decreasing
    angle with the columns
    ``[gene, angle, slope, intercept, r, p, std_err (less group),
    slope, intercept, r, p, std_err (more group)]``.

    Raises:
        ValueError: if a gene of the "less" group is missing from the "more"
            group.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # name -> position of its first occurrence (what list.index returns),
    # built once so each lookup is O(1) instead of a linear scan.
    rows_m = {}
    for row, name in enumerate(g_names_m):
        rows_m.setdefault(name, row)

    n_columns = 12
    angle_column = 1
    passed = []  # one tuple of n_columns values per gene that passed
    for row_l, name in enumerate(g_names_l):
        if name not in rows_m:
            raise ValueError(repr(name) + ' is not in list')
        row_m = rows_m[name]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = stats.linregress(
            atr_l, g_vals_l[row_l])
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = stats.linregress(
            atr_m, g_vals_m[row_m])

        if max(p_value_l, p_value_m) < pval:
            passed.append((
                name, abs(slope_l - slope_m),
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            ))

    angles = [entry[angle_column] for entry in passed]
    order = np.argsort(angles)[::-1]
    columns_opt = [
        list(np.array([entry[col] for entry in passed])[order])
        for col in range(n_columns)
    ]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns_opt)
def save_top_manova(config, attributes_types, attribute_target, num_top=500, window=3, test=MANOVATest.pillai_bartlett):
    """Rank BOPs by the best sliding-window MANOVA p-value for one attribute.

    For every BOP in ``load_bop_cpg_dict(config)`` the CpGs present in the
    loaded CpG data are kept.  Over each run of ``window`` consecutive kept
    CpGs a MANOVA ``cpg_0 + ... ~ <attribute columns>`` is fitted and the
    p-value of the ``attribute_target`` term is read for the requested
    ``test`` statistic.  A BOP's p-value is the minimum over its windows.
    BOPs with fewer than ``window`` kept CpGs are skipped.  With the default
    ``window=3`` this is the previous behaviour; other window sizes
    previously used bounds hard-coded for 3.

    BOP p-values are corrected with ``multipletests(..., 0.05,
    method='fdr_bh')`` and sorted ascending.  Two files named ``top.txt``
    are written: ``[bops, ';'-joined genes, corrected p-values]`` for the
    first ``num_top`` BOPs, and, after switching ``config`` to gene mode,
    the unique genes of those BOPs in first-seen order.

    Args:
        config: project configuration; ``print_rate`` controls progress output.
        attributes_types: ``Attribute`` / ``CellPop`` members used as model
            terms; each member's ``.value`` is the column name.
        attribute_target: member whose ``.value`` selects the term to read.
        num_top: number of BOPs to keep.
        window: number of consecutive CpGs per MANOVA; must be >= 1.
        test: ``MANOVATest`` member selecting the statistic; unrecognised
            values fall back to the first row (Wilks), as before.

    Raises:
        ValueError: if ``window`` is smaller than 1.

    Side effects: ``config.approach_gd`` is set to ``GeneDataType.from_bop``
    and ``config.dt`` is left as ``DataType.cpg``.
    """
    if window < 1:
        raise ValueError('window must be >= 1')

    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # name -> position of its first occurrence (what list.index returns),
    # built once so membership tests and lookups are O(1).
    cpg_row = {}
    for row, name in enumerate(cpgs):
        cpg_row.setdefault(name, row)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        atr_cols.append(atr_type.value)

    # Column names and the formula do not depend on the window position.
    val_cols = ['cpg_' + str(offset) for offset in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Position of the requested statistic among the first four rows of the
    # 'stat' table read below.
    if test is MANOVATest.pillai_bartlett:
        stat_row = 1
    elif test is MANOVATest.lawley_hotelling:
        stat_row = 2
    elif test is MANOVATest.roy:
        stat_row = 3
    else:
        stat_row = 0

    bops_passed = []
    bops_pvals = []
    for num_bops, bop in enumerate(dict_bop_cpgs, start=1):
        cpgs_passed = [cpg for cpg in dict_bop_cpgs.get(bop) if cpg in cpg_row]

        if len(cpgs_passed) >= window:
            pvals_on_bop = []
            for win_id in range(len(cpgs_passed) - window + 1):
                val_table = [betas[cpg_row[cpg]]
                             for cpg in cpgs_passed[win_id:win_id + window]]
                # Transpose the column lists into one row per sample.
                table = list(map(list, zip(*(atr_table + val_table))))
                x = pd.DataFrame(table, columns=cols)

                mv_test_res = MANOVA.from_formula(formula, x).mv_test()
                # Fifth column of the first four 'stat' rows for the target term.
                pvals = mv_test_res.results[attribute_target.value]['stat'].values[0:4, 4]
                pvals_on_bop.append(pvals[stat_row])

            bops_passed.append(bop)
            bops_pvals.append(np.min(pvals_on_bop))

        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    _, pvals_corrected, _, _ = multipletests(bops_pvals, 0.05, method='fdr_bh')

    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop)
        genes_opt.append(';'.join(curr_genes))
        for gene in curr_genes:
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # The gene-level file is resolved with the configuration in gene mode.
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_from_bop])
    config.dt = DataType.cpg