def save_params_enet(config, num_folds=10):
    """Compute elastic-net parameters for ``config`` and write them to disk.

    The attributes and gene values are loaded for ``config`` and handed,
    together with ``num_folds``, to ``get_enet_params``.  The parameter names
    and values it returns are written with ``save_features`` to the path
    produced by ``get_param_path(config, 'params.txt')``.

    Args:
        config: Project configuration object forwarded to the loaders.
        num_folds: Forwarded unchanged to ``get_enet_params``.
    """
    target = get_attributes(config)
    # Only the value matrix is needed here; the gene names are discarded.
    _, gene_values = load_gene_data(config)

    names, values = get_enet_params(gene_values, target, num_folds)

    out_path = get_param_path(config, 'params.txt')
    save_features(out_path, [names, values])
def save_top_linreg(config):
    """Rank genes by linear-regression correlation and save the table.

    For every gene, its values are regressed against the attributes with
    ``scipy.stats.linregress``.  Genes are ordered by descending ``|r|``, the
    ``|r|`` column is clustered with both MeanShift and AffinityPropagation,
    and the result is written through ``save_features`` to the ``top.txt``
    result path of ``config``.

    Saved columns, in order: gene, mean-shift cluster, affinity-propagation
    cluster, r value, p value, slope, intercept.
    """
    target = get_attributes(config)
    genes, vals = load_gene_data(config)

    # One (slope, intercept, r, p) tuple per gene, in input order.
    fits = []
    # Index by position (rather than zip) so a names/values length mismatch
    # still surfaces as an IndexError.
    for gene_idx in range(len(genes)):
        slope, intercept, r_value, p_value, _std_err = stats.linregress(
            target, vals[gene_idx])
        fits.append((slope, intercept, r_value, p_value))

    slopes = [fit[0] for fit in fits]
    intercepts = [fit[1] for fit in fits]
    r_values = [fit[2] for fit in fits]
    p_values = [fit[3] for fit in fits]

    # Strongest absolute correlation first.
    ranking = np.argsort([abs(r) for r in r_values])[::-1]

    def by_rank(column):
        """Return ``column`` reordered according to ``ranking``."""
        return list(np.array(column)[ranking])

    p_ranked = by_rank(p_values)
    r_ranked = by_rank(r_values)
    slopes_ranked = by_rank(slopes)
    intercepts_ranked = by_rank(intercepts)
    genes_ranked = by_rank(genes)

    # Clustering operates on a single-feature column of |r|.
    metric = np.asarray([abs(r) for r in r_ranked]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(metric),
                           bin_seeding=True)
    mean_shift.fit(metric)
    clusters_ms = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(metric)
    clusters_ap = clustering_order(list(affinity.labels_))

    out_path = get_result_path(config, 'top.txt')
    save_features(out_path, [
        genes_ranked,
        clusters_ms,
        clusters_ap,
        r_ranked,
        p_ranked,
        slopes_ranked,
        intercepts_ranked,
    ])
def save_top_anova(config):
    """Rank genes by one-way ANOVA p-value and save the table.

    ``get_attributes_dict(config)`` supplies, per group key, a selection that
    is used to index each gene's value row.  The resulting groups are compared
    with ``scipy.stats.f_oneway``; genes are ordered by ascending p-value, the
    ``log10(p)`` column is clustered with MeanShift and AffinityPropagation,
    and the table is written through ``save_features`` to the ``top.txt``
    result path of ``config``.

    Saved columns, in order: gene, mean-shift cluster, affinity-propagation
    cluster, p value.

    Args:
        config: Project configuration object forwarded to the loaders.
    """
    gene_names, gene_vals = load_gene_data(config)
    attributes_dict = get_attributes_dict(config)

    pvals = []
    for gene_idx in range(len(gene_names)):
        row = np.asarray(gene_vals[gene_idx])
        # One sample list per group, in the dict's iteration order.
        groups = [list(row[attributes_dict[key]]) for key in attributes_dict]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    genes_sorted = list(np.array(gene_names)[order])
    pvals_sorted = list(np.array(pvals)[order])

    # A p-value that underflows to exactly 0.0 would give log10 == -inf, and
    # the clustering estimators reject non-finite input.  Clip to the smallest
    # positive normal float for the clustering metric only; the p-values that
    # are saved stay untouched.
    floor = np.finfo(float).tiny
    clipped = np.maximum(np.asarray(pvals_sorted, dtype=float), floor)
    metrics_sorted_np = np.log10(clipped).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        pvals_sorted,
    ])
def save_bend_linreg(config, limit, pval):
    """Compare per-gene regression slopes below and above an age ``limit``.

    Two deep copies of ``config`` are restricted with ``age_less`` and
    ``age_more`` respectively.  For every gene of the "less" data set, a
    linear regression is fitted on each side; the gene is kept only when the
    larger of the two p-values is below ``pval``.  Kept genes are ordered by
    descending absolute slope difference and written through
    ``save_features`` to the ``bend_<limit>.txt`` result path of ``config``.

    Saved columns, in order: gene, slope difference, then slope, intercept,
    r value, p value and standard error for the "less" fit, followed by the
    same five values for the "more" fit.

    Args:
        config: Project configuration object; it is deep-copied, not mutated.
        limit: Threshold handed to ``age_less`` / ``age_more`` and embedded
            in the output file name.
        pval: Exclusive upper bound applied to both regressions' p-values.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # One tuple per accepted gene, laid out in output-column order.
    n_columns = 12
    kept_rows = []
    for idx_l in range(len(g_names_l)):
        gene = g_names_l[idx_l]
        # Locate the same gene in the "more" data set.
        idx_m = g_names_m.index(gene)

        slope_l, intercept_l, r_l, p_l, err_l = stats.linregress(
            atr_l, g_vals_l[idx_l])
        slope_m, intercept_m, r_m, p_m, err_m = stats.linregress(
            atr_m, g_vals_m[idx_m])

        # Both fits have to be significant for the gene to be reported.
        if not (max(p_l, p_m) < pval):
            continue

        kept_rows.append((
            gene,
            abs(slope_l - slope_m),
            slope_l, intercept_l, r_l, p_l, err_l,
            slope_m, intercept_m, r_m, p_m, err_m,
        ))

    # Transpose the rows into per-column lists.
    columns = [[row[col] for row in kept_rows] for col in range(n_columns)]

    # Column 1 holds the slope differences; largest difference first.
    order = np.argsort(columns[1])[::-1]
    ordered_columns = [list(np.array(column)[order]) for column in columns]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, ordered_columns)
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene lands in the elastic-net top list and save it.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    The samples are repeatedly split with ``ShuffleSplit``; on every split an
    ``ElasticNet`` is fitted on the training part and the ``num_top`` genes
    with the largest absolute coefficients each get one count.  Genes are then
    ordered by descending count, the counts are clustered with MeanShift and
    AffinityPropagation, and the table is written through ``save_features``
    to the ``top.txt`` result path of ``config``.

    Saved columns, in order: gene, mean-shift cluster, affinity-propagation
    cluster, count.

    Args:
        config: Project configuration object; ``config.test_part`` is the
            fraction of samples held out of each training split.
        num_bootstrap_runs: Number of random splits to fit.
        num_top: Number of leading genes counted per split.  Values larger
            than the number of genes count every gene; values <= 0 count none.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    genes_passed, vals_passed = load_gene_data(config)

    num_samples = len(attributes)
    test_size = int(num_samples * config.test_part)
    train_size = num_samples - test_size

    # test_size / train_size are keyword-only in current scikit-learn, so
    # they must not be passed positionally.
    rs = ShuffleSplit(
        n_splits=num_bootstrap_runs,
        test_size=test_size,
        train_size=train_size,
    )
    # Only the number of samples matters to the splitter.
    indexes = list(range(num_samples))

    # Build the arrays once instead of on every split.
    enet_X = np.array(vals_passed).T  # one row per sample, one column per gene
    enet_y = np.array(attributes)
    gene_names = np.array(genes_passed)
    top_n = max(num_top, 0)

    gene_top_dict = {}
    for bootstrap_id, (train_index, _test_index) in enumerate(
            rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        # Largest absolute coefficient first.
        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (rather than indexing 0..num_top-1) stays in bounds when
        # there are fewer genes than num_top.
        for gene in gene_names[order][:top_n]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())

    order = np.argsort(counts)[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        counts_sorted,
    ])