Exemplo n.º 1
0
def save_params_enet(config, num_folds=10):
    """Compute the elastic-net parameters for ``config`` and save them.

    The attributes and the gene value table are loaded for ``config`` and
    handed to ``get_enet_params`` together with ``num_folds``; the returned
    parameter names and values are written with ``save_features`` to the
    ``'params.txt'`` file located by ``get_param_path``.

    Args:
        config: Project configuration object, passed through to the loaders
            and to the path helper.
        num_folds: Forwarded unchanged to ``get_enet_params`` (presumably
            the number of cross-validation folds; confirm against that
            helper).
    """
    target = get_attributes(config)
    # Only the value table is needed here; the gene names are discarded.
    _, gene_values = load_gene_data(config)

    names, values = get_enet_params(gene_values, target, num_folds)

    out_path = get_param_path(config, 'params.txt')
    save_features(out_path, [names, values])
Exemplo n.º 2
0
def save_top_linreg(config):
    """Rank genes by the strength of their linear fit and save the table.

    For every gene, ``stats.linregress`` is run with the attributes as the
    x values and the gene's values as the y values. Genes are ordered by
    descending ``|r|``. The sorted ``|r|`` column is clustered twice, once
    with MeanShift and once with AffinityPropagation, and both label lists
    are passed through ``clustering_order``. The resulting columns (genes,
    both cluster columns, r, p, slope, intercept) are written with
    ``save_features`` to the ``'top.txt'`` result file.

    Args:
        config: Project configuration object, passed through to the loaders
            and to the path helper.
    """
    attributes = get_attributes(config)
    genes, vals = load_gene_data(config)

    # One regression per gene; each result unpacks as
    # (slope, intercept, r, p, stderr).
    fits = [
        stats.linregress(attributes, vals[gene_idx])
        for gene_idx in range(len(genes))
    ]
    slopes = [fit[0] for fit in fits]
    intercepts = [fit[1] for fit in fits]
    r_values = [fit[2] for fit in fits]
    p_values = [fit[3] for fit in fits]

    # Strongest correlation first, regardless of sign.
    abs_r = np.abs(r_values)
    ranking = np.argsort(abs_r)[::-1]

    def ranked(column):
        # Reorder one per-gene column into the ranking above.
        return list(np.array(column)[ranking])

    p_values_sorted = ranked(p_values)
    r_values_sorted = ranked(r_values)
    slopes_sorted = ranked(slopes)
    intercepts_sorted = ranked(intercepts)
    genes_sorted = ranked(genes)

    # Both clusterers expect a 2-D (n_samples, 1) array.
    metrics_sorted_np = abs_r[ranking].reshape(-1, 1)

    mean_shift = MeanShift(
        bandwidth=estimate_bandwidth(metrics_sorted_np), bin_seeding=True)
    mean_shift.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(
        get_result_path(config, 'top.txt'),
        [
            genes_sorted, clusters_mean_shift, clusters_affinity_prop,
            r_values_sorted, p_values_sorted, slopes_sorted,
            intercepts_sorted
        ])
Exemplo n.º 3
0
def save_top_anova(config):
    """Rank genes by one-way ANOVA p-value across groups and save the table.

    For every gene, its values are split into groups using the index
    collections stored in the dictionary returned by
    ``get_attributes_dict`` (one group per key), and ``stats.f_oneway`` is
    run over those groups. Genes are ordered by ascending p-value. The
    ``log10`` of the sorted p-values is clustered with MeanShift and with
    AffinityPropagation, both label lists are passed through
    ``clustering_order``, and the columns (genes, both cluster columns,
    p-values) are written with ``save_features`` to the ``'top.txt'``
    result file.

    Args:
        config: Project configuration object, passed through to the loaders
            and to the path helper.
    """
    gene_names, gene_vals = load_gene_data(config)
    attributes_dict = get_attributes_dict(config)

    pvals = []
    for gene_idx in range(len(gene_names)):
        # Convert once per gene instead of once per group.
        vals = np.asarray(gene_vals[gene_idx])
        groups = [
            list(vals[group_ids]) for group_ids in attributes_dict.values()
        ]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    genes_sorted = list(np.array(gene_names)[order])
    pvals_sorted = list(np.array(pvals)[order])

    # A p-value that underflowed to exactly 0.0 would become -inf under
    # log10, and the clusterers reject non-finite input. Clamp to the
    # smallest positive normal float for the clustering metric only; the
    # saved p-values stay untouched.
    smallest_positive = np.finfo(float).tiny
    clipped = np.clip(
        np.asarray(pvals_sorted, dtype=float), smallest_positive, None)
    metrics_sorted_np = np.log10(clipped).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop, pvals_sorted
    ])
Exemplo n.º 4
0
def save_bend_linreg(config, limit, pval):
    """Compare per-gene linear fits on the two sides of ``limit`` and save.

    Two deep copies of ``config`` are made; ``age_less`` is applied to one
    and ``age_more`` to the other (presumably restricting each copy to
    samples below / above ``limit``; confirm against those helpers).
    Attributes and gene data are loaded for both copies, and for every gene
    of the "less" data set a linear regression is fitted on each side.
    A gene is kept only when both regression p-values are below ``pval``.
    Kept genes are ordered by descending ``angle``, which here is the
    absolute difference of the two slopes, and written with
    ``save_features`` to the ``'bend_<limit>.txt'`` result file.

    Output columns, in order: gene, angle, then slope, intercept, r, p and
    standard error for the "less" side, followed by the same five values
    for the "more" side.

    Args:
        config: Project configuration object; it is deep-copied, and the
            original is used only to build the result path.
        limit: Threshold passed to ``age_less`` / ``age_more`` and embedded
            in the output file name.
        pval: Significance cut-off both regressions must beat.

    Raises:
        ValueError: If a gene of the "less" data set is absent from the
            "more" data set.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # Name -> row index in the "more" data set, built once so the per-gene
    # lookup is O(1) instead of a linear list.index() scan. setdefault keeps
    # the first occurrence, matching list.index().
    index_m = {}
    for g_id_m, g_name_m in enumerate(g_names_m):
        index_m.setdefault(g_name_m, g_id_m)

    # One list per output column, in the order described in the docstring.
    num_columns = 12
    angle_column = 1
    columns = [[] for _ in range(num_columns)]

    for g_id_l, g_name in enumerate(g_names_l):
        g_id_m = index_m.get(g_name)
        if g_id_m is None:
            raise ValueError('%r is not in list' % (g_name,))

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, g_vals_l[g_id_l])
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, g_vals_m[g_id_m])
        angle = abs(slope_l - slope_m)

        # Keep the gene only when both fits are significant.
        if max(p_value_l, p_value_m) < pval:
            record = (
                g_name, angle,
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            )
            for column, value in zip(columns, record):
                column.append(value)

    # Largest slope difference first.
    order = np.argsort(columns[angle_column])[::-1]
    columns_opt = [list(np.array(column)[order]) for column in columns]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns_opt)
Exemplo n.º 5
0
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene reaches the elastic-net top list and save.

    The ``alpha`` and ``l1_ratio`` values are read from the dictionary
    returned by ``load_params_dict``. The samples are split
    ``num_bootstrap_runs`` times with ``ShuffleSplit``; on every split an
    ``ElasticNet`` model is fitted on the training part, genes are ranked
    by descending absolute coefficient, and the first ``num_top`` of them
    each get one count. Genes are then ordered by descending count, the
    counts are clustered with MeanShift and with AffinityPropagation, both
    label lists are passed through ``clustering_order``, and the columns
    (genes, both cluster columns, counts) are written with
    ``save_features`` to the ``'top.txt'`` result file.

    No ``random_state`` is passed to ``ShuffleSplit``, so the splits are
    not fixed by this function.

    Args:
        config: Project configuration object; ``config.test_part`` is the
            fraction of samples held out of each training split.
        num_bootstrap_runs: Number of shuffle splits / model fits.
        num_top: Number of top-ranked genes counted per run. When fewer
            genes are available, all of them are counted.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = np.asarray(get_attributes(config))
    genes_passed, vals_passed = load_gene_data(config)
    genes_passed = np.asarray(genes_passed)

    num_samples = len(attributes)
    test_size = int(num_samples * config.test_part)
    train_size = num_samples - test_size
    # test_size / train_size are keyword-only in current scikit-learn;
    # passing them positionally raises TypeError there.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)

    # Samples in rows, genes in columns. Built once instead of being
    # reconstructed from a nested list on every bootstrap iteration.
    enet_X = np.array(vals_passed, dtype=float).T

    gene_top_dict = {}
    splits = rs.split(np.arange(num_samples))
    for bootstrap_id, (train_index, _) in enumerate(splits):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], attributes[train_index])

        # Largest absolute coefficient first.
        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing, unlike indexing 0..num_top-1, tolerates having fewer
        # than num_top genes.
        for gene in genes_passed[order][:num_top]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())
    order = np.argsort(counts)[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    # Both clusterers expect a 2-D (n_samples, 1) array.
    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)
    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop,
        counts_sorted
    ])