示例#1
0
def save_plane_linreg(config,
                      num_top=100,
                      gd_type_x=GeneDataType.mean,
                      gd_type_y=GeneDataType.mean):
    """Save per-gene regression r-values for two gene data types.

    The top ``num_top`` gene names are loaded with ``config.scenario`` set to
    ``Scenario.approach``.  Their values are then loaded twice under
    ``Scenario.validation``: once with ``config.approach_gd = gd_type_x`` and
    once with ``config.approach_gd = gd_type_y``.  Each value series is
    regressed against the result of ``get_attributes(config)`` and the gene
    names plus both r-value columns are written to ``plane.txt`` (path
    resolved by ``get_result_path``).

    Side effects: on return ``config.scenario`` is ``Scenario.validation`` and
    ``config.approach_gd`` is ``gd_type_y``.
    """
    target = get_attributes(config)

    config.scenario = Scenario.approach
    gene_names = load_top_gene_names(config, num_top)
    config.scenario = Scenario.validation

    config.approach_gd = gd_type_x
    gene_vals_x = load_top_gene_vals(config, gene_names)
    config.approach_gd = gd_type_y
    gene_vals_y = load_top_gene_vals(config, gene_names)

    r_values_x = []
    r_values_y = []
    for gene_idx, _ in enumerate(gene_names):
        # Only the correlation coefficient of each fit is kept.
        _, _, r_x, _, _ = stats.linregress(gene_vals_x[gene_idx], target)
        r_values_x.append(r_x)

        _, _, r_y, _, _ = stats.linregress(gene_vals_y[gene_idx], target)
        r_values_y.append(r_y)

    out_path = get_result_path(config, 'plane.txt')
    save_features(out_path, [gene_names, r_values_x, r_values_y])
示例#2
0
def save_simple_linreg(config, num_top=100):
    """Regress each top gene against the attributes and save the fit metrics.

    Gene names and values come from ``load_top_gene_data`` (called with
    ``config.scenario = Scenario.approach``; the scenario is switched to
    ``Scenario.validation`` afterwards and left that way).  Rows are ordered by
    decreasing ``|r|`` and written to ``metrics.txt`` as the columns
    name, p-value, r-value, slope, intercept.
    """
    target = get_attributes(config)
    config.scenario = Scenario.approach
    gene_names, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    slopes, intercepts, r_values, p_values = [], [], [], []
    for gene_idx, _ in enumerate(gene_names):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            gene_vals[gene_idx], target)
        r_values.append(r_value)
        p_values.append(p_value)
        slopes.append(slope)
        intercepts.append(intercept)

    # Strongest absolute correlation first.
    ranking = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        return list(np.array(column)[ranking])

    out_path = get_result_path(config, 'metrics.txt')
    save_features(out_path, [
        ranked(gene_names),
        ranked(p_values),
        ranked(r_values),
        ranked(slopes),
        ranked(intercepts),
    ])
示例#3
0
def save_params_enet(config, num_folds=10):
    """Compute elastic-net parameters on CpG data and save them.

    Passes the attributes and the CpG values (from ``load_cpg_data``) together
    with ``num_folds`` to ``get_enet_params`` and writes the returned parameter
    names and values to ``params.txt`` (path from ``get_param_path``).
    """
    target = get_attributes(config)
    _, cpg_vals = load_cpg_data(config)

    names, values = get_enet_params(target, cpg_vals, num_folds)

    save_features(get_param_path(config, 'params.txt'), [names, values])
示例#4
0
def save_error_from_age(config, num_top=100):
    """Save per-age prediction errors of a multiple linear regression.

    A model is fitted with ``linreg_mult(attributes, vals)`` on the top
    ``num_top`` genes.  For every integer ``age`` in ``range(0, 150)`` that
    equals at least one attribute value, the matching samples are predicted
    and:

    * ``error_from_age.txt`` receives the age and its mean absolute error;
    * ``errors.txt`` receives one line per age: the age followed by every
      prediction formatted as ``0.8e``.

    Finally the signed errors are regressed against age and the fit statistics
    are printed.

    Side effect: ``config.scenario`` is left as ``Scenario.validation``.
    """
    attributes = get_attributes(config)
    config.scenario = Scenario.approach
    _, vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    model = linreg_mult(attributes, vals)

    # One row per sample; built once instead of on every loop iteration.
    samples = np.array(vals).T

    ages = []
    maes = []
    str_list = []
    x_all = []
    y_all = []
    for age in range(0, 150):
        indexes = [i for i, x in enumerate(attributes) if x == age]
        if not indexes:
            continue

        ages.append(age)

        X_test = samples[indexes].tolist()
        y_test_pred = model.get_prediction(X_test).predicted_mean

        fields = [str(age)]
        mae = 0
        for pred_age in y_test_pred:
            mae += abs(pred_age - age)
            fields.append(format(pred_age, '0.8e'))
            x_all.append(age)
            y_all.append(pred_age - age)

        maes.append(mae / len(indexes))
        str_list.append(' '.join(fields))

    fn = get_result_path(config, 'error_from_age.txt')
    save_features(fn, [ages, maes])

    fn = get_result_path(config, 'errors.txt')
    np.savetxt(fn, str_list, fmt="%s")

    slope, intercept, r_value, p_value, std_err = stats.linregress(x_all, y_all)
    print('slope: ' + str(slope))
    print('intercept: ' + str(intercept))
    print('r_value: ' + str(r_value))
    print('p_value: ' + str(p_value))
    print('std_err: ' + str(std_err))
示例#5
0
def save_params_enet(config, num_folds=10):
    """Compute elastic-net parameters on gene data and save them.

    Passes the gene values (from ``load_gene_data``), the attributes and
    ``num_folds`` to ``get_enet_params`` and writes the returned parameter
    names and values to ``params.txt`` (path from ``get_param_path``).
    """
    target = get_attributes(config)
    _, gene_vals = load_gene_data(config)

    names, values = get_enet_params(gene_vals, target, num_folds)

    save_features(get_param_path(config, 'params.txt'), [names, values])
示例#6
0
def save_gene_by_cpg(config, fn):
    """Translate a list of CpG names into the unique genes they map to.

    Reads one CpG name per line from ``fn + '.txt'``, looks each one up in the
    dictionary returned by ``get_dict_cpg_gene(config)`` and writes the genes,
    de-duplicated in first-seen order, to ``fn + '_genes.txt'`` via
    ``save_features``.  CpGs absent from the dictionary are skipped.
    """
    cpg_gene_dict = get_dict_cpg_gene(config)

    # Context manager so the input file is closed even if reading fails.
    with open(fn + '.txt') as f:
        target_cpgs = f.read().splitlines()

    genes = []
    seen = set()  # O(1) membership test; ``genes`` keeps the output order
    for cpg in target_cpgs:
        if cpg not in cpg_gene_dict:
            continue
        for gene in cpg_gene_dict[cpg]:
            if gene not in seen:
                seen.add(gene)
                genes.append(gene)

    save_features(fn + '_genes.txt', genes)
示例#7
0
def save_top_anova(config, num_top=500):
    """Rank CpGs by one-way ANOVA p-value and save the top CpGs and genes.

    For every CpG the values are split into groups using the index lists in
    ``get_attributes_dict(config)`` and passed to ``scipy.stats.f_oneway``.
    CpGs are sorted by ascending p-value.  Genes are taken from
    ``get_dict_cpg_gene(config)`` in that order; each gene keeps the p-value
    of the first CpG that introduced it.  CpGs without a gene entry contribute
    no genes.

    Two files named ``top.txt`` are written through ``get_result_path``: one
    with the top ``num_top`` CpGs, and one (after setting
    ``config.approach_gd = GeneDataType.from_cpg`` and
    ``config.dt = DataType.gene``) with the top ``num_top`` genes.
    ``config.dt`` is reset to ``DataType.cpg`` before returning;
    ``config.approach_gd`` is not restored.
    """
    attributes_dict = get_attributes_dict(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    pvals = []
    for cpg_idx in range(len(cpgs)):
        row = np.asarray(vals[cpg_idx])  # converted once per CpG
        groups = [list(row[members]) for members in attributes_dict.values()]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])

    genes_sorted = []
    pvals_genes = []
    seen_genes = set()  # replaces the quadratic ``gene not in list`` scan
    for cpg, pval in zip(cpgs_sorted, pvals_sorted):
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)

    cpgs_sorted = cpgs_sorted[0:num_top]
    pvals_sorted = pvals_sorted[0:num_top]

    genes_sorted = genes_sorted[0:num_top]
    pvals_genes = pvals_genes[0:num_top]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, pvals_sorted])

    # Switch the config so get_result_path resolves the gene-level location.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted, pvals_genes])
    config.dt = DataType.cpg
示例#8
0
def save_top_spearman(config, num_top=500):
    """Rank CpGs by absolute Spearman correlation and save top CpGs and genes.

    Each CpG's values are correlated with ``get_attributes(config)`` using
    ``scipy.stats.spearmanr``; CpGs are ordered by decreasing ``|rho|``.
    Every gene listed for a CpG in ``get_dict_cpg_gene(config)`` is emitted
    with that CpG's rho.

    Two files named ``top.txt`` are written through ``get_result_path``: the
    top ``num_top`` CpGs, then (with ``config.approach_gd =
    GeneDataType.from_cpg`` and ``config.dt = DataType.gene``) the first
    ``num_top`` gene rows.  ``config.dt`` is reset to ``DataType.cpg`` at the
    end.
    """
    target = get_attributes(config)
    cpg_to_genes = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for cpg_idx, _ in enumerate(cpgs):
        rho, _ = stats.spearmanr(target, vals[cpg_idx])
        rhos.append(rho)

    ranking = np.argsort([abs(rho) for rho in rhos])[::-1]
    cpgs_sorted = list(np.array(cpgs)[ranking])
    rhos_sorted = list(np.array(rhos)[ranking])

    # NOTE(review): genes are not de-duplicated here, so a gene can appear
    # several times in the gene-level output -- confirm this is intended.
    genes_sorted = []
    rhos_genes = []
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        for gene in cpg_to_genes.get(cpg):
            genes_sorted.append(gene)
            rhos_genes.append(rho)

    cpg_path = get_result_path(config, 'top.txt')
    save_features(cpg_path, [cpgs_sorted[0:num_top], rhos_sorted[0:num_top]])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    gene_path = get_result_path(config, 'top.txt')
    save_features(gene_path, [genes_sorted[0:num_top], rhos_genes[0:num_top]])
    config.dt = DataType.cpg
示例#9
0
def save_simple_linreg_mult(config, num_bootstrap_runs=500, num_top=100):
    """Save R2-versus-count and validation metrics for the top genes.

    Loads the top ``num_top`` genes (``config.scenario`` is set to
    ``Scenario.approach`` for the load and left as ``Scenario.validation``),
    then:

    * writes the output of ``R2_from_count`` to ``R2s_<num_top>.txt``;
    * writes the output of ``validation_metrics`` to ``metrics_<num_top>.txt``,
      with the test size being ``int(len(attributes) * config.test_part)`` and
      the train size the remainder;
    * prints the summary of ``linreg_mult_with_const(attributes, gene_vals)``.
    """
    target = get_attributes(config)
    config.scenario = Scenario.approach
    _, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    counts, R2s = R2_from_count(gene_vals, target)
    r2_path = get_result_path(config, 'R2s_' + str(num_top) + '.txt')
    save_features(r2_path, [counts, R2s])

    n_samples = len(target)
    test_size = int(n_samples * config.test_part)
    train_size = n_samples - test_size
    metrics_names, metrics_vals = validation_metrics(
        gene_vals, target, test_size, train_size, num_bootstrap_runs)
    metrics_path = get_result_path(config, 'metrics_' + str(num_top) + '.txt')
    save_features(metrics_path, [metrics_names, metrics_vals])

    print(linreg_mult_with_const(target, gene_vals).summary())
示例#10
0
def save_top_linreg(config):
    """Rank genes by ``|r|`` of a linear fit and save them with cluster labels.

    Every gene's values are regressed on ``get_attributes(config)`` (attributes
    as x, gene values as y).  Genes are ordered by decreasing absolute r-value,
    and the sorted ``|r|`` column is clustered with ``MeanShift`` (bandwidth
    from ``estimate_bandwidth``) and with ``AffinityPropagation``; both label
    lists are passed through ``clustering_order``.  The columns gene, mean-shift
    cluster, affinity-propagation cluster, r, p, slope, intercept are written
    to ``top.txt``.
    """
    target = get_attributes(config)
    genes, vals = load_gene_data(config)

    slopes, intercepts, r_values, p_values = [], [], [], []
    for gene_idx, _ in enumerate(genes):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            target, vals[gene_idx])
        r_values.append(r_value)
        p_values.append(p_value)
        slopes.append(slope)
        intercepts.append(intercept)

    ranking = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        return list(np.array(column)[ranking])

    p_values_sorted = ranked(p_values)
    r_values_sorted = ranked(r_values)
    slopes_sorted = ranked(slopes)
    intercepts_sorted = ranked(intercepts)
    genes_sorted = ranked(genes)

    # Single-feature matrix (one row per gene) for the clustering estimators.
    abs_r = np.asarray([abs(r) for r in r_values_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(abs_r),
                           bin_seeding=True)
    mean_shift.fit(abs_r)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(abs_r)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    out_path = get_result_path(config, 'top.txt')
    save_features(out_path, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop,
        r_values_sorted, p_values_sorted, slopes_sorted, intercepts_sorted
    ])
示例#11
0
def save_top_anova(config):
    """Rank genes by one-way ANOVA p-value and save them with cluster labels.

    For each gene the values are grouped by the index lists in
    ``get_attributes_dict(config)`` and tested with ``scipy.stats.f_oneway``.
    Genes are sorted by ascending p-value; ``log10`` of the sorted p-values is
    clustered with ``MeanShift`` and ``AffinityPropagation`` (labels passed
    through ``clustering_order``).  The columns gene, mean-shift cluster,
    affinity-propagation cluster, p-value are written to ``top.txt``.
    """
    gene_names, gene_vals = load_gene_data(config)
    groups_by_key = get_attributes_dict(config)

    pvals = []
    for gene_idx, _ in enumerate(gene_names):
        row = gene_vals[gene_idx]
        samples = {
            key: list(np.asarray(row)[groups_by_key[key]])
            for key in groups_by_key
        }
        pvals.append(stats.f_oneway(*samples.values()).pvalue)

    ranking = np.argsort(pvals)
    genes_sorted = list(np.array(gene_names)[ranking])
    pvals_sorted = list(np.array(pvals)[ranking])

    log_pvals = np.asarray([np.log10(p) for p in pvals_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(log_pvals),
                           bin_seeding=True)
    mean_shift.fit(log_pvals)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(log_pvals)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    out_path = get_result_path(config, 'top.txt')
    save_features(out_path, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop, pvals_sorted
    ])
示例#12
0
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often each CpG is among the top ElasticNet coefficients.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    For each of ``num_bootstrap_runs`` shuffle splits an ``ElasticNet`` is
    fitted on the training part, and the (at most) ``num_top`` CpGs with the
    largest absolute coefficients get their counter incremented.  CpGs are
    then ordered by decreasing count; genes are taken from
    ``get_dict_cpg_gene(config)`` in that order, each gene keeping the count
    of the first CpG that introduced it.  CpGs without a gene entry contribute
    no genes.

    Two files named ``top.txt`` are written through ``get_result_path``: the
    CpG counts, then (with ``config.approach_gd = GeneDataType.from_cpg`` and
    ``config.dt = DataType.gene``) the gene counts.  ``config.dt`` is reset to
    ``DataType.cpg`` at the end.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: current scikit-learn accepts only ``n_splits``
    # positionally.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Built once; every split only slices rows out of these arrays.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing copes with fewer than ``num_top`` available CpGs.
        for cpg in cpg_names[order][0:num_top]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(counts)[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    seen_genes = set()  # replaces the quadratic ``gene not in list`` scan
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                counts_genes.append(count)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, counts_sorted])

    # Switch the config so get_result_path resolves the gene-level location.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_sorted, counts_genes])
    config.dt = DataType.cpg
示例#13
0
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save top CpGs and genes.

    Every CpG's values are regressed against ``get_attributes(config)`` (CpG
    values as x, attributes as y) and CpGs are sorted by ascending p-value.
    Genes are taken from ``get_dict_cpg_gene(config)`` in that order; each gene
    keeps the statistics of the first CpG that introduced it.  CpGs without a
    gene entry contribute no genes.

    Two files named ``top.txt`` are written through ``get_result_path``, each
    with the columns name, p-value, r-value, slope, intercept: the top
    ``num_top`` CpGs, then (with ``config.approach_gd =
    GeneDataType.from_cpg`` and ``config.dt = DataType.gene``) the top
    ``num_top`` genes.  ``config.dt`` is reset to ``DataType.cpg`` at the end.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes = []
    intercepts = []
    rvals = []
    pvals = []
    for cpg_idx in range(len(cpgs)):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            vals[cpg_idx], attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)

    def ordered(column):
        return list(np.array(column)[order])

    cpgs_sorted = ordered(cpgs)
    pvals_sorted = ordered(pvals)
    slopes_sorted = ordered(slopes)
    intercepts_sorted = ordered(intercepts)
    rvals_sorted = ordered(rvals)

    genes_sorted = []
    pvals_genes = []
    slopes_genes = []
    intercepts_genes = []
    rvals_genes = []
    seen_genes = set()  # replaces the quadratic ``gene not in list`` scan
    for cpg, pval, slope, intercept, rval in zip(
            cpgs_sorted, pvals_sorted, slopes_sorted, intercepts_sorted,
            rvals_sorted):
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            pvals_genes.append(pval)
            slopes_genes.append(slope)
            intercepts_genes.append(intercept)
            rvals_genes.append(rval)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        cpgs_sorted[0:num_top], pvals_sorted[0:num_top],
        rvals_sorted[0:num_top], slopes_sorted[0:num_top],
        intercepts_sorted[0:num_top]
    ])

    # Switch the config so get_result_path resolves the gene-level location.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted[0:num_top], pvals_genes[0:num_top],
        rvals_genes[0:num_top], slopes_genes[0:num_top],
        intercepts_genes[0:num_top]
    ])
    config.dt = DataType.cpg
示例#14
0
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene is among the top ElasticNet coefficients.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    For each of ``num_bootstrap_runs`` shuffle splits an ``ElasticNet`` is
    fitted on the training part, and the (at most) ``num_top`` genes with the
    largest absolute coefficients get their counter incremented.  Genes are
    ordered by decreasing count, the counts are clustered with ``MeanShift``
    and ``AffinityPropagation`` (labels passed through ``clustering_order``),
    and the columns gene, mean-shift cluster, affinity-propagation cluster,
    count are written to ``top.txt``.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)

    genes_passed, vals_passed = load_gene_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: current scikit-learn accepts only ``n_splits``
    # positionally.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Built once; every split only slices rows out of these arrays.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    gene_names = np.array(genes_passed)

    gene_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing copes with fewer than ``num_top`` available genes.
        for gene in gene_names[order][0:num_top]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())
    order = np.argsort(counts)[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    # Single-feature matrix (one row per gene) for the clustering estimators.
    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)
    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))
    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop,
        counts_sorted
    ])
示例#15
0
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Find CpGs whose regression slope changes most across an age limit.

    Two deep copies of ``config`` are restricted with ``age_less(..., limit)``
    and ``age_more(..., limit)``.  For every CpG of the "less" data set, its
    values in both subsets are regressed on the respective attributes.  A CpG
    passes when the larger of the two p-values is below ``pval``; its "angle"
    is ``abs(slope_less - slope_more)``.  The ``num_opt`` passing CpGs with the
    largest angle are written to ``bend_<limit>.txt`` with the columns
    CpG, genes (``;``-joined, or ``'nan'`` when none), angle, then slope,
    intercept, r, p, std_err for the "less" fit and the same five for the
    "more" fit.

    The values of the selected CpGs are then loaded with a fresh ``Config``
    (same settings, ``gender=Gender.any``) and written to
    ``bend_data_<limit>.txt``, one line per CpG, formatted as ``0.8e``.

    Raises:
        ValueError: if a CpG name is missing from the "more" or raw data set.
    """

    def first_positions(names):
        """Map each name to the index of its first occurrence."""
        positions = {}
        for pos, name in enumerate(names):
            positions.setdefault(name, pos)
        return positions

    def locate(positions, name):
        # Same exception type the former list.index lookup produced.
        try:
            return positions[name]
        except KeyError:
            raise ValueError(repr(name) + ' is not in list')

    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)

    # One dictionary lookup per CpG instead of a linear list scan.
    positions_m = first_positions(cpg_names_m)

    cpgs_passed = []
    genes_passed = []
    angles = []
    # Five columns per fit: slope, intercept, r_value, p_value, std_err.
    stats_l = [[] for _ in range(5)]
    stats_m = [[] for _ in range(5)]

    for cpg_id_l, cpg_name in enumerate(cpg_names_l):
        cpg_id_m = locate(positions_m, cpg_name)

        fit_l = tuple(stats.linregress(atr_l, cpg_vals_l[cpg_id_l]))[0:5]
        fit_m = tuple(stats.linregress(atr_m, cpg_vals_m[cpg_id_m]))[0:5]

        # Index 3 is the p-value, index 0 the slope.
        if max(fit_l[3], fit_m[3]) < pval:
            cpgs_passed.append(cpg_name)

            genes = cpg_gene_dict.get(cpg_name)
            if genes is not None and len(genes) > 0 and genes[0] != '':
                genes_passed.append(";".join(genes))
            else:
                genes_passed.append('nan')

            angles.append(abs(fit_l[0] - fit_m[0]))
            for column, value in zip(stats_l, fit_l):
                column.append(value)
            for column, value in zip(stats_m, fit_m):
                column.append(value)

        num_cpgs = cpg_id_l + 1
        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    order = np.argsort(angles)[::-1][0:num_opt]

    def ordered(column):
        return list(np.array(column)[order])

    cpgs_opt = ordered(cpgs_passed)
    genes_opt = ordered(genes_passed)
    angles_opt = ordered(angles)

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(
        fn,
        [cpgs_opt, genes_opt, angles_opt] +
        [ordered(column) for column in stats_l] +
        [ordered(column) for column in stats_m])

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)

    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)
    positions_raw = first_positions(cpg_name_raw)

    cpg_str_list = []
    for cpg in cpgs_opt:
        cpg_vals = cpg_vals_raw[locate(positions_raw, cpg)]
        fields = [cpg] + [format(value, '0.8e') for value in cpg_vals]
        cpg_str_list.append(' '.join(fields))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")
示例#16
0
                        geo=geo,
                        cpg_condition=cpg_condition)

        attributes = get_attributes(config)
        cpgs, vals = load_cpg_data(config)

        num_int = 200
        int_begin = 0
        int_end = 1
        int_shift = (int_end - int_begin) / num_int
        ints = []
        pdf = np.zeros(num_int)
        for int_id in range(0, num_int):
            ints.append(int_begin + int_id * int_shift + 0.5 * int_shift)

        for curr_cpg_vals in vals:
            for beta in curr_cpg_vals:
                int_id = math.floor((beta - int_begin) * num_int /
                                    (int_end - int_begin + 1.0e-8))
                pdf[int_id] += 1

        pdf = np.asarray(pdf)
        sum_pdf = np.sum(pdf)
        pdf = pdf / (sum_pdf * int_shift)
        print('pdf norm: ' + str(np.sum(pdf) * int_shift))

        fn = 'top.txt'
        fn = get_result_path(config, fn)
        save_features(fn, [ints, pdf])
        config.dt = DataType.cpg
示例#17
0
def save_bend_linreg(config, limit, pval):
    """Find genes whose regression slope changes most across an age limit.

    Two deep copies of ``config`` are restricted with ``age_less(..., limit)``
    and ``age_more(..., limit)``.  For every gene of the "less" data set, its
    values in both subsets are regressed on the respective attributes.  A gene
    passes when the larger of the two p-values is below ``pval``; its "angle"
    is ``abs(slope_less - slope_more)``.  Passing genes are sorted by
    decreasing angle and written to ``bend_<limit>.txt`` with the columns
    gene, angle, then slope, intercept, r, p, std_err for the "less" fit and
    the same five for the "more" fit.

    Raises:
        ValueError: if a gene name is missing from the "more" data set.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # Name -> first index: one dictionary lookup per gene instead of a linear
    # list scan.
    positions_m = {}
    for pos, name in enumerate(g_names_m):
        positions_m.setdefault(name, pos)

    genes_passed = []
    angles = []
    # Five columns per fit: slope, intercept, r_value, p_value, std_err.
    stats_l = [[] for _ in range(5)]
    stats_m = [[] for _ in range(5)]

    for g_id_l, gene_name in enumerate(g_names_l):
        try:
            g_id_m = positions_m[gene_name]
        except KeyError:
            # Same exception type the former list.index lookup produced.
            raise ValueError(repr(gene_name) + ' is not in list')

        fit_l = tuple(stats.linregress(atr_l, g_vals_l[g_id_l]))[0:5]
        fit_m = tuple(stats.linregress(atr_m, g_vals_m[g_id_m]))[0:5]

        # Index 3 is the p-value, index 0 the slope.
        if max(fit_l[3], fit_m[3]) < pval:
            genes_passed.append(gene_name)
            angles.append(abs(fit_l[0] - fit_m[0]))
            for column, value in zip(stats_l, fit_l):
                column.append(value)
            for column, value in zip(stats_m, fit_m):
                column.append(value)

    order = np.argsort(angles)[::-1]

    def ordered(column):
        return list(np.array(column)[order])

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(
        fn,
        [ordered(genes_passed), ordered(angles)] +
        [ordered(column) for column in stats_l] +
        [ordered(column) for column in stats_m])
示例#18
0
def save_top_manova(config, attributes_types, attribute_target, num_top=500, window=3, test=MANOVATest.pillai_bartlett):
    """Rank BOPs by the best sliding-window MANOVA p-value and save the top.

    For every BOP in ``load_bop_cpg_dict(config)`` the CpGs that are present
    in the loaded CpG data are kept.  A window of ``window`` consecutive CpGs
    slides over them; for each position a MANOVA is fitted with the window's
    CpG values as dependent variables (``cpg_0 + cpg_1 + ...``) and the columns
    built from ``attributes_types`` as predictors.  The p-value of the
    statistic selected by ``test`` is read from the result for
    ``attribute_target.value``, and the minimum over all windows is the BOP's
    p-value.  BOPs with fewer than ``window`` available CpGs are skipped.

    The BOP p-values are corrected with ``multipletests(..., 0.05,
    method='fdr_bh')`` and the ``num_top`` smallest are written to ``top.txt``
    with the columns BOP, ``;``-joined genes, corrected p-value.  A second
    ``top.txt`` (with ``config.approach_gd = GeneDataType.from_bop`` and
    ``config.dt = DataType.gene``) lists the unique genes of those BOPs.
    ``config.dt`` is reset to ``DataType.cpg`` at the end.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be at least 1')

    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # CpG name -> first row index: O(1) membership and position lookups.
    cpg_position = {}
    for pos, cpg in enumerate(cpgs):
        cpg_position.setdefault(cpg, pos)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        atr_cols.append(atr_type.value)

    # Column names and formula depend only on ``window`` and the attributes,
    # so they are built once.
    val_cols = ['cpg_' + str(cpg_id) for cpg_id in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Row of the selected statistic inside the first four rows of the 'stat'
    # table; anything unrecognised falls back to row 0, like the Wilks case.
    if test is MANOVATest.pillai_bartlett:
        stat_row = 1
    elif test is MANOVATest.lawley_hotelling:
        stat_row = 2
    elif test is MANOVATest.roy:
        stat_row = 3
    else:
        stat_row = 0

    bops_passed = []
    bops_pvals = []
    for num_bops, bop in enumerate(dict_bop_cpgs, start=1):
        curr_cpgs = dict_bop_cpgs.get(bop)
        cpgs_passed = [cpg for cpg in curr_cpgs if cpg in cpg_position]
        if len(cpgs_passed) >= window:
            pvals_on_bop = []
            for win_id in range(len(cpgs_passed) - window + 1):
                val_table = [
                    betas[cpg_position[cpg]]
                    for cpg in cpgs_passed[win_id:win_id + window]
                ]
                # Transpose column lists into one row per sample.
                table = list(map(list, zip(*(atr_table + val_table))))
                x = pd.DataFrame(table, columns=cols)
                manova = MANOVA.from_formula(formula, x)
                mv_test_res = manova.mv_test()
                pvals = mv_test_res.results[attribute_target.value]['stat'].values[0:4, 4]
                pvals_on_bop.append(pvals[stat_row])
            bops_passed.append(bop)
            bops_pvals.append(np.min(pvals_on_bop))
        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(bops_pvals, 0.05, method='fdr_bh')
    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()  # replaces the quadratic ``gene not in list`` scan
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop) or []
        genes_opt.append(';'.join(curr_genes))
        for gene in curr_genes:
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # Switch the config so get_result_path resolves the gene-level location.
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_from_bop])
    config.dt = DataType.cpg