Пример #1
0
def load_top_gene_linreg_dict(config, num_top):
    """Load the leading rows of the ``top.txt`` result file into a dict.

    The file located by ``get_result_path(config, 'top.txt')`` is read as
    single-space separated columns: column 0 is the gene name, column 1 an
    integer cluster label, column 3 the metric and column 5 the slope.

    Args:
        config: Passed through to ``get_result_path``.
        num_top: Number of leading rows to keep (applied as ``[0:num_top]``).

    Returns:
        dict mapping gene name to ``[row_index, metric, cluster, slope]``.
        If a name occurs more than once among the kept rows, the last
        occurrence wins.

    Raises:
        IndexError: If a line has fewer than six columns.
        ValueError: If a numeric column cannot be parsed.
    """
    fn = get_result_path(config, 'top.txt')

    rows = []
    # The context manager closes the file even if a line fails to parse.
    with open(fn) as f:
        for line in f:
            cols = line.split(' ')
            gene = cols[0].rstrip()
            slope = float(cols[5].rstrip())
            metric = float(cols[3].rstrip())
            cluster = int(cols[1].rstrip())
            rows.append((gene, metric, cluster, slope))

    top_dict = {}
    for row_id, (gene, metric, cluster, slope) in enumerate(rows[0:num_top]):
        top_dict[gene] = [row_id, metric, cluster, slope]

    return top_dict
Пример #2
0
def save_top_anova(config, num_top=500):
    """Rank CpGs by one-way ANOVA p-value and save the top CpGs and genes.

    For every CpG the values are split into groups using the index sets in
    ``get_attributes_dict(config)`` and ``scipy.stats.f_oneway`` is applied.
    CpGs are sorted by ascending p-value.  Each gene inherits the p-value of
    the first (best ranked) CpG that maps to it via ``get_dict_cpg_gene``.

    Two ``top.txt`` files are written through ``save_features``: one with
    the CpG ranking and, after switching ``config.dt`` to ``DataType.gene``,
    one with the gene ranking.

    Args:
        config: Run configuration.  ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is left as
            ``DataType.cpg`` on exit.
        num_top: Maximum number of CpGs / genes written.
    """
    attributes_dict = get_attributes_dict(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    pvals = []
    for cpg_id in range(len(cpgs)):
        curr_vals = np.asarray(vals[cpg_id])
        groups = [
            list(curr_vals[attributes_dict[key_age]])
            for key_age in attributes_dict
        ]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])

    genes_sorted = []
    pvals_genes = []
    # A set gives O(1) "already seen" checks; the list keeps rank order.
    seen_genes = set()
    for cpg, pval in zip(cpgs_sorted, pvals_sorted):
        # CpGs missing from the annotation dict contribute no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted[0:num_top], pvals_sorted[0:num_top]])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted[0:num_top], pvals_genes[0:num_top]])
    finally:
        # Always hand the config back in CpG mode, even if saving fails.
        config.dt = DataType.cpg
Пример #3
0
def load_top_gene_names_by_cpg(config, method, num_top):
    """Return the first ``num_top`` gene names from ``genes_from_cpg.txt``.

    Args:
        config: Passed through to ``get_result_path``.
        method: Unused; kept for interface compatibility.
        num_top: Number of leading names to keep (slice ``[0:num_top]``).

    Returns:
        list of str: first space-separated column of each kept line.
    """
    fn = get_result_path(config, 'genes_from_cpg.txt')
    # The context manager guarantees the handle is released.
    with open(fn) as f:
        gene_names = [line.split(' ')[0].rstrip() for line in f]
    return gene_names[0:num_top]
Пример #4
0
def load_top_data(config, num_top, index):
    """Return one column of the first ``num_top`` rows of ``top.txt``.

    Args:
        config: Passed through to ``get_result_path``.
        num_top: Number of leading rows to keep (slice ``[0:num_top]``).
        index: Zero-based index of the space-separated column to extract.

    Returns:
        list of str: the selected column, right-stripped, one entry per row.

    Raises:
        IndexError: If any line of the file has no column ``index``.
    """
    fn = get_result_path(config, 'top.txt')
    # The context manager guarantees the handle is released.
    with open(fn) as f:
        column = [line.split(' ')[index].rstrip() for line in f]
    return column[0:num_top]
Пример #5
0
def save_top_spearman(config, num_top=500):
    """Rank CpGs by |Spearman rho| against the attributes and save the top.

    CpGs are sorted by descending absolute correlation.  Each gene inherits
    the rho of the first (best ranked) CpG mapping to it and is listed once,
    which matches how the other CpG-based ``save_top_*`` routines build
    their gene lists.

    Two ``top.txt`` files are written through ``save_features``: one with
    the CpG ranking and, after switching ``config.dt`` to ``DataType.gene``,
    one with the gene ranking.

    Args:
        config: Run configuration.  ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is left as
            ``DataType.cpg`` on exit.
        num_top: Maximum number of CpGs / genes written.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for cpg_id in range(len(cpgs)):
        rho, _ = stats.spearmanr(attributes, vals[cpg_id])
        rhos.append(rho)

    # NOTE(review): argsort places NaN last, so after the reversal any NaN
    # rho ends up at the top of the ranking -- confirm this is acceptable.
    order = np.argsort(list(map(abs, rhos)))[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    rhos_sorted = list(np.array(rhos)[order])

    genes_sorted = []
    rhos_genes = []
    # Without de-duplication a gene covered by several CpGs would fill
    # several of the num_top slots.
    seen_genes = set()
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        # CpGs missing from the annotation dict contribute no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                rhos_genes.append(rho)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted[0:num_top], rhos_sorted[0:num_top]])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted[0:num_top], rhos_genes[0:num_top]])
    finally:
        # Always hand the config back in CpG mode, even if saving fails.
        config.dt = DataType.cpg
Пример #6
0
def load_top_cpg_data(config, method, num_top):
    """Load the value rows of the top-ranked CpGs.

    The CpG names are the first column of the first ``num_top`` lines of
    the ``top.txt`` result file.  Their values are then looked up in
    ``<db_type>_average_beta.txt`` (located via ``get_path``), skipping
    ``config.num_skip_lines`` leading lines and keeping only the columns
    selected by ``config.indexes``.

    Args:
        config: Supplies ``indexes``, ``db_type``, ``print_rate`` and
            ``num_skip_lines`` and is passed to the path/line helpers.
        method: Unused; kept for interface compatibility.
        num_top: Number of leading CpGs to keep (slice ``[0:num_top]``).

    Returns:
        tuple ``(cpgs_top, vals_top)``: the CpG names in rank order and,
        aligned with them, the list of values of each CpG.  An entry of
        ``vals_top`` is ``None`` when the CpG was not found in the data file.
    """
    indexes = config.indexes
    db_type = config.db_type
    print_rate = config.print_rate

    fn = get_result_path(config, 'top.txt')
    with open(fn) as f:
        cpgs_top = [line.split(' ')[0].rstrip() for line in f]
    cpgs_top = cpgs_top[0:num_top]
    # Set lookup keeps the scan of the (large) data file linear.
    wanted = set(cpgs_top)

    path = get_path(config, db_type.value + '_average_beta.txt')
    dict_top = {}
    with open(path) as f:
        for _ in range(config.num_skip_lines):
            f.readline()

        for num_lines, line in enumerate(f, start=1):
            col_vals = line_proc(config, line)
            cpg = col_vals[0]

            # Only pay for float parsing on the rows that are kept.
            if cpg in wanted:
                vals = list(map(float, col_vals[1::]))
                dict_top[cpg] = list(np.array(vals)[indexes])

            if num_lines % print_rate == 0:
                print('num_lines: ' + str(num_lines))

    vals_top = [dict_top.get(cpg) for cpg in cpgs_top]

    return cpgs_top, vals_top
Пример #7
0
def save_top_linreg(config):
    """Rank genes by |r| of a linear fit against the attributes and save.

    Each gene's values are regressed on the attributes with
    ``scipy.stats.linregress``.  Genes are ordered by descending absolute
    r-value, the |r| column is clustered with MeanShift and with
    AffinityPropagation, and everything is written to ``top.txt`` as the
    columns: gene, mean-shift cluster, affinity-propagation cluster,
    r-value, p-value, slope, intercept.

    Args:
        config: Run configuration passed to the data and path helpers.
    """
    attributes = get_attributes(config)
    genes, vals = load_gene_data(config)

    # One (r, p, slope, intercept) tuple per gene, in input order.
    fits = []
    for gene_idx in range(len(genes)):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            attributes, vals[gene_idx])
        fits.append((r_value, p_value, slope, intercept))

    r_values = [fit[0] for fit in fits]
    ranking = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        # Reorder one per-gene column by descending |r|.
        return list(np.array(column)[ranking])

    p_values_sorted = ranked([fit[1] for fit in fits])
    r_values_sorted = ranked(r_values)
    slopes_sorted = ranked([fit[2] for fit in fits])
    intercepts_sorted = ranked([fit[3] for fit in fits])
    genes_sorted = ranked(genes)

    abs_r_column = np.asarray([abs(r) for r in r_values_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(abs_r_column),
                           bin_seeding=True).fit(abs_r_column)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(abs_r_column)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        r_values_sorted,
        p_values_sorted,
        slopes_sorted,
        intercepts_sorted,
    ])
Пример #8
0
def save_top_anova(config):
    """Rank genes by one-way ANOVA p-value, cluster them and save.

    For every gene the values are split into groups using the index sets in
    ``get_attributes_dict(config)`` and ``scipy.stats.f_oneway`` is applied.
    Genes are sorted by ascending p-value, ``log10(p)`` is clustered with
    MeanShift and with AffinityPropagation, and the result is written to
    ``top.txt`` as the columns: gene, mean-shift cluster,
    affinity-propagation cluster, p-value.

    Args:
        config: Run configuration passed to the data and path helpers.
    """
    gene_names, gene_vals = load_gene_data(config)
    attributes_dict = get_attributes_dict(config)

    pvals = []
    for gene_id in range(len(gene_names)):
        vals = np.asarray(gene_vals[gene_id])
        groups = [
            list(vals[attributes_dict[key_age]])
            for key_age in attributes_dict
        ]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    genes_sorted = list(np.array(gene_names)[order])
    pvals_sorted = list(np.array(pvals)[order])

    # A p-value that underflowed to exactly 0 would give log10 = -inf, which
    # the clustering estimators reject.  Clamp to the smallest positive
    # normal float for the clustering metric only; the saved p-values are
    # left untouched.
    smallest = np.finfo(float).tiny
    metrics_sorted_np = np.log10(
        np.maximum(np.asarray(pvals_sorted, dtype=float),
                   smallest)).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop, pvals_sorted
    ])
Пример #9
0
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often each CpG is among the top ElasticNet coefficients.

    ``num_bootstrap_runs`` random train/test splits are drawn with
    ``ShuffleSplit``.  On each training subset an ``ElasticNet`` (``alpha``
    and ``l1_ratio`` from ``load_params_dict``) is fitted and the
    ``num_top`` CpGs with the largest absolute coefficients each receive
    one count.  CpGs are then ordered by descending count; each gene
    inherits the count of the first CpG mapping to it.

    Two ``top.txt`` files are written through ``save_features``: one with
    the CpG counts and, after switching ``config.dt`` to ``DataType.gene``,
    one with the gene counts.

    Args:
        config: Run configuration; ``config.test_part`` is the test
            fraction.  ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is left as
            ``DataType.cpg`` on exit.
        num_bootstrap_runs: Number of random splits.
        num_top: Number of leading CpGs counted per split.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # test_size / train_size are keyword-only in current scikit-learn.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Build the arrays once instead of on every split.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (instead of indexing up to num_top) tolerates data sets
        # with fewer than num_top CpGs.
        for cpg in cpg_names[order][0:num_top]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(list(map(abs, counts)))[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    # A set gives O(1) "already seen" checks; the list keeps rank order.
    seen_genes = set()
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        # CpGs missing from the annotation dict contribute no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                counts_genes.append(count)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, counts_sorted])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted, counts_genes])
    finally:
        # Always hand the config back in CpG mode, even if saving fails.
        config.dt = DataType.cpg
Пример #10
0
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save the top CpGs/genes.

    For every CpG ``scipy.stats.linregress(cpg_values, attributes)`` is
    evaluated (CpG values are the x variable).  CpGs are sorted by
    ascending p-value; each gene inherits the statistics of the first
    (best ranked) CpG mapping to it via ``get_dict_cpg_gene``.

    Two ``top.txt`` files are written through ``save_features`` with the
    columns name, p-value, r-value, slope, intercept: one for CpGs and,
    after switching ``config.dt`` to ``DataType.gene``, one for genes.

    Args:
        config: Run configuration.  ``config.approach_gd`` is set to
            ``GeneDataType.from_cpg`` and ``config.dt`` is left as
            ``DataType.cpg`` on exit.
        num_top: Maximum number of CpGs / genes written.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes = []
    intercepts = []
    rvals = []
    pvals = []
    for cpg_id in range(len(cpgs)):
        slope, intercept, r_value, p_value, _ = stats.linregress(
            vals[cpg_id], attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])
    slopes_sorted = list(np.array(slopes)[order])
    intercepts_sorted = list(np.array(intercepts)[order])
    rvals_sorted = list(np.array(rvals)[order])

    # One (gene, pval, rval, slope, intercept) row per distinct gene, in
    # rank order.  The set keeps the "already seen" check O(1).
    gene_rows = []
    seen_genes = set()
    for cpg, pval, rval, slope, intercept in zip(cpgs_sorted, pvals_sorted,
                                                 rvals_sorted, slopes_sorted,
                                                 intercepts_sorted):
        # CpGs missing from the annotation dict contribute no genes.
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                gene_rows.append((gene, pval, rval, slope, intercept))

    gene_rows = gene_rows[0:num_top]
    gene_columns = [[row[col] for row in gene_rows] for col in range(5)]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        cpgs_sorted[0:num_top], pvals_sorted[0:num_top],
        rvals_sorted[0:num_top], slopes_sorted[0:num_top],
        intercepts_sorted[0:num_top]
    ])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, gene_columns)
    finally:
        # Always hand the config back in CpG mode, even if saving fails.
        config.dt = DataType.cpg
Пример #11
0
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Find CpGs whose linear trend changes most across ``limit``.

    Two copies of ``config`` are restricted with ``age_less`` /
    ``age_more`` at ``limit``.  For every CpG a separate
    ``scipy.stats.linregress(attributes, values)`` is run on each side and
    the "angle" is the absolute difference of the two slopes.  CpGs for
    which both p-values are below ``pval`` are kept, ordered by descending
    angle and truncated to ``num_opt``.

    Writes two files via ``get_result_path(config, ...)``:

    * ``bend_<limit>.txt`` -- CpG, genes (``;``-joined, ``'nan'`` if none),
      angle, then slope / intercept / r / p / std-err for the "less" side
      followed by the same five for the "more" side.
    * ``bend_data_<limit>.txt`` -- for each selected CpG its name followed
      by its values, formatted ``0.8e``, loaded with a fresh ``Config``
      that uses ``Gender.any``.

    Args:
        config: Base run configuration (not modified; copies are used).
        limit: Split point handed to ``age_less`` / ``age_more``.
        pval: Significance threshold both fits must beat.
        num_opt: Maximum number of CpGs written.

    Raises:
        ValueError: If a CpG from the "less" data set is missing from the
            "more" data set, or a selected CpG is missing from the raw data.
    """

    def first_index_map(names):
        # name -> index of its first occurrence (same answer as list.index,
        # but built once instead of scanning the list on every lookup).
        mapping = {}
        for position, name in enumerate(names):
            mapping.setdefault(name, position)
        return mapping

    def lookup(mapping, name):
        # Keep the exception type list.index would have raised.
        if name not in mapping:
            raise ValueError(repr(name) + ' is not in list')
        return mapping[name]

    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)

    row_of_m = first_index_map(cpg_names_m)

    # One 13-tuple per CpG that passes the p-value filter, laid out in the
    # column order of the output file.
    num_columns = 13
    angle_column = 2
    rows = []

    for cpg_id_l in range(len(cpg_names_l)):
        cpg = cpg_names_l[cpg_id_l]
        vals_l = cpg_vals_l[cpg_id_l]
        vals_m = cpg_vals_m[lookup(row_of_m, cpg)]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)
        angle = abs(slope_l - slope_m)

        if max(p_value_l, p_value_m) < pval:
            genes = cpg_gene_dict.get(cpg)
            # Missing, empty, or blank-first annotation is reported as 'nan'.
            if genes is not None and len(genes) > 0 and genes[0] != '':
                genes_str = ";".join(genes)
            else:
                genes_str = 'nan'

            rows.append((cpg, genes_str, angle,
                         slope_l, intercept_l, r_value_l, p_value_l,
                         std_err_l,
                         slope_m, intercept_m, r_value_m, p_value_m,
                         std_err_m))

        num_cpgs = cpg_id_l + 1
        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    order = np.argsort([row[angle_column] for row in rows])[::-1][0:num_opt]
    columns = [
        list(np.array([row[col] for row in rows])[order])
        for col in range(num_columns)
    ]
    cpgs_opt = columns[0]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns)

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)

    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)
    row_of_raw = first_index_map(cpg_name_raw)

    cpg_str_list = []
    for cpg in cpgs_opt:
        cpg_vals = cpg_vals_raw[lookup(row_of_raw, cpg)]
        cpg_str_list.append(
            ' '.join([cpg] + [format(val, '0.8e') for val in cpg_vals]))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")
Пример #12
0
def save_bend_linreg(config, limit, pval):
    """Find genes whose linear trend changes most across ``limit``.

    Two copies of ``config`` are restricted with ``age_less`` /
    ``age_more`` at ``limit``.  For every gene a separate
    ``scipy.stats.linregress(attributes, values)`` is run on each side and
    the "angle" is the absolute difference of the two slopes.  Genes for
    which both p-values are below ``pval`` are kept and ordered by
    descending angle.

    The result is written to ``bend_<limit>.txt`` with the columns: gene,
    angle, then slope / intercept / r / p / std-err for the "less" side
    followed by the same five for the "more" side.

    Args:
        config: Base run configuration (not modified; copies are used).
        limit: Split point handed to ``age_less`` / ``age_more``.
        pval: Significance threshold both fits must beat.

    Raises:
        ValueError: If a gene from the "less" data set is missing from the
            "more" data set.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # name -> index of its first occurrence (same answer as list.index, but
    # built once instead of scanning the list for every gene).
    row_of_m = {}
    for position, name in enumerate(g_names_m):
        row_of_m.setdefault(name, position)

    # One 12-tuple per gene that passes the p-value filter, laid out in the
    # column order of the output file.
    num_columns = 12
    angle_column = 1
    rows = []

    for g_id_l in range(len(g_names_l)):
        gene = g_names_l[g_id_l]
        if gene not in row_of_m:
            # Keep the exception type list.index would have raised.
            raise ValueError(repr(gene) + ' is not in list')
        vals_l = g_vals_l[g_id_l]
        vals_m = g_vals_m[row_of_m[gene]]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)
        angle = abs(slope_l - slope_m)

        if max(p_value_l, p_value_m) < pval:
            rows.append((gene, angle,
                         slope_l, intercept_l, r_value_l, p_value_l,
                         std_err_l,
                         slope_m, intercept_m, r_value_m, p_value_m,
                         std_err_m))

    order = np.argsort([row[angle_column] for row in rows])[::-1]
    columns = [
        list(np.array([row[col] for row in rows])[order])
        for col in range(num_columns)
    ]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns)
Пример #13
0
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene is among the top ElasticNet coefficients.

    ``num_bootstrap_runs`` random train/test splits are drawn with
    ``ShuffleSplit``.  On each training subset an ``ElasticNet`` (``alpha``
    and ``l1_ratio`` from ``load_params_dict``) is fitted and the
    ``num_top`` genes with the largest absolute coefficients each receive
    one count.  Genes are ordered by descending count, the counts are
    clustered with MeanShift and with AffinityPropagation, and the result
    is written to ``top.txt`` as the columns: gene, mean-shift cluster,
    affinity-propagation cluster, count.

    Args:
        config: Run configuration; ``config.test_part`` is the test
            fraction.
        num_bootstrap_runs: Number of random splits.
        num_top: Number of leading genes counted per split.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)

    genes_passed, vals_passed = load_gene_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # test_size / train_size are keyword-only in current scikit-learn.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Build the arrays once instead of on every split.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    gene_names = np.array(genes_passed)

    gene_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (instead of indexing up to num_top) tolerates data sets
        # with fewer than num_top genes.
        for gene in gene_names[order][0:num_top]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())
    order = np.argsort(list(map(abs, counts)))[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)
    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        genes_sorted, clusters_mean_shift, clusters_affinity_prop,
        counts_sorted
    ])
Пример #14
0
                        geo=geo,
                        cpg_condition=cpg_condition)

        attributes = get_attributes(config)
        cpgs, vals = load_cpg_data(config)

        num_int = 200
        int_begin = 0
        int_end = 1
        int_shift = (int_end - int_begin) / num_int
        ints = []
        pdf = np.zeros(num_int)
        for int_id in range(0, num_int):
            ints.append(int_begin + int_id * int_shift + 0.5 * int_shift)

        for curr_cpg_vals in vals:
            for beta in curr_cpg_vals:
                int_id = math.floor((beta - int_begin) * num_int /
                                    (int_end - int_begin + 1.0e-8))
                pdf[int_id] += 1

        pdf = np.asarray(pdf)
        sum_pdf = np.sum(pdf)
        pdf = pdf / (sum_pdf * int_shift)
        print('pdf norm: ' + str(np.sum(pdf) * int_shift))

        fn = 'top.txt'
        fn = get_result_path(config, fn)
        save_features(fn, [ints, pdf])
        config.dt = DataType.cpg