Exemplo n.º 1
0
def simulate_mult_n(file_path,
                    runs=1,
                    num_snps=100,
                    h=0.3,
                    num_inds_vals=(1000, 5000, 10000, 20000, 50000, 100000),
                    bzs=(-0.6, -0.3, -0.1, 0, 0.1, 0.3, 0.6, 0.9, 1.2, 1.5)):
    """Score simulated populations at several sample sizes and pickle the results.

    Two SNP models are drawn once with ``generate_pss_model_simple``. Then,
    for every sample size in ``num_inds_vals``:

    1. ``get_exp_heterogeneity`` is evaluated once per weight function on the
       independent SNP model.
    2. ``runs`` times, an "independent" and a "hetero" population are
       generated, their phenotypes are mean-centered, and the second half of
       the hetero phenotypes is overwritten with normal noise that has the
       mean and standard deviation of the centered hetero phenotypes.
    3. Both populations are scored with every continuous weight function
       (``run_cont_heterogeneity_on_pop``) and with every binary threshold in
       ``bzs`` (``run_heterogeneity_on_pop``).

    Args:
        file_path: Path of the pickle file to write (opened in ``"wb"`` mode).
        runs: Number of simulated replicates per sample size.
        num_snps: Forwarded to ``generate_pss_model_simple``.
        h: Forwarded to ``generate_pss_model_simple``,
            ``generate_population`` and ``get_exp_heterogeneity``.
        num_inds_vals: Sample sizes to simulate; must be re-iterable.
        bzs: Threshold values passed as ``z`` to ``run_heterogeneity_on_pop``;
            used as dict keys, so they must be hashable.

    Returns:
        None. The pickled object is a dict with these keys:

        * ``"hom_thresh_runs"`` / ``"het_thresh_runs"``:
          ``{bz: {num_inds: [score per run]}}``
        * ``"hom_continuous_runs"`` / ``"het_continuous_runs"``:
          ``{weight_name: {num_inds: [score per run]}}``
        * ``"hom_contin_exp_runs"``:
          ``{weight_name: {num_inds: get_exp_heterogeneity result}}``
    """
    independent_snps = generate_pss_model_simple(num_snps, h)
    hetero_snps = generate_pss_model_simple(num_snps, h)

    weight_funcs = {
        "log1": log1,
        "log1p5": log1p5,
        "log3": log3,
        "sigmoid": sigmoid,
        "linear": linear,
        "step": step,
        "polynom2": polynom2,
        "polynom4": polynom4,
        "polynom6": polynom6
    }

    def empty_runs():
        # One fresh list per sample size so replicates never share storage.
        return {ni: [] for ni in num_inds_vals}

    hom_thresh_runs = {bz: empty_runs() for bz in bzs}
    het_thresh_runs = {bz: empty_runs() for bz in bzs}
    hom_continuous_runs = {ws: empty_runs() for ws in weight_funcs}
    het_continuous_runs = {ws: empty_runs() for ws in weight_funcs}
    hom_contin_exp_runs = {
        ws: {ni: None for ni in num_inds_vals} for ws in weight_funcs
    }

    for num_inds in num_inds_vals:
        for ws in weight_funcs:
            print("running exp hom score: " + ws)
            hom_contin_exp_runs[ws][num_inds] = get_exp_heterogeneity(
                num_inds, independent_snps, h, weight_funcs[ws])

        for i in range(0, runs):
            print(str(num_inds) + "-" + str(i))
            independent_pop = generate_population(independent_snps,
                                                  num_inds=num_inds,
                                                  h=h)
            hetero_pop = generate_population(hetero_snps,
                                             num_inds=num_inds,
                                             h=h)

            # mean-center phenos
            independent_pop = (independent_pop[0],
                               np.array(independent_pop[1]) -
                               np.mean(independent_pop[1]))
            hetero_pop = (hetero_pop[0],
                          np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

            # Replace the second half of the hetero phenotypes with noise.
            # The noise is sized from the slice itself so that an odd
            # num_inds (slice one longer than num_inds / 2) still fits.
            het_mean = np.mean(hetero_pop[1])
            het_std = np.std(hetero_pop[1])
            half = int(num_inds / 2)
            hetero_pop[1][half:] = np.random.normal(
                loc=het_mean,
                scale=het_std,
                size=hetero_pop[1][half:].shape)

            for ws in weight_funcs:
                print("num inds:", num_inds, "run:", i, "weight func:", ws)
                hom_continuous_runs[ws][num_inds].append(
                    run_cont_heterogeneity_on_pop(
                        independent_pop,
                        independent_snps,
                        weight_func=weight_funcs[ws]))
                het_continuous_runs[ws][num_inds].append(
                    run_cont_heterogeneity_on_pop(
                        hetero_pop, hetero_snps, weight_func=weight_funcs[ws]))

            for bz in bzs:
                print("num inds:", num_inds, "run:", i, "binary thresh:", bz)
                hom_thresh_runs[bz][num_inds].append(
                    run_heterogeneity_on_pop(independent_pop,
                                             independent_snps,
                                             z=bz))
                het_thresh_runs[bz][num_inds].append(
                    run_heterogeneity_on_pop(hetero_pop, hetero_snps, z=bz))

    with open(file_path, "wb") as f:
        pickle.dump(
            {
                "hom_thresh_runs": hom_thresh_runs,
                "het_thresh_runs": het_thresh_runs,
                "hom_continuous_runs": hom_continuous_runs,
                "het_continuous_runs": het_continuous_runs,
                "hom_contin_exp_runs": hom_contin_exp_runs
            }, f)
Exemplo n.º 2
0
def run(num_inds=50000,
        num_snps=10,
        h=0.1,
        deg=8,
        numtrain=5,
        symmetric=False,
        verbose=False):
    """Hill-climb polynomial weight coefficients on simulated training data.

    ``numtrain`` training sets are simulated up front. Each holds an
    "independent" and a "hetero" population with mean-centered phenotypes;
    the second half of the hetero phenotypes is overwritten with normal noise
    matching the mean and standard deviation of the centered phenotypes.

    A random coefficient vector of length ``deg + 1`` is drawn until
    ``valid_poly`` accepts it. Then 50 proposals are made, each perturbing one
    randomly chosen coefficient. A proposal is kept when it increases the
    objective: mean hetero score minus mean independent score over all
    training sets, both from ``run_cont_heterogeneity_on_pop``.

    Args:
        num_inds: Individuals per simulated population.
        num_snps: Forwarded to ``generate_pss_model_simple``.
        h: Forwarded to ``generate_pss_model_simple`` and
            ``generate_population``.
        deg: Polynomial degree; ``deg + 1`` coefficients are learned.
        numtrain: Number of simulated training sets.
        symmetric: If true, coefficients at odd indices are fixed at 0 and
            never perturbed; also forwarded to ``valid_poly`` and
            ``run_cont_heterogeneity_on_pop``.
        verbose: If true, plot the initial polynomial and every accepted one.

    Returns:
        Tuple ``(best_score_difference, coef_wts)``.
    """

    def draw_coef_index():
        # Pick the coefficient to perturb; with symmetric=True redraw until
        # the index is even, because odd-index coefficients stay at 0.
        idx = np.random.randint(0, deg + 1)
        while symmetric and idx % 2 == 1:
            idx = np.random.randint(0, deg + 1)
        return idx

    def propose(base_wts, scale):
        # Copy the current weights and add normal noise to one coefficient.
        idx = draw_coef_index()
        cand = np.copy(base_wts)
        cand[idx] += np.random.normal(loc=0, scale=scale)
        return cand

    def draw_initial_wts():
        wts = np.random.normal(loc=0, scale=500, size=deg + 1)
        if symmetric:
            wts[1::2] = 0
        return wts

    def mean_scores(wts):
        # Average the independent and hetero scores over all training sets.
        hom_scores = []
        het_scores = []
        for sample in training_data:
            hom_scores.append(
                run_cont_heterogeneity_on_pop(sample["independent_pop"],
                                              sample["independent_snps"],
                                              weight_func=None,
                                              coef_wts=wts,
                                              symmetric=symmetric))
            het_scores.append(
                run_cont_heterogeneity_on_pop(sample["hetero_pop"],
                                              sample["hetero_snps"],
                                              weight_func=None,
                                              coef_wts=wts,
                                              symmetric=symmetric))
        return np.mean(hom_scores), np.mean(het_scores)

    # Sample data
    training_data = []
    for _ in range(numtrain):
        independent_snps = generate_pss_model_simple(num_snps, h)
        hetero_snps = generate_pss_model_simple(num_snps, h)

        independent_pop = generate_population(independent_snps,
                                              num_inds=num_inds,
                                              h=h)
        hetero_pop = generate_population(hetero_snps, num_inds=num_inds, h=h)

        # mean-center phenos
        independent_pop = (independent_pop[0], np.array(independent_pop[1]) -
                           np.mean(independent_pop[1]))
        hetero_pop = (hetero_pop[0],
                      np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

        # Overwrite the second half with noise. The noise is sized from the
        # slice so an odd num_inds does not cause a shape mismatch.
        het_mean = np.mean(hetero_pop[1])
        het_std = np.std(hetero_pop[1])
        half = int(num_inds / 2)
        hetero_pop[1][half:] = np.random.normal(
            loc=het_mean, scale=het_std, size=hetero_pop[1][half:].shape)

        training_data.append({
            "independent_snps": independent_snps,
            "independent_pop": independent_pop,
            "hetero_snps": hetero_snps,
            "hetero_pop": hetero_pop
        })

    plt.ion()
    plot_count = 1
    plot_cnt_tot = 10

    ## Coefficients are learned directly rather than roots
    ## -- This allows coefficients for odd degrees to be set to 0
    ## -- Not all polynomials have maximum number of roots

    # initialization of weight distribution here
    coef_wts = draw_initial_wts()
    while not valid_poly(
            coef_wts, symmetric=symmetric, minval=-0.5, maxval=0.5):
        coef_wts = draw_initial_wts()

    HetScorehom, HetScorehet = mean_scores(coef_wts)
    HetScorediff = HetScorehet - HetScorehom

    if symmetric:
        pc_range = np.linspace(-0.5, 0.5, 500, endpoint=False)
    else:
        pc_range = np.linspace(0, 1, 500, endpoint=False)

    if verbose:
        # coef_wts holds coefficients, not roots, so build the polynomial the
        # same way the post-acceptance plot below does.
        polynom = np.poly1d(coef_wts, r=False)
        plt.scatter(pc_range, [polynom(x) for x in pc_range],
                    color=str(1.0 - plot_count / plot_cnt_tot))
        plt.show(block=False)
        plt.pause(0.05)

    for count in range(1, 51):
        # The first proposal uses a large step; if it is rejected by
        # valid_poly, retries use a much smaller step.
        coef_wts_cand = propose(coef_wts, scale=500)
        while not valid_poly(
                coef_wts_cand, symmetric=symmetric, minval=-0.5, maxval=0.5):
            print(".", )
            coef_wts_cand = propose(coef_wts, scale=10)

        HetScorehom_cand, HetScorehet_cand = mean_scores(coef_wts_cand)
        HetScorediff_cand = HetScorehet_cand - HetScorehom_cand

        print("count: ", count)
        print("HetScorediff:", HetScorediff)
        print("HetScorediff_cand: ", HetScorediff_cand)
        print("coef_wts:", coef_wts)
        print("-" * 20)
        if HetScorediff_cand > HetScorediff:
            HetScorediff = HetScorediff_cand
            coef_wts = coef_wts_cand
            # plot updated value here
            if verbose:
                polynom = np.poly1d(coef_wts, r=False)
                plt.figure()
                plt.scatter(pc_range, [polynom(x) for x in pc_range],
                            color=str(1.0 - plot_count / plot_cnt_tot))
                plt.xlabel("PRS percentile")
                plt.ylabel("individual weight phi")
                plt.show(block=False)
                plt.pause(0.05)
            plot_count = min(plot_count + 1, plot_cnt_tot)

    return HetScorediff, coef_wts
def learn_weight_func(num_snps=10, h=0.1, num_inds=5000):
    """Interactively hill-climb a weight function on one simulated data set.

    One "independent" and one "hetero" population are simulated and their
    phenotypes mean-centered; the second half of the hetero phenotypes is
    overwritten with normal noise matching the centered phenotypes' mean and
    standard deviation. The objective is the hetero score minus the
    independent score from ``run_cont_heterogeneity_on_pop``.

    Two search modes are selected by the local flag ``learn_coefs``
    (currently hard-coded to ``False``):

    * ``learn_coefs=False``: ``numbins`` block weights are perturbed one bin
      at a time; proposals that would make a weight negative are discarded.
      Every time ``count`` exceeds 500 the current block weights are plotted,
      a degree-``deg`` polynomial is fitted to them with ``np.polyfit``, and
      the score obtained from those coefficients is printed.
    * ``learn_coefs=True``: polynomial coefficients are perturbed directly and
      each accepted polynomial is plotted.

    The search loop is ``while True`` with no ``break`` or ``return``, so
    this function never returns on its own; it has to be interrupted.

    Args:
        num_snps: Forwarded to ``generate_pss_model_simple``.
        h: Forwarded to ``generate_pss_model_simple`` and
            ``generate_population``.
        num_inds: Individuals per simulated population.
    """
    independent_snps = generate_pss_model_simple(num_snps, h)
    hetero_snps = generate_pss_model_simple(num_snps, h)

    independent_pop = generate_population(independent_snps,
                                          num_inds=num_inds,
                                          h=h)
    hetero_pop = generate_population(hetero_snps, num_inds=num_inds, h=h)

    # mean-center phenos
    independent_pop = (independent_pop[0], np.array(independent_pop[1]) -
                       np.mean(independent_pop[1]))
    hetero_pop = (hetero_pop[0],
                  np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

    # Overwrite the second half with noise. The noise is sized from the slice
    # so an odd num_inds does not cause a shape mismatch.
    het_mean = np.mean(hetero_pop[1])
    het_std = np.std(hetero_pop[1])
    half = int(num_inds / 2)
    hetero_pop[1][half:] = np.random.normal(
        loc=het_mean, scale=het_std, size=hetero_pop[1][half:].shape)

    ###########################
    ### Test to find best w ###
    ###########################
    learn_coefs = False
    symmetric = False
    if not learn_coefs:  # don't apply symmetry to the bin weights method
        symmetric = False

    plt.ion()
    count = 0
    plot_count = 1
    plot_cnt_tot = 50 if not learn_coefs else 10
    deg = 4

    ## when learn_coef=True, coefficients are learned directly rather than roots
    ## -- This allows coefficients for odd degrees to be set to 0
    ## -- Not all polynomials have maximum number of roots

    # initialization of weight distribution here
    if learn_coefs:
        coef_wts = np.random.normal(loc=0, scale=10, size=deg + 1)
        if symmetric:
            coef_wts[1::2] = 0

        # Redraws use a narrower distribution (scale=1) than the first draw.
        while not valid_poly(
                coef_wts, symmetric=symmetric, minval=-0.5, maxval=0.5):
            coef_wts = np.random.normal(loc=0, scale=1, size=deg + 1)
            if symmetric:
                coef_wts[1::2] = 0
        HSC_hom = run_cont_heterogeneity_on_pop(independent_pop,
                                                independent_snps,
                                                weight_func=None,
                                                coef_wts=coef_wts,
                                                symmetric=symmetric)
        HSC_het = run_cont_heterogeneity_on_pop(hetero_pop,
                                                hetero_snps,
                                                weight_func=None,
                                                coef_wts=coef_wts,
                                                symmetric=symmetric)
    else:  # block coef
        numbins = 40
        # Start from a step function: lower half of the bins 0, upper half 1.
        block_wts = np.array([0.0] * int(numbins / 2) +
                             [1.0] * int(numbins / 2))

        HSC_hom = run_cont_heterogeneity_on_pop(independent_pop,
                                                independent_snps,
                                                weight_func=None,
                                                block_wts=block_wts,
                                                numbins=numbins)
        HSC_het = run_cont_heterogeneity_on_pop(hetero_pop,
                                                hetero_snps,
                                                weight_func=None,
                                                block_wts=block_wts,
                                                numbins=numbins)
    HSC_diff = HSC_het - HSC_hom

    if symmetric:
        pc_range = np.linspace(-0.5, 0.5, 500, endpoint=False)
    else:
        pc_range = np.linspace(0, 1, 500, endpoint=False)

    if learn_coefs:
        # coef_wts holds coefficients, not roots, matching the plot made
        # after each accepted proposal below.
        polynom = np.poly1d(coef_wts, r=False)
        plt.scatter(pc_range, [polynom(x) for x in pc_range],
                    color=str(1.0 - plot_count / plot_cnt_tot))
    else:
        plt.scatter(pc_range,
                    block_wts[[int(x)
                               for x in np.floor((pc_range) * numbins)]],
                    color=str(1.0 - plot_count / plot_cnt_tot))
    plt.show(block=False)
    plt.pause(0.05)
    if symmetric:
        plt.figure()
    while True:
        count += 1

        if learn_coefs:
            inc = np.random.randint(0, deg + 1)
            if symmetric and inc % 2 == 1:
                while inc % 2 == 1:
                    inc = np.random.randint(0, deg + 1)
            coef_wts_cand = np.copy(coef_wts)
            coef_wts_cand[inc] += np.random.normal(loc=0, scale=10)
            # check that this is valid
            while not valid_poly(
                    coef_wts_cand, symmetric=symmetric, minval=-0.5,
                    maxval=0.5):
                print(".", )
                inc = np.random.randint(0, deg + 1)
                if symmetric and inc % 2 == 1:
                    while inc % 2 == 1:
                        inc = np.random.randint(0, deg + 1)
                coef_wts_cand = np.copy(coef_wts)
                coef_wts_cand[inc] += np.random.normal(loc=0, scale=10)
            HSC_hom_cand = run_cont_heterogeneity_on_pop(
                independent_pop,
                independent_snps,
                weight_func=None,
                coef_wts=coef_wts_cand,
                symmetric=symmetric)
            HSC_het_cand = run_cont_heterogeneity_on_pop(
                hetero_pop,
                hetero_snps,
                weight_func=None,
                coef_wts=coef_wts_cand,
                symmetric=symmetric)
        else:
            inc = np.random.randint(0, numbins)
            block_wts_cand = np.copy(block_wts)

            candidate = block_wts_cand[inc] + np.random.normal(loc=0,
                                                               scale=0.01)
            # Block weights are kept non-negative; a negative proposal is
            # dropped (the iteration still counts towards `count`).
            if candidate < 0:
                continue
            block_wts_cand[inc] = candidate

            HSC_hom_cand = run_cont_heterogeneity_on_pop(
                independent_pop,
                independent_snps,
                weight_func=None,
                block_wts=block_wts_cand,
                numbins=numbins)
            HSC_het_cand = run_cont_heterogeneity_on_pop(
                hetero_pop,
                hetero_snps,
                weight_func=None,
                block_wts=block_wts_cand,
                numbins=numbins)
        HSC_diff_cand = HSC_het_cand - HSC_hom_cand
        if learn_coefs:
            print("count: ", count)
            print("HSC_diff:", HSC_diff)
            print("HSC_diff_cand: ", HSC_diff_cand)
            print("coef_wts:", coef_wts)
            print("-" * 20)
        if HSC_diff_cand > HSC_diff:
            HSC_diff = HSC_diff_cand
            if learn_coefs:
                coef_wts = coef_wts_cand
                # plot updated value here
                polynom = np.poly1d(coef_wts, r=False)
                plt.scatter(pc_range, [polynom(x) for x in pc_range],
                            color=str(1.0 - plot_count / plot_cnt_tot))
                plt.xlabel("PRS percentile")
                plt.ylabel("individual weight phi")
                plt.show(block=False)
                plt.pause(0.05)
                plot_count = min(plot_count + 1, plot_cnt_tot)
            else:
                block_wts = block_wts_cand

        if not learn_coefs and count > 500:
            # Periodic report for the block-weight search.
            count = 0
            if plot_count < plot_cnt_tot:
                plot_count += 1
            print("-" * 20)
            print("HSC diff:", HSC_diff)

            coefs = np.polyfit(np.linspace(0, 1, numbins), block_wts, deg=deg)
            print("poly coefs:", coefs)

            # score from the poly coefs directly
            try:
                poly_hom = run_cont_heterogeneity_on_pop(independent_pop,
                                                         independent_snps,
                                                         weight_func=None,
                                                         coef_wts=coefs)
                poly_het = run_cont_heterogeneity_on_pop(hetero_pop,
                                                         hetero_snps,
                                                         weight_func=None,
                                                         coef_wts=coefs)
                print("poly coef HSC:", poly_het - poly_hom)
            except Exception:
                # Only ordinary errors are reported as "invalid".
                # KeyboardInterrupt / SystemExit must propagate, since
                # interrupting is the only way out of this endless loop.
                print("poly coef HSC: invalid")
            plt.scatter(
                pc_range,
                block_wts[[int(x) for x in np.floor((pc_range) * numbins)]],
                color=str(1.0 - plot_count / plot_cnt_tot))
            polynom = np.poly1d(coefs)
            plt.plot(np.linspace(0, 1, numbins),
                     polynom(np.linspace(0, 1, numbins)),
                     color=str(1.0 - plot_count / plot_cnt_tot))
            plt.xlabel("PRS percentile")
            plt.ylabel("individual weight phi")
            plt.show(block=False)
            plt.pause(0.05)
Exemplo n.º 4
0
def test_funcs(filename,
               numtrain=20,
               num_snps=10,
               h=0.1,
               num_inds=100000,
               symmetric=False,
               verbose=True):
    """Evaluate every candidate polynomial listed in a file on simulated data.

    ``numtrain`` evaluation sets are simulated first. Each holds an
    "independent" and a "hetero" population with mean-centered phenotypes;
    the second half of the hetero phenotypes is overwritten with normal noise
    matching the centered phenotypes' mean and standard deviation.

    Each non-blank line of ``filename`` must look like
    ``<label>:<c0>,<c1>,...``; the text between the first and second ``:`` is
    parsed as comma-separated floats and passed as ``coef_wts`` to
    ``run_cont_heterogeneity_on_pop``. For every candidate the function
    records the mean of (hetero score - independent score) over the
    evaluation sets and the standard deviation of the independent scores,
    prints them, and appends a row ``count|mean_diff|hom_std|filename`` to
    ``evaluated_candidates.txt`` in the current working directory.

    Args:
        filename: Path of the candidate file.
        numtrain: Number of simulated evaluation sets.
        num_snps: Forwarded to ``generate_pss_model_simple``.
        h: Forwarded to ``generate_pss_model_simple`` and
            ``generate_population``.
        num_inds: Individuals per simulated population.
        symmetric: Forwarded to ``run_cont_heterogeneity_on_pop``.
        verbose: If true, show an error-bar plot of the mean differences,
            using the independent-score standard deviations as ``yerr``.

    Returns:
        None.
    """
    training_data = []
    for _ in range(numtrain):
        independent_snps = generate_pss_model_simple(num_snps, h)
        hetero_snps = generate_pss_model_simple(num_snps, h)

        independent_pop = generate_population(independent_snps,
                                              num_inds=num_inds,
                                              h=h)
        hetero_pop = generate_population(hetero_snps, num_inds=num_inds, h=h)

        # mean-center phenos
        independent_pop = (independent_pop[0], np.array(independent_pop[1]) -
                           np.mean(independent_pop[1]))
        hetero_pop = (hetero_pop[0],
                      np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

        # Overwrite the second half with noise. The noise is sized from the
        # slice so an odd num_inds does not cause a shape mismatch.
        het_mean = np.mean(hetero_pop[1])
        het_std = np.std(hetero_pop[1])
        half = int(num_inds / 2)
        hetero_pop[1][half:] = np.random.normal(
            loc=het_mean, scale=het_std, size=hetero_pop[1][half:].shape)

        training_data.append({
            "independent_snps": independent_snps,
            "independent_pop": independent_pop,
            "hetero_snps": hetero_snps,
            "hetero_pop": hetero_pop
        })

    # test all polynoms in file
    coef_cands = []
    HetScorecands = []
    HetScorecand_stds = []
    count = 1
    with open(filename) as cand_file:
        for line in cand_file:
            # A blank line has no ":" field to parse; skip it instead of
            # failing with IndexError.
            if not line.strip():
                continue
            coef_wts = [float(x) for x in line.split(":")[1].split(",")]

            scr_diffs = []
            scr_homs = []
            for sample in training_data:
                HetScorehom = run_cont_heterogeneity_on_pop(
                    sample["independent_pop"],
                    sample["independent_snps"],
                    weight_func=None,
                    coef_wts=coef_wts,
                    symmetric=symmetric)
                HetScorehet = run_cont_heterogeneity_on_pop(
                    sample["hetero_pop"],
                    sample["hetero_snps"],
                    weight_func=None,
                    coef_wts=coef_wts,
                    symmetric=symmetric)
                scr_diffs.append(HetScorehet - HetScorehom)
                scr_homs.append(HetScorehom)
            coef_cands.append(coef_wts)
            HetScorecands.append(np.mean(scr_diffs))
            HetScorecand_stds.append(np.std(scr_homs))
            print(count, HetScorecands[-1], HetScorecand_stds[-1])
            # Re-opened in append mode for every candidate, so each result
            # is written out as soon as it is available. A distinct name
            # keeps the input file handle from being shadowed.
            with open("evaluated_candidates.txt", "a") as out_file:
                out_file.write("%s|%s|%s|%s\n" %
                               (count, HetScorecands[-1],
                                HetScorecand_stds[-1], filename))
            count += 1

    if verbose:
        plt.errorbar(range(len(coef_cands)),
                     HetScorecands,
                     yerr=HetScorecand_stds)
        plt.show()