예제 #1
0
def test_read_msmc():
    """Check ``read_msmc`` against reference values for two scalings.

    The fixture ``msmc_test.final.txt`` is read once with a scaling
    argument of 1.0 and once with 1.25e-8; in both cases the returned
    sizes and times must be close to the reference values divided by
    that argument.
    """
    expected_sizes = np.array([
        0.00086526447673996, 9.79088625144171e-05, 0.000171382542974173,
        0.000457984501804459, 0.000272927253969727, 0.000131071906047658,
        6.82882858274492e-05, 4.10030998343475e-05, 2.79371748811273e-05,
        2.12348489352847e-05, 1.99020025394955e-05, 1.99020025394955e-05,
        2.2919799039202e-05, 2.2919799039202e-05, 2.98841094236551e-05,
        2.98841094236551e-05, 4.04164511122607e-05, 4.04164511122607e-05,
        5.57810348943842e-05, 5.57810348943842e-05, 8.00343187158654e-05,
        8.00343187158654e-05, 0.000116691016192045, 0.000116691016192045,
        0.000167278907468, 0.000167278907468, 0.000226324223028942,
        0.000226324223028942, 0.000287970327537451, 0.000287970327537451,
        0.000345006037605658, 0.000345006037605658, 0.000392215310516861,
        0.000392215310516861, 0.000424300540558889, 0.000424300540558889,
        0.000447447313078885, 0.000447447313078885, 0.000841088166225936,
        0.000841088166225936
    ])
    expected_times = np.array([
        1.58858e-06, 3.21843e-06, 4.89174e-06, 6.61091e-06, 8.3785e-06,
        1.01973e-05, 1.20705e-05, 1.40013e-05, 1.59934e-05, 1.80508e-05,
        2.01779e-05, 2.23798e-05, 2.46617e-05, 2.70297e-05, 2.94906e-05,
        3.2052e-05, 3.47225e-05, 3.75116e-05, 4.04305e-05, 4.34919e-05,
        4.67103e-05, 5.01028e-05, 5.36893e-05, 5.74932e-05, 6.15427e-05,
        6.58717e-05, 7.05216e-05, 7.5544e-05, 8.10035e-05, 8.69838e-05,
        9.35947e-05, 0.000100985, 0.000109364, 0.000119036, 0.000130476,
        0.000144477, 0.000162528, 0.000187969, 0.000231461
    ])

    # Dividing by 1.0 is exact, so the first pass compares against the
    # unscaled reference values.
    for scale in (1.0, 1.25e-8):
        fixture = joinpath(THIS_DIR, 'msmc_test.final.txt')
        got_sizes, got_times = read_msmc(fixture, scale)
        assert np.allclose(got_sizes, expected_sizes / scale)
        assert np.allclose(got_times, expected_times / scale)
예제 #2
0
def test_issue6():
    """Regression check: decimation keeps the first epoch time.

    Reads ``issue6_msmc.txt`` unscaled, rescales times by ``2 * N_ref``
    and sizes by ``N_ref``, runs ``decimate_sizes`` with a relative
    tolerance of 0.0 and no ancestral size, and verifies that the first
    returned time equals the rescaled value 1.85199e-07.
    """
    N_ref = .0005 / 4.
    raw_sizes, raw_times = read_msmc(joinpath(THIS_DIR, 'issue6_msmc.txt'),
                                     1.0)

    scaled_times = []
    for t in raw_times:
        scaled_times.append(t / (2. * N_ref))
    scaled_sizes = []
    for p in raw_sizes:
        scaled_sizes.append(p / N_ref)

    # Only the decimated times are inspected; the sizes are discarded.
    _, kept_times = decimate_sizes(scaled_sizes, scaled_times, 0.0, None)
    expected_first = 1.85199e-07 / (2. * N_ref)
    assert np.abs(kept_times[0] - expected_first) < 1e-13
예제 #3
0
def _main(args):
    """Build a lookup table for a size history and write it to HDF5.

    The size history comes from exactly one of ``args.msmc_file``,
    ``args.smcpp_file``, or ``args.popsizes`` (optionally together with
    ``args.epochtimes``).  Times are rescaled by ``2 * n_ref`` and sizes
    by ``n_ref`` where ``n_ref = 0.0005 / (4 * args.mu)``, the history is
    passed through ``decimate_sizes``, and the resulting ``LookupTable``
    is (optionally downsampled and) stored under the key ``'ldtable'`` in
    ``args.outfile``.

    Args:
        args: Parsed command-line namespace.  Attributes read here are
            ``mu``, ``msmc_file``, ``smcpp_file``, ``epochtimes``,
            ``popsizes``, ``decimate_rel_tol``, ``decimate_anc_size``,
            ``samplesize``, ``moran_pop_size``, ``approx``,
            ``numthreads``, ``store_stationary``, ``load_stationary`` and
            ``outfile``.

    Raises:
        IOError: If more than one (or none) of the size-history sources
            is given, if the number of sizes is not one more than the
            number of epoch times, or if ``moran_pop_size`` is used
            without ``approx`` or is smaller than ``samplesize``.
    """
    n_ref = 0.0005 / (4. * args.mu)
    if args.msmc_file:
        if args.smcpp_file or args.epochtimes or args.popsizes:
            raise IOError('Can only specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        # msmc_file is necessarily unset on this branch, so only the
        # manual-history options need checking.
        if args.epochtimes or args.popsizes:
            raise IOError('Can only specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        # Without this guard a missing --popsizes surfaces as an opaque
        # AttributeError from ``None.split``.
        if not args.popsizes:
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        times = []
        if args.epochtimes:
            times = list(map(float, args.epochtimes.split(',')))
        pop_sizes = list(map(float, args.popsizes.split(',')))

    times = [t / (2. * n_ref) for t in times]
    pop_sizes = [p / n_ref for p in pop_sizes]

    if len(pop_sizes) != len(times) + 1:
        raise IOError('Number of population sizes must match '
                      'number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    # One row per epoch: size, left boundary, right boundary.  The first
    # epoch starts at 0 and the last one extends to infinity.
    history_rows = [str(p) + '\t' + str(t1) + '\t' + str(t2)
                    for p, t1, t2 in zip(pop_sizes,
                                         [0] + times,
                                         times + [float('inf')])]
    logging.info('Size history to be used when computing lookup table is\n'
                 + 'Scaled Size\tScaled Left Time\tScaled Right Time\n'
                 + '\n'.join(history_rows))

    max_size = args.samplesize
    num_particles = max_size
    if args.moran_pop_size:
        if not args.approx:
            # The approximation is what makes a larger Moran population
            # usable, so the remedy is to enable it.
            raise IOError('Cannot use moran_pop_size when computing an exact '
                          'lookup table.  Turn on the --approx flag.')
        if max_size > args.moran_pop_size:
            raise IOError('moran_pop_size must be at least as large as the '
                          'desired sample size.')
        num_particles = args.moran_pop_size

    # Grid: 0.0 to 9.9 in steps of 0.1, then the integers 10 through 100.
    rho_grid = [i * .1 for i in range(100)] + list(range(10, 101))
    logging.info('Beginning Lookup Table.  This may take a while')
    # NOTE(review): the sixth positional argument is ``not args.approx``,
    # presumably an "exact" switch -- confirm against LookupTable.
    table = LookupTable(num_particles, 0.0005, rho_grid, pop_sizes,
                        times, not args.approx, args.numthreads,
                        store_stationary=args.store_stationary,
                        load_stationary=args.load_stationary).table
    logging.info('\t...complete')
    table.columns /= 4. * n_ref
    if num_particles > max_size:
        logging.info('Downsampling')
        table = downsample(table, max_size)
        logging.info('\t...complete')
    # ``key`` is passed by keyword: newer pandas deprecates positional
    # arguments to ``to_hdf`` other than the path.
    table.to_hdf(args.outfile, key='ldtable', mode='w')
예제 #4
0
def _main(args):
    """Score window-size / block-penalty combinations on simulated data.

    Loads the lookup table stored under ``'ldtable'`` in
    ``args.tablefile``, obtains a size history from exactly one of
    ``args.msmc_file``, ``args.smcpp_file`` or ``args.popsizes``
    (optionally with ``args.epochtimes``), simulates ``args.num_sims``
    data sets, runs ``_call_optimize`` on every data set for each
    (block penalty, window size) pair, scores the estimates with
    ``_score`` and writes one tab-separated row per pair to
    ``args.outfile`` (or standard output when no outfile is given).

    Args:
        args: Parsed command-line namespace.  Attributes read here are
            ``tablefile``, ``samplesize``, ``blockpenalty``,
            ``windowsize``, ``msmc_file``, ``smcpp_file``, ``epochtimes``,
            ``popsizes``, ``mu``, ``decimate_rel_tol``,
            ``decimate_anc_size``, ``numthreads``, ``ploidy``,
            ``num_sims``, ``metawindow``, ``overlap`` and ``outfile``.

    Raises:
        IOError: If the lookup table is smaller than ``samplesize``, if
            more than one (or none) of the size-history sources is given,
            or if the number of sizes is not one more than the number of
            epoch times.
    """
    table = read_hdf(args.tablefile, 'ldtable')
    # The first index label is split on whitespace and its integer fields
    # are summed to obtain the sample size the table was built for.
    table_size = sum(map(int, table.index.values[0].split()))
    if table_size < args.samplesize:
        raise IOError('Lookup table was constructed for {} haploids, '
                      'but --samplesize is {} haploids.  Either build '
                      'a larger lookup table or simulate fewer '
                      'individuals.'.format(table_size, args.samplesize))
    # Rescale the column labels so that the largest becomes 100.
    max_rho = table.columns[-1]
    table.columns *= 100. / max_rho
    block_penalties = list(map(float, args.blockpenalty.split(',')))
    window_sizes = list(map(float, args.windowsize.split(',')))
    logging.info('Searching over Windowsizes %s, and Block Penalties %s',
                 window_sizes, block_penalties)
    if args.msmc_file:
        if args.smcpp_file or args.epochtimes or args.popsizes:
            raise IOError('Can only specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        # msmc_file is necessarily unset on this branch, so only the
        # manual-history options need checking.
        if args.epochtimes or args.popsizes:
            raise IOError('Can only specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        # Without this guard a missing --popsizes surfaces as an opaque
        # AttributeError from ``None.split``.
        if not args.popsizes:
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes = list(map(float, args.popsizes.split(',')))
        times = []
        if args.epochtimes:
            times = list(map(float, args.epochtimes.split(',')))
    if len(pop_sizes) != len(times) + 1:
        raise IOError('Number of population sizes must '
                      'match number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    pop_config = [
        msprime.PopulationConfiguration(sample_size=args.samplesize,
                                        initial_size=pop_sizes[0])]
    # One size change per epoch boundary after the first epoch.
    # NOTE(review): times are doubled before being handed to msprime,
    # presumably a unit conversion -- confirm.
    demography = [
        msprime.PopulationParametersChange(time=time * 2,
                                           initial_size=pop_size,
                                           population_id=0)
        for pop_size, time in zip(pop_sizes[1:], times)]
    reco_maps = _load_hapmap()

    pool = Pool(args.numthreads, maxtasksperchild=100)
    try:
        logging.info('Simulating data...')
        simulation_args = [((pop_config, args.mu, demography, args.ploidy),
                            reco_maps) for _ in range(args.num_sims)]
        test_set = list(pool.imap(_simulate_data,
                                  simulation_args,
                                  chunksize=10))
        logging.info('\tdone simulating')
        scores = {}
        for block_penalty in block_penalties:
            for window_size in window_sizes:
                optimize = partial(_call_optimize,
                                   metawindow=args.metawindow,
                                   windowsize=window_size,
                                   table=table,
                                   ploidy=args.ploidy,
                                   bpen=block_penalty,
                                   overlap=args.overlap,
                                   max_rho=max_rho)
                estimates = list(pool.imap(optimize, test_set, chunksize=10))
                scores[(block_penalty,
                        window_size)] = _score(estimates,
                                               [ts[1] for ts in test_set],
                                               [ts[2] for ts in test_set],
                                               pool)
    except BaseException:
        # Abandon outstanding work rather than waiting for it.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Reap the worker processes so they do not outlive this call.
        pool.join()

    header = ['Block_Penalty',
              'Window_Size',
              'Pearson_Corr_1bp',
              'Pearson_Corr_10kb',
              'Pearson_Corr_100kb',
              'Log_Pearson_Corr_1bp',
              'Log_Pearson_Corr_10kb',
              'Log_Pearson_Corr_100kb',
              'Spearman_Corr_1bp',
              'Spearman_Corr_10kb',
              'Spearman_Corr_100kb',
              'L2',
              'Log_L2']
    ofile = open(args.outfile, 'w') if args.outfile else sys.stdout
    try:
        ofile.write('\t'.join(header) + '\n')
        for block_penalty, window_size in sorted(scores):
            line = ([block_penalty, window_size]
                    + scores[block_penalty, window_size])
            ofile.write('\t'.join(map(str, line)) + '\n')
    finally:
        # Only close handles opened here; never close sys.stdout.
        if args.outfile:
            ofile.close()