Exemplo n.º 1
0
def _main(args):
    """Compute a lookup table for a size history and write it to an HDF file.

    The size history is taken from exactly one source: ``args.msmc_file``,
    ``args.smcpp_file``, or ``args.popsizes`` (optionally combined with
    ``args.epochtimes``).  Sizes and times are rescaled by a reference size
    derived from ``args.mu``, passed through ``decimate_sizes``, and then
    handed to ``LookupTable``.  The resulting table is optionally
    downsampled to ``args.samplesize`` and stored under the key ``'ldtable'``
    in ``args.outfile``.

    Args:
        args: Parsed command-line namespace.  Attributes read here are
            ``mu``, ``msmc_file``, ``smcpp_file``, ``epochtimes``,
            ``popsizes``, ``decimate_rel_tol``, ``decimate_anc_size``,
            ``samplesize``, ``moran_pop_size``, ``approx``, ``numthreads``,
            ``store_stationary``, ``load_stationary`` and ``outfile``.

    Raises:
        IOError: If more than one size-history source is given, if none is
            given, if the number of sizes is not one more than the number of
            epoch times, or if ``moran_pop_size`` is used without
            ``approx`` or is smaller than ``samplesize``.
    """
    # The same constant defines the reference size and is passed to
    # LookupTable below, so it is kept in a single place.
    # NOTE(review): presumably the scaled mutation rate of the table; confirm.
    base_rate = 0.0005
    n_ref = base_rate / (4. * args.mu)

    too_many_sources = ('Can only specify one of msmc_file, smcpp_file, or '
                        'popsizes')
    if args.msmc_file:
        if args.smcpp_file or args.epochtimes or args.popsizes:
            raise IOError(too_many_sources)
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        # msmc_file is known to be unset in this branch.
        if args.epochtimes or args.popsizes:
            raise IOError(too_many_sources)
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        # Fail with a clear message instead of an AttributeError on None.
        if not args.popsizes:
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        times = []
        if args.epochtimes:
            times = [float(t) for t in args.epochtimes.split(',')]
        pop_sizes = [float(p) for p in args.popsizes.split(',')]

    # Rescale relative to the reference size (times by 2 * n_ref).
    times = [t / (2. * n_ref) for t in times]
    pop_sizes = [p / n_ref for p in pop_sizes]

    if len(pop_sizes) != len(times) + 1:
        raise IOError('Number of population sizes must match '
                      'number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    # One row per epoch: size, left boundary, right boundary.
    history_rows = '\n'.join(
        str(p) + '\t' + str(t1) + '\t' + str(t2)
        for p, t1, t2 in zip(pop_sizes,
                             [0] + times,
                             times + [float('inf')]))
    logging.info('Size history to be used when computing lookup table is\n'
                 'Scaled Size\tScaled Left Time\tScaled Right Time\n%s',
                 history_rows)

    max_size = args.samplesize
    num_particles = max_size
    if args.moran_pop_size:
        if not args.approx:
            # moran_pop_size is only meaningful for approximate tables.
            raise IOError('Cannot use moran_pop_size when computing an exact '
                          'lookup table.  Turn on the --approx flag.')
        if max_size > args.moran_pop_size:
            raise IOError('moran_pop_size must be at least as large as the '
                          'desired sample size.')
        num_particles = args.moran_pop_size

    # Grid: 0.0 to 9.9 in steps of 0.1, then the integers 10 through 100.
    rho_grid = [i * .1 for i in range(100)] + list(range(10, 101))
    logging.info('Beginning Lookup Table.  This may take a while')
    table = LookupTable(num_particles, base_rate, rho_grid, pop_sizes,
                        times, not args.approx, args.numthreads,
                        store_stationary=args.store_stationary,
                        load_stationary=args.load_stationary).table
    logging.info('\t...complete')
    # Rescale the column labels by the same reference size used above.
    table.columns /= 4. * n_ref
    if num_particles > max_size:
        logging.info('Downsampling')
        table = downsample(table, max_size)
        logging.info('\t...complete')
    table.to_hdf(args.outfile, 'ldtable', mode='w')
Exemplo n.º 2
0
def test_read_smcpp():
    """Check the first and last three sizes/times parsed from the fixture.

    Reads ``ACB_pop_sizes.csv`` with ``read_smcpp`` and compares the head and
    tail of both returned sequences against hard-coded reference values.
    """
    sizes, times = read_smcpp(joinpath(THIS_DIR, 'ACB_pop_sizes.csv'))

    # (observed slice, reference values) pairs, checked in order.
    checks = (
        (sizes[0:3],
         [138482.84333082315, 138482.84333082315, 139331.82583178935]),
        (sizes[-3:],
         [19408.187247411068, 20959.43140840318, 23058.569473392425]),
        (times[0:3],
         [50.0, 53.97505585700569, 58.2661330953377]),
        (times[-3:],
         [83485.36048509754, 90122.53990850793, 97287.38251073883]),
    )
    for observed, reference in checks:
        assert np.allclose(observed, reference)
Exemplo n.º 3
0
def test_decimate_sizes():
    """Exercise ``decimate_sizes`` on the fixture history at three settings."""
    sizes, times = read_smcpp(joinpath(THIS_DIR, 'ACB_pop_sizes.csv'))

    # Zero tolerance, no ancestral size: the output is expected to equal the
    # input with its first entry dropped.
    dec_sizes, dec_times = decimate_sizes(sizes, times, 0.0, None)
    assert np.allclose(sizes[1:], dec_sizes)
    assert np.allclose(times[1:], dec_times)

    # Zero tolerance with an ancestral size of 1.0: as above, except the
    # last size is expected to be 1.0.
    dec_sizes, dec_times = decimate_sizes(sizes, times, 0.0, 1.0)
    assert np.allclose(sizes[1:-1], dec_sizes[:-1])
    assert np.allclose(1.0, dec_sizes[-1])
    assert np.allclose(times[1:], dec_times)

    # Tolerance 0.25: walk the original epochs, advancing a cursor into the
    # decimated history whenever the original time passes the current
    # decimated time, and require the matched size to be within tolerance.
    dec_sizes, dec_times = decimate_sizes(sizes, times, 0.25, None)
    cursor = 0
    for pos, epoch_time in enumerate(times):
        if epoch_time > dec_times[cursor]:
            cursor += 1
            assert dec_times[cursor] >= epoch_time
        original_size = sizes[pos]
        deviation = np.abs((original_size - dec_sizes[cursor]))
        deviation = deviation / original_size
        assert deviation < 0.25
Exemplo n.º 4
0
def _main(args):
    """Score block-penalty / window-size settings on simulated data.

    Loads the lookup table from ``args.tablefile``, builds a size history
    from exactly one of ``args.msmc_file``, ``args.smcpp_file`` or
    ``args.popsizes`` (optionally with ``args.epochtimes``), simulates
    ``args.num_sims`` data sets with ``_simulate_data``, runs
    ``_call_optimize`` for every (block penalty, window size) pair, scores
    the estimates with ``_score`` and writes one tab-separated row per pair
    to ``args.outfile`` (or to standard output when no file is given).

    Args:
        args: Parsed command-line namespace.  Attributes read here are
            ``tablefile``, ``samplesize``, ``blockpenalty``, ``windowsize``,
            ``msmc_file``, ``smcpp_file``, ``epochtimes``, ``popsizes``,
            ``mu``, ``decimate_rel_tol``, ``decimate_anc_size``,
            ``numthreads``, ``ploidy``, ``num_sims``, ``metawindow``,
            ``overlap`` and ``outfile``.

    Raises:
        IOError: If the table is smaller than ``samplesize``, if more than
            one size-history source is given, if none is given, or if the
            number of sizes is not one more than the number of epoch times.
    """
    table = read_hdf(args.tablefile, 'ldtable')
    # The first index label is a whitespace-separated list of integers whose
    # sum is compared against the requested sample size.
    table_size = sum(map(int, table.index.values[0].split()))
    if table_size < args.samplesize:
        raise IOError('Lookup table was constructed for {} haploids, '
                      'but --samplesize is {} haploids.  Either build '
                      'a larger lookup table or simulate fewer '
                      'individuals.'.format(table_size, args.samplesize))
    # Rescale the column labels so that the largest one becomes 100.
    max_rho = table.columns[-1]
    table.columns *= 100. / max_rho
    block_penalties = [float(b) for b in args.blockpenalty.split(',')]
    window_sizes = [float(w) for w in args.windowsize.split(',')]
    logging.info('Searching over Windowsizes %s, and Block Penalties %s',
                 window_sizes, block_penalties)

    too_many_sources = ('Can only specify one of msmc_file, smcpp_file, or '
                        'popsizes')
    if args.msmc_file:
        if args.smcpp_file or args.epochtimes or args.popsizes:
            raise IOError(too_many_sources)
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        # msmc_file is known to be unset in this branch.
        if args.epochtimes or args.popsizes:
            raise IOError(too_many_sources)
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        # Fail with a clear message instead of an AttributeError on None.
        if not args.popsizes:
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes = [float(p) for p in args.popsizes.split(',')]
        times = []
        if args.epochtimes:
            times = [float(t) for t in args.epochtimes.split(',')]
    if len(pop_sizes) != len(times) + 1:
        raise IOError('Number of population sizes must '
                      'match number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    pop_config = [
        msprime.PopulationConfiguration(sample_size=args.samplesize,
                                        initial_size=pop_sizes[0])]
    # One size change per epoch boundary; the times are doubled before being
    # passed on.  NOTE(review): presumably a time-unit conversion; confirm.
    demography = [
        msprime.PopulationParametersChange(time=time * 2,
                                           initial_size=pop_size,
                                           population_id=0)
        for pop_size, time in zip(pop_sizes[1:], times)]
    reco_maps = _load_hapmap()

    pool = Pool(args.numthreads, maxtasksperchild=100)
    try:
        logging.info('Simulating data...')
        simulation_args = [((pop_config, args.mu, demography, args.ploidy),
                            reco_maps) for _ in range(args.num_sims)]
        test_set = list(pool.imap(_simulate_data, simulation_args,
                                  chunksize=10))
        logging.info('\tdone simulating')
        scores = {}
        for block_penalty in block_penalties:
            for window_size in window_sizes:
                optimize = partial(_call_optimize,
                                   metawindow=args.metawindow,
                                   windowsize=window_size,
                                   table=table,
                                   ploidy=args.ploidy,
                                   bpen=block_penalty,
                                   overlap=args.overlap,
                                   max_rho=max_rho)
                estimates = list(pool.imap(optimize, test_set, chunksize=10))
                scores[(block_penalty,
                        window_size)] = _score(estimates,
                                               [ts[1] for ts in test_set],
                                               [ts[2] for ts in test_set],
                                               pool)
        # Normal completion: stop accepting work and let workers exit.
        pool.close()
    except BaseException:
        # Do not wait for outstanding tasks once something has failed.
        pool.terminate()
        raise
    finally:
        pool.join()

    header = ['Block_Penalty',
              'Window_Size',
              'Pearson_Corr_1bp',
              'Pearson_Corr_10kb',
              'Pearson_Corr_100kb',
              'Log_Pearson_Corr_1bp',
              'Log_Pearson_Corr_10kb',
              'Log_Pearson_Corr_100kb',
              'Spearman_Corr_1bp',
              'Spearman_Corr_10kb',
              'Spearman_Corr_100kb',
              'L2',
              'Log_L2']
    ofile = open(args.outfile, 'w') if args.outfile else sys.stdout
    try:
        ofile.write('\t'.join(header) + '\n')
        for block_penalty, window_size in sorted(scores):
            line = ([block_penalty, window_size]
                    + scores[block_penalty, window_size])
            ofile.write('\t'.join(map(str, line)) + '\n')
    finally:
        # Only close a file opened here; never close standard output.
        if args.outfile:
            ofile.close()