cutoff = 2

# Number of optimization runs per model; repeating a fit guards against a
# single run that fails to converge.
rep_pp = 2
rep_pp_px = 2

# Restrict the analysis to the trio individuals.
names = [
    "NA19700", "NA19701",
    "NA19704", "NA19703",
    "NA19819", "NA19818",
    "NA19835", "NA19834",
    "NA19901", "NA19900",
    "NA19909", "NA19908",
    "NA19917", "NA19916",
    "NA19713", "NA19982",
    "NA20127", "NA20126",
    "NA20357", "NA20356",
]

# Chromosome labels "1" through "22", as strings.
chroms = [str(i) for i in range(1, 23)]

# Load the population from the ".bed" files found under `directory`.
pop = tracts.population(
    names=names,
    fname=(directory, "", ".bed"),
    selectchrom=chroms,
)
bins, data = pop.get_global_tractlengths(npts=50)

# Fix the order of the populations and arrange the tract-length data to match.
labels = ['EUR', 'AFR']
data = [data[poplab] for poplab in labels]

# Starting point for the optimizer: (initial European proportion, time).
# Times are expressed in units of one hundred generations, i.e. multiply by
# 100 to obtain generations.  This keeps the parameters (time and ancestry
# proportions) at similar magnitudes, because some python optimizers do a
# poor job otherwise.  The "_mig" output file gives a generation-by-generation
# breakdown of the migration rates.
startparams = numpy.array([0.173632, 0.0683211])

Ls = pop.Ls
nind = pop.nind
_files = os.listdir(directory)

# Keep only the bed files, i.e. entries whose final '.'-separated field is
# "bed".
files = [entry for entry in _files if entry.rsplit('.', 1)[-1] == "bed"]

# One label per individual: the part of the file name before the first '_',
# with duplicates removed.
names = list({entry.split('_')[0] for entry in files})

if len(files) != len(_files):
    warn("some files in the bed directory were ignored, since they do not "
         "end with `.bed`.")

# The population constructor iterates over the individuals and over their
# haploid copies (labeled "_A" and "_B" by default).
pop = tracts.population(names=names, fname=(directory, inter, end))

# Bootstrap replicates reuse this population object and only swap the list of
# individuals that is iterated over, so the initial list has to be kept.
indivs = pop.indivs


def bootsamp(num):
    """Return `num` positions drawn with replacement from range(num)."""
    positions = range(num)
    return numpy.random.choice(positions, size=num, replace=True)


# Loop over the bootstrap instances; instance 0 is the un-bootstrapped value.
for bootnum in runboots:
    # Seed with the instance number so every replicate is reproducible.
    numpy.random.seed(seed=bootnum)
def load_population(path_pairs):
    """Build a tracts population from per-individual haplotype file pairs.

    `path_pairs` is a list in which every element is a pair of paths, one
    path for each of the two haplotypes of a single individual.  Each pair is
    turned into an individual with `tracts.indiv.from_files`, and the
    resulting list is handed to `tracts.population`, whose result is returned.
    """
    eprint('loading population')
    individuals = [tracts.indiv.from_files(pair) for pair in path_pairs]
    return tracts.population(individuals)
# Bootstrap setup.  The entries of `directory` are listed and only those whose
# last '.'-separated field is "bed" are kept; the text before the first '_' of
# each kept name is used as an individual label (duplicates removed through a
# set).  A warning is issued when any directory entry was left out.  The
# population is loaded once and `pop.indivs` is kept so that bootstrap
# instances can swap the list of individuals instead of rebuilding the
# population.
# `bootsamp(num)` returns a list of `num` values, each obtained from a separate
# numpy.random.choice(range(num)) call; `choice` is bound to a local name
# (what the inline comment calls "memoize") so the attribute lookup is not
# repeated on every iteration.
_files = os.listdir(directory) files = [file for file in _files if file.split('.')[-1] == "bed"] # only consider bed files # Get unique individual labels names = list(set(file.split('_')[0] for file in files)) if len(_files) != len(files): warn("some files in the bed directory were ignored, since they do not " "end with `.bed`.") # Load the population using the population class's constructor. It # automatically iterates over individuals and haploid copies (labeled _A" # and "_B" by default pop = tracts.population(names=names, fname=(directory, inter, end)) # Rather than creating a new population for each bootstrap instance, we # just replace the list of individuals to iterate over. We need to save a # copy of the initial list of individuals to do this! indivs = pop.indivs def bootsamp(num): r = range(num) # memoize the choice function to avoid the dictionary lookups on each # iteration in r choice = numpy.random.choice return [choice(r) for _ in r] # iterate over bootstrap instances. Iteration 0 is the un-bootstrapped value for bootnum in runboots:
rep_pp = 2
rep_pp_px = 2

# Restrict the analysis to the trio individuals.
names = [
    "NA19700", "NA19701",
    "NA19704", "NA19703",
    "NA19819", "NA19818",
    "NA19835", "NA19834",
    "NA19901", "NA19900",
    "NA19909", "NA19908",
    "NA19917", "NA19916",
    "NA19713", "NA19982",
    "NA20127", "NA20126",
    "NA20357", "NA20356",
]

# Chromosome labels "1" through "22", as strings.
chroms = [str(i) for i in range(1, 23)]

# Load the population from the ".bed" files found under `directory`.
pop = tracts.population(
    names=names,
    fname=(directory, "", ".bed"),
    selectchrom=chroms,
)
bins, data = pop.get_global_tractlengths(npts=50)

# Fix the order of the populations and arrange the tract-length data to match.
labels = ['EUR', 'AFR']
data = [data[poplab] for poplab in labels]

# The global ancestry proportions are held fixed, so the optimizer needs a
# single starting parameter: the initial admixture time.  Times are expressed
# in units of one hundred generations, i.e. multiply by 100 to obtain
# generations.  This keeps the parameters (time and ancestry proportions) at
# similar magnitudes, because some python optimizers do a poor job otherwise.
# The "_mig" output file gives a generation-by-generation breakdown of the
# migration rates.
startparams = numpy.array([0.0683211])
### PSMC simulations
# Simulate 1000 individuals with P.PSMC_ind, keep each one in `PSMCinds`, and
# write each to its own BED file under `bedpath` (files are numbered from 1).
PSMCinds = []
for i in range(1000):
    # Progress message.  A single pre-formatted argument prints the same text
    # ("PSMC <i>") whether `print` is the Python 2 statement or the Python 3
    # function.
    print("PSMC %d" % i)
    PSMCind = P.PSMC_ind(M_TMat, F_TMat, M_leaflist, F_leaflist, ChromLengths)
    PSMCinds.append(PSMCind)
    outfile = os.path.join(bedpath, "PSMC_IND" + str(i + 1))
    ped.tracts_ind_to_bed(PSMCind, outfile, conv="M->cM")

# plotoutfile = os.path.expanduser('~/project/tracts/sims/results/PSMC/simple/PSMC0001.png')
# plotoutpath = os.path.expanduser(sys.argv[2])

# Name the output plot after the pedigree type.
# NOTE(review): for any other `pedtype`, `plotoutfile` is not assigned here, so
# the plotting call below fails with a NameError unless the name was bound
# earlier -- confirm whether other values can occur.
if pedtype == '0001':
    plotoutfile = outpath + "PSMC0001.png"
elif pedtype == '1001':
    plotoutfile = outpath + "PSMC1001.png"

# Plot the tract-length distribution of the simulated individuals.
pop = tracts.population(list_indivs=PSMCinds)
pop.plot_global_tractlengths(colordict, outfile=plotoutfile)

## Forward simulations
#if pedtype == '0001':
#    P = ped.Pedigree(migmat, split_parents=False)
#    leaflist = P.SortLeafNode(P.indlist)[0]
#
#    ## If this sum is one we know that there is only one leaf with ancestry 1
#    while np.sum([leaf.ancestry for leaf in leaflist]) != 1:
#        P = ped.Pedigree(migmat, split_parents=False)
#        leaflist = P.SortLeafNode(P.indlist)[0]
#elif pedtype == '1001':
#    P = ped.Pedigree(migmat, split_parents=False)
#    leaflist, nodelist = P.SortLeafNode(P.indlist)
# Tail of the simulation step and the reporting that follows it: an
# unrecognised simulation method prints a message, the simulated individual is
# appended to `indlist`, the time elapsed since `start_time` is appended to
# `times`, the number of simulated generations (len(P.MigPropMat)) and the mean
# of `times` are printed, and a tracts.population is built from `indlist`.
# NOTE(review): `sys.exit` is only referenced, never called, so the
# "Unknown simulation method" path does not stop the script as written;
# presumably `sys.exit(1)` was intended -- confirm.
# NOTE(review): the `print` statements use Python 2 statement syntax.
print "Unknown simulation method" sys.exit ## Save simulated individual to list indlist.append(tracts_ind) ## Write simulated individuals to BED files # if bed_dir != "None": # outfile = os.path.join(bed_dir, "IND" + str(i + 1)) # ped.tracts_ind_to_bed(tracts_ind, outfile, conv = "M->cM") times.append(time.time() - start_time) print "Number of generations simulated:", len(P.MigPropMat) print "Average time per simulated individual:", np.mean(times) ## Plot tracts distribution for simulated population pop = tracts.population(list_indivs = indlist) #(bins, data) = pop.get_global_tractlengths(npts=50) ##outdir = "./out" #if migmat is None: # migmat, ancestries = P.ped_to_migmat(P.indlist) # print ancestries #D = tracts.demographic_model(mig=migmat) # #with open(outdir + popname + "_bins", 'w') as fbins: # fbins.write("\t".join(map(str, bins))) # #with open(outdir + popname + "_dat", 'w') as fdat: # for label in data.keys(): # fdat.write("\t".join(map(str, data[label])) + "\n") # #with open(outdir + popname + "_mig", 'w') as fmig: