def test_per_base(self):
    """Check ``allel.per_base`` on the output of ``allel.windowed_statistic``.

    Four scenarios share the same positions and the same windowing
    (``size=10``, ``start=1``): an all-true boolean vector, a partly-true
    boolean vector, a 2D boolean array (4 variants, 2 samples), and the
    partly-true vector again with an ``is_accessible`` mask and ``fill=-1``.
    """
    positions = [1, 12, 15, 27]

    def windowed(values, statistic):
        # Every scenario uses the same positions, window size and start.
        return allel.windowed_statistic(
            positions, values, statistic=statistic, size=10, start=1)

    # --- boolean vector, every entry true ---
    # N.B., final bin includes right edge
    hits, bins, n_variants = windowed(
        [True, True, True, True], np.count_nonzero)
    density, bases = allel.per_base(hits, bins)
    aeq([1, 2, 1], hits)
    aeq([[1, 10], [11, 20], [21, 27]], bins)
    aeq([1, 2, 1], n_variants)
    aeq([1 / 10, 2 / 10, 1 / 7], density)
    aeq([10, 10, 7], bases)

    # --- boolean vector, only some entries true ---
    hits, bins, n_variants = windowed(
        [False, True, False, True], np.count_nonzero)
    density, bases = allel.per_base(hits, bins)
    aeq([0 / 10, 1 / 10, 1 / 7], density)
    aeq([10, 10, 7], bases)

    # --- 2D input: 4 variants, 2 samples; the statistic sums down axis 0 ---
    flags_2d = [[True, False],
                [True, True],
                [True, False],
                [True, True]]
    hits, bins, n_variants = windowed(
        flags_2d, lambda x: np.sum(x, axis=0))
    density, bases = allel.per_base(hits, bins)
    aeq([[1 / 10, 0 / 10],
         [2 / 10, 1 / 10],
         [1 / 7, 1 / 7]], density)
    aeq([10, 10, 7], bases)

    # --- with the is_accessible option ---
    # One flag per base (30 entries). The expected n_bases below are
    # 0, 6 and 7, and the first window's density is the fill value.
    accessible = np.array([
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ], dtype=bool)
    hits, bins, n_variants = windowed(
        [False, True, False, True], np.count_nonzero)
    density, bases = allel.per_base(
        hits, bins, is_accessible=accessible, fill=-1)
    aeq([-1, 1 / 6, 1 / 7], density)
    aeq([0, 6, 7], bases)
def mutrate(ps, polymorphic_loci_outgroup, chrom):
    '''
    Compute a relative mutation rate per window for one chromosome and write
    it to a tab-separated file.

    For every window the value written is the ratio segregating / call summed
    over the 1 Mb block containing the window, divided by the same ratio
    summed over the whole chromosome.

    Input:
        - ps : positions, passed as `pos` to allel.windowed_statistic.
        - polymorphic_loci_outgroup : per-position values that are summed
          within each window (presumably an indicator of polymorphic sites;
          confirm against the caller).
        - chrom : chromosome identifier. Used for get_windowed(chrom), for the
          "chrom" column and to build the input and output file names.
    Output:
        - None. Writes
          /home/moicoll/GenerationInterval/people/moi/tmp/mutrate/chr{chrom}.tmp
          with the columns chrom, start, mut_rate (tab-separated, no header,
          no index).
    '''
    # Sum of the values in each window. The windows come from
    # get_windowed(chrom) (1 kb according to the original author's note) and
    # windows without any position get 0.
    snp_count, windows, _ = allel.windowed_statistic(
        pos=ps,
        values=polymorphic_loci_outgroup,
        statistic=np.sum,
        windows=get_windowed(chrom),
        fill=0)

    # Precursor table of the final mutation rate. The columns are:
    #   1. chrom       : chromosome
    #   2. start       : window start minus 1 (0-based, included)
    #   3. segregating : number of polymorphic sites in the window
    #   4. call        : third column (usecols=[2]) of the per-chromosome
    #                    weights file; described by the original author as the
    #                    percentage of callable bases in the window
    mut_rate = pd.DataFrame({
        "chrom": [chrom] * windows.shape[0],
        "start": (windows[:, 0] - 1).astype("int32"),
        "segregating": snp_count,
        "call": np.loadtxt(
            "/home/moicoll/GenerationInterval/people/moi/tmp/weigths/chr{}_weigths.txt"
            .format(chrom),
            usecols=[2])
    })

    # Chromosome-wide ratio used to normalise the per-block ratios.
    genomic_mut_rate = np.sum(mut_rate["segregating"]) / np.sum(
        mut_rate["call"])

    # Label every window with the 1 Mb block that contains its start.
    big_window = (mut_rate["start"] / 1000000).astype(int)

    # transform("sum") returns one row per window, holding the totals of the
    # window's 1 Mb block, so the block ratio is attached to every row
    # directly. 0/0 blocks give NaN and are reported as 0.
    block_totals = mut_rate.groupby(big_window)[
        ["segregating", "call"]].transform("sum")
    mut_rate["mut_rate"] = (
        block_totals["segregating"] / block_totals["call"] / genomic_mut_rate
    ).fillna(0)

    # Save the table without the intermediate "segregating" and "call" columns.
    mut_rate.drop(["segregating", "call"], axis=1).to_csv(
        "/home/moicoll/GenerationInterval/people/moi/tmp/mutrate/chr{}.tmp".
        format(chrom),
        header=False,
        index=False,
        sep='\t')
# Store each sample's callset index next to its metadata.
meta_data_samples['callset_index'] = samples_callset_index


def het_counting(gt):
    """Return ``gt.count_het()`` for the genotypes handed in for one window."""
    return gt.count_het()


# Genotype calls and variant positions of the current chromosome.
gt_zarr = callset["{}/calldata/GT".format(chrom)]
pos = callset["{}/variants/POS".format(chrom)]
gt = allel.GenotypeDaskArray(gt_zarr)

# One frame per sample; they are stacked once the loop is done.
df_list = []
for i, row in meta_data_samples.iterrows():
    # Keep only this sample's column (axis 1) of the genotype array.
    individual = gt.take([row.callset_index], axis=1)
    nnz, windows, counts = allel.windowed_statistic(
        pos, individual, statistic=het_counting, size=window_size)
    df = pd.DataFrame({"het": nnz})

    # Progress report whenever the row label is a multiple of 10.
    if i % 10 == 0:
        print(i)

    # Windows are numbered 0..len(nnz)-1 in the order they were returned.
    window_numbering = list(range(len(nnz)))
    df.insert(0, column="chr", value=chrom)
    df.insert(1, column="window", value=window_numbering)
    df.insert(2, column="PGDP_ID", value=row.PGDP_ID)
    df_list.append(df)

# Final column order is chr, window, PGDP_ID, het.
chr_df = pd.concat(df_list, axis=0)
chr_df.to_csv(
    "../steps/het_counts_windows_{}.txt".format(chrom),
    sep=" ",
    index=False)
print("Finished with {}".format(chrom))
# Use the middle of the window as the index window_middle = np.sum(eqa, axis=1)/2 vref_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int), columns=list(subpops.keys()) + list(species.keys())) xpop_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int)) # Calculate distance from the reference in each sub-population for pop in subpops.keys(): print('processing', pop) # Faster if we drop non variant loci first, and load into mem loc = ac[pop].is_variant().compute() pop_ac = ac[pop].compress(loc, axis=0).compute() pop_pos = pos.compress(loc, axis=0) print('computing divergence...', loc.sum()) vals, windows, counts = allel.windowed_statistic( pop_pos, pop_ac.to_frequencies(), compute_divergence, windows=eqa) vref_dxy_by_window[chrom][pop] = vals / window_size # Calculate distance from the reference in each species for pop in species.keys(): print('processing', pop) # Faster if we drop non variant loci first, and load into mem loc = ac_species[pop].is_variant().compute() pop_ac = ac_species[pop].compress(loc, axis=0).compute() pop_pos = pos.compress(loc, axis=0) print('computing divergence...', loc.sum()) vals, windows, counts = allel.windowed_statistic( pop_pos, pop_ac.to_frequencies(), compute_divergence, windows=eqa) vref_dxy_by_window[chrom][pop] = vals / window_size
hap_div = allel.haplotype_diversity(region_complete) # calculate nucleotide diversity specifically on nonmissing region ac = region_complete.count_alleles() diffs = allel.mean_pairwise_difference(ac, fill=0) pi = np.sum(diffs) / 200 return [nh, hap_div, pi, freqs] chromosomes = allel.read_vcf(vcf_path, fields=['CHROM']) chromosomes_list = np.unique(chromosomes['variants/CHROM']) for chrom in chromosomes_list: print(chrom) # read in that chromosome data only callset = allel.read_vcf(vcf_path, region=chrom, fields='*') gt = allel.GenotypeArray(callset["calldata/GT"]) # remove any het calls, convert to haploid gt.mask = gt.is_het() gt_hom_only = gt.fill_masked(value=-1) gt_hap_array = gt_hom_only.haploidify_samples() # remove individuals with missing data and calculate stats n_list, w, n = allel.windowed_statistic(pos=callset["variants/POS"], values=gt_hap_array, statistic=removeMissingStats, size=200, step=50, start=1) df = pd.DataFrame(list(zip(n_list, w, n)), columns=["n_list", "windows_n", "n_var_n"]) file_name = os.getcwd() + "/" + prefix + chrom + "_hap_div.csv" df.to_csv(file_name, header=True)