def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a SNP map file and two population genotype text files.

    The map file is read as headerless, tab-delimited text with the columns
    ID, CHROM, GDist, POS, REF, ALT; if that does not yield six columns it is
    re-read as space-delimited. If building the variant table indexed by POS
    raises ``ValueError`` the table is sorted by (CHROM, POS) and rebuilt.

    Args:
        mapfn: path of the SNP map file.
        pop_a_fn: path of the whitespace-delimited genotype matrix of
            population A (one row per variant, two columns per sample).
        pop_b_fn: same for population B.

    Returns:
        Tuple ``(geno1, geno2, pos, gdist)``: the two genotype arrays, the
        sorted position index and the GDist column.

    Raises:
        AssertionError: if the position index contains NaN values.
    """
    columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]

    tbl = pd.read_csv(mapfn, sep="\t", header=None, engine="c")
    try:
        tbl.columns = columns
    except ValueError:
        # A tab-split that does not give six columns means another delimiter.
        logger.info("File not tab delimited as expected- trying with spaces")
        tbl = pd.read_csv(mapfn, sep=" ", header=None, engine="c",
                          names=columns)

    try:
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")
    except ValueError:
        # Warn before the (potentially slow) sort rather than after it.
        logger.warning(
            "Possible SNPs file is not sorted. Attempting to sort. This is likely to be inefficient"
        )
        tbl = tbl.sort_values(["CHROM", "POS"])
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    d1 = np.loadtxt(pop_a_fn, dtype="int8")
    # Consecutive column pairs are the two alleles of one sample.
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))
    d2 = np.loadtxt(pop_b_fn, dtype="int8")
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    pos = allel.SortedIndex(vartbl.POS[:])
    assert np.isnan(pos).sum() == 0, "nans values are not supported"
    # Return the index that was just validated instead of rebuilding it.
    return geno1, geno2, pos, vartbl.GDist[:]
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Read a space-delimited SNP map and two genotype text matrices.

    Args:
        mapfn: path of the map file; columns are named ID, CHROM, GDist,
            POS, REF, ALT.
        pop_a_fn: path of the genotype matrix of the first population.
        pop_b_fn: path of the genotype matrix of the second population.

    Returns:
        Tuple of the two genotype arrays, the sorted position index and the
        GDist column of the map.
    """

    def _read_genotypes(path):
        # Rows are variants; each pair of columns holds one sample's alleles.
        calls = np.loadtxt(path, dtype="int8")
        n_variants = calls.shape[0]
        return allel.GenotypeChunkedArray(calls.reshape((n_variants, -1, 2)))

    map_columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]
    snp_map = pd.read_csv(mapfn, sep=" ", names=map_columns)
    variants = allel.VariantChunkedTable(snp_map.to_records(), index="POS")

    genotypes_a = _read_genotypes(pop_a_fn)
    genotypes_b = _read_genotypes(pop_b_fn)

    positions = allel.SortedIndex(variants.POS[:])
    genetic_distance = variants.GDist[:]
    return genotypes_a, genotypes_b, positions, genetic_distance
def work(self):
    """Write pseudo-haplotypes of unlinked synonymous sites to a TSV file.

    Reads the HDF5 callset at ``self.input()['syn'].path``, keeps the
    variants that ``allel.locate_unlinked`` reports at threshold
    ``self.max_linkage``, converts the genotypes to haplotypes and writes a
    table (sample name followed by its haplotype calls, one row per
    haplotype) atomically to ``self.output().path``.
    """
    import numpy as np
    import allel
    import h5py
    import pandas as pd
    from luigi.file import atomic_file

    # Opens the SynSNPS file, which contains only biallelic synonymous sites.
    # The context manager guarantees the handle is released even on error;
    # everything that touches the on-disk dataset happens inside it.
    with h5py.File(self.input()['syn'].path, mode='r') as callset:
        genotypes = allel.GenotypeChunkedArray(callset['calldata']['genotype'])
        samples = np.array([x.decode() for x in callset['samples']])

        # Selects sites with r**2 linkage < max_linkage
        n_ref = genotypes.to_n_ref(fill=-9)
        unlinked = allel.locate_unlinked(n_ref, threshold=self.max_linkage)[:]

        # Create pseudohaplotypes (0=ref, 1=alt, -1=missing); `genotypes[:]`
        # loads the data so the result no longer depends on the open file.
        hap_matrix = genotypes[:][unlinked].to_haplotypes()

    # Double up the sample names: one row per haplotype
    samples_dup = np.array(list(zip(samples, samples))).reshape(-1, 1)
    hap_df = pd.DataFrame(np.hstack((samples_dup, hap_matrix.T)))

    # Atomic write TSV file output
    af = atomic_file(self.output().path)
    hap_df.to_csv(af.tmp_path, sep='\t', index=False)
    af.move_to_final_destination()
def genotype_array_from_vcf(filename):
    """Read the genotype calls of a VCF file into a chunked genotype array.

    @Params:
        filename: relative path to the genotype data file

    @Returns:
        ``allel.GenotypeChunkedArray`` built from the ``calldata/GT`` field.
    """
    callset = allel.read_vcf(filename)
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])
    logging.info(f"Loaded Genotype Data from file {filename}")
    return genotypes
def plth12(chromlist):
    """Plot Garud's H12 along each chromosome and save one PDF per chromosome.

    For every name ``c`` in ``chromlist`` the callset
    ``PNG.phased.X.recode.<c>.h5`` is read, H12 is computed in moving windows
    of ``window_size`` variants (``window_size`` is read from module scope),
    each window value is expanded to the variants of that window and the
    result is plotted against position into ``PNG.<c>.H12.pdf``.

    Args:
        chromlist: iterable of chromosome/contig names used in the file names.
    """
    for c in chromlist:
        # callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c), mode='r')
        with h5py.File("PNG.phased.X.recode.{}.h5".format(c),
                       mode='r') as callset:
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            h = g.to_haplotypes()
            pos = allel.SortedIndex(callset["variants/POS"][:])
            # moving_garud_h returns (h1, h12, h123, h2_h1); keep H12.
            h12 = allel.moving_garud_h(h, window_size)[1]

        # Expand the per-window values to one value per variant. Window i
        # covers variant indices [i * window_size, (i + 1) * window_size);
        # clamping the upper bound avoids indexing past the last position
        # when the variant count is an exact multiple of the window size.
        n_pos = len(pos)
        h12_pos = []
        for i, value in enumerate(h12):
            first = i * window_size
            last = min(first + window_size, n_pos)
            h12_pos.extend([value] * (last - first))
        # Variants after the last full window reuse the last window's value.
        if len(h12_pos) < n_pos:
            h12_pos.extend([h12[-1]] * (n_pos - len(h12_pos)))

        plt.plot(pos, h12_pos)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("H12")
        plt.savefig("PNG.{}.H12.pdf".format(c))
        plt.clf()
def get_genotype_array_concat(callsets,
                              genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Return one genotype array covering all given callsets.

    A single callset is delegated to ``get_genotype_array``. Otherwise the
    genotype data of every callset is concatenated along the variant axis
    (axis 0) and wrapped in the array class selected by
    ``genotype_array_type``.

    Args:
        callsets: sequence of callsets.
        genotype_array_type: one of the ``config.GENOTYPE_ARRAY_*`` options.

    Returns:
        The combined scikit-allel genotype array.

    Raises:
        ValueError: if several callsets are given and the array type is not
            a recognised option.
    """
    if len(callsets) == 1:
        # Nothing to concatenate.
        return get_genotype_array(callset=callsets[0],
                                  genotype_array_type=genotype_array_type)

    use_dask = genotype_array_type == config.GENOTYPE_ARRAY_DASK

    pieces = []
    for callset in callsets:
        data = get_callset_genotype_data(callset)
        if use_dask:
            # Wrap the underlying array in a dask array with the same chunks.
            data = da.from_array(data, chunks=data.chunks)
        pieces.append(data)

    if use_dask:
        return allel.GenotypeDaskArray(da.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        return allel.GenotypeChunkedArray(np.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        return allel.GenotypeArray(np.concatenate(pieces, axis=0))

    raise ValueError(
        'Error: Invalid option specified for genotype_array_type.')
def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
    """Load genotypes of two sample sets from one chromosome of a zarr store.

    Args:
        zarr_fn: path of the zarr group.
        chrom: key of the chromosome sub-group.
        s1: passed to ``get_sample_ids`` to obtain the first sample set.
        s2: passed to ``get_sample_ids`` to obtain the second sample set.
        gdistkey: optional name of a ``variants`` array holding genetic
            distances; ``None`` means no genetic distance is loaded.

    Returns:
        Tuple ``(g1, g2, pos, gdist)``: genotypes restricted to each sample
        set, the sorted position index and the genetic distances (or None).

    Raises:
        ValueError: if a requested sample id is not in the store.
    """
    import zarr

    samples1 = get_sample_ids(s1)
    samples2 = get_sample_ids(s2)

    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]
    samples_x = zfh["samples"][:]
    sample_name = [sid.decode() for sid in samples_x.tolist()]

    # Column indices of the requested samples within the genotype array.
    idx1 = np.array([sample_name.index(sid) for sid in samples1])
    idx2 = np.array([sample_name.index(sid) for sid in samples2])

    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])
    pos = allel.SortedIndex(zfh["variants"]["POS"][:])

    if gdistkey is not None:
        # Read from the zarr group opened above; the original referenced an
        # undefined `h5fh` handle here and raised NameError.
        gdist = zfh["variants"][gdistkey][:]
    else:
        gdist = None

    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
def load_hdf5_data(hdf5_fn, chrom, s1, s2):
    """Load genotypes of two sample sets from an HDF5 callset.

    Args:
        hdf5_fn: path of the HDF5 file.
        chrom: accepted for interface symmetry; not used by this function.
        s1: sample ids of the first set.
        s2: sample ids of the second set.

    Returns:
        Tuple of the genotypes of each set and the sorted position index.
    """
    callset = h5py.File(hdf5_fn, mode='r')
    known_ids = [raw.decode() for raw in callset['samples'][:].tolist()]

    def _columns(wanted):
        # Position of every requested id within the callset's sample list.
        return np.array([known_ids.index(sid) for sid in wanted])

    cols_a = _columns(s1)
    cols_b = _columns(s2)

    genotypes = allel.GenotypeChunkedArray(callset["calldata/GT"])
    positions = allel.SortedIndex(callset["variants/POS"][:])
    first = genotypes.take(cols_a, axis=1)
    second = genotypes.take(cols_b, axis=1)
    return first, second, positions
def main(vcffile, pop1, pop2, binwidth, stepsize, outprefix):
    """Compute Fst between pop1 and pop2.

    Uses the method of Hudson (1992) elaborated by Bhatia et al. (2013).
    Writes per-site values to ``<outprefix>_persite.tsv.gz`` and sliding-bin
    ratio-of-sums averages to ``<outprefix>_meanFst.tsv.gz``.

    Args:
        vcffile: path of the VCF file.
        pop1: path of a text file with one sample name per line.
        pop2: path of a text file with one sample name per line.
        binwidth: bin width in base pairs.
        stepsize: offset step between successive bin series in base pairs.
        outprefix: prefix of the two output files.
    """
    # Close the population files deterministically; sets give O(1)
    # membership tests when building the sample masks below.
    with open(pop1) as handle:
        pop1 = {line.strip() for line in handle}
    with open(pop2) as handle:
        pop2 = {line.strip() for line in handle}

    callset = allel.read_vcf(vcffile)
    allsamples = callset['samples']
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])
    # Select all sites in the VCF.
    # NOTE(review): the mask is one element longer than the number of
    # variants; kept as-is because getAC is defined elsewhere - confirm.
    variant_selection = np.full((genotypes.shape[0] + 1), True)

    sample_selection = [x in pop1 for x in allsamples]
    ac1 = getAC(genotypes, variant_selection, sample_selection)
    sample_selection = [x in pop2 for x in allsamples]
    ac2 = getAC(genotypes, variant_selection, sample_selection)

    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    meanFst = np.sum(num) / np.sum(den)
    print('meanFst: %s' % meanFst)

    chrom = callset['variants/CHROM']
    pos = callset['variants/POS']
    df = pd.DataFrame({'chrom': chrom, 'pos': pos, 'hudson_Fst': fst})
    df.to_csv(f'{outprefix}_persite.tsv.gz', sep='\t', index=False,
              na_rep='nan', compression='gzip')
    df['num'] = num
    df['den'] = den

    # sliding bins
    rows = []
    for offset in range(0, binwidth, stepsize):
        df['bin_index'] = ((df['pos'].values - 1) - offset) // binwidth
        for group_name, gdf in df.groupby(by=['chrom', 'bin_index']):
            # Distinct names so the per-variant arrays above are not shadowed.
            bin_chrom, bin_index = group_name
            start = bin_index * binwidth + offset + 1
            if start < 0:
                # Skip the leading windows that are shorter than binwidth.
                continue
            end = start + binwidth - 1
            n_snp = gdf.shape[0]
            sum_num = gdf['num'].sum()
            sum_den = gdf['den'].sum()
            bin_fst = sum_num / sum_den if sum_den > 0 else np.nan
            rows.append([bin_chrom, start, end, n_snp, bin_fst])

    bdf = pd.DataFrame(
        rows, columns=['chrom', 'start', 'end', 'n_snp', 'meanFst']
    ).sort_values(by=['chrom', 'start'])
    bdf.to_csv(f'{outprefix}_meanFst.tsv.gz', index=False,
               compression='gzip', sep='\t', float_format='%.3f')
def get_genotype_array(callset, genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Wrap a callset's genotype data in the requested scikit-allel array.

    Args:
        callset: callset passed to ``get_callset_genotype_data``.
        genotype_array_type: one of the ``config.GENOTYPE_ARRAY_*`` options.

    Returns:
        The wrapped genotype array, or ``None`` for an unrecognised option.
    """
    gtz = get_callset_genotype_data(callset)

    wrapper = None
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        wrapper = allel.GenotypeArray
    elif genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        wrapper = allel.GenotypeDaskArray
    elif genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        wrapper = allel.GenotypeChunkedArray

    if wrapper is None:
        return None
    return wrapper(gtz)
def run(self, n_samples=1000):
    '''
    Run PCA and UMAP on the data.

    Draws ``n_samples`` variants at random (without replacement) from
    ``self.genome_file['calldata/GT']``, fits a 2-component PCA on the
    alternate-allele counts, passes the resulting frames to ``self.plot``
    and ``self.write`` and, if both succeed, writes ``<timestamp>.log``
    with run time, seed and package versions.

    n_samples : number of variants to sample.
    '''
    start = time.time()
    self.timestamp = int(start)

    gt = allel.GenotypeChunkedArray(self.genome_file['calldata/GT'])
    gn = gt.to_n_alt()

    # Random subset of variants, kept in genomic order.
    vidx = np.random.choice(gn.shape[0], n_samples, replace=False)
    vidx.sort()
    gnr = gn.take(vidx, axis=0)[:]

    pca = PCA(n_components=2)
    gnu_pca = pca.fit_transform(gnr.T)

    reducer = umap.UMAP()
    # NOTE(review): the UMAP fit is disabled, so the "UMAP" frame below
    # currently holds the PCA coordinates - confirm this is intended.
    #gnu_umap = reducer.fit_transform(gnr.T)

    gnu_pca_df = pd.DataFrame(gnu_pca, self.panel_pop)
    gnu_pca_df["pop"] = gnu_pca_df.index
    gnu_umap_df = pd.DataFrame(gnu_pca, self.panel_pop)
    gnu_umap_df["pop"] = gnu_umap_df.index

    plot_signal = self.plot(gnu_pca_df, gnu_umap_df)
    write_signal = self.write(gnu_pca_df, gnu_umap_df)
    end = time.time()

    if (plot_signal and write_signal):
        # The context manager closes the log even if a write raises.
        with open(str(self.timestamp) + ".log", "w+") as log_file:
            log_file.write("Log File Start \n")
            log_file.write("\nRun time : " + str(end - start))
            log_file.write("\nRandom Seed : " + str(self.seed))
            log_file.write("\nPackage Versions : \n")
            log_file.write("\nScikit Allel Version : " + str(allel.__version__))
            log_file.write("\nSklearn Version : " + str(sklearn.__version__))
            log_file.write("\nNumpy Version : " + str(np.__version__))
            #log_file.write("Matplotlib Version : " + str(matplotlib.__version__))
            log_file.write("\nPandas Version : " + str(pd.__version__))
            log_file.write("\nUMAP Version : " + str(umap.__version__))
            log_file.write("\nPCA Parameters : \n")
            log_file.write(str(pca))
            log_file.write("\nUMAP Parameters : \n")
            log_file.write(str(reducer))
def pltPi(chromlist):
    """Plot windowed nucleotide diversity (pi) for each chromosome.

    For every name ``c`` in ``chromlist`` the callset
    ``PNG.phased.autosomal.recode.<c>.h5`` is read, pi is computed with
    ``allel.windowed_diversity`` (window size 10) and plotted against the
    window midpoints into ``PNG.<c>.Pi.pdf``.

    The original body plotted an undefined ``h12_pos`` and reused the H12
    axis label and output file name; the computed pi values are now the
    ones plotted and the output is named after the statistic.

    Args:
        chromlist: iterable of chromosome/contig names used in the file names.
    """
    for c in chromlist:
        with h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                       mode='r') as callset:
            # callset = h5py.File("PNG.phased.X.recode.{}.h5".format(c), mode='r')
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            pos = allel.SortedIndex(callset["variants/POS"][:])
            acc = g.count_alleles()[:]
            # windowed_diversity returns (pi, windows, n_bases, counts).
            pi, windows, _n_bases, _counts = allel.windowed_diversity(
                pos, acc, size=10)

        # One x value per window: the midpoint of its [start, stop] interval.
        plt.plot(windows.mean(axis=1), pi)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("Pi")
        plt.savefig("PNG.{}.Pi.pdf".format(c))
        plt.clf()
def write_hap_array(pop, chrom, p1, p2, name, samples, inaccessible=False):
    """
    Write the haplotype array of one population for a genomic region.
    Currently used as input for iSAFE.

    Only segregating sites between p1 and p2 (inclusive) are kept. The table
    is written tab-separated to ../data/<pop>/<chrom>/sweep_hapl_<name>,
    indexed by position.

    pop: population label matched against samples.population
    chrom: chromosome name used in the zarr path and the output path
    p1, p2: first and last position of the region
    name: suffix of the output file name
    samples: sample metadata with a `population` column
    inaccessible: if not False, read the alternative callset location
    """
    ############ Read zarrs #############
    if inaccessible is False:
        zarr_root = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
    else:
        zarr_root = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
    Ag_store = zarr.open_array(f"{zarr_root}/calldata/GT/", mode='r')
    positions = zarr.open_array(f"{zarr_root}/variants/POS", mode='r')[:]

    print("--------------------------------------------------")
    print(f"Zarrs loaded: {pop}, Chromosome {chrom}")

    ############ Load into gen.array and subset ###########
    in_population = samples.population == pop
    print("Constructing HaplotypeArray")
    haplotypes = (allel.GenotypeChunkedArray(Ag_store)
                  .compress(in_population, axis=1)
                  .to_haplotypes())

    # indices of the variants that fall inside the region
    region_idx = np.where((positions >= p1) & (positions <= p2))[0]
    region_haps = haplotypes.take(region_idx, axis=0)

    # keep only segregating sites, and the matching positions
    segregating = region_haps.count_alleles().is_segregating()
    region_haps = region_haps.compress(segregating, axis=0)
    kept_positions = positions.take(region_idx[segregating])

    table = pd.DataFrame(data=region_haps)
    table.index = kept_positions
    table.to_csv(f'../data/{pop}/{chrom}/sweep_hapl_{name}',
                 index=True,
                 sep="\t")
    print(f"Writing Haplotype array for {name} region for iSAFE algorithm")
def main(h5file, samplesfile, outgroupfile, outprefix, blen):
    """
    use samples in samplesfile to calculate pair-wise outgroup-f3
    use samples in outgroupfile as outgroup
    h5file generate from vcffile from scikit-allele(1.1.10)
    import allel; allel.vcf_to_hdf5('in.vcf.gz', 'out.h5')

    Writes two square, sample-by-sample tables: ``<outprefix>.f3.tsv`` with
    the f3 estimates and ``<outprefix>.z.tsv`` with the Z scores. Cells that
    are never computed (the diagonal) stay ``None``.

    h5file: path of the HDF5 callset
    samplesfile: text file with one sample name per line
    outgroupfile: passed to ``cal_outgroup_ac``
    outprefix: prefix of the output files
    blen: block size passed to ``allel.average_patterson_f3``
    """
    print(__doc__)
    print('scikit-allel', allel.__version__)

    # Individuals to compute; the file is closed as soon as it is read.
    with open(samplesfile) as handle:
        samples = [line.strip() for line in handle]
    n_samples = len(samples)

    f3_ay = np.full((n_samples, n_samples), None)
    z_ay = np.full((n_samples, n_samples), None)

    # Keep the callset open only while allele counts and f3 are computed.
    with h5py.File(h5file, mode='r') as callset:
        allsamples = list(callset['samples'])  # all individuals in the VCF
        calldata = callset['calldata']
        genotypes = allel.GenotypeChunkedArray(calldata['GT'])
        # Select all sites in the VCF.
        # NOTE(review): the mask is one element longer than the number of
        # variants; kept as-is because the helpers are defined elsewhere.
        variant_selection = np.full((genotypes.shape[0] + 1), True)
        ac_outgroup = cal_outgroup_ac(genotypes, outgroupfile, allsamples,
                                      variant_selection)
        ac_dict = cal_all_ac(genotypes, samples, allsamples,
                             variant_selection)

        print('begin to cal outgroup f3')
        # Number of unordered pairs, without materialising them.
        n_comb = n_samples * (n_samples - 1) // 2
        print(f'total combinations is {n_comb}')

        # Enumerating the samples gives each pair its matrix coordinates
        # directly instead of a linear list search per pair.
        pairs = combinations(enumerate(samples), 2)
        for n_iter, ((x, sample1), (y, sample2)) in enumerate(pairs, start=1):
            print(f'{n_iter}/{n_comb}')
            f3, se, z, vb, vj = allel.average_patterson_f3(
                ac_dict[sample1], ac_dict[sample2], ac_outgroup, blen)
            f3_ay[x, y] = f3
            f3_ay[y, x] = f3
            z_ay[x, y] = z
            z_ay[y, x] = z

    pd.DataFrame(f3_ay, columns=samples,
                 index=samples).to_csv(f'{outprefix}.f3.tsv', sep='\t')
    pd.DataFrame(z_ay, columns=samples,
                 index=samples).to_csv(f'{outprefix}.z.tsv', sep='\t')
def sim_load_h5_to_PCA(h5_path):
    '''
    load dataset from h5 format file, remove non-informative columns,
    LD-prune the variants and fit a 2-component PCA.

    The coordinates are also saved, comma-separated, to
    data_s//tgp_pca<k>.txt.

    input: path file
    output: PCA coordinates
    '''

    def ld_prune(gn, size, step, threshold=.1, n_iter=1):
        # Repeatedly drop variants flagged as linked by locate_unlinked.
        for i in range(n_iter):
            loc_unlinked = allel.locate_unlinked(gn, size=size, step=step,
                                                 threshold=threshold)
            n = np.count_nonzero(loc_unlinked)
            n_remove = gn.shape[0] - n
            print('iteration', i + 1, 'retaining', n, 'removing', n_remove,
                  'variants')
            gn = gn.compress(loc_unlinked, axis=0)
        return gn

    # The context manager releases the HDF5 handle once the PCA is fitted.
    with h5py.File(h5_path, mode='r') as callset:
        # Reference: http://alimanfoo.github.io/2015/09/28/fast-pca.html
        g = allel.GenotypeChunkedArray(callset['calldata/GT'])
        ac = g.count_alleles()[:]

        # remove singletons and multiallelic SNPs. Singletons are not
        # informative for PCA.
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        gf = g.compress(flt, axis=0)

        # 2-dimensional matrix where each cell has the number of
        # non-reference alleles per call
        gn = gf.to_n_alt()

        # Removing correlated features (LD pruning): each SNP is a feature
        # and SNPs tend to be correlated. More than 3 iterations removes
        # almost nothing.
        gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=3)

        # PCA
        k = 2
        coords1, model1 = allel.pca(gnu, n_components=k, scaler='patterson')

    np.savetxt('data_s//tgp_pca' + str(k) + '.txt', coords1, delimiter=',')
    return coords1
def prepData(directory, outfn, newVCF, samples, bs):
    """Open the HDF5 callset for `outfn` and return its transformed genotypes.

    Either an existing VCF is assumed, or a new one is first made from the
    list of samples when `newVCF` equals True.

    Args:
        directory: base directory; the callset is read from
            <directory>/analysis-vcf2hdf5/<outfn>.snps.hdf5
        outfn: base name of the VCF / HDF5 files.
        newVCF: whether to build a new VCF with `makeVCF` first.
        samples: samples passed to `makeVCF`.
        bs: unused here (the conversion step that took it is disabled).

    Returns:
        Tuple of the transformed genotype data and the open HDF5 file.
    """
    if newVCF == True:
        makeVCF(directory, samples, outfn)

    # Only consumed by the disabled VCF -> HDF5 conversion step.
    vcffile = directory + outfn + ".vcf"

    hdf5_handle = h5py.File(
        directory + '/analysis-vcf2hdf5/' + outfn + ".snps.hdf5", mode='r')

    # genotype data, then the project's transformation
    genotypes = allel.GenotypeChunkedArray(hdf5_handle['genos'])
    transformed = transform(genotypes)
    return transformed, hdf5_handle
def main(callset, samplesA, samplesB, window_size):
    """Compute Hudson's Fst between two sample groups in fixed windows.

    Args:
        callset: mapping with "samples", "variants/POS" and "calldata/GT".
        samplesA: sample names of group A.
        samplesB: sample names of group B.
        window_size: window width in base pairs.

    Returns:
        DataFrame indexed by window start with the columns start, stop,
        nvar and fst (fst is left unset for windows without variants).
    """
    known = callset["samples"][:].astype("U8").tolist()
    sa = [known.index(name) for name in samplesA if name in known]
    sb = [known.index(name) for name in samplesB if name in known]
    check_samples(sa, samplesA, "A")
    check_samples(sb, samplesB, "B")

    positions = allel.SortedIndex(callset["variants/POS"])
    window_starts = np.arange(1, positions[-1], window_size, dtype=int)

    df = pd.DataFrame(columns=["start", "stop", "nvar"], index=window_starts)
    df["fst"] = pd.Series(index=window_starts, dtype=float)
    df["start"] = window_starts
    df["stop"] = window_starts + window_size

    gt = allel.GenotypeChunkedArray(callset["calldata/GT"])
    for start in window_starts:
        try:
            loc = positions.locate_range(start, start + window_size - 1)
        except KeyError:
            # No variant falls inside this window.
            df.at[start, "nvar"] = 0
        else:
            window_gt = gt[loc]
            num, den = allel.stats.hudson_fst(
                window_gt.count_alleles(subpop=sa),
                window_gt.count_alleles(subpop=sb))
            # ratio of sums over the window
            df.at[start, "fst"] = (np.sum(num) / np.sum(den))
            df.at[start, "nvar"] = num.size
    return df
def pca(directory, outfn, column, newVCF=False, samples=None, bs=20000):
    """
    main function to run pca visualization

    Reads <directory><outfn>.vcf, transforms the genotypes, fits a
    10-component PCA with the Patterson scaler and draws the figure coloured
    by the metadata column `column`.

    directory: directory prefix of the VCF file
    outfn: base name of the VCF file
    column: metadata column used as the sample population
    newVCF, bs: unused here (kept for interface compatibility)
    samples: samples passed to the metadata lookup
    """
    # The leftover, unused debugger import was removed.
    #gn, callset = prepData(directory, outfn, newVCF, samples, bs)
    callset = allel.read_vcf(directory + outfn + ".vcf")
    g = allel.GenotypeChunkedArray(callset['calldata/GT'])
    gn = transform(g)

    ## get metadata
    df = fp.retrieveMetaData(samples, directory, outfn)

    coords1, model1 = allel.pca(gn, n_components=10, scaler='patterson')
    fig_pca(directory,
            outfn,
            coords1,
            model1,
            'Conventional PCA.',
            sample_population=df[column])
def import_data(filepath,
                chrom_name,
                names=('POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles')):
    '''Take the path to a well-formed h5py file and return a VariantTable
    and a GenotypeArray.

    filepath: path of the HDF5 file
    chrom_name: key of the chromosome group inside the file
    names: variant fields to expose in the table (any sequence of str)

    Returns (variants, genotypes). Both are backed by the file, which is
    therefore left open.

    Raises ValueError if the two do not have the same number of positions.
    '''
    ##to-do: check that h5py file is well-formed
    callset = h5py.File(filepath, mode='r')

    # The default is an immutable tuple so it cannot be altered between
    # calls; a list is passed on, as with the previous list default.
    variants = allel.VariantChunkedTable(callset[chrom_name]['variants'],
                                         names=list(names),
                                         index='POS')
    genotypes = allel.GenotypeChunkedArray(
        callset[chrom_name]['calldata']['genotype'])

    if genotypes.shape[0] != variants.shape[0]:
        # The message no longer carries a line-continuation artefact.
        raise ValueError("Genotypes and variant table must contain the "
                         "same number of positions")
    return variants, genotypes
''' if not len(sys.argv) in [15, 17]: sys.exit( "usage:\npython2 empirical_convert_to_FVs.py chrArmFileName chrArm chrLen [segmentStart segmentEnd] subWinSize numSubWins unmaskedFracCutoff pMisPol partialStatAndDafFileName maskFileName ancestralArmFaFileName sampleToPopFileName targetPop statFileName fvecFileName\n" ) if len(sys.argv) == 17: chrArmFileName, chrArm, chrLen, segmentStart, segmentEnd, subWinSize, numSubWins, unmaskedFracCutoff, pMisPol, partialStatAndDafFileName, maskFileName, ancestralArmFaFileName, sampleToPopFileName, targetPop, statFileName, fvecFileName = sys.argv[ 1:] else: chrArmFileName, chrArm, chrLen, subWinSize, numSubWins, unmaskedFracCutoff, pMisPol, partialStatAndDafFileName, maskFileName, ancestralArmFaFileName, sampleToPopFileName, targetPop, statFileName, fvecFileName = sys.argv[ 1:] segmentStart = None chrArmFile = h5py.File(chrArmFileName, "r") genos = allel.GenotypeChunkedArray(chrArmFile[chrArm]["calldata"]["genotype"]) positions = allel.SortedIndex(chrArmFile["/%s/variants/POS" % (chrArm)][:]) refAlleles = chrArmFile[chrArm]['variants']['REF'] altAlleles = chrArmFile[chrArm]['variants']['ALT'] samples = chrArmFile[chrArm]["samples"] chrLen = int(chrLen) assert chrLen > 0 if segmentStart != None: segmentStart, segmentEnd = int(segmentStart), int(segmentEnd) assert segmentStart > 0 and segmentEnd >= segmentStart snpIndicesToKeep = [ x for x in range(len(positions)) if segmentStart <= positions[x] <= segmentEnd ] genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) positions = [positions[x] for x in snpIndicesToKeep]
# load the prerolled genomeplot instance f = genomeplot.anophelesgambiae.load() # load metadata meta = pd.read_table(os.path.join(release_dir, "samples/samples.meta.txt"), index_col=0) # identify Ugandan samples ugs_samples = meta.query("population == 'UGS'").index diversity = {} # loop through contigs to generate diversity frame for seq in f.contigs: gt = allel.GenotypeChunkedArray(callset[seq]["calldata/genotype"]).take( ugs_samples, axis=1) pos = allel.SortedIndex(callset[seq]["variants/POS"]) accessible = accessibility[seq]["is_accessible"] ac = gt.count_alleles() pi, windows, bases, counts = allel.stats.windowed_diversity( pos, ac, size=100000, is_accessible=accessible) diversity[seq] = pd.DataFrame.from_dict({ "pi": pi, "start": windows.T[0], "stop": windows.T[1], "nbases": bases, "counts": counts })
def selective_sweep(chroms,
                    pop,
                    samples,
                    haplo=True,
                    plot=False,
                    inaccessible=False):
    """
    Function to calculate H12 statistic across chromosome for given
    population. Currently not standardised or normalised.

    H12 is computed in moving windows of 1000 SNPs and paired with the
    median position of each window.

    chroms: iterable of chromosome names; every chromosome is processed
        (and plotted when `plot` is True)
    pop: population label matched against samples.population
    samples: sample metadata with a `population` column
    haplo: if True the haplotype array is included in the result
    plot: if True a figure is saved under ../data/<pop>/<chrom>/
    inaccessible: if not False, read the alternative callset location

    Returns the result of the last chromosome processed:
    (pop_haplo, h12, rounded median positions, positions) when `haplo` is
    True, otherwise (h12, rounded median positions, positions). Returns
    None when `chroms` is empty.
    """
    result = None
    for chrom in chroms:
        ############ Read zarrs #############
        if inaccessible is False:
            zarr_root = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
        else:
            zarr_root = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
        Ag_store = zarr.open_array(f"{zarr_root}/calldata/GT/", mode='r')
        positions = zarr.open_array(f"{zarr_root}/variants/POS", mode='r')[:]

        print("--------------------------------------------------")
        print(f"Zarrs loaded: {pop}, Chromosome {chrom}")

        ############ Load into gen.array and compute statistics ###########
        ag_geno = allel.GenotypeChunkedArray(Ag_store)
        pop_bool = samples.population == pop
        print("Constructing HaplotypeArray")
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()

        print("Computing statistics")
        h1, h12, h123, h2_h1 = allel.moving_garud_h(pop_haplo, size=1000)
        median_pos = allel.moving_statistic(positions, np.median, size=1000)
        print(f"mean {chrom} h12", np.mean(h12))

        if plot is True:
            print("Producing figure")
            sns.set_palette("muted")
            xtick = np.arange(0, median_pos.max(), 1000000)
            plt.figure(figsize=(30, 10))
            # Keyword arguments work on both old and current seaborn;
            # positional x/y are rejected by recent releases.
            sns.lineplot(x=median_pos, y=h12).set_title(
                f'{pop} {chrom} H12 in 1000 snp windows')
            plt.xticks(xtick)
            plt.savefig(f"../data/{pop}/{chrom}/{pop}_{chrom}_H12_scatter.png",
                        dpi=800)
            # Actually call close(); the bare attribute reference left every
            # figure open.
            plt.close()

        if haplo is True:
            result = (pop_haplo, h12, np.around(median_pos), positions)
        else:
            result = (h12, np.around(median_pos), positions)

    # NOTE(review): returning after the loop lets every chromosome be
    # processed; callers passing a single chromosome are unaffected.
    return result
def get_haplos(pops,
               chrom,
               p1,
               p2,
               samples,
               inaccessible=False,
               geno=False,
               biallelic=False,
               zarrpath=None):
    """
    Returns a haplotype array or genotype array for the region and
    populations requested

    pops: population labels matched against samples.population
    chrom: chromosome name (used for the built-in callset locations)
    p1, p2: first and last position of the region (inclusive)
    samples: sample metadata with a `population` column
    inaccessible: if not False, use the alternative built-in location
    geno: if True return genotypes instead of haplotypes
    biallelic: if True keep only biallelic (0/1) sites in the haplotypes
        and positions
    zarrpath: root of a zarr callset containing calldata/GT and variants;
        None (the default) or False selects the built-in locations

    Returns (array, pop_bool, sweep_region, positions) where array is the
    genotype array if `geno` is True and the haplotype array otherwise.
    """
    print(
        '---------------------- retrieving haplotypes -----------------------')

    # Open Zarrs, genotype and variant data. The original test
    # `zarrpath is False` never matched the default of None, so the built-in
    # locations were unreachable unless False was passed explicitly.
    if zarrpath is None or zarrpath is False:
        if inaccessible is False:
            zarr_root = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
        else:
            zarr_root = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
    else:
        zarr_root = zarrpath
    Ag_array = zarr.open_array(f"{zarr_root}/calldata/GT/", mode='r')
    Ag_store = zarr.open_group(f"{zarr_root}/variants/", mode='r')

    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD'],
        index='POS')[:]
    positions = allel.SortedIndex(variants['POS'])
    positions = positions.intersect_range(p1, p2)

    # focus on haplotype region
    sweep_region = (variants['POS'] >= p1) & (variants['POS'] <= p2)
    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    print(
        f'------------------------------- {pops} ------------------------------------'
    )
    # Restrict genotypeArray to population and make HapArray
    pop_bool = samples.population.isin(pops)
    pop_geno = ag_geno.compress(pop_bool, axis=1)
    pop_haplo = pop_geno.to_haplotypes()
    print("HaplotypeArray constructed")

    if biallelic is True:
        ac = pop_geno.count_alleles()
        # Load the mask so it can index the in-memory position index too.
        bi_al = ac.is_biallelic_01()[:]
        pop_haplo = pop_haplo.compress(bi_al, axis=0)
        positions = positions[bi_al]

    if geno is True:
        return (pop_geno, pop_bool, sweep_region, positions)
    else:
        return (pop_haplo, pop_bool, sweep_region, positions)
def multiple_alignment(pops, chrom, p1, p2, samples, hap_only=False):
    """
    Returns a multiple sequence alignment FASTA for a region, given
    populations, chromosome and locations. Useful for constructing
    phylogenetic trees (in IQTREE, e.g)

    Currently not bi-allelic which may be incorrect: allele 1 is written as
    the first ALT base, allele 2 as the second ALT base and every other call
    (including missing) as the REF base.

    pops: population labels matched against samples.population
    chrom: chromosome name
    p1, p2: first and last position of the region (inclusive)
    samples: sample metadata with the columns m_s, ox_code, population,
        country and region; it is not modified
    hap_only: unused

    Writes haplotypes/<chrom>/<chrom>_<p1>_<p2>.fasta and the matching
    .metadata table, and returns (multi_fastas, all_haps).

    Raises ValueError if `pops` is empty.
    """
    print(
        '---------------------- multiple sequence alignment -----------------------'
    )
    # Open Zarrs, genotype and variant data
    Ag_array = zarr.open_array(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/",
        mode='r')
    Ag_store = zarr.open_group(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/",
        mode='r')
    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD'], index='POS')

    # focus on haplotype region
    all_positions = variants['POS'][:]
    sweep_region = (all_positions >= p1) & (all_positions <= p2)
    variants_in_region = variants.compress(sweep_region, axis=0)
    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    # Bases of the region as text, loaded once instead of once per call.
    # NOTE(review): assumes ALT has at least two columns per variant, as the
    # original indexing of ALT[..][1] did - confirm for other callsets.
    ref = np.asarray(variants_in_region['REF'][:]).astype(str)
    alt = np.asarray(variants_in_region['ALT'][:]).astype(str)

    # clean metadata (on a new frame, so the caller's table is untouched)
    species_map = {'M': 'coluzzii', 'S': 'gambiae'}
    samples = samples.assign(species=samples['m_s'].map(species_map))[[
        'ox_code', 'population', 'country', 'species', 'region'
    ]]

    seq_frames = []
    sample_frames = []
    n_haplotypes = 0
    for pop in pops:
        print(
            f'------------------------------- {pop} ------------------------------------'
        )
        # Restrict genotypeArray to population and make HapArray
        pop_bool = samples.population == pop
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = np.asarray(pop_geno.to_haplotypes()[:])
        print("HaplotypeArray constructed")

        # Each haplotype (column) becomes one sequence of bases.
        print(f"Extracting variants and writing to Pandas Dataframe")
        sequences = []
        for n in range(pop_haplo.shape[1]):
            alleles = pop_haplo[:, n]
            # The original `if / if / else` chain fell through to REF for
            # allele 1, so the first ALT base was never written.
            bases = np.where(alleles == 1, alt[:, 0],
                             np.where(alleles == 2, alt[:, 1], ref))
            sequences.append("".join(bases))

        seq_frames.append(pd.DataFrame({"hap": np.nan, "seq": sequences}))
        n_haplotypes += len(sequences)
        print(n_haplotypes, "Haplotypes complete")
        sample_frames.append(samples[samples.population == pop])

    if not seq_frames:
        raise ValueError("pops must contain at least one population")

    # Join the dfs of different pops
    multi_fastas = pd.concat(seq_frames, ignore_index=True)
    all_samples = pd.concat(sample_frames)

    # One metadata row per haplotype (two per sample), in haplotype order,
    # so labels are matched by position instead of by sample index.
    all_haps = pd.DataFrame(np.repeat(all_samples.values, 2, axis=0))
    all_haps.columns = all_samples.columns
    # NOTE(review): both haplotypes of a sample get the same label - confirm
    # that downstream tools accept duplicate sequence names.
    multi_fastas['hap'] = '>' + all_haps['population'].astype(
        str) + '_' + all_haps['ox_code'].astype(str)

    # write header and sequence on alternating lines to make the FASTA file
    with open(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.fasta", "w") as fasta:
        for hap, seq in zip(multi_fastas['hap'], multi_fastas['seq']):
            fasta.write(f"{hap}\n{seq}\n")
    print('Multiple alignment FASTA written')

    # remove > and join with metadata for each pop, useful for plotting
    # phylo trees
    multi_fastas['hap'] = multi_fastas['hap'].str.strip('>')
    all_haps = pd.concat([multi_fastas.reset_index(drop=True), all_haps],
                         axis=1)
    all_haps.to_csv(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.metadata",
                    sep="\t",
                    index=False,
                    header=True)
    return (multi_fastas, all_haps)
b'#CHROM', b'POS', b'ID', b'REF', b'ALT', b'QUAL', b'FILTER', b'INFO', b'FORMAT' ] # rememeber to act on all 1st level keys! # does not support multiple chromosomes currently! # Actually should probably add to filter script... assert len(h5_handle.keys()) <= 1 for k in h5_handle.keys(): fh_samples = [str(s) for s in callset_fn['3R']["samples"][:]] samples = list(compress(fh_samples, pop_selection)) missing_rates = np.zeros(len(samples)) ok_samples = np.ones(len(samples), dtype="bool") gt = allel.GenotypeChunkedArray(h5_handle[k][:]) if not args.keepmissing: missing_gt = gt.is_missing() for i, s in enumerate(samples): consecutive_miss = get_consecutive_true(missing_gt[:, i]) miss_rate_i = consecutive_miss / float(missing_gt.shape[0]) print("Missing rate of", s, ':', "{:.8f}".format(miss_rate_i), "({0}/{1})".format(i + 1, len(samples))) missing_rates[i] = miss_rate_i
# Summarise the selected samples, then load the phased variants/genotypes
# and the variant-effect table for `chrom`. `oc_samples`, `oc_popc`,
# `oc_samples_bool`, `oc_hapcall_fn`, `oc_effcall_fn` and `chrom` are
# expected to be defined earlier in the script.
print("* Samples = ", oc_samples.shape[0])
print("* Populations = ", set(oc_samples[oc_popc]))
print(oc_samples.groupby(("population")).size())

# Phased variants and genotypes:
# declare objects with variant data
# NOTE(review): no mode is given, so h5py's default open mode applies -
# confirm read-only access is what is wanted here.
oc_hapcall = h5py.File(oc_hapcall_fn)
# variants of genotypes
print("Variants phased...")
oc_hapcall_var = oc_hapcall[chrom]["variants"]
oc_hapvars = allel.VariantChunkedTable(oc_hapcall_var,names=["POS","REF","ALT"],index="POS")
print(oc_hapvars.shape)
# genotype data
print("Genotypes phased...")
oc_hapcall_hap = oc_hapcall[chrom]["calldata"]["genotype"]
oc_haploty = allel.GenotypeChunkedArray(oc_hapcall_hap)
# keep only the selected samples (second axis)
oc_haploty = oc_haploty.subset(sel1=oc_samples_bool)
print(oc_haploty.shape)

# Effects:
oc_effcall = zarr.open(oc_effcall_fn)
oc_effvars = allel.VariantChunkedTable(oc_effcall["variants"],names=[
    "POS","REF","ALT","ANN_HGVS_p","ANN_HGVS_c",
    "ANN_Annotation","ANN_AA_pos","ANN_CDS_pos",
    "ANN_Feature_ID","ANN_Gene_ID","ANN_Gene_Name"
],index="POS")

# Is effect among phased variants?
# boolean mask over the effect table: True where POS also occurs in the
# phased variant table
is_eff_in_phased = np.isin(oc_effvars["POS"], oc_hapvars["POS"])
def get_alternates(pops,
                   chrom,
                   p1,
                   p2,
                   samples,
                   haps=None,
                   t=0.3,
                   missense=True,
                   inaccessible=False):
    """
    This function returns a dict of alternate alleles for each pop above a
    given frequency t, given a list of populations, chromosome and region
    p1-p2. It also extracts the SNP_effect values for bi-allelic variants

    pops: population labels matched against samples.population
    chrom: chromosome name
    p1, p2: bounds of the region (exclusive: p1 < position < p2)
    samples: sample metadata with a `population` column
    haps: optional haplotype array already restricted to the region; when
        given it is used for every population, otherwise haplotypes are
        built per population from the callset
    t: frequency threshold for the first or second alternate allele
    missense: if True keep only rows annotated 'missense_variant'
    inaccessible: if not False, read the alternative callset/annotation

    Returns a dict mapping population -> DataFrame indexed by position with
    the columns freqs, annotation, aa and ID.
    """
    ############ Read zarrs #############
    if inaccessible is False:
        zarr_root = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
        callset_fn = '../../data/snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.pass.h5'
    else:
        zarr_root = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
        callset_fn = '../../data/all_snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.h5'
    Ag_store = zarr.open_array(f"{zarr_root}/calldata/GT/", mode='r')
    positions = zarr.open_array(f"{zarr_root}/variants/POS", mode='r')[:]

    pos = (positions > p1) & (positions < p2)
    region_positions = positions[pos]

    # Read the annotations of the region, then release the HDF5 handle.
    with h5py.File(callset_fn, mode='r') as callset:
        snp_eff = callset[chrom]['variants']['ANN'][pos]

    if haps is None:
        ag_geno = allel.GenotypeChunkedArray(Ag_store)
        ag_geno = ag_geno.compress(pos, axis=0)

    snps_in_region = dict()
    for pop in pops:
        if haps is None:
            pop_bool = samples.population == pop
            print(
                f"Constructing HaplotypeArray for {pop} {chrom} between {p1} and {p2}"
            )
            pop_geno = ag_geno.compress(pop_bool, axis=1)
            # A per-population name: rebinding `haps` here made every later
            # population silently reuse the first population's haplotypes.
            pop_haps = pop_geno.to_haplotypes()
        else:
            pop_haps = haps

        ac = pop_haps.count_alleles()
        freq = ac.to_frequencies()[:]
        print("Calculating allele frequencies")

        # First or second alternate allele above the threshold. Slicing
        # tolerates regions in which no second alternate allele is observed.
        alts = (freq[:, 1:3] > t).any(axis=1)
        snps = region_positions[alts]
        freq = freq[alts]
        snp_eff_alts = pd.DataFrame(snp_eff[alts])

        df = pd.DataFrame([snps, freq]).T
        df.columns = ['pos', 'freqs']
        df['annotation'] = snp_eff_alts.Annotation.str.decode('utf8')
        df['aa'] = snp_eff_alts.HGVS_p.str.decode('utf8')
        df['ID'] = snp_eff_alts.Gene_Name.str.decode('utf8')
        df = df.set_index('pos')

        if missense is True:
            df = df[df.annotation == 'missense_variant']
        snps_in_region[pop] = df

    return (snps_in_region)
# In[5]: # declare objects with variant data p2_callset = zarr.open(p2_callset_fn) # variants of genotypes print("Variants...") p2_callset_var = p2_callset[chrom]["variants"] p2_genvars = allel.VariantChunkedTable(p2_callset_var, names=["POS", "REF", "ALT"], index="POS") print(p2_genvars.shape) # genotype data print("Genotypes...") p2_callset_gen = p2_callset[chrom]["calldata"]["genotype"] p2_genotyp = allel.GenotypeChunkedArray(p2_callset_gen) p2_genotyp = p2_genotyp.subset(sel1=p2_samples_bool) print(p2_genotyp.shape) # #### Outgroups # # Loads one outgroup, removes indels (duplicated variant positions) and subsets phase2 to include variants present in this outgroup. Then, loads outgroup genotypes and subsets them to remove indels and fit phase2. Then, loads the second outgroup and performs the same task. Thus, at each iteration, less and less variants remain (hopefully not too many are lost; worst offenders are `chri` and `epir`). # In[6]: oc_genotyp = p2_genotyp oc_genvars = p2_genvars for outn, outi in enumerate(ou_species): print("# p2 genotypes remaining: %i" % oc_genotyp.shape[0])
# Haplotype-based selection scans (iHS and nSL) for each contig of the PNG
# phased autosomal callsets.
# NOTE(review): `allel` and `plt` are used below but are not among the
# imports visible here - presumably imported earlier; confirm.
import numpy as np
import h5py
import seaborn as sns
import pandas as pd

# contigs to scan; each has its own HDF5 file
chromlist = [
    "Wb_Chr1_0", "Wb_Chr1_1", "Wb_Chr2_0", "Wb_Chr2_1", "Wb_Chr2_2",
    "Wb_Chr2_3", "Wb_Chr3_0", "Wb_Chr3_1", "Wb_Chr4_0", "Wb_Chr4_1",
    "Wb_Chr4_2"
]
seldict = {}
for c in chromlist:
    callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                        mode='r')
    samples = callset['samples'][:]
    sample_name = [sid.decode() for sid in samples.tolist()]
    g = allel.GenotypeChunkedArray(callset["calldata/GT"])
    h = g.to_haplotypes()
    pos = allel.SortedIndex(callset["variants/POS"][:])
    # count of allele 1 per variant, used to bin the standardisation
    acc = h.count_alleles()[:, 1]
    # ihs
    ihs = allel.ihs(h, pos, include_edges=True)
    # standardize_by_allele_count returns a tuple; element 0 is plotted
    ihs_std = allel.standardize_by_allele_count(ihs, acc)
    plt.plot(pos, -np.log10(ihs_std[0]))
    # despite its name, `nan` is True where iHS is NOT NaN
    nan = ~np.isnan(ihs)
    ihs_real = ihs[nan]
    pos_ihs = pos[nan]
    # nsl
    nsl = allel.nsl(h)
    nsl_std = allel.standardize_by_allele_count(nsl, acc)
    plt.plot(pos, -np.log10(nsl_std[0]))
    # NOTE(review): this mask is computed from `ihs` again, not from `nsl` -
    # looks like a copy/paste slip; confirm before relying on it.
    nan = ~np.isnan(ihs)
#list(csh['variants'].keys()) # In[21]: ## apply filter var_pass = var_tb.compress(var_tb_fltr) # ## Genotype from HDF5 # In[22]: list(csh['calldata'].keys()) # In[23]: gth = allel.GenotypeChunkedArray(csh['calldata/GT']) gth # In[24]: list(csh['samples']) # In[25]: import pandas as pd samples = pd.DataFrame({ 'sample': [b'AC3812', b'AC3813', b'AC3814', b'AC3815'], 'cell_type': ['TAP', 'TAP', 'TLX3', 'TLX3'] }) TLX = samples['cell_type'].isin(['TLX3'])