def getContigStats(vcf, window_size, outfile, subpops):
    """Write per-window summary statistics for one contig to a TSV file.

    Parameters:
        vcf: path of the VCF file, read with ``allel.read_vcf``.
        window_size: window length; used for the window count, passed to
            ``getSubWinBounds`` and to ``windowed_df``.
        outfile: path of the tab-separated output table (overwritten).
        subpops: mapping of subpopulation name to sample indices; the keys
            'rufus' and 'sasin' are required.

    One row is written per window. Windows holding fewer than 10 SNPs get
    ``numSites`` 0 and "NA" for every statistic.
    """
    min_snps = 10  # windows with fewer SNPs are reported as "NA"
    header = 'chrom\tchromStart\tchromEnd\tnumSites\tfst\ttajD1\ttajD2\tthetaW1\tthetaW2\tdxy_bw\tpi\tdfd\n'

    callset = allel.read_vcf(vcf)
    snps = allel.GenotypeArray(callset['calldata/GT'])
    positions = callset['variants/POS']

    # Keep the path and the handle in separate names: the previous code
    # rebound ``outfile`` to the file object, so every later
    # ``open(str(outfile), 'a')`` opened a file named after the object's repr
    # and the rows never reached the requested path.
    with open(str(outfile), 'w') as out:
        out.write(header)

        window_bounds = getSubWinBounds(window_size, max(positions))
        window_bound_indices = getSnpIndicesInSubWins(window_bounds, positions)
        nwindows = max(positions) // window_size - 1
        chrom = callset['variants/CHROM'][0]

        for i in range(nwindows):
            snp_indices = window_bound_indices[i]
            start, end = str(window_bounds[i][0]), str(window_bounds[i][1])

            if len(snp_indices) < min_snps:
                row = [chrom, start, end, str(0)] + ["NA"] * 8
                out.write('\t'.join(row) + '\n')
                continue

            window_snp_positions = positions[snp_indices]
            window_snps = snps.subset(snp_indices)
            window_ac_all = window_snps.count_alleles()
            window_ac_subpop = window_snps.count_alleles_subpops(subpops=subpops)
            ac_rufus = window_ac_subpop['rufus']
            ac_sasin = window_ac_subpop['sasin']

            # Weir & Cockerham Fst as the ratio of summed variance components.
            a, b, c = allel.stats.fst.weir_cockerham_fst(
                window_snps, [subpops['rufus'], subpops['sasin']])
            fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
            tajD1 = allel.stats.diversity.tajima_d(ac_rufus)
            tajD2 = allel.stats.diversity.tajima_d(ac_sasin)
            thetaW1 = allel.stats.diversity.watterson_theta(window_snp_positions, ac_rufus)
            thetaW2 = allel.stats.diversity.watterson_theta(window_snp_positions, ac_sasin)
            dxy_bw = allel.stats.diversity.sequence_divergence(
                window_snp_positions, ac_rufus, ac_sasin)
            pi = allel.stats.diversity.sequence_diversity(window_snp_positions, window_ac_all)
            dfd = allel.stats.diversity.windowed_df(
                window_snp_positions, ac_rufus, ac_sasin, size=window_size)[0][0]

            stats = (fst, tajD1, tajD2, thetaW1, thetaW2, dxy_bw, pi, dfd)
            row = [chrom, start, end, str(window_snps.shape[0])]
            row.extend(str(round(value, 6)) for value in stats)
            out.write('\t'.join(row) + '\n')
def population_statistics(synthetic_population_code, synthetic_genotypes, reference_genotypes,
                          synthetic_positions, reference_positions, reference_samples,
                          classification_map, window_size=2e5):
    """Plot windowed diversity and Tajima's D, synthetic versus reference.

    Reference samples are restricted to those whose 'population' entry in
    ``classification_map`` equals ``synthetic_population_code``. Two figures
    are saved under ``FIGURES_DIR``: ``<code>.pi.png`` and
    ``<code>.tajima_d.png``.
    """
    window_size = int(window_size)
    code = synthetic_population_code

    def save_comparison(title, synthetic_values, reference_values, y_label, file_template):
        # Draw both series against a 1-based window index and save the figure.
        plt.title(title)
        series = (
            (synthetic_values, 'Synthetic {}'.format(code)),
            (reference_values, '{}'.format(code)),
        )
        for values, label in series:
            plt.plot(np.arange(1, len(values) + 1), values, label=label)
        plt.xlabel('Windows ({}kb)'.format(window_size // 1000))
        plt.ylabel(y_label)
        plt.legend()
        plt.savefig(os.path.join(FIGURES_DIR, file_template.format(code)))
        plt.close(plt.gcf())

    labels = np.array(
        [classification_map.loc[sample]['population'] for sample in reference_samples])
    matching_reference = reference_genotypes[:, labels == code]

    synthetic_allele_counts = allel.GenotypeArray(synthetic_genotypes).count_alleles()
    reference_allele_counts = allel.GenotypeArray(matching_reference).count_alleles()

    synthetic_pi = allel.windowed_diversity(
        synthetic_positions, synthetic_allele_counts, size=window_size)[0]
    reference_pi = allel.windowed_diversity(
        reference_positions, reference_allele_counts, size=window_size)[0]
    save_comparison('Nucleotide Diversity Sliding Window Analysis',
                    synthetic_pi, reference_pi,
                    'Nucleotide Diversity (π)', '{}.pi.png')

    synthetic_D = allel.windowed_tajima_d(
        synthetic_positions, synthetic_allele_counts, size=window_size)[0]
    reference_D = allel.windowed_tajima_d(
        reference_positions, reference_allele_counts, size=window_size)[0]
    save_comparison('Tajima\'s D Sliding Window Analysis',
                    synthetic_D, reference_D,
                    'Tajima\'s D', '{}.tajima_d.png')
def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2, sites_list_chunk):
    """Read one VCF window and keep only biallelic SNPs and invariant sites.

    Returns a tuple ``(callset_is_none, gt_array, pos_array)``. The flag is
    True, and both arrays are None, when the region has no records in the VCF
    or when no site is left after filtering.
    """
    region = chromosome + ":{}-{}".format(window_pos_1, window_pos_2)
    callset = allel.read_vcf(args.vcf, region=region, fields=[
        'CHROM', 'POS', 'calldata/GT', 'variants/is_snp', 'variants/numalt'
    ])

    # read_vcf returns None when the region holds no records at all.
    if callset is None:
        return True, None, None

    genotypes = allel.GenotypeArray(
        allel.GenotypeDaskArray(callset['calldata/GT']))
    positions = allel.SortedIndex(callset['variants/POS'])

    # A site is kept when it is a SNP with exactly one ALT allele, or when it
    # has no ALT allele (invariant).
    num_alt = callset['variants/numalt'][:]
    biallelic_snp = np.logical_and(callset['variants/is_snp'][:] == 1, num_alt == 1)
    keep = np.logical_or(biallelic_snp, num_alt == 0)

    rejected_rows = np.where(np.invert(keep))
    genotypes = allel.GenotypeArray(np.delete(genotypes, rejected_rows, axis=0))
    positions = positions[keep]

    # Optional restriction to a user-supplied list of target sites.
    if sites_list_chunk is not None:
        genotypes = mask_non_target_sites(genotypes, positions, sites_list_chunk)

    # Every site may have been removed by the filters above.
    if len(genotypes) == 0:
        return True, None, None

    return False, genotypes, positions
def extractGenosAndPositionsForArm(vcfFile, chroms, currChr, sampleIndicesToKeep):
    """Extract genotypes and positions of one chromosome arm from a callset.

    Returns ``(genos, positions, positions2SnpIndices, isBiallelic)`` for the
    records whose chromosome equals ``currChr``, restricted to the samples in
    ``sampleIndicesToKeep``. When the arm has no records, four empty
    placeholders are returned instead. Haploid input, as detected by
    ``isHaploidVcfGenoArray``, is converted with ``diploidizeGenotypeArray``.
    """
    armRows = [idx for idx, name in enumerate(chroms) if name == currChr]
    armGenos = np.take(vcfFile["calldata/GT"], armRows, axis=0)
    if len(armGenos) == 0:
        return np.array([]), [], {}, np.array([])

    genos = allel.GenotypeArray(armGenos).subset(sel1=sampleIndicesToKeep)
    if isHaploidVcfGenoArray(genos):
        sys.stderr.write(
            "Detected haploid input for %s. "
            "Converting into diploid individuals "
            "(combining haplotypes in order).\n" % (currChr))
        genos = diploidizeGenotypeArray(genos)
        sys.stderr.write("Done diploidizing %s\n" % (currChr))

    positions = np.extract(chroms == currChr, vcfFile["variants/POS"])
    if len(positions) == 0:
        return np.array([]), [], {}, np.array([])

    genos = allel.GenotypeArray(genos.subset(sel0=range(len(positions))))
    # Position -> row index lookup; the assertion below also catches
    # duplicated positions, which would collapse dictionary entries.
    positions2SnpIndices = {pos: idx for idx, pos in enumerate(positions)}
    assert len(positions) == len(positions2SnpIndices) \
        and len(positions) == len(genos)
    isBiallelic = genos.count_alleles().is_biallelic()
    return genos, positions, positions2SnpIndices, isBiallelic
def main(vcffile, popa, popb, popc, popd, freqw, freqx, freqy, freqz, outfile,
         outfreqfile):
    """
    Select the sites counted by the U_A,B,C,D(w, x, y, z) statistic.

    A is the non-introgressed population, B the introgressed (recipient)
    population, C introgression source population 1 and D introgression
    source population 2. U_A,B,C,D(w, x, y, z) is the number of SNP sites in a
    window whose frequency is below w in A, above x in B, above y in C and
    below z in D.
    See: Signatures of Archaic Adaptive Introgression in Present-Day Human
    Populations.

    Parameters:
        vcffile: VCF read once per population.
        popa, popb, popc, popd: text files with one sample ID per line.
        freqw, freqx, freqy, freqz: frequency thresholds for A, B, C, D.
        outfile: output path for the chrom/pos of the selected U sites.
        outfreqfile: output path for the four frequencies of every site that
            passed the population C threshold.
    """
    def read_sample_ids(path):
        # Use a context manager so the population file is closed right away.
        with open(path) as handle:
            return [line.strip() for line in handle]

    def allele_frequencies(sample_ids):
        # Allele frequencies per site for the given samples; the callset is
        # local so it can be freed as soon as the frequencies are computed.
        callset = allel.read_vcf(vcffile, samples=sample_ids,
                                 fields=['calldata/GT'])
        return allel.GenotypeArray(
            callset['calldata/GT']).count_alleles().to_frequencies()

    popA = read_sample_ids(popa)
    popB = read_sample_ids(popb)
    popC = read_sample_ids(popc)
    popD = read_sample_ids(popd)

    callset_C = allel.read_vcf(
        vcffile,
        samples=popC,
        fields=['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'])
    af_C = allel.GenotypeArray(
        callset_C['calldata/GT']).count_alleles().to_frequencies()
    # Keep only sites where some allele reaches frequency y in population C.
    site_selection = np.sum(af_C >= freqy, axis=1) > 0
    site_pos = callset_C['variants/POS'][site_selection]
    chroms = callset_C['variants/CHROM'][site_selection]
    # Allele mask taken from population C; it also implies site_selection.
    # NOTE(review): this assumes at most one allele per site reaches freqy in
    # C and that every population yields the same number of allele columns -
    # confirm for multi-allelic data or freqy <= 0.5.
    allel_selection = af_C >= freqy
    af_C = af_C[allel_selection]
    del callset_C

    af_A = allele_frequencies(popA)[allel_selection]
    af_B = allele_frequencies(popB)[allel_selection]
    af_D = allele_frequencies(popD)[allel_selection]

    Usites_selection = (af_A <= freqw) & (af_B >= freqx) & (af_C >= freqy) & (
        af_D <= freqz)
    U_chroms = chroms[Usites_selection]
    U_pos = site_pos[Usites_selection]

    # The loop variables must not reuse the name of the position array: the
    # previous code rebound ``pos`` to a scalar here, which broke the zip in
    # the second loop.
    with open(outfile, 'w') as f:
        for u_chrom, u_pos in zip(U_chroms, U_pos):
            f.write(f'{u_chrom}\t{u_pos}\n')
    with open(outfreqfile, 'w') as f:
        f.write('chrom\tpos\tfreqA\tfreqB\tfreqC\tfreqD\n')
        for chrom, position, freqA, freqB, freqC, freqD in zip(
                chroms, site_pos, af_A, af_B, af_C, af_D):
            f.write(
                f'{chrom}\t{position}\t{freqA:.3f}\t{freqB:.3f}\t{freqC:.3f}\t{freqD:.3f}\n'
            )
def load_genotypes():
    """Load genotypes and sample names from the configured input.

    Reads the module-level ``args``; ``args.zarr`` takes precedence over
    ``args.vcf``.

    Returns:
        (genotypes, samples): an ``allel.GenotypeArray`` and the sample names.

    Raises:
        ValueError: if neither ``args.zarr`` nor ``args.vcf`` is set.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError("no genotype input given: set args.zarr or args.vcf")
    return genotypes, samples
def get_genotype_array_concat(callsets, genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Concatenate the genotype data of several callsets along the variant axis.

    A single callset is delegated to ``get_genotype_array``. Otherwise the
    per-callset data are concatenated on axis 0 and wrapped in the scikit-allel
    array class selected by ``genotype_array_type``.

    Raises:
        ValueError: for an unknown ``genotype_array_type`` when more than one
            callset is given.
    """
    if len(callsets) == 1:
        # Nothing to concatenate.
        return get_genotype_array(callset=callsets[0],
                                  genotype_array_type=genotype_array_type)

    use_dask = genotype_array_type == config.GENOTYPE_ARRAY_DASK

    pieces = []
    for callset in callsets:
        data = get_callset_genotype_data(callset)
        if use_dask:
            # Wrap the underlying array in a dask array with the same chunks.
            data = da.from_array(data, chunks=data.chunks)
        pieces.append(data)

    if use_dask:
        return allel.GenotypeDaskArray(da.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        return allel.GenotypeChunkedArray(np.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        return allel.GenotypeArray(np.concatenate(pieces, axis=0))
    raise ValueError(
        'Error: Invalid option specified for genotype_array_type.')
def vcf2hmmibd(args):
    """Convert a VCF into a tab-separated genotype table for hmmIBD.

    For every variant and sample the called allele is the index with the
    largest allele depth (AD). Calls whose summed positive depths are below
    ``args.mindepth`` are written as -1. Chromosome names are mapped through
    the table returned by ``read_chrom_translation(args.translationfile)``.
    """
    print(f'Reading VCF file: {args.infile}')
    wanted_fields = [
        'samples', 'variants', 'calldata/DP', 'calldata/GT', 'calldata/AD'
    ]
    callset = allel.read_vcf(args.infile, fields=wanted_fields)
    sample_names = callset['samples']

    # Translate chromosome names and pair them with the positions.
    chrom_lookup = read_chrom_translation(args.translationfile)
    chrom_codes = [chrom_lookup[name] for name in callset['variants/CHROM']]
    site_columns = np.column_stack((chrom_codes, callset['variants/POS']))

    # Called allele = index of the deepest allele.
    depths = allel.GenotypeArray(callset['calldata/AD'])
    calls = np.argmax(depths, axis=2)

    # Only positive depths are summed; low-depth calls become -1.
    depth_totals = depths.sum(axis=2, where=(depths > 0))
    calls[depth_totals < args.mindepth] = -1

    header = ['chrom', 'pos'] + list(sample_names)
    table = pd.DataFrame(np.hstack((site_columns, calls)), columns=header)
    table.to_csv(args.outfile, sep='\t', index=False)
    print(f'Input file for hmmIBD written at: {args.outfile}')
def readData(dir):
    """Add one key per variant of every VCF in ``dir`` to a bloom filter.

    For each directory entry a filter is created from the template at
    ``outdir + name + ".bloom"``. The key for a variant is
    ``"<chrom>|<pos>|<code>"`` where ``code`` is the bitwise OR of the two
    alleles of the first sample (missing calls are filled with 2 on read).
    Errors while processing a file are printed and the file is skipped.

    Note: the parameter name shadows the builtin ``dir``; it is kept for
    backward compatibility with existing callers.
    """
    valid_alleles = (0, 1, 2)
    for f in os.listdir(dir):
        bf = tbf.copy_template(outdir + f + ".bloom")
        try:
            # os.path.join also works when ``dir`` has no trailing separator.
            vcf_path = os.path.join(dir, f)
            callset = allel.read_vcf(
                vcf_path,
                fields=['variants/CHROM', 'variants/POS', 'calldata/GT'],
                types={'calldata/GT': 'i2'},
                fills={'calldata/GT': 2})
            chrom = callset['variants/CHROM']
            pos = callset['variants/POS']
            gt = allel.GenotypeArray(callset['calldata/GT'])
            for i in range(len(chrom)):
                position = '|'.join([chrom[i], str(pos[i])])
                first_call = allel.GenotypeVector(gt[i])[0]
                key1 = 0
                key2 = 0
                # The old test ``a ==0|a ==1|a ==2`` parsed as the chained
                # comparison ``a == (0|a) == (1|a) == 2`` and was always
                # False, so both keys stayed 0.
                if first_call[0] in valid_alleles:
                    key1 = first_call[0]
                    print("key1:" + str(key1))
                if first_call[1] in valid_alleles:
                    key2 = first_call[1]
                combined = key1 | key2
                bf.add('|'.join([position, str(combined)]))
        except Exception as err:
            # Best effort: report the failing file and carry on with the next.
            print(f)
            print(err)
        finally:
            bf.close()
def obtain_ancestry_panel(local_callset, sample_list, max_read_count, gq_threshold):
    """Count reference and alternate alleles per site over the panel samples.

    Only calls with DP below ``max_read_count`` and GQ of at least
    ``gq_threshold`` contribute. Returns a two-column array with the
    reference count first and the alternate count second.
    """
    columns = obtain_indices(local_callset['samples'], sample_list)

    depth = local_callset['calldata/DP'][:, columns]
    quality = local_callset['calldata/GQ'][:, columns]
    # A call passes only when both the depth and the quality filter pass.
    call_passes = np.logical_and(depth < max_read_count, quality >= gq_threshold)

    genotypes = allel.GenotypeArray(local_callset['calldata/GT'][:, columns])

    # Failing calls are zeroed out before summing across samples.
    alt_counts = (genotypes.to_n_alt()[:] * call_passes).sum(1)
    ref_counts = (genotypes.to_n_ref()[:] * call_passes).sum(1)
    return np.column_stack((ref_counts, alt_counts))
def circos(directory, outfn, vcffile):
    """Write the Fst and heterozygosity .dat tracks for a circos plot.

    Reads the SNP-density table ``directory + outfn + ".dat"`` and the VCF
    ``directory + vcffile + ".vcf"``, then calls ``fp.makeDATFile`` once with
    the values from ``fst`` (tag 'fst') and once with the per-site count of
    heterozygous calls (tag 'het').
    """
    density_path = directory + outfn + ".dat"
    snpdensity = pd.read_csv(density_path, sep='\t', header=None)

    # The VCF name comes from the caller.
    vcf_path = directory + vcffile + ".vcf"
    callset = allel.read_vcf(vcf_path)
    positions = callset['variants/POS']
    chromosomes = callset['variants/CHROM']
    genotypes = allel.GenotypeArray(callset['calldata/GT'])

    # Fst track
    sample_names = callset['samples']
    fst_values = fst(genotypes, directory, outfn, sample_names)
    fp.makeDATFile(positions, genotypes, chromosomes, fst_values, snpdensity,
                   directory, outfn, 'fst')

    # Heterozygosity track: number of heterozygous calls at each position.
    het_counts = genotypes.count_het(axis=1)
    fp.makeDATFile(positions, genotypes, chromosomes, het_counts, snpdensity,
                   directory, outfn, 'het')
def countPatternDFOIL(callset, sample_ix, outgroup):
    """Build homozygous-alt bit patterns for every sample quartet plus outgroup.

    ``sample_ix`` holds four groups of sample indices; one sample is drawn
    from each group for every combination. Returns a dict keyed by the
    1-based combination number with ``(positions, packed_patterns)`` values.
    """
    print("counting patterns in file...")
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])

    # Keep biallelic sites where the outgroup call is homozygous, which drops
    # missing and heterozygous outgroup calls.
    outgroup_hom = genotypes[:, outgroup].is_hom()
    biallelic = genotypes.count_alleles().is_biallelic()
    site_mask = outgroup_hom & biallelic
    genotypes = genotypes.compress(site_mask, axis=0)
    positions = positions[site_mask]

    group1, group2, group3, group4 = sample_ix
    combos = list(product(group1, group2, group3, group4))
    print("total number of combinations: {}".format(len(combos)))

    windict = {}
    for number, combo in enumerate(combos, start=1):
        print("permutation number {}".format(number))
        first, second, third, fourth = combo
        subset = genotypes.take([first, second, third, fourth, outgroup], axis=1)
        # Only sites where all five samples are homozygous are usable.
        all_hom = subset.is_hom().all(axis=1)
        subset = subset.compress(all_hom, axis=0)
        packed = np.packbits(subset.is_hom_alt(), axis=1)
        windict[number] = (positions[all_hom], packed)
    return windict
def main(vcffile, groupfile, outfile):
    """Write the first-ALT allele frequency of every group to a TSV file.

    Parameters:
        vcffile: VCF with the variants and genotypes.
        groupfile: whitespace-separated text file, one ``sampleID groupID``
            pair per line; blank lines are ignored.
        outfile: output table with chr, pos, REF, ALT and one frequency
            column per group.
    """
    group2inds = defaultdict(list)
    with open(groupfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (often trailing) line used to crash the unpacking.
                continue
            sampleID, groupID = fields
            group2inds[groupID].append(sampleID)

    # Second and further ALT alleles are ignored (the site itself is kept);
    # for multi-allelic sites it is better to split the ALT alleles onto
    # separate VCF rows beforehand.
    variants = allel.read_vcf(
        vcffile,
        fields=[
            'variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT'
        ],
        numbers={'ALT': 1})
    df = pd.DataFrame({
        'chr': variants['variants/CHROM'],
        'pos': variants['variants/POS'],
        'REF': variants['variants/REF'],
        'ALT': variants['variants/ALT']
    })

    for group, samples in group2inds.items():
        print(group)
        print(samples)
        callset = allel.read_vcf(vcffile,
                                 samples=samples,
                                 fields=['samples', 'calldata/GT'])
        af = allel.GenotypeArray(
            callset['calldata/GT']).count_alleles().to_frequencies()
        if af.shape[1] > 1:
            df[group] = af[:, 1]  # frequency of the first ALT allele
        else:
            # Only the REF column exists when no ALT allele was observed.
            df[group] = .0

    df.to_csv(outfile,
              sep='\t',
              index=False,
              float_format='%.3f',
              na_rep='nan')
def read_vcf_founderliab(path):
    """Read the genotypes of a whole VCF and return the transposed alt dosage.

    Only the GT field is loaded. The result is the per-call count of
    alternate alleles, transposed so that the first axis runs over samples.
    """
    callset = allel.read_vcf(path, fields=['calldata/GT'])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    alt_dosage = genotypes.to_n_alt()
    return alt_dosage.T
def read_vcf_allel(file_vcf):
    '''
    Read a VCF with scikit-allel and summarise its variants in a DataFrame.

    Returns ``(geno, summary, samples)``: the alt-allele dosage matrix with
    samples on the first axis, a DataFrame with the columns CHROM, POS, ID,
    REF, ALT, QUAL and FILTER, and the sample names. Three empty dicts are
    returned when the VCF yields nothing.
    '''
    print(file_vcf)
    vcf_ori = allel.read_vcf(file_vcf)
    if not vcf_ori:
        print('empty vcf.')
        return {}, {}, {}
    print(vcf_ori.keys())

    geno = vcf_ori['calldata/GT']
    ref_alleles = vcf_ori['variants/REF']
    alt_alleles = vcf_ori['variants/ALT']
    n_variants = geno.shape[0]

    # Rows with a second ALT allele, and rows whose REF and first ALT are
    # both a single character.
    multi_alt_rows = [row for row in range(n_variants) if alt_alleles[row][1]]
    single_base_rows = [
        row for row in range(n_variants)
        if len(ref_alleles[row]) == 1 and len(alt_alleles[row][0]) == 1
    ]

    # At multi-ALT rows every allele index above 1 is reset to 0.
    for row in multi_alt_rows:
        calls = geno[row]
        calls[calls > 1] = 0
        geno[row] = calls

    dosage = allel.GenotypeArray(geno).to_n_alt().T

    column_names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    n_columns = dosage.shape[1]
    first_alts = [alt_alleles[col][0] for col in range(n_columns)]
    filter_labels = [
        'PASS' if int(vcf_ori['variants/FILTER_PASS'][col]) else '.'
        for col in range(n_columns)
    ]
    summary = np.array([
        vcf_ori['variants/CHROM'],
        vcf_ori['variants/POS'],
        vcf_ori['variants/ID'],
        ref_alleles,
        first_alts,
        vcf_ori['variants/QUAL'],
        filter_labels,
    ]).T

    # Restrict to the single-base rows, unless there are none at all.
    if len(single_base_rows):
        print('mutliple ref loci: {}'.format(dosage.shape[1] - len(single_base_rows)))
        dosage = dosage[:, single_base_rows]
        summary = summary[single_base_rows, :]

    summary = pd.DataFrame(summary, columns=column_names)
    return dosage, summary, vcf_ori['samples']
def get_geno(self, m=0, n=0, z=None):
    """Return alt-allele dosages with missing calls replaced by the row mean.

    :param m: the first row of the selection
    :param n: the row after the last row of the selection
    :param z: the selected columns, a list or None (default) for all columns
    :return: float64 dosage array; missing calls hold the mean of the
        non-missing calls of the same row

    The whole genotype store is used, and ``z`` is ignored, when ``m + n``
    is not positive.
    """
    if m + n > 0:
        selection = self._gt[m:n] if z is None else self._gt[m:n, z]
    else:
        selection = self._gt[...]
    genotypes = allel.GenotypeArray(selection)

    # Missing calls are encoded as -1 and then masked out.
    dosage = genotypes.to_n_alt(fill=-1).astype('float64')
    masked = ma.masked_less(dosage, 0)
    row_means = masked.mean(axis=1)
    # Write each row mean into the masked (missing) cells of that row.
    np.copyto(masked, row_means[..., None], where=masked.mask)
    return masked.data
def data_generator(self, z=None):
    """Yield batches of alt-allele dosages as float64 arrays.

    :param z: the selected columns, a list or None (default) for all columns

    ``self._num_batches`` batches of ``self._batch_size`` rows are read from
    ``self._gt``; missing calls count as 0 alternate alleles.
    """
    batch_size = self._batch_size
    for batch in range(self._num_batches):
        start = batch * batch_size
        stop = start + batch_size
        # The former "last batch" branch compared the batch number with the
        # list of index pairs, so it never ran and has been removed.
        # NOTE(review): presumably slicing past the end of self._gt just
        # shortens the last batch - confirm for the store type in use.
        rows = self._gt[start:stop] if z is None else self._gt[start:stop, z]
        batch_geno = allel.GenotypeArray(rows)
        yield batch_geno.to_n_alt(fill=0).astype('float64')
def sci_variant_bldr(self):
    """Load the VCF(s) under ``self.path`` into one variant/genotype DataFrame.

    With more than one ``.vcf`` file, each file lacking a ``.gz`` companion is
    compressed with bgzip and indexed with tabix, all ``*.gz`` files are
    merged with ``bcftools merge --force-samples`` into ``INPUT.vcf``, and that
    file is loaded. Otherwise the single VCF is loaded directly.

    Returns:
        DataFrame with the variant columns followed by one alt-allele dosage
        column per sample.

    Raises:
        FileNotFoundError: if ``self.path`` contains no ``.vcf`` file.
        subprocess.CalledProcessError: if bgzip, tabix or bcftools fails.
    """
    import allel
    import subprocess
    import collections
    import glob
    import pandas as pd
    import os

    fields = [
        'samples', 'calldata/GT', 'variants/ALT', 'variants/REF',
        'variants/CHROM', 'variants/POS', 'variants/svlen'
    ]
    excluded = ['QUAL', 'FILTER_PASS', 'ID']
    vcf_names = [name for name in os.listdir(self.path) if name.endswith('.vcf')]

    if len(vcf_names) > 1:
        print("Multiple VCFs detected. Files will be merged")
        # NOTE(review): an INPUT.vcf left over from an earlier run also ends
        # in .vcf and is treated as one more input - confirm this is wanted.
        uncompressed = [
            name for name in vcf_names
            if not os.path.exists(os.path.join(self.path, name) + ".gz")
        ]
        if uncompressed:
            print("VCFs not compressed - compressing")
            for name in uncompressed:
                vcf = os.path.join(self.path, name)
                # bgzip -c writes to stdout; redirect it without a shell.
                with open(vcf + ".gz", "wb") as gz_out:
                    subprocess.run(['bgzip', '-c', vcf], stdout=gz_out, check=True)
                # Index the compressed file.
                subprocess.run(['tabix', '-f', '-p', 'vcf', vcf + ".gz"], check=True)
        vcf_path = os.path.join(self.path, 'INPUT.vcf')
        gz_files = sorted(glob.glob(os.path.join(self.path, '*.gz')))
        subprocess.run(
            ['bcftools', 'merge', '--force-samples', *gz_files, '-o', vcf_path],
            check=True)
    else:
        if not vcf_names:
            raise FileNotFoundError(
                "no .vcf file found in {}".format(self.path))
        vcf_path = os.path.join(self.path, vcf_names[0])

    vcfdata = allel.read_vcf(vcf_path, fields=fields)
    vcfdf = allel.vcf_to_dataframe(vcf_path, exclude_fields=excluded)

    # Sample names, de-duplicated in their original order.
    sample_set = list(collections.OrderedDict.fromkeys(vcfdata['samples']))
    # Reduce each call to its alt-allele count.
    gt = allel.GenotypeArray(vcfdata['calldata/GT']).to_n_alt()
    gt_data = pd.DataFrame(gt, columns=sample_set)
    data = pd.concat([vcfdf, gt_data], axis=1, join='inner')
    return data
def generate_encoded_genotypes(path='', geno_file='', subset=False, subset_geno_file=''):
    '''
    Generates genotype input for external_dataset.py

    Genotypes are encoded as 0, 1 or 2, the number of reference alleles in
    the sample's genotype: homozygous reference is 2, heterozygous is 1 and
    homozygous alternate is 0. A call with a missing allele is written as
    'NA'.

    Args:
        path: directory holding the pickled callset; 'snps.txt' is written
            there as well.
        geno_file: pickled callset used when ``subset`` is False.
        subset: if True, load ``subset_geno_file`` instead (a subset of 2000
            snps for 250 subjects generated with vcf_subset_generator.py).
        subset_geno_file: pickled subset callset.

    Returns: nothing; writes snps.txt
    '''
    def encode(first, second):
        # Map one diploid call onto its reference-allele count.
        if first < 0 or second < 0:
            return 'NA'  # missing call
        if first != second:
            return 1
        return 2 if first == 0 else 0

    filepath = os.path.join(path, subset_geno_file if subset else geno_file)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(filepath, 'rb') as f:
        callset = pickle.load(f)

    gt = allel.GenotypeArray(callset['calldata/GT'])

    # Trim the repeated name in the sample IDs.
    samples_trimmed = [name.split('_')[0] for name in callset['samples'].tolist()]
    ids = callset['variants/ID'].tolist()

    # Collect every column first and build the frame once instead of growing
    # it one column at a time.
    columns = {'subjects': samples_trimmed}
    for snp, genotype in enumerate(gt):
        columns[ids[snp]] = [encode(call[0], call[1]) for call in genotype]
    df = pd.DataFrame(columns)

    df.to_csv(os.path.join(path, 'snps.txt'), index=None, sep='\t')
def load_genotypes():
    """Load genotypes and sample names from zarr, VCF or a dosage matrix.

    Reads the module-level ``args``; the first of ``args.zarr``, ``args.vcf``
    and ``args.matrix`` that is not None is used. The matrix is a
    tab-separated table with a ``sampleID`` column and one column per site
    holding alt-allele counts of 0, 1 or 2.

    Returns:
        (genotypes, samples): an ``allel.GenotypeArray`` and the sample names.

    Raises:
        ValueError: if no input is configured, or if the matrix contains a
            count other than 0, 1 or 2.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    elif args.matrix is not None:
        gmat = pd.read_csv(args.matrix, sep="\t")
        samples = np.array(gmat['sampleID'])
        counts = np.array(gmat.drop(labels="sampleID", axis=1), dtype="int8")
        if not np.isin(counts, (0, 1, 2)).all():
            # Other values used to be skipped silently, leaving haplotype
            # rows of unequal length.
            raise ValueError("genotype matrix may only contain 0, 1 or 2")
        # Rows 2*i and 2*i+1 hold the two haplotypes of sample i: a count of
        # 1 becomes (1, 0) and a count of 2 becomes (1, 1).
        hmat = np.empty((2 * counts.shape[0], counts.shape[1]), dtype=int)
        hmat[0::2] = counts >= 1
        hmat[1::2] = counts == 2
        genotypes = allel.HaplotypeArray(
            np.transpose(hmat)).to_genotypes(ploidy=2)
    else:
        raise ValueError(
            "no genotype input given: set args.zarr, args.vcf or args.matrix")
    return genotypes, samples
def filterGT(callset, outgroup):
    """Keep only the sites that are segregating within the outgroup.

    Returns ``(gt, pos)``: the filtered genotype array and the matching
    sorted position index.
    """
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])
    # Count alleles 0 and 1 over the outgroup samples only.
    outgroup_counts = genotypes[:, outgroup].count_alleles(max_allele=1)
    segregating = outgroup_counts.is_segregating()
    return genotypes.compress(segregating, axis=0), positions[segregating]
def sample_genotype_array(self, sample, part):
    """Return the genotypes of one sample as a Series indexed by position.

    The per-sample VCF is created with ``bcftools view`` (biallelic SNPs
    only) unless ``self._check_local`` reports that it already exists.
    """
    file = self._sample_genotype_path(sample, part)
    already_present = self._check_local(file)
    if not already_present:
        # NOTE(review): the command is interpolated into a shell string, so
        # sample names or paths with shell metacharacters would break it.
        cmd = f'bcftools view -s {sample} -v snps -m2 -M2 -Oz -o {file} {self._chr_path} {self._query(part)}'
        subprocess.call(cmd, shell=True, stdout=subprocess.PIPE)
    callset = allel.read_vcf(file, fields=['GT', 'POS'])
    # The extracted file holds a single sample, hence column 0.
    sample_calls = allel.GenotypeArray(callset['calldata/GT'])[:, 0]
    return pd.Series(sample_calls, index=callset['variants/POS'])
def _load_calldata(self):
    """Load genotypes from ``self.data`` and drop uninformative sites.

    Stores the sample names in ``self.samples_vcforder`` and, in
    ``self.genotypes``, only the sites whose highest observed allele is 1
    and whose two allele counts are both greater than 1 (removing
    multi-allelic SNPs and biallelic singletons).
    """
    callset = allel.read_vcf(self.data, fields=["samples", "GT"])
    self.samples_vcforder = callset["samples"]

    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    counts = genotypes.count_alleles()
    highest_allele_is_one = counts.max_allele() == 1
    both_alleles_repeated = counts[:, :2].min(axis=1) > 1
    keep = highest_allele_is_one & both_alleles_repeated
    self.genotypes = genotypes.compress(keep, axis=0)
def get_genotype_array(callset, genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Wrap a callset's genotype data in the requested scikit-allel array class.

    Returns None when ``genotype_array_type`` is not one of the three
    ``config.GENOTYPE_ARRAY_*`` options.
    """
    data = get_callset_genotype_data(callset)
    wrapped = None
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        wrapped = allel.GenotypeArray(data)
    elif genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        wrapped = allel.GenotypeDaskArray(data)
    elif genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        wrapped = allel.GenotypeChunkedArray(data)
    return wrapped
def geno2fst(args):
    """Write per-site Hudson Fst of every group against all other samples.

    For each position produced by the line parser the output row holds the
    position info, the max, mean and median of the per-group Fst values, the
    minor allele frequency and one Fst value per group (sorted group keys).
    Fst values outside [0, 1], including NaN, are written as 0.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    cout('Writing outfile...')
    # The context manager closes (and flushes) the file even when a position
    # fails; the handle was previously never closed.
    with open(args.outfile, 'w') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tMAX\tMEAN\tMEDIAN\tMAF\t%s\n' %
                      '\t'.join(group_keys))

        for idx, (posinfo, genolist) in enumerate(lineparser.parse(), 1):
            genoarray = allel.GenotypeArray([genolist])

            # Minor allele frequency; 0 when only one allele was counted.
            ac = genoarray.count_alleles()
            minor = np.min(ac)
            total = np.sum(ac)
            maf = 0 if minor == total else minor / total

            # Fst of each group against the remaining samples.
            fst_sites = []
            for g in group_keys:
                ac_g = genoarray.count_alleles(subpop=groups[g])
                ac_ng = genoarray.count_alleles(
                    subpop=list(lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                fst = num[0] / den[0]
                if not (0.0 <= fst <= 1.0):
                    fst = 0
                fst_sites.append(fst)

            if idx % 100 == 0:
                cerr('I: writing position no %d' % idx)

            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4],
                           np.max(fst_sites), np.mean(fst_sites),
                           np.median(fst_sites), maf,
                           '\t'.join('%5.4f' % x for x in fst_sites)))
def geno2dhe(args):
    """Write per-site He, dHe and the derived FST for the parsed groups.

    He is 1 minus the sum of squared allele frequencies over all samples.
    The per-group He values are averaged with the group sizes as weights,
    dHe is He minus that average, and FST is dHe / He.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name, members in lineparser.groups.items():
        cout(' %12s %3d' % (name, len(members)))

    group_keys = sorted(lineparser.groups.keys())
    cout(group_keys)

    # Read the whole genotype table, then drop the raw copy.
    cerr('I: reading genotype file')
    allel_array = lineparser.parse_all()

    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(allel_array)
    del allel_array

    cerr('I: calculating He')
    He = 1 - np.sum(genoarray.count_alleles().to_frequencies()**2, axis=1)

    He_groups = {}
    pHe = None
    for g in groups:
        group_freqs = genoarray.count_alleles(subpop=groups[g]).to_frequencies()
        He_groups[g] = 1 - np.sum(group_freqs**2, axis=1)
        weighted = He_groups[g] * len(groups[g])
        pHe = weighted if pHe is None else pHe + weighted

    n_grouped = sum(len(x) for x in groups.values())
    dHe = He - pHe / n_grouped
    FST = dHe / He

    cerr('I: writing output file')
    with open(args.outfile, 'wt') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tFST\tdHe\tHe\t%s\n' %
                      '\t'.join(group_keys))
        for i in range(len(He)):
            posinfo = lineparser.position[i]
            group_columns = '\t'.join('%5.4f' % He_groups[g][i] for g in group_keys)
            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4], FST[i],
                           dHe[i], He[i], group_columns))
def allelify(self):
    """Wrap the stored genotype and allele-count arrays in scikit-allel types.

    Every value of ``self.genotypes`` becomes an ``allel.GenotypeArray`` and
    every value of ``self.allele_counts`` an ``allel.AlleleCountsArray``;
    both attributes are replaced by new dictionaries with the same keys.
    """
    wrapped_genotypes = {}
    for key, value in self.genotypes.items():
        wrapped_genotypes[key] = allel.GenotypeArray(value)
    self.genotypes = wrapped_genotypes

    wrapped_counts = {}
    for key, value in self.allele_counts.items():
        wrapped_counts[key] = allel.AlleleCountsArray(value)
    self.allele_counts = wrapped_counts
def load_vcf_wrapper(path, seqid, samples):
    """Load positions and genotypes of one region for the given samples.

    The region ``seqid`` is extracted through tabix. Returns
    ``(positions, genotypes)`` as an ``allel.SortedIndex`` and an
    ``allel.GenotypeArray``.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    callset = allel.read_vcf(path,
                             region=seqid,
                             fields=wanted_fields,
                             tabix="tabix",
                             samples=samples)
    positions = allel.SortedIndex(callset["variants/POS"])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return positions, genotypes
def load_genotypes():
    """Load genotypes and return them as haplotypes with per-haplotype names.

    Reads the module-level ``args``: the zarr store is used when
    ``args.zarr`` is set, otherwise ``args.vcf`` is read.

    Returns:
        (haps, samples): an ``allel.HaplotypeArray`` whose columns are the
        first haplotype of every sample followed by the second haplotype of
        every sample, and the matching names (``<sample>_h0`` entries, then
        ``<sample>_h1`` entries).
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        genotypes = allel.GenotypeArray(callset['calldata/GT'][:])
        samples = callset['samples'][:]
    else:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']

    # The split below now uses the sample names of whichever branch ran; it
    # used to read them from ``vcf``, which does not exist on the zarr path.
    hap0 = genotypes[:, :, 0]
    hap1 = genotypes[:, :, 1]
    # Column order: all hap0 in sample order, then all hap1 in sample order.
    haps = allel.HaplotypeArray(np.concatenate((hap0, hap1), axis=1))
    s0 = [x + "_h0" for x in samples]
    s1 = [x + "_h1" for x in samples]
    samples = np.concatenate((s0, s1), axis=0)
    return haps, samples
def diploidizeGenotypeArray(genos):
    """Pair up consecutive haploid samples into diploid genotypes.

    The first allele of samples 0 and 1 forms the first diploid call, that
    of samples 2 and 3 the second, and so on. With an odd number of samples
    the last one is dropped and a warning is written to stderr.

    Parameters:
        genos: genotype array of shape (numSnps, numSamples, numAlleles).

    Returns:
        ``allel.GenotypeArray`` of shape (numSnps, numSamples // 2, 2).
    """
    numSnps, numSamples, numAlleles = genos.shape
    if numSamples % 2 != 0:
        sys.stderr.write(
            "Diploidizing an odd-numbered sample. The last genome will be truncated.\n")
        numSamples -= 1
    # Slicing replaces the element-wise Python loops and keeps the result
    # three-dimensional even when there are no SNPs.
    firstAlleles = np.asarray(genos)[:, :numSamples, 0]
    newGenos = np.stack((firstAlleles[:, 0::2], firstAlleles[:, 1::2]), axis=2)
    return allel.GenotypeArray(newGenos)