def het_check(self, vcf_path, plot=False, ncpus=1, min_depth=8,
              sites=op.join(op.dirname(__file__), 'GRCH37.sites'), **kwargs):
    """
    Compute per-sample heterozygosity and depth metrics from a VCF, flag
    depth outliers, and optionally save a depth-vs-het-ratio scatter plot.

    kwargs is not used, but added here to allow same args as ped_check

    Parameters
    ----------
    vcf_path : path handed to ``cyvcf2.VCF`` and ``cyvcf2.par_het``.
    plot : falsy to skip plotting; otherwise the output path given to
        ``plt.savefig``. A companion PCA plot path is derived from it
        ('het_'/'het-' replaced by 'pca_'/'pca-', or '.pca' inserted before
        the extension when neither marker is present).
    ncpus : number of processes forwarded to ``cyvcf2.par_het``; capped at 16.
    min_depth : forwarded to ``cyvcf2.par_het``.
    sites : sites file forwarded to ``cyvcf2.par_het`` and to ``pca``.

    Returns
    -------
    (df, background_pca_df) : ``df`` is a DataFrame indexed by ``sample_id``
        holding the per-sample values returned by ``cyvcf2.par_het`` plus
        ``depth_outlier``, ``idr_baf`` (the value originally keyed 'range'),
        ``call_rate`` and, when ``pca`` returns a frame, its columns.
        ``background_pca_df`` is passed through from ``pca`` unchanged.

    Raises
    ------
    Exception : when none of the pedigree samples are present in the VCF.
    """
    import cyvcf2
    import numpy as np

    ncpus = min(ncpus, 16)
    # `sites` is rebound below to what par_het returns, so remember the path.
    sitesfile = sites

    samps = [x.sample_id for x in self.samples()]
    vcf = cyvcf2.VCF(vcf_path, gts012=True, samples=samps)
    if sorted(vcf.samples) != sorted(samps):
        vcf_only = set(vcf.samples) - set(samps)
        ped_only = set(samps) - set(vcf.samples)
        log.warning(
            "sample overlap issues\n\tin vcf, not in ped: %s\n\tin ped, not in vcf: %s"
            % (",".join(vcf_only), ",".join(ped_only)))
        if vcf_only == set(vcf.samples):
            raise Exception("error: no samples from VCF found in ped")

    # From here on use the samples (and order) reported by the VCF reader.
    samps = vcf.samples
    sample_ranges, sites, gt_types = cyvcf2.par_het(
        vcf_path, samps, ncpus, sites, min_depth=min_depth)
    # 3 is presumably cyvcf2's "unknown genotype" code under gts012=True, so
    # this is the per-sample fraction of called sites -- confirm in cyvcf2 docs.
    call_rate = (gt_types != 3).mean(axis=1)

    from .pca import pca

    if plot:
        pca_plot = plot.replace('het_', 'pca_').replace('het-', 'pca-')
        if pca_plot == plot:
            # No 'het' marker to swap: insert '.pca' before the extension.
            # splitext copes with extension-less paths and with dots that
            # appear only in a directory component.
            base, ext = op.splitext(plot)
            pca_plot = "%s.pca%s" % (base, ext)
    else:
        pca_plot = False

    pca_df, background_pca_df = pca(pca_plot, sitesfile, gt_types, sites)

    # now find outliers.
    depth = np.array([v['median_depth'] for v in sample_ranges.values()])
    ratios = np.array([d['het_ratio'] for d in sample_ranges.values()])

    # Lower bound: drop anything more than 2 SD below the mean, then
    # recompute as mean - 5 SD on the remaining samples.
    first_pass_bot = depth.mean() - 2 * depth.std()
    kept = depth[depth > first_pass_bot]
    bot = kept.mean() - 5 * kept.std()
    # care less if we have really high samples so use 8 SD above the mean.
    top = depth.mean() + 8 * depth.std()
    depth_outlier = ((depth < bot) | (depth > top))

    for (sample_id, metrics), is_outlier in zip(sample_ranges.items(),
                                                depth_outlier):
        metrics['sample_id'] = sample_id
        metrics['depth_outlier'] = is_outlier
        metrics['idr_baf'] = metrics.pop('range')

    import pandas as pd
    # list() is needed on Python 3 (dict view) and is harmless on Python 2.
    df = pd.DataFrame(list(sample_ranges.values()))
    cols = ['sample_id'] + sorted(c for c in df.columns if c != 'sample_id')
    df = df[cols]
    df.index = df['sample_id']

    # call_rate is ordered like `samps`; look each row's position up by name.
    sample_index = {s: i for i, s in enumerate(samps)}
    df['call_rate'] = [call_rate[sample_index[s]] for s in df.index]

    if pca_df is not None:
        # merge the 2 dataframes.
        pca_df.index = samps
        df = pd.concat((df, pca_df), axis=1)

    if not plot:
        return df, background_pca_df

    from matplotlib import pyplot as plt
    import seaborn as sns
    colors = sns.color_palette('Set1', 4)

    # Depth outliers get colors[0]; every other sample gets colors[1].
    cs = [colors[1 - int(v['depth_outlier'])] for v in sample_ranges.values()]
    ecs = ['none'] * len(sample_ranges)
    # Pass a copy so get_s (defined elsewhere) cannot alter the plotted array.
    sizes = get_s(depth.copy())

    plt.scatter(depth, ratios, c=cs, edgecolors=ecs, s=sizes)

    # Label only the depth outliers with their sample id.
    for k, v in sample_ranges.items():
        if v['depth_outlier']:
            plt.text(v['median_depth'], v['het_ratio'], k,
                     color=colors[1], fontsize=7)

    plt.xlabel('median depth')
    plt.ylabel('proportion het calls')
    plt.savefig(plot)
    return df, background_pca_df
def het_check(self, vcf_path, plot=False, ncpus=1, min_depth=8,
              sites=op.join(op.dirname(__file__), '1kg.sites'), **kwargs):
    """
    Compute per-sample heterozygosity and depth metrics from a VCF, flag
    het-ratio and depth outliers, and optionally save a scatter plot.

    kwargs is not used, but added here to allow same args as ped_check

    Parameters
    ----------
    vcf_path : path handed to ``cyvcf2.VCF`` and ``cyvcf2.par_het``.
    plot : falsy to skip plotting; otherwise the output path given to
        ``plt.savefig``. A companion PCA plot path is derived from it
        ('het_'/'het-' replaced by 'pca_'/'pca-', or '.pca' inserted before
        the extension when neither marker is present).
    ncpus : number of processes forwarded to ``cyvcf2.par_het``; capped at 16.
    min_depth : forwarded to ``cyvcf2.par_het``.
    sites : sites file forwarded to ``cyvcf2.par_het``.

    Returns
    -------
    (df, background_pca_df) : ``df`` is a DataFrame indexed by ``sample_id``
        holding the per-sample values returned by ``cyvcf2.par_het`` plus
        ``ratio_outlier``, ``depth_outlier``, ``idr_baf`` (the value
        originally keyed 'range'), ``call_rate`` and, when ``pca`` returns a
        frame, its columns. ``background_pca_df`` is passed through from
        ``pca`` unchanged.

    Raises
    ------
    Exception : when none of the pedigree samples are present in the VCF.
    """
    import cyvcf2
    import numpy as np

    ncpus = min(ncpus, 16)

    samps = [x.sample_id for x in self.samples()]
    vcf = cyvcf2.VCF(vcf_path, gts012=True, samples=samps)
    if sorted(vcf.samples) != sorted(samps):
        vcf_only = set(vcf.samples) - set(samps)
        ped_only = set(samps) - set(vcf.samples)
        print("warning: sample overlap issues\n\tin vcf, not in ped: %s\n\tin ped, not in vcf: %s" % (
            ",".join(vcf_only), ",".join(ped_only)), file=sys.stderr)
        if vcf_only == set(vcf.samples):
            raise Exception("error: no samples from VCF found in ped")

    # From here on use the samples (and order) reported by the VCF reader.
    samps = vcf.samples
    sample_ranges, sites, gt_types = cyvcf2.par_het(
        vcf_path, samps, ncpus, sites, min_depth=min_depth)
    # 3 is presumably cyvcf2's "unknown genotype" code under gts012=True, so
    # this is the per-sample fraction of called sites -- confirm in cyvcf2 docs.
    call_rate = (gt_types != 3).mean(axis=1)

    from .pca import pca

    if plot:
        pca_plot = plot.replace('het_', 'pca_').replace('het-', 'pca-')
        if pca_plot == plot:
            # No 'het' marker to swap: insert '.pca' before the extension.
            # splitext copes with extension-less paths and with dots that
            # appear only in a directory component.
            base, ext = op.splitext(plot)
            pca_plot = "%s.pca%s" % (base, ext)
    else:
        pca_plot = False

    pca_df, background_pca_df = pca(pca_plot, gt_types, sites)

    # now find outliers.
    depth = np.array([v['median_depth'] for v in sample_ranges.values()])
    ratios = np.array([d['het_ratio'] for d in sample_ranges.values()])

    # Het ratios outside this fixed window are flagged.
    ratios_outlier = ((ratios < 0.305) | (ratios > 0.41))

    # Lower bound: drop anything more than 2 SD below the mean, then
    # recompute as mean - 2 SD on the remaining samples.
    first_pass_bot = depth.mean() - 2 * depth.std()
    kept = depth[depth > first_pass_bot]
    bot = kept.mean() - 2 * kept.std()
    # care less if we have really high samples so make it 5.
    top = depth.mean() + 5 * depth.std()
    depth_outlier = ((depth < bot) | (depth > top))

    for (sample_id, metrics), is_depth_o, is_ratio_o in zip(
            sample_ranges.items(), depth_outlier, ratios_outlier):
        metrics['sample_id'] = sample_id
        metrics['ratio_outlier'] = is_ratio_o
        metrics['depth_outlier'] = is_depth_o
        metrics['idr_baf'] = metrics.pop('range')

    import pandas as pd
    # list() is needed on Python 3 (dict view) and is harmless on Python 2.
    df = pd.DataFrame(list(sample_ranges.values()))
    cols = ['sample_id'] + sorted(c for c in df.columns if c != 'sample_id')
    df = df[cols]
    df.index = df['sample_id']

    # call_rate is ordered like `samps`; look each row's position up by name.
    sample_index = {s: i for i, s in enumerate(samps)}
    df['call_rate'] = [call_rate[sample_index[s]] for s in df.index]

    if pca_df is not None:
        # merge the 2 dataframes.
        pca_df.index = samps
        df = pd.concat((df, pca_df), axis=1)

    if not plot:
        return df, background_pca_df

    from matplotlib import pyplot as plt
    import seaborn as sns
    colors = sns.color_palette('Set1', 4)

    # Depth outliers get colors[0]; every other sample gets colors[1].
    cs = [colors[1 - int(v['depth_outlier'])] for v in sample_ranges.values()]
    # Het-ratio outliers are drawn with a black marker edge.
    ecs = ['k' if v['ratio_outlier'] else 'none'
           for v in sample_ranges.values()]
    # Pass a copy so get_s (defined elsewhere) cannot alter the plotted array.
    sizes = get_s(depth.copy())

    plt.scatter(depth, ratios, c=cs, edgecolors=ecs, s=sizes)

    # Label every sample flagged by either criterion with its sample id.
    for k, v in sample_ranges.items():
        if v['ratio_outlier'] or v['depth_outlier']:
            plt.text(v['median_depth'], v['het_ratio'], k,
                     color=colors[1], fontsize=7)

    plt.xlabel('median depth')
    plt.ylabel('proportion het calls')
    plt.savefig(plot)
    return df, background_pca_df