def test_fetch(self):
    ''' can fetch variants within a genomic region '''
    bgen_path = self.folder / 'example.16bits.bgen'
    bfile = BgenFile(bgen_path)

    # the 16-bit example file is expected to have its index detected
    self.assertTrue(bfile._check_for_index(str(bgen_path)))

    # fetching chromosome '02' is expected to yield nothing; assertEqual
    # shows the unexpected variants on failure instead of "False is not true"
    self.assertEqual(list(bfile.fetch('02')), [])
def __init__(self,
             bgen_file_path,
             phenotype_file_path,
             index_column_name,
             covariate_file_path=None,
             sample_file_path=None):
    """
    Open the bgen file and load the phenotype/covariate tables.

    This software is meant to be called from the command line, so only
    brief documentation is included here. Note, the code here is a bit
    verbose, which was done in an attempt to minimize the number of
    function calls given the need to perform millions of calls. This
    could likely be optimized in a better way.

    Args:
        bgen_file_path: path (str) to the bgen file; must exist.
        phenotype_file_path: tab-separated phenotype table.
        index_column_name: column used as the index of the phenotype and
            covariate tables.
        covariate_file_path: optional tab-separated covariate table. When
            omitted, the phenotype table doubles as the covariate table.
        sample_file_path: optional sample file. When omitted, the path is
            derived from the bgen path by swapping a trailing 'bgen' for
            'sample'.

    Raises:
        AssertionError: the bgen file, or an explicitly given sample file,
            does not exist.
        FileNotFoundError: the derived sample file, the phenotype file or
            the covariate file does not exist.
    """
    self.index_column_name = index_column_name

    assert os.path.isfile(bgen_file_path), "bgen file does not exist"

    if not os.path.isfile(bgen_file_path + '.bgi'):
        print(
            "Warning: No bgen index (.bgi) file provided in same directory as bgen file. Initial reading of the bgen is MUCH faster with index file. "
        )

    if sample_file_path is not None:
        assert os.path.isfile(
            sample_file_path
        ), "sample file does not exist at provided location"
    else:
        # str.strip('bgen') removes any of the characters b/g/e/n from BOTH
        # ends, which mangles paths such as 'gene.bgen' or
        # 'genotypes/chr1.bgen'. Only a trailing 'bgen' should be replaced.
        suffix = 'bgen'
        if bgen_file_path.endswith(suffix):
            stem = bgen_file_path[:-len(suffix)]
        else:
            stem = bgen_file_path
        sample_file_path = stem + 'sample'
        if not os.path.isfile(sample_file_path):
            raise FileNotFoundError(
                "No sample file at {0:s}. A sample file must be provided.".
                format(sample_file_path))

    print(
        'Reading bgen file from {0:s} using sample file {1:s}. If these seem like an error, kill program.'
        .format(bgen_file_path, sample_file_path))

    self.bgen_dataset = BgenFile(bgen_file_path,
                                 sample_path=sample_file_path)

    if os.path.isfile(phenotype_file_path):
        self.phenotype_dataset = pd.read_csv(phenotype_file_path,
                                             sep='\t',
                                             index_col=index_column_name)
    else:
        raise FileNotFoundError("No phenotype file at provided location")

    if covariate_file_path is not None:
        if os.path.isfile(covariate_file_path):
            self.covariate_dataset = pd.read_csv(
                covariate_file_path,
                sep='\t',
                index_col=index_column_name)
        else:
            raise FileNotFoundError(
                "No covariate file at provided location")
    else:
        print(
            "No covariate file provided. Will use phenotype file for covariates.\n",
            flush=True)
        # same object, not a copy: covariates are read from the phenotype table
        self.covariate_dataset = self.phenotype_dataset
def test_fetch_whole_chrom(self):
    ''' fetching just with chrom gives all variants on chromosome '''
    chrom = '01'
    bfile = BgenFile(self.folder / 'example.16bits.bgen')

    def sortkey(variant):
        # order both sequences the same way so they can be compared pairwise
        return (variant.chrom, variant.pos)

    fetched = sorted(bfile.fetch(chrom), key=sortkey)
    expected = sorted(self.gen_data, key=sortkey)

    # NOTE(review): zip stops at the shorter sequence, so a fetch returning
    # too few variants would not be detected here - consider also comparing
    # lengths once it is confirmed that gen_data only holds this chromosome.
    for x, y in zip(fetched, expected):
        self.assertEqual(x.rsid, y.rsid)
        self.assertEqual(x.chrom, y.chrom)
        self.assertEqual(x.pos, y.pos)
def test_fetch_after_position(self):
    ''' fetching variants with chrom and start gives all variants after pos '''
    chrom, start = '01', 5000
    bfile = BgenFile(self.folder / 'example.16bits.bgen')

    def sortkey(variant):
        # order both sequences the same way so they can be compared pairwise
        return (variant.chrom, variant.pos)

    # reference variants at or beyond the start position
    gen_vars = [
        x for x in sorted(self.gen_data, key=sortkey) if start <= x.pos
    ]
    fetched = sorted(bfile.fetch(chrom, start), key=sortkey)

    # NOTE(review): zip stops at the shorter sequence, so a fetch returning
    # too few variants would not be detected here - confirm whether a length
    # check is appropriate for the fixture data.
    for x, y in zip(fetched, gen_vars):
        self.assertEqual(x.rsid, y.rsid)
        self.assertEqual(x.chrom, y.chrom)
        self.assertEqual(x.pos, y.pos)
def test_context_handler_closed_bgen_length(self):
    ''' len() works inside the context manager and raises once it has exited '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'

    with BgenFile(bgen_path) as reader:
        n_variants = len(reader)
        self.assertTrue(n_variants > 0)

    # the handle is used after the with-block has exited
    with self.assertRaises(ValueError):
        len(reader)
def test_context_handler_closed_bgen_slice(self):
    ''' indexing a variant from an exited BgenFile raises ValueError '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'

    with BgenFile(bgen_path) as reader:
        n_variants = len(reader)
        self.assertTrue(n_variants > 0)

    # the handle is used after the with-block has exited
    with self.assertRaises(ValueError):
        _ = reader[0]
def test_context_handler_closed_bgen_at_position(self):
    ''' at_position() on an exited BgenFile raises ValueError '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'

    with BgenFile(bgen_path) as reader:
        n_variants = len(reader)
        self.assertTrue(n_variants > 0)

    # the handle is used after the with-block has exited
    with self.assertRaises(ValueError):
        _ = reader.at_position(100)
def test_context_handler_closed_bgen_with_rsid(self):
    ''' with_rsid() on an exited BgenFile raises ValueError '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'

    with BgenFile(bgen_path) as reader:
        n_variants = len(reader)
        self.assertTrue(n_variants > 0)

    # the handle is used after the with-block has exited
    with self.assertRaises(ValueError):
        _ = reader.with_rsid('rs111')
def test_zstd_compressed(self):
    ''' genotype probabilities from the zstd compressed file match the reference data '''
    bgen_path = str(self.folder / 'example.16bits.zstd.bgen')
    reader = BgenFile(bgen_path)

    # walk the parsed variants alongside the reference variants
    for parsed, expected in zip(reader, self.gen_data):
        self.assertEqual(expected, parsed)
        probs_match = arrays_equal(expected.probabilities,
                                   parsed.probabilities, 16)
        self.assertTrue(probs_match)
def test_context_handler_closed_bgen_positions(self):
    ''' positions() works inside the context manager and raises once it has exited '''
    bgen_path = self.folder / 'example.16bits.zstd.bgen'

    with BgenFile(bgen_path) as reader:
        n_positions = len(reader.positions())
        self.assertTrue(n_positions > 0)

    # the handle is used after the with-block has exited
    with self.assertRaises(ValueError):
        reader.positions()
def test_v11(self):
    ''' a bgen in v1.1 format opens and its genotypes match the reference data '''
    bit_depth = 16
    reader = BgenFile(str(self.folder / 'example.v11.bgen'))

    # walk the parsed variants alongside the reference variants
    for parsed, expected in zip(reader, self.gen_data):
        self.assertEqual(expected, parsed)
        probs_match = arrays_equal(expected.probabilities,
                                   parsed.probabilities, bit_depth)
        self.assertTrue(probs_match)
def test_load_haplotypes_bgen(self):
    ''' a bgen holding haplotypes opens and its genotypes match the reference data '''
    bit_depth = 16
    reader = BgenFile(str(self.folder / 'haplotypes.bgen'))

    # walk the parsed variants alongside the reference haplotype variants
    for parsed, expected in zip(reader, self.haps_data):
        self.assertEqual(expected, parsed)
        probs_match = arrays_equal(expected.probabilities,
                                   parsed.probabilities, bit_depth)
        self.assertTrue(probs_match)
def test_load_example_genotypes_bit_depths(self):
    ''' genotypes parse correctly from every example file, whatever its bit depth '''
    for bgen_path in self.folder.glob('example.*bits.bgen'):
        # the second dot-separated field of the stem looks like '<depth>bits'
        depth_field = bgen_path.stem.split('.')[1]
        bit_depth = int(depth_field.strip('bits'))

        reader = BgenFile(str(bgen_path))
        for parsed, expected in zip(reader, self.gen_data):
            self.assertEqual(expected, parsed)
            probs_match = arrays_equal(expected.probabilities,
                                       parsed.probabilities, bit_depth)
            self.assertTrue(probs_match)
def test_load_complex_file(self):
    ''' the complex bgen file opens and matches the reference VCF data '''
    bit_depth = 16
    reader = BgenFile(self.folder / 'complex.bgen')

    for parsed, expected in zip(reader, self.vcf_data):
        self.assertEqual(expected, parsed)

        probs_match = arrays_equal(expected.probabilities,
                                   parsed.probabilities, bit_depth)
        self.assertTrue(probs_match)

        # ploidy is compared element by element
        ploidy_pairs = zip(expected.ploidy, parsed.ploidy)
        self.assertTrue(all(x == y for x, y in ploidy_pairs))
def test_fetch_in_region(self):
    ''' fetching with chrom, start and stop returns only variants inside the region '''
    chrom, start, stop = '01', 5000, 50000
    reader = BgenFile(self.folder / 'example.16bits.bgen')

    def by_position(variant):
        return (variant.chrom, variant.pos)

    # reference variants whose position lies within [start, stop]
    expected_vars = [
        ref for ref in sorted(self.gen_data, key=by_position)
        if start <= ref.pos <= stop
    ]
    fetched_vars = sorted(reader.fetch(chrom, start, stop), key=by_position)

    for found, ref in zip(fetched_vars, expected_vars):
        self.assertEqual(found.rsid, ref.rsid)
        self.assertEqual(found.chrom, ref.chrom)
        self.assertEqual(found.pos, ref.pos)

    # a region scaled far beyond the tested one should yield no variants
    far_start = start * 1000
    far_stop = stop * 1000
    self.assertEqual(list(reader.fetch(chrom, far_start, far_stop)), [])
def test_load_complex_files(self):
    ''' every complex bgen file opens and matches the reference VCF data '''
    for bgen_path in self.folder.glob('complex.*.bgen'):
        # the second dot-separated field of the stem looks like '<depth>bits'
        depth_field = bgen_path.stem.split('.')[1]
        bit_depth = int(depth_field.strip('bits'))

        reader = BgenFile(bgen_path)
        for parsed, expected in zip(reader, self.vcf_data):
            self.assertEqual(expected, parsed)
            probs_match = arrays_equal(expected.probabilities,
                                       parsed.probabilities, bit_depth)
            self.assertTrue(probs_match)
def test_index_opens(self):
    ''' the index is reported only for the file that has one available '''
    # the 15-bit example is expected to have no index
    unindexed_path = self.folder / 'example.15bits.bgen'
    bfile = BgenFile(unindexed_path)
    found = bfile._check_for_index(str(unindexed_path))
    self.assertFalse(found)

    # the 16-bit example is expected to have one
    indexed_path = self.folder / 'example.16bits.bgen'
    bfile = BgenFile(indexed_path)
    found = bfile._check_for_index(str(indexed_path))
    self.assertTrue(found)
def test_pickling(self):
    ''' a BgenVar survives a pickle round trip with its attributes intact '''
    compared_attrs = ('varid', 'rsid', 'chrom', 'pos', 'alleles')
    bgen_path = self.folder / 'example.16bits.zstd.bgen'

    with BgenFile(bgen_path) as reader:
        for original in reader:
            # round trip through pickle
            restored = pickle.loads(pickle.dumps(original))

            # each identifying attribute must be unchanged by the round trip
            for attr in compared_attrs:
                self.assertEqual(getattr(original, attr),
                                 getattr(restored, attr))
def test_minor_allele_dosage_v11(self):
    ''' minor_allele_dosage agrees with dosages recomputed from probabilities (v1.1 bgen) '''
    tolerance = 7e-5
    bgen_path = self.folder / 'example.v11.bgen'

    with BgenFile(bgen_path) as reader:
        for variant in reader:
            reported = variant.minor_allele_dosage
            geno_probs = variant.probabilities

            # per-allele dosage: twice one outer column plus the middle column
            first_allele = geno_probs[:, 0] * 2 + geno_probs[:, 1]
            second_allele = geno_probs[:, 2] * 2 + geno_probs[:, 1]

            # the allele with the smaller summed dosage is taken as the minor
            # one (ties resolve to the second allele)
            if np.nansum(first_allele) >= np.nansum(second_allele):
                expected = second_allele
            else:
                expected = first_allele

            # largest absolute disagreement, ignoring missing values
            worst_gap = np.nanmax(abs(reported - expected))
            self.assertTrue(worst_gap < tolerance)
class QRankGWAS:
    """Drive a QRank association scan over the variants of a bgen file.

    The constructor opens the bgen file and loads tab-separated phenotype
    (and optionally covariate) tables. ``ConstructDataArrays`` then selects
    subjects and aligns them to bgen sample positions, ``BuildQRank`` fits
    the null models, and ``PerformGWASAdditive`` writes per-variant p-values
    to a text file.
    """

    def __init__(self,
                 bgen_file_path,
                 phenotype_file_path,
                 index_column_name,
                 covariate_file_path=None,
                 sample_file_path=None):
        """
        Open the bgen file and load the phenotype/covariate tables.

        This software is meant to be called from the command line, so only
        brief documentation is included here. Note, the code here is a bit
        verbose, which was done in an attempt to minimize the number of
        function calls given the need to perform millions of calls. This
        could likely be optimized in a better way.

        Args:
            bgen_file_path: path (str) to the bgen file; must exist.
            phenotype_file_path: tab-separated phenotype table.
            index_column_name: column used as the index of the phenotype
                and covariate tables.
            covariate_file_path: optional tab-separated covariate table.
                When omitted, the phenotype table doubles as the covariate
                table.
            sample_file_path: optional sample file. When omitted, the path
                is derived from the bgen path by swapping a trailing 'bgen'
                for 'sample'.

        Raises:
            AssertionError: the bgen file, or an explicitly given sample
                file, does not exist.
            FileNotFoundError: the derived sample file, the phenotype file
                or the covariate file does not exist.
        """
        self.index_column_name = index_column_name

        assert os.path.isfile(bgen_file_path), "bgen file does not exist"

        if not os.path.isfile(bgen_file_path + '.bgi'):
            print(
                "Warning: No bgen index (.bgi) file provided in same directory as bgen file. Initial reading of the bgen is MUCH faster with index file. "
            )

        if sample_file_path is not None:
            assert os.path.isfile(
                sample_file_path
            ), "sample file does not exist at provided location"
        else:
            # str.strip('bgen') removes any of the characters b/g/e/n from
            # BOTH ends, which mangles paths such as 'gene.bgen' or
            # 'genotypes/chr1.bgen'. Only a trailing 'bgen' is replaced.
            suffix = 'bgen'
            if bgen_file_path.endswith(suffix):
                stem = bgen_file_path[:-len(suffix)]
            else:
                stem = bgen_file_path
            sample_file_path = stem + 'sample'
            if not os.path.isfile(sample_file_path):
                raise FileNotFoundError(
                    "No sample file at {0:s}. A sample file must be provided."
                    .format(sample_file_path))

        print(
            'Reading bgen file from {0:s} using sample file {1:s}. If these seem like an error, kill program.'
            .format(bgen_file_path, sample_file_path))

        self.bgen_dataset = BgenFile(bgen_file_path,
                                     sample_path=sample_file_path)

        if os.path.isfile(phenotype_file_path):
            self.phenotype_dataset = pd.read_csv(phenotype_file_path,
                                                 sep='\t',
                                                 index_col=index_column_name)
        else:
            raise FileNotFoundError("No phenotype file at provided location")

        if covariate_file_path is not None:
            if os.path.isfile(covariate_file_path):
                self.covariate_dataset = pd.read_csv(
                    covariate_file_path,
                    sep='\t',
                    index_col=index_column_name)
            else:
                raise FileNotFoundError(
                    "No covariate file at provided location")
        else:
            print(
                "No covariate file provided. Will use phenotype file for covariates.\n",
                flush=True)
            # same object, not a copy: covariates come from the phenotype table
            self.covariate_dataset = self.phenotype_dataset

    def ConstructDataArrays(self,
                            phenotype_name,
                            covariate_cols=None,
                            included_subjects=None):
        """Select subjects and build the phenotype/covariate arrays.

        Sets ``self.included_subjects``, ``self.Y`` (single-column frame for
        ``phenotype_name``), ``self.Z`` (covariate frame or ``None``) and
        ``self.included_subjects_bgen_idx`` (position of every included
        subject within the bgen sample list).

        Args:
            phenotype_name: column of the phenotype table to analyse.
            covariate_cols: covariate column selection, or ``None`` for no
                covariates.
            included_subjects: optional subject ids; intersected with the
                phenotype index. ``None`` keeps every phenotyped subject.

        Raises:
            KeyError: an included subject is absent from the bgen samples.
        """
        if included_subjects is None:
            self.included_subjects = self.phenotype_dataset.index.to_numpy()
        else:
            self.included_subjects = np.intersect1d(
                included_subjects, self.phenotype_dataset.index.to_numpy())

        self.Y = self.phenotype_dataset.loc[self.included_subjects][[
            phenotype_name
        ]]
        if covariate_cols is not None:
            self.Z = self.covariate_dataset.loc[
                self.included_subjects][covariate_cols]
        else:
            self.Z = None

        # cast sample ids to the dtype of the subject ids so dict lookups match
        sample_vals_np = np.array(self.bgen_dataset.samples,
                                  dtype=self.included_subjects.dtype)
        # one argsort gives both the sorted ids and their original positions
        sort_order = np.argsort(sample_vals_np)
        conv_dict = dict(zip(sample_vals_np[sort_order], sort_order))
        self.included_subjects_bgen_idx = np.array(
            [conv_dict[x] for x in self.included_subjects])

    def BuildQRank(self,
                   quantiles,
                   param_tol=1e-8,
                   max_fitting_iter=5000,
                   output_file_prefix=None,
                   randomize=False):
        """Create the QRank model and fit its null models.

        Requires ``ConstructDataArrays`` to have set ``self.Y``/``self.Z``.

        Args:
            quantiles: quantiles handed to ``QRank``.
            param_tol: forwarded to ``FitNullModels`` as ``tol``.
            max_fitting_iter: forwarded to ``FitNullModels`` as ``maxiter``.
            output_file_prefix: when given, per-quantile model summaries
                (``.NullModelResults.<q>.txt``), saved models
                (``.NullModel.<q>.pth``) and a residual table
                (``.NullModelResiduals.txt``) are written with this prefix.
            randomize: forwarded to ``FitNullModels``.
        """
        self.qrank = QRank(self.Y,
                           covariate_matrix=self.Z,
                           quantiles=quantiles)
        self.qrank.FitNullModels(tol=param_tol,
                                 maxiter=max_fitting_iter,
                                 randomize=randomize)

        if output_file_prefix is not None:
            residual_table = pd.DataFrame(index=self.included_subjects)
            for q in quantiles:
                null_result = self.qrank.null_model_results[q]
                residual_table['q.{0:g}.residuals'.format(
                    q)] = null_result.resid
                with open(
                        output_file_prefix +
                        '.NullModelResults.{0:g}.txt'.format(q),
                        'w') as model_file:
                    model_file.write(null_result.summary().as_text())
                null_result.save(output_file_prefix +
                                 '.NullModel.{0:g}.pth'.format(q))
            residual_table.to_csv(output_file_prefix +
                                  '.NullModelResiduals.txt',
                                  sep='\t')

    def PerformGWASAdditive(self,
                            output_file_prefix,
                            maf_cutoff,
                            print_freq=1000,
                            variant_list=None):
        """Test variants and write results to a tab-separated file.

        Results go to ``<output_file_prefix>.Additive.QRankGWAS.txt``. Only
        variants with exactly two alleles and a minor allele frequency (over
        the included subjects) of at least ``maf_cutoff`` are tested.

        Args:
            output_file_prefix: prefix of the output file.
            maf_cutoff: minimum minor allele frequency for a variant.
            print_freq: progress is printed every ``print_freq`` variants.
            variant_list: optional rsids to restrict the scan to. ``None``
                scans every variant. With more than 1000 rsids, all other
                variants are dropped from ``self.bgen_dataset`` (this
                modifies the dataset); otherwise each rsid is looked up
                individually.
        """
        if variant_list is None:
            total_num_variants = len(self.bgen_dataset)
            variant_iterator = self.bgen_dataset
        elif len(variant_list) > 1000:
            print(
                "Adjusting bgen index to drop excluded variants from the analysis. This may take several minutes up front."
            )
            all_rsids = self.bgen_dataset.rsids()
            rsid_table = pd.DataFrame({
                'rsid': all_rsids,
                'bgen_index': np.arange(len(all_rsids))
            })
            rsid_table.set_index('rsid', inplace=True, drop=False)
            # remove the requested rsids so only the variants to exclude remain
            rsid_table = rsid_table.drop(
                np.intersect1d(variant_list, rsid_table.index.to_numpy()))
            self.bgen_dataset.drop_variants(
                rsid_table['bgen_index'].to_list())
            total_num_variants = len(self.bgen_dataset)

            def variant_iterator_func(num_var):
                for x in range(num_var):
                    yield self.bgen_dataset[x]

            variant_iterator = variant_iterator_func(total_num_variants)
        else:
            # use a custom generator, load in real time
            def variant_iterator_func(v_list):
                for x in v_list:
                    yield self.bgen_dataset.with_rsid(x)

            variant_iterator = variant_iterator_func(variant_list)
            # previously left unassigned on this branch, which made the
            # progress report below fail with UnboundLocalError
            total_num_variants = len(variant_list)

        with open(output_file_prefix + '.Additive.QRankGWAS.txt',
                  'w',
                  buffering=io.DEFAULT_BUFFER_SIZE * 10) as output_file:
            output_file.write('snpid\trsid\tchrom\tpos\tmaj\tmin\tmaf\t')
            output_file.write(
                '\t'.join(['p.{0:g}'.format(x)
                           for x in self.qrank.quantiles]) + '\tp.comp\n')

            variant_counter = 0
            avg_elapsed_time = 0.0
            block_counter = 0
            start = time.time()
            for variant in variant_iterator:
                if len(variant.alleles) == 2:
                    dosage = variant.minor_allele_dosage[
                        self.included_subjects_bgen_idx]
                    # frequency among included subjects; the divisor is two
                    # allele copies per subject
                    maf = dosage.sum() / (dosage.shape[0] * 2.0)

                    if (maf >= maf_cutoff):
                        # report alleles as (maj, min); keep the file order
                        # only when the minor allele is listed second
                        if (variant.alleles.index(variant.minor_allele)
                                == 1) and (maf <= 0.5):
                            alleles = variant.alleles
                        else:
                            alleles = variant.alleles[::-1]

                        output_file.write('{0:s}'.format(variant.varid))
                        output_file.write('\t{0:s}'.format(variant.rsid))
                        output_file.write('\t{0:s}'.format(variant.chrom))
                        output_file.write('\t{0:d}'.format(variant.pos))
                        output_file.write('\t{0:s}'.format(alleles[0]))
                        output_file.write('\t{0:s}'.format(alleles[1]))
                        output_file.write('\t{0:.8g}'.format(maf))

                        # pvals[0]: one p-value per quantile; pvals[1]: the
                        # value written under the 'p.comp' column
                        pvals = self.qrank.ComputePValues(dosage)
                        for p in pvals[0]:
                            output_file.write('\t{0:.8g}'.format(p))
                        output_file.write('\t{0:.8g}'.format(pvals[1]))
                        output_file.write('\n')

                # every variant counts towards progress, tested or not
                variant_counter += 1
                if (variant_counter) % print_freq == 0:
                    end = time.time()
                    block_counter += 1
                    elapsed = end - start
                    print(
                        'Processed {0:d} of {1:d} variants ({2:.1f}% of total)'
                        .format(
                            variant_counter, total_num_variants,
                            round((variant_counter / total_num_variants) *
                                  1000.0) / 10.0),
                        flush=True)
                    print('Elapsed time {0:.2f} sec'.format(elapsed))
                    # running mean of the time taken per block of variants
                    avg_elapsed_time = (
                        (avg_elapsed_time *
                         (block_counter - 1) + elapsed) / block_counter)
                    print('Estimated Total Time Required: {0:.2f} hours\n'.
                          format(((total_num_variants / print_freq) *
                                  avg_elapsed_time) / 3600))
                    start = time.time()
def test_load_missing_file(self):
    ''' opening a path that does not exist raises ValueError '''
    missing_path = '/zzz/jjj/qqq.bgen'
    with self.assertRaises(ValueError):
        BgenFile(missing_path)
def test_Path(self):
    ''' a bgen file can be opened from a Path object rather than a str '''
    bgen_path = self.folder / 'example.v11.bgen'
    # constructing without an exception is the whole check
    handle = BgenFile(bgen_path)