def test_init():
    """Check the attributes Genotyper derives from its constructor arguments."""
    # Each row: (constructor args, expected no_of_successes,
    # expected prob_of_success, expected min_cov_more_than_error).
    cases = [
        ((0, 20, 0.0001), 0, 0, 0),
        ((10, 20, 0.0001), 10, 0.5, 1),
        ((10, 20, 0.01), 10, 0.5, 2),
        ((100, 200, 0.001), 100, 0.5, 8),
        # variance < mean, so will hit the code where it forces
        # variance = 2 * mean = 20
        ((10, 5, 0.01), 10, 0.5, 2),
    ]
    for ctor_args, want_successes, want_prob, want_min_cov in cases:
        gtyper = genotyper.Genotyper(*ctor_args)
        assert gtyper.no_of_successes == want_successes
        assert gtyper.prob_of_success == want_prob
        assert gtyper.min_cov_more_than_error == want_min_cov
def _simulate_confidence_scores(cls, mean_depth, depth_variance, error_rate, iterations, allele_length=1, seed=42):
    '''Simulate genotype calls and return their rounded confidences, sorted.

    For each iteration, coverage on the correct allele is drawn from a
    negative binomial distribution parameterised from mean_depth and
    depth_variance, and coverage on the incorrect allele is drawn from a
    binomial distribution with n=mean_depth and p=error_rate. A Genotyper
    is run on that simulated site and its genotype_confidence is rounded
    and recorded. Draws where both coverages are zero are discarded and
    redrawn, so they do not count towards iterations.

    The negative binomial parameterisation needs depth_variance to be
    strictly greater than mean_depth. If it is not, depth_variance is
    replaced by 2 * mean_depth and a warning is logged.
    NOTE(review): mean_depth == 0 still leads to a division by zero below,
    because 2 * mean_depth is then also 0 - confirm callers never pass it.

    Returns a list of length iterations, sorted in ascending order.
    '''
    np.random.seed(seed)
    allele_groups_dict = {'1': {0}, '2': {1}}
    confidences = []

    # We can't use the negative binomial unless depth_variance > mean_depth.
    # So force it to be so. The comparison must include equality, otherwise
    # depth_variance == mean_depth divides by zero just below.
    if depth_variance <= mean_depth:
        depth_variance = 2 * mean_depth
        logging.warning('Variance in read depth is not greater than mean read depth. Setting variance = 2 * mean, so that variant simulations can run. GT_CONF_PERCENTILE in the output VCF file may not be very useful as a result of this.')

    no_of_successes = (mean_depth ** 2) / (depth_variance - mean_depth)
    prob_of_success = 1 - (depth_variance - mean_depth) / depth_variance

    # Only draws with some coverage are appended, so the list length doubles
    # as the count of completed iterations.
    while len(confidences) < iterations:
        correct_coverage = np.random.negative_binomial(no_of_successes, prob_of_success)
        incorrect_coverage = np.random.binomial(mean_depth, error_rate)
        if correct_coverage + incorrect_coverage == 0:
            continue

        # Alleles with zero coverage are left out of the combination dict.
        allele_combination_cov = {}
        if incorrect_coverage > 0:
            allele_combination_cov['1'] = incorrect_coverage
        if correct_coverage > 0:
            allele_combination_cov['2'] = correct_coverage

        allele_per_base_cov = [
            [incorrect_coverage] * allele_length,
            [correct_coverage] * allele_length,
        ]
        gtyper = genotyper.Genotyper(mean_depth, error_rate, allele_combination_cov, allele_per_base_cov, allele_groups_dict)
        gtyper.run()
        confidences.append(round(gtyper.genotype_confidence))

    assert len(confidences) == iterations
    confidences.sort()
    return confidences
def test_log_likelihood_homozygous():
    """Check _log_likelihood_homozygous against two known values."""
    allele_length = 5
    non_zeros = allele_length
    # Each row: (Genotyper constructor args, allele_depth, total_depth,
    # expected log likelihood rounded to 2 decimal places).
    cases = [
        ((100, 200, 0.01), 90, 95, -26.78),
        ((10, 200, 0.01), 1, 9, -39.34),
    ]
    for ctor_args, allele_depth, total_depth, want in cases:
        gtyper = genotyper.Genotyper(*ctor_args)
        got = gtyper._log_likelihood_homozygous(
            allele_depth, total_depth, allele_length, non_zeros
        )
        assert round(got, 2) == want
def test_run_zero_coverage():
    """A site where every allele has zero coverage must get a null call."""
    no_coverage = {}
    per_base_cov = [[0], [0, 0]]
    groups = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}}

    gtyper = genotyper.Genotyper(20, 40, 0.01)
    gtyper.run(no_coverage, per_base_cov, groups)

    # Null genotype, zero confidence and a "." FRS are all expected.
    assert gtyper.genotype == {"."}
    assert gtyper.genotype_confidence == 0.0
    assert gtyper.genotype_frs == "."
def test_run_zero_coverage(self):
    '''A site where every allele has zero coverage must get a null call.'''
    groups = {'1': {0}, '2': {1}, '3': {0, 1}, '4': {2}}
    per_base_cov = [[0], [0, 0]]
    no_coverage = {}

    # Positional arguments: mean depth 20, error rate 0.01, then coverage data.
    gtyper = genotyper.Genotyper(20, 0.01, no_coverage, per_base_cov, groups)
    gtyper.run()

    self.assertEqual({'.'}, gtyper.genotype)
    self.assertEqual(0.0, gtyper.genotype_confidence)
def test_nomatherror_mean_depth0():
    """
    A mean depth of zero can coincide with a site that has non-zero
    coverage, because of rounding imprecision. The likelihood calculation
    must then avoid taking log(0), and the site should be a no call.
    """
    covered_allele = {"1": 1}
    per_base_cov = [[1], [0, 0]]
    groups = {"1": {0}, "2": {1}}

    gtyper = genotyper.Genotyper(0, 0, 0.01)
    gtyper.run(covered_allele, per_base_cov, groups)

    assert gtyper.genotype == {"."}
    assert gtyper.genotype_confidence == 0.0
    assert gtyper.genotype_frs == "."
def _simulate_confidence_scores(
    cls,
    mean_depth,
    depth_variance,
    error_rate,
    iterations,
    allele_length=1,
    seed=42,
    call_hets=False,
):
    """Simulate genotype calls and return their rounded confidences, sorted.

    One Genotyper is built from mean_depth, depth_variance, error_rate and
    call_hets, and reused for every simulated site. Per site, coverage on
    the correct allele is drawn from a negative binomial using the
    genotyper's no_of_successes and prob_of_success, and coverage on the
    incorrect allele from a binomial with n=mean_depth and p=error_rate.
    Draws where both coverages are zero are redrawn and do not count
    towards iterations.

    Returns a list of length iterations, sorted in ascending order.
    """
    np.random.seed(seed)
    gtyper = genotyper.Genotyper(
        mean_depth, depth_variance, error_rate, call_hets=call_hets
    )
    allele_groups_dict = {"1": {0}, "2": {1}}
    confidences = []
    logging.debug(
        "Simulation:\titeration\tcorrect_coverage\tincorrect_coverage\tgenotype_confidence"
    )

    i = 0
    while i < iterations:
        correct_coverage = np.random.negative_binomial(
            gtyper.no_of_successes, gtyper.prob_of_success
        )
        incorrect_coverage = np.random.binomial(mean_depth, error_rate)
        if correct_coverage + incorrect_coverage == 0:
            # Nothing to genotype; draw again without consuming an iteration.
            continue

        # Allele "1" is the incorrect one, "2" the correct one. Alleles
        # with zero coverage are left out of the dict.
        simulated = (("1", incorrect_coverage), ("2", correct_coverage))
        allele_combination_cov = {
            allele: cov for allele, cov in simulated if cov > 0
        }
        allele_per_base_cov = [
            [cov] * allele_length for cov in (incorrect_coverage, correct_coverage)
        ]

        gtyper.run(allele_combination_cov, allele_per_base_cov, allele_groups_dict)
        logging.debug(
            f"Simulation:\t{i}\t{correct_coverage}\t{incorrect_coverage}\t{gtyper.genotype_confidence}"
        )
        confidences.append(round(gtyper.genotype_confidence))
        i += 1

    assert len(confidences) == iterations
    confidences.sort()
    return confidences
def test_run():
    """Check the ranked likelihoods produced by run() on a covered site."""
    gtyper = genotyper.Genotyper(20, 40, 0.01)
    gtyper.run(
        {"1": 2, "2": 20, "3": 1},
        [[0, 1], [20, 19]],
        {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}},
    )

    # Each expected entry: (allele set, log likelihood, third field).
    # The third field is expected to be 21/23 for allele 1 and 3/23 for
    # allele 0, each rounded to 4 decimal places.
    expected = [
        ({1}, -12.03, round(21 / 23, 4)),
        ({0}, -114.57, round(3 / 23, 4)),
    ]
    assert len(gtyper.likelihoods) == len(expected)
    for got, want in zip(gtyper.likelihoods, expected):
        assert got[0] == want[0]
        assert round(got[1], 2) == round(want[1], 2)
        assert got[2] == want[2]
def test_run_zero_coverage(self):
    """With no coverage on any allele, run() must produce a null call."""
    gtyper = genotyper.Genotyper(
        20,  # mean depth
        0.01,  # error rate
        {},  # no allele combination has any coverage
        [[0], [0, 0]],
        {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}},
    )
    gtyper.run()

    self.assertEqual({"."}, gtyper.genotype)
    self.assertEqual(0.0, gtyper.genotype_confidence)
def test_calculate_log_likelihoods(self):
    '''Check the ranked output of _calculate_log_likelihoods.'''
    coverage = {'1': 2, '2': 20, '3': 1}
    per_base_cov = [[0, 1], [20, 19]]
    groups = {'1': {0}, '2': {1}, '3': {0, 1}, '4': {2}}

    gtyper = genotyper.Genotyper(20, 0.01, coverage, per_base_cov, groups)
    gtyper._calculate_log_likelihoods()
    self.assertEqual(3, len(gtyper.likelihoods))

    # Round the log likelihoods so they can be compared exactly.
    gtyper.likelihoods = [
        (alleles, round(log_lik, 2)) for alleles, log_lik, *_ in gtyper.likelihoods
    ]
    expected = [({1}, -11.68), ({0, 1}, -22.92), ({0}, -124.91)]
    self.assertEqual(expected, gtyper.likelihoods)
def test_nomatherror_mean_depth0(self):
    """
    A mean depth of zero can coincide with a site that has non-zero
    coverage, because of rounding imprecision. The likelihood calculation
    must then avoid taking log(0), and the site should be a no call.
    """
    gtyper = genotyper.Genotyper(
        0,  # mean depth
        0.01,  # error rate
        {"1": 1},
        [[1], [0, 0]],
        {"1": {0}, "2": {1}},
    )
    gtyper.run()

    self.assertEqual({"."}, gtyper.genotype)
    self.assertEqual(0.0, gtyper.genotype_confidence)
def test_run(self):
    '''Check the ranked likelihoods produced by run().'''
    coverage = {'1': 2, '2': 20, '3': 1}
    per_base_cov = [[0, 1], [20, 19]]
    groups = {'1': {0}, '2': {1}, '3': {0, 1}, '4': {2}}

    gtyper = genotyper.Genotyper(20, 0.01, coverage, per_base_cov, groups)
    gtyper.run()

    expected = [({1}, -11.68), ({0, 1}, -22.92), ({0}, -124.91)]
    self.assertEqual(len(expected), len(gtyper.likelihoods))
    for want, got in zip(expected, gtyper.likelihoods):
        self.assertEqual(want[0], got[0])
        self.assertAlmostEqual(want[1], got[1], places=2)
def test_calculate_log_likelihoods():
    """Check the ranked output of _calculate_log_likelihoods.

    The genotyper is initialised with the site's coverage through
    _init_alleles_and_genotypes, then the likelihoods are computed and
    compared with known values (log likelihoods rounded to 2 places).
    """
    gtyper = genotyper.Genotyper(20, 40, 0.01)
    gtyper._init_alleles_and_genotypes(
        allele_combination_cov={"1": 2, "2": 20, "3": 1},
        allele_per_base_cov=[[0, 1], [20, 19]],
        allele_groups_dict={"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}},
    )
    gtyper._calculate_log_likelihoods()

    # Third field of each entry: expected to be 21/23 for allele 1 and
    # 3/23 for allele 0, each rounded to 4 decimal places.
    expected = [
        ({1}, -12.03, round(21 / 23, 4)),
        ({0}, -114.57, round(3 / 23, 4)),
    ]
    assert len(gtyper.likelihoods) == 2

    # Compare a rounded copy rather than overwriting the genotyper's own
    # attribute just for the sake of the comparison.
    rounded = [(x[0], round(x[1], 2), x[2]) for x in gtyper.likelihoods]
    assert rounded == expected
def update_vcf_record_using_gramtools_allele_depths(
    vcf_record,
    allele_combination_cov,
    allele_per_base_cov,
    allele_groups_dict,
    mean_depth,
    read_error_rate,
    min_cov_more_than_error=None,
):
    """Genotype one VCF record from gramtools coverage and rewrite its fields.

    A Genotyper is built from the coverage arguments and run. vcf_record is
    then modified IN PLACE: QUAL is set to None, INFO and FORMAT are
    cleared, FILTER is reset to an empty set, and the FORMAT keys DP, GT,
    COV and GT_CONF are set from the genotyper results. COV has one value
    per allele (REF first, then each ALT), taken from the genotyper's
    singleton allele coverages, with 0 for alleles that have none.

    Returns a deep copy of the updated record in which ALT alleles with
    zero coverage are removed (unless they are part of the genotype), with
    COV and GT renumbered to match. When the genotype is "./." or "0/0"
    the copy is returned without any alleles removed.

    Raises AssertionError if the renumbered genotype does not end up with
    exactly two allele indexes.
    """
    gtyper = genotyper.Genotyper(
        mean_depth,
        read_error_rate,
        allele_combination_cov,
        allele_per_base_cov,
        allele_groups_dict,
        min_cov_more_than_error=min_cov_more_than_error,
    )
    gtyper.run()

    allele_count = 1 + len(vcf_record.ALT)
    genotype_indexes = set()
    if "." in gtyper.genotype:
        genotype = "./."
    else:
        # Index 0 is REF, index i + 1 is vcf_record.ALT[i].
        genotype_indexes = {i for i in range(allele_count) if i in gtyper.genotype}
        ordered = sorted(genotype_indexes)
        if len(ordered) == 1:
            # A single called allele is written as homozygous, e.g. "1/1".
            genotype = f"{ordered[0]}/{ordered[0]}"
        else:
            genotype = "/".join(str(x) for x in ordered)

    cov_values = [
        gtyper.singleton_allele_coverages.get(x, 0) for x in range(allele_count)
    ]
    cov_string = ",".join(str(x) for x in cov_values)

    vcf_record.QUAL = None
    vcf_record.INFO.clear()
    vcf_record.FILTER = set()
    vcf_record.FORMAT.clear()
    vcf_record.set_format_key_value("DP", str(sum(allele_combination_cov.values())))
    vcf_record.set_format_key_value("GT", genotype)
    vcf_record.set_format_key_value("COV", cov_string)
    vcf_record.set_format_key_value("GT_CONF", str(gtyper.genotype_confidence))

    # Make new record where all zero coverage alleles are removed
    filtered_record = copy.deepcopy(vcf_record)
    if genotype in ["./.", "0/0"]:
        return filtered_record

    # Always keep REF (index 0), every allele with coverage, and every
    # allele in the genotype call.
    indexes_to_keep = {i for i, cov in enumerate(cov_values) if i == 0 or cov > 0}
    indexes_to_keep.update(genotype_indexes)
    indexes_to_keep = sorted(indexes_to_keep)
    filtered_record.set_format_key_value(
        "COV", ",".join(str(cov_values[i]) for i in indexes_to_keep)
    )
    assert indexes_to_keep[0] == 0
    filtered_record.ALT = [filtered_record.ALT[i - 1] for i in indexes_to_keep[1:]]

    # The indexes of the genotype string 'n/m' are shifted because
    # we probably removed some alleles
    genotype_strings = {
        vcf_record.REF if i == 0 else vcf_record.ALT[i - 1] for i in genotype_indexes
    }
    new_genotype_indexes = set()
    if 0 in genotype_indexes:
        new_genotype_indexes.add(0)
    for i, genotype_string in enumerate(filtered_record.ALT):
        if genotype_string in genotype_strings:
            new_genotype_indexes.add(i + 1)
            if len(genotype_strings) == len(new_genotype_indexes):
                break

    # Sort explicitly: iteration order of a set is not guaranteed to be
    # ascending, and the GT written on the unfiltered record is sorted.
    new_genotype_indexes = sorted(new_genotype_indexes)
    if len(new_genotype_indexes) == 1:
        new_genotype_indexes.append(new_genotype_indexes[0])
    assert len(new_genotype_indexes) == 2
    filtered_record.set_format_key_value(
        "GT", "/".join(str(x) for x in new_genotype_indexes)
    )
    return filtered_record
def write_vcf_annotated_using_coverage_from_gramtools(
    mean_depth,
    depth_variance,
    vcf_records,
    all_allele_coverage,
    allele_groups,
    read_error_rate,
    outfile,
    sample_name="SAMPLE",
    filtered_outfile=None,
    ref_seq_lengths=None,
    call_hets=False,
):
    """Genotype every record and write the annotated VCF file(s).

    mean_depth, vcf_records, all_allele_coverage, allele_groups should be
    those returned by load_gramtools_vcf_and_allele_coverage_files().
    vcf_records and all_allele_coverage must have the same length; entry i
    of all_allele_coverage is indexed with [0] and [1] to get the two
    coverage structures for record i.

    A single Genotyper is built from mean_depth, depth_variance and
    read_error_rate and reused for every record. Each record is passed to
    update_vcf_record_using_gramtools_allele_depths and then written to
    outfile. If filtered_outfile is given, the record returned by that
    call is written there, under the same header. If ref_seq_lengths (a
    dict of name -> length) is given, one ##contig header line is written
    per entry, sorted by name.

    Raises:
        AssertionError: vcf_records and all_allele_coverage differ in length.
        NotImplementedError: call_hets is true.
    """
    assert len(vcf_records) == len(all_allele_coverage)
    if call_hets:
        raise NotImplementedError("Heterozygous calling is not implemented")

    header_lines = [
        "##fileformat=VCFv4.2",
        "##source=minos, version " + minos_version,
        "##fileDate=" + str(datetime.date.today()),
        '##FORMAT=<ID=COV,Number=R,Type=Integer,Description="Number of reads on ref and alt alleles">',
        '##FORMAT=<ID=FRS,Number=1,Type=Float,Description="Fraction of reads that support the genotype call">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total read depth from gramtools">',
        '##FORMAT=<ID=DPF,Number=1,Type=Float,Description="Depth Fraction, defined as DP divided by mean depth">',
        '##FORMAT=<ID=GT_CONF,Number=1,Type=Float,Description="Genotype confidence. Difference in log likelihood of most likely and next most likely genotype">',
        f"##minosMeanReadDepth={mean_depth}",
    ]
    if ref_seq_lengths is not None:
        for name, length in sorted(ref_seq_lengths.items()):
            header_lines.append(f"##contig=<ID={name},length={length}>")
    header_lines.append(
        "\t".join(
            [
                "#CHROM",
                "POS",
                "ID",
                "REF",
                "ALT",
                "QUAL",
                "FILTER",
                "INFO",
                "FORMAT",
                sample_name,
            ]
        )
    )

    gtyper = genotyper.Genotyper(
        mean_depth,
        depth_variance,
        read_error_rate,
        call_hets=call_hets,
    )

    # The filtered file is optional, so it cannot simply share the `with`
    # statement below. try/finally guarantees it is closed even when
    # genotyping or writing raises part-way through.
    f_filter = None
    try:
        if filtered_outfile is not None:
            f_filter = open(filtered_outfile, "w")
            print(*header_lines, sep="\n", file=f_filter)

        with open(outfile, "w") as f:
            print(*header_lines, sep="\n", file=f)

            for record, coverage in zip(vcf_records, all_allele_coverage):
                logging.debug("Genotyping: %s", record)
                # NOTE(review): this call passes the shared Genotyper as the
                # second argument - confirm it matches the signature of the
                # update_vcf_record_using_gramtools_allele_depths in use.
                filtered_record = update_vcf_record_using_gramtools_allele_depths(
                    record,
                    gtyper,
                    coverage[0],
                    coverage[1],
                    allele_groups,
                )
                # `record` is printed after the update call, as before.
                print(record, file=f)
                if f_filter is not None:
                    print(filtered_record, file=f_filter)
    finally:
        if f_filter is not None:
            f_filter.close()
def update_vcf_record_using_gramtools_allele_depths(
        vcf_record, allele_combination_cov, allele_per_base_cov,
        allele_groups_dict, mean_depth, read_error_rate, kmer_size):
    '''Genotype one VCF record from gramtools coverage and rewrite its fields.

    A Genotyper is built from the coverage arguments and run. vcf_record is
    then modified IN PLACE: QUAL is set to None, FILTER to '.', INFO to
    {'KMER': str(kmer_size)}, and FORMAT is replaced by the keys DP, GT,
    COV and GT_CONF taken from the genotyper results. COV has one value per
    allele (REF first, then each ALT), taken from the genotyper's singleton
    allele coverages, with 0 for alleles that have none.

    Returns a deep copy of the updated record in which ALT alleles with
    zero coverage are removed (unless they are part of the genotype), with
    COV and GT renumbered to match. When the genotype is './.' or '0/0'
    the copy is returned without any alleles removed.

    Raises AssertionError if the renumbered genotype does not end up with
    exactly two allele indexes.
    '''
    gtyper = genotyper.Genotyper(mean_depth, read_error_rate, allele_combination_cov, allele_per_base_cov, allele_groups_dict)
    gtyper.run()

    allele_count = 1 + len(vcf_record.ALT)
    genotype_indexes = set()
    if '.' in gtyper.genotype:
        genotype = './.'
    else:
        # Index 0 is REF, index i + 1 is vcf_record.ALT[i].
        genotype_indexes = {i for i in range(allele_count) if i in gtyper.genotype}
        ordered = sorted(genotype_indexes)
        if len(ordered) == 1:
            # A single called allele is written as homozygous, e.g. '1/1'.
            genotype = str(ordered[0]) + '/' + str(ordered[0])
        else:
            genotype = '/'.join(str(x) for x in ordered)

    cov_values = [
        gtyper.singleton_alleles_cov.get(x, 0) for x in range(allele_count)
    ]
    cov_string = ','.join(str(x) for x in cov_values)

    vcf_record.QUAL = None
    vcf_record.FILTER = '.'
    vcf_record.INFO = {'KMER': str(kmer_size)}
    vcf_record.format_keys = ['DP', 'GT', 'COV', 'GT_CONF']
    vcf_record.FORMAT = {
        'DP': str(sum(allele_combination_cov.values())),
        'GT': genotype,
        'COV': cov_string,
        'GT_CONF': str(gtyper.genotype_confidence),
    }

    # Make new record where all zero coverage alleles are removed
    filtered_record = copy.deepcopy(vcf_record)
    if genotype in ['./.', '0/0']:
        return filtered_record

    # Always keep REF (index 0), every allele with coverage, and every
    # allele in the genotype call.
    indexes_to_keep = {i for i, cov in enumerate(cov_values) if i == 0 or cov > 0}
    indexes_to_keep.update(genotype_indexes)
    indexes_to_keep = sorted(indexes_to_keep)
    filtered_record.FORMAT['COV'] = ','.join(
        str(cov_values[i]) for i in indexes_to_keep)
    assert indexes_to_keep[0] == 0
    filtered_record.ALT = [filtered_record.ALT[i - 1] for i in indexes_to_keep[1:]]

    # The indexes of the genotype string 'n/m' are shifted because
    # we probably removed some alleles
    genotype_strings = {
        vcf_record.REF if i == 0 else vcf_record.ALT[i - 1] for i in genotype_indexes
    }
    new_genotype_indexes = set()
    if 0 in genotype_indexes:
        new_genotype_indexes.add(0)
    for i, genotype_string in enumerate(filtered_record.ALT):
        if genotype_string in genotype_strings:
            new_genotype_indexes.add(i + 1)
            if len(genotype_strings) == len(new_genotype_indexes):
                break

    # Sort explicitly: iteration order of a set is not guaranteed to be
    # ascending, and the GT written on the unfiltered record is sorted.
    new_genotype_indexes = sorted(new_genotype_indexes)
    if len(new_genotype_indexes) == 1:
        new_genotype_indexes.append(new_genotype_indexes[0])
    assert len(new_genotype_indexes) == 2
    filtered_record.FORMAT['GT'] = '/'.join(
        str(x) for x in new_genotype_indexes)
    return filtered_record