Пример #1
0
def annotate_snp_mismatch(
        t: Union[hl.MatrixTable, hl.Table], data_type: str,
        rg: hl.genetics.ReferenceGenome) -> Union[hl.MatrixTable, hl.Table]:
    """
    Annotates mismatches between reference allele and allele in reference fasta

    Assumes input Table/MatrixTable has t.new_locus annotation

    :param t: Table/MatrixTable of SNPs to be annotated
    :param data_type: Data type (exomes or genomes for gnomAD; not used otherwise)
    :param rg: Reference genome with fasta sequence loaded
    :return: Table annotated with mismatches between reference allele and allele in fasta
    """

    logger.info('Filtering to SNPs')
    snp_expr = hl.is_snp(t.alleles[0], t.alleles[1])
    t = t.filter(snp_expr) if isinstance(t,
                                         hl.Table) else t.filter_rows(snp_expr)

    mismatch_expr = {
        'reference_mismatch':
        hl.cond(t.new_locus.is_negative_strand,
                (flip_base(t.alleles[0]) != hl.get_sequence(
                    t.locus.contig, t.locus.position, reference_genome=rg)),
                (t.alleles[0] != hl.get_sequence(
                    t.locus.contig, t.locus.position, reference_genome=rg)))
    }
    logger.info(
        'Checking if reference allele matches what is in reference fasta')
    logger.info(
        'For SNPs on the negative strand, make sure the reverse complement of the ref alleles matches what is in the ref fasta'
    )
    return t.annotate(**mismatch_expr) if isinstance(
        t, hl.Table) else t.annotate_rows(**mismatch_expr)
Пример #2
0
def annotate_snp_mismatch(
        t: Union[hl.MatrixTable, hl.Table],
        rg: hl.genetics.ReferenceGenome) -> Union[hl.MatrixTable, hl.Table]:
    """
    Annotates mismatches between reference allele and allele in reference fasta

    Assumes input Table/MatrixTable has t.new_locus annotation

    :param t: Table/MatrixTable of SNPs to be annotated
    :param rg: Reference genome with fasta sequence loaded
    :return: Table annotated with mismatches between reference allele and allele in fasta
    """

    logger.info("Filtering to SNPs")
    snp_expr = hl.is_snp(t.alleles[0], t.alleles[1])
    t = t.filter(snp_expr) if isinstance(t,
                                         hl.Table) else t.filter_rows(snp_expr)

    mismatch_expr = {
        "reference_mismatch":
        hl.cond(
            t.new_locus.is_negative_strand,
            (hl.reverse_complement(t.alleles[0]) != hl.get_sequence(
                t.locus.contig, t.locus.position, reference_genome=rg)),
            (t.alleles[0] != hl.get_sequence(
                t.locus.contig, t.locus.position, reference_genome=rg)),
        )
    }
    logger.info(
        "Checking if reference allele matches what is in reference fasta")
    logger.info(
        "For SNPs on the negative strand, make sure the reverse complement of the ref alleles matches what is in the ref fasta"
    )
    return (t.annotate(**mismatch_expr)
            if isinstance(t, hl.Table) else t.annotate_rows(**mismatch_expr))
Пример #3
0
    def test_reference_genome_sequence(self):
        gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(gr3.name, "my_reference_genome")
        self.assertFalse(gr3.has_sequence())

        gr4 = ReferenceGenome.from_fasta_file(
            "test_rg",
            resource("fake_reference.fasta"),
            resource("fake_reference.fasta.fai"),
            mt_contigs=["b", "c"],
            x_contigs=["a"])
        self.assertTrue(gr4.has_sequence())
        self.assertTrue(gr4.x_contigs == ["a"])

        t = hl.import_table(resource("fake_reference.tsv"), impute=True)
        self.assertTrue(
            hl.eval(
                t.all(
                    hl.get_sequence(t.contig, t.pos, reference_genome=gr4) ==
                    t.base)))

        l = hl.locus("a", 7, gr4)
        self.assertTrue(
            hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))

        gr4.remove_sequence()
        assert not gr4.has_sequence()

        gr4.add_sequence(resource("fake_reference.fasta"),
                         resource("fake_reference.fasta.fai"))
        assert gr4.has_sequence()
Пример #4
0
    def test_reference_genome_sequence(self):
        gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(gr3.name, "my_reference_genome")
        self.assertFalse(gr3.has_sequence())

        gr4 = ReferenceGenome.from_fasta_file("test_rg", resource("fake_reference.fasta"),
                                              resource("fake_reference.fasta.fai"),
                                              mt_contigs=["b", "c"], x_contigs=["a"])
        self.assertTrue(gr4.has_sequence())
        self.assertTrue(gr4.x_contigs == ["a"])

        t = hl.import_table(resource("fake_reference.tsv"), impute=True)
        self.assertTrue(hl.eval(t.all(hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

        l = hl.locus("a", 7, gr4)
        self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))
Пример #5
0
    def test_classes(self):
        l = Locus.parse('1:100')

        self.assertEqual(l, Locus('1', 100))
        self.assertEqual(l, Locus(1, 100))
        self.assertEqual(l.reference_genome, hl.default_reference())

        c_hom_ref = Call([0, 0])
        self.assertEqual(c_hom_ref.alleles, [0, 0])
        self.assertEqual(c_hom_ref.ploidy, 2)
        self.assertFalse(c_hom_ref.phased)
        self.assertFalse(c_hom_ref.is_haploid())
        self.assertTrue(c_hom_ref.is_diploid())
        self.assertEqual(c_hom_ref.n_alt_alleles(), 0)
        self.assertTrue(c_hom_ref.one_hot_alleles(2) == [2, 0])
        self.assertTrue(c_hom_ref.is_hom_ref())
        self.assertFalse(c_hom_ref.is_het())
        self.assertFalse(c_hom_ref.is_hom_var())
        self.assertFalse(c_hom_ref.is_non_ref())
        self.assertFalse(c_hom_ref.is_het_non_ref())
        self.assertFalse(c_hom_ref.is_het_ref())
        self.assertTrue(c_hom_ref.unphased_diploid_gt_index() == 0)

        c_het_phased = Call([1, 0], phased=True)
        self.assertEqual(c_het_phased.alleles, [1, 0])
        self.assertEqual(c_het_phased.ploidy, 2)
        self.assertTrue(c_het_phased.phased)
        self.assertFalse(c_het_phased.is_haploid())
        self.assertTrue(c_het_phased.is_diploid())
        self.assertEqual(c_het_phased.n_alt_alleles(), 1)
        self.assertTrue(c_het_phased.one_hot_alleles(2) == [1, 1])
        self.assertFalse(c_het_phased.is_hom_ref())
        self.assertTrue(c_het_phased.is_het())
        self.assertFalse(c_het_phased.is_hom_var())
        self.assertTrue(c_het_phased.is_non_ref())
        self.assertFalse(c_het_phased.is_het_non_ref())
        self.assertTrue(c_het_phased.is_het_ref())

        c_hom_var = Call([1, 1])
        self.assertEqual(c_hom_var.alleles, [1, 1])
        self.assertEqual(c_hom_var.ploidy, 2)
        self.assertFalse(c_hom_var.phased)
        self.assertFalse(c_hom_var.is_haploid())
        self.assertTrue(c_hom_var.is_diploid())
        self.assertEqual(c_hom_var.n_alt_alleles(), 2)
        self.assertTrue(c_hom_var.one_hot_alleles(2) == [0, 2])
        self.assertFalse(c_hom_var.is_hom_ref())
        self.assertFalse(c_hom_var.is_het())
        self.assertTrue(c_hom_var.is_hom_var())
        self.assertTrue(c_hom_var.is_non_ref())
        self.assertFalse(c_hom_var.is_het_non_ref())
        self.assertFalse(c_hom_var.is_het_ref())
        self.assertTrue(c_hom_var.unphased_diploid_gt_index() == 2)

        c_haploid = Call([2], phased=True)
        self.assertEqual(c_haploid.alleles, [2])
        self.assertEqual(c_haploid.ploidy, 1)
        self.assertTrue(c_haploid.phased)
        self.assertTrue(c_haploid.is_haploid())
        self.assertFalse(c_haploid.is_diploid())
        self.assertEqual(c_haploid.n_alt_alleles(), 1)
        self.assertTrue(c_haploid.one_hot_alleles(3) == [0, 0, 1])
        self.assertFalse(c_haploid.is_hom_ref())
        self.assertFalse(c_haploid.is_het())
        self.assertTrue(c_haploid.is_hom_var())
        self.assertTrue(c_haploid.is_non_ref())
        self.assertFalse(c_haploid.is_het_non_ref())
        self.assertFalse(c_haploid.is_het_ref())

        c_zeroploid = Call([])
        self.assertEqual(c_zeroploid.alleles, [])
        self.assertEqual(c_zeroploid.ploidy, 0)
        self.assertFalse(c_zeroploid.phased)
        self.assertFalse(c_zeroploid.is_haploid())
        self.assertFalse(c_zeroploid.is_diploid())
        self.assertEqual(c_zeroploid.n_alt_alleles(), 0)
        self.assertTrue(c_zeroploid.one_hot_alleles(3) == [0, 0, 0])
        self.assertFalse(c_zeroploid.is_hom_ref())
        self.assertFalse(c_zeroploid.is_het())
        self.assertFalse(c_zeroploid.is_hom_var())
        self.assertFalse(c_zeroploid.is_non_ref())
        self.assertFalse(c_zeroploid.is_het_non_ref())
        self.assertFalse(c_zeroploid.is_het_ref())

        self.assertRaisesRegex(
            NotImplementedError,
            "Calls with greater than 2 alleles are not supported.", Call,
            [1, 1, 1, 1])

        rg = hl.get_reference('GRCh37')
        self.assertEqual(rg.name, "GRCh37")
        self.assertEqual(rg.contigs[0], "1")
        self.assertListEqual(rg.x_contigs, ["X"])
        self.assertListEqual(rg.y_contigs, ["Y"])
        self.assertListEqual(rg.mt_contigs, ["MT"])
        self.assertEqual(rg.par[0],
                         hl.parse_locus_interval("X:60001-2699521").value)
        self.assertEqual(rg.contig_length("1"), 249250621)

        name = "test"
        contigs = ["1", "X", "Y", "MT"]
        lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
        x_contigs = ["X"]
        y_contigs = ["Y"]
        mt_contigs = ["MT"]
        par = [("X", 5, 1000)]

        gr2 = ReferenceGenome(name, contigs, lengths, x_contigs, y_contigs,
                              mt_contigs, par)
        self.assertEqual(gr2.name, name)
        self.assertListEqual(gr2.contigs, contigs)
        self.assertListEqual(gr2.x_contigs, x_contigs)
        self.assertListEqual(gr2.y_contigs, y_contigs)
        self.assertListEqual(gr2.mt_contigs, mt_contigs)
        self.assertEqual(gr2.par,
                         [hl.parse_locus_interval("X:5-1000", gr2).value])
        self.assertEqual(gr2.contig_length("1"), 10000)
        self.assertDictEqual(gr2.lengths, lengths)
        gr2.write("/tmp/my_gr.json")

        gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(gr3.name, "my_reference_genome")
        self.assertFalse(gr3.has_sequence())

        gr4 = ReferenceGenome.from_fasta_file(
            "test_rg",
            resource("fake_reference.fasta"),
            resource("fake_reference.fasta.fai"),
            mt_contigs=["b", "c"],
            x_contigs=["a"])
        self.assertTrue(gr4.has_sequence())
        self.assertTrue(gr4.x_contigs == ["a"])

        t = hl.import_table(resource("fake_reference.tsv"), impute=True)
        self.assertTrue(t.all(hl.get_sequence(gr4, t.contig, t.pos) == t.base))

        l = hl.locus("a", 7, gr4)
        self.assertTrue(
            l.sequence_context(before=3, after=3).value == "TTTCGAA")