예제 #1
0
 def test_iterate_exclude_filters(self):
     """Check read counts when selected filters are ignored during iteration.

     The unmapped, quality (threshold 35) and strict non-unique filters are
     queued and run in one pass. ``reads.reads(excluded_filters=...)`` is
     then iterated with several combinations of filter names and the number
     of yielded reads is compared with the expected count.
     """
     reads = Reads(self.sam1_file)
     # try/finally so the Reads object is closed even when an assert fails.
     try:
         reads.filter_unmapped(queue=True)
         reads.filter_quality(35, queue=True)
         reads.filter_non_unique(strict=True, queue=True)
         reads.run_queued_filters()

         def count(excluded):
             # Number of reads yielded when the named filters are excluded.
             return len(list(reads.reads(excluded_filters=excluded)))

         assert count(['unmapped', 'mapq', 'uniqueness']) == 271
         assert count(['unmapped', 'uniqueness']) == 246
         assert count(['unmapped', 'mapq']) == 153
         assert count(['mapq', 'uniqueness']) == 271
         assert count(['unmapped']) == 153
         assert count(['mapq']) == 153
         assert count(['uniqueness']) == 246
     finally:
         reads.close()
예제 #2
0
class TestAccessOptimisedReadPairs(TestFragmentMappedReadPairs):
    """Run the inherited read-pair tests against ``AccessOptimisedReadPairs``."""

    # NOTE: setup_method is a plain instance method (not a classmethod) so
    # that fixtures live on the test instance instead of leaking into shared
    # class state between tests.
    def setup_method(self, method):
        """Load the lambda test reads and build the pairs object under test."""
        self.dir = os.path.dirname(os.path.realpath(__file__))
        sam1_file = self.dir + "/test_seq/lambda_reads1.sam"
        sam2_file = self.dir + "/test_seq/lambda_reads2.sam"
        self.reads1 = Reads(sam1_file)
        self.reads2 = Reads(sam2_file)
        self.reads1.filter_unmapped()
        self.reads2.filter_unmapped()
        self.genome = Genome.from_folder(self.dir + "/test_seq/lambda_genome/")

        self.pairs_class = AccessOptimisedReadPairs
        self.pairs = self.pairs_class()
        regions = self.genome.get_regions(1000)
        # Close the regions object even if loading the pairs raises.
        try:
            self.pairs.load(self.reads1, self.reads2, regions=regions)
        finally:
            regions.close()

    def teardown_method(self, method):
        """Close every resource opened in ``setup_method``."""
        self.reads1.close()
        self.reads2.close()
        self.genome.close()
        self.pairs.close()
예제 #3
0
class TestBWAFragmentMappedReads:
    """Tests for read pairs built from the BWA test SAM files and MboI regions."""

    def setup_method(self, method):
        """Load both BWA SAM files, drop unmapped reads and build the pairs."""
        self.dir = os.path.dirname(os.path.realpath(__file__))
        sam1_file = self.dir + "/test_seq/test_bwa1.sam"
        sam2_file = self.dir + "/test_seq/test_bwa2.sam"
        self.reads1 = Reads()
        self.reads2 = Reads()
        self.reads1.load(sam1_file)
        self.reads2.load(sam2_file)
        self.reads1.filter_unmapped()
        self.reads2.filter_unmapped()
        self.genome = Genome.from_folder(self.dir + "/test_seq/dmel_genome/")
        self.pairs = FragmentMappedReadPairs()
        regions = self.genome.get_regions('MboI')
        # Close the regions object even if loading the pairs raises.
        try:
            self.pairs.load(self.reads1, self.reads2, regions=regions)
        finally:
            regions.close()

    def teardown_method(self, method):
        """Close every resource opened in ``setup_method``."""
        self.reads1.close()
        self.reads2.close()
        self.genome.close()
        self.pairs.close()

    def test_loaded_bwamem_pairs(self):
        """The loaded object reports the expected single and pair counts."""
        assert self.pairs._single_count == 896
        assert self.pairs._pair_count == 515

    def test_pcr_duplicate_filter(self):
        """Filtering PCR duplicates (threshold 3) reduces 515 pairs to 512."""
        mask = self.pairs.add_mask_description(
            'pcr_duplicate', 'Mask suspected PCR duplicates.')
        # Pass the registered mask to the filter, as the other filter tests
        # do; previously the mask was created but never used.
        # NOTE(review): assumes PCRDuplicateFilter accepts a ``mask`` keyword
        # like the other pair filters - confirm against its signature.
        pcr_duplicate_filter = PCRDuplicateFilter(pairs=self.pairs,
                                                  threshold=3,
                                                  mask=mask)

        assert len(self.pairs) == 515
        self.pairs.filter(pcr_duplicate_filter)
        assert len(self.pairs) == 512
예제 #4
0
class TestFragmentMappedReadPairs:
    """Tests for read pairs built from the lambda test SAM files.

    Subclasses can reuse these tests with a different pairs implementation
    by overriding ``setup_method`` and setting ``self.pairs_class``.
    """

    # NOTE: setup_method is a plain instance method (not a classmethod) so
    # that fixtures live on the test instance instead of leaking into shared
    # class state between tests.
    def setup_method(self, method):
        """Load the lambda test reads and build the pairs object under test."""
        self.dir = os.path.dirname(os.path.realpath(__file__))
        sam1_file = self.dir + "/test_seq/lambda_reads1.sam"
        sam2_file = self.dir + "/test_seq/lambda_reads2.sam"
        self.reads1 = Reads(sam1_file)
        self.reads2 = Reads(sam2_file)
        self.reads1.filter_unmapped()
        self.reads2.filter_unmapped()
        self.genome = Genome.from_folder(self.dir + "/test_seq/lambda_genome/")

        self.pairs_class = FragmentMappedReadPairs
        self.pairs = self.pairs_class()
        regions = self.genome.get_regions(1000)
        # Close the regions object even if loading the pairs raises.
        try:
            self.pairs.load(self.reads1, self.reads2, regions=regions)
        finally:
            regions.close()

    def teardown_method(self, method):
        """Close every resource opened in ``setup_method``."""
        self.reads1.close()
        self.reads2.close()
        self.genome.close()
        self.pairs.close()

    def test_select(self):
        """Index access returns fully populated ``FragmentReadPair`` objects."""
        pair = self.pairs[0]
        assert isinstance(pair, FragmentReadPair)
        assert pair.left.position == 40884
        assert pair.right.position == 41039
        assert pair.left.strand == 1
        assert pair.right.strand == -1
        assert isinstance(pair.left.fragment, GenomicRegion)
        assert isinstance(pair.right.fragment, GenomicRegion)
        assert pair.left.fragment.start == 40001
        assert pair.left.fragment.end == 41000
        assert pair.left.fragment.chromosome == 'gi|9626243|ref|NC_001416.1|'
        assert pair.right.fragment.start == 41001
        assert pair.right.fragment.end == 42000
        assert pair.right.fragment.chromosome == 'gi|9626243|ref|NC_001416.1|'

        pair = self.pairs[1]
        assert isinstance(pair, FragmentReadPair)
        assert pair.left.position == 19617
        assert pair.right.position == 19736
        assert pair.left.strand == 1
        assert pair.right.strand == -1
        assert isinstance(pair.left.fragment, GenomicRegion)
        assert isinstance(pair.right.fragment, GenomicRegion)
        assert pair.left.fragment.start == 19001
        assert pair.left.fragment.end == 20000
        assert pair.left.fragment.chromosome == 'gi|9626243|ref|NC_001416.1|'
        assert pair.right.fragment.start == 19001
        assert pair.right.fragment.end == 20000
        assert pair.right.fragment.chromosome == 'gi|9626243|ref|NC_001416.1|'

        # Negative indices select from the end.
        pair = self.pairs[-1]
        assert isinstance(pair, FragmentReadPair)
        assert pair.left.position == 25765
        assert pair.right.position == 25622
        assert pair.left.strand == -1
        assert pair.right.strand == 1
        assert isinstance(pair.left.fragment, GenomicRegion)
        assert isinstance(pair.right.fragment, GenomicRegion)
        assert pair.left.fragment.start == 25001
        assert pair.left.fragment.end == 26000
        assert pair.left.fragment.chromosome == 'gi|9626243|ref|NC_001416.1|'
        assert pair.right.fragment.start == 25001
        assert pair.right.fragment.end == 26000
        assert pair.right.fragment.chromosome == 'gi|9626243|ref|NC_001416.1|'

    def test_iter(self):
        """Every iterated pair has valid types, strands and coordinates."""
        for pair in self.pairs:
            assert isinstance(pair, FragmentReadPair)
            assert isinstance(pair.left, FragmentRead)
            assert isinstance(pair.right, FragmentRead)
            assert isinstance(pair.left.fragment, GenomicRegion)
            assert isinstance(pair.right.fragment, GenomicRegion)
            assert pair.left.position > 0 or pair.left.position == -1
            assert pair.right.position > 0 or pair.right.position == -1
            assert pair.left.strand == -1 or pair.left.strand == 1
            assert pair.right.strand == -1 or pair.right.strand == 1

            assert 0 < pair.left.fragment.start <= pair.left.fragment.end
            assert 0 < pair.right.fragment.start <= pair.right.fragment.end
            # A positive position must lie inside its fragment.
            if pair.left.position > 0:
                assert pair.left.fragment.start <= pair.left.position <= pair.left.fragment.end
            if pair.right.position > 0:
                assert pair.right.fragment.start <= pair.right.position <= pair.right.fragment.end

    def test_len(self):
        """The lambda test data yields 44 pairs."""
        assert len(self.pairs) == 44

    def test_single(self):
        """Six reads are stored as singles."""
        assert len(self.pairs._single) == 6

    def test_auto_mindist(self):
        """``_auto_dist`` returns the expected distances for fixed inputs."""
        ad = self.pairs_class._auto_dist
        # Fixed seed keeps the randomly drawn ``b`` values reproducible.
        np.random.seed(101)
        x = np.linspace(1, 100, 10)
        i = [
            4.0, 3.0, 1.0, 1.2, 0.4, 0.8, 0.5, 0.4, 0.6, 0.5, 0.6, 0.5, 0.5,
            0.5
        ]
        o = [
            0.2, 0.4, 0.3, 0.6, 0.5, 0.4, 0.5, 0.5, 0.6, 0.5, 0.5, 0.4, 0.5,
            0.5
        ]
        b = np.random.normal(500, 200, 14).astype(int)
        assert ad(x, i, b, 0.05) == 67
        assert ad(x, o, b, 0.05) == 45

    def test_filter_inward(self):
        """The inward filter (minimum distance 100) leaves 18 of 44 pairs."""
        mask = self.pairs.add_mask_description(
            'inwards',
            'Mask read pairs that inward facing and closer than 100bp')
        in_filter = InwardPairsFilter(minimum_distance=100, mask=mask)

        assert len(self.pairs) == 44
        self.pairs.filter(in_filter)

        assert len(self.pairs) == 18

    def test_filter_outward(self):
        """The outward filter (minimum distance 100) leaves 28 of 44 pairs."""
        mask = self.pairs.add_mask_description(
            'outwards',
            'Mask read pairs that outward facing and closer than 100bp')
        out_filter = OutwardPairsFilter(minimum_distance=100, mask=mask)

        assert len(self.pairs) == 44
        self.pairs.filter(out_filter)
        assert len(self.pairs) == 28

    def test_filter_redist(self):
        """The RE-distance filter (maximum distance 300) leaves 13 of 44 pairs."""
        mask = self.pairs.add_mask_description(
            're-dist',
            'Mask read pairs where one half maps more than 100bp away from both RE sites'
        )
        re_filter = ReDistanceFilter(maximum_distance=300, mask=mask)

        assert len(self.pairs) == 44
        self.pairs.filter(re_filter)
        assert len(self.pairs) == 13

    def test_filter_self_ligated(self):
        """The self-ligation filter leaves 7 of 44 pairs."""
        mask = self.pairs.add_mask_description(
            'self_ligated',
            'Mask read pairs that represent self-ligated fragments')
        self_ligation_filter = SelfLigationFilter(mask=mask)

        assert len(self.pairs) == 44
        self.pairs.filter(self_ligation_filter)
        assert len(self.pairs) == 7

    def test_get_ligation_structure_biases(self):
        """Ligation structure biases on the yeast chrI sample match expectations."""
        reads1 = Reads(self.dir +
                       "/../data/test_genomic/yeast.sample.chrI.1.sam")
        reads2 = Reads(self.dir +
                       "/../data/test_genomic/yeast.sample.chrI.2.sam")
        chrI = Chromosome.from_fasta(self.dir +
                                     "/../data/test_genomic/chrI.fa")
        genome = Genome(chromosomes=[chrI])
        pairs = self.pairs_class()
        # try/finally so the pairs object is closed even when an assert fails.
        try:
            regions = genome.get_regions('HindIII')
            pairs.load(reads1, reads2, regions)
            reads1.close()
            reads2.close()
            genome.close()
            regions.close()
            x, i, o, b = pairs.get_ligation_structure_biases(
                sampling=200, skip_self_ligations=False)
            assert len(x) == len(i) == len(o) == len(b) == 3
            assert x.tolist() == [494, 4487, 19399]
            assert i.tolist() == [
                2.616915422885572, 0.8059701492537313, 0.6417910447761194
            ]
            assert o.tolist() == [
                0.2537313432835821, 0.24378109452736318, 0.46766169154228854
            ]
            assert b.tolist() == [778, 412, 424]
        finally:
            pairs.close()

    def test_re_dist(self):
        """``re_distance`` of a read on a 1-1000 fragment matches expectations."""
        read1 = FragmentRead(GenomicRegion(chromosome='chr1',
                                           start=1,
                                           end=1000),
                             position=200,
                             strand=-1)
        assert read1.re_distance() == 199
        read2 = FragmentRead(GenomicRegion(chromosome='chr1',
                                           start=1,
                                           end=1000),
                             position=990,
                             strand=-1)
        assert read2.re_distance() == 10

    def test_iterate_exclude_filters(self):
        """Pair counts when selected filters are ignored during iteration.

        Excluded filters may be given by name, as filter objects, as a mask
        object or as an integer, as exercised by the final assertions.
        """
        mask = self.pairs.add_mask_description(
            'inwards',
            'Mask read pairs that inward facing and closer than 100bp')
        in_filter = InwardPairsFilter(minimum_distance=100, mask=mask)
        self.pairs.filter(in_filter)
        mask = self.pairs.add_mask_description(
            'outwards',
            'Mask read pairs that outward facing and closer than 100bp')
        out_filter = OutwardPairsFilter(minimum_distance=100, mask=mask)
        self.pairs.filter(out_filter)
        mask = self.pairs.add_mask_description(
            're-dist',
            'Mask read pairs where one half maps more than 100bp away from both RE sites'
        )
        re_filter = ReDistanceFilter(maximum_distance=300, mask=mask)
        self.pairs.filter(re_filter)
        assert len(self.pairs) == 0
        assert len(
            list(
                self.pairs.pairs(
                    excluded_filters=['inwards', 'outwards', 're-dist'
                                      ]))) == 44
        assert len(
            list(self.pairs.pairs(
                excluded_filters=['inwards', 'outwards']))) == 13
        assert len(
            list(self.pairs.pairs(
                excluded_filters=['inwards', 're-dist']))) == 28
        assert len(
            list(self.pairs.pairs(
                excluded_filters=['outwards', 're-dist']))) == 18
        assert len(list(self.pairs.pairs(excluded_filters=['inwards']))) == 11
        assert len(list(self.pairs.pairs(excluded_filters=['outwards']))) == 2
        assert len(list(self.pairs.pairs(excluded_filters=['re-dist']))) == 2
        assert len(
            list(self.pairs.pairs(
                excluded_filters=[in_filter, re_filter]))) == 28
        # ``mask`` here is the most recently created one ('re-dist').
        assert len(list(
            self.pairs.pairs(excluded_filters=[in_filter, mask]))) == 28
        assert len(list(
            self.pairs.pairs(excluded_filters=[in_filter, 3]))) == 28

    @pytest.mark.parametrize("gz", [False, True])
    @pytest.mark.parametrize("include_filtered", [False, True])
    def test_export_homer(self, gz, include_filtered, tmpdir_factory):
        """HOMER export writes one line per pair, plain or gzip-compressed.

        With ``include_filtered`` the self-ligation-filtered pairs are
        expected in the output as well (44 lines), otherwise only the
        7 unmasked pairs.
        """
        fn = str(
            tmpdir_factory.mktemp("output").join(".tsv.gz" if gz else ".tsv"))
        mask = self.pairs.add_mask_description(
            'self_ligated',
            'Mask read pairs that represent self-ligated fragments')
        self_ligation_filter = SelfLigationFilter(mask=mask)
        self.pairs.filter(self_ligation_filter)
        self.pairs.to_homer(fn, include_filtered=include_filtered)
        if gz:
            import gzip
            import sys
            read_handle = gzip.open(
                fn, mode="rt" if sys.version_info.major == 3 else "r")
        else:
            read_handle = open(fn, mode="r")
        with read_handle:
            lines = read_handle.readlines()
        # The expected count is computed first: the former
        # ``assert len(lines) == 44 if include_filtered else 7`` parsed as
        # ``(len(lines) == 44) if include_filtered else 7`` and therefore
        # always passed when include_filtered was False.
        expected_lines = 44 if include_filtered else 7
        assert len(lines) == expected_lines