예제 #1
0
    def test_kmerCountingFASTA(self):
        params_MQF = test_params(self.ksize, "MQF")
        params_PHMAP = test_params(self.ksize, "PHMAP")
        params_MAP = test_params(self.ksize, "MAP")

        KF_MQF = params_MQF.new_kf(self.ksize)
        KF_PHMAP = params_PHMAP.new_kf(self.ksize)
        KF_MAP = params_MAP.new_kf(self.ksize)

        self.assertFalse(KF_MQF.size())
        self.assertFalse(KF_PHMAP.size())
        self.assertFalse(KF_MAP.size())

        # kmerCounting
        kp.countKmersFromFile(KF_MQF, {
            "mode": 1,
            "k_size": self.ksize
        }, params_MQF.fasta_file, 1000)
        kp.countKmersFromFile(KF_PHMAP, {
            "mode": 1,
            "k_size": self.ksize
        }, params_PHMAP.fasta_file, 1000)
        kp.countKmersFromFile(KF_MAP, {
            "mode": 1,
            "k_size": self.ksize
        }, params_MAP.fasta_file, 1000)

        self.assertTrue(KF_MQF.size())
        self.assertTrue(KF_PHMAP.size())
        self.assertTrue(KF_MAP.size())
예제 #2
0
    def test_PHMAP(self):

        my_tmpdir = "tmp" + str(random.randint(1, 9999))
        os.mkdir(my_tmpdir)

        params = test_params(self.ksize, "PHMAP")
        params.Hasher = kp.TwoBitsHasher(self.ksize)

        KF = params.new_kf(self.ksize)
        self.assertTrue(KF.empty())

        # Create random kmers list
        kmers_list = params.generate_kmers(kmers_no=20)
        kmers_hashes = [params.Hasher.hash(kmer[0]) for kmer in kmers_list]

        for kmer in kmers_list:
            KF.insert(kmer[0], kmer[1])

        fileName = os.path.join(my_tmpdir, "tmp.kdataframeMQF")
        KF.save(fileName)
        KF_LOADED = kp.kDataFrame.load(fileName)

        it = KF_LOADED.begin()
        count = 0

        while it != KF_LOADED.end():
            self.assertTrue(it.getHashedKmer() in kmers_hashes)
            count += 1
            it.next()

        self.assertEqual(count, len(kmers_list))
        shutil.rmtree(my_tmpdir)
예제 #3
0
class TestkmerDecoder(unittest.TestCase):
    generic_params = test_params(10, "PHMAP")
    seq = "ACGTAGCATGCATGACGATGCTAGCGTGATGCTAGCTAGTCAGTAGCATGC"

    def test_kmers(self):
        KD_params = {
            "mode": 1,
            "k_size": 10,
        }

        KF_seq = kp.kDataFramePHMAP(KD_params["k_size"])
        KF_file = kp.kDataFramePHMAP(KD_params["k_size"])

        # void countKmersFromFile(kDataFrame * output, string mode, std::map<std::string, int> params, string filename, int chunk_size = 1000);
        kp.countKmersFromFile(KF_file, KD_params,
                              self.generic_params.small_fasta_file, 1)
        self.assertFalse(KF_file.empty())

        kp.countKmersFromString(KF_seq, KD_params, self.seq)
        self.assertFalse(KF_seq.empty())

    def test_skipmers(self):
        KD_params = {"mode": 2, "k_size": 10, "m": 2, "n": 3}

        KF_seq = kp.kDataFramePHMAP(KD_params["k_size"])
        KF_file = kp.kDataFramePHMAP(KD_params["k_size"])

        kp.countKmersFromFile(KF_file, KD_params,
                              self.generic_params.small_fasta_file, 1)
        self.assertFalse(KF_file.empty())

        kp.countKmersFromString(KF_seq, KD_params, self.seq)
        self.assertFalse(KF_seq.empty())

        self.assertEqual(KF_seq.size(), KF_file.size())
    def main_test(self, kf_type):
        params = test_params(kSize=21, kDataFrameType=kf_type)

        kmers_list = params.generate_kmers(kmers_no=1000)

        kFrame = params.create_new_kf()

        # Insert all kmers
        for kmer in kmers_list:
            kFrame.insert(kmer[0], kmer[1])

        kFrame.addColumn_int("intColumn")
        kFrame.addColumn_bool("boolColumn")
        kFrame.addColumn_double("doubleColumn")

        simColumns = dict()
        it = kFrame.begin()

        while it != kFrame.end():
            kmer = it.getKmer()
            _double = random.uniform(1.0, 100.0)
            _bool = bool(random.getrandbits(1))
            _int = random.randint(1, 1000)
            simColumns[kmer] = (_int, _bool, _double)
            kFrame.setKmerColumnValue_int("intColumn", kmer, _int)
            kFrame.setKmerColumnValue_bool("boolColumn", kmer, _bool)
            kFrame.setKmerColumnValue_double("doubleColumn", kmer, _double)
            it.next()

        for kmer, values in simColumns.items():
            _int = kFrame.getKmerColumnValue_int("intColumn", kmer)
            _bool = kFrame.getKmerColumnValue_bool("boolColumn", kmer)
            _double = kFrame.getKmerColumnValue_double("doubleColumn", kmer)

            golden_int = values[0]
            golden_bool = values[1]
            golden_double = values[2]

            self.assertEqual(golden_int, _int)
            self.assertEqual(golden_bool, _bool)
            self.assertEqual(golden_double, _double)
예제 #5
0
class TestKDataFrameMQF(unittest.TestCase):
    params = test_params(kSize=21, kDataFrameType="MQF")
    params.Hasher = kp.IntegerHasher(21)

    def test_emptykDataFrame(self):
        framesToBeTested = []
        kSizes = [21, 31]
        for kSize in kSizes:
            framesToBeTested.append(self.params.new_kf(kSize))

        for kFrame in framesToBeTested:
            self.assertTrue(kFrame.empty())
            self.assertEqual(kFrame.size(), 0)

    def test_insertOneTime(self):
        print(self._testMethodName)
        kSize = 31
        _kmer, rand_count = self.params.generate_singleKmer()

        kf = kp.kDataFrameMQF(kSize)
        self.assertTrue(kf.empty())

        kf.insert(_kmer, rand_count)
        self.assertFalse(kf.empty())
        self.assertEqual(kf.getCount(_kmer), rand_count)

    def test_insertNTimes(self):
        print(self._testMethodName)

        # Create random kmers list
        kmers_list = self.params.generate_kmers(kmers_no=20)

        # Empty kDataFrames
        kFrame = self.params.create_new_kf()
        self.assertTrue(kFrame.empty())

        insertedKmers = 0
        kmers_hash_values = [
            self.params.Hasher.hash(kmer[0]) for kmer in kmers_list
        ]

        # Insert all kmers
        for kmer in kmers_list:
            if kFrame.insert(kmer[0], kmer[1]):
                insertedKmers += 1

        # Assert all kmers are inserted
        self.assertEqual(insertedKmers, len(kmers_list))

        # Verify inserted kmers
        it = kFrame.begin()
        while it != kFrame.end():
            self.assertTrue(it.getHashedKmer() in kmers_hash_values)
            it.next()

        # Verify all inserted kmers
        for kmer in kmers_list:
            c = kFrame.getCount(kmer[0])
            self.assertEqual(c, kmer[1])

    def test_eraseKmers(self):
        print(self._testMethodName)

        # Create random kmers list
        kmers_list = self.params.generate_kmers(kmers_no=20)

        # Empty kDataFrames
        kFrames = self.params.create_empty_kframes(2)
        self.assertTrue(kFrames[0].empty())
        self.assertTrue(kFrames[1].empty())

        # Insert kmers
        for i in range(len(kFrames)):
            for kmer in kmers_list:
                self.assertTrue(kFrames[i].insert(kmer[0], kmer[1]))

        # Erasing all kmers
        for i in range(len(kFrames)):
            for kmer in kmers_list:
                self.assertTrue(kFrames[i].erase(kmer[0]))

        # Check that all kmers have been erased
        for i in range(len(kFrames)):
            for kmer in kmers_list:
                self.assertEqual(kFrames[i].getCount(kmer[0]), 0)

    def test_iterateOverAllKmers(self):
        print(self._testMethodName)

        # Create random kmers list
        kmers_list = self.params.generate_kmers(kmers_no=20)

        # Empty kDataFrames
        kFrames = self.params.create_empty_kframes(2)
        self.assertTrue(kFrames[0].empty())
        self.assertTrue(kFrames[1].empty())

        # Insert kmers
        for i in range(len(kFrames)):
            for kmer in kmers_list:
                self.assertTrue(kFrames[i].insert(kmer[0], kmer[1]))

        # Get all inserted kmers counts
        inserted_counts = set([kmer[1] for kmer in kmers_list])

        for kFrame in kFrames:
            it = kFrame.begin()
            kframe_kmers_counts = set()
            while it != kFrame.end():
                count = it.getCount()
                kframe_kmers_counts.add(count)
                it.next()

            self.assertEqual(
                len(kframe_kmers_counts.intersection(inserted_counts)),
                len(inserted_counts))

    def test_saveAndIterateOverAllKmers(self):
        my_tmpdir = "tmp" + str(random.randint(1, 9999))
        os.mkdir(my_tmpdir)
        print(self._testMethodName)

        # Create random kmers list
        kmers_list = self.params.generate_kmers(kmers_no=20)

        # Empty kDataFrames
        kFrames = self.params.create_empty_kframes(2)
        self.assertTrue(kFrames[0].empty())
        self.assertTrue(kFrames[1].empty())

        # Insert kmers
        for i in range(len(kFrames)):
            for kmer in kmers_list:
                self.assertTrue(kFrames[i].insert(kmer[0], kmer[1]))

        # Get all inserted kmers counts
        inserted_counts = set([kmer[1] for kmer in kmers_list])
        kmers_hash_values = [
            self.params.Hasher.hash(kmer[0]) for kmer in kmers_list
        ]

        fileName = os.path.join(my_tmpdir, "tmp.kdataframe")

        for i in range(len(kFrames)):
            kFrames[i].save(fileName + "_" + str(i))

        loaded_kFrames = []
        for i in range(len(kFrames)):
            loaded_kFrames.append(kp.kDataFrame.load(fileName + "_" + str(i)))

        for kFrame in loaded_kFrames:
            it = kFrame.begin()
            kframe_kmers_counts = set()
            while (it != kFrame.end()):
                count = it.getCount()
                kframe_kmers_counts.add(count)
                self.assertTrue(it.getHashedKmer() in kmers_hash_values)
                self.assertTrue(it.getCount() in inserted_counts)
                it.next()

        shutil.rmtree(my_tmpdir)
예제 #6
0
class TestSetFunctions(unittest.TestCase):
    params = test_params(kSize=21, kDataFrameType="MQF")
    params.Hasher = kp.IntegerHasher(21)

    def test_kFrameUnion(self):
        print(self._testMethodName)

        # Empty kDataFrames
        kFrames_vec = self.params.create_empty_kframes(2)
        self.assertTrue(kFrames_vec[0].empty())
        self.assertTrue(kFrames_vec[1].empty())

        # Create random kmers list
        kmers_list1 = self.params.generate_kmers(kmers_no=20)
        inserted_counts1 = set([kmer[1] for kmer in kmers_list1])

        kmers_list2 = self.params.generate_kmers(kmers_no=20)
        inserted_counts2 = set([kmer[1] for kmer in kmers_list2])

        # Inserting Kmers
        inserted_kmers_hashes_1 = [self.params.Hasher.hash(kmer[0]) for kmer in kmers_list1]
        inserted_kmers_hashes_2 = [self.params.Hasher.hash(kmer[0]) for kmer in kmers_list2]
        inserted_kmers_hashes_1 = set(inserted_kmers_hashes_1)
        inserted_kmers_hashes_2 = set(inserted_kmers_hashes_2)
        union_kmers_hashes = inserted_kmers_hashes_1.union(inserted_kmers_hashes_2)

        for kmer in kmers_list1:
            self.assertTrue(kFrames_vec[0].insert(kmer[0], kmer[1]))

        for kmer in kmers_list2:
            self.assertTrue(kFrames_vec[1].insert(kmer[0], kmer[1]))

        # Apply kFrameUnion
        union_kFrame = kp.kFrameUnion(kFrames_vec)

        # Total kmers extracted from union kFrames_vec
        kmers_count = 0

        it = union_kFrame.begin()
        while it != union_kFrame.end():
            self.assertTrue(it.getHashedKmer() in union_kmers_hashes)
            kmers_count += 1
            it.next()

        self.assertEqual(kmers_count, len(union_kmers_hashes))

    def test_kFrameIntersect(self):
        print(self._testMethodName)

        # Empty kDataFrames
        kFrames_vec = self.params.create_empty_kframes(2)
        self.assertTrue(kFrames_vec[0].empty())
        self.assertTrue(kFrames_vec[1].empty())

        # Create random kmers list
        kmers_list1 = self.params.generate_kmers(kmers_no=20)
        inserted_counts1 = set([kmer[1] for kmer in kmers_list1])

        kmers_list2 = self.params.generate_kmers(kmers_no=20)
        # Replicate some kmers from kmers_list1 in kmers_list2 to make sure len(intersection) > 0
        kmers_list2 += kmers_list1[0:10]

        inserted_counts2 = set([kmer[1] for kmer in kmers_list2])

        # Inserting Kmers
        inserted_kmers_hashes_1 = [self.params.Hasher.hash(kmer[0]) for kmer in kmers_list1]
        inserted_kmers_hashes_2 = [self.params.Hasher.hash(kmer[0]) for kmer in kmers_list2]
        inserted_kmers_hashes_1 = set(inserted_kmers_hashes_1)
        inserted_kmers_hashes_2 = set(inserted_kmers_hashes_2)
        intersect_kmers_hashes = inserted_kmers_hashes_1.intersection(inserted_kmers_hashes_2)

        for kmer in kmers_list1:
            self.assertTrue(kFrames_vec[0].insert(kmer[0], kmer[1]))

        for kmer in kmers_list2:
            self.assertTrue(kFrames_vec[1].insert(kmer[0], kmer[1]))

        # Apply kFrameUnion
        intersect_kFrame = kp.kFrameIntersect(kFrames_vec)

        # Total kmers extracted from union kFrames_vec
        kmers_count = 0

        it = intersect_kFrame.begin()
        while it != intersect_kFrame.end():
            self.assertTrue(it.getHashedKmer() in intersect_kmers_hashes)
            kmers_count += 1
            it.next()


        self.assertEqual(kmers_count, len(intersect_kmers_hashes))

    def test_kFrameDiff(self):
        print(self._testMethodName)

        # Empty kDataFrames
        kFrames_vec = self.params.create_empty_kframes(2)
        self.assertTrue(kFrames_vec[0].empty())
        self.assertTrue(kFrames_vec[1].empty())

        # Create random kmers list
        kmers_list1 = self.params.generate_kmers(kmers_no=20)
        inserted_counts1 = set([kmer[1] for kmer in kmers_list1])

        kmers_list2 = self.params.generate_kmers(kmers_no=20)
        # Replicate some kmers from kmers_list1 in kmers_list2 to make sure len(intersection) > 0
        kmers_list2 += kmers_list1[0:10]

        inserted_counts2 = set([kmer[1] for kmer in kmers_list2])

        # Inserting Kmers
        inserted_kmers_hashes_1 = [self.params.Hasher.hash(kmer[0]) for kmer in kmers_list1]
        inserted_kmers_hashes_2 = [self.params.Hasher.hash(kmer[0]) for kmer in kmers_list2]
        inserted_kmers_hashes_1 = set(inserted_kmers_hashes_1)
        inserted_kmers_hashes_2 = set(inserted_kmers_hashes_2)
        difference_kmers_hashes = inserted_kmers_hashes_1.difference(inserted_kmers_hashes_2)

        for kmer in kmers_list1:
            self.assertTrue(kFrames_vec[0].insert(kmer[0], kmer[1]))

        for kmer in kmers_list2:
            self.assertTrue(kFrames_vec[1].insert(kmer[0], kmer[1]))

        # Apply kFrameUnion
        diff_kFrame = kp.kFrameDiff(kFrames_vec)

        # Total kmers extracted from union kFrames_vec
        kmers_count = 0

        it = diff_kFrame.begin()
        while it != diff_kFrame.end():
            self.assertTrue(it.getHashedKmer() in difference_kmers_hashes)
            kmers_count += 1
            it.next()

        self.assertEqual(kmers_count, len(difference_kmers_hashes))