Exemplo n.º 1
0
def convert_index(infile, config, num_samples):
    """Convert the graph database under *infile* into a kmer signature index.

    Opens ``infile + "/graph"`` read-only, records the bloom filter settings
    from *config* in the storage returned by ``get_storage(config)`` and
    writes the rows produced by ``get_rows`` into a new ``BitMatrix``.

    Args:
        infile: path prefix; the database is read from ``<infile>/graph``.
        config: mapping providing ``"m"`` (stored under
            ``BLOOMFILTER_SIZE_KEY`` and used as the number of rows) and
            ``"h"`` (stored under ``NUM_HASH_FUNCTS_KEY``); it is also passed
            to ``get_storage``.
        num_samples: number of columns of the created bit matrix.
    """
    in_graph = db.DB()
    # The handle must be released even when opening the database, setting up
    # the storage or building the matrix raises, hence the try/finally.
    try:
        in_graph.set_cachesize(4, 0)
        in_graph.open(infile + "/graph", flags=db.DB_RDONLY)

        # Create the kmer signature index
        storage = get_storage(config)
        storage.set_integer(BLOOMFILTER_SIZE_KEY, config["m"])
        storage.set_integer(NUM_HASH_FUNCTS_KEY, config["h"])
        # The rows are read from in_graph, so the handle has to stay open
        # until BitMatrix.create has returned.
        BitMatrix.create(
            storage=storage,
            rows=get_rows(in_graph, config["m"]),
            num_rows=config["m"],
            num_cols=num_samples,
        )
    finally:
        in_graph.close()
Exemplo n.º 2
0
 def create(cls, storage, bloomfilters, bloomfilter_size, num_hashes, lowmem=False):
     """Build a new index in *storage* from one bloom filter per column.

     Args:
         storage: backend that receives the two settings and the bit matrix.
         bloomfilters: iterable whose ``BloomFilter`` members are replaced
             by their ``bitarray`` attribute; other members are used as-is.
         bloomfilter_size: stored under ``BLOOMFILTER_SIZE_KEY`` and used as
             the number of matrix rows.
         num_hashes: stored under ``NUM_HASH_FUNCTS_KEY``.
         lowmem: forwarded to ``transpose``.

     Returns:
         A new instance of ``cls`` constructed from *storage*.
     """
     bloomfilters = [
         bf.bitarray if isinstance(bf, BloomFilter) else bf for bf in bloomfilters
     ]
     storage.set_integer(BLOOMFILTER_SIZE_KEY, bloomfilter_size)
     storage.set_integer(NUM_HASH_FUNCTS_KEY, num_hashes)
     logger.debug("Transpose bitarrays")
     rows = transpose(bloomfilters, lowmem=lowmem)
     logger.debug("Insert rows")
     # Called for its effect on storage only; cls(storage) below builds its
     # own matrix wrapper, so the returned object is not kept.
     BitMatrix.create(
         storage, rows, num_rows=bloomfilter_size, num_cols=len(bloomfilters)
     )
     return cls(storage)
Exemplo n.º 3
0
def test_get_insert_column():
    """Insert columns into a 25x3 matrix and read them back on every storage.

    First an existing column (index 0) is overwritten with ones, then a new
    column is added at index 3; row 1 is checked after each step.
    """
    pattern = [bitarray(bits) for bits in ("001", "001", "111", "001", "111")]
    rows = pattern * 5
    ones = "1" * 25
    for storage in get_storages():
        storage.delete_all()
        bm = BitMatrix.create(storage, rows, len(rows), len(rows[0]))

        # Column 0 initially mirrors the first bit of every row.
        assert bm.get_column(0) == bitarray("00101" * 5)

        # Overwrite the existing first column.
        bm.insert_column(bitarray(ones), 0)
        assert bm.get_column(0) == bitarray(ones)
        assert bm.get_row(1) == bitarray("101")

        # Add a column one past the original width.
        bm.insert_column(bitarray(ones), 3)
        assert bm.get_column(3) == bitarray(ones)
        assert bm.get_row(1) == bitarray("1011")
Exemplo n.º 4
0
def test_get_set():
    """Set all rows of a 25x3 matrix and read rows and columns back.

    Runs against every storage returned by ``get_storages``.
    """
    pattern = [bitarray(bits) for bits in ("001", "001", "111", "001", "111")]
    rows = pattern * 5
    first_column = "00101" * 5
    last_column = "1" * 25
    for storage in get_storages():
        storage.delete_all()
        bm = BitMatrix.create(storage, rows, len(rows), len(rows[0]))
        bm.set_rows(range(25), rows)

        fetched_rows = list(bm.get_rows(range(3)))
        assert fetched_rows == rows[:3]

        assert bm.get_column(0) == bitarray(first_column)
        assert bm.get_column(2) == bitarray(last_column)

        fetched_columns = list(bm.get_columns([0, 2]))
        assert fetched_columns == [bitarray(first_column), bitarray(last_column)]
Exemplo n.º 5
0
 def __init__(self, storage):
     """Bind the index to *storage* and load its persisted settings.

     The bloom filter size and the number of hash functions are read from
     *storage* under ``BLOOMFILTER_SIZE_KEY`` and ``NUM_HASH_FUNCTS_KEY``.
     """
     self.storage = storage
     self.bitmatrix = BitMatrix(storage)
     read_setting = storage.get_integer
     self.bloomfilter_size = read_setting(BLOOMFILTER_SIZE_KEY)
     self.num_hashes = read_setting(NUM_HASH_FUNCTS_KEY)
Exemplo n.º 6
0
class KmerSignatureIndex:

    """
    Methods for managing kmer signature indexes

    The index wraps a ``BitMatrix`` built on ``storage``. The bloom filter
    size and the number of hash functions are kept in the same storage under
    ``BLOOMFILTER_SIZE_KEY`` and ``NUM_HASH_FUNCTS_KEY``.
    """

    def __init__(self, storage):
        """Bind the index to *storage* and load its persisted settings.

        Both settings are read back from *storage*; ``create`` is the method
        in this class that writes them.
        """
        self.storage = storage
        self.bitmatrix = BitMatrix(storage)
        self.bloomfilter_size = storage.get_integer(BLOOMFILTER_SIZE_KEY)
        self.num_hashes = storage.get_integer(NUM_HASH_FUNCTS_KEY)

    @classmethod
    def create(cls, storage, bloomfilters, bloomfilter_size, num_hashes, lowmem=False):
        """Build a new index in *storage* from one bloom filter per column.

        Args:
            storage: backend that receives the two settings and the matrix.
            bloomfilters: iterable whose ``BloomFilter`` members are replaced
                by their ``bitarray`` attribute; other members are used as-is.
            bloomfilter_size: stored under ``BLOOMFILTER_SIZE_KEY`` and used
                as the number of matrix rows.
            num_hashes: stored under ``NUM_HASH_FUNCTS_KEY``.
            lowmem: forwarded to ``transpose``.

        Returns:
            A new instance of ``cls`` constructed from *storage*.
        """
        bloomfilters = [
            bf.bitarray if isinstance(bf, BloomFilter) else bf for bf in bloomfilters
        ]
        storage.set_integer(BLOOMFILTER_SIZE_KEY, bloomfilter_size)
        storage.set_integer(NUM_HASH_FUNCTS_KEY, num_hashes)
        logger.debug("Transpose bitarrays")
        rows = transpose(bloomfilters, lowmem=lowmem)
        logger.debug("Insert rows")
        # Called for its effect on storage only; cls(storage) below builds
        # its own matrix wrapper, so the returned object is not kept.
        BitMatrix.create(
            storage, rows, num_rows=bloomfilter_size, num_cols=len(bloomfilters)
        )
        return cls(storage)

    def lookup(self, kmers, remove_trailing_zeros=True):
        """Return, for each query kmer, the AND of the rows it hashes to.

        Args:
            kmers: a single kmer string or an iterable of kmers; duplicates
                are collapsed.
            remove_trailing_zeros: forwarded to ``BitMatrix.get_rows``.

        Returns:
            dict mapping each distinct query kmer to the result of
            ``bitwise_and`` over the rows selected by its hashes.
        """
        if isinstance(kmers, str):
            kmers = [kmers]
        # __kmers_to_hashes de-duplicates the kmers itself.
        kmer_to_hashes = self.__kmers_to_hashes(kmers)
        # Fetch every needed row once, even when kmers share hash positions.
        hashes = set().union(*kmer_to_hashes.values())
        rows = self.__batch_get_rows(hashes, remove_trailing_zeros)
        return self.__bitwise_and_kmers(kmer_to_hashes, rows)

    def insert_bloom(self, bloomfilter, column_index):
        """Insert *bloomfilter* as column *column_index* of the bit matrix.

        A ``BloomFilter`` is unwrapped to its ``bitarray`` first, mirroring
        what ``create`` accepts; any other value is passed through unchanged.
        """
        if isinstance(bloomfilter, BloomFilter):
            bloomfilter = bloomfilter.bitarray
        self.bitmatrix.insert_column(bloomfilter, column_index)

    def merge_indexes(self, ksi):
        """Append the columns of *ksi* to this index, row by row.

        Args:
            ksi: another index exposing ``bloomfilter_size`` and
                ``bitmatrix``.

        Raises:
            ValueError: if the two indexes have different bloom filter
                sizes; their rows would not line up, so nothing is modified.
        """
        if ksi.bloomfilter_size != self.bloomfilter_size:
            raise ValueError(
                "Cannot merge indexes with different bloom filter sizes: "
                "%s != %s" % (self.bloomfilter_size, ksi.bloomfilter_size)
            )
        own, other = self.bitmatrix, ksi.bitmatrix
        for i in range(self.bloomfilter_size):
            merged = own.get_row(i)
            merged.extend(other.get_row(i))
            own.set_row(i, merged)
        own.set_num_cols(own.num_cols + other.num_cols)

    def __kmers_to_hashes(self, kmers):
        """Map each distinct kmer to the set of its hash values."""
        # use canonical kmer to generate lookup, but report query kmer
        return {
            k: set(
                generate_hashes(
                    convert_query_kmer(k), self.num_hashes, self.bloomfilter_size
                )
            )
            for k in set(kmers)
        }

    def __batch_get_rows(self, row_indexes, remove_trailing_zeros=False):
        """Fetch the given rows and return them keyed by row index."""
        # Materialise once: the indexes are walked both by get_rows and by
        # zip, which would mis-pair them for a one-shot iterator.
        row_indexes = list(row_indexes)
        fetched = self.bitmatrix.get_rows(
            row_indexes, remove_trailing_zeros=remove_trailing_zeros
        )
        return dict(zip(row_indexes, fetched))

    def __bitwise_and_kmers(self, kmer_to_hashes, rows):
        """AND together, per kmer, the rows selected by that kmer's hashes."""
        return {
            k: bitwise_and([rows[h] for h in hashes])
            for k, hashes in kmer_to_hashes.items()
        }