Exemplo n.º 1
0
 def test_build_contig_db_from_fasta(self):
     """Fill an in-memory ContigDB from KMER_FASTA and expect >= 3 contigs."""
     contig_db = ContigDB(
         sqlite3.connect(':memory:'),
         ramifier=RotatingRamifier.from_file(4, KMER_ROTATION),
         box_side_len=0.5,
     )
     contig_db.fast_add_kmers_from_fasta(KMER_FASTA)
     contig_db.commit()
     n_contigs = len(contig_db.get_all_contigs())
     self.assertGreaterEqual(n_contigs, 3)
Exemplo n.º 2
0
 def test_build_contig_db(self):
     """Add one random contig to an in-memory ContigDB and expect >= 2 rows."""
     contig_db = ContigDB(
         sqlite3.connect(':memory:'),
         ramifier=RotatingRamifier.from_file(4, KMER_ROTATION),
         box_side_len=0.5,
     )
     sequence = random_kmer(2 * 10 * 1000)
     contig_db.py_add_contig('test_genome___test_contig', sequence, gap=100)
     contig_db.commit()
     n_contigs = len(contig_db.get_all_contigs())
     self.assertGreaterEqual(n_contigs, 2)
Exemplo n.º 3
0
 def test_add_kmer_to_pre(self):
     """Add one k-mer to a PreDB and expect it as the only `kmers` row."""
     rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
     in_memory = sqlite3.connect(':memory:')
     pre_db = PreDB(in_memory, ramifier=rotation)
     pre_db.py_add_kmer(KMER_31)
     pre_db.commit()
     rows = list(pre_db.conn.execute('SELECT * FROM kmers'))
     self.assertEqual(len(rows), 1)
     # The encoded k-mer is read from the second column of each row.
     decoded = [reverse_convert_kmer(row[1]) for row in rows]
     self.assertIn(KMER_31, decoded)
Exemplo n.º 4
0
 def test_save(self):
     """Write a GridCoverDB to a temporary sqlite file and close it.

     The temporary file is removed even when adding the point or closing
     the database raises, so a failed run does not leave a stale database
     next to the test module.
     """
     db_path = join(dirname(__file__), 'temp.db_save_temp.sqlite')
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
     conn = sqlite3.connect(db_path)
     try:
         db = GridCoverDB(conn, ramifier=ramifier, box_side_len=0.5)
         db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
         db.close()
     finally:
         # Deliberately not guarded against FileNotFoundError: a missing
         # file still makes the test error out.
         remove(db_path)
Exemplo n.º 5
0
 def test_get_centroids(self):
     """Three points at two distinct coordinates should give a (2, 4) array."""
     rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
     grid_db = GridCoverDB(sqlite3.connect(':memory:'),
                           ramifier=rotation,
                           box_side_len=0.5)
     # (coordinates, final base appended to KMER_30)
     points = (
         ([0., 0., 0., 0.], 'A'),
         ([0., 0., 0., 0.], 'T'),
         ([1., 0., 0., 0.], 'C'),
     )
     for coords, last_base in points:
         grid_db.py_add_point_to_cluster(np.array(coords),
                                         KMER_30 + last_base)
     grid_db.commit()
     self.assertEqual(grid_db.centroids().shape, (2, 4))
Exemplo n.º 6
0
 def test_add_kmer(self):
     """Add one k-mer at the origin and expect it as cluster 0's only member."""
     rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
     grid_db = GridCoverDB(sqlite3.connect(':memory:'),
                           ramifier=rotation,
                           box_side_len=0.5)
     origin = np.array([0., 0., 0., 0.])
     grid_db.py_add_point_to_cluster(origin, KMER_31)
     grid_db.commit()
     cluster = grid_db.py_get_cluster_members(0)
     self.assertEqual(len(cluster), 1)
     decoded = [reverse_convert_kmer(encoded) for encoded in cluster]
     self.assertIn(KMER_31, decoded)
Exemplo n.º 7
0
 def test_search_contig_db(self):
     """Search a ContigDB with a slice of its own contig and expect a hit."""
     contig_db = ContigDB(
         sqlite3.connect(':memory:'),
         ramifier=RotatingRamifier.from_file(4, KMER_ROTATION),
         box_side_len=0.5,
     )
     sequence = random_kmer(2 * 10 * 1000)
     contig_db.py_add_contig('test_genome___test_contig', sequence, gap=10)
     contig_db.commit()
     contig_db.get_all_contigs()  # result is not used by the assertion below
     query = sequence[500:600]
     hits = ContigSearcher(contig_db).py_search(query, 0.1, 0.5)
     self.assertGreaterEqual(len(hits), 1)
Exemplo n.º 8
0
def add_rotation_dists(dimensions, kmer_cols, outfile, rotation, dist_table):
    """Append a rotation-distance column to an existing distance table.

    The first line of `dist_table` is treated as the header and gets a
    `rotation_dist_<dimensions>` column appended. For every following line
    the two k-mers at comma-separated positions `kmer_cols[0]` and
    `kmer_cols[1]` are ramified, and `np.linalg.norm` of their difference is
    appended as a new last field. Each resulting line is written to
    `outfile`.
    """
    first_line = dist_table.readline().strip()
    outfile.write(first_line + f',rotation_dist_{dimensions}\n')
    ramifier = RotatingRamifier.from_file(dimensions, rotation)
    for raw in dist_table:
        record = raw.strip()
        fields = record.split(',')
        kmer_a = fields[kmer_cols[0]]
        kmer_b = fields[kmer_cols[1]]
        vec_a = ramifier.ramify(kmer_a)
        vec_b = ramifier.ramify(kmer_b)
        dist = np.linalg.norm(vec_a - vec_b)
        outfile.write(f'{record},{dist}\n')
Exemplo n.º 9
0
 def test_build_grid_cover_from_fasta(self):
     """Build a grid cover from KMER_FASTA and check cluster and k-mer counts."""
     rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
     grid_db = GridCoverDB(sqlite3.connect(':memory:'),
                           ramifier=rotation,
                           box_side_len=0.5)
     builder = GridCoverBuilder(grid_db)
     builder.fast_add_kmers_from_fasta(KMER_FASTA)
     builder.commit()
     n_centers = builder.db.centroids().shape[0]
     n_points = len(builder.db.get_kmers())
     # Expect at least one cluster, fewer clusters than k-mers, and all
     # 98 k-mers stored.
     self.assertGreater(n_centers, 0)
     self.assertLess(n_centers, 98)
     self.assertEqual(n_points, 98)
Exemplo n.º 10
0
def build_grid_cover_fasta(dimension, threads, outfile, rotation, fasta_list):
    """Add the k-mers of every listed fasta file to the PreDB at `outfile`.

    Args:
        dimension: passed with `rotation` to `RotatingRamifier.from_file`.
        threads: exported as OPENBLAS_NUM_THREADS and MKL_NUM_THREADS.
        outfile: filepath handed to `PreDB.load_from_filepath`.
        rotation: rotation source handed to `RotatingRamifier.from_file`.
        fasta_list: iterable of lines, one fasta filename per line. Lines
            that are empty after stripping are skipped.

    A one-line summary is echoed to stderr when done.
    """
    # numpy uses one of these two libraries
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'
    environ['MKL_NUM_THREADS'] = f'{threads}'
    # Skip blank lines so a trailing newline in the list is not taken for
    # a file name.
    fasta_list = [
        name for name in (line.strip() for line in fasta_list) if name
    ]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    predb = PreDB.load_from_filepath(outfile, ramifier=ramifier)
    start = time()
    # Bound up front so the summary still prints when no fasta is listed.
    # NOTE(review): the summary reports the value returned for the LAST
    # fasta only; confirm whether that value is per-file or cumulative.
    n_added = 0
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added = predb.fast_add_kmers_from_fasta(fasta_filename)
    predb.close()
    add_time = time() - start
    click.echo(f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s.',
               err=True)
Exemplo n.º 11
0
 def test_save_and_reload(self):
     """Save a GridCoverDB to disk, reload it and check the stored k-mer.

     The temporary sqlite file is removed in a `finally` clause. Without
     that, a failing assertion would leave the file behind and the next
     run would add a second point to the stale database.
     """
     db_path = join(dirname(__file__), 'temp.db_save_temp.sqlite')
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
     conn = sqlite3.connect(db_path)
     try:
         db = GridCoverDB(conn, ramifier=ramifier, box_side_len=0.5)
         db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
         db.close()
         del db  # drop the writer before reading the file back
         reloaded = GridCoverDB.load_from_filepath(db_path)
         members = reloaded.py_get_cluster_members(0)
         self.assertEqual(len(members), 1)
         self.assertIn(KMER_31,
                       [reverse_convert_kmer(member) for member in members])
     finally:
         remove(db_path)
Exemplo n.º 12
0
def build_grid_cover(radius, dimension, threads, num_kmers, start_offset,
                     outfile, preload, rotation, kmer_table):
    """Build a grid cover at `outfile` from a k-mer table and report timing.

    `threads` is exported to the BLAS thread-count environment variables,
    `num_kmers` is forwarded as `num_to_add`, and a summary with the number
    of k-mers added and clusters found is echoed to stderr.
    `start_offset` and `preload` are accepted but not used here.
    """
    # numpy uses one of these two libraries
    for variable in ('OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS'):
        environ[variable] = f'{threads}'
    builder = GridCoverBuilder.from_filepath(
        outfile, RotatingRamifier.from_file(dimension, rotation), radius)
    began = time()
    n_added = builder.fast_add_kmers_from_file(kmer_table,
                                               num_to_add=num_kmers)
    builder.commit()
    n_centers = builder.db.centroids().shape[0]
    builder.close()
    elapsed = time() - began
    summary = (f'Added {n_added:,} kmers to {outfile} in {elapsed:.5}s. '
               f'{n_centers:,} clusters.')
    click.echo(summary, err=True)
Exemplo n.º 13
0
 def test_pre_build_blooms(self):
     """Build bloom grids for both centroids and check lookups against them."""
     rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
     grid_db = GridCoverDB(sqlite3.connect(':memory:'),
                           ramifier=rotation,
                           box_side_len=0.5)
     # (coordinates, final base appended to KMER_30)
     points = (
         ([0., 0., 0., 0.], 'A'),
         ([0., 0., 0., 0.], 'T'),
         ([1., 0., 0., 0.], 'C'),
     )
     for coords, last_base in points:
         grid_db.py_add_point_to_cluster(np.array(coords),
                                         KMER_30 + last_base)
     grid_db.commit()
     for centroid_id in (0, 1):
         grid_db.build_and_store_bloom_grid(centroid_id)
     grid_0 = grid_db.retrieve_bloom_grid(0)
     grid_1 = grid_db.retrieve_bloom_grid(1)
     for bloom_grid, last_base in ((grid_0, 'A'), (grid_1, 'C')):
         best = max(bloom_grid.py_count_grid_contains(KMER_30 + last_base))
         self.assertEqual(best, 32 - bloom_grid.col_k)
     # Only centroids 0 and 1 had a bloom grid built.
     with self.assertRaises(IndexError):
         grid_db.retrieve_bloom_grid(2)
Exemplo n.º 14
0
def build_grid_cover_fasta(radius, dimension, threads, outfile, rotation,
                           fasta_list):
    """Add the k-mers of every listed fasta file to a grid cover at `outfile`.

    Args:
        radius: forwarded to `GridCoverBuilder.from_filepath`.
        dimension: passed with `rotation` to `RotatingRamifier.from_file`.
        threads: exported as OPENBLAS_NUM_THREADS and MKL_NUM_THREADS.
        outfile: filepath handed to `GridCoverBuilder.from_filepath`.
        rotation: rotation source handed to `RotatingRamifier.from_file`.
        fasta_list: iterable of lines, one fasta filename per line. Lines
            that are empty after stripping are skipped.

    A one-line summary, including the number of clusters, is echoed to
    stderr when done.
    """
    # numpy uses one of these two libraries
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'
    environ['MKL_NUM_THREADS'] = f'{threads}'
    # Skip blank lines so a trailing newline in the list is not taken for
    # a file name.
    fasta_list = [
        name for name in (line.strip() for line in fasta_list) if name
    ]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = GridCoverBuilder.from_filepath(outfile, ramifier, radius)
    start = time()
    # Bound up front so the summary still prints when no fasta is listed.
    # NOTE(review): the summary reports the value returned for the LAST
    # fasta only; confirm whether that value is per-file or cumulative.
    n_added = 0
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added = grid.fast_add_kmers_from_fasta(fasta_filename)
    n_centers = grid.db.centroids().shape[0]
    grid.close()
    add_time = time() - start
    click.echo((f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s. '
                f'{n_centers:,} clusters.'),
               err=True)
Exemplo n.º 15
0
    def test_search_bigger_contig_db_exact(self):
        """Index three contigs at gap=1 and expect one hit for an exact query.

        With a very small box side length every k-mer start position is
        expected to produce its own centroid.
        """
        rotation_db = sqlite3.connect(':memory:')
        ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
        contig_db = ContigDB(rotation_db,
                             ramifier=ramifier,
                             box_side_len=0.0001)
        n_contigs, contig_len = 3, 2 * 10 * 1000
        contigs = [random_kmer(contig_len) for _ in range(n_contigs)]
        for index, sequence in enumerate(contigs):
            name = f'test_genome_{index}___test_contig_{index}'
            contig_db.py_add_contig(name, sequence, gap=1)
        contig_db.commit()
        # One 31-mer per start position of every contig.
        expected_centroids = n_contigs * (contig_len - 31 + 1)
        self.assertEqual(contig_db.centroids().shape[0], expected_centroids)

        query = contigs[0][500:600]
        hits = ContigSearcher(contig_db).py_search(query, 0, 1)
        self.assertEqual(len(hits), 1)
Exemplo n.º 16
0
def build_contig_cover_fasta(radius, dimension, threads, outfile, rotation,
                             fasta_list):
    """Add the k-mers of every listed fasta file to a ContigDB at `outfile`.

    Args:
        radius: used as the `box_side_len` of the ContigDB.
        dimension: passed with `rotation` to `RotatingRamifier.from_file`.
        threads: exported as OPENBLAS_NUM_THREADS and MKL_NUM_THREADS.
        outfile: sqlite database path opened with `sqlite3.connect`.
        rotation: rotation source handed to `RotatingRamifier.from_file`.
        fasta_list: iterable of lines, one fasta filename per line. Lines
            that are empty after stripping are skipped.

    Progress and a one-line summary are echoed to stderr.
    """
    # numpy uses one of these two libraries
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'
    environ['MKL_NUM_THREADS'] = f'{threads}'
    # Skip blank lines so a trailing newline in the list is not taken for
    # a file name.
    fasta_list = [
        name for name in (line.strip() for line in fasta_list) if name
    ]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = ContigDB(sqlite3.connect(outfile),
                    ramifier=ramifier,
                    box_side_len=radius)
    click.echo(f'Adding {len(fasta_list)} fastas.', err=True)
    start = time()
    # Bound up front so the summary still prints when no fasta is listed.
    # NOTE(review): the summary reports the value returned for the LAST
    # fasta only; confirm whether that value is per-file or cumulative.
    n_added = 0
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added = grid.fast_add_kmers_from_fasta(fasta_filename)
    grid.close()
    add_time = time() - start
    click.echo(f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s. ',
               err=True)
Exemplo n.º 17
0
 def test_fileio_contig_db(self):
     """Store a ContigDB on disk, reload it and compare the cached state.

     The reloaded database must report the same `current_seq_coord` and an
     identical `centroid_cache`. The temporary sqlite file is removed in a
     `finally` clause so a failing assertion does not leave it behind.
     """
     fname = 'temp.test_contig_db.sqlite'
     try:
         # Start clean if an earlier run left the file in place.
         remove(fname)
     except FileNotFoundError:
         pass
     conn = sqlite3.connect(fname)
     try:
         ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
         contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=1)
         contig = random_kmer(2 * 10 * 1000)
         contig_db.py_add_contig('test_genome___test_contig',
                                 contig,
                                 gap=100)
         contig_db.commit()
         from_store = ContigDB.load_from_filepath(fname)
         self.assertEqual(contig_db.current_seq_coord,
                          from_store.current_seq_coord)
         self.assertEqual(len(contig_db.centroid_cache),
                          len(from_store.centroid_cache))
         for key, val in contig_db.centroid_cache.items():
             self.assertIn(key, from_store.centroid_cache)
             self.assertEqual(val, from_store.centroid_cache[key])
     finally:
         remove(fname)
Exemplo n.º 18
0
 def test_merge_dbs(self):
     """Load one GridCoverDB into another; expect 3 centroids and 4 k-mers."""
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)

     def build_db(points):
         # Create an in-memory DB holding the given (coords, base) points.
         grid_db = GridCoverDB(sqlite3.connect(':memory:'),
                               ramifier=ramifier,
                               box_side_len=0.5)
         for coords, last_base in points:
             grid_db.py_add_point_to_cluster(np.array(coords),
                                             KMER_30 + last_base)
         grid_db.commit()
         return grid_db

     db1 = build_db((([0., 0., 0., 0.], 'A'), ([1., 0., 0., 0.], 'T')))
     db2 = build_db((([0., 0., 0., 0.], 'C'), ([1., 1., 0., 0.], 'G')))
     db1.load_other(db2)
     self.assertEqual(db1.centroids().shape, (3, 4))
     merged_kmers = [row[1] for row in db1.get_kmers()]
     self.assertEqual(len(merged_kmers), 4)
     for last_base in 'ATCG':
         self.assertIn(KMER_30 + last_base, merged_kmers)
Exemplo n.º 19
0
def calibrate_db(dropout, gap, burst, kmer_len, outfile, rotation, fasta):
    """Sample k-mers from a fasta file and write a table of pairwise distances.

    Windows of length `kmer_len` are taken every `gap` positions of each
    sequence. At each position `burst` draws are made and the window is
    kept whenever `random.random() < dropout`. The table built from
    `py_needle` (columns `k1`, `k2`, `f_lev`) is extended with:

    * `rc_lev`: the third field of `py_needle` for `k1` against the reverse
      complement of `k2`,
    * `lev`: the smaller of `f_lev` and `rc_lev`,
    * `ram`: the sum of absolute differences between the ramified k-mers.

    The result is written to `outfile` with `DataFrame.to_csv`.
    """
    sequences = [str(record.seq) for record in SeqIO.parse(fasta, 'fasta')]
    sampled = set()
    for sequence in sequences:
        # NOTE(review): the range stops before `len(sequence) - kmer_len`,
        # so the last full-length window is never sampled. Confirm intent.
        for begin in range(0, len(sequence) - kmer_len, gap):
            # NOTE(review): no per-draw offset is applied, so all `burst`
            # draws consider the same window. Confirm intent.
            for _ in range(burst):
                if random.random() < dropout:
                    sampled.add(sequence[begin:begin + kmer_len])

    click.echo(f'{len(sampled)} kmers', err=True)
    pairs = py_needle(list(sampled))
    dist_tbl = pd.DataFrame(pairs, columns=['k1', 'k2', 'f_lev'])

    # NOTE(review): from_file receives only `rotation` here; confirm this
    # matches its signature.
    ramifier = (Ramifier(kmer_len) if rotation is None
                else RotatingRamifier.from_file(rotation))

    def rc_lev(row):
        forward, flipped = row['k1'], reverseComplement(row['k2'])
        return py_needle([forward, flipped])[0][2]

    def best_lev(row):
        return min(row['f_lev'], row['rc_lev'])

    def ram_dist(row):
        vec_1, vec_2 = ramifier.ramify(row['k1']), ramifier.ramify(row['k2'])
        return np.abs(vec_1 - vec_2).sum()

    dist_tbl['rc_lev'] = dist_tbl.apply(rc_lev, axis=1)
    dist_tbl['lev'] = dist_tbl.apply(best_lev, axis=1)
    dist_tbl['ram'] = dist_tbl.apply(ram_dist, axis=1)
    dist_tbl.to_csv(outfile)
Exemplo n.º 20
0
    def test_build_merge_contig_db(self):
        """Merge two single-contig ContigDBs and expect the summed row count."""
        ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
        contig_names = ('test_genome_1___test_contig_1',
                        'test_genome_2___test_contig_2')
        databases = []
        expected_total = 0
        for contig_name in contig_names:
            contig_db = ContigDB(sqlite3.connect(':memory:'),
                                 ramifier=ramifier,
                                 box_side_len=0.5)
            contig_db.py_add_contig(contig_name,
                                    random_kmer(2 * 10 * 1000),
                                    gap=100)
            contig_db.commit()
            expected_total += len(contig_db.get_all_contigs())
            databases.append(contig_db)

        target, other = databases
        target.load_other(other)

        self.assertEqual(len(target.get_all_contigs()), expected_total)