def genomicsqlite_txdb(txdb): """ import bioC TxDb sourced originally from: http://bioconductor.org/packages/release/BiocViews.html#___TxDb """ outfile = txdb[:-7] + ".genomicsqlite" conn = sqlite3.connect(txdb) conn.executescript(genomicsqlite.vacuum_into_sql(conn, outfile)) conn.close() # create GRIs on the three feature tables conn = genomicsqlite.connect(outfile) conn.executescript( genomicsqlite.create_genomic_range_index_sql(conn, "transcript", "tx_chrom", "tx_start", "tx_end", floor=2)) conn.executescript( genomicsqlite.create_genomic_range_index_sql(conn, "cds", "cds_chrom", "cds_start", "cds_end", floor=2)) # intentionally left exon unindexed conn.close() return outfile
def test_gri_levels_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() results = list( con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')") con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="missing genomic range index"): con.execute( "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')" ) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")) results = list( con.execute( "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')" )) assert results == [(15, 0)]
def test_attach(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con, gri=False) con.commit() attach_script = genomicsqlite.attach_sql(con, str(tmp_path / "test_attached.gsql"), "db2", unsafe_load=True) con.executescript(attach_script) con.executescript("CREATE TABLE db2.exons2 AS SELECT * FROM exons") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "db2.exons2", "rid", "beg", "end")) ref_script = genomicsqlite.put_reference_assembly_sql( con, "GRCh38_no_alt_analysis_set", schema="db2") con.executescript(ref_script) query = ( "SELECT exons.id, db2.exons2.id FROM exons LEFT JOIN db2.exons2 ON db2.exons2._rowid_ IN\n" + genomicsqlite.genomic_range_rowids_sql( con, "db2.exons2", "exons.rid", "exons.beg", "exons.end") + " AND exons.id != db2.exons2.id ORDER BY exons.id, db2.exons2.id") results = list(con.execute(query)) assert len(results) == 5191 refseq_by_name = genomicsqlite.get_reference_sequences_by_name( con, schema="db2") assert len(refseq_by_name) > 24
def test_gri_lvl(): # Test the _gri_lvl generated column which calculates each feature's level number based on its # length. con = sqlite3.connect(":memory:") con.executescript( "CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER, expected_lvl INTEGER)" ) for lvl in range(16): for ofs in (-2, -1, 0, 1): featlen = 16 ** lvl + ofs tup = (420, 420 + featlen, (0 - lvl if ofs < 1 else 0 - lvl - 1)) con.execute("INSERT INTO features VALUES(42,?,?,?)", tup) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end") ) assert ( next( con.execute("SELECT count(*) FROM features WHERE expected_lvl == ifnull(_gri_lvl,999)") )[0] == 62 ) assert ( next( con.execute("SELECT count(*) FROM features WHERE expected_lvl != ifnull(_gri_lvl,999)") )[0] == 2 )
def _fill_exons(con, floor=None, table="exons", gri=True, len_gri=False): con.execute( f"CREATE TABLE {table}(rid TEXT NOT NULL, beg INTEGER NOT NULL, end INTEGER NOT NULL, len INTEGER NOT NULL, id TEXT NOT NULL)" ) for line in _EXONS.strip().split("\n"): line = line.split("\t") con.execute( f"INSERT INTO {table}(rid,beg,end,len,id) VALUES(?,?,?,?,?)", (line[0], int(line[1]) - 1, int(line[2]), int(line[2]) - int(line[1]) + 1, line[3]), ) if gri: con.executescript( genomicsqlite.create_genomic_range_index_sql( con, table, "rid", "beg", ("beg+len" if len_gri else "end"), floor=floor ) )
def test_abutment(): # Test GRI query correctness with various cases of features abutting the query range con = sqlite3.connect(":memory:") con.executescript("CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)") pos0 = 10000000000 for lvl in range(9): for ofs in (-1, 0, 1): for tup in ((pos0 - 16 ** lvl, pos0 + ofs), (pos0 + 123 + ofs, pos0 + 123 + 16 ** 9)): con.execute("INSERT INTO features VALUES(42,?,?)", tup) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end") ) query = genomicsqlite.genomic_range_rowids_sql( con, "features", "42", str(pos0), str(pos0 + 123) )[1:-1] control_query = f"SELECT _rowid_ FROM features NOT INDEXED WHERE rid = 42 AND NOT (end < {pos0} OR beg > {pos0+123}) ORDER BY _rowid_" assert list(con.execute(query)) == list(con.execute(control_query))
def test_gri_levels_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() # test caching & invalidation: results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')")) assert results == [(3, 1)] tch1 = con.total_changes con.execute("INSERT INTO exons VALUES('ether',0,4097,4097,'ether')") tch2 = con.total_changes assert tch2 > tch1 results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(4, 1)] con.commit() results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(4, 1)] con.execute("DELETE FROM exons WHERE rid = 'ether'") con.commit() results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')")) assert results == [(3, 1)] results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')") con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="missing genomic range index"): con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end") ) results = list( con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')") ) assert results == [(15, 0)]
def main(): header = f"""\ -- GenomicSQLite smoke test -- timestamp: {time.asctime(time.gmtime())} platform: {platform.platform()} uname: {os.uname()} python: {platform.python_implementation()} {platform.python_version()}""" print(textwrap.dedent(header)) print("cpu", end="") try: if platform.system() == "Linux": with open("/proc/cpuinfo") as cpuinfo: modelname = (line.strip().replace("\t", "") for line in cpuinfo if line.lower().strip().startswith("model name")) print(" " + next(modelname, ": ???")) elif platform.system() == "Darwin": sysctl = subprocess.run( ["sysctl", "-n", "machdep.cpu.brand_string"], check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) if sysctl.returncode == 0 and sysctl.stdout.strip(): print(": " + sysctl.stdout.strip()) else: print(": ???") else: print(": ???") except Exception: print(": ???") env_keys = [ k for k in os.environ if ("genomicsqlite" in k.lower()) or k in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH") ] if env_keys: print("environment:") for k in env_keys: print(f" {k}={os.environ[k]}") print("sqlite3: ", end="") import sqlite3 conn = sqlite3.connect(":memory:") print(next(conn.execute("SELECT sqlite_version()"))[0]) print("genomicsqlite: ", end="") try: import genomicsqlite except ImportError as exn: print(f"\n\nUnable to import genomicsqlite: {exn}") print( "The Python genomicsqlite package may need to be installed via pip3 or conda," ) print("or its location may need to be added to PYTHONPATH.") sys.exit(1) print(genomicsqlite.__version__) print(f"genomicsqlite library: {genomicsqlite._DLL}") dbfile = os.path.join(os.environ.get("TMPDIR", "/tmp"), f"genomicsqlite_smoke_test.{time.monotonic()}") print(f"\ntest database: {dbfile}") try: dbconn = genomicsqlite.connect(dbfile) for (table, bed) in (("exons1", _EXONS1), ("exons2", _EXONS2)): dbconn.execute( f"CREATE TABLE {table}(chrom TEXT, pos INTEGER, end INTEGER, id TEXT PRIMARY KEY)" ) for line in bed.strip().split("\n"): line = line.split("\t") dbconn.execute( f"INSERT INTO {table}(chrom,pos,end,id) VALUES(?,?,?,?)", (line[0], int(line[1]) - 1, int(line[2]), line[3]), ) dbconn.executescript( genomicsqlite.create_genomic_range_index_sql( dbconn, table, "chrom", "pos", "end")) dbconn.commit() query = ( "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 ON exons2._rowid_ IN\n" + textwrap.indent( genomicsqlite.genomic_range_rowids_sql( dbconn, "exons2", "exons1.chrom", "exons1.pos", "exons1.end"), " ", ) + "\nORDER BY exons1.id, exons2.id") print("GRI query:\n" + textwrap.indent(query, " ")) print("GRI query plan:") gri_constraints = 0 for expl in dbconn.execute("EXPLAIN QUERY PLAN " + query): print(" " + expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): gri_constraints += 1 results = list(dbconn.execute(query)) control = "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons1.pos OR exons2.pos > exons1.end) ORDER BY exons1.id, exons2.id" control_results = list(dbconn.execute(control)) assert len(control_results) == 1139 assert results == control_results if gri_constraints != 3: print("GRI query opcodes:") for expl in dbconn.execute("EXPLAIN " + query): print(" " + str(expl)) print( "\n** WARNING: GRI yielded correct results, but with a possibly suboptimal query plan." ) print( "** Please redirect this log to a file and send to maintainers @ https://github.com/mlin/GenomicSQLite\n" ) sys.exit(2) dbconn.close() print("\nGenomicSQLite smoke test OK =)\n") finally: os.remove(dbfile)
def test_query_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() query = "SELECT id FROM exons WHERE exons._rowid_ IN genomic_range_rowids('exons',?,?,?)" results = list(con.execute(query, ("chr17", 43044294, 43048294))) control_query = genomicsqlite.genomic_range_rowids_sql(con, "exons") control_query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + control_query control_results = list( con.execute(control_query, ("chr17", 43044294, 43048294))) assert results == control_results for expl in con.execute( "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_", ("chr17", 43044294, 43048294), ): assert "USE TEMP B-TREE FOR ORDER BY" not in expl[3] assert next((expl[3] for expl in con.execute( "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_ DESC", ("chr17", 43044294, 43048294), ) if "USE TEMP B-TREE FOR ORDER BY" in expl[3])) with pytest.raises(sqlite3.OperationalError, match="domain error"): con.execute( "SELECT * FROM genomic_range_rowids('exons', 'chr17', 43044294, 43048294, 16, 0)" ) dbfile2 = str(tmp_path / "test2.gsql") con2 = genomicsqlite.connect(dbfile2, unsafe_load=True) _fill_exons(con2) con2.commit() con2.close() con.executescript( genomicsqlite.attach_sql(con, dbfile2, "db2", immutable=True)) query = """ SELECT main.exons.id, db2.exons.id FROM main.exons LEFT JOIN db2.exons ON db2.exons._rowid_ IN genomic_range_rowids('db2.exons', main.exons.rid, main.exons.beg, main.exons.end) AND main.exons.id != db2.exons.id ORDER BY main.exons.id, db2.exons.id """ results = list(con.execute(query)) assert len(results) == 5191 with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute( "SELECT * FROM genomic_range_rowids('nonexistent', 'chr17', 43044294, 43048294)" ) con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="no such index"): con.execute( "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)" ) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")) results = list( con.execute( "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)" )) assert results == []
def test_level_detection(): # test corner cases for the bit of genomic_range_rowids() which detects the level range of # extant features con = sqlite3.connect(":memory:") con.executescript( "CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end")) def fanout(query): return sum( 1 for expl in con.execute("EXPLAIN QUERY PLAN " + query, (None, None, None)) if "((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]) assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16 con.executescript( "INSERT INTO features VALUES(NULL, NULL, NULL); INSERT INTO features VALUES(NULL, 0, 10000000000)" ) assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16 assert not list( con.execute( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1], (None, 123, 456))) con.executescript("INSERT INTO features VALUES(42, 1048568, 1048584)") query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] print("\n" + query) assert "-1" in query and "-0x10)" in query assert not list( con.execute( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1], (42, None, 1048584))) assert fanout(query) == 1 con.executescript(""" INSERT INTO features VALUES(44, 1048568, 1048584); INSERT INTO features VALUES(44, 0, 64000) """) query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] print("\n" + query) assert "-4," in query and "-0x10000)" in query assert "-3," in query and "-0x1000)" in query assert "-2," in query and "-0x100)" in query assert "-1," in query and "-0x10)" in query assert fanout(query) == 4 assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features", ceiling=6)[1:-1]) == 7 assert (fanout( genomicsqlite.genomic_range_rowids_sql(con, "features", ceiling=6, floor=3)[1:-1]) == 4) con.executescript(""" INSERT INTO features VALUES(43, NULL, 10000000000); INSERT INTO features VALUES(44, 0, NULL) """) assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 4 con.executescript(""" INSERT INTO features VALUES(43, 0, 10000000000); INSERT INTO features VALUES(43, 32, 33) """) query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] print("\n" + query) assert fanout(query) == 10 con.executescript(""" INSERT INTO features VALUES(43, 0, 10000000000); INSERT INTO features VALUES(43, 32, 32) """) query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] assert fanout(query) == 10 assert len(list(con.execute(query, (43, 32, 33)))) == 4 assert len(list(con.execute(query, (43, 33, 33)))) == 3