def test_gvcf_dv(tmp_path): dbfile = str(tmp_path / "dv_gvcf_lines.gsql") vcf_lines_into_sqlite( os.path.join(HERE, "data/NA12878.dv0.8.0.chr21.g.vcf.gz"), str(dbfile)) rows = 962897 con = genomicsqlite.connect(dbfile, read_only=True) assert next(con.execute("SELECT COUNT(*) FROM vcf_lines"))[0] == rows
def test_bam(tmp_path): bamfile = os.path.join(HERE, "data/NA12878.chr21:20000000-22500000.bam") dbfile = str(tmp_path / "test.bam.sqlite") sam_into_sqlite(bamfile, str(dbfile), "--table-prefix", "NA12878_") genomicsqlite._compact(str(dbfile), ["--inner-page-KiB", "64", "--outer-page-KiB", "2"]) con = sqlite3.connect(f"file:{dbfile}.compact?mode=ro", uri=True) assert next(con.execute("PRAGMA page_size"))[0] == 2048 con.close() con = sqlite3.connect(f"file:{dbfile}.compact?mode=ro&vfs=zstd", uri=True) assert next(con.execute("PRAGMA page_size"))[0] == 65536 con.close() con = genomicsqlite.connect(f"{dbfile}.compact", read_only=True) count = next(con.execute("SELECT COUNT(*) FROM NA12878_reads"))[0] assert count == 592861 count = next( con.execute("SELECT COUNT(DISTINCT qname) FROM NA12878_reads_seqs"))[0] assert count == 299205 mq_hist = dict( con.execute(""" SELECT mq, COUNT(*) as count FROM (SELECT ifnull(json_extract(tags_json, '$.MQ'),0) AS mq FROM NA12878_reads NATURAL JOIN NA12878_reads_tags WHERE (flag & 3840) = 0) GROUP BY mq ORDER BY mq DESC """)) assert (mq_hist[0], mq_hist[60]) == (2734, 520522)
def genomicsqlite_txdb(txdb): """ import bioC TxDb sourced originally from: http://bioconductor.org/packages/release/BiocViews.html#___TxDb """ outfile = txdb[:-7] + ".genomicsqlite" conn = sqlite3.connect(txdb) conn.executescript(genomicsqlite.vacuum_into_sql(conn, outfile)) conn.close() # create GRIs on the three feature tables conn = genomicsqlite.connect(outfile) conn.executescript( genomicsqlite.create_genomic_range_index_sql(conn, "transcript", "tx_chrom", "tx_start", "tx_end", floor=2)) conn.executescript( genomicsqlite.create_genomic_range_index_sql(conn, "cds", "cds_chrom", "cds_start", "cds_end", floor=2)) # intentionally left exon unindexed conn.close() return outfile
def test_attach(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con, gri=False) con.commit() attach_script = genomicsqlite.attach_sql(con, str(tmp_path / "test_attached.gsql"), "db2", unsafe_load=True) con.executescript(attach_script) con.executescript("CREATE TABLE db2.exons2 AS SELECT * FROM exons") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "db2.exons2", "rid", "beg", "end")) ref_script = genomicsqlite.put_reference_assembly_sql( con, "GRCh38_no_alt_analysis_set", schema="db2") con.executescript(ref_script) query = ( "SELECT exons.id, db2.exons2.id FROM exons LEFT JOIN db2.exons2 ON db2.exons2._rowid_ IN\n" + genomicsqlite.genomic_range_rowids_sql( con, "db2.exons2", "exons.rid", "exons.beg", "exons.end") + " AND exons.id != db2.exons2.id ORDER BY exons.id, db2.exons2.id") results = list(con.execute(query)) assert len(results) == 5191 refseq_by_name = genomicsqlite.get_reference_sequences_by_name( con, schema="db2") assert len(refseq_by_name) > 24
def test_pvcf_gatk(tmp_path): dbfile = str(tmp_path / "gatk_pvcf_lines.gsql") vcf_lines_into_sqlite(os.path.join(HERE, "data/gatk.1KGP.ALDH2.vcf.gz"), str(dbfile)) rows = 2088 con = genomicsqlite.connect(dbfile, read_only=True) assert next(con.execute("SELECT COUNT(*) FROM vcf_lines"))[0] == rows
def test_pvcf_glnexus(tmp_path): dbfile = str(tmp_path / "test.gsql") vcf_into_sqlite( os.path.join(HERE, "data/dv_glnexus.1KGP.ALDH2.vcf.gz"), str(dbfile), ) rows = 1993 samples = 2504 con = genomicsqlite.connect(dbfile, read_only=True) assert next(con.execute("SELECT COUNT(*) FROM variants")) == (rows, ) assert next(con.execute("SELECT COUNT(*) FROM genotypes")) == (rows * samples, ) assert next(con.execute("SELECT SUM(DP) FROM genotypes")) == (134178640, ) assert round(next( con.execute("SELECT SUM(QUAL) FROM variants"))[0]) == 118146 assert next(con.execute("SELECT SUM(_gri_lvl) FROM variants")) == (-158, ) assert list( con.execute( "SELECT GT1, GT2, COUNT(*) AS ct FROM genotypes GROUP BY GT1, GT2 ORDER BY ct DESC LIMIT 8" )) == [ (0, 0, 4840519), (0, 1, 75297), (1, 1, 33338), (None, None, 32971), (0, 2, 2780), (1, 2, 1109), (0, 3, 979), (2, 2, 554), ]
def test_web(): conn = genomicsqlite.connect( "https://github.com/mlin/sqlite_zstd_vfs/releases/download/web-test-db-v1/TxDb.Hsapiens.UCSC.hg38.knownGene.vacuum.genomicsqlite", read_only=True, ) results = list(t[0] for t in conn.execute( "SELECT tx_name FROM transcript WHERE _rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(conn, "transcript") + " ORDER BY tx_name", ("chr12", 111803912, 111804012), )) print(results) sys.stdout.flush() assert results == sorted([ "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1", "ENST00000549106.1" ]) exon_cds_counts = """ SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds FROM genomic_range_index_levels('cds'), exon LEFT JOIN cds ON cds._rowid_ IN genomic_range_rowids('cds', exon_chrom, exon_start, exon_end, _gri_ceiling, _gri_floor) AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end) GROUP BY exon._rowid_ """ for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts): print(expl[3]) sys.stdout.flush() cds_exon_count_hist = list( conn.execute( f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds" )) assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]
def test_gri_levels_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() results = list( con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')") con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="missing genomic range index"): con.execute( "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')" ) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")) results = list( con.execute( "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')" )) assert results == [(15, 0)]
def test_gvcf_hc(tmp_path): dbfile = str(tmp_path / "gatk_gvcf_lines.gsql") vcf_lines_into_sqlite( os.path.join(HERE, "data/hc.NA12878.chr22:25000000-30000000.g.vcf.gz"), str(dbfile)) rows = 823481 con = genomicsqlite.connect(dbfile, read_only=True) assert next(con.execute("SELECT COUNT(*) FROM vcf_lines"))[0] == rows
def test_pvcf_glnexus(tmp_path): dbfile = str(tmp_path / "glnexus_pvcf_lines.gsql") vcf_lines_into_sqlite( os.path.join(HERE, "data/dv_glnexus.1KGP.ALDH2.vcf.gz"), str(dbfile)) rows = 1994 con = genomicsqlite.connect(dbfile, read_only=True) assert next(con.execute("SELECT COUNT(*) FROM vcf_lines")) == (rows, ) assert next(con.execute("SELECT SUM(rlen) FROM vcf_lines")) == (2560, )
def test_connect(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) con.executescript( genomicsqlite.put_reference_assembly_sql(con, "GRCh38_no_alt_analysis_set")) _fill_exons(con) con.commit() del con con = genomicsqlite.connect(dbfile, read_only=True) query = ( "WITH exons2 AS (SELECT * from exons) SELECT exons.id, exons2.id FROM exons2 LEFT JOIN exons ON exons._rowid_ IN\n" + genomicsqlite.genomic_range_rowids_sql(con, "exons", "exons2.rid", "exons2.beg", "exons2.end") + " AND exons.id != exons2.id ORDER BY exons.id, exons2.id") results = list(con.execute(query)) assert len(results) == 5191
def test_pvcf_gatk(tmp_path): dbfile = str(tmp_path / "test.gsql") vcf_into_sqlite(os.path.join(HERE, "data/gatk.1KGP.ALDH2.vcf.gz"), str(dbfile)) rows = 2087 samples = 2504 con = genomicsqlite.connect(dbfile, read_only=True) assert next(con.execute("SELECT COUNT(*) FROM variants"))[0] == rows assert next( con.execute("SELECT COUNT(*) FROM genotypes"))[0] == rows * samples
def test_bundled_extensions(): con = genomicsqlite.connect(":memory:") con.executescript(""" CREATE TABLE test(json TEXT); INSERT INTO test(json) VALUES('{"chrom":"chr10"}'); INSERT INTO test(json) VALUES('{"chrom":"chr2"}'); INSERT INTO test(json) VALUES('{"chrom":"chrMT"}'); """) results = con.execute( "SELECT json_extract(json,'$.chrom') AS chrom FROM test ORDER BY chrom COLLATE UINT" ) assert list(results) == [("chr2", ), ("chr10", ), ("chrMT", )]
def test_dna_revcomp(): con = genomicsqlite.connect(":memory:") assert next(con.execute("SELECT dna_revcomp('AGCTagct')"))[0] == "agctAGCT" assert next(con.execute("SELECT dna_revcomp('gATtaCa')"))[0] == "tGtaATc" assert next(con.execute("SELECT dna_revcomp('')"))[0] == "" assert next(con.execute("SELECT dna_revcomp(NULL)"))[0] is None with pytest.raises(sqlite3.OperationalError): con.execute("SELECT dna_revcomp('GATTACAb')") with pytest.raises(sqlite3.OperationalError): con.execute("SELECT dna_revcomp('GATTACA ')") with pytest.raises(sqlite3.OperationalError): con.execute("SELECT dna_revcomp(42)")
def test_twobit_random(): con = genomicsqlite.connect(":memory:") random.seed(42) for seqlen in (random.randint(2, 1250) for _ in range(5000)): rna = random.choice((False, True)) nucs = ( ("a", "A", "g", "G", "c", "C", "u", "U") if rna else ("a", "A", "g", "G", "c", "C", "t", "T") ) seq = "".join(random.choice(nucs) for _ in range(seqlen)) assert (rna and ("t" not in seq and "T" not in seq)) or ( not rna and ("u" not in seq and "U" not in seq) ) crumbs = next(con.execute("SELECT nucleotides_twobit(?)", (seq,)))[0] assert isinstance(crumbs, bytes) assert len(crumbs) == math.ceil(len(seq) / 4) + 1 assert next(con.execute("SELECT twobit_length(nucleotides_twobit(?))", (seq,)))[0] == len( seq ) query = f"SELECT {'twobit_rna' if rna else 'twobit_dna'}(nucleotides_twobit(?))" decoded = next(con.execute(query, (seq,)))[0] assert decoded == seq.upper() # test built-in substr sub_ofs = random.randint(0, seqlen) sub_len = random.randint(0, seqlen) decoded = next(con.execute("SELECT twobit_dna(?,?,?)", (seq, sub_ofs, sub_len)))[0] control = next(con.execute("SELECT substr(twobit_dna(?),?,?)", (seq, sub_ofs, sub_len)))[0] assert decoded == control # test with negative offset/length -- https://sqlite.org/lang_corefunc.html#substr decoded = next(con.execute("SELECT twobit_dna(?,?,?)", (seq, 0 - sub_ofs, sub_len)))[0] control = next( con.execute("SELECT substr(twobit_dna(?),?,?)", (seq, 0 - sub_ofs, sub_len)) )[0] assert decoded == control decoded = next(con.execute("SELECT twobit_dna(?,?,?)", (seq, sub_ofs, 0 - sub_len)))[0] control = next( con.execute("SELECT substr(twobit_dna(?),?,?)", (seq, sub_ofs, 0 - sub_len)) )[0] assert decoded == control
def test_gnomad_sites_small(tmp_path): dbfile = str(tmp_path / "test.gsql") vcf_into_sqlite( os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"), str(dbfile), "--table-prefix", "gnomad_", ) con = genomicsqlite.connect(dbfile, read_only=True) query = ( "SELECT variant_rowid, id_jsarray FROM (SELECT _gri_rid AS rid FROM _gri_refseq WHERE gri_refseq_name=?1) AS query, gnomad_variants WHERE variant_rowid IN" + genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants", "query.rid")) rs671 = ("chr12", 111803912, 111804012) print(query) indexed = 0 for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671): print(expl[3]) if "USING INDEX gnomad_variants__gri" in expl[3]: indexed += 1 assert indexed == 3 opcodes = list(con.execute("EXPLAIN " + query, rs671)) accessed_cursors = list(opcode[2] for opcode in opcodes if opcode[1] == "Column") table_rootpages = set(row[0] for row in con.execute( "SELECT rootpage FROM sqlite_master WHERE type='table'")) table_cursors = set( opcode[2] for opcode in opcodes if opcode[1] == "OpenRead" and opcode[3] in table_rootpages) assert sum( 1 for cursor in accessed_cursors if cursor not in table_cursors) > 1 assert sum(1 for cursor in accessed_cursors if cursor in table_cursors) == 1 results = list(con.execute(query, rs671)) results_rowids = set(vt[0] for vt in results) assert next(vt for vt in results if vt[1] and "rs671" in vt[1]) control = "SELECT variant_rowid FROM gnomad_variants NATURAL JOIN _gri_refseq WHERE gri_refseq_name = ? AND NOT ((pos+rlen) < ? OR pos > ?)" control_rowids = set(vt[0] for vt in con.execute(control, rs671)) assert len(control_rowids) == 22 assert results_rowids == control_rowids
def test_gnomad_sites_small(tmp_path): dbfile = str(tmp_path / "gnomad_lines.gsql") vcf_lines_into_sqlite( os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"), str(dbfile), "--table", "gnomad_vcf_lines", ) con = genomicsqlite.connect(dbfile, read_only=True) query = ("SELECT _rowid_, line FROM gnomad_vcf_lines WHERE _rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(con, "gnomad_vcf_lines")) rs671 = ("chr12", 111803912, 111804012) print(query) indexed = 0 for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671): print(expl[3]) if "USING INDEX gnomad_vcf_lines__gri" in expl[3]: indexed += 1 assert indexed == 3 opcodes = list(con.execute("EXPLAIN " + query, rs671)) accessed_cursors = list(opcode[2] for opcode in opcodes if opcode[1] == "Column") table_rootpages = set(row[0] for row in con.execute( "SELECT rootpage FROM sqlite_master WHERE type='table'")) table_cursors = set( opcode[2] for opcode in opcodes if opcode[1] == "OpenRead" and opcode[3] in table_rootpages) assert sum( 1 for cursor in accessed_cursors if cursor not in table_cursors) > 1 assert sum(1 for cursor in accessed_cursors if cursor in table_cursors) == 1 results = list(con.execute(query, rs671)) results_rowids = set(vt[0] for vt in results) assert next(vt for vt in results if vt[1] and "rs671" in vt[1]) control = "SELECT rowid FROM gnomad_vcf_lines WHERE NOT ((POS+rlen) < ? OR POS > ?)" control_rowids = set(vt[0] for vt in con.execute(control, (rs671[1], rs671[2]))) assert len(control_rowids) == 22 assert results_rowids == control_rowids
def test_gri_levels_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() # test caching & invalidation: results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')")) assert results == [(3, 1)] tch1 = con.total_changes con.execute("INSERT INTO exons VALUES('ether',0,4097,4097,'ether')") tch2 = con.total_changes assert tch2 > tch1 results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(4, 1)] con.commit() results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(4, 1)] con.execute("DELETE FROM exons WHERE rid = 'ether'") con.commit() results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')")) assert results == [(3, 1)] results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')")) assert results == [(3, 1)] with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')") con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="missing genomic range index"): con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end") ) results = list( con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')") ) assert results == [(15, 0)]
def test_parse_genomic_range(): con = genomicsqlite.connect(":memory:") for (txt, chrom, begin_pos, end_pos) in [ ("chr1:2,345-06,789", "chr1", 2344, 6789), ("π:1-9,223,372,036,854,775,799", "π", 0, 9223372036854775799), ]: assert next( con.execute("SELECT parse_genomic_range_sequence(?)", (txt, )))[0] == chrom assert next(con.execute("SELECT parse_genomic_range_begin(?)", (txt, )))[0] == begin_pos assert next(con.execute("SELECT parse_genomic_range_end(?)", (txt, )))[0] == end_pos for txt in [ "", ":", "-", ":-", ":1-2", "chr1", "chr1:0-1", "chr1:1,234", "chr1:1,234-", "chr1:1,234-5,67", "chr1 :2,345-06,789", "chr1:2,345-06,789\t", "chr1:2345-deadbeef", "chr1:1-9,223,372,036,854,775,800", ]: with pytest.raises(sqlite3.OperationalError) as exc: con.execute("SELECT parse_genomic_range_sequence(?)", (txt, )) assert "parse_genomic_range():" in str(exc.value) with pytest.raises(sqlite3.OperationalError): con.execute("SELECT parse_genomic_range_begin(?)", (txt, )) assert "parse_genomic_range():" in str(exc.value) with pytest.raises(sqlite3.OperationalError): con.execute("SELECT parse_genomic_range_end(?)", (txt, )) assert "parse_genomic_range():" in str(exc.value) assert next(con.execute("SELECT parse_genomic_range_end(NULL)"))[0] is None
def test_twobit_column(): # test populating a column with mixed BLOB and TEXT values con = genomicsqlite.connect(":memory:") con.executescript("CREATE TABLE test(test_twobit BLOB)") for elt in list("Tu") + ["foo", "bar", "gATuaCa"]: con.execute("INSERT INTO test(test_twobit) VALUES(nucleotides_twobit(?))", (elt,)) column = list(con.execute("SELECT test_twobit FROM test")) assert isinstance(column[0][0], bytes), str([type(x[0]) for x in column]) assert isinstance(column[-1][0], bytes) assert isinstance(column[-2][0], str) assert column[-2][0] == "bar" assert list(con.execute("SELECT twobit_dna(test_twobit) FROM test")) == [ ("T",), ("T",), ("foo",), ("bar",), ("GATTACA",), ]
def test_twobit_corner_cases(): con = genomicsqlite.connect(":memory:") for nuc in "AGCTagct": assert next(con.execute("SELECT length(nucleotides_twobit(?))", (nuc,)))[0] == 1 assert ( next(con.execute("SELECT twobit_dna(nucleotides_twobit(?))", (nuc,)))[0] == nuc.upper() ) assert next(con.execute("SELECT nucleotides_twobit('')"))[0] == "" assert next(con.execute("SELECT nucleotides_twobit('acgt 1')"))[0] == "acgt 1" assert next(con.execute("SELECT twobit_dna('acgt 1')"))[0] == "acgt 1" assert next(con.execute("SELECT twobit_dna('acgt 1',1,6)"))[0] == "acgt 1" assert next(con.execute("SELECT twobit_dna('acgt 1',3,3)"))[0] == "gt " assert next(con.execute("SELECT twobit_dna('acgt 1',-2,-3)"))[0] == "cgt" # exhaustively test offset/length corner cases for xtest in range(-9, 9): for ytest in range(-9, 9): decoded = next( con.execute("SELECT twobit_rna(nucleotides_twobit('gattaca'),?,?)", (xtest, ytest)) )[0] control = next(con.execute("SELECT substr('GAUUACA',?,?)", (xtest, ytest)))[0] assert decoded == control, str((xtest, ytest))
def test_bam(tmp_path): bamfile = os.path.join(HERE, "data/NA12878.chr21:20000000-22500000.bam") dbfile = str(tmp_path / "test.bam.sqlite") sam_into_sqlite(bamfile, str(dbfile), "--table-prefix", "NA12878_") con = genomicsqlite.connect(dbfile, read_only=True) count = next(con.execute("SELECT COUNT(*) FROM NA12878_reads"))[0] assert count == 592861 count = next( con.execute("SELECT COUNT(DISTINCT qname) FROM NA12878_reads_seqs"))[0] assert count == 299205 mq_hist = dict( con.execute(""" SELECT mq, COUNT(*) as count FROM (SELECT ifnull(json_extract(tags_json, '$.MQ'),0) AS mq FROM NA12878_reads NATURAL JOIN NA12878_reads_tags WHERE (flag & 3840) = 0) GROUP BY mq ORDER BY mq DESC """)) assert (mq_hist[0], mq_hist[60]) == (2734, 520522)
def test_txdbquery(genomicsqlite_txdb): conn = genomicsqlite.connect(genomicsqlite_txdb, read_only=True) assert next(conn.execute("PRAGMA page_size"))[0] == 16384 # one query results = list(t[0] for t in conn.execute( "SELECT tx_name FROM transcript WHERE _rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(conn, "transcript") + " ORDER BY tx_name", ("chr12", 111803912, 111804012), )) print(results) assert results == sorted([ "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1", "ENST00000549106.1" ]) # random queries chroms = list( conn.execute( "SELECT tx_chrom, length FROM (SELECT tx_chrom, MAX(tx_end) AS length FROM transcript GROUP BY tx_chrom) WHERE length > 1000000" )) random.seed(0xBADF00D) for tbl in ("transcript", "cds"): query = genomicsqlite.genomic_range_rowids_sql(conn, tbl)[1:-1] fanout = 0 for expl in conn.execute("EXPLAIN QUERY PLAN " + query, (None, None, None)): print(expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): fanout += 1 assert (tbl, fanout) in (("transcript", 5), ("cds", 3)) pfx = "tx" if tbl == "transcript" else tbl control_query = f"SELECT _rowid_ FROM {tbl} NOT INDEXED WHERE {pfx}_chrom = ? AND NOT ({pfx}_end < ? OR {pfx}_start > ?) ORDER BY _rowid_" total_results = 0 for _ in range(2000): chrom = random.choice(chroms) beg = random.randint(0, chrom[1] - 65536) end = beg + random.randint(1, random.choice([16, 256, 4096, 65536 ])) ids = list(row[0] for row in conn.execute(query, (chrom[0], beg, end))) control_ids = list( row[0] for row in conn.execute(control_query, (chrom[0], beg, end))) assert ids == control_ids total_results += len(control_ids) assert total_results in (7341, 2660) # join exon to cds ("which exons are coding?") exon_cds_counts = ( "SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds FROM exon LEFT JOIN cds ON cds._rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(conn, "cds", "exon_chrom", "exon_start", "exon_end") + " AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end) GROUP BY exon._rowid_" ) print(exon_cds_counts) fanout = 0 for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts): print(expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): fanout += 1 assert fanout == 3 cds_exon_count_hist = list( conn.execute( f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds" )) for elt in cds_exon_count_hist: print(elt) assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]
def main(): header = f"""\ -- GenomicSQLite smoke test -- timestamp: {time.asctime(time.gmtime())} platform: {platform.platform()} uname: {os.uname()} python: {platform.python_implementation()} {platform.python_version()}""" print(textwrap.dedent(header)) print("cpu", end="") try: if platform.system() == "Linux": with open("/proc/cpuinfo") as cpuinfo: modelname = (line.strip().replace("\t", "") for line in cpuinfo if line.lower().strip().startswith("model name")) print(" " + next(modelname, ": ???")) elif platform.system() == "Darwin": sysctl = subprocess.run( ["sysctl", "-n", "machdep.cpu.brand_string"], check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) if sysctl.returncode == 0 and sysctl.stdout.strip(): print(": " + sysctl.stdout.strip()) else: print(": ???") else: print(": ???") except Exception: print(": ???") env_keys = [ k for k in os.environ if ("genomicsqlite" in k.lower()) or k in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH") ] if env_keys: print("environment:") for k in env_keys: print(f" {k}={os.environ[k]}") print("sqlite3: ", end="") import sqlite3 conn = sqlite3.connect(":memory:") print(next(conn.execute("SELECT sqlite_version()"))[0]) print("genomicsqlite: ", end="") try: import genomicsqlite except ImportError as exn: print(f"\n\nUnable to import genomicsqlite: {exn}") print( "The Python genomicsqlite package may need to be installed via pip3 or conda," ) print("or its location may need to be added to PYTHONPATH.") sys.exit(1) print(genomicsqlite.__version__) print(f"genomicsqlite library: {genomicsqlite._DLL}") dbfile = os.path.join(os.environ.get("TMPDIR", "/tmp"), f"genomicsqlite_smoke_test.{time.monotonic()}") print(f"\ntest database: {dbfile}") try: dbconn = genomicsqlite.connect(dbfile) for (table, bed) in (("exons1", _EXONS1), ("exons2", _EXONS2)): dbconn.execute( f"CREATE TABLE {table}(chrom TEXT, pos INTEGER, end INTEGER, id TEXT PRIMARY KEY)" ) for line in bed.strip().split("\n"): line = line.split("\t") dbconn.execute( f"INSERT INTO {table}(chrom,pos,end,id) VALUES(?,?,?,?)", (line[0], int(line[1]) - 1, int(line[2]), line[3]), ) dbconn.executescript( genomicsqlite.create_genomic_range_index_sql( dbconn, table, "chrom", "pos", "end")) dbconn.commit() query = ( "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 ON exons2._rowid_ IN\n" + textwrap.indent( genomicsqlite.genomic_range_rowids_sql( dbconn, "exons2", "exons1.chrom", "exons1.pos", "exons1.end"), " ", ) + "\nORDER BY exons1.id, exons2.id") print("GRI query:\n" + textwrap.indent(query, " ")) print("GRI query plan:") gri_constraints = 0 for expl in dbconn.execute("EXPLAIN QUERY PLAN " + query): print(" " + expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): gri_constraints += 1 results = list(dbconn.execute(query)) control = "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons1.pos OR exons2.pos > exons1.end) ORDER BY exons1.id, exons2.id" control_results = list(dbconn.execute(control)) assert len(control_results) == 1139 assert results == control_results if gri_constraints != 3: print("GRI query opcodes:") for expl in dbconn.execute("EXPLAIN " + query): print(" " + str(expl)) print( "\n** WARNING: GRI yielded correct results, but with a possibly suboptimal query plan." ) print( "** Please redirect this log to a file and send to maintainers @ https://github.com/mlin/GenomicSQLite\n" ) sys.exit(2) dbconn.close() print("\nGenomicSQLite smoke test OK =)\n") finally: os.remove(dbfile)
def test_query_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() query = "SELECT id FROM exons WHERE exons._rowid_ IN genomic_range_rowids('exons',?,?,?)" results = list(con.execute(query, ("chr17", 43044294, 43048294))) control_query = genomicsqlite.genomic_range_rowids_sql(con, "exons") control_query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + control_query control_results = list( con.execute(control_query, ("chr17", 43044294, 43048294))) assert results == control_results for expl in con.execute( "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_", ("chr17", 43044294, 43048294), ): assert "USE TEMP B-TREE FOR ORDER BY" not in expl[3] assert next((expl[3] for expl in con.execute( "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_ DESC", ("chr17", 43044294, 43048294), ) if "USE TEMP B-TREE FOR ORDER BY" in expl[3])) with pytest.raises(sqlite3.OperationalError, match="domain error"): con.execute( "SELECT * FROM genomic_range_rowids('exons', 'chr17', 43044294, 43048294, 16, 0)" ) dbfile2 = str(tmp_path / "test2.gsql") con2 = genomicsqlite.connect(dbfile2, unsafe_load=True) _fill_exons(con2) con2.commit() con2.close() con.executescript( genomicsqlite.attach_sql(con, dbfile2, "db2", immutable=True)) query = """ SELECT main.exons.id, db2.exons.id FROM main.exons LEFT JOIN db2.exons ON db2.exons._rowid_ IN genomic_range_rowids('db2.exons', main.exons.rid, main.exons.beg, main.exons.end) AND main.exons.id != db2.exons.id ORDER BY main.exons.id, db2.exons.id """ results = list(con.execute(query)) assert len(results) == 5191 with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute( "SELECT * FROM genomic_range_rowids('nonexistent', 'chr17', 43044294, 43048294)" ) con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="no such index"): con.execute( "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)" ) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")) results = list( con.execute( "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)" )) assert results == []