Пример #1
0
def test_indexing():
    con = sqlite3.connect(":memory:")
    _fill_exons(con)
    con.commit()

    query = genomicsqlite.genomic_range_rowids_sql(con, "exons")
    query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + query
    print("\n" + query)

    # The query should only search the relevant GRI levels
    indexed = 0
    for expl in con.execute("EXPLAIN QUERY PLAN " + query,
                            ("chr17", 43044294, 43048294)):
        print(expl[3])
        if "USING INDEX exons__gri" in expl[3]:
            indexed += 1
    assert indexed == 3

    # The query should be covered by the index except for one final fetch of exons.id
    opcodes = list(
        con.execute("EXPLAIN " + query, ("chr17", 43044294, 43048294)))
    # for expl in opcodes:
    #    if expl[1] in {"OpenRead", "OpenPseudo", "Column"}:
    #        print(expl)
    accessed_cursors = list(opcode[2] for opcode in opcodes
                            if opcode[1] == "Column")
    table_rootpages = set(row[0] for row in con.execute(
        "SELECT rootpage FROM sqlite_master WHERE type='table'"))
    table_cursors = set(
        opcode[2] for opcode in opcodes
        if opcode[1] == "OpenRead" and opcode[3] in table_rootpages)
    assert sum(1 for cursor in accessed_cursors
               if cursor not in table_cursors) > 1  # get from idx
    assert sum(1 for cursor in accessed_cursors
               if cursor in table_cursors) == 1  # get from table

    # (Also, it should produce correct results)
    control_query = "SELECT id FROM exons NOT INDEXED WHERE rid = ? AND NOT (end < ? OR beg > ?) ORDER BY _rowid_"
    random.seed(0xBADF00D)
    total_results = 0
    for _ in range(50000):
        beg = random.randint(43044294 - 10000, 43044294 + 10000)
        end = beg + random.randint(1, random.choice([10, 100, 1000, 10000]))
        ids = list(row[0] for row in con.execute(query, ("chr17", beg, end)))
        control_ids = list(
            row[0] for row in con.execute(control_query, ("chr17", beg, end)))
        assert ids == control_ids
        total_results += len(control_ids)
    assert total_results == 189935

    with pytest.raises(sqlite3.OperationalError):
        genomicsqlite.genomic_range_rowids_sql(con, "nonexistent_table")
Пример #2
0
def test_refseq():
    con = sqlite3.connect(":memory:")

    create_assembly = genomicsqlite.put_reference_assembly_sql(
        con, "GRCh38_no_alt_analysis_set")
    lines = create_assembly.strip().split("\n")
    print("\n".join([line for line in lines if "INSERT INTO" in line][:24]))
    assert len([line for line in lines if "INSERT INTO" in line]) == 195
    print("\n".join([line for line in lines if "INSERT INTO" not in line]))
    assert len([line for line in lines if "INSERT INTO" not in line]) == 2
    con.executescript(create_assembly)

    _fill_exons(con, floor=2)
    con.commit()

    refseq_by_rid = genomicsqlite.get_reference_sequences_by_rid(con)
    refseq_by_name = genomicsqlite.get_reference_sequences_by_name(con)
    for refseq in refseq_by_rid.values():
        assert refseq_by_rid[refseq.rid] == refseq
        assert refseq_by_name[refseq.name] == refseq
        if refseq.name == "chr17":
            assert refseq.rid == 17
            assert refseq.length == 83257441
            assert refseq.assembly == "GRCh38_no_alt_analysis_set"
            assert refseq.refget_id == "f9a0fb01553adb183568e3eb9d8626db"
    assert len(refseq_by_rid) == 195

    query = (
        "SELECT _gri_refseq._gri_rid, rid, beg, end, id FROM exons, _gri_refseq WHERE exons.rid = gri_refseq_name AND exons._rowid_ IN "
        + genomicsqlite.genomic_range_rowids_sql(con, "exons"))
    print("\n" + query)
    assert len([line for line in query.split("\n") if "BETWEEN" in line]) == 2
    assert len(list(con.execute(query, ("chr17", 43115725, 43125370)))) == 56
Пример #3
0
def test_attach(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con, gri=False)
    con.commit()

    attach_script = genomicsqlite.attach_sql(con,
                                             str(tmp_path /
                                                 "test_attached.gsql"),
                                             "db2",
                                             unsafe_load=True)
    con.executescript(attach_script)
    con.executescript("CREATE TABLE db2.exons2 AS SELECT * FROM exons")
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "db2.exons2", "rid",
                                                     "beg", "end"))
    ref_script = genomicsqlite.put_reference_assembly_sql(
        con, "GRCh38_no_alt_analysis_set", schema="db2")
    con.executescript(ref_script)

    query = (
        "SELECT exons.id, db2.exons2.id FROM exons LEFT JOIN db2.exons2 ON db2.exons2._rowid_ IN\n"
        + genomicsqlite.genomic_range_rowids_sql(
            con, "db2.exons2", "exons.rid", "exons.beg", "exons.end") +
        " AND exons.id != db2.exons2.id ORDER BY exons.id, db2.exons2.id")
    results = list(con.execute(query))
    assert len(results) == 5191

    refseq_by_name = genomicsqlite.get_reference_sequences_by_name(
        con, schema="db2")
    assert len(refseq_by_name) > 24
Пример #4
0
def test_join():
    for len_gri in (False, True):
        con = sqlite3.connect(":memory:")
        con.executescript(
            genomicsqlite.put_reference_assembly_sql(
                con, "GRCh38_no_alt_analysis_set"))
        _fill_exons(con, table="exons")
        _fill_exons(con, floor=2, table="exons2", len_gri=len_gri)
        con.commit()

        query = (
            "SELECT exons.id, exons2.id FROM exons LEFT JOIN exons2 ON exons2._rowid_ IN\n"
            + genomicsqlite.genomic_range_rowids_sql(
                con, "exons2", "exons.rid", "exons.beg", "exons.end") +
            " AND exons.id != exons2.id ORDER BY exons.id, exons2.id")
        print(query)
        indexed = 0
        for expl in con.execute("EXPLAIN QUERY PLAN " + query):
            print(expl[3])
            if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                    in expl[3]):
                indexed += 1
        assert indexed == 2
        results = list(con.execute(query))
        assert len(results) == 5191
        assert len([result for result in results if result[1] is None]) == 5
        control = "SELECT exons.id, exons2.id FROM exons LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons.beg OR exons2.beg > exons.end) AND exons.id != exons2.id ORDER BY exons.id, exons2.id"
        control = list(con.execute(control))
        assert results == control
Пример #5
0
def test_web():
    conn = genomicsqlite.connect(
        "https://github.com/mlin/sqlite_zstd_vfs/releases/download/web-test-db-v1/TxDb.Hsapiens.UCSC.hg38.knownGene.vacuum.genomicsqlite",
        read_only=True,
    )

    results = list(t[0] for t in conn.execute(
        "SELECT tx_name FROM transcript WHERE _rowid_ IN " +
        genomicsqlite.genomic_range_rowids_sql(conn, "transcript") +
        " ORDER BY tx_name",
        ("chr12", 111803912, 111804012),
    ))
    print(results)
    sys.stdout.flush()
    assert results == sorted([
        "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1",
        "ENST00000549106.1"
    ])

    exon_cds_counts = """
        SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds
        FROM genomic_range_index_levels('cds'), exon LEFT JOIN cds
            ON cds._rowid_ IN genomic_range_rowids('cds', exon_chrom, exon_start, exon_end, _gri_ceiling, _gri_floor)
            AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end)
        GROUP BY exon._rowid_
        """
    for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts):
        print(expl[3])
    sys.stdout.flush()
    cds_exon_count_hist = list(
        conn.execute(
            f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds"
        ))
    assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]
Пример #6
0
def test_abutment():
    # Test GRI query correctness with various cases of features abutting the query range
    con = sqlite3.connect(":memory:")
    con.executescript("CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)")
    pos0 = 10000000000
    for lvl in range(9):
        for ofs in (-1, 0, 1):
            for tup in ((pos0 - 16 ** lvl, pos0 + ofs), (pos0 + 123 + ofs, pos0 + 123 + 16 ** 9)):
                con.execute("INSERT INTO features VALUES(42,?,?)", tup)
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end")
    )
    query = genomicsqlite.genomic_range_rowids_sql(
        con, "features", "42", str(pos0), str(pos0 + 123)
    )[1:-1]
    control_query = f"SELECT _rowid_ FROM features NOT INDEXED WHERE rid = 42 AND NOT (end < {pos0} OR beg > {pos0+123}) ORDER BY _rowid_"
    assert list(con.execute(query)) == list(con.execute(control_query))
Пример #7
0
def test_gnomad_sites_small(tmp_path):
    dbfile = str(tmp_path / "test.gsql")

    vcf_into_sqlite(
        os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"),
        str(dbfile),
        "--table-prefix",
        "gnomad_",
    )

    con = genomicsqlite.connect(dbfile, read_only=True)

    query = (
        "SELECT variant_rowid, id_jsarray FROM (SELECT _gri_rid AS rid FROM _gri_refseq WHERE gri_refseq_name=?1) AS query, gnomad_variants WHERE variant_rowid IN"
        + genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants",
                                                 "query.rid"))
    rs671 = ("chr12", 111803912, 111804012)
    print(query)

    indexed = 0
    for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671):
        print(expl[3])
        if "USING INDEX gnomad_variants__gri" in expl[3]:
            indexed += 1
    assert indexed == 3
    opcodes = list(con.execute("EXPLAIN " + query, rs671))
    accessed_cursors = list(opcode[2] for opcode in opcodes
                            if opcode[1] == "Column")
    table_rootpages = set(row[0] for row in con.execute(
        "SELECT rootpage FROM sqlite_master WHERE type='table'"))
    table_cursors = set(
        opcode[2] for opcode in opcodes
        if opcode[1] == "OpenRead" and opcode[3] in table_rootpages)
    assert sum(
        1 for cursor in accessed_cursors if cursor not in table_cursors) > 1
    assert sum(1 for cursor in accessed_cursors
               if cursor in table_cursors) == 1

    results = list(con.execute(query, rs671))
    results_rowids = set(vt[0] for vt in results)
    assert next(vt for vt in results if vt[1] and "rs671" in vt[1])

    control = "SELECT variant_rowid FROM gnomad_variants NATURAL JOIN _gri_refseq WHERE gri_refseq_name = ? AND NOT ((pos+rlen) < ? OR pos > ?)"
    control_rowids = set(vt[0] for vt in con.execute(control, rs671))
    assert len(control_rowids) == 22
    assert results_rowids == control_rowids
Пример #8
0
def test_connect(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    con.executescript(
        genomicsqlite.put_reference_assembly_sql(con,
                                                 "GRCh38_no_alt_analysis_set"))
    _fill_exons(con)
    con.commit()
    del con

    con = genomicsqlite.connect(dbfile, read_only=True)
    query = (
        "WITH exons2 AS (SELECT * from exons) SELECT exons.id, exons2.id FROM exons2 LEFT JOIN exons ON exons._rowid_ IN\n"
        + genomicsqlite.genomic_range_rowids_sql(con, "exons", "exons2.rid",
                                                 "exons2.beg", "exons2.end") +
        " AND exons.id != exons2.id ORDER BY exons.id, exons2.id")
    results = list(con.execute(query))
    assert len(results) == 5191
Пример #9
0
def test_gnomad_sites_small(tmp_path):
    dbfile = str(tmp_path / "gnomad_lines.gsql")

    vcf_lines_into_sqlite(
        os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"),
        str(dbfile),
        "--table",
        "gnomad_vcf_lines",
    )

    con = genomicsqlite.connect(dbfile, read_only=True)
    query = ("SELECT _rowid_, line FROM gnomad_vcf_lines WHERE _rowid_ IN " +
             genomicsqlite.genomic_range_rowids_sql(con, "gnomad_vcf_lines"))
    rs671 = ("chr12", 111803912, 111804012)
    print(query)

    indexed = 0
    for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671):
        print(expl[3])
        if "USING INDEX gnomad_vcf_lines__gri" in expl[3]:
            indexed += 1
    assert indexed == 3
    opcodes = list(con.execute("EXPLAIN " + query, rs671))
    accessed_cursors = list(opcode[2] for opcode in opcodes
                            if opcode[1] == "Column")
    table_rootpages = set(row[0] for row in con.execute(
        "SELECT rootpage FROM sqlite_master WHERE type='table'"))
    table_cursors = set(
        opcode[2] for opcode in opcodes
        if opcode[1] == "OpenRead" and opcode[3] in table_rootpages)
    assert sum(
        1 for cursor in accessed_cursors if cursor not in table_cursors) > 1
    assert sum(1 for cursor in accessed_cursors
               if cursor in table_cursors) == 1

    results = list(con.execute(query, rs671))
    results_rowids = set(vt[0] for vt in results)
    assert next(vt for vt in results if vt[1] and "rs671" in vt[1])

    control = "SELECT rowid FROM gnomad_vcf_lines WHERE NOT ((POS+rlen) < ? OR POS > ?)"
    control_rowids = set(vt[0]
                         for vt in con.execute(control, (rs671[1], rs671[2])))
    assert len(control_rowids) == 22
    assert results_rowids == control_rowids
Пример #10
0
def main():
    header = f"""\
        -- GenomicSQLite smoke test --
        timestamp: {time.asctime(time.gmtime())}
        platform: {platform.platform()}
        uname: {os.uname()}
        python: {platform.python_implementation()} {platform.python_version()}"""
    print(textwrap.dedent(header))

    print("cpu", end="")
    try:
        if platform.system() == "Linux":
            with open("/proc/cpuinfo") as cpuinfo:
                modelname = (line.strip().replace("\t", "") for line in cpuinfo
                             if line.lower().strip().startswith("model name"))
                print(" " + next(modelname, ": ???"))
        elif platform.system() == "Darwin":
            sysctl = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                check=True,
                universal_newlines=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if sysctl.returncode == 0 and sysctl.stdout.strip():
                print(": " + sysctl.stdout.strip())
            else:
                print(": ???")
        else:
            print(": ???")
    except Exception:
        print(": ???")

    env_keys = [
        k for k in os.environ
        if ("genomicsqlite" in k.lower()) or k in ("PATH", "PYTHONPATH",
                                                   "LD_LIBRARY_PATH",
                                                   "DYLD_LIBRARY_PATH")
    ]
    if env_keys:
        print("environment:")
        for k in env_keys:
            print(f"  {k}={os.environ[k]}")

    print("sqlite3: ", end="")
    import sqlite3

    conn = sqlite3.connect(":memory:")
    print(next(conn.execute("SELECT sqlite_version()"))[0])

    print("genomicsqlite: ", end="")
    try:
        import genomicsqlite
    except ImportError as exn:
        print(f"\n\nUnable to import genomicsqlite: {exn}")
        print(
            "The Python genomicsqlite package may need to be installed via pip3 or conda,"
        )
        print("or its location may need to be added to PYTHONPATH.")
        sys.exit(1)

    print(genomicsqlite.__version__)
    print(f"genomicsqlite library: {genomicsqlite._DLL}")

    dbfile = os.path.join(os.environ.get("TMPDIR", "/tmp"),
                          f"genomicsqlite_smoke_test.{time.monotonic()}")
    print(f"\ntest database: {dbfile}")
    try:
        dbconn = genomicsqlite.connect(dbfile)
        for (table, bed) in (("exons1", _EXONS1), ("exons2", _EXONS2)):
            dbconn.execute(
                f"CREATE TABLE {table}(chrom TEXT, pos INTEGER, end INTEGER, id TEXT PRIMARY KEY)"
            )
            for line in bed.strip().split("\n"):
                line = line.split("\t")
                dbconn.execute(
                    f"INSERT INTO {table}(chrom,pos,end,id) VALUES(?,?,?,?)",
                    (line[0], int(line[1]) - 1, int(line[2]), line[3]),
                )
            dbconn.executescript(
                genomicsqlite.create_genomic_range_index_sql(
                    dbconn, table, "chrom", "pos", "end"))
        dbconn.commit()

        query = (
            "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 ON exons2._rowid_ IN\n"
            + textwrap.indent(
                genomicsqlite.genomic_range_rowids_sql(
                    dbconn, "exons2", "exons1.chrom", "exons1.pos",
                    "exons1.end"),
                " ",
            ) + "\nORDER BY exons1.id, exons2.id")
        print("GRI query:\n" + textwrap.indent(query, "  "))
        print("GRI query plan:")
        gri_constraints = 0
        for expl in dbconn.execute("EXPLAIN QUERY PLAN " + query):
            print("  " + expl[3])
            if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                    in expl[3]):
                gri_constraints += 1

        results = list(dbconn.execute(query))
        control = "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons1.pos OR exons2.pos > exons1.end) ORDER BY exons1.id, exons2.id"
        control_results = list(dbconn.execute(control))
        assert len(control_results) == 1139
        assert results == control_results

        if gri_constraints != 3:
            print("GRI query opcodes:")
            for expl in dbconn.execute("EXPLAIN " + query):
                print("  " + str(expl))

            print(
                "\n** WARNING: GRI yielded correct results, but with a possibly suboptimal query plan."
            )
            print(
                "** Please redirect this log to a file and send to maintainers @ https://github.com/mlin/GenomicSQLite\n"
            )

            sys.exit(2)

        dbconn.close()
        print("\nGenomicSQLite smoke test OK =)\n")
    finally:
        os.remove(dbfile)
Пример #11
0
def test_query_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    query = "SELECT id FROM exons WHERE exons._rowid_ IN genomic_range_rowids('exons',?,?,?)"
    results = list(con.execute(query, ("chr17", 43044294, 43048294)))

    control_query = genomicsqlite.genomic_range_rowids_sql(con, "exons")
    control_query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + control_query
    control_results = list(
        con.execute(control_query, ("chr17", 43044294, 43048294)))

    assert results == control_results

    for expl in con.execute(
            "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_",
        ("chr17", 43044294, 43048294),
    ):
        assert "USE TEMP B-TREE FOR ORDER BY" not in expl[3]

    assert next((expl[3] for expl in con.execute(
        "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_ DESC",
        ("chr17", 43044294, 43048294),
    ) if "USE TEMP B-TREE FOR ORDER BY" in expl[3]))

    with pytest.raises(sqlite3.OperationalError, match="domain error"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('exons', 'chr17', 43044294, 43048294, 16, 0)"
        )

    dbfile2 = str(tmp_path / "test2.gsql")
    con2 = genomicsqlite.connect(dbfile2, unsafe_load=True)
    _fill_exons(con2)
    con2.commit()
    con2.close()

    con.executescript(
        genomicsqlite.attach_sql(con, dbfile2, "db2", immutable=True))
    query = """
        SELECT main.exons.id, db2.exons.id
            FROM main.exons LEFT JOIN db2.exons ON
            db2.exons._rowid_ IN genomic_range_rowids('db2.exons', main.exons.rid, main.exons.beg, main.exons.end)
            AND main.exons.id != db2.exons.id ORDER BY main.exons.id, db2.exons.id
        """
    results = list(con.execute(query))
    assert len(results) == 5191

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('nonexistent', 'chr17', 43044294, 43048294)"
        )

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError, match="no such index"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)"
        )

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid",
                                                     "beg", "end"))
    results = list(
        con.execute(
            "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)"
        ))
    assert results == []
Пример #12
0
def test_level_detection():
    # test corner cases for the bit of genomic_range_rowids() which detects the level range of
    # extant features

    con = sqlite3.connect(":memory:")
    con.executescript(
        "CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)")
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "features", "rid",
                                                     "beg", "end"))

    def fanout(query):
        return sum(
            1 for expl in con.execute("EXPLAIN QUERY PLAN " +
                                      query, (None, None, None)) if
            "((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
            in expl[3])

    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16

    con.executescript(
        "INSERT INTO features VALUES(NULL, NULL, NULL); INSERT INTO features VALUES(NULL, 0, 10000000000)"
    )
    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16
    assert not list(
        con.execute(
            genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1],
            (None, 123, 456)))

    con.executescript("INSERT INTO features VALUES(42, 1048568, 1048584)")
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    print("\n" + query)
    assert "-1" in query and "-0x10)" in query
    assert not list(
        con.execute(
            genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1],
            (42, None, 1048584)))

    assert fanout(query) == 1

    con.executescript("""
        INSERT INTO features VALUES(44, 1048568, 1048584);
        INSERT INTO features VALUES(44, 0, 64000)
        """)
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    print("\n" + query)
    assert "-4," in query and "-0x10000)" in query
    assert "-3," in query and "-0x1000)" in query
    assert "-2," in query and "-0x100)" in query
    assert "-1," in query and "-0x10)" in query
    assert fanout(query) == 4

    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features",
                                               ceiling=6)[1:-1]) == 7
    assert (fanout(
        genomicsqlite.genomic_range_rowids_sql(con,
                                               "features",
                                               ceiling=6,
                                               floor=3)[1:-1]) == 4)

    con.executescript("""
        INSERT INTO features VALUES(43, NULL, 10000000000);
        INSERT INTO features VALUES(44, 0, NULL)
        """)
    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 4

    con.executescript("""
        INSERT INTO features VALUES(43, 0, 10000000000);
        INSERT INTO features VALUES(43, 32, 33)
        """)
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    print("\n" + query)
    assert fanout(query) == 10

    con.executescript("""
        INSERT INTO features VALUES(43, 0, 10000000000);
        INSERT INTO features VALUES(43, 32, 32)
        """)
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    assert fanout(query) == 10
    assert len(list(con.execute(query, (43, 32, 33)))) == 4
    assert len(list(con.execute(query, (43, 33, 33)))) == 3
Пример #13
0
def test_txdbquery(genomicsqlite_txdb):
    conn = genomicsqlite.connect(genomicsqlite_txdb, read_only=True)
    assert next(conn.execute("PRAGMA page_size"))[0] == 16384

    # one query
    results = list(t[0] for t in conn.execute(
        "SELECT tx_name FROM transcript WHERE _rowid_ IN " +
        genomicsqlite.genomic_range_rowids_sql(conn, "transcript") +
        " ORDER BY tx_name",
        ("chr12", 111803912, 111804012),
    ))
    print(results)
    assert results == sorted([
        "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1",
        "ENST00000549106.1"
    ])

    # random queries
    chroms = list(
        conn.execute(
            "SELECT tx_chrom, length FROM (SELECT tx_chrom, MAX(tx_end) AS length FROM transcript GROUP BY tx_chrom) WHERE length > 1000000"
        ))

    random.seed(0xBADF00D)
    for tbl in ("transcript", "cds"):
        query = genomicsqlite.genomic_range_rowids_sql(conn, tbl)[1:-1]
        fanout = 0
        for expl in conn.execute("EXPLAIN QUERY PLAN " + query,
                                 (None, None, None)):
            print(expl[3])
            if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                    in expl[3]):
                fanout += 1
        assert (tbl, fanout) in (("transcript", 5), ("cds", 3))

        pfx = "tx" if tbl == "transcript" else tbl
        control_query = f"SELECT _rowid_ FROM {tbl} NOT INDEXED WHERE {pfx}_chrom = ? AND NOT ({pfx}_end < ? OR {pfx}_start > ?) ORDER BY _rowid_"

        total_results = 0
        for _ in range(2000):
            chrom = random.choice(chroms)
            beg = random.randint(0, chrom[1] - 65536)
            end = beg + random.randint(1, random.choice([16, 256, 4096, 65536
                                                         ]))
            ids = list(row[0]
                       for row in conn.execute(query, (chrom[0], beg, end)))
            control_ids = list(
                row[0]
                for row in conn.execute(control_query, (chrom[0], beg, end)))
            assert ids == control_ids
            total_results += len(control_ids)
        assert total_results in (7341, 2660)

    # join exon to cds ("which exons are coding?")
    exon_cds_counts = (
        "SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds FROM exon LEFT JOIN cds ON cds._rowid_ IN "
        + genomicsqlite.genomic_range_rowids_sql(conn, "cds", "exon_chrom",
                                                 "exon_start", "exon_end") +
        " AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end) GROUP BY exon._rowid_"
    )
    print(exon_cds_counts)
    fanout = 0
    for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts):
        print(expl[3])
        if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                in expl[3]):
            fanout += 1
    assert fanout == 3
    cds_exon_count_hist = list(
        conn.execute(
            f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds"
        ))
    for elt in cds_exon_count_hist:
        print(elt)
    assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]