Пример #1
0
def test_gvcf_dv(tmp_path):
    dbfile = str(tmp_path / "dv_gvcf_lines.gsql")
    vcf_lines_into_sqlite(
        os.path.join(HERE, "data/NA12878.dv0.8.0.chr21.g.vcf.gz"), str(dbfile))
    rows = 962897
    con = genomicsqlite.connect(dbfile, read_only=True)
    assert next(con.execute("SELECT COUNT(*) FROM vcf_lines"))[0] == rows
Пример #2
0
def test_bam(tmp_path):
    bamfile = os.path.join(HERE, "data/NA12878.chr21:20000000-22500000.bam")
    dbfile = str(tmp_path / "test.bam.sqlite")

    sam_into_sqlite(bamfile, str(dbfile), "--table-prefix", "NA12878_")
    genomicsqlite._compact(str(dbfile),
                           ["--inner-page-KiB", "64", "--outer-page-KiB", "2"])

    con = sqlite3.connect(f"file:{dbfile}.compact?mode=ro", uri=True)
    assert next(con.execute("PRAGMA page_size"))[0] == 2048
    con.close()

    con = sqlite3.connect(f"file:{dbfile}.compact?mode=ro&vfs=zstd", uri=True)
    assert next(con.execute("PRAGMA page_size"))[0] == 65536
    con.close()

    con = genomicsqlite.connect(f"{dbfile}.compact", read_only=True)

    count = next(con.execute("SELECT COUNT(*) FROM NA12878_reads"))[0]
    assert count == 592861

    count = next(
        con.execute("SELECT COUNT(DISTINCT qname) FROM NA12878_reads_seqs"))[0]
    assert count == 299205

    mq_hist = dict(
        con.execute("""
            SELECT mq, COUNT(*) as count FROM
                (SELECT ifnull(json_extract(tags_json, '$.MQ'),0) AS mq
                FROM NA12878_reads NATURAL JOIN NA12878_reads_tags WHERE (flag & 3840) = 0)
            GROUP BY mq ORDER BY mq DESC
            """))
    assert (mq_hist[0], mq_hist[60]) == (2734, 520522)
Пример #3
0
def genomicsqlite_txdb(txdb):
    """
    import bioC TxDb sourced originally from:
    http://bioconductor.org/packages/release/BiocViews.html#___TxDb
    """
    outfile = txdb[:-7] + ".genomicsqlite"
    conn = sqlite3.connect(txdb)
    conn.executescript(genomicsqlite.vacuum_into_sql(conn, outfile))
    conn.close()
    # create GRIs on the three feature tables
    conn = genomicsqlite.connect(outfile)
    conn.executescript(
        genomicsqlite.create_genomic_range_index_sql(conn,
                                                     "transcript",
                                                     "tx_chrom",
                                                     "tx_start",
                                                     "tx_end",
                                                     floor=2))
    conn.executescript(
        genomicsqlite.create_genomic_range_index_sql(conn,
                                                     "cds",
                                                     "cds_chrom",
                                                     "cds_start",
                                                     "cds_end",
                                                     floor=2))
    # intentionally left exon unindexed
    conn.close()
    return outfile
Пример #4
0
def test_attach(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con, gri=False)
    con.commit()

    attach_script = genomicsqlite.attach_sql(con,
                                             str(tmp_path /
                                                 "test_attached.gsql"),
                                             "db2",
                                             unsafe_load=True)
    con.executescript(attach_script)
    con.executescript("CREATE TABLE db2.exons2 AS SELECT * FROM exons")
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "db2.exons2", "rid",
                                                     "beg", "end"))
    ref_script = genomicsqlite.put_reference_assembly_sql(
        con, "GRCh38_no_alt_analysis_set", schema="db2")
    con.executescript(ref_script)

    query = (
        "SELECT exons.id, db2.exons2.id FROM exons LEFT JOIN db2.exons2 ON db2.exons2._rowid_ IN\n"
        + genomicsqlite.genomic_range_rowids_sql(
            con, "db2.exons2", "exons.rid", "exons.beg", "exons.end") +
        " AND exons.id != db2.exons2.id ORDER BY exons.id, db2.exons2.id")
    results = list(con.execute(query))
    assert len(results) == 5191

    refseq_by_name = genomicsqlite.get_reference_sequences_by_name(
        con, schema="db2")
    assert len(refseq_by_name) > 24
Пример #5
0
def test_pvcf_gatk(tmp_path):
    dbfile = str(tmp_path / "gatk_pvcf_lines.gsql")
    vcf_lines_into_sqlite(os.path.join(HERE, "data/gatk.1KGP.ALDH2.vcf.gz"),
                          str(dbfile))
    rows = 2088
    con = genomicsqlite.connect(dbfile, read_only=True)
    assert next(con.execute("SELECT COUNT(*) FROM vcf_lines"))[0] == rows
Пример #6
0
def test_pvcf_glnexus(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    vcf_into_sqlite(
        os.path.join(HERE, "data/dv_glnexus.1KGP.ALDH2.vcf.gz"),
        str(dbfile),
    )
    rows = 1993
    samples = 2504
    con = genomicsqlite.connect(dbfile, read_only=True)
    assert next(con.execute("SELECT COUNT(*) FROM variants")) == (rows, )
    assert next(con.execute("SELECT COUNT(*) FROM genotypes")) == (rows *
                                                                   samples, )
    assert next(con.execute("SELECT SUM(DP) FROM genotypes")) == (134178640, )
    assert round(next(
        con.execute("SELECT SUM(QUAL) FROM variants"))[0]) == 118146
    assert next(con.execute("SELECT SUM(_gri_lvl) FROM variants")) == (-158, )
    assert list(
        con.execute(
            "SELECT GT1, GT2, COUNT(*) AS ct FROM genotypes GROUP BY GT1, GT2 ORDER BY ct DESC LIMIT 8"
        )) == [
            (0, 0, 4840519),
            (0, 1, 75297),
            (1, 1, 33338),
            (None, None, 32971),
            (0, 2, 2780),
            (1, 2, 1109),
            (0, 3, 979),
            (2, 2, 554),
        ]
Пример #7
0
def test_web():
    conn = genomicsqlite.connect(
        "https://github.com/mlin/sqlite_zstd_vfs/releases/download/web-test-db-v1/TxDb.Hsapiens.UCSC.hg38.knownGene.vacuum.genomicsqlite",
        read_only=True,
    )

    results = list(t[0] for t in conn.execute(
        "SELECT tx_name FROM transcript WHERE _rowid_ IN " +
        genomicsqlite.genomic_range_rowids_sql(conn, "transcript") +
        " ORDER BY tx_name",
        ("chr12", 111803912, 111804012),
    ))
    print(results)
    sys.stdout.flush()
    assert results == sorted([
        "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1",
        "ENST00000549106.1"
    ])

    exon_cds_counts = """
        SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds
        FROM genomic_range_index_levels('cds'), exon LEFT JOIN cds
            ON cds._rowid_ IN genomic_range_rowids('cds', exon_chrom, exon_start, exon_end, _gri_ceiling, _gri_floor)
            AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end)
        GROUP BY exon._rowid_
        """
    for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts):
        print(expl[3])
    sys.stdout.flush()
    cds_exon_count_hist = list(
        conn.execute(
            f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds"
        ))
    assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]
Пример #8
0
def test_gri_levels_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    results = list(
        con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')")

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError,
                       match="missing genomic range index"):
        con.execute(
            "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')"
        )

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid",
                                                     "beg", "end"))
    results = list(
        con.execute(
            "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')"
        ))
    assert results == [(15, 0)]
Пример #9
0
def test_gvcf_hc(tmp_path):
    dbfile = str(tmp_path / "gatk_gvcf_lines.gsql")
    vcf_lines_into_sqlite(
        os.path.join(HERE, "data/hc.NA12878.chr22:25000000-30000000.g.vcf.gz"),
        str(dbfile))
    rows = 823481
    con = genomicsqlite.connect(dbfile, read_only=True)
    assert next(con.execute("SELECT COUNT(*) FROM vcf_lines"))[0] == rows
Пример #10
0
def test_pvcf_glnexus(tmp_path):
    dbfile = str(tmp_path / "glnexus_pvcf_lines.gsql")
    vcf_lines_into_sqlite(
        os.path.join(HERE, "data/dv_glnexus.1KGP.ALDH2.vcf.gz"), str(dbfile))
    rows = 1994
    con = genomicsqlite.connect(dbfile, read_only=True)
    assert next(con.execute("SELECT COUNT(*) FROM vcf_lines")) == (rows, )
    assert next(con.execute("SELECT SUM(rlen) FROM vcf_lines")) == (2560, )
Пример #11
0
def test_connect(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    con.executescript(
        genomicsqlite.put_reference_assembly_sql(con,
                                                 "GRCh38_no_alt_analysis_set"))
    _fill_exons(con)
    con.commit()
    del con

    con = genomicsqlite.connect(dbfile, read_only=True)
    query = (
        "WITH exons2 AS (SELECT * from exons) SELECT exons.id, exons2.id FROM exons2 LEFT JOIN exons ON exons._rowid_ IN\n"
        + genomicsqlite.genomic_range_rowids_sql(con, "exons", "exons2.rid",
                                                 "exons2.beg", "exons2.end") +
        " AND exons.id != exons2.id ORDER BY exons.id, exons2.id")
    results = list(con.execute(query))
    assert len(results) == 5191
Пример #12
0
def test_pvcf_gatk(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    vcf_into_sqlite(os.path.join(HERE, "data/gatk.1KGP.ALDH2.vcf.gz"),
                    str(dbfile))
    rows = 2087
    samples = 2504
    con = genomicsqlite.connect(dbfile, read_only=True)
    assert next(con.execute("SELECT COUNT(*) FROM variants"))[0] == rows
    assert next(
        con.execute("SELECT COUNT(*) FROM genotypes"))[0] == rows * samples
Пример #13
0
def test_bundled_extensions():
    con = genomicsqlite.connect(":memory:")

    con.executescript("""
        CREATE TABLE test(json TEXT);
        INSERT INTO test(json) VALUES('{"chrom":"chr10"}');
        INSERT INTO test(json) VALUES('{"chrom":"chr2"}');
        INSERT INTO test(json) VALUES('{"chrom":"chrMT"}');
        """)

    results = con.execute(
        "SELECT json_extract(json,'$.chrom') AS chrom FROM test ORDER BY chrom COLLATE UINT"
    )
    assert list(results) == [("chr2", ), ("chr10", ), ("chrMT", )]
Пример #14
0
def test_dna_revcomp():
    con = genomicsqlite.connect(":memory:")

    assert next(con.execute("SELECT dna_revcomp('AGCTagct')"))[0] == "agctAGCT"
    assert next(con.execute("SELECT dna_revcomp('gATtaCa')"))[0] == "tGtaATc"
    assert next(con.execute("SELECT dna_revcomp('')"))[0] == ""
    assert next(con.execute("SELECT dna_revcomp(NULL)"))[0] is None

    with pytest.raises(sqlite3.OperationalError):
        con.execute("SELECT dna_revcomp('GATTACAb')")

    with pytest.raises(sqlite3.OperationalError):
        con.execute("SELECT dna_revcomp('GATTACA ')")

    with pytest.raises(sqlite3.OperationalError):
        con.execute("SELECT dna_revcomp(42)")
Пример #15
0
def test_twobit_random():
    con = genomicsqlite.connect(":memory:")

    random.seed(42)
    for seqlen in (random.randint(2, 1250) for _ in range(5000)):
        rna = random.choice((False, True))

        nucs = (
            ("a", "A", "g", "G", "c", "C", "u", "U")
            if rna
            else ("a", "A", "g", "G", "c", "C", "t", "T")
        )
        seq = "".join(random.choice(nucs) for _ in range(seqlen))
        assert (rna and ("t" not in seq and "T" not in seq)) or (
            not rna and ("u" not in seq and "U" not in seq)
        )

        crumbs = next(con.execute("SELECT nucleotides_twobit(?)", (seq,)))[0]
        assert isinstance(crumbs, bytes)
        assert len(crumbs) == math.ceil(len(seq) / 4) + 1

        assert next(con.execute("SELECT twobit_length(nucleotides_twobit(?))", (seq,)))[0] == len(
            seq
        )

        query = f"SELECT {'twobit_rna' if rna else 'twobit_dna'}(nucleotides_twobit(?))"
        decoded = next(con.execute(query, (seq,)))[0]
        assert decoded == seq.upper()

        # test built-in substr
        sub_ofs = random.randint(0, seqlen)
        sub_len = random.randint(0, seqlen)
        decoded = next(con.execute("SELECT twobit_dna(?,?,?)", (seq, sub_ofs, sub_len)))[0]
        control = next(con.execute("SELECT substr(twobit_dna(?),?,?)", (seq, sub_ofs, sub_len)))[0]
        assert decoded == control

        # test with negative offset/length -- https://sqlite.org/lang_corefunc.html#substr
        decoded = next(con.execute("SELECT twobit_dna(?,?,?)", (seq, 0 - sub_ofs, sub_len)))[0]
        control = next(
            con.execute("SELECT substr(twobit_dna(?),?,?)", (seq, 0 - sub_ofs, sub_len))
        )[0]
        assert decoded == control
        decoded = next(con.execute("SELECT twobit_dna(?,?,?)", (seq, sub_ofs, 0 - sub_len)))[0]
        control = next(
            con.execute("SELECT substr(twobit_dna(?),?,?)", (seq, sub_ofs, 0 - sub_len))
        )[0]
        assert decoded == control
Пример #16
0
def test_gnomad_sites_small(tmp_path):
    dbfile = str(tmp_path / "test.gsql")

    vcf_into_sqlite(
        os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"),
        str(dbfile),
        "--table-prefix",
        "gnomad_",
    )

    con = genomicsqlite.connect(dbfile, read_only=True)

    query = (
        "SELECT variant_rowid, id_jsarray FROM (SELECT _gri_rid AS rid FROM _gri_refseq WHERE gri_refseq_name=?1) AS query, gnomad_variants WHERE variant_rowid IN"
        + genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants",
                                                 "query.rid"))
    rs671 = ("chr12", 111803912, 111804012)
    print(query)

    indexed = 0
    for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671):
        print(expl[3])
        if "USING INDEX gnomad_variants__gri" in expl[3]:
            indexed += 1
    assert indexed == 3
    opcodes = list(con.execute("EXPLAIN " + query, rs671))
    accessed_cursors = list(opcode[2] for opcode in opcodes
                            if opcode[1] == "Column")
    table_rootpages = set(row[0] for row in con.execute(
        "SELECT rootpage FROM sqlite_master WHERE type='table'"))
    table_cursors = set(
        opcode[2] for opcode in opcodes
        if opcode[1] == "OpenRead" and opcode[3] in table_rootpages)
    assert sum(
        1 for cursor in accessed_cursors if cursor not in table_cursors) > 1
    assert sum(1 for cursor in accessed_cursors
               if cursor in table_cursors) == 1

    results = list(con.execute(query, rs671))
    results_rowids = set(vt[0] for vt in results)
    assert next(vt for vt in results if vt[1] and "rs671" in vt[1])

    control = "SELECT variant_rowid FROM gnomad_variants NATURAL JOIN _gri_refseq WHERE gri_refseq_name = ? AND NOT ((pos+rlen) < ? OR pos > ?)"
    control_rowids = set(vt[0] for vt in con.execute(control, rs671))
    assert len(control_rowids) == 22
    assert results_rowids == control_rowids
Пример #17
0
def test_gnomad_sites_small(tmp_path):
    dbfile = str(tmp_path / "gnomad_lines.gsql")

    vcf_lines_into_sqlite(
        os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"),
        str(dbfile),
        "--table",
        "gnomad_vcf_lines",
    )

    con = genomicsqlite.connect(dbfile, read_only=True)
    query = ("SELECT _rowid_, line FROM gnomad_vcf_lines WHERE _rowid_ IN " +
             genomicsqlite.genomic_range_rowids_sql(con, "gnomad_vcf_lines"))
    rs671 = ("chr12", 111803912, 111804012)
    print(query)

    indexed = 0
    for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671):
        print(expl[3])
        if "USING INDEX gnomad_vcf_lines__gri" in expl[3]:
            indexed += 1
    assert indexed == 3
    opcodes = list(con.execute("EXPLAIN " + query, rs671))
    accessed_cursors = list(opcode[2] for opcode in opcodes
                            if opcode[1] == "Column")
    table_rootpages = set(row[0] for row in con.execute(
        "SELECT rootpage FROM sqlite_master WHERE type='table'"))
    table_cursors = set(
        opcode[2] for opcode in opcodes
        if opcode[1] == "OpenRead" and opcode[3] in table_rootpages)
    assert sum(
        1 for cursor in accessed_cursors if cursor not in table_cursors) > 1
    assert sum(1 for cursor in accessed_cursors
               if cursor in table_cursors) == 1

    results = list(con.execute(query, rs671))
    results_rowids = set(vt[0] for vt in results)
    assert next(vt for vt in results if vt[1] and "rs671" in vt[1])

    control = "SELECT rowid FROM gnomad_vcf_lines WHERE NOT ((POS+rlen) < ? OR POS > ?)"
    control_rowids = set(vt[0]
                         for vt in con.execute(control, (rs671[1], rs671[2])))
    assert len(control_rowids) == 22
    assert results_rowids == control_rowids
Пример #18
0
def test_gri_levels_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    # test caching & invalidation:
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')"))
    assert results == [(3, 1)]
    tch1 = con.total_changes
    con.execute("INSERT INTO exons VALUES('ether',0,4097,4097,'ether')")
    tch2 = con.total_changes
    assert tch2 > tch1
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(4, 1)]
    con.commit()
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(4, 1)]
    con.execute("DELETE FROM exons WHERE rid = 'ether'")
    con.commit()
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')"))
    assert results == [(3, 1)]
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')")

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError, match="missing genomic range index"):
        con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')")

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")
    )
    results = list(
        con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')")
    )
    assert results == [(15, 0)]
def test_parse_genomic_range():
    con = genomicsqlite.connect(":memory:")
    for (txt, chrom, begin_pos, end_pos) in [
        ("chr1:2,345-06,789", "chr1", 2344, 6789),
        ("π:1-9,223,372,036,854,775,799", "π", 0, 9223372036854775799),
    ]:
        assert next(
            con.execute("SELECT parse_genomic_range_sequence(?)",
                        (txt, )))[0] == chrom
        assert next(con.execute("SELECT parse_genomic_range_begin(?)",
                                (txt, )))[0] == begin_pos
        assert next(con.execute("SELECT parse_genomic_range_end(?)",
                                (txt, )))[0] == end_pos

    for txt in [
            "",
            ":",
            "-",
            ":-",
            ":1-2",
            "chr1",
            "chr1:0-1",
            "chr1:1,234",
            "chr1:1,234-",
            "chr1:1,234-5,67",
            "chr1 :2,345-06,789",
            "chr1:2,345-06,789\t",
            "chr1:2345-deadbeef",
            "chr1:1-9,223,372,036,854,775,800",
    ]:
        with pytest.raises(sqlite3.OperationalError) as exc:
            con.execute("SELECT parse_genomic_range_sequence(?)", (txt, ))
        assert "parse_genomic_range():" in str(exc.value)
        with pytest.raises(sqlite3.OperationalError):
            con.execute("SELECT parse_genomic_range_begin(?)", (txt, ))
        assert "parse_genomic_range():" in str(exc.value)
        with pytest.raises(sqlite3.OperationalError):
            con.execute("SELECT parse_genomic_range_end(?)", (txt, ))
        assert "parse_genomic_range():" in str(exc.value)

    assert next(con.execute("SELECT parse_genomic_range_end(NULL)"))[0] is None
Пример #20
0
def test_twobit_column():
    # test populating a column with mixed BLOB and TEXT values
    con = genomicsqlite.connect(":memory:")

    con.executescript("CREATE TABLE test(test_twobit BLOB)")
    for elt in list("Tu") + ["foo", "bar", "gATuaCa"]:
        con.execute("INSERT INTO test(test_twobit) VALUES(nucleotides_twobit(?))", (elt,))

    column = list(con.execute("SELECT test_twobit FROM test"))
    assert isinstance(column[0][0], bytes), str([type(x[0]) for x in column])
    assert isinstance(column[-1][0], bytes)
    assert isinstance(column[-2][0], str)
    assert column[-2][0] == "bar"

    assert list(con.execute("SELECT twobit_dna(test_twobit) FROM test")) == [
        ("T",),
        ("T",),
        ("foo",),
        ("bar",),
        ("GATTACA",),
    ]
Пример #21
0
def test_twobit_corner_cases():
    con = genomicsqlite.connect(":memory:")

    for nuc in "AGCTagct":
        assert next(con.execute("SELECT length(nucleotides_twobit(?))", (nuc,)))[0] == 1
        assert (
            next(con.execute("SELECT twobit_dna(nucleotides_twobit(?))", (nuc,)))[0] == nuc.upper()
        )
    assert next(con.execute("SELECT nucleotides_twobit('')"))[0] == ""
    assert next(con.execute("SELECT nucleotides_twobit('acgt 1')"))[0] == "acgt 1"
    assert next(con.execute("SELECT twobit_dna('acgt 1')"))[0] == "acgt 1"
    assert next(con.execute("SELECT twobit_dna('acgt 1',1,6)"))[0] == "acgt 1"
    assert next(con.execute("SELECT twobit_dna('acgt 1',3,3)"))[0] == "gt "
    assert next(con.execute("SELECT twobit_dna('acgt 1',-2,-3)"))[0] == "cgt"

    # exhaustively test offset/length corner cases
    for xtest in range(-9, 9):
        for ytest in range(-9, 9):
            decoded = next(
                con.execute("SELECT twobit_rna(nucleotides_twobit('gattaca'),?,?)", (xtest, ytest))
            )[0]
            control = next(con.execute("SELECT substr('GAUUACA',?,?)", (xtest, ytest)))[0]
            assert decoded == control, str((xtest, ytest))
Пример #22
0
def test_bam(tmp_path):
    bamfile = os.path.join(HERE, "data/NA12878.chr21:20000000-22500000.bam")
    dbfile = str(tmp_path / "test.bam.sqlite")

    sam_into_sqlite(bamfile, str(dbfile), "--table-prefix", "NA12878_")

    con = genomicsqlite.connect(dbfile, read_only=True)

    count = next(con.execute("SELECT COUNT(*) FROM NA12878_reads"))[0]
    assert count == 592861

    count = next(
        con.execute("SELECT COUNT(DISTINCT qname) FROM NA12878_reads_seqs"))[0]
    assert count == 299205

    mq_hist = dict(
        con.execute("""
        SELECT mq, COUNT(*) as count FROM
            (SELECT ifnull(json_extract(tags_json, '$.MQ'),0) AS mq
             FROM NA12878_reads NATURAL JOIN NA12878_reads_tags WHERE (flag & 3840) = 0)
        GROUP BY mq ORDER BY mq DESC
    """))
    assert (mq_hist[0], mq_hist[60]) == (2734, 520522)
Пример #23
0
def test_txdbquery(genomicsqlite_txdb):
    conn = genomicsqlite.connect(genomicsqlite_txdb, read_only=True)
    assert next(conn.execute("PRAGMA page_size"))[0] == 16384

    # one query
    results = list(t[0] for t in conn.execute(
        "SELECT tx_name FROM transcript WHERE _rowid_ IN " +
        genomicsqlite.genomic_range_rowids_sql(conn, "transcript") +
        " ORDER BY tx_name",
        ("chr12", 111803912, 111804012),
    ))
    print(results)
    assert results == sorted([
        "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1",
        "ENST00000549106.1"
    ])

    # random queries
    chroms = list(
        conn.execute(
            "SELECT tx_chrom, length FROM (SELECT tx_chrom, MAX(tx_end) AS length FROM transcript GROUP BY tx_chrom) WHERE length > 1000000"
        ))

    random.seed(0xBADF00D)
    for tbl in ("transcript", "cds"):
        query = genomicsqlite.genomic_range_rowids_sql(conn, tbl)[1:-1]
        fanout = 0
        for expl in conn.execute("EXPLAIN QUERY PLAN " + query,
                                 (None, None, None)):
            print(expl[3])
            if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                    in expl[3]):
                fanout += 1
        assert (tbl, fanout) in (("transcript", 5), ("cds", 3))

        pfx = "tx" if tbl == "transcript" else tbl
        control_query = f"SELECT _rowid_ FROM {tbl} NOT INDEXED WHERE {pfx}_chrom = ? AND NOT ({pfx}_end < ? OR {pfx}_start > ?) ORDER BY _rowid_"

        total_results = 0
        for _ in range(2000):
            chrom = random.choice(chroms)
            beg = random.randint(0, chrom[1] - 65536)
            end = beg + random.randint(1, random.choice([16, 256, 4096, 65536
                                                         ]))
            ids = list(row[0]
                       for row in conn.execute(query, (chrom[0], beg, end)))
            control_ids = list(
                row[0]
                for row in conn.execute(control_query, (chrom[0], beg, end)))
            assert ids == control_ids
            total_results += len(control_ids)
        assert total_results in (7341, 2660)

    # join exon to cds ("which exons are coding?")
    exon_cds_counts = (
        "SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds FROM exon LEFT JOIN cds ON cds._rowid_ IN "
        + genomicsqlite.genomic_range_rowids_sql(conn, "cds", "exon_chrom",
                                                 "exon_start", "exon_end") +
        " AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end) GROUP BY exon._rowid_"
    )
    print(exon_cds_counts)
    fanout = 0
    for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts):
        print(expl[3])
        if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                in expl[3]):
            fanout += 1
    assert fanout == 3
    cds_exon_count_hist = list(
        conn.execute(
            f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds"
        ))
    for elt in cds_exon_count_hist:
        print(elt)
    assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]
Пример #24
0
def main():
    header = f"""\
        -- GenomicSQLite smoke test --
        timestamp: {time.asctime(time.gmtime())}
        platform: {platform.platform()}
        uname: {os.uname()}
        python: {platform.python_implementation()} {platform.python_version()}"""
    print(textwrap.dedent(header))

    print("cpu", end="")
    try:
        if platform.system() == "Linux":
            with open("/proc/cpuinfo") as cpuinfo:
                modelname = (line.strip().replace("\t", "") for line in cpuinfo
                             if line.lower().strip().startswith("model name"))
                print(" " + next(modelname, ": ???"))
        elif platform.system() == "Darwin":
            sysctl = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                check=True,
                universal_newlines=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if sysctl.returncode == 0 and sysctl.stdout.strip():
                print(": " + sysctl.stdout.strip())
            else:
                print(": ???")
        else:
            print(": ???")
    except Exception:
        print(": ???")

    env_keys = [
        k for k in os.environ
        if ("genomicsqlite" in k.lower()) or k in ("PATH", "PYTHONPATH",
                                                   "LD_LIBRARY_PATH",
                                                   "DYLD_LIBRARY_PATH")
    ]
    if env_keys:
        print("environment:")
        for k in env_keys:
            print(f"  {k}={os.environ[k]}")

    print("sqlite3: ", end="")
    import sqlite3

    conn = sqlite3.connect(":memory:")
    print(next(conn.execute("SELECT sqlite_version()"))[0])

    print("genomicsqlite: ", end="")
    try:
        import genomicsqlite
    except ImportError as exn:
        print(f"\n\nUnable to import genomicsqlite: {exn}")
        print(
            "The Python genomicsqlite package may need to be installed via pip3 or conda,"
        )
        print("or its location may need to be added to PYTHONPATH.")
        sys.exit(1)

    print(genomicsqlite.__version__)
    print(f"genomicsqlite library: {genomicsqlite._DLL}")

    dbfile = os.path.join(os.environ.get("TMPDIR", "/tmp"),
                          f"genomicsqlite_smoke_test.{time.monotonic()}")
    print(f"\ntest database: {dbfile}")
    try:
        dbconn = genomicsqlite.connect(dbfile)
        for (table, bed) in (("exons1", _EXONS1), ("exons2", _EXONS2)):
            dbconn.execute(
                f"CREATE TABLE {table}(chrom TEXT, pos INTEGER, end INTEGER, id TEXT PRIMARY KEY)"
            )
            for line in bed.strip().split("\n"):
                line = line.split("\t")
                dbconn.execute(
                    f"INSERT INTO {table}(chrom,pos,end,id) VALUES(?,?,?,?)",
                    (line[0], int(line[1]) - 1, int(line[2]), line[3]),
                )
            dbconn.executescript(
                genomicsqlite.create_genomic_range_index_sql(
                    dbconn, table, "chrom", "pos", "end"))
        dbconn.commit()

        query = (
            "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 ON exons2._rowid_ IN\n"
            + textwrap.indent(
                genomicsqlite.genomic_range_rowids_sql(
                    dbconn, "exons2", "exons1.chrom", "exons1.pos",
                    "exons1.end"),
                " ",
            ) + "\nORDER BY exons1.id, exons2.id")
        print("GRI query:\n" + textwrap.indent(query, "  "))
        print("GRI query plan:")
        gri_constraints = 0
        for expl in dbconn.execute("EXPLAIN QUERY PLAN " + query):
            print("  " + expl[3])
            if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                    in expl[3]):
                gri_constraints += 1

        results = list(dbconn.execute(query))
        control = "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons1.pos OR exons2.pos > exons1.end) ORDER BY exons1.id, exons2.id"
        control_results = list(dbconn.execute(control))
        assert len(control_results) == 1139
        assert results == control_results

        if gri_constraints != 3:
            print("GRI query opcodes:")
            for expl in dbconn.execute("EXPLAIN " + query):
                print("  " + str(expl))

            print(
                "\n** WARNING: GRI yielded correct results, but with a possibly suboptimal query plan."
            )
            print(
                "** Please redirect this log to a file and send to maintainers @ https://github.com/mlin/GenomicSQLite\n"
            )

            sys.exit(2)

        dbconn.close()
        print("\nGenomicSQLite smoke test OK =)\n")
    finally:
        os.remove(dbfile)
Пример #25
0
def test_query_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    query = "SELECT id FROM exons WHERE exons._rowid_ IN genomic_range_rowids('exons',?,?,?)"
    results = list(con.execute(query, ("chr17", 43044294, 43048294)))

    control_query = genomicsqlite.genomic_range_rowids_sql(con, "exons")
    control_query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + control_query
    control_results = list(
        con.execute(control_query, ("chr17", 43044294, 43048294)))

    assert results == control_results

    for expl in con.execute(
            "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_",
        ("chr17", 43044294, 43048294),
    ):
        assert "USE TEMP B-TREE FOR ORDER BY" not in expl[3]

    assert next((expl[3] for expl in con.execute(
        "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_ DESC",
        ("chr17", 43044294, 43048294),
    ) if "USE TEMP B-TREE FOR ORDER BY" in expl[3]))

    with pytest.raises(sqlite3.OperationalError, match="domain error"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('exons', 'chr17', 43044294, 43048294, 16, 0)"
        )

    dbfile2 = str(tmp_path / "test2.gsql")
    con2 = genomicsqlite.connect(dbfile2, unsafe_load=True)
    _fill_exons(con2)
    con2.commit()
    con2.close()

    con.executescript(
        genomicsqlite.attach_sql(con, dbfile2, "db2", immutable=True))
    query = """
        SELECT main.exons.id, db2.exons.id
            FROM main.exons LEFT JOIN db2.exons ON
            db2.exons._rowid_ IN genomic_range_rowids('db2.exons', main.exons.rid, main.exons.beg, main.exons.end)
            AND main.exons.id != db2.exons.id ORDER BY main.exons.id, db2.exons.id
        """
    results = list(con.execute(query))
    assert len(results) == 5191

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('nonexistent', 'chr17', 43044294, 43048294)"
        )

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError, match="no such index"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)"
        )

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid",
                                                     "beg", "end"))
    results = list(
        con.execute(
            "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)"
        ))
    assert results == []