Пример #1
0
def test_get_location_from_rsid():
    """Check that an rsID resolves to its chromosome and position.

    The lookup is exercised twice: once right after building the index on
    the open file, and once by pointing a fresh ``GwasVcf`` at the existing
    ``.rsidx`` file through ``rsidx_path``.
    """
    with pygwasvcf.GwasVcf(FILE) as g:
        g.index_rsid()
        chrom, pos = g.get_location_from_rsid("rs10399793")
        # Verify the values that were actually returned; passing the
        # expected literals here would make this half of the test vacuous.
        check_first_row(chrom, pos)

    with pygwasvcf.GwasVcf(FILE, rsidx_path=FILE + ".rsidx") as g:
        chrom, pos = g.get_location_from_rsid("rs10399793")
        assert chrom == "1"
        assert pos == 49298
def test_get_nc_from_metadata():
    """get_nc returns 9 when given header metadata, and raises KeyError
    once the ``TotalCases`` entry has been removed from that metadata."""
    get_nc = pygwasvcf.VariantRecordGwasFuns.get_nc
    region = dict(contig=CHROM, start=START, stop=STOP)

    with pygwasvcf.GwasVcf(FILE) as vcf:
        header_meta = vcf.get_metadata()

        for record in vcf.query(**region):
            n_cases = get_nc(record, TRAIT, header_meta)
            assert n_cases == 9

        # Drop the header value so the lookup has nothing to read.
        del header_meta[TRAIT]['TotalCases']

        with pytest.raises(KeyError):
            for record in vcf.query(**region):
                get_nc(record, TRAIT, header_meta)
def test_get_ss_from_metadata():
    """get_ss returns 463001 + 9 with the full header metadata and 463001
    after ``TotalCases`` has been deleted from it."""
    get_ss = pygwasvcf.VariantRecordGwasFuns.get_ss
    region = dict(contig=CHROM, start=START, stop=STOP)
    size_without_cases = 463001
    size_with_cases = size_without_cases + 9

    with pygwasvcf.GwasVcf(FILE) as vcf:
        header_meta = vcf.get_metadata()

        for record in vcf.query(**region):
            sample_size = get_ss(record, TRAIT, header_meta)
            assert sample_size == size_with_cases

        del header_meta[TRAIT]['TotalCases']

        for record in vcf.query(**region):
            sample_size = get_ss(record, TRAIT, header_meta)
            assert sample_size == size_without_cases
def test_get_id_chrpos():
    """get_id falls back to a ``chrom-pos-ref-alt`` identifier.

    With the per-sample ``ID`` field present an identifier is returned.
    After deleting it, ``create_if_missing=False`` must raise KeyError and
    ``create_if_missing=True`` must yield ``"1-49298-T-C"``.
    """
    get_id = pygwasvcf.VariantRecordGwasFuns.get_id

    with pygwasvcf.GwasVcf(FILE) as vcf:
        for record in vcf.query(contig=CHROM, start=START, stop=STOP):
            stored_id = get_id(record, TRAIT, create_if_missing=False)
            assert stored_id is not None

            # Remove the stored identifier to force the fallback path.
            del record.samples[TRAIT]['ID']

            with pytest.raises(KeyError):
                assert get_id(record, TRAIT, create_if_missing=False)

            generated_id = get_id(record, TRAIT, create_if_missing=True)
            assert generated_id == "1-49298-T-C"
Пример #5
0
def test_get_metadata():
    """The header metadata has an entry for TRAIT carrying every expected
    summary field."""
    expected_fields = (
        "TotalVariants",
        "VariantsNotRead",
        "HarmonisedVariants",
        "VariantsNotHarmonised",
        "SwitchedAlleles",
        "TotalControls",
        "TotalCases",
        "StudyType",
    )

    with pygwasvcf.GwasVcf(FILE) as vcf:
        header_meta = vcf.get_metadata()
        assert TRAIT in header_meta
        # Checked in the listed order, one field at a time.
        for field in expected_fields:
            assert field in header_meta[TRAIT]
Пример #6
0
def test_index_rsid():
    """Build the rsID index from scratch and sanity-check its contents.

    Any stale ``.rsidx`` file is removed first so the test proves that
    ``index_rsid`` creates it. Every row of the ``rsid_to_coord`` table
    must then hold a non-null int, str and int in its first three columns.
    """
    # Local import keeps this block self-contained.
    from contextlib import closing

    index_path = FILE + ".rsidx"

    # delete old index if present
    if os.path.exists(index_path):
        os.remove(index_path)

    # index GWAS-VCF
    with pygwasvcf.GwasVcf(FILE) as g:
        g.index_rsid()

    # check index exists
    assert os.path.exists(index_path)

    # check contents of index
    # A sqlite3 connection used directly as a context manager only manages
    # the transaction; closing() makes sure the handle is released too.
    with closing(sqlite3.connect(index_path)) as dbconn:
        cur = dbconn.cursor()
        cur.execute("SELECT * FROM rsid_to_coord")

        # NOTE(review): columns are presumably (rsid number, chrom, pos) --
        # confirm against the index schema.
        for rec in cur.fetchall():
            assert rec[0] is not None
            assert isinstance(rec[0], int)
            assert rec[1] is not None
            assert isinstance(rec[1], str)
            assert rec[2] is not None
            assert isinstance(rec[2], int)
Пример #7
0
def read_vcf(fh, alleles, slh=None, trait=None):
    """Read a GWAS-VCF file into a DataFrame of per-variant Z scores.

    Args:
        fh: Path of the GWAS-VCF file, passed to ``pygwasvcf.GwasVcf``.
        alleles: When truthy, also emit ``A1`` (first ALT allele) and
            ``A2`` (REF allele) columns.
        slh: Optional path of a SNP list file with one identifier per
            line. When given, only rows whose ``SNP`` is listed are kept.
            The file is read through ``gzip`` when ``get_compression``
            reports ``"gzip"``, and as plain text otherwise.
        trait: Name of the trait (sample) to read. Defaults to the first
            trait in the file.

    Returns:
        A ``pandas.DataFrame`` with columns ``SNP``, ``Z`` and ``N``, plus
        ``A1`` and ``A2`` when ``alleles`` is truthy.

    Raises:
        AssertionError: If ``trait`` is given but not present in the file.
        ValueError: If reading the SNP list fails with an
            ``AttributeError`` or ``ValueError``.
    """
    funs = pygwasvcf.VariantRecordGwasFuns

    with pygwasvcf.GwasVcf(fh) as vcf_in:
        traits = vcf_in.get_traits()

        if trait is not None:
            assert trait in traits
        else:
            trait = traits[0]

        # get global field info from header
        metadata = vcf_in.get_metadata()

        # Both output shapes share the first three columns, so the records
        # are read by a single loop that uses query() in either case.
        rows = []
        for rec in vcf_in.query():
            row = [
                # rsid or chr-pos-ref-alt
                funs.get_id(rec, trait, create_if_missing=True),
                funs.get_beta(rec, trait) / funs.get_se(rec, trait),
                # if per-snp sample size unavailable then take from header
                funs.get_ss(rec, trait, metadata),
            ]
            if alleles:
                row.append(rec.alts[0])
                row.append(rec.ref)
            rows.append(row)

        columns = {
            'SNP': pd.Series([x[0] for x in rows], dtype='str'),
            'Z': pd.Series([x[1] for x in rows], dtype='float'),
            'N': pd.Series([x[2] for x in rows], dtype='float'),
        }
        if alleles:
            columns['A1'] = pd.Series([x[3] for x in rows], dtype='str')
            columns['A2'] = pd.Series([x[4] for x in rows], dtype='str')
        p = pd.DataFrame(columns)

    if slh is not None:
        opener = gzip.open if get_compression(slh) == "gzip" else open
        try:
            # Text mode matters for gzip: its default is binary, which
            # would yield bytes that never compare equal to the str SNP
            # column and silently filter out every row.
            with opener(slh, 'rt') as f:
                sl = [line.strip() for line in f]
        except (AttributeError, ValueError) as e:
            raise ValueError('Improperly formatted snplist file: ' +
                             str(e.args)) from e
        p = p.loc[p['SNP'].isin(sl)]

    return p
def test_get_nc():
    """Calling get_nc without header metadata must raise KeyError for
    every record in the test region."""
    get_nc = pygwasvcf.VariantRecordGwasFuns.get_nc

    with pygwasvcf.GwasVcf(FILE) as vcf:
        records = vcf.query(contig=CHROM, start=START, stop=STOP)
        for record in records:
            with pytest.raises(KeyError):
                assert get_nc(record, TRAIT)
def test_get_id_rsid():
    """get_id returns ``"rs10399793"`` for each record in the test
    region."""
    get_id = pygwasvcf.VariantRecordGwasFuns.get_id

    with pygwasvcf.GwasVcf(FILE) as vcf:
        records = vcf.query(contig=CHROM, start=START, stop=STOP)
        for record in records:
            variant_id = get_id(record, TRAIT)
            assert variant_id == "rs10399793"
def test_get_af():
    """get_af returns approximately 0.623765 for each record in the test
    region."""
    get_af = pygwasvcf.VariantRecordGwasFuns.get_af

    with pygwasvcf.GwasVcf(FILE) as vcf:
        records = vcf.query(contig=CHROM, start=START, stop=STOP)
        for record in records:
            allele_freq = get_af(record, TRAIT)
            assert allele_freq == pytest.approx(0.623765)
Пример #11
0
def test_query_by_rsid():
    """Querying by rsID after indexing yields exactly one row, and that
    row passes ``check_first_row``."""
    with pygwasvcf.GwasVcf(FILE) as g:
        g.index_rsid()
        # Start from zero so that an empty result fails the assertion
        # below instead of raising NameError on an unbound loop variable.
        n_rows = 0
        for n_rows, row in enumerate(g.query(variant_id="rs10399793"),
                                     start=1):
            check_first_row(row.chrom, row.pos)
        assert n_rows == 1
Пример #12
0
def test_query_by_chr_pos():
    """Querying contig "1" from 49297 to 49298 yields exactly one row, and
    that row passes ``check_first_row``."""
    with pygwasvcf.GwasVcf(FILE) as g:
        # Start from zero so that an empty result fails the assertion
        # below instead of raising NameError on an unbound loop variable.
        n_rows = 0
        for n_rows, row in enumerate(g.query(contig="1", start=49297,
                                             stop=49298), start=1):
            check_first_row(row.chrom, row.pos)
        assert n_rows == 1
Пример #13
0
def test_close():
    """is_closed() is False inside the ``with`` block, True after leaving
    it, and True for an instance that was never entered."""
    with pygwasvcf.GwasVcf(FILE) as managed:
        still_closed = managed.is_closed()
        assert not still_closed
    assert managed.is_closed()

    # Constructed but never used as a context manager.
    never_entered = pygwasvcf.GwasVcf(FILE)
    assert never_entered.is_closed()
Пример #14
0
def test_query_all():
    """An unrestricted query yields more than one row, and the first row
    passes ``check_first_row``."""
    with pygwasvcf.GwasVcf(FILE) as g:
        # Start from zero so that an empty result fails the assertion
        # below instead of raising NameError on an unbound loop variable.
        n_rows = 0
        for n_rows, row in enumerate(g.query(), start=1):
            if n_rows == 1:
                check_first_row(row.chrom, row.pos)
        assert n_rows > 1