def test_multi_open2(self):
    """Re-opening an already opened ReadVCF must yield the same datalines."""
    from idiva.io.vcf import ReadVCF

    with ReadVCF.open(vcf_file) as outer:
        assert isinstance(outer, ReadVCF)
        first_pass = [str(line) for line in outer]
        # Hand the ReadVCF object itself (not the path) back to `open`.
        with ReadVCF.open(outer) as inner:
            second_pass = [str(line) for line in inner]
        self.assertListEqual(first_pass, second_pass)
def test_meta_accuracy(self):
    """Compare the `meta` attribute parsed from `vcf_file` with a hand-written dict."""
    from idiva.io.vcf import ReadVCF

    expected_info = {
        "NS": {"Number": 1, "Type": "Integer", "Description": '"Number of Samples With Data"'},
        "DP": {"Number": 1, "Type": "Integer", "Description": '"Total Depth"'},
        "AF": {"Number": None, "Type": "Float", "Description": '"Allele Frequency"'},
        "AA": {"Number": 1, "Type": "String", "Description": '"Ancestral Allele"'},
        "DB": {"Number": 0, "Type": "Flag", "Description": '"dbSNP membership, build 129"'},
        "H2": {"Number": 0, "Type": "Flag", "Description": '"HapMap2 membership"'},
    }
    expected_filter = {
        "q10": {"Description": '"Quality below 10"'},
        "s50": {"Description": '"Less than 50% of samples have data"'},
    }
    expected_format = {
        "GT": {"Number": 1, "Type": "String", "Description": '"Genotype"'},
        "GQ": {"Number": 1, "Type": "Integer", "Description": '"Genotype Quality"'},
        "DP": {"Number": 1, "Type": "Integer", "Description": '"Read Depth"'},
        "HQ": {"Number": 2, "Type": "Integer", "Description": '"Haplotype Quality"'},
    }
    expected_meta = {
        "INFO": expected_info,
        "FILTER": expected_filter,
        "FORMAT": expected_format,
        "fileformat": "VCFv4.0",
        "fileDate": "20090805",
        "source": "myImputationProgramV3.1",
        "reference": "1000GenomesPilot-NCBI36",
        "phasing": "partial",
    }

    with open(vcf_file, mode='r') as fd:
        parsed_meta = ReadVCF(fd).meta
        self.assertDictEqual(parsed_meta, expected_meta)
def test_open_read_vcf_meta(self):
    """
    Open the ClinVar 'vcf_37' file, check it exposes no `sample_ids`
    attribute and print its header.

    The test is unfinished: it always ends by raising NotImplementedError.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF

    with clinvar_open(which='vcf_37') as fd:
        reader = ReadVCF(fd)
        has_sample_ids = hasattr(reader, "sample_ids")
        assert not has_sample_ids
        print(reader.header)
        raise NotImplementedError
def test_howto(self):
    """Usage walk-through: wrap the ClinVar file, read `meta`, iterate the datalines."""
    from idiva.db import clinvar_open
    from idiva.io.vcf import ReadVCF

    with clinvar_open() as fd:
        reader = ReadVCF(fd)
        # Attribute access only; the value is deliberately not used.
        reader.meta
        for record in reader:
            # Each iteration step yields one dataline.
            record
def samples_column(cls, fd):
    """
    Check that every sample entry of every dataline parses as a genotype pair.

    Reads `fd` through ReadVCF and passes each element of `dataline.samples`
    to `parse_gt`, expecting a result that unpacks into exactly two values.

    Raises:
        RuntimeError: if `parse_gt` fails or its result is not a pair. The
            underlying error is attached as `__cause__`.
    """
    from idiva.io.vcf import parse_gt

    for dataline in ReadVCF(fd):
        for gt in dataline.samples:
            try:
                # Unpacking doubles as a check that exactly two values come back.
                (_, _) = parse_gt(gt)
            except Exception as err:
                # `except Exception` rather than a bare `except`, so that
                # KeyboardInterrupt / SystemExit are not reported as parse errors.
                raise RuntimeError(F"Could not parse genotype: {gt}") from err
def test_dataline_types(self):
    """Check the types of the first dataline obtained via `ReadVCF.open` and of its fields."""
    from idiva.io.vcf import ReadVCF, RawDataline

    with ReadVCF.open(vcf_file) as vcf:
        head = first(vcf)
        self.assertIsInstance(head, RawDataline)
        # Attributes are looked up one at a time, after the class check above.
        for attr, expected_type in (("pos", int), ("qual", float), ("info", str)):
            self.assertIsInstance(getattr(head, attr), expected_type)
def test_dataline_types(self):
    """Check the types of the first dataline read from a plain file handle and of its fields."""
    from idiva.io.vcf import ReadVCF, RawDataline

    with open(vcf_file, mode='r') as fd:
        head = first(ReadVCF(fd))
        self.assertIsInstance(head, RawDataline)
        # Attributes are looked up one at a time, after the class check above.
        for attr, expected_type in (("pos", int), ("qual", float), ("info", str)):
            self.assertIsInstance(getattr(head, attr), expected_type)
def test_reads_all_lines(self):
    """
    For every entry of PATHS, the number of items yielded by ReadVCF must
    equal the number of lines `readlines()` returns for the same handle.
    """
    from idiva.utils import seek_then_rewind

    for key in PATHS:
        with open_maybe_gz(PATHS[key], mode='r') as fd:
            vcf = ReadVCF(fd)

            # Count raw lines; the context manager restores the position afterwards.
            with seek_then_rewind(fd, seek=None):
                line_count = len(fd.readlines())

            with vcf.rewind_when_done:
                item_count = len(list(vcf))

            self.assertEqual(item_count, line_count)
def test_count(self):
    """Download each file in URLS and compare its dataline count with a fixed reference."""
    from idiva.io.vcf import ReadVCF, RawDataline

    # ref_len_v1 = {'ctrl': 2329288, 'case': 2360972}
    ref_len_v2 = {'ctrl': 2227080, 'case': 2258797}

    for group in URLS:
        with download(URLS[group]).now.open(mode='rb') as raw_fd:
            # Re-wrap the binary handle as text.
            with open_maybe_gz(raw_fd, mode='r') as text_fd:
                assert isinstance(text_fd, io.TextIOBase)
                nlines = 0
                for __ in ReadVCF(text_fd):
                    nlines += 1
                # print(F"Group {group} has {nlines} datalines")
                self.assertEqual(nlines, ref_len_v2[group])
def test_clinvar_df(self):
    """
    Check the dataframe built from the ClinVar 'vcf_37' file.

    Verifies the row count against REF_LENGTHS['clinvar_df'], that every
    row's CLNVC is 'single_nucleotide_variant', that CLNVC has no nulls
    and that an 'OMIM_id' column exists.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.db.clinvar import clinvar_to_df

    with clinvar_open(which='vcf_37') as fd:
        df = clinvar_to_df(ReadVCF(fd))

    self.assertEqual(len(df), REF_LENGTHS['clinvar_df'])
    # Compare the column element-wise. `all()` over a DataFrame iterates its
    # column labels (always truthy here), so it would never fail.
    # NOTE(review): assumes clinvar_to_df keeps only SNV rows -- confirm.
    self.assertTrue((df['CLNVC'] == 'single_nucleotide_variant').all())
    self.assertFalse(df['CLNVC'].isnull().values.any())
    self.assertIn('OMIM_id', df.columns)
def test_length_clinvar(self):
    """
    Iterate all datalines of the ClinVar 'vcf_37' file and compare the
    index of the last one with REF_LENGTHS['clinvar_csv'].

    NOTE(review): the compared value is the last zero-based index from
    `enumerate`, i.e. count - 1; confirm the reference is stored that way.
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from tqdm import tqdm

    # Start below any valid index so that an empty file fails the assertion
    # instead of raising NameError on an unbound loop variable.
    idx = -1
    with clinvar_open(which='vcf_37') as fd:
        vcf = ReadVCF(fd)
        for idx, _ in tqdm(enumerate(vcf.datalines), postfix='reading clinvar file'):
            pass

    self.assertEqual(idx, REF_LENGTHS['clinvar_csv'])
def test_datalines_accuracy(self):
    """Compare the string form of every dataline in `vcf_file` with fixed reference lines."""
    from idiva.io.vcf import ReadVCF

    expected_lines = [
        "20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.",
        "20 17330 None T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3",
        "20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4",
        "20 1230237 None T None 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2",
        "20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3",
    ]

    with open(vcf_file, mode='r') as fd:
        actual_lines = [str(dataline) for dataline in ReadVCF(fd)]
        self.assertListEqual(expected_lines, actual_lines)
def test_rewinds(self):
    """Two passes over the same ReadVCF, each under `rewind_when_done`, must agree."""
    from idiva.io.vcf import ReadVCF

    with ReadVCF.open(vcf_file) as vcf:
        assert isinstance(vcf, ReadVCF)

        passes = []
        for _ in range(2):
            with vcf.rewind_when_done:
                passes.append([str(dataline) for dataline in vcf])

        first_pass, second_pass = passes
        self.assertListEqual(first_pass, second_pass)
def ref_alt_columns(cls, fd):
    """
    Check REF against ALT for every dataline read from `fd`.

    When REF is not a genomic string (per `is_genomic_string`), REF and
    every comma-separated ALT entry must be one of the `<KEY>` symbols
    built from the keys of `meta['ALT']`. Nothing is asserted otherwise.
    """
    vcf = ReadVCF(fd)
    symbolic = {F"<{k}>" for k in vcf.meta['ALT'].keys()}

    for record in vcf:
        ref_value = record.ref
        alt_values = record.alt.split(',')

        if is_genomic_string(ref_value):
            # Cannot assume:
            # assert all(is_genomic_string(a) for a in alt)
            continue

        assert ref_value in symbolic
        assert all((a in symbolic) for a in alt_values)
def maker():
    """
    Build a table with one row per rs-ID and its joined CLNSIG values.

    Reads the ClinVar datalines into a dataframe, keeps the RS / CLNSIG
    columns (renamed to ID / ClnSig), retains rows whose ID matches
    ``^rs[0-9]+$`` and aggregates ClnSig per ID into one quoted string.
    """
    from idiva.io.vcf import ReadVCF

    def join_distinct(values):
        # Distinct values as strings, sorted, comma-separated, wrapped in quotes.
        return F'"{", ".join(sorted(set(map(str, values))))}"'

    with clinvar_open(which) as fd:
        table = pd.DataFrame(data=(clinvar_datalines(ReadVCF(fd))))

    table = table[["RS", "CLNSIG"]].rename(columns={'RS': "ID", 'CLNSIG': "ClnSig"})
    # Missing IDs become '' so the pattern match yields False rather than NaN.
    has_rs_id = table.ID.fillna('').str.contains(r"^rs[0-9]+$")
    per_id = table[has_rs_id].groupby('ID', as_index=False)
    return per_id.agg({'ClnSig': join_distinct})
def alt_column(cls, fd):
    """
    Check the ALT column of every dataline read from `fd`.

    A dataline passes when at least one of its comma-separated ALT entries
    is a single nucleotide, consists only of the letters T/C/G/A, or is a
    `<KEY>` symbol built from the keys of `meta['ALT']`.

    Raises:
        RuntimeError: for the first dataline none of whose ALT entries fit;
            ALT and REF are printed beforehand.
    """
    vcf = ReadVCF(fd)
    nucleotides = {"T", "C", "G", "A"}
    symbolic = {F"<{k}>" for k in vcf.meta['ALT'].keys()}

    def fits_known_format(alt):
        # single nt / multi nt / special
        return (alt in nucleotides) or set(alt).issubset(nucleotides) or (alt in symbolic)

    for record in vcf:
        # NOTE(review): one fitting entry is enough for the whole line to pass.
        if any(fits_known_format(alt) for alt in record.alt.split(',')):
            continue
        print(F"ALT = '{record.alt}' does not fit any known format.")
        print(F"REF = '{record.ref}'.")
        raise RuntimeError("Assumption on ALT column failed.")
def ref_column(cls, fd):
    """
    Check the REF column of every dataline read from `fd`.

    REF must contain no comma and must be a single nucleotide, consist
    only of the letters T/C/G/A, or be a `<KEY>` symbol built from the
    keys of `meta['ALT']`.

    Raises:
        AssertionError: if REF contains a comma.
        RuntimeError: if REF fits none of the formats; REF and ALT are
            printed beforehand.
    """
    vcf = ReadVCF(fd)
    nucleotides = {"T", "C", "G", "A"}
    symbolic = {F"<{k}>" for k in vcf.meta['ALT'].keys()}

    for record in vcf:
        ref_value = record.ref
        assert "," not in ref_value

        is_single_nt = ref_value in nucleotides
        is_multi_nt = set(ref_value).issubset(nucleotides)
        is_symbolic = ref_value in symbolic
        if is_single_nt or is_multi_nt or is_symbolic:
            continue

        print(F"REF = '{record.ref}' does not fit any known format.")
        print(F"ALT = '{record.alt}'.")
        raise RuntimeError("Assumption on REF column failed.")
def create_dbSNP_df(dbSNP_file_path: Path, out_base: Path, which_chrom: typing.Union[int, str] = 17) -> None:
    """
    Convert the dbSNP VCF file to a dataframe and write it as a gzipped CSV.

    The output goes to
    ``out_base / 'GRCh37_latest_dbSNP_all_chrom{which_chrom}.csv.gz'``.

    Args:
        dbSNP_file_path: path of the dbSNP VCF file; opened in text mode.
        out_base: directory receiving the output file; must already exist.
        which_chrom: chromosome selector. The value '_all' is handed to
            `dbSNP_to_df` as 'NC'; any other value as 'NC_' followed by
            the value zero-padded to six characters.

    Raises:
        AssertionError: if `out_base` does not exist.
    """
    out_path = out_base / f'GRCh37_latest_dbSNP_all_chrom{which_chrom}.csv.gz'
    # Log the resolved path; the message used to contain the path expression as literal text.
    log.info(f"Converting {dbSNP_file_path} to {out_path}")
    print(out_path)
    assert out_base.exists()

    chrom_selector = 'NC' if which_chrom == '_all' else f'NC_{str(which_chrom).zfill(6)}'
    with open(dbSNP_file_path, mode='r') as fd:
        df = dbSNP_to_df(ReadVCF(fd), which_chrom=chrom_selector)

    df.to_csv(out_path, index=False, compression="gzip")
    if not len(df):
        log.warning(f'created dataframe is empty for chrom {which_chrom}')
def test_open_read_vcf_datalines(self):
    """Compare the first three datalines of the ClinVar 'vcf_37' file with fixed references."""
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.io.vcf import RawDataline

    expected_lines = [
        "1 865568 846933 G A . . ALLELEID=824438;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865568G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1",
        "1 865583 972363 C T . . ALLELEID=959431;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865583C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1",
        "1 865628 789256 G A . . AF_ESP=0.00347;AF_EXAC=0.00622;AF_TGP=0.00280;ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.865628G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=41285790",
    ]

    with clinvar_open(which='vcf_37') as fd:
        vcf = ReadVCF(fd)

        datalines: typing.List[RawDataline]
        datalines = list(at_most_n(vcf, n=len(expected_lines)))

        self.assertIsInstance(datalines[0], RawDataline)

        actual_lines = [str(dataline) for dataline in datalines]
        self.assertListEqual(expected_lines, actual_lines)

        # REF of the three reference lines, in order.
        for dataline, expected_ref in zip(datalines, ('G', 'C', 'G')):
            self.assertEqual(dataline.ref, expected_ref)
def test_db_clf(self):
    """Run `phenomenet_classifier` on the 'case' file only and expect a non-empty `df`."""
    with ReadVCF.open(URLS['case']) as case_vcf:
        outcome = phenomenet_classifier(case=case_vcf)
        row_count = len(outcome.df)
        self.assertTrue(row_count)
def test_db_clf(self):
    """Run `db_classifier` on the 'case' file with no control and expect a non-empty `df`."""
    with ReadVCF.open(URLS['case']) as case_vcf:
        outcome = db_classifier(case=case_vcf, ctrl=None)
        row_count = len(outcome.df)
        self.assertTrue(row_count)
def test_sanity2(self):
    """Smoke test: open `vcf_file`, re-open the resulting ReadVCF and exhaust it."""
    from idiva.io.vcf import ReadVCF

    with ReadVCF.open(vcf_file) as outer, ReadVCF.open(outer) as inner:
        for _ in inner:
            pass
def test_sanity(self):
    """Smoke test: constructing a ReadVCF from an open text handle must not raise."""
    from idiva.io.vcf import ReadVCF

    with open(vcf_file, mode='r') as handle:
        ReadVCF(handle)
def id_is_unique(cls, fd):
    """Assert that no `id` value occurs on more than one dataline read from `fd`."""
    import pandas as pd

    collected_ids = []
    for record in ReadVCF(fd):
        collected_ids.append(record.id)

    id_series = pd.Series(collected_ids)
    assert id_series.is_unique
def test_read_case(self): with PATHS['case'].open(mode='r') as fd: candidate = str(list(at_most_n(ReadVCF(fd), 10)).pop()) reference = "17 186 rs547289895 G A 100 PASS AC=1;AF=0.000199681;AN=5008;NS=2504;DP=18075;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0" self.assertEqual(reference, candidate)
def test_phenom_basic(self):
    """Run `phenomenet_classifier_basic` on the 'case' file with no control and expect a non-empty `df`."""
    with ReadVCF.open(URLS['case']) as case_vcf:
        outcome = phenomenet_classifier_basic(case=case_vcf, ctrl=None)
        row_count = len(outcome.df)
        self.assertTrue(row_count)
        log.info('passed!')
def test_phenom_clf(self):
    """Run `phenomenet_classifier` on the 'case' and 'ctrl' files and expect a non-empty `df`."""
    with ReadVCF.open(URLS['case']) as case_vcf:
        with ReadVCF.open(URLS['ctrl']) as ctrl_vcf:
            outcome = phenomenet_classifier(case=case_vcf, ctrl=ctrl_vcf)
            row_count = len(outcome.df)
            self.assertTrue(row_count)
            log.info('passed!')
def format_is_gt(cls, fd):
    """Assert that the `format` field of every dataline read from `fd` equals "GT"."""
    # `all` stops at the first mismatch, just like asserting inside a loop.
    assert all(record.format == "GT" for record in ReadVCF(fd))