Пример #1
0
def test_unused():
    """Keys appended to ``VEP.keys`` are reported by ``unused()`` and stay readable via ``effects``."""
    extra_keys = ['XXX', 'YYY']
    csq = 'missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|xval|yval'

    ann = VEP(csq, keys=VEP.keys + extra_keys)

    # The two trailing keys are expected back from unused().
    assert ann.unused() == extra_keys, ann.unused()
    # ...but their values must still be reachable through the effects mapping.
    assert ann.effects['XXX'] == 'xval'
Пример #2
0
def test_bug():
    """A missense annotation must sort after a non-coding splice-region annotation."""
    missense = VEP('missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding')
    splice_region = VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript")

    ranked = [missense, splice_region]
    ranked.sort()

    # The last element after sorting is the one the test expects to be missense.
    assert ranked[-1].so == 'missense_variant', ranked[-1].so
Пример #3
0
def test_unused():
    """A single key appended to ``VEP.keys`` is reported by ``unused()`` and readable via ``effects``."""
    extra_keys = ['YYY']
    csq = 'missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding|xval|yval'

    ann = VEP(csq, keys=VEP.keys + extra_keys)

    # The appended key is expected back from unused().
    assert ann.unused() == extra_keys, ann.unused()
    # NOTE(review): which trailing value lands on 'YYY' depends on how many
    # entries VEP.keys holds; confirm against the class definition.
    assert ann.effects['YYY'] == 'yval'
Пример #4
0
def test_splice():
    """Check the (is_coding, is_exonic, is_splicing) flags for two intronic annotations."""
    # Each case pairs an annotation string with the expected flag triple.
    cases = (
        (
            'splice_acceptor_variant&intron_variant&feature_truncation|||ENSG00000221978|CCNL2|ENST00000408918||||-/226|protein_coding|1',
            (False, False, True),
        ),
        (
            'intron_variant&feature_elongation|||ENSG00000187634|SAMD11|ENST00000341065||||-/589|protein_coding|1',
            (False, False, False),
        ),
    )

    for csq, expected in cases:
        e = VEP(csq)
        assert (e.is_coding, e.is_exonic, e.is_splicing) == expected
Пример #5
0
def test_gemini_issue812():
    """A ``protein_altering_variant`` annotation must be reported as coding."""
    csq_keys = "Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL".split("|")
    csq = 'protein_altering_variant|caGCAGCAGCAGCAGCAACAGCAG/caA|QQQQQQQQ/Q|ENSG00000204842|ATXN2|ENST00000608853|1/25|||14-21/1153|protein_coding|'

    ann = VEP(csq, keys=csq_keys)

    assert ann.is_coding
Пример #6
0
def test_bug_vcf2db_21():
    """``codon_change`` is read correctly when the annotation has more fields than keys."""
    csq_keys = "Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL|DOMAINS|CLIN_SIG".split("|")
    csq = 'synonymous_variant|tcA/tcG|S|ENSG00000186092|OR4F5|ENST00000335137|1/1|||60/305|protein_coding||Low_complexity_(Seg):seg&Transmembrane_helices:TMhelix&Prints_domain:PR00237&Superfamily_domains:SSF81321&Gene3D:1.20.1070.10&hmmpanther:PTHR26451&hmmpanther:PTHR26451:SF72&PROSITE_profiles:PS50262||||ENST00000335137.3:c.180A>G|ENST00000335137.3:c.180A>G(p.%3D)|||-0.817044|0.039'

    ann = VEP(csq, keys=csq_keys)

    assert ann.codon_change == "tcA/tcG", ann.codon_change
Пример #7
0
def test_32():
    """Every non-intronic annotation of this frameshift/start-lost record must be HIGH impact."""
    keys = "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|REFSEQ_MATCH|SOURCE|GIVEN_REF|USED_REF|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|SpliceRegion".split("|")
    # One comma-separated CSQ value holding four per-transcript annotations.
    csq_value = "-|frameshift_variant&start_lost&start_retained_variant|HIGH|HRNR|ENSG00000197915|Transcript|ENST00000368801|protein_coding|2/3||ENST00000368801.2:c.1del|ENSP00000357791.2:p.Met1?|77/9623|1/8553|1/2850|M/X|Atg/tg|rs34061715&COSM111478||-1||deletion|HGNC|HGNC:20846|YES|1|P1|CCDS30859.1|ENSP00000357791|Q86YZ3||UPI00001D7CAD||Ensembl|T|T||||||0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||,-|intron_variant&non_coding_transcript_variant|MODIFIER|FLG-AS1|ENSG00000237975|Transcript|ENST00000420707|antisense_RNA||1/8|ENST00000420707.5:n.159-25632del|||||||rs34061715&COSM111478||1||deletion|HGNC|HGNC:27913||5||||||||Ensembl|T|T|||||10|0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||,-|intron_variant&non_coding_transcript_variant|MODIFIER|FLG-AS1|ENSG00000237975|Transcript|ENST00000593011|antisense_RNA||1/3|ENST00000593011.5:n.296+54843del|||||||rs34061715&COSM111478||1||deletion|HGNC|HGNC:27913||4||||||||Ensembl|T|T|||||10|0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||,-|frameshift_variant&start_lost&start_retained_variant|HIGH|HRNR|388697|Transcript|NM_001009931.2|protein_coding|2/3||NM_001009931.2:c.1del|NP_001009931.1:p.Met1?|80/9632|1/8553|1/2850|M/X|Atg/tg|rs34061715&COSM111478||-1||deletion|EntrezGene|HGNC:20846|YES||||NP_001009931.1||||rseq_mrna_match|RefSeq|T|T||||||0.874|0.7337|0.8818|0.9544|0.9592|0.8875|||0.9028|0.7227|0.8276|0.9554|0.9063|0.9541|0.9411|0.9142|0.9069|0.9592|EUR||0&1|0&1|||||||||"

    for e in csq_value.split(","):
        # Every annotation is parsed, including the intronic ones.
        eff = VEP(e, keys=keys)
        if "intron" in e.lower():
            continue
        assert eff.impact_severity == "HIGH", (eff.impact_severity, e)
Пример #8
0
def test_canonical_order():
    """With ``prioritize_canonical=True`` the canonical annotation must sort last."""
    canonical = VEP(
        "intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene|*",
        prioritize_canonical=True)

    # Sort a copy so the shared EFFECTS fixture is left untouched.
    ranked = sorted(EFFECTS + [canonical])
    lowest, highest = ranked[0], ranked[-1]

    assert highest.is_canonical
    assert lowest.impact_severity == "LOW"
    assert not lowest.is_canonical
Пример #9
0
def test_veps():
    """Parse every line of the bundled VEP fixture and sanity-check the derived attributes."""
    path = os.path.join(HERE, "vep-csqs.txt.gz")
    with gzip.open(path, "rt") as handle:
        for raw in handle:
            csq = VEP(raw.strip())

            assert csq.severity in (1, 2, 3)
            assert csq.is_pseudogene in (True, False)
            assert csq.coding in (True, False)

            polyphen = csq.polyphen_value
            assert polyphen is None or isinstance(polyphen, float)

            # Attribute access only: it must not raise.
            csq.gene

            sift = csq.sift_value
            assert sift is None or isinstance(sift, float)
Пример #10
0
def test_weird_vep():
    """Annotations with missing or ``?`` consequences must still yield a valid impact severity.

    Each entry in ``csqs`` is a comma-separated CSQ value; every individual
    annotation is parsed with the explicit key list below.
    """
    keys = "Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE|CANONICAL|CCDS|RadialSVM_score|RadialSVM_pred|LR_score|LR_pred|CADD_raw|CADD_phred|Reliability_index".split(
        "|")

    csqs = [
        "?|||117581|TWIST2|NM_001271893.1|1/1||||protein_coding|YES||||||||,non_coding_transcript_exon_variant&non_coding_transcript_variant|||117581|TWIST2|NM_001271893.1_dupl8|1/1||||mRNA|||||||||",
        "non_coding_transcript_exon_variant&non_coding_transcript_variant|||117581|TWIST2|NM_001271893.1_dupl8|1/1||||mRNA|||||||||,?|||117581|TWIST2|NM_001271893.1|1/1||||protein_coding|YES||||||||",
        "?|||115286|SLC25A26|NM_173471.3|1/1||||protein_coding|YES||||||||",
        "|||ENSG00000138190|EXOC6|ENST00000260762||||-/804|protein_coding,|||ENSG00000138190|EXOC6|ENST00000371547||||-/820|protein_coding,|||ENSG00000138190|EXOC6|ENST00000443748||||-/701|protein_coding,NMD_transcript_variant|||ENSG00000138190|EXOC6|ENST00000495132||||-/404|nonsense_mediated_decay,|||ENSG00000138190|EXOC6|ENST00000371552||||-/799|protein_coding",
        "|||ENSG00000013503|POLR3B|ENST00000539066||||-/1075|protein_coding,nc_transcript_variant|||ENSG00000013503|POLR3B|ENST00000549195|||||processed_transcript,|||ENSG00000013503|POLR3B|ENST00000549569||||-/170|protein_coding,|||ENSG00000013503|POLR3B|ENST00000228347||||-/1133|",
        "|||ENSG00000147202|DIAPH2|ENST00000373054||||-/1097|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000355827||||-/1096|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000324765||||-/1101|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000373049||||-/1096|protein_coding,|||ENSG00000147202|DIAPH2|ENST00000373061||||-/1101|protein_coding",
    ]
    # The unused function-local `import sys` was dropped: nothing here reads it.
    for cs in csqs:
        for c in cs.split(","):
            v = VEP(c, keys)
            assert v.impact_severity in ('LOW', 'MEDIUM', 'HIGH')
Пример #11
0
def test_vep():
    """Parse one missense annotation and check each derived attribute."""
    csq = 'missense_variant|tTt/tGt|F/C|ENSG00000186092|OR4F5|ENST00000335137|1/1|possibly_damaging(0.568)|deleterious(0)|113/305|protein_coding'
    ann = VEP(csq)

    # Identity of the annotated feature.
    assert ann.gene == 'OR4F5'
    assert ann.transcript == 'ENST00000335137'
    assert ann.aa_change == "F/C", ann.aa_change

    # Consequence classification.
    assert ann.consequences == ['missense_variant']
    assert ann.coding
    assert ann.biotype == "protein_coding"
    assert ann.severity == 2
    # NOTE(review): test_weird_vep accepts only 'LOW'/'MEDIUM'/'HIGH' here;
    # confirm which spelling the VEP class actually returns.
    assert ann.impact_severity == "MED", ann.impact_severity
    assert not ann.is_pseudogene

    # Predictor fields are split into a numeric value and a class label.
    assert ann.polyphen_value == 0.568, ann.polyphen
    assert ann.polyphen_class == "possibly_damaging", ann.polyphen
    assert ann.sift_value == 0.0, ann.sift
    assert ann.sift_class == "deleterious", ann.sift
Пример #12
0
		gen = record.genotype(s).gt_type
		if gen:
			samples_gen_counts[gen] += 1
		else:
			samples_gen_counts[3] += 1
	samples_gen_counts[4] = float(samples_gen_counts[1] + 2*samples_gen_counts[2])/2*sum(samples_gen_counts[0:3])
	return samples_gen_counts

if __name__ == '__main__':
	# Script entry point: scan a VEP-annotated VCF, collect every annotation
	# whose LoF field equals 'HC', group the hits per transcript and print one
	# tab-separated line per hit: symbol, feature, record ID, sample MAFs.
	#vcf_fname = sys.argv[1]
	vcf_fname = "/Users/dashazhernakova/Documents/Doby/GenomeRussia/60samples/ps+nov+yak+papers.filtered.from_russia.rsids.VEP.coding.vcf.gz"

	sample_groups = ['Pskov', 'Novgorod', 'Yakut', 'Simons', 'Pagani', 'all']
	# defaultdict must be *called* with the factory; the original subscripted
	# the class (defaultdict[list]), which never yields a usable mapping.
	transcr_dict = defaultdict(list)

	# The context manager closes the file once all records have been read.
	with open(vcf_fname) as vcf_file:
		vcf_reader = vcf.Reader(vcf_file)
		# The last space-separated token of the CSQ header description is
		# split on '|' to obtain the annotation field names.
		VEP.keys = vcf_reader.infos['CSQ'].desc.split(" ")[-1].split("|")

		for record in vcf_reader:
			if not record.ID:
				# Fall back to a positional identifier for unnamed variants.
				record.ID = record.CHROM + ":" + str(record.POS)
			for annot in record.INFO['CSQ']:
				vep = VEP(annot)
				if vep.effects['LoF'] != 'HC':
					continue
				sample_mafs = fillSampleMAFs(sample_groups, record)
				# NOTE(review): VEP headers usually spell this key 'SYMBOL';
				# confirm 'Symbol' matches the CSQ header of the input file.
				transcr_dict[vep.effects['Feature'] + ":" + vep.effects['Symbol']].append([record.ID, sample_mafs])

	for transcr, snps in transcr_dict.items():
		parts = transcr.split(":")
		feature, symbol = parts[0], parts[1]
		for snp in snps:
			# Join all columns in a single call so every field is tab-separated
			# (the original glued the record ID onto the transcript), and
			# stringify each item because str.join rejects non-strings.
			# NOTE(review): sample_mafs is printed via str(); adjust if a
			# per-group column layout is wanted.
			print("\t".join([symbol, feature] + [str(field) for field in snp]))
Пример #13
0
            csq.gene
            assert isinstance(csq.sift_value, float) or csq.sift_value is None


def test_snpeffs():
    """Parse every line of the bundled SnpEff fixture and sanity-check the derived attributes."""
    path = os.path.join(HERE, "snpeff-anns.txt.gz")
    with gzip.open(path, "rt") as handle:
        for raw in handle:
            csq = SnpEff(raw.strip())

            assert csq.severity in (1, 2, 3)
            assert csq.is_pseudogene in (True, False)
            assert csq.coding in (True, False)
            # The test expects no PolyPhen value on any SnpEff annotation.
            assert csq.polyphen_value is None


EFFECTS = [
    VEP("upstream_gene_variant|||ENSG00000223972|DDX11L1|ENST00000456328|||||processed_transcript"
        ),
    VEP("downstream_gene_variant|||ENSG00000227232|WASH7P|ENST00000488147|||||unprocessed_pseudogene"
        ),
    VEP("non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"
        ),
    VEP("non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"
        ),
    VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"
        ),
    VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"
        ),
    VEP("splice_region_variant&non_coding_exon_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000456328|2/3||||processed_transcript"
        ),
    VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene"
        ),
    VEP("intron_variant&nc_transcript_variant|||ENSG00000223972|DDX11L1|ENST00000450305|||||transcribed_unprocessed_pseudogene"
Пример #14
0
# Stream a VCF from stdin to stdout, keeping header lines and only those
# records that have at least one functional annotation and whose max_aaf_all
# (when present) does not exceed 0.001.
for i, line in enumerate(sys.stdin):
    if line[0] == "#":
        # Header lines pass through untouched; the CSQ INFO header also
        # provides the keys used to parse each annotation below.
        print(line, end="")
        if "<ID=CSQ," in line:
            csq_keys = get_csq_keys(line)
        continue

    # Progress report on stderr every 1000 input lines.
    if not i % 1000:
        print("filter: %d" % i, file=sys.stderr)

    # INFO is the eighth tab-separated column of a VCF record.
    info = line.rstrip().split("\t")[7]

    # Cut out the CSQ value: everything after 'CSQ=' up to the next ';'.
    # str.index raises ValueError when the record carries no CSQ entry.
    csq_start = info.index('CSQ=') + 4
    csq_value = info[csq_start:].split(";")[0]

    veps = [VEP(c, keys=csq_keys) for c in csq_value.split(",")]
    if not any(isfunctional(v) for v in veps):
        continue

    if 'max_aaf_all=' in info:
        aaf_field = info.split('max_aaf_all=')[1].split(";")[0]
        aaf_values = aaf_field.split(",")
        if max(float(value) for value in aaf_values) > 0.001:
            print("skipping because of max_aaf_all:", line, file=sys.stderr)
            continue

    print(line, end="")
    sys.stdout.flush()