def parse_hgvs_muts_file(hgvs_muts_file, raise_exception=True):
    """Parse a text file holding one HGVS name per line.

    Args:
        hgvs_muts_file: Path of the file to read. Each line (minus its
            trailing newline) is handed to ``hgvs.HGVSName``.
        raise_exception: When true, the first ``hgvs.InvalidHGVSName`` is
            propagated to the caller. When false, invalid names are
            reported on stderr and skipped.

    Returns:
        A list of ``hgvs.HGVSName`` objects in file order.

    Raises:
        hgvs.InvalidHGVSName: If a line cannot be parsed and
            ``raise_exception`` is true.
    """
    hgvs_muts = []
    # The context manager guarantees the handle is closed, even when a
    # parse error propagates out of the loop.
    with open(hgvs_muts_file) as handle:
        for raw_line in handle:
            name = raw_line.rstrip('\n')
            try:
                hgvs_muts.append(hgvs.HGVSName(name))
            except hgvs.InvalidHGVSName:
                if raise_exception:
                    raise
                sys.stderr.write("Invalid HGVS found: {}\n".format(name))
    return hgvs_muts
def convertGenomicPosToTranscriptPos(genomicPos, chrom, genome, transcript):
    """
    Translate a genomic position into its transcript (cDNA) position.

    genomicPos: position on the genome
    chrom: chromosome name in the format "chrN"
    genome: SequenceFileDB for the genome
    transcript: pyhgvs transcript object

    Returns the transcript position at the given genomic position as a string.
    """
    # The ref/alt alleles do not influence the resulting transcript position,
    # so fixed placeholders ("T" -> "A") are used to build the HGVS name.
    formatted_name = pyhgvs.format_hgvs_name(
        chrom, genomicPos, "T", "A", genome, transcript)

    # Re-parse the formatted name and keep only its cDNA start coordinate.
    parsed_name = pyhgvs.HGVSName(str(formatted_name))
    return str(parsed_name.cdna_start)
def test_apply_hgvs(self):
    """Applying c.3908dupT to BRCA1 turns Leu1303 into Phe (p.Leu1303Phefs)."""
    transcript_id = "ENST00000357654"
    records = SeqIO.parse("tests/test_data/BRCA1_transcripts.fa", "fasta")
    transcripts = SeqIO.to_dict(records)
    brca1_mut = hgvs.HGVSName("ENST00000357654:c.3908dupT")

    # Reference protein: residue 1303 (index 1302) is a leucine.
    reference_cdna = transcripts[transcript_id].seq
    reference_protein = reference_cdna.translate()
    assert_equals("L", reference_protein[1302])

    # After the duplication the T at c.3908 appears twice in a row.
    mutated_cdna = verify.apply_hgvs(reference_cdna, brca1_mut)
    assert_equals("TT", mutated_cdna[3907:3909])

    # The resulting frameshift starts with a phenylalanine at that residue.
    mutated_protein = mutated_cdna.translate()
    assert_equals("F", mutated_protein[1302])
def parseVar(variantHGVS):
    '''
    Parse a variant given in HGVS notation.

    The argument is converted to a string before parsing.
    Returns a dictionary with the keys:
      typeHGVS - HGVS kind of the parsed name
      varRef   - reference allele
      varAlt   - alternate allele
      varType  - mutation type
    '''
    parsed = hgvs.HGVSName(str(variantHGVS))
    return dict(
        typeHGVS=parsed.kind,
        varRef=parsed.ref_allele,
        varAlt=parsed.alt_allele,
        varType=parsed.mutation_type,
    )
def get_hgvs_mutations(self, transcript_id, ignore_introns=True):
    """Collect the parsed HGVS coding changes that belong to one transcript.

    Reads ``self.df`` and parses every value of its
    ``HGVS_coding_DNA_change`` column, skipping the placeholder
    ``"Exception_encountered"``.

    Args:
        transcript_id: Transcript to keep. A parsed name matches when its
            transcript equals this value either as-is or after its last
            ``.suffix`` (the version) has been removed.
        ignore_introns: When true, rows whose ``Variant_Classification`` is
            ``"Intron"`` are excluded before parsing.

    Returns:
        A list of ``hgvs.HGVSName`` objects for the matching rows.
        Values that cannot be parsed are reported on stderr and skipped.
    """
    if ignore_introns:
        df = self.df[self.df.Variant_Classification != "Intron"]
    else:
        df = self.df

    hgvs_muts = []
    for r in df.HGVS_coding_DNA_change:
        if r == "Exception_encountered":
            continue
        try:
            h = hgvs.HGVSName(r)
            # rpartition(".")[0] removes the trailing ".version" postfix and
            # is empty when the transcript contains no dot at all.
            if (h.transcript == transcript_id
                    or h.transcript.rpartition(".")[0] == transcript_id):
                hgvs_muts.append(h)
        except Exception:
            # Best effort: report the bad value and keep going, but let
            # KeyboardInterrupt / SystemExit propagate.
            sys.stderr.write("Invalid HGVS found: {}\n".format(r))
    return hgvs_muts
def fix_mutalyzer(row, coord_hash):
    """Build a 'variant___chrom' string for a row mutalyzer could not map.

    Args:
        row: Mapping with the keys 'Chromosomal Variant' and
            'Input Variant'.
        coord_hash: Nested lookup indexed as ``coord_hash[transcript][pos]``
            yielding ``(chrom, g_coord, c_nuc, strand)``.

    Returns:
        * ``<Chromosomal Variant> + '___?'`` when the row already has a
          chromosomal variant;
        * ``'XXX:g.<coord><ref>><alt>___<chrom>'`` when the input variant is
          a single exonic position found in ``coord_hash``;
        * ``'___'`` in every other case (unparsable name, excluded variant,
          failed lookup, or a position this function does not handle).
    """
    if not pandas.isnull(row['Chromosomal Variant']):
        return row['Chromosomal Variant'] + '___?'

    # else mutalyzer cannot handle
    try:
        hgvs_name = hgvs.HGVSName(row['Input Variant'])
    except Exception:
        # Best effort: an unparsable name yields the empty marker.
        return '___'

    nm = hgvs_name.transcript
    if not nm:
        # Fall back to the text before the first ':' of the raw name.
        nm = row['Input Variant'].split(':')[0]

    start = hgvs_name.cdna_start.coord
    start_offset = hgvs_name.cdna_start.offset
    end = hgvs_name.cdna_end.coord

    # Variants that are explicitly excluded from conversion.
    if row['Input Variant'] in ('NM_004985.4:c.*1638A>G',
                                'NM_001363.4:c.*6G>A',
                                'NM_004985.4:c.*2591A>G',
                                'NM_004985.4:c.*2888A>G',
                                'NM_004985.4:c.*3377C>T'):
        return '___'

    # Only single positions with no offset and a positive coordinate are
    # converted.
    if start == end and start_offset == 0 and start > 0:
        try:
            chrom, g_coord, c_nuc, strand = coord_hash[nm][start]
            if strand == '+':
                ref, alt = hgvs_name.ref_allele, hgvs_name.alt_allele
            else:
                # Any strand other than '+' maps the alleles through `comp`.
                ref, alt = (comp[hgvs_name.ref_allele],
                            comp[hgvs_name.alt_allele])
            return 'XXX:g.%s%s>%s' % (g_coord, ref, alt) + '___' + chrom
        except Exception:
            # Missing transcript/position or unmappable allele.
            return '___'
    return '___'
# Parse the HGVS name into genomic coordinates and alleles. chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G', genome, get_transcript=get_transcript) print(chrom, offset, ref, alt) # Returns variant in VCF style: ('chr11', 17496508, 'T', 'C') # Notice that since the transcript is on the negative strand, the alleles # are reverse complemented during conversion. # Format an HGVS name. chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C') transcript = get_transcript('NM_000352.3') hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript) print(hgvs_name) # Returns 'NM_000352.3(ABCC8):c.215A>G' hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G') # fields of the HGVS name are available as attributes: # # hgvs_name.transcript = 'NM_000352.3' # hgvs_name.kind = 'c' # hgvs_name.mutation_type = '>' # hgvs_name.cdna_start = hgvs.CDNACoord(215, -10) # hgvs_name.cdna_end = hgvs.CDNACoord(215, -10) # hgvs_name.ref_allele = 'A' # hgvs_name.alt_allele = 'G' print((hgvs_name.transcript, hgvs_name.kind, hgvs_name.mutation_type, hgvs_name.cdna_start, hgvs_name.cdna_end, hgvs_name.ref_allele, hgvs_name.alt_allele))
def alter_coords_hgvs_sequential(h1, h2):
    """Return a copy of h2 whose cDNA coords account for h1 being applied first.

    Both names must be of kind "c". A substitution (">") in h1 never moves
    h2. For "del", "ins", "dup" and "delins", h2 is returned unshifted when
    h1 lies entirely after it, shifted by h1's length change when h1 lies
    entirely before it, and an Exception is raised when the two overlap.
    Any other mutation type of h1 raises an Exception as well.
    """
    if not (h1.kind == "c" and h2.kind == "c"):
        raise Exception("Only cDNA mutations have been implemented")

    edit = h1.mutation_type
    if edit == ">":
        return hgvs.HGVSName(h2.name)

    # Length change each supported edit introduces; evaluated lazily so the
    # alleles are only read when a shift is actually required.
    length_change = {
        "del": lambda: -len(h1.ref_allele),
        "ins": lambda: len(h1.alt_allele),
        "dup": lambda: len(h1.alt_allele) - len(h1.ref_allele),
        "delins": lambda: len(h1.alt_allele) - len(h1.ref_allele),
    }
    if edit not in length_change:
        raise Exception("Unexpected mutation_type {}".format(h1.mutation_type))

    # h1 sits entirely downstream of h2: h2 keeps its coordinates.
    if h1.cdna_start.coord > h2.cdna_end.coord:
        return hgvs.HGVSName(h2.name)

    # h1 sits entirely upstream of h2: move both ends of h2 by the change.
    if h1.cdna_end.coord < h2.cdna_start.coord:
        shifted = hgvs.HGVSName(h2.name)
        delta = length_change[edit]()
        shifted.cdna_start = hgvs.CDNACoord(coord=shifted.cdna_start.coord + delta)
        shifted.cdna_end = hgvs.CDNACoord(coord=shifted.cdna_end.coord + delta)
        return shifted

    # Anything else overlaps, which is not supported.
    if edit == "del":
        raise Exception(
            "Overlapping del not implemented.\nhgvs1: {}\nhgvs2: {}".format(h1, h2))
    if edit == "ins":
        raise Exception("Overlapping ins not implemented")
    if edit == "dup":
        raise Exception("Overlapping dup not implemented")
    raise Exception("Overlapping delins not implemented")
# Convert a file of HGVS names (one per line) into a tab-separated table of
# VCF-style alleles, using the reference genome given by args.ref.
genome = SequenceFileDB(args.ref)

# Read RefSeq transcripts into a python dict.
with open(args.transcript) as infile:
    transcripts = hgvs.utils.read_transcripts(infile)


# Provide a callback for fetching a transcript by its name.
def get_transcript(name):
    return transcripts.get(name)


with open(args.infile) as rd, open(args.outfile, 'w') as wr:
    wr.write('\t'.join(['#Chr', 'Start', 'End', 'Ref', 'Alt', 'Group']) + '\n')
    for line in rd:
        hgvs_text = line.strip()
        try:
            a = hgvs.HGVSName(hgvs_text)
            # Every input name is placed on chromosome X.
            a.chrom = 'X'
            outlist = [str(x) for x in hgvs.get_vcf_allele(a, genome)]
            outlist.append('Cases')
            wr.write('\t'.join(outlist) + '\n')
        except Exception:
            # Best effort: record the failing name and the exception type in
            # the output, echo both to stdout, and continue with the next
            # line. KeyboardInterrupt / SystemExit are no longer swallowed.
            error_type = sys.exc_info()[0]
            wr.write('Error:%s %s \n' % (hgvs_text, error_type))
            # Single-argument print calls behave the same on Python 2 and 3.
            print(hgvs_text)
            print(error_type)