def check_HGVS_conversion_error():
    """Report how often pyhgvs disagrees with the stored genomic coordinates.

    Loads ORIGINAL_FILE as a tab-separated table.  For each row the ``HGVS``
    column is converted with ``pyhgvs.parse_hgvs_name`` and the result is
    compared with the row's ``Chrom``/``Pos``/``Ref``/``Alt`` values (with
    "-" removed from Ref and Alt).  Rows that cannot be converted, and rows
    whose stored coordinates contain the string "None", are not counted.

    Prints the number of rows, the number of rows that differ under plain
    list comparison, and the number that still differ according to
    ``string_comp.variant_equal``.
    """
    table = pd.read_csv(ORIGINAL_FILE, sep="\t")
    direct_mismatches = 0
    confirmed_mismatches = 0

    for _, record in table.iterrows():
        stored = [
            record.Chrom,
            record.Pos,
            record.Ref.replace("-", ""),
            record.Alt.replace("-", ""),
        ]
        try:
            chrom, position, ref, alt = pyhgvs.parse_hgvs_name(
                record.HGVS, GENOME, get_transcript=get_transcript)
        except (pyhgvs.InvalidHGVSName, NotImplementedError, ValueError,
                AssertionError):
            # Names pyhgvs cannot translate are left out of the statistics.
            continue

        # Drop the leading "chr" and compare the position as text.
        derived = [chrom[3:], str(position), ref, alt]
        if derived == stored or "NA" in derived or "None" in stored:
            continue

        direct_mismatches += 1
        if not string_comp.variant_equal(stored, derived):
            confirmed_mismatches += 1

    report = (
        ("total number of rows in file: ", len(table)),
        ("number of mismatching result by direct comparison: ",
         direct_mismatches),
        ("number of mismatching result by string comparison: ",
         confirmed_mismatches),
    )
    for label, value in report:
        # One space between label and value, as a two-item print would emit.
        print("%s %s" % (label, value))
def translate(variant, transcripts, get_transcript):
    """Translate an HGVS name into genomic ``(chrom, offset, ref, alt)``.

    Args:
        variant: HGVS name handed to ``hgvs.parse_hgvs_name``.
        transcripts: Not used by this function; kept so existing callers
            keep working.
        get_transcript: Callback passed through to ``hgvs.parse_hgvs_name``
            for looking up a transcript by name.

    Returns:
        The tuple ``(chrom, offset, ref, alt)`` on success, or the integer
        ``1`` when the name could not be parsed.
    """
    genome = SequenceFileDB('hg19.fa')  # pip install bsddb3 is required
    try:
        chrom, offset, ref, alt = hgvs.parse_hgvs_name(
            variant, genome, get_transcript=get_transcript)
    except Exception:
        # Only ordinary errors are treated as "could not translate";
        # KeyboardInterrupt and SystemExit are allowed to propagate.
        return 1
    return chrom, offset, ref, alt
def hgvs_to_vcf(self, hgvs_variant):
    """
    Translate one HGVS-notation variant into genomic coordinate notation.

    Accepted input formats are those of Counsyl's HGVS library:
    https://github.com/counsyl/hgvs.

    Args:
        hgvs_variant (str): HGVS description of the variant, for example
            NM_001100.3:c.137T>C.  The text before the colon is the
            refseqID the variant is described against; the text after it
            is the HGVS-style description of the change.

    Returns:
        tuple: (chromosome_number, coordinate, ref, alt), where the
            chromosome has its leading "chr" removed and the coordinate
            is converted to a string
    """
    # The library needs a plain string rather than unicode.
    parsed = pyhgvs.parse_hgvs_name(
        str(hgvs_variant), self.genome, get_transcript=self._get_transcript)
    prefixed_chrom, position, ref, alt = parsed
    bare_chrom = re.match('chr(.+)', prefixed_chrom).group(1)
    return bare_chrom, str(position), ref, alt
def to_chrom_coordinate(self, cDNA):
    """Convert an HGVS cDNA name into chromosome coordinates.

    Args:
        cDNA: HGVS name passed to ``pyhgvs.parse_hgvs_name``.

    Returns:
        ``(chrom, offset, ref, alt)`` as produced by pyhgvs, or
        ``(None, None, None, None)`` when the conversion fails (a message
        is printed in that case).
    """
    try:
        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            cDNA, self.genome, get_transcript=self.get_transcript)
    except Exception:
        # Best effort: report the name and hand back placeholders, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        print("[%s] cannot be coverted to chromosome coordinate" % cDNA)
        return None, None, None, None
    return chrom, offset, ref, alt
def parse_hgvs(hgvs_name, fasta, genes):
    """Parse ``hgvs_name`` against a FASTA genome and a transcript file.

    Args:
        hgvs_name: HGVS name handed to ``hgvs.parse_hgvs_name``.
        fasta: Path given to ``Fasta``; sequence names are exposed with a
            "chr" prefix via the key function.
        genes: Path of the file read with ``hgvs_utils.read_transcripts``.

    Returns:
        Whatever ``hgvs.parse_hgvs_name`` returns for the name.
    """
    def add_chr_prefix(x):
        return 'chr{}'.format(x)

    reference = Fasta(fasta, key_function=add_chr_prefix)
    with open(genes) as gene_file:
        known_transcripts = hgvs_utils.read_transcripts(gene_file)

    # Transcript lookup is a plain dictionary-style ``get`` by name.
    return hgvs.parse_hgvs_name(
        hgvs_name, reference, get_transcript=known_transcripts.get)
def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path="/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"):
    """Convert an HGVS name to a ``chrom:offset:ref>alt`` string.

    Args:
        hgvs_c: HGVS name passed to ``pyhgvs.parse_hgvs_name``.
        genome_path: Path of the genome file opened with ``SequenceFileDB``.
        refgene_path: Path of the refGene file read with
            ``pyhgvs_utils.read_transcripts``.  The default is the path
            previously hard-coded here; pass another location when running
            on a different machine.

    Returns:
        str: ``chrom + ":" + offset + ":" + ref + ">" + alt``.
    """
    genome = SequenceFileDB(genome_path)
    with open(refgene_path) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        hgvs_c, genome, get_transcript=get_transcript)
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt
def HGVS_to_GenomeCoor(HGVS):
    """Convert an HGVS name to ``chrom:offset:ref>alt`` with counsyl pyhgvs.

    The genome is read from '../data/hg19.fa' and the transcripts from
    "../data/BRCA12.refGene.txt" on every call.
    """
    reference = SequenceFileDB('../data/hg19.fa')
    with open("../data/BRCA12.refGene.txt") as refgene_file:
        known_transcripts = pyhgvs_utils.read_transcripts(refgene_file)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        HGVS, reference, get_transcript=known_transcripts.get)
    pieces = [chrom, ":", str(offset), ":", ref, ">", alt]
    return "".join(pieces)
def HGVS_to_genome_coor(HGVS):
    """Return ``chrom_pos_ref_alt`` for an HGVS name, or "not translated".

    The name is parsed with pyhgvs against the module-level GENOME.  The
    first three characters of the chromosome are dropped, "-" is removed
    from ref and alt, and the four fields are joined with underscores.
    When pyhgvs rejects the name the string "not translated" is returned.
    """
    try:
        chrom, position, ref, alt = pyhgvs.parse_hgvs_name(
            HGVS, GENOME, get_transcript=get_transcript)
        fields = [
            chrom[3:],
            str(position),
            ref.replace("-", ""),
            alt.replace("-", ""),
        ]
        return "_".join(fields)
    except (pyhgvs.InvalidHGVSName, NotImplementedError, ValueError,
            AssertionError):
        return "not translated"
def egl_variants(filename):
    """Load the EGL export and return LAMA2 variants with VCF coordinates.

    The first nine columns of ``filename`` are read without a header row.
    Rows whose gene column matches ``LAMA2`` are kept, reduced to the
    ``ID``, ``hgvs`` and ``sig`` columns, ``sig`` is remapped through the
    module-level ``dsig`` mapping and ``ID`` is prefixed with ``egl_``.
    Each HGVS name is then converted with pyhgvs using transcript
    ``NM_000426`` and the sequence in 'reference/chr6.fa'.

    HGVS names that cannot be converted are reported on stdout and left out
    of the result, so the integer cast of ``CHROM``/``POS`` cannot fail on
    missing values.

    Args:
        filename: Path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame with columns ``ID``, ``hgvs``, ``sig``, ``CHROM``
        (always 6), ``POS``, ``REF`` and ``ALT``; ``CHROM`` and ``POS`` are
        int32.
    """
    print('\
\n----------------------------------\
\n\
\n EGL Database \
\n\
\n----------------------------------')

    # Load file
    egl = pd.read_csv(filename, usecols=range(0, 9), header=None)

    # Rename columns accordingly
    egl.columns = [
        'ID', 'gene', '', 'exon', 'hgvs', 'protein_change', 'sig',
        'last reviewed', 'alias listing'
    ]

    # Filter LAMA2
    lama2_egl = egl[egl.iloc[:, 1].str.match(r'LAMA2')]
    lama2_egl = lama2_egl.loc[:, ['ID', 'hgvs', 'sig']]
    lama2_egl = lama2_egl.replace({'sig': dsig})
    lama2_egl['ID'] = 'egl_' + lama2_egl['ID'].astype(str)

    genome = Fasta('reference/chr6.fa')

    # Converting HGVS to VCF genomic coordinates
    count = 0
    transcript = get_transcript('NM_000426')
    converted_rows = []
    positions = []
    refs = []
    alts = []
    for i in lama2_egl.index:
        hgvs_name = lama2_egl.loc[i, 'hgvs']
        try:
            _chrom, pos, ref, alt = pyhgvs.parse_hgvs_name(
                hgvs_name, genome, transcript)
        except Exception:
            # Best effort: report the name and carry on with the next one.
            print('\nException:', hgvs_name)
            count += 1
            print('Number of exceptions:', count)
            continue
        converted_rows.append(i)
        positions.append(pos)
        refs.append(ref)
        alts.append(alt)

    # Keep only rows that received coordinates; rows without them would
    # hold NaN and make the int32 cast below raise.
    lama2_egl = lama2_egl.loc[converted_rows].copy()
    lama2_egl['CHROM'] = 6
    lama2_egl['POS'] = positions
    lama2_egl['REF'] = refs
    lama2_egl['ALT'] = alts
    lama2_egl = lama2_egl.astype({'CHROM': 'int32', 'POS': 'int32'})

    return lama2_egl
def test_pyhgvs_cdna_coordinate_correct(self):
    """Check that ``pyhgvs_cDNA`` maps back to the stored hg38 coordinate.

    For every entry of ``self.data`` the ``pyhgvs_cDNA`` value is parsed
    with pyhgvs and formatted as ``chrom:g.offset:ref>alt``; the result
    must equal the entry's ``pyhgvs_Genomic_Coordinate_38`` value.
    """
    REFGENE = "../refgene38_brca.txt"

    # The genome and the transcript table do not depend on the row, so they
    # are loaded once instead of once per row / once per transcript lookup.
    genome = SequenceFileDB('../reference_genome/hg38/hg38.fa')
    with open(REFGENE) as infile:
        TRANSCRIPTS = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return TRANSCRIPTS.get(name)

    for i in self.data:
        pyhgvs_coord = i['pyhgvs_Genomic_Coordinate_38']
        pyhgvs_cDNA = i['pyhgvs_cDNA']
        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            pyhgvs_cDNA, genome, get_transcript=get_transcript)
        test_coord = (chrom + ":" + "g." + str(offset) + ":" + ref + ">" +
                      alt)
        self.assertEqual(pyhgvs_coord, test_coord)
def test_pyhgvs_cdna_coordinate_correct(self):
    """Check that ``pyhgvs_cDNA`` maps back to the stored hg38 coordinate.

    For every entry of ``self.data`` the ``pyhgvs_cDNA`` value is parsed
    with pyhgvs and formatted as ``chrom:g.offset:ref>alt``; the result
    must equal the entry's ``pyhgvs_Genomic_Coordinate_38`` value.
    """
    REFGENE = "../refgene38_brca.txt"

    # The genome and the transcript table do not depend on the row, so they
    # are loaded once instead of once per row / once per transcript lookup.
    genome = SequenceFileDB('../reference_genome/hg38/hg38.fa')
    with open(REFGENE) as infile:
        TRANSCRIPTS = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return TRANSCRIPTS.get(name)

    for i in self.data:
        pyhgvs_coord = i['pyhgvs_Genomic_Coordinate_38']
        pyhgvs_cDNA = i['pyhgvs_cDNA']
        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            pyhgvs_cDNA, genome, get_transcript=get_transcript)
        test_coord = (chrom + ":" + "g." + str(offset) + ":" + ref + ">" +
                      alt)
        self.assertEqual(pyhgvs_coord, test_coord)
def check_hgvs(hgvs, submitterteam, submitter, file):
    """Record HGVS codes that pyhgvs rejects with a ValueError.

    The code is parsed against the module-level ``genome``.  When parsing
    raises ValueError, the error and the code are printed and the error
    text is stored in the module-level ``overview`` under
    ``overview[submitterteam]['incorrect JSONs'][file]``.  The submitter is
    stored only when the per-file HGVS error table is first created.

    Args:
        hgvs: HGVS code to check (converted with ``str`` before parsing).
        submitterteam: Key into ``overview``.
        submitter: Value stored under the file's 'submitter' key.
        file: Name of the JSON file the code came from.
    """
    try:
        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            str(hgvs), genome, get_transcript=get_transcript)
    except ValueError as e:
        # Skip 'wrong' HGVS codes and report them
        print("%s %s" % (e, hgvs))
        per_file = overview[submitterteam]['incorrect JSONs']
        if file not in per_file:
            per_file[file] = {}
        entry = per_file[file]
        if 'falscher HGVS-Code' not in entry:
            entry['falscher HGVS-Code'] = {}
            entry['submitter'] = submitter
        entry['falscher HGVS-Code'][hgvs] = str(e)
def hgvs_to_vcf(self, hgvs_variant):
    """
    Convert a variant given in HGVS notation to genomic coordinates.

    Input formats follow Counsyl's HGVS library, see
    https://github.com/counsyl/hgvs.

    Args:
        hgvs_variant (str): HGVS description such as NM_001100.3:c.137T>C.
            The part before the colon names the refseqID used as
            reference; the part after it describes the mutation in HGVS
            style.

    Returns:
        tuple: (chromosome_number, coordinate, ref, alt) in that order;
            the chromosome is returned without its "chr" prefix and the
            coordinate as a string
    """
    # pyhgvs expects str, not unicode, so coerce before parsing.
    name = str(hgvs_variant)
    chrom_with_prefix, position, ref, alt = pyhgvs.parse_hgvs_name(
        name, self.genome, get_transcript=self._get_transcript)
    chrom_match = re.match('chr(.+)', chrom_with_prefix)
    return chrom_match.group(1), str(position), ref, alt
else: print 'keine Mutationen: ', file continue genotype = mutation['Test Information']['Genotype'] if genotype == 'Hemizygous': genotype = '1' elif genotype == 'Homozygous': genotype = '1/1' elif genotype == 'Heterozygous' or genotype == 'Compound Heterozygous': genotype = '0/1' else: genotype = './1' for hgvscode in hgvslist: try: chrom, offset, ref, alt = pyhgvs.parse_hgvs_name( str(hgvscode), genome, get_transcript=get_transcript) if hgvscode in multivcf['NM'].tolist(): index = multivcf['NM'].tolist().index(hgvscode) multivcf.set_value(index, caseID, genotype) if caseID not in jsonlist2: jsonlist2.append(caseID) else: chromo = chrom.split('chr')[1] multivcf.set_value(x, '#CHROM', str(chromo)) try: multivcf.set_value(x, 'sort', int(chromo)) except ValueError, e: multivcf.set_value(x, 'sort', 30) multivcf.set_value(x, 'NM', hgvscode) multivcf.set_value(x, 'POS', offset)
def main(args):
    """Normalise the BRCA variant table and refresh its coordinate columns.

    Reads the tab-separated file ``options.inBRCA`` row by row.  For every
    row the GRCh38 genomic coordinate is turned into an HGVS cDNA name with
    pyhgvs and that name is parsed back against the hg38, hg19 and hg18
    genomes.  The three genomic coordinate columns, ``HGVS_cDNA`` and
    ``Synonyms`` are overwritten, and ``HGVS_Protein`` as well when
    ``options.calcProtein`` is set.  Each row is written to
    ``options.outBRCA``.

    If the protein change cannot be derived because the cDNA name fails to
    parse, the row's existing ``HGVS_Protein`` value is left untouched
    rather than being filled with the previous row's result.

    Args:
        args: Not used; options are obtained from ``parse_args()``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    # The header row is copied to the output unchanged.
    labelLine = brcaFile.readline().rstrip().split('\t')
    outputFile.write('\t'.join(labelLine) + '\n')

    # Store indexes of the relevant columns
    hgvsG36Index = labelLine.index('Genomic_Coordinate_hg36')
    hgvsG37Index = labelLine.index('Genomic_Coordinate_hg37')
    hgvsG38Index = labelLine.index('Genomic_Coordinate_hg38')
    refSeqIndex = labelLine.index('Reference_Sequence')
    hgvsCDNAIndex = labelLine.index('HGVS_cDNA')
    hgvsPIndex = labelLine.index('HGVS_Protein')
    geneSymbolIndex = labelLine.index("Gene_Symbol")
    synonymIndex = labelLine.index("Synonyms")

    # Alternative transcripts for which cDNA synonyms are generated.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }
    genomicFormat = '{0}:{1}:{2}>{3}'

    for line in brcaFile:
        parsedLine = line.rstrip().split('\t')
        geneSymbol = parsedLine[geneSymbolIndex]
        if geneSymbol == 'BRCA1':
            parsedLine[refSeqIndex] = 'NM_007294.3'
        elif geneSymbol == 'BRCA2':
            parsedLine[refSeqIndex] = 'NM_000059.3'

        # Prefix the genomic variant string with the refseq name, then
        # split it into its chrom / offset / ref>alt parts.
        oldHgvsGenomic38 = (parsedLine[refSeqIndex] + ':' +
                            parsedLine[hgvsG38Index].split(',')[0])
        genomicParts = oldHgvsGenomic38.split(':')
        chrom38 = genomicParts[1]
        offset38 = genomicParts[2]
        alleles = genomicParts[3].split('>')
        ref38 = alleles[0]
        alt38 = alleles[1]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 == '-':
            alt38 = ''
        if alt38 == 'None':
            alt38 = ''

        transcript38 = get_transcript38(parsedLine[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser
        # determines to be the correct format
        cdna_coord = str(
            pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38,
                                    genome38, transcript38, use_gene=False,
                                    max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        synonymString = []
        for transcriptName in synonymTranscripts.get(geneSymbol, ()):
            synonymTranscript = get_transcript38(transcriptName)
            cdna_synonym = str(
                pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38,
                                        genome38, synonymTranscript,
                                        use_gene=False,
                                        max_allele_length=100))
            synonymString.append(cdna_synonym)

        # None means "no protein change computed for this row".
        proteinText = None
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                proteinText = '{0}'.format(str(variantmapper.c_to_p(var_c1)))
            except hgvs.exceptions.HGVSParseError as e:
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('GRCh38 Genomic change: ',
                      genomicFormat.format(chrom38, offset38, ref38, alt38))
                print('')

        # write new data into line
        parsedLine[hgvsG36Index] = genomicFormat.format(
            chrom36, offset36, ref36, alt36)
        parsedLine[hgvsG37Index] = genomicFormat.format(
            chrom37, offset37, ref37, alt37)
        parsedLine[hgvsG38Index] = genomicFormat.format(
            chrom38, offset38, ref38, alt38)
        parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
        if proteinText is not None:
            parsedLine[hgvsPIndex] = proteinText
        parsedLine[synonymIndex] = ','.join(synonymString)
        outputFile.write('\t'.join(parsedLine) + '\n')

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
    outputFile.close()
def main(args):
    """Normalise the variants of a VCF file and write them to a new VCF.

    Every record of ``options.inVCF`` is described as an HGVS cDNA name
    with pyhgvs (transcript NM_000059.3 for chr13, NM_007294.4 for chr17).
    Single-base substitutions are normalised with the hgvs ``Normalizer``
    and parsed back with pyhgvs; all other variants are mapped to genomic
    coordinates and converted with ``Babelfish.hgvs_to_vcf``.  POS, REF and
    ALT of the record are then replaced and the record is written to
    ``options.outVCF``.

    Records for which normalisation raises one of the handled errors, and
    records on a chromosome other than chr13/chr17, are written with the
    values read from the input.  Only the first ALT allele of a record is
    considered.

    Args:
        args: Not used; options are obtained from ``parse_args()``.
    """
    options = parse_args()

    hdp = hgvs.dataproviders.uta.connect()
    am38 = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38')
    hn = hgvs.normalizer.Normalizer(hdp)
    hp = hgvs.parser.Parser()

    # Read genome sequence using pyfaidx
    genome = Fasta(options.refFASTA)

    # Read RefSeq transcripts into a python dict.
    with open(options.refSEQ) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    babelfish38 = Babelfish(hdp, assembly_name="GRCh38")

    # extract base variant representation
    with open(options.inVCF, 'rb') as in_vcf, \
            open(options.outVCF, 'w') as out_vcf:
        vcf_reader = vcf.Reader(in_vcf)
        vcf_writer = vcf.Writer(out_vcf, vcf_reader)
        for record in vcf_reader:
            # Convert variants for indel HGVS representation
            chrom, offset, ref, alt = (str(record.CHROM), record.POS,
                                       str(record.REF), str(record.ALT[0]))
            print('chrom: {}, offset: {}, ref: {}, alt: {}'.format(
                chrom, offset, ref, alt))

            on_chr17 = 'chr17' in record.CHROM
            if 'chr13' in record.CHROM:
                transcript_id = "NM_000059.3"
            elif on_chr17:
                transcript_id = "NM_007294.4"
            else:
                # No transcript is configured for this chromosome; without
                # this branch the previous record's transcript (or an
                # unbound name) would be used.
                transcript_id = None
                print('No transcript configured for {}; '
                      'record is not normalized'.format(chrom))

            if transcript_id is not None:
                transcript = get_transcript(transcript_id)
                try:
                    hgvs_name = pyhgvs.format_hgvs_name(
                        chrom, offset, ref, alt, genome, transcript,
                        use_gene=False, max_allele_length=50000)
                    hgvs_c = hp.parse_hgvs_variant(hgvs_name)
                    # The hgvs calls below are made with accession
                    # NM_007294.3 for chr17 variants, while pyhgvs is given
                    # NM_007294.4.
                    if on_chr17:
                        hgvs_c.ac = 'NM_007294.3'
                    if len(ref) == len(alt) and len(ref) == 1:
                        # Variant is a SNP, normalize using hgvs Normalizer
                        # function
                        norm_hgvs_c = hn.normalize(hgvs_c)
                        if on_chr17:
                            norm_hgvs_c.ac = 'NM_007294.4'
                        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                            str(norm_hgvs_c), genome, normalize=False,
                            get_transcript=get_transcript)
                    else:
                        # Variant is an INDEL, normalize using hgvs
                        # babelfish38.hgvs_to_vcf function
                        hgvs_g = am38.c_to_g(hgvs_c)
                        vcf_values = babelfish38.hgvs_to_vcf(hgvs_g)
                        chrom, offset, ref, alt = (
                            'chr{}'.format(vcf_values[0]), vcf_values[1],
                            vcf_values[2], vcf_values[3])
                except hgvs.exceptions.HGVSUnsupportedOperationError as e:
                    print('hgvs.exceptions.HGVSUnsupportedOperationError: {}'
                          .format(e))
                except hgvs.exceptions.HGVSInvalidIntervalError as e:
                    print('hgvs.exceptions.HGVSInvalidIntervalError: {}'
                          .format(e))
                except hgvs.exceptions.HGVSInvalidVariantError as e:
                    print('hgvs.exceptions.HGVSInvalidVariantError: {}'
                          .format(e))
                except AttributeError as e:
                    print('AttributeError: {}'.format(e))
                except KeyError as e:
                    print('KeyError: {}'.format(e))

            # Update and write the new normalized record
            record.POS = offset
            record.REF = ref
            record.ALT = [alt]
            vcf_writer.write_record(record)
def main(args):
    """Append pyhgvs-derived coordinate columns to the BRCA variant table.

    Reads the tab-separated file ``options.inBRCA``.  For every row the
    ``Chr``/``Pos``/``Ref``/``Alt`` values are turned into an HGVS cDNA name
    with pyhgvs, which is parsed back against the hg38, hg19 and hg18
    genomes.  Nine ``pyhgvs_*`` columns are appended to each row, the
    ``Synonyms`` column is extended with cDNA names on alternative
    transcripts (including those derived from ``HGVS_cDNA_LOVD``), and the
    row is written to ``options.outBRCA``.  Rows whose reference transcript
    is unknown are reported and skipped.

    Protein changes are computed only when ``options.calcProtein`` is set.
    When the cDNA name cannot be parsed for that purpose the failure is
    printed and logged and ``pyhgvs_Protein`` keeps its '-' placeholder.

    A log file is written to ``options.artifacts_dir``, which is created if
    it does not exist.

    Args:
        args: Not used; options are obtained from ``parse_args()``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # Join instead of concatenating so the log lands inside artifacts_dir
    # whether or not the option ends with a path separator.
    log_file_path = os.path.join(artifacts_dir,
                                 "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w",
                        level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start",
        "pyhgvs_Hg37_End", "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End",
        "pyhgvs_cDNA", "pyhgvs_Protein"
    ]
    output_header_row = input_header_row + new_columns_to_append
    output_file.writerow(output_header_row)

    # These columns are not read below, but the input is still required to
    # contain them (a missing one raises ValueError here).
    for required_column in ('Genomic_Coordinate_hg36',
                            'Genomic_Coordinate_hg37', 'HGVS_Protein'):
        input_header_row.index(required_column)

    # Store indexes of the relevant columns
    hgvsG38Index = input_header_row.index('Genomic_Coordinate_hg38')
    refSeqIndex = input_header_row.index('Reference_Sequence')
    hgvsCDNAIndex = input_header_row.index('HGVS_cDNA')
    hgvsCDNALOVDIndex = input_header_row.index('HGVS_cDNA_LOVD')
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")
    # Position of every generated column in the output row.
    outIndex = dict((name, output_header_row.index(name))
                    for name in new_columns_to_append)

    # Alternative transcripts for which cDNA synonyms are generated.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }
    genomicFormat = '{0}:g.{1}:{2}>{3}'
    errorTemplate = "An exception of type {0} occured. Arguments:\n{1!r}"

    def cdna_on_transcript(transcriptName, chrom, offset, ref, alt):
        # Name the hg38 variant relative to the given transcript.
        return str(
            pyhgvs.format_hgvs_name(chrom, int(offset), ref, alt, genome38,
                                    get_transcript38(transcriptName),
                                    use_gene=False, max_allele_length=100))

    for line in input_file:
        geneSymbol = line[geneSymbolIndex]
        if geneSymbol == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif geneSymbol == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        oldHgvsGenomic38 = (line[refSeqIndex] + ':' +
                            line[hgvsG38Index].split(',')[0])
        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 == '-':
            alt38 = ''
        if alt38 == 'None':
            alt38 = ''

        transcript38 = get_transcript38(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser
        # determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" %
                  (line))
            continue
        cdna_coord = str(
            pyhgvs.format_hgvs_name("chr" + chrom38, int(offset38), ref38,
                                    alt38, genome38, transcript38,
                                    use_gene=False, max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        geneTranscripts = synonymTranscripts.get(geneSymbol, ())
        for transcriptName in geneTranscripts:
            synonymString.append(
                cdna_on_transcript(transcriptName, chrom38, offset38, ref38,
                                   alt38))

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if not cdna_coord_LOVD or cdna_coord_LOVD == "-":
                continue
            # Don't add to synonyms if main hgvs_cDNA field is already
            # equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue
            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = (
                pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38,
                                       get_transcript=get_transcript38))
            for transcriptName in geneTranscripts:
                cdna_synonym = cdna_on_transcript(
                    transcriptName, chrom38LOVD, offset38LOVD, ref38LOVD,
                    alt38LOVD)
                if cdna_synonym not in synonymString:
                    synonymString.append(cdna_synonym)

        # None means "no protein change computed for this row".
        proteinText = None
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                proteinText = '{0}'.format(str(variantmapper.c_to_p(var_c1)))
            except hgvs.exceptions.HGVSParseError as e:
                message = errorTemplate.format(type(e).__name__, e.args)
                genomicChange = genomicFormat.format(chrom38, offset38,
                                                     ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ',
                      oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)
            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                message = errorTemplate.format(type(ex).__name__, ex.args)
                genomicChange = genomicFormat.format(chrom38, offset38,
                                                     ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ',
                      oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion
        # by index
        line.extend(['-'] * len(new_columns_to_append))

        line[outIndex["pyhgvs_Genomic_Coordinate_36"]] = (
            genomicFormat.format(chrom36, offset36, ref36, alt36))
        line[outIndex["pyhgvs_Genomic_Coordinate_37"]] = (
            genomicFormat.format(chrom37, offset37, ref37, alt37))
        line[outIndex["pyhgvs_Genomic_Coordinate_38"]] = (
            genomicFormat.format(chrom38, offset38, ref38, alt38))
        # NOTE(review): the hg37/hg36 end positions use the length of the
        # hg38 reference allele -- presumably the allele is the same in all
        # builds; confirm.
        line[outIndex["pyhgvs_Hg37_Start"]] = str(offset37)
        line[outIndex["pyhgvs_Hg37_End"]] = str(
            int(offset37) + len(ref38) - 1)
        line[outIndex["pyhgvs_Hg36_Start"]] = str(offset36)
        line[outIndex["pyhgvs_Hg36_End"]] = str(
            int(offset36) + len(ref38) - 1)
        line[outIndex["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
        if proteinText is not None:
            line[outIndex["pyhgvs_Protein"]] = proteinText
        line[synonymIndex] = ','.join(synonymString)
        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):
    """Annotate a BRCA variant table with pyhgvs-derived names and coordinates.

    Reads the tab-delimited input table named by the command-line options
    and writes every variant row back out with nine additional columns
    (genomic coordinates on hg36/hg37/hg38, hg36/hg37 start and end, the
    normalized cDNA name and, when requested, the protein change) and an
    updated ``Synonyms`` column.

    Args:
        args: Unused; the options are obtained from ``parse_args()``.

    Raises:
        ValueError: if a required column is missing from the input header.
        EnvironmentError: if the protein lookup fails with an error whose
            type name is ``DatabaseError``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)

    # os.path.join keeps the log inside artifacts_dir even when the option
    # was given without a trailing path separator.
    log_file_path = os.path.join(artifacts_dir, "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    hgvs_parser = hgvs.parser.Parser()
    hgvs_dp = hgvs.dataproviders.uta.connect()
    hgvs_norm = hgvs.normalizer.Normalizer(hgvs_dp)
    hgvs_am = hgvs.assemblymapper.AssemblyMapper(hgvs_dp, assembly_name='GRCh38')

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # next() works on both Python 2 and Python 3 iterators.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = ["pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
                             "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
                             "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"]

    output_header_row = input_header_row + new_columns_to_append
    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns. Not every index is used below,
    # but each lookup also verifies that the column exists in the header.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chromIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")

    # Position of every generated column in the output row (first match in
    # the output header, exactly as a per-row .index() call would find it).
    out_idx = dict((name, output_header_row.index(name)) for name in new_columns_to_append)

    refSeqBRCA1Transcripts = ['NM_007294.2', 'NM_007300.3', 'NM_007299.3',
                              'NM_007298.3', 'NM_007297.3', 'U14680.1']
    refSeqBRCA2Transcripts = ['U43746.1']
    synonymTranscripts = {'BRCA1': refSeqBRCA1Transcripts,
                          'BRCA2': refSeqBRCA2Transcripts}

    # Exceptions related to invalid data
    data_errors = set(['HGVSParseError', 'HGVSError', 'HGVSInvalidVariantError',
                       'HGVSUnsupportedOperationError'])

    for line in input_file:
        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(',')[0]
        chrom38 = line[chromIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 == '-':
            alt38 = ''
        if alt38 == 'None':
            alt38 = ''

        transcript38 = get_transcript38(line[refSeqIndex])
        transcript37 = get_transcript37(line[refSeqIndex])
        transcript36 = get_transcript36(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue

        cdna_coord = str(pyhgvs.format_hgvs_name("chr" + chrom38, int(offset38), ref38, alt38,
                                                 genome38, transcript38, use_gene=False,
                                                 max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")

        # Alternative transcripts for this row's gene; empty for other genes.
        geneSynonymTranscripts = synonymTranscripts.get(line[geneSymbolIndex], [])

        for transcriptName in geneSynonymTranscripts:
            transcript38 = get_transcript38(transcriptName)
            cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38,
                                                       genome38, transcript38, use_gene=False,
                                                       max_allele_length=100))
            synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Strip before the blank test so that padded placeholders such
            # as " -" are skipped instead of reaching the parser.
            cdna_coord_LOVD = cdna_coord_LOVD.strip()
            if cdna_coord_LOVD in ("-", ""):
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD.
            # An entry without a "transcript:" prefix has nothing to compare;
            # it falls through to the parser, whose failure is reported below.
            cdna_parts = cdna_coord_LOVD.split(':')
            if len(cdna_parts) > 1 and cdna_parts[1] in line[hgvsCDNAIndex]:
                continue

            try:
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                    cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
                for transcriptName in geneSynonymTranscripts:
                    transcript38 = get_transcript38(transcriptName)
                    cdna_synonym = str(pyhgvs.format_hgvs_name(
                        chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD,
                        genome38, transcript38, use_gene=False, max_allele_length=100))
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)
            except Exception as e:
                print('parse error: {}'.format(cdna_coord_LOVD))
                print(e)

        protein_coord = None
        if calcProtein:
            genomic_change = '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38)
            try:
                var_c1 = hgvs_parser.parse_hgvs_variant(cdna_coord)
                # doing normalization explicitly to get a useful error message
                var_c1_norm = hgvs_norm.normalize(var_c1)
                protein_coord = hgvs_am.c_to_p(var_c1_norm)
            except Exception as e:
                template = "An error of type {0} occured. Arguments:{1!r}"
                error_name = type(e).__name__
                message = template.format(error_name, e.args)
                logging.error(message)
                logging.error('Proposed GRCh38 Genomic change for error: %s', genomic_change)
                logging.error(line)

                if error_name not in data_errors:
                    # output some more if exception doesn't seem to be related to invalid data
                    logging.error("Non data error raised")
                    logging.exception(message)

                    if error_name == "DatabaseError":
                        # Aborting, as it is a transient error in principle, i.e. in one run we might be
                        # able to obtain a protein change, in another not, messing up the data diffs
                        raise EnvironmentError("Issue with UTA database. Aborting")

        # Add empty data for each new column to prepare for data insertion by index
        line.extend(['-'] * len(new_columns_to_append))

        line[out_idx["pyhgvs_Genomic_Coordinate_36"]] = '{0}:g.{1}:{2}>{3}'.format(chrom36, offset36, ref36, alt36)
        line[out_idx["pyhgvs_Genomic_Coordinate_37"]] = '{0}:g.{1}:{2}>{3}'.format(chrom37, offset37, ref37, alt37)
        line[out_idx["pyhgvs_Genomic_Coordinate_38"]] = '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38)
        # NOTE(review): the hg37/hg36 end positions use the length of the
        # hg38 reference allele - confirm that this is intended.
        line[out_idx["pyhgvs_Hg37_Start"]] = str(offset37)
        line[out_idx["pyhgvs_Hg37_End"]] = str(int(offset37) + len(ref38) - 1)
        line[out_idx["pyhgvs_Hg36_Start"]] = str(offset36)
        line[out_idx["pyhgvs_Hg36_End"]] = str(int(offset36) + len(ref38) - 1)
        line[out_idx["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
        # Same truthiness test as the one that decided whether to compute it.
        if calcProtein:
            line[out_idx["pyhgvs_Protein"]] = '{0}'.format(str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
#!/usr/bin/env python
"""Parse one example HGVS name into genomic coordinates with pyhgvs."""
import argparse

import pyhgvs as hgvs
# ``import hgvs.utils`` would look up a package literally named ``hgvs``; the
# alias above does not affect import resolution, so name pyhgvs explicitly.
import pyhgvs.utils as hgvs_utils
from pygr.seqdb import SequenceFileDB

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', help="Input file")
    parser.add_argument('-g', '--genome', help="Genome reference FASTA")
    parser.add_argument('-r', '--refgene', help="RefGene Transcripts")
    parser.add_argument('-o', '--outfile', help="Output file name")
    args = parser.parse_args()

    genome = SequenceFileDB(args.genome)

    with open(args.refgene) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = hgvs.parse_hgvs_name(
        'NM_000352.3:c.215A>G', genome, get_transcript=get_transcript)
def lovd_variants(filename):
    """Load LOVD variants from *filename* and convert them to VCF-style fields.

    Reads a fixed window of rows from the tab-separated export, prefixes the
    ``DNA/hg38`` values with ``NC_000006.12:``, parses each resulting HGVS
    name with pyhgvs and stores the outcome in ``CHROM``/``POS``/``REF``/
    ``ALT`` columns. Rows that fail to parse are reported on stdout and end
    up being removed by the final ``dropna``.

    Args:
        filename: Path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with columns ``ID``, ``hgvs``, ``sig``, ``type``, ``CHROM``,
        ``POS``, ``REF`` and ``ALT``; ``CHROM`` and ``POS`` are int32.
    """
    print('\n--------------------------------------'
          '\n'
          '\n LOVD Database '
          '\n'
          '\n--------------------------------------')

    # Load the file
    lovd_ori = pd.read_csv(filename,
                           header=0,
                           quotechar='"',
                           doublequote=True,
                           sep="\t",
                           skiprows=4110,
                           nrows=2055)

    # Cleaning up the names of columns. The first pattern is a regular
    # expression (a character class), so say so explicitly: newer pandas
    # releases treat the pattern as a literal string by default.
    lovd_ori.columns = lovd_ori.columns.str.replace(r"['\{{','\}}']", '', regex=True)
    lovd_ori.columns = lovd_ori.columns.str.replace("VariantOnGenome/", '', regex=False)

    lovd_ori = lovd_ori.drop_duplicates(subset='DNA/hg38')
    lovd_ori['DNA/hg38'] = "NC_000006.12:" + lovd_ori['DNA/hg38'].astype(str)

    lovd = lovd_ori.rename(columns={
        'DBID': 'ID',
        'DNA/hg38': 'hgvs',
        'ClinicalClassification': 'sig'
    })

    # Selecting important columns
    lovd = lovd.loc[:, ['ID', 'hgvs', 'sig', 'type']]
    lovd = lovd.replace({'type': dtype})

    genome = Fasta('reference/NC_000006.fa')

    # Converting HGVS to VCF genomic coordinates
    count = 0
    for i in lovd.index:
        try:
            _, POS, REF, ALT = pyhgvs.parse_hgvs_name(lovd.loc[i, 'hgvs'], genome)
            lovd.loc[i, 'CHROM'] = 6
            lovd.loc[i, "POS"] = POS
            lovd.loc[i, "REF"] = REF
            lovd.loc[i, "ALT"] = ALT
        except Exception:
            # Best effort: report the variant and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # still stop the run.
            print('\nException:', lovd.loc[i, 'hgvs'])
            count += 1
            print('Number of exceptions:', count)

    # Rows that could not be converted have missing coordinates; drop them.
    lovd.dropna(inplace=True)
    lovd = lovd.astype({'CHROM': 'int32', 'POS': 'int32'})

    return lovd
import pyximport
pyximport.install()
import pyhgvs as hgvs
import pyhgvs.utils as hgvs_utils
import pygr
from pygr.seqdb import SequenceFileDB
from optparse import OptionParser

# Command line: a single HGVS name to convert.
parser = OptionParser()
parser.add_option(
    "-i",
    "--input_string",
    dest="instr",
    default="",
    help="format NM_007294.3:c.8T>G",
)
options, args = parser.parse_args()
instr = options.instr

# Locations of the reference genome and of the RefSeq transcript table.
fn_hg19 = '/raw/human_sequence_reference/UCSC/hg19.fa'
fn_refgene = '/home/david/software/hgvs-master/pyhgvs/data/genes.refGene'

# Read genome sequence using pygr.
genome = SequenceFileDB(fn_hg19)


def get_transcript(name):
    """Return the transcript registered under *name* (None when unknown)."""
    return transcripts.get(name)


with open(fn_refgene) as infile:
    transcripts = hgvs_utils.read_transcripts(infile)

# Convert the name and print the resulting fields tab-separated.
out = hgvs.parse_hgvs_name(instr, genome, get_transcript=get_transcript)
fields = [str(value) for value in out]
print("\t".join(fields))
def main():
    """Convert a tab-delimited BRCA variant flat file into VCF records.

    Writes the VCF header (one INFO declaration per entry of the annotation
    file) and then one record per input row whose HGVS cDNA name can be
    parsed. Rows with a '-' HGVS name, an unrecognised gene symbol or an
    unparsable name are skipped and reported through ``logging.debug``.
    """
    opts = parse_args()
    flat_file = opts.input
    annot_path = opts.inAnnot
    vcf_out = opts.out
    genome_path = opts.gpath
    refseq_path = opts.rpath
    source = opts.source
    logfile = opts.logfile

    log_level = logging.DEBUG if opts.verbose else logging.CRITICAL
    logging.basicConfig(filename=logfile, filemode="w", level=log_level)

    with open(refseq_path) as refseq_handle:
        transcripts = hgvs_utils.read_transcripts(refseq_handle)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # annotation name -> description, taken from the first two columns
    annotations = {}
    with open(annot_path) as annot_handle:
        for annot_line in annot_handle:
            columns = annot_line.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header
    print('##fileformat=VCFv4.0', file=vcf_out)
    print('##source={0}'.format(source), file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    info_template = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'
    for annotation, description in annotations.items():
        print(info_template.format(annotation.replace(' ', '_'), description),
              file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> position; for a repeated name the last position wins.
    header_fields = (flat_file.readline().strip()
                     .replace(' ', '_').replace('"', '').split('\t'))
    column_of = {}
    for position, name in enumerate(header_fields):
        column_of[name] = position

    gene_transcripts = {'brca1': 'NM_007294.3', 'brca2': 'NM_000059.3'}
    record_template = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'

    for raw_line in flat_file:
        cells = raw_line.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(name, normalize(name, cells[column_of[name]]))
            for name in header_fields
        ]

        # extract hgvs cDNA term for variant and cleanup formatting
        hgvs_name = cells[column_of['hgvs_nucleotide']]
        if hgvs_name == '-':
            logging.debug("hgvs name == '-' for line: %s", cells)
            continue

        gene_symbol = cells[column_of['gene_symbol']].lower()
        transcript = gene_transcripts.get(gene_symbol)
        if transcript is None:
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue

        query = transcript + ':' + hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print(record_template.format(chrom, offset, query, ref, alt, info_string),
                  file=vcf_out)
        except Exception as e:
            logging.debug("could not parse hgvs field: %s", query)
# Read RefSeq transcripts into a python dict. # The RefSeq transcripts can be downloaded from: https://github.com/counsyl/hgvs/blob/master/pyhgvs/data/genes.refGene with open('references/genes.refGene') as infile: transcripts = read_transcripts(infile) # Provide a callback for fetching a transcript by its name. def get_transcript(name): return transcripts.get(name) # Store the variant information in a list vcf = [] for v in data: chrom, offset, ref, alt = hgvs.parse_hgvs_name( gene + ':' + v, genome, get_transcript=get_transcript) vcf.append([chrom, offset, ref, alt]) # Define reference genome ref_genome = [ 'reference=GRCh37/hg19', 'contig=<ID=1,length=249250621>', 'contig=<ID=2,length=243199373>', 'contig=<ID=3,length=198022430>', 'contig=<ID=4,length=191154276>', 'contig=<ID=5,length=180915260>', 'contig=<ID=6,length=171115067>', 'contig=<ID=7,length=159138663>', 'contig=<ID=8,length=146364022>', 'contig=<ID=9,length=141213431>', 'contig=<ID=10,length=135534747>', 'contig=<ID=11,length=135006516>', 'contig=<ID=12,length=133851895>', 'contig=<ID=13,length=115169878.', 'contig=<ID=14,length=107349540>', 'contig=<ID=15,length=102531392>', 'contig=<ID=16,length=90354753>', 'contig=<ID=17,length=81195210>', 'contig=<ID=18,length=78077248>', 'contig=<ID=19,length=59128983>', 'contig=<ID=20,length=63025520>', 'contig=<ID=21,length=48129895>',
if '_' in cdot.split(':')[1]:
    # The part after the ':' contains an underscore: use the range fixer.
    return fix_alt_range(chrom, offset, ref, cdot)
# NOTE(review): the two return statements belong to a function whose header
# is outside this excerpt - confirm their indentation against the full file.
return fix_alt_single(chrom, offset, ref, cdot)

# Load the panel variants spreadsheet.
dat_file = '../../data/raw/UC_all_panel_variants_01_20_2016.xlsx'
df_pre = pandas.read_excel(dat_file)

# Keep only rows whose 'Alt' text contains 'ins' and whose gene symbol is
# one of FOCUS_GENES.
crit = df_pre.apply(
    lambda row: 'ins' in row['Alt'] and row['Gene Symbol'] in FOCUS_GENES,
    axis=1)
df = df_pre[crit]

# Build the name passed to the parser: "<Transcript>:<Pos><Alt>".
df.loc[:, 'id'] = df.apply(
    lambda row: row['Transcript'] + ':' + row['Pos'] + row['Alt'], axis=1)
vals = list(df[['id', 'Classification']].values)

# Parse every (name, classification) pair and print the result.
for val in vals:
    # print(val)
    v, c = val
    print(v, c)
    chrom, offset, ref, alt = hgvs.parse_hgvs_name(
        str(v), genome, get_transcript=get_transcript)
    print(v, c, chrom, offset, ref, alt)
    # h = str(hgvs.HGVSName(v)).split("'")[1]
    # print(v)
    # try:
    # except:
    #     print('FAIL', v, c)

# next convert to g. https://github.com/counsyl/hgvs
# need hg19
def main():
    """Write a VCF file from a BRCA variant flat file.

    The header gets one INFO declaration per line of the annotation file.
    Every input row contributes one VCF record, unless its HGVS name is '-',
    its gene symbol is neither BRCA1 nor BRCA2, or the name cannot be parsed;
    such rows are only mentioned in the debug log.
    """
    options = parse_args()
    variants_in = options.input
    annotation_path = options.inAnnot
    vcf_handle = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    source = options.source
    logfile = options.logfile

    if options.verbose:
        level = logging.DEBUG
    else:
        level = logging.CRITICAL
    logging.basicConfig(filename=logfile, filemode="w", level=level)

    with open(refseq_path) as refseq_file:
        transcripts = hgvs_utils.read_transcripts(refseq_file)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    def emit(text):
        # every output line goes to the VCF handle
        print(text, file=vcf_handle)

    # annotation fields: name -> description
    descriptions = {}
    with open(annotation_path) as annotation_file:
        for entry in annotation_file:
            pieces = entry.strip().split('\t')
            descriptions[pieces[0]] = pieces[1]

    # header lines of the vcf file
    emit('##fileformat=VCFv4.0')
    emit('##source={0}'.format(source))
    emit('##reference=GRCh37')
    for annotation, description in descriptions.items():
        emit('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(
            annotation.replace(' ', '_'), description))
    emit('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO')

    # header of the flat file: spaces become underscores, quotes are dropped
    first_line = variants_in.readline().strip()
    header = first_line.replace(' ', '_').replace('"', '').split('\t')
    index_by_field = {}
    for idx, field in enumerate(header):
        index_by_field[field] = idx

    for row_text in variants_in:
        row = row_text.replace('"', '').strip().split('\t')

        info_pairs = []
        for field in header:
            value = normalize(field, row[index_by_field[field]])
            info_pairs.append('{0}={1}'.format(field, value))

        # extract hgvs cDNA term for variant and cleanup formatting
        hgvs_name = row[index_by_field['hgvs_nucleotide']]
        if hgvs_name == '-':
            logging.debug("hgvs name == '-' for line: %s", row)
            continue

        gene_symbol = row[index_by_field['gene_symbol']].lower()
        if gene_symbol not in ('brca1', 'brca2'):
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue
        transcript = 'NM_007294.3' if gene_symbol == 'brca1' else 'NM_000059.3'

        query_name = transcript + ':' + hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_pairs)

        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            emit('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, query_name, ref, alt, info_string))
        except Exception as e:
            logging.debug("could not parse hgvs field: %s", query_name)
def main(args):
    """Convert an exLOVD flat file into a VCF file.

    The HGVS name of each row is read from the ``cDNA`` column, or from
    ``dna_change`` when there is no ``cDNA`` column. Rows whose name is '-'
    are printed to stdout and skipped; rows whose name cannot be parsed are
    reported to the errors file.

    Args:
        args: Unused; the options are obtained from ``parse_args()``.
    """
    opts = parse_args()
    exlovd_in = opts.inEXLOVD
    annot_path = opts.inAnnot
    vcf_out = opts.out
    genome_path = opts.gpath
    refseq_path = opts.rpath
    errors_out = opts.errors

    with open(refseq_path) as refseq_handle:
        transcripts = hgvs_utils.read_transcripts(refseq_handle)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # annotation name -> description
    annotations = {}
    with open(annot_path) as annot_handle:
        for annot_line in annot_handle:
            columns = annot_line.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header
    print('##fileformat=VCFv4.0', file=vcf_out)
    print('##source=exLOVD', file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    info_template = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'
    for annotation, description in annotations.items():
        print(info_template.format(annotation.replace(' ', '_'), description),
              file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> position; for a repeated name the last position wins.
    header_fields = (exlovd_in.readline().strip()
                     .replace(' ', '_').replace('"', '').split('\t'))
    column_of = {}
    for position, name in enumerate(header_fields):
        column_of[name] = position

    for raw_line in exlovd_in:
        cells = raw_line.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(name, normalize(name, cells[column_of[name]]))
            for name in header_fields
        ]

        # The HGVS name lives in 'cDNA' in some files and in 'dna_change'
        # in others; the first of the two that exists is used.
        hgvs_column = next(
            (candidate for candidate in ('cDNA', 'dna_change')
             if candidate in column_of),
            None)
        if hgvs_column is None:
            sys.exit("ERROR: could not parse hgvs name.")
        hgvs_name = cells[column_of[hgvs_column]]

        if hgvs_name == '-':
            print(cells)
            continue

        query = hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            record = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, query, ref, alt, info_string)
            print(record, file=vcf_out)
        except Exception as e:
            print(str(e) + ': could not parse hgvs field ' + query, file=errors_out)
from pygr.seqdb import SequenceFileDB

if __name__ == "__main__":
    # Convert the HGVS name in the second column of every input row into
    # tab-separated genomic coordinates (chrom, offset, ref, alt).
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', help="Input file")
    parser.add_argument('-g', '--genome', help="Genome reference FASTA")
    parser.add_argument('-r', '--refgene', help="RefGene Transcripts")
    parser.add_argument('-o', '--outfile', help="Output file name")
    args = parser.parse_args()

    genome = SequenceFileDB(args.genome)

    with open(args.refgene) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    with open(args.infile, 'r') as infile:
        with open(args.outfile, 'w') as outfile:
            reader = csv.reader(infile, dialect='excel-tab')
            # Skip the header row. The default keeps an empty input file from
            # raising StopIteration, and next() works on Python 2 and 3.
            next(reader, None)
            for row in reader:
                # Parenthesised form prints the same text on Python 2 and 3.
                print(row[1])
                chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                    row[1], genome, get_transcript=get_transcript)
                outfile.write("{}\t{}\t{}\t{}\n".format(chrom, offset, ref, alt))
def main():
    """Convert a LOVD or exLOVD flat file into a VCF file.

    The HGVS name of each row is read from the ``cDNA`` column, or from
    ``dna_change`` when there is no ``cDNA`` column. Rows whose name is '-'
    are printed to stdout and skipped; rows whose name cannot be parsed are
    reported to the errors file.

    Raises:
        ValueError: if the source option is neither ``LOVD`` nor ``exLOVD``.
    """
    opts = parse_args()
    flat_file = opts.input
    annot_path = opts.inAnnot
    vcf_out = opts.out
    genome_path = opts.gpath
    refseq_path = opts.rpath
    errors_out = opts.errors
    source = opts.source

    with open(refseq_path) as refseq_handle:
        transcripts = hgvs_utils.read_transcripts(refseq_handle)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # annotation name -> description
    annotations = {}
    with open(annot_path) as annot_handle:
        for annot_line in annot_handle:
            columns = annot_line.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header; the source line depends on which database is converted.
    print('##fileformat=VCFv4.0', file=vcf_out)
    source_lines = {"exLOVD": '##source=exLOVD', "LOVD": '##source=LOVD'}
    source_line = source_lines.get(source)
    if source_line is None:
        raise ValueError('Source is %s, must be either LOVD or exLOVD' % (source))
    print(source_line, file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    info_template = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'
    for annotation, description in annotations.items():
        print(info_template.format(annotation.replace(' ', '_'), description),
              file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> position; for a repeated name the last position wins.
    header_fields = (flat_file.readline().strip()
                     .replace(' ', '_').replace('"', '').split('\t'))
    column_of = {}
    for position, name in enumerate(header_fields):
        column_of[name] = position

    for raw_line in flat_file:
        cells = raw_line.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(name, normalize(name, cells[column_of[name]]))
            for name in header_fields
        ]

        # The HGVS name lives in 'cDNA' in some files and in 'dna_change'
        # in others; the first of the two that exists is used.
        hgvs_column = next(
            (candidate for candidate in ('cDNA', 'dna_change')
             if candidate in column_of),
            None)
        if hgvs_column is None:
            sys.exit("ERROR: could not parse hgvs name.")
        hgvs_name = cells[column_of[hgvs_column]]

        if hgvs_name == '-':
            print(cells)
            continue

        query = hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            record = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, query, ref, alt, info_string)
            print(record, file=vcf_out)
        except Exception as e:
            print(str(e)+': could not parse hgvs field '+query, file=errors_out)
def convert_HGVS(hgvs_c, GENOME):
    """Return the genomic coordinate string and protein name for *hgvs_c*.

    Args:
        hgvs_c: HGVS cDNA name handed to ``pyhgvs.parse_hgvs_name``.
        GENOME: Genome object handed to ``pyhgvs.parse_hgvs_name``.

    Returns:
        Tuple ``(genome_coor, HGVS_p)`` where ``genome_coor`` has the form
        ``"<chrom>:<offset>:<ref>><alt>"`` and ``HGVS_p`` is the result of
        ``HGVS_cDNA_to_protein(hgvs_c)``.
    """
    parsed = pyhgvs.parse_hgvs_name(hgvs_c, GENOME, get_transcript=get_transcript)
    chrom, offset, ref, alt = parsed
    # Build the coordinate before the protein lookup, as a failure here
    # must surface first.
    genome_coor = "".join((chrom, ":", str(offset), ":", ref, ">", alt))
    HGVS_p = HGVS_cDNA_to_protein(hgvs_c)
    return genome_coor, HGVS_p
# Read genome sequence using pyfaidx. genome = Genome('/tmp/hg19.fa') # Read RefSeq transcripts into a python dict. with open('pyhgvs/data/genes.refGene') as infile: transcripts = hgvs_utils.read_transcripts(infile) # Provide a callback for fetching a transcript by its name. def get_transcript(name): return transcripts.get(name) # Parse the HGVS name into genomic coordinates and alleles. chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G', genome, get_transcript=get_transcript) print(chrom, offset, ref, alt) # Returns variant in VCF style: ('chr11', 17496508, 'T', 'C') # Notice that since the transcript is on the negative strand, the alleles # are reverse complemented during conversion. # Format an HGVS name. chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C') transcript = get_transcript('NM_000352.3') hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript) print(hgvs_name) # Returns 'NM_000352.3(ABCC8):c.215A>G' hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G') # fields of the HGVS name are available as attributes:
from pygr.seqdb import SequenceFileDB

if __name__ == "__main__":
    # Read HGVS names from the second column of a tab-delimited file and
    # write their genomic coordinates, one variant per output line.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', help="Input file")
    parser.add_argument('-g', '--genome', help="Genome reference FASTA")
    parser.add_argument('-r', '--refgene', help="RefGene Transcripts")
    parser.add_argument('-o', '--outfile', help="Output file name")
    args = parser.parse_args()

    genome = SequenceFileDB(args.genome)

    with open(args.refgene) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    with open(args.infile, 'r') as infile:
        with open(args.outfile, 'w') as outfile:
            reader = csv.reader(infile, dialect='excel-tab')
            # Discard the header row; with the default an empty input file
            # yields an empty output instead of a StopIteration traceback.
            # next() is available on both Python 2 and Python 3.
            next(reader, None)
            for row in reader:
                hgvs_name = row[1]
                # Parenthesised form prints the same text on Python 2 and 3.
                print(hgvs_name)
                chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                    hgvs_name, genome, get_transcript=get_transcript)
                outfile.write("{}\t{}\t{}\t{}\n".format(
                    chrom, offset, ref, alt))