def from_assembly(cls, assembly: Assembly, alt_aln_method: str = 'splign') -> 'HgvsMachinery':
    """
    Build the `biocommons/hgvs` machinery on top of a Universal Transcript
    Archive (UTA) data provider.

    :param assembly: assembly build; translated to an hgvs assembly name
        through `ASSEMBLY__HGVS_ASSEMBLY_NAME`.
    :param alt_aln_method: alignment method handed to the assembly mapper
        (default 'splign').
    :return: an instance wired with the assembly mapper, two normalizers
        (shuffle directions 3 and 5) and the accession/contig map in both
        directions.
    """
    hdp = uta.connect()
    hgvs_build = ASSEMBLY__HGVS_ASSEMBLY_NAME[assembly]
    acc_to_contig = hdp.get_assembly_map(assembly_name=hgvs_build)

    mapper = AssemblyMapper(
        hdp=hdp,
        assembly_name=hgvs_build,
        alt_aln_method=alt_aln_method,
        add_gene_symbol=False,
        prevalidation_level='NONE',
        normalize=True,
        replace_reference=False,
    )
    shuffle_3 = Normalizer(hdp=hdp, shuffle_direction=3)
    shuffle_5 = Normalizer(hdp=hdp, shuffle_direction=5)

    # Reverse lookup; when two accessions share a contig the later one wins.
    contig_to_acc = {}
    for accession, contig_name in acc_to_contig.items():
        contig_to_acc[contig_name] = accession

    return cls(
        assembly_mapper=mapper,
        normalizer_3p=shuffle_3,
        normalizer_5p=shuffle_5,
        accession__contig=acc_to_contig,
        contig__accession=contig_to_acc,
    )
def annotator(annotation):
    """Connect to UTA and create the hgvs mapper, parser and normalizer.

    :param annotation: assembly name passed to ``AssemblyMapper``.
    :return: tuple ``(assembly_mapper, parser, normalizer, data_provider)``.
    """
    provider = connect()
    mapper_options = dict(
        assembly_name=annotation,
        alt_aln_method='splign',
        replace_reference=True,
    )
    mapper = AssemblyMapper(provider, **mapper_options)
    parser = Parser()
    normalizer = Normalizer(provider)
    return mapper, parser, normalizer, provider
HGVS_SEQREPO_DIR environment variables but is otherwise not configurable by the caller. """ from hgvs import __version__, global_config # noqa: F401 from hgvs.assemblymapper import AssemblyMapper from hgvs.dataproviders.uta import connect from hgvs.normalizer import Normalizer from hgvs.parser import Parser from hgvs.validator import Validator from hgvs.variantmapper import VariantMapper # provide standard abbreviated, short, and long names for instances hp = parser = hgvs_parser = Parser() hdp = hgvs_data_provider = connect() vm = variant_mapper = hgvs_variant_mapper = VariantMapper(hgvs_data_provider) am37 = hgvs_assembly_mapper_37 = AssemblyMapper(hgvs_data_provider, assembly_name='GRCh37') am38 = projector = hgvs_assembly_mapper_38 = AssemblyMapper( hgvs_data_provider, assembly_name='GRCh38') hn = normalizer = hgvs_normalizer = Normalizer(hgvs_data_provider) hv = validator = hgvs_validator = Validator(hgvs_data_provider) # functionalized forms of common methods parse = parser.parse normalize = normalizer.normalize validate = validator.validate c_to_g = projector.c_to_g c_to_n = projector.c_to_n c_to_p = projector.c_to_p
def main(args):
    """Append pyhgvs-derived coordinate columns to a tab-separated variant file.

    The header row of ``options.inBRCA`` is copied to ``options.outBRCA`` with
    nine extra ``pyhgvs_*`` columns. For every following row the function:

    * forces ``Reference_Sequence`` to NM_007294.3 (BRCA1) or NM_000059.3
      (BRCA2) depending on ``Gene_Symbol``;
    * builds a cDNA HGVS name from the ``Chr``/``Pos``/``Ref``/``Alt`` columns
      with ``pyhgvs.format_hgvs_name`` and parses it back against the hg38,
      hg19 and hg18 genomes with ``pyhgvs.parse_hgvs_name``;
    * extends the ``Synonyms`` column with the variant named on the
      alternative transcripts listed below, including variants taken from
      the ``HGVS_cDNA_LOVD`` column;
    * when ``options.calcProtein`` is set, maps the cDNA name to a protein
      change through the hgvs variant mapper. If that mapping fails with a
      parse error, the error is printed and logged and the ``pyhgvs_Protein``
      cell keeps its ``-`` placeholder.

    Rows whose reference transcript is missing from the hg38 transcript table
    are reported on stdout and are not written to the output.

    :param args: unused; options are obtained from ``parse_args()``.
    :raises ValueError: if a required column is missing from the header.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    # Kept as an equality test on purpose: a truthy non-boolean option value
    # (for example a non-empty string) must not switch protein mapping on.
    calcProtein = options.calcProtein == True  # noqa: E712
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # os.path.join keeps the log inside artifacts_dir even when the option
    # was given without a trailing path separator.
    log_file_path = os.path.join(artifacts_dir,
                                 "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w",
                        level=logging.DEBUG)

    try:
        hdp = hgvs_dataproviders_uta.connect()
        variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
        hgvsparser = hgvs_parser.Parser()

        genome36 = SequenceFileDB(hg18_fa.name)
        genome37 = SequenceFileDB(hg19_fa.name)
        genome38 = SequenceFileDB(hg38_fa.name)

        transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
        transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
        transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

        def get_transcript36(name):
            return transcripts36.get(name)

        def get_transcript37(name):
            return transcripts37.get(name)

        def get_transcript38(name):
            return transcripts38.get(name)

        def cdna_name(chrom, offset, ref, alt, transcript):
            """Name a GRCh38 variant as cDNA HGVS on ``transcript``."""
            return str(
                pyhgvs.format_hgvs_name(chrom, int(offset), ref, alt,
                                        genome38, transcript,
                                        use_gene=False,
                                        max_allele_length=100))

        def genomic_change(chrom, offset, ref, alt):
            """Format a variant as ``chrom:g.offset:ref>alt``."""
            return '{0}:g.{1}:{2}>{3}'.format(chrom, offset, ref, alt)

        def describe_error(exc):
            """Build the message text shared by both parse-error handlers."""
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            return template.format(type(exc).__name__, exc.args)

        def log_protein_failure(message, row, change):
            """Write a failed protein mapping to the log file."""
            logging.error(message)
            logging.error(row)
            logging.error('Proposed GRCh38 Genomic change for error: %s',
                          change)

        # Set up header for output file
        input_file = csv.reader(brcaFile, delimiter='\t')
        output_file = csv.writer(outputFile, delimiter='\t')
        # next() works on Python 2.6+ and 3, unlike the .next() method.
        input_header_row = next(input_file)

        # The following new columns will contain data generated by this file
        new_columns_to_append = [
            "pyhgvs_Genomic_Coordinate_36",
            "pyhgvs_Genomic_Coordinate_37",
            "pyhgvs_Genomic_Coordinate_38",
            "pyhgvs_Hg37_Start",
            "pyhgvs_Hg37_End",
            "pyhgvs_Hg36_Start",
            "pyhgvs_Hg36_End",
            "pyhgvs_cDNA",
            "pyhgvs_Protein",
        ]
        output_header_row = input_header_row + new_columns_to_append
        output_file.writerow(output_header_row)

        # These columns are not read below, but the header has always been
        # required to carry them; list.index raises ValueError if one is
        # missing.
        for required_column in ('Genomic_Coordinate_hg36',
                                'Genomic_Coordinate_hg37',
                                'HGVS_Protein'):
            input_header_row.index(required_column)

        # Store indexes of the relevant columns once instead of once per row
        hgvsG38Index = input_header_row.index('Genomic_Coordinate_hg38')
        refSeqIndex = input_header_row.index('Reference_Sequence')
        hgvsCDNAIndex = input_header_row.index('HGVS_cDNA')
        hgvsCDNALOVDIndex = input_header_row.index('HGVS_cDNA_LOVD')
        geneSymbolIndex = input_header_row.index("Gene_Symbol")
        synonymIndex = input_header_row.index("Synonyms")
        chrIndex = input_header_row.index("Chr")
        posIndex = input_header_row.index("Pos")
        refIndex = input_header_row.index("Ref")
        altIndex = input_header_row.index("Alt")
        out_index = dict(
            (name, output_header_row.index(name))
            for name in new_columns_to_append)

        reference_transcript = {
            'BRCA1': 'NM_007294.3',
            'BRCA2': 'NM_000059.3',
        }
        synonym_transcript_names = {
            'BRCA1': [
                'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
                'NM_007297.3', 'U14680.1'
            ],
            'BRCA2': ['U43746.1'],
        }
        # Resolve the alternative transcripts once; the table never changes.
        synonym_transcripts = dict(
            (gene, [get_transcript38(name) for name in names])
            for gene, names in synonym_transcript_names.items())

        for line in input_file:
            gene_symbol = line[geneSymbolIndex]
            if gene_symbol in reference_transcript:
                line[refSeqIndex] = reference_transcript[gene_symbol]

            # Store for reference and debugging
            oldHgvsGenomic38 = (line[refSeqIndex] + ':' +
                                line[hgvsG38Index].split(',')[0])
            chrom38 = line[chrIndex]
            offset38 = line[posIndex]
            ref38 = line[refIndex]
            alt38 = line[altIndex]

            # Edge cases to correct variant string formats for indels in
            # order to be accepted by the counsyl parser
            if ref38 == '-':
                ref38 = ''
            if alt38 in ('-', 'None'):
                alt38 = ''

            transcript38 = get_transcript38(line[refSeqIndex])
            if transcript38 is None:
                print("ERROR: could not parse transcript38 for variant: %s \n"
                      % (line))
                continue

            # Normalize hgvs cdna string to fit what the counsyl hgvs parser
            # determines to be the correct format
            cdna_coord = cdna_name("chr" + chrom38, offset38, ref38, alt38,
                                   transcript38)
            chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome38, get_transcript=get_transcript38)
            chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome37, get_transcript=get_transcript37)
            chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome36, get_transcript=get_transcript36)

            # Generate transcript hgvs cdna synonym string
            existing_synonyms = line[synonymIndex]
            if existing_synonyms in ("-", ""):
                synonymString = []
            else:
                synonymString = existing_synonyms.split(",")

            alternative_transcripts = synonym_transcripts.get(gene_symbol, [])
            for transcript in alternative_transcripts:
                synonymString.append(
                    cdna_name(chrom38, offset38, ref38, alt38, transcript))

            # Add hgvs_cDNA values from LOVD to synonyms if not already
            # present
            for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
                # Skip if blank
                if cdna_coord_LOVD in ("-", ""):
                    continue
                # Don't add to synonyms if main hgvs_cDNA field is already
                # equivalent to hgvs_cDNA value from LOVD
                if cdna_coord_LOVD.split(':')[1] in line[hgvsCDNAIndex]:
                    continue
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = \
                    pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38,
                                           get_transcript=get_transcript38)
                for transcript in alternative_transcripts:
                    cdna_synonym = cdna_name(chrom38LOVD, offset38LOVD,
                                             ref38LOVD, alt38LOVD, transcript)
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)

            # None means "nothing to write": either protein mapping is off or
            # it failed for this row. This prevents a value from an earlier
            # row from leaking into the current one.
            protein_text = None
            if calcProtein:
                try:
                    var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                    protein_coord = variantmapper.c_to_p(var_c1)
                except hgvs.exceptions.HGVSParseError as e:
                    message = describe_error(e)
                    genomicChange = genomic_change(chrom38, offset38, ref38,
                                                   alt38)
                    print('hgvs.exceptions.HGVSParseError: ', e)
                    print('Original GRCh38 Genomic Coordinate: ',
                          oldHgvsGenomic38)
                    print('GRCh38 Genomic change: ', genomicChange)
                    log_protein_failure(message, line, genomicChange)
                # Catch parse errors thrown by ometa.runtime.ParseError.
                except ParseError as ex:
                    message = describe_error(ex)
                    genomicChange = genomic_change(chrom38, offset38, ref38,
                                                   alt38)
                    print(message)
                    print('ometa.runtime.ParseError', ex)
                    print('Original GRCh38 Genomic Coordinate: ',
                          oldHgvsGenomic38)
                    print('GRCh38 Genomic change: ', genomicChange)
                    log_protein_failure(message, line, genomicChange)
                else:
                    protein_text = str(protein_coord)

            # Add empty data for each new column to prepare for data
            # insertion by index
            line.extend(['-'] * len(new_columns_to_append))

            line[out_index["pyhgvs_Genomic_Coordinate_36"]] = genomic_change(
                chrom36, offset36, ref36, alt36)
            line[out_index["pyhgvs_Genomic_Coordinate_37"]] = genomic_change(
                chrom37, offset37, ref37, alt37)
            line[out_index["pyhgvs_Genomic_Coordinate_38"]] = genomic_change(
                chrom38, offset38, ref38, alt38)
            # Both end positions are derived from the GRCh38 reference
            # allele length, as in the original implementation.
            line[out_index["pyhgvs_Hg37_Start"]] = str(offset37)
            line[out_index["pyhgvs_Hg37_End"]] = str(
                int(offset37) + len(ref38) - 1)
            line[out_index["pyhgvs_Hg36_Start"]] = str(offset36)
            line[out_index["pyhgvs_Hg36_End"]] = str(
                int(offset36) + len(ref38) - 1)
            line[out_index["pyhgvs_cDNA"]] = cdna_coord
            if protein_text is not None:
                line[out_index["pyhgvs_Protein"]] = protein_text
            line[synonymIndex] = ','.join(synonymString)

            output_file.writerow(line)
    finally:
        # Release the genome and transcript inputs even if a row fails.
        hg18_fa.close()
        hg19_fa.close()
        hg38_fa.close()
        refSeq18.close()
        refSeq19.close()
        refSeq38.close()
def main(args):
    """Append pyhgvs-derived coordinate columns to a tab-separated variant file.

    The header row of ``options.inBRCA`` is copied to ``options.outBRCA`` with
    nine extra ``pyhgvs_*`` columns. For every following row the function:

    * forces ``Reference_Sequence`` to NM_007294.3 (BRCA1) or NM_000059.3
      (BRCA2) depending on ``Gene_Symbol``;
    * builds a cDNA HGVS name from the ``Chr``/``Pos``/``Ref``/``Alt`` columns
      with ``pyhgvs.format_hgvs_name`` and parses it back against the hg38,
      hg19 and hg18 genomes with ``pyhgvs.parse_hgvs_name``;
    * extends the ``Synonyms`` column with the variant named on the
      alternative transcripts listed below, including variants taken from
      the ``HGVS_cDNA_LOVD`` column;
    * when ``options.calcProtein`` is set, maps the cDNA name to a protein
      change through the hgvs variant mapper. If that mapping fails with a
      parse error, the error is printed and logged and the ``pyhgvs_Protein``
      cell keeps its ``-`` placeholder.

    Rows whose reference transcript is missing from the hg38 transcript table
    are reported on stdout and are not written to the output.

    :param args: unused; options are obtained from ``parse_args()``.
    :raises ValueError: if a required column is missing from the header.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    # Kept as an equality test on purpose: a truthy non-boolean option value
    # (for example a non-empty string) must not switch protein mapping on.
    calcProtein = options.calcProtein == True  # noqa: E712
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # os.path.join keeps the log inside artifacts_dir even when the option
    # was given without a trailing path separator.
    log_file_path = os.path.join(artifacts_dir,
                                 "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w",
                        level=logging.DEBUG)

    try:
        hdp = hgvs_dataproviders_uta.connect()
        variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
        hgvsparser = hgvs_parser.Parser()

        genome36 = SequenceFileDB(hg18_fa.name)
        genome37 = SequenceFileDB(hg19_fa.name)
        genome38 = SequenceFileDB(hg38_fa.name)

        transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
        transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
        transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

        def get_transcript36(name):
            return transcripts36.get(name)

        def get_transcript37(name):
            return transcripts37.get(name)

        def get_transcript38(name):
            return transcripts38.get(name)

        def cdna_name(chrom, offset, ref, alt, transcript):
            """Name a GRCh38 variant as cDNA HGVS on ``transcript``."""
            return str(
                pyhgvs.format_hgvs_name(chrom, int(offset), ref, alt,
                                        genome38, transcript,
                                        use_gene=False,
                                        max_allele_length=100))

        def genomic_change(chrom, offset, ref, alt):
            """Format a variant as ``chrom:g.offset:ref>alt``."""
            return '{0}:g.{1}:{2}>{3}'.format(chrom, offset, ref, alt)

        def describe_error(exc):
            """Build the message text shared by both parse-error handlers."""
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            return template.format(type(exc).__name__, exc.args)

        def log_protein_failure(message, row, change):
            """Write a failed protein mapping to the log file."""
            logging.error(message)
            logging.error(row)
            logging.error('Proposed GRCh38 Genomic change for error: %s',
                          change)

        # Set up header for output file
        input_file = csv.reader(brcaFile, delimiter='\t')
        output_file = csv.writer(outputFile, delimiter='\t')
        # next() works on Python 2.6+ and 3, unlike the .next() method.
        input_header_row = next(input_file)

        # The following new columns will contain data generated by this file
        new_columns_to_append = [
            "pyhgvs_Genomic_Coordinate_36",
            "pyhgvs_Genomic_Coordinate_37",
            "pyhgvs_Genomic_Coordinate_38",
            "pyhgvs_Hg37_Start",
            "pyhgvs_Hg37_End",
            "pyhgvs_Hg36_Start",
            "pyhgvs_Hg36_End",
            "pyhgvs_cDNA",
            "pyhgvs_Protein",
        ]
        output_header_row = input_header_row + new_columns_to_append
        output_file.writerow(output_header_row)

        # These columns are not read below, but the header has always been
        # required to carry them; list.index raises ValueError if one is
        # missing.
        for required_column in ('Genomic_Coordinate_hg36',
                                'Genomic_Coordinate_hg37',
                                'HGVS_Protein'):
            input_header_row.index(required_column)

        # Store indexes of the relevant columns once instead of once per row
        hgvsG38Index = input_header_row.index('Genomic_Coordinate_hg38')
        refSeqIndex = input_header_row.index('Reference_Sequence')
        hgvsCDNAIndex = input_header_row.index('HGVS_cDNA')
        hgvsCDNALOVDIndex = input_header_row.index('HGVS_cDNA_LOVD')
        geneSymbolIndex = input_header_row.index("Gene_Symbol")
        synonymIndex = input_header_row.index("Synonyms")
        chrIndex = input_header_row.index("Chr")
        posIndex = input_header_row.index("Pos")
        refIndex = input_header_row.index("Ref")
        altIndex = input_header_row.index("Alt")
        out_index = dict(
            (name, output_header_row.index(name))
            for name in new_columns_to_append)

        reference_transcript = {
            'BRCA1': 'NM_007294.3',
            'BRCA2': 'NM_000059.3',
        }
        synonym_transcript_names = {
            'BRCA1': [
                'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
                'NM_007297.3', 'U14680.1'
            ],
            'BRCA2': ['U43746.1'],
        }
        # Resolve the alternative transcripts once; the table never changes.
        synonym_transcripts = dict(
            (gene, [get_transcript38(name) for name in names])
            for gene, names in synonym_transcript_names.items())

        for line in input_file:
            gene_symbol = line[geneSymbolIndex]
            if gene_symbol in reference_transcript:
                line[refSeqIndex] = reference_transcript[gene_symbol]

            # Store for reference and debugging
            oldHgvsGenomic38 = (line[refSeqIndex] + ':' +
                                line[hgvsG38Index].split(',')[0])
            chrom38 = line[chrIndex]
            offset38 = line[posIndex]
            ref38 = line[refIndex]
            alt38 = line[altIndex]

            # Edge cases to correct variant string formats for indels in
            # order to be accepted by the counsyl parser
            if ref38 == '-':
                ref38 = ''
            if alt38 in ('-', 'None'):
                alt38 = ''

            transcript38 = get_transcript38(line[refSeqIndex])
            if transcript38 is None:
                print("ERROR: could not parse transcript38 for variant: %s \n"
                      % (line))
                continue

            # Normalize hgvs cdna string to fit what the counsyl hgvs parser
            # determines to be the correct format
            cdna_coord = cdna_name("chr" + chrom38, offset38, ref38, alt38,
                                   transcript38)
            chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome38, get_transcript=get_transcript38)
            chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome37, get_transcript=get_transcript37)
            chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome36, get_transcript=get_transcript36)

            # Generate transcript hgvs cdna synonym string
            existing_synonyms = line[synonymIndex]
            if existing_synonyms in ("-", ""):
                synonymString = []
            else:
                synonymString = existing_synonyms.split(",")

            alternative_transcripts = synonym_transcripts.get(gene_symbol, [])
            for transcript in alternative_transcripts:
                synonymString.append(
                    cdna_name(chrom38, offset38, ref38, alt38, transcript))

            # Add hgvs_cDNA values from LOVD to synonyms if not already
            # present
            for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
                # Skip if blank
                if cdna_coord_LOVD in ("-", ""):
                    continue
                # Don't add to synonyms if main hgvs_cDNA field is already
                # equivalent to hgvs_cDNA value from LOVD
                if cdna_coord_LOVD.split(':')[1] in line[hgvsCDNAIndex]:
                    continue
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = \
                    pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38,
                                           get_transcript=get_transcript38)
                for transcript in alternative_transcripts:
                    cdna_synonym = cdna_name(chrom38LOVD, offset38LOVD,
                                             ref38LOVD, alt38LOVD, transcript)
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)

            # None means "nothing to write": either protein mapping is off or
            # it failed for this row. This prevents a value from an earlier
            # row from leaking into the current one.
            protein_text = None
            if calcProtein:
                try:
                    var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                    protein_coord = variantmapper.c_to_p(var_c1)
                except hgvs.exceptions.HGVSParseError as e:
                    message = describe_error(e)
                    genomicChange = genomic_change(chrom38, offset38, ref38,
                                                   alt38)
                    print('hgvs.exceptions.HGVSParseError: ', e)
                    print('Original GRCh38 Genomic Coordinate: ',
                          oldHgvsGenomic38)
                    print('GRCh38 Genomic change: ', genomicChange)
                    log_protein_failure(message, line, genomicChange)
                # Catch parse errors thrown by ometa.runtime.ParseError.
                except ParseError as ex:
                    message = describe_error(ex)
                    genomicChange = genomic_change(chrom38, offset38, ref38,
                                                   alt38)
                    print(message)
                    print('ometa.runtime.ParseError', ex)
                    print('Original GRCh38 Genomic Coordinate: ',
                          oldHgvsGenomic38)
                    print('GRCh38 Genomic change: ', genomicChange)
                    log_protein_failure(message, line, genomicChange)
                else:
                    protein_text = str(protein_coord)

            # Add empty data for each new column to prepare for data
            # insertion by index
            line.extend(['-'] * len(new_columns_to_append))

            line[out_index["pyhgvs_Genomic_Coordinate_36"]] = genomic_change(
                chrom36, offset36, ref36, alt36)
            line[out_index["pyhgvs_Genomic_Coordinate_37"]] = genomic_change(
                chrom37, offset37, ref37, alt37)
            line[out_index["pyhgvs_Genomic_Coordinate_38"]] = genomic_change(
                chrom38, offset38, ref38, alt38)
            # Both end positions are derived from the GRCh38 reference
            # allele length, as in the original implementation.
            line[out_index["pyhgvs_Hg37_Start"]] = str(offset37)
            line[out_index["pyhgvs_Hg37_End"]] = str(
                int(offset37) + len(ref38) - 1)
            line[out_index["pyhgvs_Hg36_Start"]] = str(offset36)
            line[out_index["pyhgvs_Hg36_End"]] = str(
                int(offset36) + len(ref38) - 1)
            line[out_index["pyhgvs_cDNA"]] = cdna_coord
            if protein_text is not None:
                line[out_index["pyhgvs_Protein"]] = protein_text
            line[synonymIndex] = ','.join(synonymString)

            output_file.writerow(line)
    finally:
        # Release the genome and transcript inputs even if a row fails.
        hg18_fa.close()
        hg19_fa.close()
        hg38_fa.close()
        refSeq18.close()
        refSeq19.close()
        refSeq38.close()
def main(args):
    """Rewrite the coordinate columns of a tab-separated BRCA variant file.

    The header line of ``options.inBRCA`` is copied unchanged to
    ``options.outBRCA``. For every following row the function:

    * forces ``Reference_Sequence`` to NM_007294.3 (BRCA1) or NM_000059.3
      (BRCA2) depending on ``Gene_Symbol``;
    * reads chromosome, offset, ref and alt from the first comma-separated
      entry of ``Genomic_Coordinate_hg38`` (``chrom:offset:ref>alt``);
    * builds a cDNA HGVS name with ``pyhgvs.format_hgvs_name`` and parses it
      back against the hg38, hg19 and hg18 genomes;
    * overwrites the three genomic coordinate columns, ``HGVS_cDNA`` and
      ``Synonyms`` with the recomputed values;
    * when ``options.calcProtein`` is set, overwrites ``HGVS_Protein`` with
      the protein change from the hgvs variant mapper. If that mapping fails
      with ``HGVSParseError``, the error is printed and the row keeps the
      ``HGVS_Protein`` value it came in with.

    :param args: unused; options are obtained from ``parse_args()``.
    :raises ValueError: if a required column is missing from the header.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    # Kept as an equality test on purpose: a truthy non-boolean option value
    # (for example a non-empty string) must not switch protein mapping on.
    calcProtein = options.calcProtein == True  # noqa: E712

    try:
        hdp = hgvs_dataproviders_uta.connect()
        variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
        hgvsparser = hgvs_parser.Parser()

        genome36 = SequenceFileDB(hg18_fa.name)
        genome37 = SequenceFileDB(hg19_fa.name)
        genome38 = SequenceFileDB(hg38_fa.name)

        transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
        transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
        transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

        def get_transcript36(name):
            return transcripts36.get(name)

        def get_transcript37(name):
            return transcripts37.get(name)

        def get_transcript38(name):
            return transcripts38.get(name)

        def genomic_change(chrom, offset, ref, alt):
            """Format a variant as ``chrom:offset:ref>alt``."""
            return '{0}:{1}:{2}>{3}'.format(chrom, offset, ref, alt)

        # Strip only the line terminator: a bare rstrip() would also remove
        # trailing tabs and silently drop empty trailing columns.
        labelLine = brcaFile.readline().rstrip('\r\n').split('\t')
        outputFile.write('\t'.join(labelLine) + '\n')

        # Store indexes of the relevant columns
        hgvsG36Index = labelLine.index('Genomic_Coordinate_hg36')
        hgvsG37Index = labelLine.index('Genomic_Coordinate_hg37')
        hgvsG38Index = labelLine.index('Genomic_Coordinate_hg38')
        refSeqIndex = labelLine.index('Reference_Sequence')
        hgvsCDNAIndex = labelLine.index('HGVS_cDNA')
        hgvsPIndex = labelLine.index('HGVS_Protein')
        geneSymbolIndex = labelLine.index("Gene_Symbol")
        synonymIndex = labelLine.index("Synonyms")

        reference_transcript = {
            'BRCA1': 'NM_007294.3',
            'BRCA2': 'NM_000059.3',
        }
        synonym_transcript_names = {
            'BRCA1': [
                'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
                'NM_007297.3', 'U14680.1'
            ],
            'BRCA2': ['U43746.1'],
        }
        # Resolve the alternative transcripts once; the table never changes.
        synonym_transcripts = dict(
            (gene, [get_transcript38(name) for name in names])
            for gene, names in synonym_transcript_names.items())

        for line in brcaFile:
            parsedLine = line.rstrip('\r\n').split('\t')
            gene_symbol = parsedLine[geneSymbolIndex]
            if gene_symbol in reference_transcript:
                parsedLine[refSeqIndex] = reference_transcript[gene_symbol]

            # Prefix the first GRCh38 coordinate with the reference sequence,
            # then pick the fields apart: refseq:chrom:offset:ref>alt
            oldHgvsGenomic38 = (parsedLine[refSeqIndex] + ':' +
                                parsedLine[hgvsG38Index].split(',')[0])
            fields = oldHgvsGenomic38.split(':')
            chrom38 = fields[1]
            offset38 = fields[2]
            alleles = fields[3].split('>')
            ref38 = alleles[0]
            alt38 = alleles[1]

            # Edge cases to correct variant string formats for indels in
            # order to be accepted by the counsyl parser
            if ref38 == '-':
                ref38 = ''
            if alt38 in ('-', 'None'):
                alt38 = ''

            transcript38 = get_transcript38(parsedLine[refSeqIndex])

            # Normalize hgvs cdna string to fit what the counsyl hgvs parser
            # determines to be the correct format
            cdna_coord = str(
                pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38,
                                        genome38, transcript38,
                                        use_gene=False,
                                        max_allele_length=100))
            chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome38, get_transcript=get_transcript38)
            chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome37, get_transcript=get_transcript37)
            chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome36, get_transcript=get_transcript36)

            # Generate transcript hgvs cdna synonym string
            synonymString = [
                str(
                    pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38,
                                            alt38, genome38, transcript,
                                            use_gene=False,
                                            max_allele_length=100))
                for transcript in synonym_transcripts.get(gene_symbol, [])
            ]

            # None means "leave HGVS_Protein alone": either protein mapping
            # is off or it failed for this row. This prevents a value from an
            # earlier row from being written into the current one.
            protein_text = None
            if calcProtein:
                try:
                    var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                    protein_coord = variantmapper.c_to_p(var_c1)
                except hgvs.exceptions.HGVSParseError as e:
                    print('hgvs.exceptions.HGVSParseError: ', e)
                    print('GRCh38 Genomic change: ',
                          genomic_change(chrom38, offset38, ref38, alt38))
                    print('')
                else:
                    protein_text = str(protein_coord)

            # write new data into line
            parsedLine[hgvsG36Index] = genomic_change(chrom36, offset36,
                                                      ref36, alt36)
            parsedLine[hgvsG37Index] = genomic_change(chrom37, offset37,
                                                      ref37, alt37)
            parsedLine[hgvsG38Index] = genomic_change(chrom38, offset38,
                                                      ref38, alt38)
            parsedLine[hgvsCDNAIndex] = cdna_coord
            if protein_text is not None:
                parsedLine[hgvsPIndex] = protein_text
            parsedLine[synonymIndex] = ','.join(synonymString)
            outputFile.write('\t'.join(parsedLine) + '\n')
    finally:
        # Release every handle even if a row fails part-way through.
        hg18_fa.close()
        hg19_fa.close()
        hg38_fa.close()
        refSeq18.close()
        refSeq19.close()
        refSeq38.close()
        outputFile.close()