def get_transcript(name):
    """Return the transcript registered under *name* in the refGene file.

    The module-level ``REFGENE`` path is opened and parsed with
    ``pyhgvs_utils.read_transcripts`` on every call; the result of
    ``.get(name)`` on the parsed mapping is returned.

    Terminates the process through ``sys.exit`` when ``REFGENE`` is ``None``.
    """
    if REFGENE is None:
        sys.exit("No reference genome was provided. Try to locate hg38.BRCA.refGene.txt.")

    with open(REFGENE) as refgene_file:
        transcripts_by_name = pyhgvs_utils.read_transcripts(refgene_file)
    return transcripts_by_name.get(name)
def parse_hgvs(hgvs_name, fasta, genes):
    """Parse *hgvs_name* with ``hgvs.parse_hgvs_name`` and return its result.

    hgvs_name -- the HGVS name to parse.
    fasta     -- path handed to ``Fasta``; sequence keys are rewritten
                 to ``chr<key>`` through the ``key_function``.
    genes     -- path of the transcripts file read with
                 ``hgvs_utils.read_transcripts``.
    """
    def _chr_prefixed(sequence_key):
        return 'chr{}'.format(sequence_key)

    genome = Fasta(fasta, key_function=_chr_prefixed)

    with open(genes) as genes_file:
        transcripts_by_name = hgvs_utils.read_transcripts(genes_file)

    # The bound ``get`` is the lookup callback: name in, transcript (or None) out.
    return hgvs.parse_hgvs_name(
        hgvs_name, genome, get_transcript=transcripts_by_name.get)
def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path="/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"):
    """Convert an HGVS name into a ``chrom:offset:ref>alt`` string.

    hgvs_c       -- HGVS name passed to ``pyhgvs.parse_hgvs_name``.
    genome_path  -- path given to ``SequenceFileDB``.
    refgene_path -- path of the refGene file read with
                    ``pyhgvs_utils.read_transcripts``.

    Both paths were previously hard-coded inside the function (the refGene
    one as an absolute path on a developer machine).  They are now optional
    keyword arguments whose defaults are the original values, so existing
    one-argument calls behave exactly as before.
    """
    genome = SequenceFileDB(genome_path)
    with open(refgene_path) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        hgvs_c, genome, get_transcript=get_transcript)
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt
def HGVS_to_GenomeCoor(HGVS):
    """Translate an HGVS name to ``chrom:offset:ref>alt`` via counsyl pyhgvs.

    The genome is loaded from ``../data/hg19.fa`` and the transcripts from
    ``../data/BRCA12.refGene.txt`` on every call.
    """
    reference_genome = SequenceFileDB('../data/hg19.fa')

    with open("../data/BRCA12.refGene.txt") as refgene_file:
        transcript_table = pyhgvs_utils.read_transcripts(refgene_file)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        HGVS, reference_genome, get_transcript=transcript_table.get)

    return ":".join([chrom, str(offset), ref + ">" + alt])
def __init__(self, ref_fa, ref_tr):
    """Load the reference sequence and the transcript table.

    ref_fa -- reference genome file in FASTA format.
    ref_tr -- transcripts file formatted like UCSC's refGene.txt.

    Both paths are kept on the instance (``ref_fa``, ``ref_tr``); the
    opened sequence database is stored as ``ref`` and the parsed
    transcripts as ``tr``.
    """
    # Reference sequence.
    self.ref_fa = ref_fa
    self.ref = SequenceFileDB(self.ref_fa)

    # RefSeq transcripts.
    self.ref_tr = ref_tr
    with open(self.ref_tr) as transcript_file:
        self.tr = util.read_transcripts(transcript_file)
def calc_all(variants, priors, genome, transcripts, processes):
    """Run ``calc_one`` over every input variant and write the sorted results.

    variants    -- open tab-separated input, read with ``csv.DictReader``.
    priors      -- open output, written with ``csv.DictWriter``.
    genome      -- not used by this function.
    transcripts -- open RefSeq transcripts file, parsed with
                   ``pyhgvs_utils.read_transcripts``.
    processes   -- when greater than 1, the work is spread over a
                   ``multiprocessing.Pool`` of that size.

    Side effects: sets the module globals ``brca1Transcript`` and
    ``brca2Transcript``, and reads extra column names from ``headers.tsv``
    in the current working directory.

    Compared with the previous version, the ``headers.tsv`` handle is now
    closed, and the worker pool is always closed/terminated and joined.
    """
    global brca1Transcript, brca2Transcript

    def variant_sort_key(d):
        return "{0}:g.{1}:{2}>{3}".format(d["Chr"], d["Pos"], d["Ref"], d["Alt"])

    inputData = csv.DictReader(variants, delimiter="\t")
    # This is the reader's own list: extending it in place also makes the
    # reader emit the new columns (valued None) for every row.
    fieldnames = inputData.fieldnames
    with open("headers.tsv", "r") as headers_file:
        fieldnames.extend(headers_file.read().split())

    outputData = csv.DictWriter(priors, delimiter="\t", lineterminator="\n",
                                fieldnames=fieldnames)
    outputData.writerow(dict((fn, fn) for fn in inputData.fieldnames))

    # Read RefSeq transcripts.
    transcript_table = pyhgvs_utils.read_transcripts(transcripts)
    brca1Transcript = transcript_table.get(BRCA1_RefSeq)
    brca2Transcript = transcript_table.get(BRCA2_RefSeq)

    if processes > 1:
        # Create a pool of processes and calculate in parallel.
        click.echo("Processing using {} processes".format(processes), err=True)
        pool = multiprocessing.Pool(processes)
        try:
            # Plain map() without a timeout prevents keyboard interrupts:
            # https://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool/1408476#1408476
            calculatedVariants = pool.map_async(
                calc_one, list(inputData)).get(99999999)
            pool.close()
            # Sort the output so that it is deterministic.
            outputData.writerows(sorted(calculatedVariants, key=variant_sort_key))
        except KeyboardInterrupt:
            pool.terminate()
        except Exception:
            pool.terminate()
            raise
        finally:
            # Safe here: the pool has been closed or terminated on every path.
            pool.join()
    else:
        outputData.writerows(
            sorted(map(calc_one, inputData), key=variant_sort_key))
def add_HGVS_c_counsyl(in_path, out_path):
    """Copy *in_path* to *out_path*, filling in missing HGVS cDNA names.

    The first line is copied verbatim.  For every other tab-separated line,
    column 2 (``chrom:offset:ref>alt``) and the transcript id in column 4
    are passed to ``pyhgvs.variant_to_hgvs_name`` (with the module-level
    ``GENOME``).  When column 5 is ``"-"`` it is replaced by the computed
    name; when it holds a different name the line is counted as unmatching
    and left as it is.  Line numbers and the final unmatching count are
    printed to stdout.

    Both files are now opened with ``with`` so they are closed (and the
    output flushed) even if a line fails to parse.  The prints are written
    so that they emit the same text under Python 2 and Python 3.
    """
    # counsyl pyhgvs setup
    with open('../data/BRCA12.refGene.txt') as infile:
        transcripts_counsyl = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts_counsyl.get(name)

    unmatching_cases = 0
    with open(in_path, "r") as f_in, open(out_path, "w") as f_out:
        for line_num, line in enumerate(f_in, 1):
            print(line_num)
            if line_num == 1:
                f_out.write(line)
                continue

            items = line.strip().split("\t")
            genome_coors = items[2].split(":")
            chrom = genome_coors[0]
            offset = int(genome_coors[1])
            ref_alt = genome_coors[2].split(">")
            ref = ref_alt[0]
            alt = ref_alt[1]

            transcript = get_transcript(items[4])
            hgvs_name_with_transcript = pyhgvs.variant_to_hgvs_name(
                chrom, offset, ref, alt, GENOME, transcript)
            hgvs_c = hgvs_name_with_transcript.format(
                use_prefix=False, use_gene=False)

            if items[5] == "-":
                items[5] = hgvs_c
            elif items[5] != hgvs_c:
                unmatching_cases += 1

            f_out.write("\t".join(items) + "\n")

    # Two spaces on purpose: the Python 2 statement
    # `print "unmatching cases: ", n` produced exactly this text.
    print("unmatching cases:  %s" % unmatching_cases)
def main():
    """Convert a tab-separated LOVD / exLOVD flat file into a VCF file.

    All inputs come from ``parse_args()``: the flat file (``input``), the
    annotation description file (``inAnnot``), the VCF output (``out``),
    the genome and RefSeq paths (``gpath``, ``rpath``), the error output
    (``errors``) and the ``source`` label, which must be ``LOVD`` or
    ``exLOVD`` (``ValueError`` otherwise).

    Every data line becomes one VCF record whose coordinates come from
    ``hgvs.parse_hgvs_name`` applied to the line's ``cDNA`` (or
    ``dna_change``) column.  Lines that fail are reported to the errors
    file; lines whose HGVS name is ``-`` are printed to stdout and skipped.
    """
    options = parse_args()
    flat_file = options.input
    annotation_path = options.inAnnot
    vcf_out = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    errors_out = options.errors
    source = options.source

    with open(refseq_path) as refseq_file:
        transcripts = hgvs_utils.read_transcripts(refseq_file)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # Annotation name -> description, taken from the first two columns.
    annotations = {}
    with open(annotation_path) as annotation_file:
        for annotation_line in annotation_file:
            columns = annotation_line.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header.
    source_headers = {
        "exLOVD": '##source=exLOVD',
        "LOVD": '##source=LOVD',
    }
    print('##fileformat=VCFv4.0', file=vcf_out)
    if source not in source_headers:
        raise ValueError('Source is %s, must be either LOVD or exLOVD' % (source))
    print(source_headers[source], file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    for annotation, description in annotations.items():
        info_line = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(
            annotation.replace(' ', '_'), description)
        print(info_line, file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> column index, from the flat file's header line.
    header_fields = flat_file.readline().strip().replace(' ', '_').replace('"', '').split('\t')
    field_index = {field: position for position, field in enumerate(header_fields)}

    for raw_line in flat_file:
        values = raw_line.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(field, normalize(field, values[field_index[field]]))
            for field in header_fields
        ]

        # The HGVS cDNA term lives in the cDNA column in some files and in
        # the dna_change column in others.
        if 'cDNA' in field_index:
            hgvs_name = values[field_index['cDNA']]
        elif 'dna_change' in field_index:
            hgvs_name = values[field_index['dna_change']]
        else:
            sys.exit("ERROR: could not parse hgvs name.")

        if hgvs_name == '-':
            print(values)
            continue

        query_name = hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            record = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, query_name, ref, alt, info_string)
            print(record, file=vcf_out)
        except Exception as e:
            print(str(e) + ': could not parse hgvs field ' + query_name,
                  file=errors_out)
#Load the data df = pd.read_excel(io=path, sheet_name=genename + '_' + variants, engine='openpyxl') data = [] for name in df['cDNA variant']: data.append(name) #read the genome # the reference genome can be downloaded from: http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/ genome = Fasta('references/hg19.fa') # Read RefSeq transcripts into a python dict. # The RefSeq transcripts can be downloaded from: https://github.com/counsyl/hgvs/blob/master/pyhgvs/data/genes.refGene with open('references/genes.refGene') as infile: transcripts = read_transcripts(infile) # Provide a callback for fetching a transcript by its name. def get_transcript(name): return transcripts.get(name) # Store the variant information in a list vcf = [] for v in data: chrom, offset, ref, alt = hgvs.parse_hgvs_name( gene + ':' + v, genome, get_transcript=get_transcript) vcf.append([chrom, offset, ref, alt]) # Define reference genome
def main(args):
    """Convert a tab-separated exLOVD flat file into a VCF file.

    *args* is not used; every input comes from ``parse_args()``: the flat
    file (``inEXLOVD``), the annotation description file (``inAnnot``), the
    VCF output (``out``), the genome and RefSeq paths (``gpath``,
    ``rpath``) and the error output (``errors``).

    Every data line becomes one VCF record whose coordinates come from
    ``hgvs.parse_hgvs_name`` applied to the line's ``cDNA`` (or
    ``dna_change``) column.  Lines that fail are reported to the errors
    file; lines whose HGVS name is ``-`` are printed to stdout and skipped.
    """
    options = parse_args()
    flat_file = options.inEXLOVD
    annotation_path = options.inAnnot
    vcf_out = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    errors_out = options.errors

    with open(refseq_path) as refseq_file:
        transcripts = hgvs_utils.read_transcripts(refseq_file)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # Annotation name -> description, taken from the first two columns.
    annotations = {}
    with open(annotation_path) as annotation_file:
        for annotation_line in annotation_file:
            columns = annotation_line.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header.
    for fixed_header in ('##fileformat=VCFv4.0',
                         '##source=exLOVD',
                         '##reference=GRCh37'):
        print(fixed_header, file=vcf_out)
    for annotation, description in annotations.items():
        info_line = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(
            annotation.replace(' ', '_'), description)
        print(info_line, file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> column index, from the flat file's header line.
    header_fields = flat_file.readline().strip().replace(' ', '_').replace(
        '"', '').split('\t')
    field_index = {field: position for position, field in enumerate(header_fields)}

    for raw_line in flat_file:
        values = raw_line.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(field, normalize(field, values[field_index[field]]))
            for field in header_fields
        ]

        # The HGVS cDNA term lives in the cDNA column in some files and in
        # the dna_change column in others.
        if 'cDNA' in field_index:
            hgvs_name = values[field_index['cDNA']]
        elif 'dna_change' in field_index:
            hgvs_name = values[field_index['dna_change']]
        else:
            sys.exit("ERROR: could not parse hgvs name.")

        if hgvs_name == '-':
            print(values)
            continue

        query_name = hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            record = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, query_name, ref, alt, info_string)
            print(record, file=vcf_out)
        except Exception as e:
            print(str(e) + ': could not parse hgvs field ' + query_name,
                  file=errors_out)
def get_transcript(name):
    """Return the transcript called *name* from ``../refgene38_brca.txt``.

    The file is opened and parsed with ``pyhgvs_utils.read_transcripts``
    on every call; the result of ``.get(name)`` on it is returned.
    """
    refgene_path = "../refgene38_brca.txt"
    with open(refgene_path) as refgene_file:
        transcripts_by_name = pyhgvs_utils.read_transcripts(refgene_file)
    return transcripts_by_name.get(name)
def main():
    """Convert a tab-separated variant flat file into a VCF file.

    All inputs come from ``parse_args()``: the flat file (``input``), the
    annotation description file (``inAnnot``), the VCF output (``out``),
    the genome and RefSeq paths (``gpath``, ``rpath``), the ``source``
    label written to the VCF header, and the ``logfile`` / ``verbose``
    logging settings.

    For each data line the ``hgvs_nucleotide`` value is prefixed with the
    transcript chosen from ``gene_symbol`` (brca1 or brca2, compared
    case-insensitively) and resolved with ``hgvs.parse_hgvs_name``.
    Lines with a ``-`` name, another gene symbol, or a name that cannot
    be parsed are skipped with a debug log entry.
    """
    options = parse_args()
    flat_file = options.input
    annotation_path = options.inAnnot
    vcf_out = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    source = options.source
    logfile = options.logfile

    logging_level = logging.DEBUG if options.verbose else logging.CRITICAL
    logging.basicConfig(filename=logfile, filemode="w", level=logging_level)

    with open(refseq_path) as refseq_file:
        transcripts = hgvs_utils.read_transcripts(refseq_file)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # Annotation name -> description, taken from the first two columns.
    annotations = {}
    with open(annotation_path) as annotation_file:
        for annotation_line in annotation_file:
            columns = annotation_line.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header.
    print('##fileformat=VCFv4.0', file=vcf_out)
    print('##source={0}'.format(source), file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    for annotation, description in annotations.items():
        info_line = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(
            annotation.replace(' ', '_'), description)
        print(info_line, file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> column index, from the flat file's header line.
    header_fields = flat_file.readline().strip().replace(' ', '_').replace(
        '"', '').split('\t')
    field_index = {field: position for position, field in enumerate(header_fields)}

    transcript_for_gene = {
        'brca1': 'NM_007294.3',
        'brca2': 'NM_000059.3',
    }

    for raw_line in flat_file:
        values = raw_line.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(field, normalize(field, values[field_index[field]]))
            for field in header_fields
        ]

        # Extract the HGVS cDNA term for the variant.
        hgvs_name = values[field_index['hgvs_nucleotide']]
        if hgvs_name == '-':
            logging.debug("hgvs name == '-' for line: %s", values)
            continue

        gene_symbol = values[field_index['gene_symbol']].lower()
        if gene_symbol not in transcript_for_gene:
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue
        transcript = transcript_for_gene[gene_symbol]

        query_name = transcript + ':' + hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            record = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, query_name, ref, alt, info_string)
            print(record, file=vcf_out)
        except Exception:
            logging.debug("could not parse hgvs field: %s", query_name)
# Load config.ini from BinPath; every resource below is a path relative to
# BinPath taken from the [DEFAULT] section.
config = configparser.ConfigParser()
config.read(BinPath + '/config.ini')

# Lookup tables and region sets, each built by its own reader from the file
# named by the corresponding config key.
omim_dict = read_morbidmap(BinPath + '/' + config['DEFAULT']['morbidmap'])
pathogenic_dict, pathogenic_dict2 = read_pathogenic_site(
    BinPath + '/' + config['DEFAULT']['pathogenic_ref'])
ba1_exception = read_ba1_exception(BinPath + '/' + config['DEFAULT']['ba1_exception'])
pvs1_levels = read_pvs1_levels(BinPath + '/' + config['DEFAULT']['pvs1levels'])
domain_bed = create_bed_dict(BinPath + '/' + config['DEFAULT']['domain'])
hotspot_bed = create_bed_dict(BinPath + '/' + config['DEFAULT']['hotspot'])
curated_region = create_bed_dict(BinPath + '/' + config['DEFAULT']['curated_region'])
exon_lof_frequent = create_bed_dict(BinPath + '/' + config['DEFAULT']['exon_lof_frequent'])

# Reference genome ('ref') and transcripts ('trans').
genome = Fasta(BinPath + '/' + config['DEFAULT']['ref'])
with open(BinPath + '/' + config['DEFAULT']['trans']) as gpefile:
    transcripts = read_transcripts(gpefile)

# Second pass over the same transcripts file to build gene <-> transcript
# maps from tab-separated columns 12 (gene) and 1 (transcript).
# gene_trans keeps only the LAST transcript seen for each gene.
gene_trans = {}
trans_gene = {}
with open(BinPath + '/' + config['DEFAULT']['trans']) as f:
    for line in f:
        record = line.strip().split("\t")
        gene = record[12]
        trans = record[1]
        gene_trans[gene] = trans
        trans_gene[trans] = gene
def main(args):
    """Rewrite the coordinate, cDNA, protein and synonym columns of a BRCA table.

    *args* is not used; all inputs come from ``parse_args()``: the input
    table (``inBRCA``), three genome FASTA files and three RefSeq files
    (hg18 / hg19 / hg38), the output table (``outBRCA``) and the
    ``calcProtein`` switch.

    For each row the hg38 genomic coordinate is turned into a cDNA name
    with ``pyhgvs.format_hgvs_name``; that name is then parsed back
    against each of the three genomes and the resulting coordinates, the
    cDNA name and a list of cDNA synonyms (the same variant on other
    transcripts) are written into the row.

    Fix over the previous version: when ``calcProtein`` is on and the
    protein computation raised ``HGVSParseError``, ``protein_coord`` was
    either undefined (first row) or still held the previous row's value,
    and was then written to the row.  It now defaults to the row's
    existing ``HGVS_Protein`` value, so a failed row keeps what it had.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    def format_cdna38(chrom, offset, ref, alt, transcript):
        """Name the hg38 variant on *transcript* as a cDNA HGVS string."""
        return str(
            pyhgvs.format_hgvs_name(chrom, int(offset), ref, alt, genome38,
                                    transcript, use_gene=False,
                                    max_allele_length=100))

    labelLine = brcaFile.readline().rstrip().split('\t')
    outputFile.write('\t'.join(labelLine) + '\n')

    # Store indexes of the relevant columns.
    hgvsG36Index = labelLine.index('Genomic_Coordinate_hg36')
    hgvsG37Index = labelLine.index('Genomic_Coordinate_hg37')
    hgvsG38Index = labelLine.index('Genomic_Coordinate_hg38')
    refSeqIndex = labelLine.index('Reference_Sequence')
    hgvsCDNAIndex = labelLine.index('HGVS_cDNA')
    hgvsPIndex = labelLine.index('HGVS_Protein')
    geneSymbolIndex = labelLine.index("Gene_Symbol")
    synonymIndex = labelLine.index("Synonyms")

    # Transcripts on which the cDNA synonyms are expressed, per gene.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }

    for line in brcaFile:
        parsedLine = line.rstrip().split('\t')
        geneSymbol = parsedLine[geneSymbolIndex]

        if geneSymbol == 'BRCA1':
            parsedLine[refSeqIndex] = 'NM_007294.3'
        elif geneSymbol == 'BRCA2':
            parsedLine[refSeqIndex] = 'NM_000059.3'

        # "<refseq>:<chrom>:<offset>:<ref>><alt>"; only the first
        # comma-separated hg38 coordinate is used.
        oldHgvsGenomic38 = parsedLine[refSeqIndex] + ':' + parsedLine[
            hgvsG38Index].split(',')[0]
        genomicParts = oldHgvsGenomic38.split(':')
        chrom38 = genomicParts[1]
        offset38 = genomicParts[2]
        refAlt = genomicParts[3].split('>')
        ref38 = refAlt[0]
        alt38 = refAlt[1]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser.
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        # Normalize the cDNA string to what the counsyl parser considers
        # the correct format, then resolve it on every genome build.
        cdna_coord = format_cdna38(chrom38, offset38, ref38, alt38,
                                   get_transcript38(parsedLine[refSeqIndex]))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate the cDNA synonyms (empty for any other gene symbol).
        synonymString = [
            format_cdna38(chrom38, offset38, ref38, alt38,
                          get_transcript38(transcriptName))
            for transcriptName in synonymTranscripts.get(geneSymbol, [])
        ]

        # Fall back to the value already in the row if the computation fails.
        protein_coord = parsedLine[hgvsPIndex]
        # `== True` is kept on purpose: the option's type is not visible
        # here, and plain truthiness would treat e.g. a non-empty string
        # differently.
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                print('hgvs.exceptions.HGVSParseError: ', e)
                print(
                    'GRCh38 Genomic change: ',
                    '{0}:{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38))
                print('')

        # Write the new data into the row.
        parsedLine[hgvsG36Index] = '{0}:{1}:{2}>{3}'.format(
            chrom36, offset36, ref36, alt36)
        parsedLine[hgvsG37Index] = '{0}:{1}:{2}>{3}'.format(
            chrom37, offset37, ref37, alt37)
        parsedLine[hgvsG38Index] = '{0}:{1}:{2}>{3}'.format(
            chrom38, offset38, ref38, alt38)
        parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
        if calcProtein == True:
            parsedLine[hgvsPIndex] = '{0}'.format(str(protein_coord))
        parsedLine[synonymIndex] = ','.join(synonymString)

        outputFile.write('\t'.join(parsedLine) + '\n')

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
    outputFile.close()
def main(args):
    """Append pyhgvs-derived columns and cDNA synonyms to a BRCA variant table.

    *args* is not used; all inputs come from ``parse_args()``: the input
    table (``inBRCA``), three genome FASTA files and three RefSeq files
    (hg18 / hg19 / hg38), the output table (``outBRCA``), the
    ``calcProtein`` switch and ``artifacts_dir`` (created if missing; the
    log file is written there).

    For each row the hg38 ``Chr`` / ``Pos`` / ``Ref`` / ``Alt`` values are
    turned into a cDNA name, which is parsed back against each genome
    build.  The results go into the new ``pyhgvs_*`` columns and the
    ``Synonyms`` column is extended with the same variant named on other
    transcripts (including variants listed in ``HGVS_cDNA_LOVD``).  Rows
    whose reference transcript is unknown are reported and skipped.

    Fixes over the previous version:
    * the ``HGVSParseError`` handler built its message from ``ex`` although
      the exception is bound as ``e`` (NameError, or a stale exception);
    * ``protein_coord`` was undefined or left over from the previous row
      after a parse failure; it now defaults to ``'-'`` for every row;
    * ``input_file.next()`` is Python 2 only; ``next(input_file)`` works on
      both Python 2 and 3;
    * the log path is built with ``os.path.join`` so it lands inside
      ``artifacts_dir`` even when that path has no trailing separator.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    log_file_path = os.path.join(artifacts_dir, "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    def format_cdna38(chrom, offset, ref, alt, transcript):
        """Name the hg38 variant on *transcript* as a cDNA HGVS string."""
        return str(
            pyhgvs.format_hgvs_name(chrom, int(offset), ref, alt, genome38,
                                    transcript, use_gene=False,
                                    max_allele_length=100))

    # Set up header for output file.
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file.
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"
    ]
    output_header_row = input_header_row + new_columns_to_append
    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns.
    hgvsG38Index = input_header_row.index('Genomic_Coordinate_hg38')
    refSeqIndex = input_header_row.index('Reference_Sequence')
    hgvsCDNAIndex = input_header_row.index('HGVS_cDNA')
    hgvsCDNALOVDIndex = input_header_row.index('HGVS_cDNA_LOVD')
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")
    # These columns are looked up only to keep requiring that the input
    # header contains them (ValueError otherwise), as before.
    input_header_row.index('Genomic_Coordinate_hg36')
    input_header_row.index('Genomic_Coordinate_hg37')
    input_header_row.index('HGVS_Protein')

    # Transcripts on which the cDNA synonyms are expressed, per gene.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }

    for line in input_file:
        geneSymbol = line[geneSymbolIndex]
        if geneSymbol == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif geneSymbol == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging.
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(
            ',')[0]
        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser.
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        transcript38 = get_transcript38(line[refSeqIndex])
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" %
                  (line))
            continue

        # Normalize the cDNA string to what the counsyl parser considers
        # the correct format, then resolve it on every genome build.
        cdna_coord = format_cdna38("chr" + chrom38, offset38, ref38, alt38,
                                   transcript38)
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Start from the synonyms already present in the row, if any.
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")

        geneTranscripts = synonymTranscripts.get(geneSymbol, [])
        for transcriptName in geneTranscripts:
            synonymString.append(
                format_cdna38(chrom38, offset38, ref38, alt38,
                              get_transcript38(transcriptName)))

        # Add hgvs_cDNA values from LOVD to synonyms if not already present.
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank.
            if cdna_coord_LOVD in ("-", ""):
                continue
            # Don't add to synonyms if the main hgvs_cDNA field is already
            # equivalent to the hgvs_cDNA value from LOVD.
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue
            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
            for transcriptName in geneTranscripts:
                cdna_synonym = format_cdna38(
                    chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD,
                    get_transcript38(transcriptName))
                if cdna_synonym not in synonymString:
                    synonymString.append(cdna_synonym)

        # Same placeholder the new columns start with; it is what gets
        # written when the protein computation fails for this row.
        protein_coord = '-'
        # `== True` is kept on purpose: the option's type is not visible
        # here, and plain truthiness would treat e.g. a non-empty string
        # differently.
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(e).__name__, e.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)
            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion
        # by index.
        line.extend(['-'] * len(new_columns_to_append))

        def set_column(column_name, value):
            line[output_header_row.index(column_name)] = value

        set_column("pyhgvs_Genomic_Coordinate_36",
                   '{0}:g.{1}:{2}>{3}'.format(chrom36, offset36, ref36, alt36))
        set_column("pyhgvs_Genomic_Coordinate_37",
                   '{0}:g.{1}:{2}>{3}'.format(chrom37, offset37, ref37, alt37))
        set_column("pyhgvs_Genomic_Coordinate_38",
                   '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38))
        # NOTE(review): the hg37/hg36 end positions use len(ref38), as in
        # the original code; confirm this is intended.
        set_column("pyhgvs_Hg37_Start", str(offset37))
        set_column("pyhgvs_Hg37_End", str(int(offset37) + len(ref38) - 1))
        set_column("pyhgvs_Hg36_Start", str(offset36))
        set_column("pyhgvs_Hg36_End", str(int(offset36) + len(ref38) - 1))
        set_column("pyhgvs_cDNA", '{0}'.format(cdna_coord))
        if calcProtein == True:
            set_column("pyhgvs_Protein", '{0}'.format(str(protein_coord)))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):
    """Normalize the variants of a VCF file and write them to a new VCF.

    *args* is not used; inputs come from ``parse_args()``: ``refFASTA``,
    ``refSEQ``, ``inVCF`` and ``outVCF``.

    Each record (first ALT allele only) is named on the transcript chosen
    from its chromosome (chr13 -> NM_000059.3, chr17 -> NM_007294.4).
    Single-base substitutions are normalized with the hgvs ``Normalizer``;
    everything else goes through ``c_to_g`` and ``Babelfish.hgvs_to_vcf``.
    If any of the handled exceptions is raised, the record is written
    with the values it was read with.

    Fix over the previous version: for a record on any other chromosome
    ``transcript_id`` was never assigned, so the code either raised
    ``NameError`` (first record) or silently reused the previous record's
    transcript.  Such records are now reported and written without
    normalization.
    """
    options = parse_args()

    hdp = hgvs.dataproviders.uta.connect()
    am38 = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38')
    hn = hgvs.normalizer.Normalizer(hdp)
    hp = hgvs.parser.Parser()

    # Read genome sequence using pyfaidx.
    genome = Fasta(options.refFASTA)

    # Read RefSeq transcripts into a python dict.
    with open(options.refSEQ) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    babelfish38 = Babelfish(hdp, assembly_name="GRCh38")

    with open(options.inVCF, 'rb') as in_vcf, open(options.outVCF, 'w') as out_vcf:
        vcf_reader = vcf.Reader(in_vcf)
        vcf_writer = vcf.Writer(out_vcf, vcf_reader)

        for record in vcf_reader:
            chrom, offset, ref, alt = (str(record.CHROM), record.POS,
                                       str(record.REF), str(record.ALT[0]))
            print('chrom: {}, offset: {}, ref: {}, alt: {}'.format(
                chrom, offset, ref, alt))

            on_chr17 = 'chr17' in record.CHROM
            if 'chr13' in record.CHROM:
                transcript_id = "NM_000059.3"
            elif on_chr17:
                transcript_id = "NM_007294.4"
            else:
                transcript_id = None

            if transcript_id is None:
                print('No transcript configured for chromosome {}; '
                      'record written without normalization'.format(chrom))
            else:
                transcript = get_transcript(transcript_id)
                try:
                    hgvs_name = pyhgvs.format_hgvs_name(
                        chrom, offset, ref, alt, genome, transcript,
                        use_gene=False, max_allele_length=50000)
                    hgvs_c = hp.parse_hgvs_variant(hgvs_name)

                    # For chr17 the accession is switched to NM_007294.3
                    # before calling into the hgvs library.
                    if on_chr17:
                        hgvs_c.ac = 'NM_007294.3'

                    if len(ref) == len(alt) and len(ref) == 1:
                        # Variant is a SNP: normalize using the hgvs
                        # Normalizer, then switch back to NM_007294.4,
                        # the id used for the pyhgvs transcript lookup.
                        norm_hgvs_c = hn.normalize(hgvs_c)
                        if on_chr17:
                            norm_hgvs_c.ac = 'NM_007294.4'
                        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                            str(norm_hgvs_c), genome, normalize=False,
                            get_transcript=get_transcript)
                    else:
                        # Variant is an INDEL: normalize using
                        # babelfish38.hgvs_to_vcf.
                        hgvs_g = am38.c_to_g(hgvs_c)
                        vcf_values = babelfish38.hgvs_to_vcf(hgvs_g)
                        chrom = 'chr{}'.format(vcf_values[0])
                        offset = vcf_values[1]
                        ref = vcf_values[2]
                        alt = vcf_values[3]
                except hgvs.exceptions.HGVSUnsupportedOperationError as e:
                    print(
                        'hgvs.exceptions.HGVSUnsupportedOperationError: {}'.format(
                            e))
                except hgvs.exceptions.HGVSInvalidIntervalError as e:
                    print('hgvs.exceptions.HGVSInvalidIntervalError: {}'.format(e))
                except hgvs.exceptions.HGVSInvalidVariantError as e:
                    print('hgvs.exceptions.HGVSInvalidVariantError: {}'.format(e))
                except AttributeError as e:
                    print('AttributeError: {}'.format(e))
                except KeyError as e:
                    print('KeyError: {}'.format(e))

            # Update and write the (possibly normalized) record.
            record.POS = offset
            record.REF = ref
            record.ALT = [alt]
            vcf_writer.write_record(record)
def get_transcript(name):
    """Return the transcript called *name* from the hg38 BRCA refGene file.

    ``../resources/refseq/hg38.BRCA.refGene.txt`` is opened and parsed with
    ``pyhgvs_utils.read_transcripts`` on every call; the result of
    ``.get(name)`` on it is returned.
    """
    refgene_path = "../resources/refseq/hg38.BRCA.refGene.txt"
    with open(refgene_path) as refgene_file:
        transcripts_by_name = pyhgvs_utils.read_transcripts(refgene_file)
    return transcripts_by_name.get(name)
import toolshed as ts
from collections import defaultdict
import pyhgvs as hgvs
from pygr.seqdb import SequenceFileDB
from pyhgvs.utils import read_transcripts
import sys

var = sys.argv[1]  # 'ogfiles/fetalgenomevariants.txt'
trans = sys.argv[2]  # 'ogfiles/fetaltranscripts.txt'
f = open(sys.argv[3], 'w')  # 'fetalvariants.vcf'

# Obtained by running the gtfToGenePred UCSC binary on the GTF of GRCh37 from
# ENSEMBL.  A 0 was added with sed to the front of each line as a placeholder
# for the undocumented id field.
infile = 'Homo_sapiens.GRCh37.75.genePred'
with open(infile) as reffile:
    transcripts = read_transcripts(reffile)


def get_transcript(name):
    """Return the transcript called *name*, or None when it is unknown."""
    return transcripts.get(name)


def translate(variant, transcripts, get_transcript):
    """Resolve an HGVS *variant* name to ``(chrom, offset, ref, alt)``.

    *get_transcript* is the lookup callback handed to
    ``hgvs.parse_hgvs_name``; *transcripts* is accepted but not used here.
    Returns the integer ``1`` when the name cannot be parsed.

    The handler used to be a bare ``except:``, which also swallowed
    ``KeyboardInterrupt`` and ``SystemExit``; it now catches ``Exception``
    only, so the script can be interrupted while ordinary parse failures
    are still reported through the ``1`` return value.
    """
    genome = SequenceFileDB('hg19.fa')  # pip install bsddb3 is required
    try:
        chrom, offset, ref, alt = hgvs.parse_hgvs_name(
            variant, genome, get_transcript=get_transcript)
    except Exception:
        return 1
    return chrom, offset, ref, alt


def readgenes(trans):
    """Map column 0 to column 1 for each tab-separated line of *trans*.

    The file is opened with ``ts.nopen``.  A later line with the same
    first column replaces the earlier entry.  The result is a
    ``defaultdict(str)``, so looking up a missing key yields ``''``.
    """
    genes = defaultdict(str)
    for fields in (x.rstrip('\r\n').split("\t") for x in ts.nopen(trans)):
        genes[fields[0]] = fields[1]
    return genes
def main():
    """Convert a tab-separated variant table into a minimal VCF.

    Reads its settings from ``parse_args()``: the already-open input table
    and output VCF, the annotation file path, the genome and RefSeq paths,
    the ``##source`` value, the log file and the verbosity flag.

    Every header column of the input becomes a ``key=value`` entry in the
    INFO field.  Rows whose ``hgvs_nucleotide`` is ``-``, whose gene symbol
    is neither BRCA1 nor BRCA2, or whose HGVS name cannot be converted are
    skipped with a debug log message.
    """
    options = parse_args()
    table_in = options.input
    annotation_path = options.inAnnot
    vcf_out = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    source = options.source
    logfile = options.logfile

    logging_level = logging.DEBUG if options.verbose else logging.CRITICAL
    logging.basicConfig(filename=logfile, filemode="w", level=logging_level)

    with open(refseq_path) as refseq_file:
        transcripts = hgvs_utils.read_transcripts(refseq_file)
    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # Annotation name -> description, taken from the first two columns.
    descriptions = {}
    with open(annotation_path) as annotation_file:
        for raw in annotation_file:
            columns = raw.strip().split('\t')
            descriptions[columns[0]] = columns[1]

    # VCF meta lines followed by the column header.
    print('##fileformat=VCFv4.0', file=vcf_out)
    print('##source={0}'.format(source), file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    for annotation, description in descriptions.items():
        info_id = annotation.replace(' ', '_')
        print('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(info_id, description), file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # Column name -> position; a repeated name keeps its last position.
    header_text = table_in.readline().strip()
    headerline = header_text.replace(' ', '_').replace('"', '').split('\t')
    column_of = {name: position for position, name in enumerate(headerline)}

    transcript_for_gene = {'brca1': 'NM_007294.3', 'brca2': 'NM_000059.3'}

    for raw in table_in:
        cells = raw.replace('"', '').strip().split('\t')

        # The INFO entries are built before any skip decision is made.
        info_entries = []
        for name in headerline:
            value = normalize(name, cells[column_of[name]])
            info_entries.append('{0}={1}'.format(name, value))

        # extract hgvs cDNA term for variant and cleanup formatting
        hgvs_name = cells[column_of['hgvs_nucleotide']]
        if hgvs_name == '-':
            logging.debug("hgvs name == '-' for line: %s", cells)
            continue

        gene_symbol = cells[column_of['gene_symbol']].lower()
        transcript = transcript_for_gene.get(gene_symbol)
        if transcript is None:
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue

        query_name = transcript + ':' + hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_entries)

        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(chrom, offset, query_name, ref, alt, info_string), file=vcf_out)
        except Exception:
            logging.debug("could not parse hgvs field: %s", query_name)
import pyhgvs import pyhgvs.utils as pyhgvs_utils from pygr.seqdb import SequenceFileDB import sys import os import json import string_comp ORIGINAL_FILE = "../BRCA_selectedLabs_only/ClinVarBRCA.selectedLabsOnly.txt" ERROR = "../BRCA_selectedLabs_only/BRCA.wrong_genome_Coor" FILE = "../BRCA_selectedLabs_only/BRCA.pre-processed" GENOME = SequenceFileDB("../reference_files/hg19.fa") with open('../reference_files/genes.refGene.BRCA.txt') as infile: transcripts = pyhgvs_utils.read_transcripts(infile) def get_transcript(name): return transcripts.get(name) def main(): check_HGVS_conversion_error() #HGVS_conversion() def HGVS_to_genome_coor(HGVS): try: chrm, pos, ref, alt = pyhgvs.parse_hgvs_name( HGVS, GENOME, get_transcript=get_transcript) chrm = chrm[3:] pos = str(pos)
('NM_000352.3', 'c', '>', CDNACoord(215, -10), CDNACoord(215, -10), 'A', 'G') """ from __future__ import print_function from __future__ import unicode_literals import pyhgvs as hgvs import pyhgvs.utils as hgvs_utils from pyfaidx import Fasta # Read genome sequence using pyfaidx. genome = Genome('/tmp/hg19.fa') # Read RefSeq transcripts into a python dict. with open('pyhgvs/data/genes.refGene') as infile: transcripts = hgvs_utils.read_transcripts(infile) # Provide a callback for fetching a transcript by its name. def get_transcript(name): return transcripts.get(name) # Parse the HGVS name into genomic coordinates and alleles. chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G', genome, get_transcript=get_transcript) print(chrom, offset, ref, alt) # Returns variant in VCF style: ('chr11', 17496508, 'T', 'C') # Notice that since the transcript is on the negative strand, the alleles # are reverse complemented during conversion.
def main(args):
    """Add pyhgvs-derived coordinates, names and synonyms to a BRCA TSV.

    For every row of the input table the GRCh38 ``Chr``/``Pos``/``Ref``/
    ``Alt`` values are converted to a cDNA HGVS name on the gene's main
    transcript, and that name is mapped back to genomic coordinates on the
    hg38, hg19 and hg18 genomes.  cDNA names on the alternative transcripts,
    and for any LOVD cDNA values not already covered, are appended to the
    ``Synonyms`` column.  With ``calcProtein`` set, the protein change is
    computed through the hgvs package.  Nine ``pyhgvs_*`` columns are
    appended to each output row.

    Rows whose reference transcript is missing from the hg38 transcript
    table are reported on stdout and left out of the output.

    Args:
        args: Not used; options are read from ``parse_args()``.

    Raises:
        EnvironmentError: if the protein calculation fails with a
            ``DatabaseError``, since such failures are transient and would
            make runs non-reproducible.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # Joined rather than concatenated so the log lands inside artifacts_dir
    # even when the option was given without a trailing slash.
    log_file_path = os.path.join(artifacts_dir, "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    hgvs_parser = hgvs.parser.Parser()
    hgvs_dp = hgvs.dataproviders.uta.connect()
    hgvs_norm = hgvs.normalizer.Normalizer(hgvs_dp)
    hgvs_am = hgvs.assemblymapper.AssemblyMapper(hgvs_dp, assembly_name='GRCh38')

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    def append_cdna_synonyms(chrom, offset, ref, alt, transcript_names,
                             synonyms, skip_existing):
        # Append the variant's cDNA name on each listed transcript (hg38).
        for transcript_name in transcript_names:
            cdna_synonym = str(pyhgvs.format_hgvs_name(
                chrom, int(offset), ref, alt, genome38,
                get_transcript38(transcript_name),
                use_gene=False, max_allele_length=100))
            if skip_existing and cdna_synonym in synonyms:
                continue
            synonyms.append(cdna_synonym)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')

    # next() works on Python 2.6+ and 3; reader.next() exists only on 2.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = ["pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
                             "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
                             "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"]

    output_header_row = input_header_row + new_columns_to_append
    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns.  Some of these are not read
    # below, but the lookups are kept because .index() raises ValueError for
    # a missing column, so they double as a check of the input header.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")

    # Looked up once instead of once per row.
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")
    new_column_index = {name: output_header_row.index(name)
                        for name in new_columns_to_append}

    refSeqBRCA1Transcripts = ['NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3', 'NM_007297.3', 'U14680.1']
    refSeqBRCA2Transcripts = ['U43746.1']
    synonym_transcripts = {'BRCA1': refSeqBRCA1Transcripts,
                           'BRCA2': refSeqBRCA2Transcripts}

    genomic_template = '{0}:g.{1}:{2}>{3}'

    for line in input_file:
        gene_symbol = line[geneSymbolIndex]
        if gene_symbol == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif gene_symbol == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order to
        # be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        transcript38 = get_transcript38(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser
        # determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue

        cdna_coord = str(pyhgvs.format_hgvs_name(
            "chr" + chrom38, int(offset38), ref38, alt38, genome38,
            transcript38, use_gene=False, max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")

        gene_transcripts = synonym_transcripts.get(gene_symbol, [])
        append_cdna_synonyms(chrom38, offset38, ref38, alt38,
                             gene_transcripts, synonymString, False)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Strip first so padded placeholders such as " -" are recognized
            # as blank instead of failing on the ':' split below.
            cdna_coord_LOVD = cdna_coord_LOVD.strip()

            # Skip if blank
            if cdna_coord_LOVD in ("-", ""):
                continue

            lovd_parts = cdna_coord_LOVD.split(':')
            if len(lovd_parts) < 2:
                # No "transcript:change" separator; report it the same way as
                # any other unparsable LOVD value rather than aborting.
                print('parse error: {}'.format(cdna_coord_LOVD))
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already
            # equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = lovd_parts[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            try:
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                    cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
                append_cdna_synonyms(chrom38LOVD, offset38LOVD, ref38LOVD,
                                     alt38LOVD, gene_transcripts,
                                     synonymString, True)
            except Exception as e:
                print('parse error: {}'.format(cdna_coord_LOVD))
                print(e)

        protein_coord = None
        if calcProtein:
            # Built outside the try so it is always available to the handler.
            genomic_change = genomic_template.format(chrom38, offset38, ref38, alt38)
            try:
                var_c1 = hgvs_parser.parse_hgvs_variant(cdna_coord)
                # doing normalization explicitly to get a useful error message
                var_c1_norm = hgvs_norm.normalize(var_c1)
                protein_coord = hgvs_am.c_to_p(var_c1_norm)
            except Exception as e:
                template = "An error of type {0} occured. Arguments:{1!r}"
                error_name = type(e).__name__
                message = template.format(error_name, e.args)
                logging.error(message)
                logging.error('Proposed GRCh38 Genomic change for error: %s', genomic_change)
                logging.error(line)

                # Exceptions related to invalid data
                data_errors = set(['HGVSParseError', 'HGVSError', 'HGVSInvalidVariantError', 'HGVSUnsupportedOperationError'])
                if error_name not in data_errors:
                    # output some more if exception doesn't seem to be
                    # related to invalid data
                    logging.error("Non data error raised")
                    logging.exception(message)

                    if error_name == "DatabaseError":
                        # Aborting, as it is a transient error in principle,
                        # i.e. in one run we might be able to obtain a protein
                        # change, in another not, messing up the data diffs
                        raise EnvironmentError("Issue with UTA database. Aborting")

        # Add empty data for each new column to prepare for data insertion by
        # index
        line.extend(['-'] * len(new_columns_to_append))

        line[new_column_index["pyhgvs_Genomic_Coordinate_36"]] = genomic_template.format(chrom36, offset36, ref36, alt36)
        line[new_column_index["pyhgvs_Genomic_Coordinate_37"]] = genomic_template.format(chrom37, offset37, ref37, alt37)
        line[new_column_index["pyhgvs_Genomic_Coordinate_38"]] = genomic_template.format(chrom38, offset38, ref38, alt38)
        line[new_column_index["pyhgvs_Hg37_Start"]] = str(offset37)
        line[new_column_index["pyhgvs_Hg37_End"]] = str(int(offset37) + len(ref38) - 1)
        line[new_column_index["pyhgvs_Hg36_Start"]] = str(offset36)
        line[new_column_index["pyhgvs_Hg36_End"]] = str(int(offset36) + len(ref38) - 1)
        line[new_column_index["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
        # Same truthiness test as the calculation above (was ``== True``).
        if calcProtein:
            line[new_column_index["pyhgvs_Protein"]] = '{0}'.format(str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()