def evaluate_minor_allele(vepParser, lfh):
    ''' some dbSNP variants are reported in the VCF with single mutations,
    but are actually multiallelic (have an additional minor allele) and are
    reported as such on the dbSNP website; to catch these, evaluate the
    1000Genomes minor allele frequency '''

    refSnpId = vepParser.get('ref_snp_id')
    refAllele = vepParser.get('ref_allele')
    altAlleles = vepParser.get('alt_allele')

    frequencies = vepParser.get_frequencies(refSnpId)
    if frequencies is None:
        return None

    if 'minor_allele_freq' not in frequencies:  # nothing to change
        return frequencies

    minorAlleleFreq = frequencies['minor_allele_freq']
    minorAllele = frequencies['minor_allele']

    match = False
    aFreq = get_allele_frequencies(minorAllele, frequencies)

    # minor allele already in frequency list; update
    if aFreq is not None:
        match = True
        if '1000Genomes' in aFreq:
            frequencies['values'][minorAllele]['1000Genomes'].append(
                {'gmaf': minorAlleleFreq})
        else:
            frequencies['values'][minorAllele]['1000Genomes'] = \
                [{'gmaf': minorAlleleFreq}]

    # the minor allele is one of the alt alleles in the VCF, but not in the
    # frequency list; add
    elif minorAllele in altAlleles:
        match = True
        frequencies['values'][minorAllele]['1000Genomes'] = \
            [{'gmaf': minorAlleleFreq}]

    # then check to see if it is a normalized allele (INDEL/DIV/INS/etc.)
    else:
        for alt in altAlleles:
            nRef, nAlt = VcfEntryParser(None).normalize_alleles(
                refAllele, alt, snvDivMinus=True)

            if minorAllele == nAlt:
                match = True
                aFreq = get_allele_frequencies(nAlt, frequencies)
                if aFreq is not None:
                    # this should never happen, b/c it should have been
                    # caught above, but...
                    if '1000Genomes' in aFreq:
                        frequencies['values'][minorAllele]['1000Genomes'].append(
                            {'gmaf': minorAlleleFreq})
                    else:
                        frequencies['values'][minorAllele]['1000Genomes'] = \
                            [{'gmaf': minorAlleleFreq}]
                else:
                    frequencies['values'][minorAllele]['1000Genomes'] = \
                        [{'gmaf': minorAlleleFreq}]
                break  # match found; no need to iterate over the rest

            if minorAllele == nRef:
                # don't add it in; can/should be inferred from the other
                # allele frequencies
                match = True
                warning("Matched minor allele", minorAllele,
                        "to normalized reference for variant -", refSnpId,
                        refAllele + '/' + '/'.join(altAlleles),
                        "- IGNORED", file=lfh, flush=True)
                break

    if not match:
        warning("Unable to match minor allele", minorAllele,
                "to variant -", refSnpId, "- alleles:",
                refAllele + '/' + '/'.join(altAlleles),
                "Assuming VCF/dbSNP record mismatch.", file=lfh, flush=True)
        frequencies['values'][minorAllele]['1000Genomes'] = \
            [{'gmaf': minorAlleleFreq}]
        # also add it to the alternative alleles, so it will be loaded as
        # its own variant; NB: list.append() returns None, so append first,
        # then set the updated list (setting the return value of append()
        # would store None)
        altAlleles.append(minorAllele)
        vepParser.set('alt_allele', altAlleles)

    return frequencies
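# evaluate_minor_allele() depends on VcfEntryParser.normalize_alleles(...,
# snvDivMinus=True) to reconcile VCF-style REF/ALT pairs with dbSNP-style
# minor alleles. The helper below is a minimal, self-contained sketch of
# that kind of normalization -- it is NOT the project's implementation --
# illustrating why a dbSNP minor allele such as '-' (a deletion) can only
# match a VCF alt like 'A' against ref 'AT' after the shared leading base
# is trimmed.

def normalize_alleles_sketch(ref, alt, snvDivMinus=True):
    """Trim the longest common prefix shared by ref and alt; when
    snvDivMinus is True, represent a fully trimmed allele as '-'."""
    prefix = 0
    while (prefix < len(ref) and prefix < len(alt)
           and ref[prefix] == alt[prefix]):
        prefix += 1
    nRef, nAlt = ref[prefix:], alt[prefix:]
    if snvDivMinus:
        nRef = nRef if nRef else '-'
        nAlt = nAlt if nAlt else '-'
    return nRef, nAlt

# e.g., a VCF deletion 'AT' -> 'A' normalizes to ('T', '-'), matching a
# dbSNP minor_allele of '-' for the same event
assert normalize_alleles_sketch('AT', 'A') == ('T', '-')
assert normalize_alleles_sketch('A', 'AT') == ('-', 'T')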
def load_annotation(chromosome):
    ''' parse over a JSON file; extract position, frequencies, ids, and
    ADSP-ranked most severe consequence; bulk load using COPY '''

    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)
    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = args.resumeAfter is None
    if not resume:
        warning("--resumeAfter flag specified; skipping lines until",
                args.resumeAfter, file=lfh, flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp, refSnpId, file=lfh, flush=True)
                            warning("Resuming after:", args.resumeAfter,
                                    "- SKIPPED", skipCount, "lines.",
                                    file=lfh, flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join((chrom, xstr(position),
                                                  refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(chrom, position,
                                                              positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate
                            # the variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                                else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(nAlt, vepParser)

                            valueStr = '#'.join(
                                ('chr' + xstr(chrom),
                                 xstr(position),
                                 xstr(vepParser.get('is_multi_allelic')),
                                 binIndex,
                                 refSnpId,
                                 metaseqId,
                                 json2str(alleleFreq),
                                 json2str(msConseq),
                                 json2str(get_adsp_ranked_allele_consequences(
                                     nAlt, vepParser)),
                                 json.dumps(vepResult),
                                 algInvocId))

                            copyObj.write(valueStr + '\n')

                            if variantCount % args.logAfter == 0 \
                               and variantCount % args.commitAfter != 0:
                                warning("PARSED", variantCount,
                                        file=lfh, flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' +
                                        str(tendw - tstart) + '; ' +
                                        str(copyObj.tell()) +
                                        ' bytes; transferring to database',
                                        file=lfh, flush=True)

                                copyObj.seek(0)
                                cursor.copy_from(copyObj, 'variant', sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message
                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message, "; up to = ", refSnpId,
                                            file=lfh, flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' +
                                        str(tend - tendw), file=lfh, flush=True)
                                warning('        Total time: ' +
                                        str(tend - tstart), file=lfh, flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
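# The loader above stages '#'-delimited rows in an io.StringIO buffer and
# flushes them with a single COPY every args.commitAfter variants. A minimal
# sketch of that buffered-COPY pattern, assuming psycopg2 (whose
# cursor.copy_from() accepts any file-like object); the table name, columns,
# and delimiter mirror the loader, but this is an illustration, not the
# project's code:
import io

def bulk_copy_sketch(cursor, rows, table='variant',
                     columns=('chromosome', 'position')):
    """Stage delimited records in memory, then stream them to the database
    in one COPY -- far faster than issuing row-by-row INSERTs."""
    buf = io.StringIO()
    for row in rows:
        # one '#'-delimited line per record; "NULL" marks missing values
        buf.write('#'.join('NULL' if v is None else str(v) for v in row) + '\n')
    buf.seek(0)  # rewind so copy_from() reads from the start of the buffer
    cursor.copy_from(buf, table, sep='#', null='NULL', columns=columns)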
def load_annotation():
    ''' extract basic SNP information from a VCF line; check against
    AnnotatedDB; if missing, load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    variantCount = 0

    with database.cursor() as cursor:
        with open(args.vcfFile, 'r') as fh:
            with open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    # will just be the input string, so it matches the
                    # annotated variants
                    vepResult = {}

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assuming not multiallelic

                    if refAllele == '0':  # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    # assuming, b/c of how the VCF files were generated
                    isMultiAllelic = False

                    # metaseqId = ':'.join((chrom, xstr(position),
                    #                       truncatedRef, truncatedAlt))
                    metaseqId = ':'.join((chrom, xstr(position),
                                          refAllele, altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            variantCount = variantCount + 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position,
                                                              positionEnd)

                            cursor.execute(INSERT_SQL,
                                           ('chr' + chrom, position,
                                            isMultiAllelic, binIndex,
                                            metaseqId, json.dumps(vepResult),
                                            algInvocId))

                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", metaseqId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants",
                        file=lfh, flush=True)

    database.close()
    indexer.close()
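# variant_is_missing() is defined elsewhere in the project; a plausible
# minimal implementation, assuming the Database wrapper's cursor context
# manager and a 'variant' table keyed on (chromosome, metaseq_id) -- the
# SQL and schema here are assumptions for illustration only:
LOOKUP_SQL = """
SELECT EXISTS(
    SELECT 1 FROM variant
    WHERE chromosome = %s AND metaseq_id = %s
)"""

def variant_is_missing_sketch(database, chromosome, metaseqId):
    """Return True when no record matches the chromosome/metaseq ID pair."""
    with database.cursor() as cursor:
        cursor.execute(LOOKUP_SQL, (chromosome, metaseqId))
        return not cursor.fetchone()[0]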
def load_annotation(chromosome):
    ''' extract basic SNP information from a VCF line; check against
    AnnotatedDB; if missing, load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
        else '00-All.' + xstr(chromosome) + '.vcf.gz'
    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)

    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            mappedFile = mmap.mmap(fhandle.fileno(), 0,
                                   prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    # will just be the input string, so it matches the
                    # annotated variants
                    vepResult = {}

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    if '.' not in altAllele:
                        # sanity check: all missing variants lack alt
                        # alleles, so skip any line that has real ones
                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                        continue

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom, refSnpId):
                            warning(refSnpId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            for alt in altAllele:
                                # checking again in case some are multiallelic
                                if alt == '.':
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join((chrom, xstr(position),
                                                      refAllele, alt))

                                positionEnd = entry.infer_variant_end_location(alt)
                                binIndex = indexer.find_bin_index(chrom, position,
                                                                  positionEnd)

                                cursor.execute(INSERT_SQL,
                                               ('chr' + chrom, xstr(position),
                                                isMultiAllelic, binIndex,
                                                refSnpId, metaseqId,
                                                json.dumps(vepResult),
                                                algInvocId))

                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

            warning("DONE - Loaded", variantCount, "missing variants",
                    file=lfh, flush=True)

    database.close()
    indexer.close()
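# Both gzipped-VCF loaders above memory-map the compressed file and let
# GzipFile decompress it as a stream, so neither the compressed nor the
# decompressed data has to fit in memory at once. A minimal self-contained
# sketch of the same pattern (the file name is illustrative; the .decode()
# reflects that GzipFile yields bytes under Python 3):
import gzip
import mmap

def stream_gzipped_lines(fname):
    """Yield decoded lines from a gzipped file via a read-only memory map."""
    with open(fname, 'rb') as fhandle:
        mapped = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
        try:
            with gzip.GzipFile(mode='r', fileobj=mapped) as gfh:
                for line in gfh:
                    yield line.decode('utf-8')
        finally:
            mapped.close()

# usage:
# for line in stream_gzipped_lines('00-All.1.vcf.gz'):
#     ...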
def update_variant_records_from_vcf():
    ''' look up and update variant records from a VCF file; assuming the
    load-by-file was called by a plugin, so each variant has already been
    verified to be new to the resource; no need to check alternative
    metaseq IDs '''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    with database.cursor() as updateCursor, \
            open(args.vcfFile, 'r') as fh:
        try:
            for line in fh:
                if line.startswith("#"):
                    continue

                lineCount = lineCount + 1
                entry = VcfEntryParser(line.rstrip())

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')
                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'
                position = int(entry.get('pos'))

                metaseqId = ':'.join((chrom, xstr(position), refAllele,
                                      altAllele))

                record = {"metaseq_id": metaseqId}  # mimic "record"

                if len(refAllele) > 1 or len(altAllele) > 1:  # only doing SNVs
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if lineCount % args.commitAfter == 0:
                    warning("Processed:", lineCount,
                            "- SNVs:", cupdater.get_update_count(SNV),
                            "- INDELS:", cupdater.get_update_count(INDEL),
                            "- Not Matched:", cupdater.get_not_matched_count(),
                            file=cupdater.lfh(), flush=True)
                    updateCursor.execute(cupdater.sql_buffer_str())

                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()

                    cupdater.clear_update_sql()

            if cupdater.buffered_variant_count() > 0:  # trailing
                updateCursor.execute(cupdater.sql_buffer_str())
                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:", cupdater.get_update_count(SNV),
                    "- Updated INDELS:", cupdater.get_update_count(INDEL),
                    "- Not Matched:", cupdater.get_not_matched_count(),
                    file=cupdater.lfh(), flush=True)
            cupdater.close_lfh()

        except Exception as e:
            warning(e, entry, file=cupdater.lfh(), flush=True)
            database.rollback()
            database.close()
            print("FAIL", file=sys.stdout)
            raise
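# CADDUpdater is defined elsewhere; the buffering contract the loop above
# relies on (accumulate UPDATE statements, hand back one SQL string for a
# single round trip, clear after each flush) can be sketched as follows.
# This is a hypothetical stand-in, not the project's class:
class UpdateBufferSketch:
    """Accumulate UPDATE statements and flush them in one execute() call."""

    def __init__(self):
        self._statements = []

    def add(self, sql):
        self._statements.append(sql)

    def buffered_variant_count(self):
        return len(self._statements)

    def sql_buffer_str(self):
        # psycopg2 will run multiple ';'-separated statements in one call
        return ';\n'.join(self._statements)

    def clear_update_sql(self):
        self._statements = []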
def load_annotation():
    ''' parse over a JSON file; extract position, frequencies, ids, and
    ADSP-ranked most severe consequence; bulk load using COPY '''

    fname = args.inputFile
    lineCount = 0
    variantCount = 0
    skipCount = 0

    warning("Parsing variants from:", fname)  # should print to plugin log

    with database.cursor() as cursor, database.cursor("RealDictCursor") as dcursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fh:
            with open(args.logFile, 'w') as lfh:
                warning("Parsing variants from:", fname, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there may be json formatting issues w/the input str,
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    if lineCount == 1 or variantCount % 5000 == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assumes no multiallelic variants

                    if refAllele == '0':  # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    try:
                        # metaseqId = ':'.join((chrom, xstr(position),
                        #                       truncatedRef, truncatedAlt))
                        metaseqId = ':'.join((chrom, xstr(position),
                                              refAllele, altAllele))
                        if duplicate(metaseqId, dcursor):
                            warning("SKIPPING:", metaseqId, "- already loaded.",
                                    file=lfh, flush=True)
                            continue

                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies()
                        vepParser.adsp_rank_and_sort_consequences()

                        variantCount = variantCount + 1
                        positionEnd = entry.infer_variant_end_location(altAllele)
                        binIndex = indexer.find_bin_index(chrom, position,
                                                          positionEnd)

                        # NOTE: VEP uses normalized alleles to indicate the
                        # variant_allele
                        nRef, nAlt = entry.normalize_alleles(refAllele, altAllele,
                                                             snvDivMinus=True)
                        alleleFreq = None if frequencies is None \
                            else get_allele_frequencies(nAlt, frequencies)

                        msConseq = get_most_severe_consequence(nAlt)

                        valueStr = '#'.join(
                            ('chr' + xstr(chrom),
                             xstr(position),
                             binIndex,
                             metaseqId,
                             json2str(alleleFreq),
                             json2str(msConseq),
                             json2str(get_adsp_ranked_allele_consequences(nAlt)),
                             json.dumps(vepResult),
                             algInvocId))

                        copyObj.write(valueStr + '\n')

                        if variantCount % 5000 == 0:
                            warning("FOUND", variantCount, "new variants",
                                    file=lfh, flush=True)
                            tendw = datetime.now()
                            warning('Copy object prepared in ' +
                                    str(tendw - tstart) + '; ' +
                                    str(copyObj.tell()) +
                                    ' bytes; transferring to database',
                                    file=lfh, flush=True)

                            copyObj.seek(0)
                            cursor.copy_from(copyObj, 'variant', sep='#',
                                             null="NULL", columns=VARIANT_COLUMNS)

                            message = '{:,}'.format(variantCount)
                            if args.commit:
                                database.commit()
                                message = "COMMITTED " + message
                            else:
                                database.rollback()
                                message = "PARSED " + message + " -- rolling back"

                            warning(message, "; up to = ", metaseqId,
                                    file=lfh, flush=True)

                            tend = datetime.now()
                            warning('Database copy time: ' + str(tend - tendw),
                                    file=lfh, flush=True)
                            warning('        Total time: ' + str(tend - tstart),
                                    file=lfh, flush=True)

                            copyObj = io.StringIO()  # reset io string

                    except Exception:
                        warning("ERROR parsing variant on line", line, ' - ',
                                metaseqId, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                # final commit / leftovers
                copyObj.seek(0)
                cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                                 columns=VARIANT_COLUMNS)
                message = '{:,}'.format(variantCount)

                if args.commit:
                    database.commit()
                    message = "DONE - COMMITTED " + message
                else:
                    database.rollback()
                    message = "DONE - PARSED " + message + " -- rolling back"

                warning(message, file=lfh, flush=True)
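# duplicate() is not shown in this file; a plausible minimal implementation,
# given the RealDictCursor passed in above (psycopg2's dictionary-style
# cursor, whose rows are keyed by column name). The SQL and schema are
# assumptions for illustration only:
DUPLICATE_SQL = "SELECT metaseq_id FROM variant WHERE metaseq_id = %s LIMIT 1"

def duplicate_sketch(metaseqId, dcursor):
    """Return True when the metaseq ID is already present in the table."""
    dcursor.execute(DUPLICATE_SQL, (metaseqId,))
    row = dcursor.fetchone()  # a dict like {'metaseq_id': ...}, or None
    return row is not None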