class AlgorithmInvocation(object):
    ''' transaction management for algorithm invocation '''

    def __init__(self, script=None, parameters=None, commit=False, gusConfigFile=None):
        self._database = None
        self._algorithm_invocation_id = None
        self.__get_db_handle(gusConfigFile)
        if script is not None:
            self.insertAlgorithmInvocation(script, parameters, commit)

    def __get_db_handle(self, gusConfigFile):
        ''' create database connection '''
        self._database = Database(gusConfigFile)
        self._database.connect()

    def insertAlgorithmInvocation(self, script, parameters, commit):
        ''' create the entry for the algorithm invocation '''
        sql = """INSERT INTO AlgorithmInvocation
                 (script_name, script_parameters, commit_mode)
                 VALUES (%s, %s, %s) RETURNING algorithm_invocation_id"""

        with self._database.cursor() as cursor:
            cursor.execute(sql, (script, parameters, commit))
            self._algorithm_invocation_id = cursor.fetchone()[0]
        self._database.commit()

    def getAlgorithmInvocationId(self):
        ''' return algorithm invocation id '''
        return self._algorithm_invocation_id

    def close(self):
        ''' close db connection '''
        self._database.close()
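# A minimal usage sketch for AlgorithmInvocation, mirroring how the loaders
# below register themselves; the script name and parameters are illustrative:

def example_register_invocation():
    ''' hypothetical usage sketch: register a run, capture the generated id,
    and release the registration connection before the long-running load '''
    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': 1}),
                                        commit=False)
    algInvocId = algInvocation.getAlgorithmInvocationId()
    algInvocation.close()
    return algInvocId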
def load_annotation(chromosome):
    ''' parse over a JSON file; extract position, frequencies, ids, and
    ADSP-ranked most severe consequence; bulk load using COPY '''

    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = args.resumeAfter is None
    if not resume:
        warning("--resumeAfter flag specified; skipping until",
                args.resumeAfter, file=lfh, flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp, refSnpId, file=lfh, flush=True)
                            warning("Resuming after:", args.resumeAfter,
                                    "- SKIPPED", skipCount, "lines.",
                                    file=lfh, flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join((chrom, xstr(position), refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate variant_allele
                            nRef, nAlt = entry.normalize_alleles(refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                                else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(nAlt, vepParser)

                            valueStr = '#'.join(('chr' + xstr(chrom),
                                                 xstr(position),
                                                 xstr(vepParser.get('is_multi_allelic')),
                                                 binIndex,
                                                 refSnpId,
                                                 metaseqId,
                                                 json2str(alleleFreq),
                                                 json2str(msConseq),
                                                 json2str(get_adsp_ranked_allele_consequences(nAlt, vepParser)),
                                                 json.dumps(vepResult),
                                                 algInvocId))

                            copyObj.write(valueStr + '\n')

                            if variantCount % args.logAfter == 0 \
                               and variantCount % args.commitAfter != 0:
                                warning("PARSED", variantCount, file=lfh, flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in ' + str(tendw - tstart)
                                        + '; ' + str(copyObj.tell())
                                        + ' bytes; transferring to database',
                                        file=lfh, flush=True)
                                copyObj.seek(0)
                                cursor.copy_from(copyObj, 'variant', sep='#',
                                                 null="NULL", columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message
                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message, "; up to = ", refSnpId,
                                            file=lfh, flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' + str(tend - tendw),
                                        file=lfh, flush=True)
                                warning('        Total time: ' + str(tend - tstart),
                                        file=lfh, flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant", refSnpId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
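# The loader above serializes rows into an in-memory StringIO and flushes to
# PostgreSQL in batches via COPY rather than issuing row-by-row INSERTs. A
# standalone sketch of the same pattern (psycopg2; the DSN, table, and column
# names are placeholders, not part of this codebase):

def example_buffered_copy():
    ''' minimal sketch of the buffered COPY pattern, under assumed names '''
    import io
    import psycopg2

    conn = psycopg2.connect("dbname=test")  # placeholder DSN
    buf = io.StringIO()
    with conn.cursor() as cur:
        for i in range(100000):
            buf.write('#'.join((str(i), 'label' + str(i))) + '\n')
            if (i + 1) % 10000 == 0:  # flush every N rows (cf. args.commitAfter)
                buf.seek(0)
                cur.copy_from(buf, 'my_table', sep='#', null="NULL",
                              columns=('id', 'label'))
                conn.commit()
                buf = io.StringIO()  # reset the buffer
        buf.seek(0)  # trailing partial batch
        cur.copy_from(buf, 'my_table', sep='#', null="NULL",
                      columns=('id', 'label'))
        conn.commit()
    conn.close()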
def load_annotation():
    ''' extract basic SNP information from VCF line; check against
    AnnotatedDB; if missing, load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    variantCount = 0

    with database.cursor() as cursor:
        with open(args.vcfFile, 'r') as fh:
            with open(args.logFileName, 'w') as lfh:
                warning("Parsing", args.vcfFile, file=lfh, flush=True)
                for line in fh:
                    lineCount = lineCount + 1
                    vepResult = {}  # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt')  # assuming not multiallelic

                    if refAllele == '0':  # happens sometimes
                        refAllele = '?'
                    if altAllele == '0':
                        altAllele = '?'

                    # truncatedRef = truncate(refAllele, 20)
                    # truncatedAlt = truncate(altAllele, 20)

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    # metaseqId = ':'.join((chrom, xstr(position), truncatedRef, truncatedAlt))
                    isMultiAllelic = False  # assuming b/c of how the VCF files were generated

                    metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))

                    try:
                        if variant_is_missing(database, 'chr' + chrom, metaseqId):
                            warning(metaseqId, "- MISSING -- LOADING", file=lfh, flush=True)
                            variantCount = variantCount + 1

                            positionEnd = entry.infer_variant_end_location(altAllele)
                            binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                            cursor.execute(INSERT_SQL,
                                           ('chr' + chrom, position, isMultiAllelic,
                                            binIndex, metaseqId,
                                            json.dumps(vepResult), algInvocId))

                            if args.commit:
                                database.commit()
                            else:
                                database.rollback()

                        if lineCount % 50 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants", file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", metaseqId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        print("FAIL", file=sys.stdout)
                        raise

                warning("DONE - Loaded", variantCount, "missing variants", file=lfh, flush=True)

    database.close()
    indexer.close()
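# variant_is_missing is referenced here and in the per-chromosome loader
# below but defined elsewhere. A plausible minimal sketch, assuming the
# Variant_chrN table convention seen in update_variant_records and that the
# identifier may be either a metaseq or refSNP ID (both assumptions):

def variant_is_missing(database, chrLabel, variantId):
    ''' hypothetical helper: True when no row in the per-chromosome
    variant table matches the given identifier '''
    sql = ("SELECT 1 FROM Variant_" + chrLabel
           + " WHERE metaseq_id = %s OR ref_snp_id = %s LIMIT 1")
    with database.cursor() as cursor:
        cursor.execute(sql, (variantId, variantId))
        return cursor.fetchone() is None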
def load_annotation(chromosome):
    ''' extract basic SNP information from VCF line; check against
    AnnotatedDB; if missing, load '''

    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
        else '00-All.' + xstr(chromosome) + '.vcf.gz'
    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)

    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)  # put file in swap
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = {}  # will just be the input string, so it matches the annotated variants

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are json formatting issues w/the input str
                    # so replace w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()
                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    if '.' not in altAllele:  # sanity check: all missing variants lack alt alleles
                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                        continue

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom, refSnpId):
                            warning(refSnpId, "- MISSING -- LOADING", file=lfh, flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join((chrom, xstr(position), refAllele, alt))

                                positionEnd = entry.infer_variant_end_location(alt)
                                binIndex = indexer.find_bin_index(chrom, position, positionEnd)

                                cursor.execute(INSERT_SQL,
                                               ('chr' + chrom, xstr(position), isMultiAllelic,
                                                binIndex, refSnpId, metaseqId,
                                                json.dumps(vepResult), algInvocId))

                        if args.commit:
                            database.commit()
                        else:
                            database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.", file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants", file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", refSnpId, file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

            warning("DONE - Loaded", variantCount, "missing variants", file=lfh, flush=True)

    database.close()
    indexer.close()
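# Both gzipped loaders memory-map the compressed file and layer GzipFile over
# the map, so decompression streams from the page cache rather than buffered
# read() calls. The pattern in isolation (the file name and per-line handling
# are placeholders; mmap.PROT_READ is POSIX-only):

def example_mmap_gzip(fname='chr1.json.gz'):
    ''' minimal sketch of the mmap + gzip streaming read used above '''
    import gzip
    import mmap

    with open(fname, 'rb') as fhandle:
        mapped = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
        with gzip.GzipFile(mode='r', fileobj=mapped) as gfh:
            for line in gfh:  # yields decompressed lines (bytes)
                pass  # per-line processing goes here
        mapped.close()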
def update_variant_records_from_vcf():
    ''' lookup and update variant records from a VCF file; assuming the load
    by file was called by a plugin, this variant has already been verified to
    be new to the resource, so there is no need to check alternative metaseq
    IDs '''

    cupdater = CADDUpdater(args.logFile, args.databaseDir)

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    with database.cursor() as updateCursor, \
            open(args.vcfFile, 'r') as fh:
        try:
            for line in fh:
                if line.startswith("#"):
                    continue

                lineCount = lineCount + 1
                entry = VcfEntryParser(line.rstrip())

                refAllele = entry.get('ref')
                altAllele = entry.get('alt')
                chrom = xstr(entry.get('chrom'))
                if chrom == 'MT':
                    chrom = 'M'
                position = int(entry.get('pos'))

                metaseqId = ':'.join((chrom, xstr(position), refAllele, altAllele))
                record = {"metaseq_id": metaseqId}  # mimic "record"

                if len(refAllele) > 1 or len(altAllele) > 1:  # only doing SNVs
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if lineCount % args.commitAfter == 0:
                    warning("Processed:", lineCount,
                            "- SNVs:", cupdater.get_update_count(SNV),
                            "- INDELS:", cupdater.get_update_count(INDEL),
                            "- Not Matched:", cupdater.get_not_matched_count(),
                            file=cupdater.lfh(), flush=True)
                    updateCursor.execute(cupdater.sql_buffer_str())
                    if args.commit:
                        database.commit()
                    else:
                        database.rollback()
                    cupdater.clear_update_sql()

            if cupdater.buffered_variant_count() > 0:  # trailing
                updateCursor.execute(cupdater.sql_buffer_str())
                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:", cupdater.get_update_count(SNV),
                    "- Updated INDELS:", cupdater.get_update_count(INDEL),
                    "- Not Matched:", cupdater.get_not_matched_count(),
                    file=cupdater.lfh(), flush=True)
            cupdater.close_lfh()

        except Exception as e:
            warning(e, entry, file=cupdater.lfh(), flush=True)
            database.rollback()
            database.close()
            print("FAIL", file=sys.stdout)
            raise

    database.close()
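# CADDUpdater and update_variant_record are defined elsewhere; from their use
# here and below, the updater buffers UPDATE statements and exposes the batch
# for a single execute(). An illustrative stand-in for that contract (the
# statement shape, table, and column names are assumptions; a real
# implementation should parameterize values rather than inline them):

class UpdateBuffer(object):
    ''' illustrative stand-in for the CADDUpdater SQL-buffering contract '''

    def __init__(self):
        self._statements = []

    def add(self, chrLabel, metaseqId, scores):
        # hypothetical UPDATE shape; see caveat above about inlined values
        self._statements.append(
            "UPDATE Variant_" + chrLabel + " SET cadd_scores = '"
            + json.dumps(scores) + "' WHERE metaseq_id = '" + metaseqId + "'")

    def buffered_variant_count(self):
        return len(self._statements)

    def sql_buffer_str(self):
        return ';'.join(self._statements)  # executed as one batch

    def clear_update_sql(self):
        self._statements = []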
def update_variant_records(chromosome):
    chrLabel = 'chr' + xstr(chromosome)
    logFileName = path.join(args.logFilePath, chrLabel + '.log')
    cupdater = CADDUpdater(logFileName, args.databaseDir)
    cupdater.setChrm(chrLabel)

    selectSQL = "SELECT metaseq_id, cadd_scores FROM Variant_" + chrLabel

    database = Database(args.gusConfigFile)
    database.connect()

    lineCount = 0
    updateCount = 0
    updateIndelCount = 0
    skipCount = 0

    with database.cursor("RealDictCursor") as selectCursor, \
            database.cursor() as updateCursor:
        try:
            warning("Fetching", chrLabel, "variants", file=cupdater.lfh(), flush=True)
            selectCursor.execute(selectSQL)
            warning("DONE - Fetching", file=cupdater.lfh(), flush=True)
            for record in selectCursor:
                if args.debug and args.veryVerbose:
                    warning(record, file=cupdater.lfh(), flush=True)

                if record['cadd_scores'] is not None:
                    if args.debug and args.veryVerbose:
                        warning("Skipping", record['metaseq_id'],
                                file=cupdater.lfh(), flush=True)
                    skipCount = skipCount + 1
                    continue

                lineCount = lineCount + 1

                metaseqId = record['metaseq_id']
                chrm, position, refAllele, altAllele = metaseqId.split(':')

                if len(refAllele) > 1 or len(altAllele) > 1:  # only doing SNVs
                    update_variant_record(record, cupdater, INDEL)
                else:
                    update_variant_record(record, cupdater, SNV)

                if cupdater.get_total_update_count() % args.commitAfter == 0 \
                   and cupdater.buffered_variant_count() > 0:
                    if args.commit:
                        if args.debug:
                            warning("Starting Update", file=cupdater.lfh(), flush=True)
                        updateCursor.execute(cupdater.sql_buffer_str())
                        if args.debug:
                            warning("Done", file=cupdater.lfh(), flush=True)
                        cupdater.clear_update_sql()
                        database.commit()
                    else:
                        database.rollback()

                    warning(metaseqId, "- Processed:", lineCount,
                            "- SNVs:", cupdater.get_update_count(SNV),
                            "- INDELS:", cupdater.get_update_count(INDEL),
                            "- Skipped:", skipCount,
                            "- Not Matched:", cupdater.get_not_matched_count(),
                            file=cupdater.lfh(), flush=True)

            if cupdater.buffered_variant_count() > 0:  # trailing
                updateCursor.execute(cupdater.sql_buffer_str())
                if args.commit:
                    database.commit()
                else:
                    database.rollback()

            warning("DONE - Updated SNVs:", cupdater.get_update_count(SNV),
                    "- Updated INDELS:", cupdater.get_update_count(INDEL),
                    "- Skipped", skipCount,
                    "- Not Matched:", cupdater.get_not_matched_count(),
                    file=cupdater.lfh(), flush=True)
            cupdater.close_lfh()

        except Exception as e:
            warning(e, file=cupdater.lfh(), flush=True)
            if args.commit:
                database.commit()
            else:
                database.rollback()
            database.close()
            raise

    database.close()
                    help="full path to file containing mapping of chr names to length; tab-delimited, no header",
                    required=True)
parser.add_argument('--commit', action='store_true',
                    help="run in commit mode", required=False)
parser.add_argument('--gusConfigFile',
                    help="full path to gus config file, else assumes $GUS_HOME/config/gus.config")

args = parser.parse_args()

increments = [-1, 64000000, 32000000, 16000000, 8000000, 4000000, 2000000,
              1000000, 500000, 250000, 125000, 62500, 31250, 15625]
binCount = 0
numLevels = len(increments)

chrMap = read_chr_map()

insertSql = "INSERT INTO BinIndexRef (chromosome, level, global_bin, global_bin_path, location) VALUES (%s, %s, %s, %s, %s)"

database = Database(args.gusConfigFile)
database.connect()
cursor = database.cursor()

load_bins()

database.close()
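# read_chr_map and load_bins are defined elsewhere in this script. Given the
# tab-delimited, headerless chr-name-to-length file described in the help
# text, a plausible sketch of the map reader (the argument name chrMapFile
# and the column order are assumptions):

def read_chr_map():
    ''' hypothetical reader for the tab-delimited chr -> length mapping '''
    chrMap = {}
    with open(args.chrMapFile, 'r') as fh:  # argument name is an assumption
        for line in fh:
            chrm, length = line.rstrip().split('\t')[:2]
            chrMap[chrm] = int(length)
    return chrMap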