def load_annotation(chromosome):
    '''
    parse over a JSON file; extract position, frequencies, ids,
    and ADSP-ranked most severe consequence; bulk load using COPY
    '''
    lfn = xstr(chromosome) + '.log'
    lfh = open(lfn, 'w')

    algInvocation = AlgorithmInvocation('parallel_load_dbsnp_vep_result.py',
                                        json.dumps({'chromosome': chromosome}),
                                        args.commit)
    algInvocId = xstr(algInvocation.getAlgorithmInvocationId())
    algInvocation.close()
    warning("Algorithm Invocation ID", algInvocId, file=lfh, flush=True)

    vepParser = VepJsonParser(args.rankingFile, verbose=True)
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = 'chr' + xstr(chromosome) + '.json.gz'
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0
    skipCount = 0

    resume = args.resumeAfter is None
    if not resume:
        warning("--resumeAfter flag specified; skipping until",
                args.resumeAfter, file=lfh, flush=True)

    previousSnp = None
    with database.cursor() as cursor:
        copyObj = io.StringIO()
        with open(fname, 'r') as fhandle:
            # memory-map the file (read-only) so gzip can stream it efficiently
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    vepResult = json.loads(line.rstrip())

                    vepParser.set_annotation(copy.deepcopy(vepResult))
                    vepInputStr = vepParser.get('input')
                    entry = VcfEntryParser(vepInputStr)

                    # there are JSON formatting issues w/the input string,
                    # so replace it w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    if not resume:
                        if previousSnp == args.resumeAfter:
                            warning(previousSnp, refSnpId, file=lfh, flush=True)
                            warning("Resuming after:", args.resumeAfter,
                                    "- SKIPPED", skipCount, "lines.",
                                    file=lfh, flush=True)
                            resume = True
                        else:
                            previousSnp = refSnpId
                            skipCount = skipCount + 1
                            continue

                    if lineCount == 1 or variantCount % args.commitAfter == 0:
                        warning('Processing new copy object', file=lfh, flush=True)
                        tstart = datetime.now()

                    vepParser.set('ref_snp_id', refSnpId)

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    chrom = xstr(entry.get('chrom'))
                    if chrom == 'MT':
                        chrom = 'M'
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        vepParser.set('is_multi_allelic', isMultiAllelic)
                        vepParser.set('ref_allele', refAllele)
                        vepParser.set('alt_allele', altAllele)

                        frequencies = get_frequencies(vepParser, lfh)
                        vepParser.adsp_rank_and_sort_consequences()

                        # for each allele
                        for alt in altAllele:
                            variantCount = variantCount + 1
                            metaseqId = ':'.join((chrom, xstr(position),
                                                  refAllele, alt))
                            positionEnd = entry.infer_variant_end_location(alt)

                            binIndex = indexer.find_bin_index(chrom, position,
                                                              positionEnd)

                            # NOTE: VEP uses normalized alleles to indicate
                            # the variant_allele
                            nRef, nAlt = entry.normalize_alleles(
                                refAllele, alt, snvDivMinus=True)
                            alleleFreq = None if frequencies is None \
                                else get_allele_frequencies(nAlt, frequencies)

                            msConseq = get_most_severe_consequence(nAlt, vepParser)

                            valueStr = '#'.join((
                                'chr' + xstr(chrom),
                                xstr(position),
                                xstr(vepParser.get('is_multi_allelic')),
                                binIndex,
                                refSnpId,
                                metaseqId,
                                json2str(alleleFreq),
                                json2str(msConseq),
                                json2str(get_adsp_ranked_allele_consequences(
                                    nAlt, vepParser)),
                                json.dumps(vepResult),
                                algInvocId))

                            copyObj.write(valueStr + '\n')

                            if variantCount % args.logAfter == 0 \
                               and variantCount % args.commitAfter != 0:
                                warning("PARSED", variantCount,
                                        file=lfh, flush=True)

                            if variantCount % args.commitAfter == 0:
                                tendw = datetime.now()
                                warning('Copy object prepared in '
                                        + str(tendw - tstart) + '; '
                                        + str(copyObj.tell())
                                        + ' bytes; transferring to database',
                                        file=lfh, flush=True)
                                copyObj.seek(0)
                                cursor.copy_from(copyObj, 'variant', sep='#',
                                                 null="NULL",
                                                 columns=VARIANT_COLUMNS)

                                message = '{:,}'.format(variantCount)
                                if args.commit:
                                    database.commit()
                                    message = "COMMITTED " + message
                                else:
                                    database.rollback()
                                    message = "PARSED " + message + " -- rolling back"

                                if variantCount % args.logAfter == 0:
                                    warning(message, "; up to = ", refSnpId,
                                            file=lfh, flush=True)

                                tend = datetime.now()
                                warning('Database copy time: ' + str(tend - tendw),
                                        file=lfh, flush=True)
                                warning('        Total time: ' + str(tend - tstart),
                                        file=lfh, flush=True)

                                if args.test:
                                    die("Test complete")

                                copyObj = io.StringIO()  # reset io string

                    except Exception as e:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        warning(str(e), file=lfh, flush=True)
                        raise

            # final commit / leftovers
            copyObj.seek(0)
            cursor.copy_from(copyObj, 'variant', sep='#', null="NULL",
                             columns=VARIANT_COLUMNS)
            message = '{:,}'.format(variantCount)

            if args.commit:
                database.commit()
                message = "DONE - COMMITTED " + message
            else:
                database.rollback()
                message = "DONE - PARSED " + message + " -- rolling back"

            warning(message, file=lfh, flush=True)

            mappedFile.close()

    database.close()
    indexer.close()
    lfh.close()
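
# The algorithm invocation above records the script name
# 'parallel_load_dbsnp_vep_result.py', suggesting load_annotation is run once
# per chromosome in parallel worker processes. A minimal driver sketch,
# assuming a standard multiprocessing pool; the chromosome list and the pool
# size are illustrative assumptions, not part of this script. Each worker
# opens its own database connection and writes its own <chromosome>.log.
def run_parallel_load():
    from multiprocessing import Pool
    chromosomes = [str(c) for c in range(1, 23)] + ['X', 'Y', 'M']
    with Pool(processes=4) as pool:  # pool size is an assumption
        pool.map(load_annotation, chromosomes)
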
def load_annotation(chromosome):
    '''
    extract basic SNP information from a VCF line; check against
    AnnotatedDB; if missing, load
    '''
    indexer = BinIndex(args.gusConfigFile, verbose=False)

    database = Database(args.gusConfigFile)
    database.connect()

    fname = '00-All.MT.vcf.gz' if chromosome == 'M' \
        else '00-All.' + xstr(chromosome) + '.vcf.gz'
    logFname = path.join(args.logDir, fname + '.log')
    fname = path.join(args.dir, fname)

    lineCount = 0
    variantCount = 0

    warning("Parsing", fname)
    warning("Logging:", logFname)

    with database.cursor() as cursor:
        with open(fname, 'r') as fhandle, open(logFname, 'w') as lfh:
            warning("Parsing", fname, file=lfh, flush=True)

            # memory-map the file (read-only) so gzip can stream it efficiently
            mappedFile = mmap.mmap(fhandle.fileno(), 0, prot=mmap.PROT_READ)
            with gzip.GzipFile(mode='r', fileobj=mappedFile) as gfh:
                for line in gfh:
                    lineCount = lineCount + 1
                    # will just be the input string, so it matches the
                    # annotated variants
                    vepResult = {}

                    if line.startswith('#'):
                        continue

                    entry = VcfEntryParser(line.rstrip())

                    # there are JSON formatting issues w/the input string,
                    # so replace it w/the parsed entry, which is now a dict
                    vepResult['input'] = entry.get_entry()

                    refSnpId = entry.get_refsnp()

                    refAllele = entry.get('ref')
                    altAllele = entry.get('alt').split(',')

                    # sanity check: all missing variants lack alt alleles,
                    # so skip any line that has one
                    if '.' not in altAllele:
                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                        continue

                    chrom = xstr(entry.get('chrom'))
                    position = int(entry.get('pos'))

                    isMultiAllelic = len(altAllele) > 1

                    try:
                        if variant_is_missing(database, 'chr' + chrom, refSnpId):
                            warning(refSnpId, "- MISSING -- LOADING",
                                    file=lfh, flush=True)
                            for alt in altAllele:
                                if alt == '.':  # checking again in case some are multiallelic
                                    alt = '?'
                                variantCount = variantCount + 1
                                metaseqId = ':'.join((chrom, xstr(position),
                                                      refAllele, alt))
                                positionEnd = entry.infer_variant_end_location(alt)
                                binIndex = indexer.find_bin_index(chrom, position,
                                                                  positionEnd)

                                # NOTE: algInvocId is assumed to be set at module
                                # level (see the algorithm-invocation bookkeeping
                                # in the loader above)
                                cursor.execute(INSERT_SQL,
                                               ('chr' + chrom,
                                                xstr(position),
                                                isMultiAllelic,
                                                binIndex,
                                                refSnpId,
                                                metaseqId,
                                                json.dumps(vepResult),
                                                algInvocId))

                                if args.commit:
                                    database.commit()
                                else:
                                    database.rollback()

                        if lineCount % 25000 == 0:
                            warning("Parsed", lineCount, "lines.",
                                    file=lfh, flush=True)
                            warning("Loaded", variantCount, "missing variants",
                                    file=lfh, flush=True)

                    except Exception:
                        warning("ERROR parsing variant", refSnpId,
                                file=lfh, flush=True)
                        warning(lineCount, ":", line, file=lfh, flush=True)
                        raise

            if not args.commit:
                database.rollback()
                warning("DONE -- rolling back")

            mappedFile.close()

            warning("DONE - Loaded", variantCount, "missing variants",
                    file=lfh, flush=True)

    database.close()
    indexer.close()
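
# variant_is_missing is called above but not defined in this section. A
# minimal sketch of the lookup it implies, assuming the 'variant' table has
# 'chromosome' and 'ref_snp_id' columns (inferred from the values written by
# the loaders above; the real helper and its SQL may differ):
def variant_is_missing(database, chromosome, refSnpId):
    ''' return True if no variant with the given refSNP id exists on the chromosome '''
    sql = ("SELECT ref_snp_id FROM variant"
           " WHERE chromosome = %s AND ref_snp_id = %s LIMIT 1")
    with database.cursor() as cursor:
        cursor.execute(sql, (chromosome, refSnpId))
        return cursor.fetchone() is None
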