def test_get_delta_score_acceptor(self):
    """Check get_delta_scores for 10:94077 A>C across chromosome spellings.

    The same variant is scored with the chromosome written as '10' and as
    'chr10', each time against both annotators held by the test case
    (``self.ann`` and ``self.ann_without_prefix``), using the positional
    arguments 500 and 0. All four combinations must yield the same entry.
    """
    expected = ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']
    for chrom in ('10', 'chr10'):
        record = Record(chrom, 94077, 'A', ['C'])
        for annotator in (self.ann, self.ann_without_prefix):
            scores = get_delta_scores(record, annotator, 500, 0)
            self.assertEqual(scores, expected)
def test_get_delta_score_donor(self):
    """Check get_delta_scores for 10:94555 C>T across chromosome spellings.

    The same variant is scored with the chromosome written as '10' and as
    'chr10', each time against both annotators held by the test case
    (``self.ann`` and ``self.ann_without_prefix``), relying on the default
    values of any further get_delta_scores arguments. All four combinations
    must yield the same entry.
    """
    expected = ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']
    for chrom in ('10', 'chr10'):
        record = Record(chrom, 94555, 'C', ['T'])
        for annotator in (self.ann, self.ann_without_prefix):
            scores = get_delta_scores(record, annotator)
            self.assertEqual(scores, expected)
def run_serial(args):
    """Run the annotation serially, one record at a time.

    Opens the VCF named by ``args.I``, declares the ``SpliceAI`` INFO field
    in its header, and writes every record to ``args.O``. A record gets a
    ``SpliceAI`` INFO value only when get_delta_scores returns a non-empty
    result for it; all records are written either way.

    Args:
        args: parsed options; this function reads ``I`` (input path),
            ``O`` (output path), ``R`` and ``A`` (passed to Annotator),
            and ``D`` and ``M`` (passed to get_delta_scores).

    If either file cannot be opened, the error is logged and the process
    exits via ``exit()``.
    """
    try:
        vcf = pysam.VariantFile(args.I)
    except (IOError, ValueError) as e:
        logging.error('{}'.format(e))
        exit()

    # From here on the input handle is open: make sure it (and later the
    # output handle) is closed even when scoring raises or exit() is called.
    try:
        header = vcf.header
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3 variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )

        try:
            output = pysam.VariantFile(args.O, mode='w', header=header)
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))
            exit()

        try:
            ann = Annotator(args.R, args.A)
            for record in vcf:
                scores = get_delta_scores(record, ann, args.D, args.M)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            # Closing finalises the output file even on a mid-run failure.
            output.close()
    finally:
        vcf.close()
def test_get_delta_score_donor(self):
    """Verify get_delta_scores on a variant with a predicted donor."""

    class Record:
        # Minimal stand-in carrying only the four attributes set here.
        chrom = '10'
        pos = 94555
        ref = 'C'
        alts = ['T']

    expected = ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']
    observed = get_delta_scores(Record(), self.ann)
    self.assertEqual(observed, expected)
def test_get_delta_score_acceptor(self):
    """Verify get_delta_scores on a variant with a predicted acceptor."""

    class Record:
        # Minimal stand-in carrying only the four attributes set here.
        chrom = '10'
        pos = 94077
        ref = 'A'
        alts = ['C']

    expected = ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']
    observed = get_delta_scores(Record(), self.ann)
    self.assertEqual(observed, expected)
def main():
    """Command-line entry point: annotate a VCF, optionally using prescored files.

    Reads options via get_options(). ``I``, ``O``, ``D`` and ``M`` must not
    be None, otherwise a usage message is logged and the process exits.
    The ``SpliceAI`` INFO field (including the DIST column) is declared in
    the header, every input record is passed to get_delta_scores together
    with the opened prescored files, and every record is written to the
    output; only records with a non-empty result get the INFO value.

    ``args.P`` is treated as optional: None means "no prescored files".
    A prescored file that cannot be opened is logged and loading stops, but
    annotation continues with the files opened so far (best effort, as
    before).
    """
    args = get_options()

    if None in [args.I, args.O, args.D, args.M]:
        logging.error(
            'Usage: spliceai [-h] [-I [input]] [-O [output]] -R reference -A annotation '
            '[-D [distance]] [-M [mask]]')
        exit()

    try:
        vcf = pysam.VariantFile(args.I)
    except (IOError, ValueError) as e:
        logging.error('{}'.format(e))
        exit()

    output = None
    prescored_files = []
    # Everything opened below is released in the finally clause, whether the
    # run completes, scoring raises, or exit() is called.
    try:
        header = vcf.header
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3 variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'This version also includes the distance (DIST) to the nearest splice site. '
            'Format: ALLELE|SYMBOL|DIST|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )

        try:
            output = pysam.VariantFile(args.O, mode='w', header=header)
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))
            exit()

        # loading prescored files
        try:
            # `or []` keeps a missing -P option from raising TypeError.
            for filename in args.P or []:
                prescored_files.append(pysam.VariantFile(filename))
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))

        ann = Annotator(args.R, args.A)

        for record in vcf:
            scores = get_delta_scores(record, ann, args.D, args.M, prescored_files)
            if len(scores) > 0:
                record.info['SpliceAI'] = scores
            output.write(record)
    finally:
        for prescored in prescored_files:
            prescored.close()
        if output is not None:
            output.close()
        vcf.close()
def main():
    """Command-line entry point: annotate the input VCF with SpliceAI scores.

    Opens ``args.I``, declares the ``SpliceAI`` INFO field in its header and
    writes every record to ``args.O``. A record receives the INFO value only
    when get_delta_scores returns a non-empty result for it.

    Both files are closed explicitly when the run finishes or fails, so the
    output is finalised rather than left to interpreter shutdown.
    """
    args = get_options()

    vcf = pysam.VariantFile(args.I)
    try:
        header = vcf.header
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAI variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )
        output = pysam.VariantFile(args.O, mode='w', header=header)
        try:
            ann = annotator(args.R, args.A)

            for record in vcf:
                scores = get_delta_scores(record, ann)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            output.close()
    finally:
        vcf.close()
def main():
    """Command-line entry point: annotate a VCF and pad its header.

    Reads options via get_options(). ``I``, ``O``, ``D`` and ``M`` must not
    be None, otherwise a usage message is logged and the process exits.
    Besides the ``SpliceAI`` INFO declaration, the header receives a
    ``fileDate`` line, a ``reference`` line and a dummy ``NS`` INFO
    declaration. Every input record is written to the output; only records
    for which get_delta_scores returns a non-empty result get the
    ``SpliceAI`` INFO value.

    If either file cannot be opened, the error is logged and the process
    exits via ``exit()``.
    """
    args = get_options()

    if None in [args.I, args.O, args.D, args.M]:
        logging.error(
            'Usage: spliceai [-h] [-I [input]] [-O [output]] -R reference -A annotation '
            '[-D [distance]] [-M [mask]]')
        exit()

    try:
        vcf = pysam.VariantFile(args.I)
    except (IOError, ValueError) as e:
        logging.error('{}'.format(e))
        exit()

    # Release both handles whether the run completes, raises or exits.
    try:
        header = vcf.header
        # Adding header lines required to satisfy vcf format output.write()
        # VCF meta-information lines start with exactly two '#'; a third one
        # would become part of the key ("#fileDate").
        header.add_line('##fileDate=20191004')
        header.add_line('##reference=GRCh37/hg19')
        header.add_line(
            '##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant '
            'annotation. These include delta scores (DS) and delta positions (DP) for '
            'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
            'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">'
        )
        header.add_line(
            '##INFO=<ID=NS,Number=20000,Type=Integer,Description="Dummy NS">')

        try:
            output = pysam.VariantFile(args.O, mode='w', header=header)
        except (IOError, ValueError) as e:
            logging.error('{}'.format(e))
            exit()

        try:
            ann = Annotator(args.R, args.A)

            for record in vcf:
                scores = get_delta_scores(record, ann, args.D, args.M)
                if len(scores) > 0:
                    record.info['SpliceAI'] = scores
                output.write(record)
        finally:
            output.close()
    finally:
        vcf.close()
def process_record(records, results, ref_fasta, annotations, dist_var, mask):
    """Worker loop: score variants taken from one queue, publish to another.

    Args:
        records: queue of variants to score; the string 'END' marks the end.
        results: queue receiving ``(record.id, scores)`` tuples.
        ref_fasta: passed to Annotator as its first argument.
        annotations: passed to Annotator as its second argument.
        dist_var: passed to get_delta_scores as its third argument.
        mask: passed to get_delta_scores as its fourth argument.

    The loop ends when 'END' is received. The marker is put back on
    ``records`` first so that other workers reading the same queue stop too.
    """
    # Build the annotation helper once per worker.
    ann = Annotator(ref_fasta, annotations)

    while True:
        # Block until a variant (or the end marker) is available. Polling
        # with get_nowait() and retrying on Empty would spin a CPU core
        # whenever the queue is momentarily empty.
        record = records.get()

        if record != 'END':
            # Score the variant and hand the result back.
            scores = get_delta_scores(record, ann, dist_var, mask)
            results.put((record.id, scores))
            continue

        # End of input: re-queue the marker to terminate the other workers.
        records.put('END')
        break
def process_record(ann, distance, mask, record):
    """Score a single record and attach the result to its INFO field.

    Calls ``get_delta_scores(record, ann, distance, mask)``. When the result
    is non-empty it is stored under ``record.info['SpliceAI']``; otherwise
    the record is left untouched. The same record object is returned in
    both cases.
    """
    delta_scores = get_delta_scores(record, ann, distance, mask)
    if len(delta_scores) == 0:
        return record
    record.info['SpliceAI'] = delta_scores
    return record
def process_variant(variant, genome_version, spliceai_distance, spliceai_mask, use_precomputed_scores):
    """Return SpliceAI scores for one variant, or an error description.

    Steps:
      1. Parse ``variant`` with parse_variant; reject variants where both
         ref and alt are longer than one base.
      2. Check that the position overlaps an interval in
         ANNOTATION_INTERVAL_TREES for ``genome_version``; if not, report an
         error (mentioning the other genome version when the position does
         overlap an interval there).
      3. When the distance equals SPLICEAI_DEFAULT_DISTANCE,
         ``use_precomputed_scores`` is "1" and the allele lengths qualify,
         look for matching lines in SPLICEAI_CACHE_FILES.
      4. If nothing was found, apply the rate limit and compute scores with
         get_delta_scores.

    Args:
        variant: variant string accepted by parse_variant.
        genome_version: "37" or "38".
        spliceai_distance: compared (as a string) with the default distance
            and passed to get_delta_scores.
        spliceai_mask: "1"/"0" (compared as a string) selects the masked/raw
            cache; also passed to get_delta_scores.
        use_precomputed_scores: cache lookup is attempted only when its
            string form is "1".

    Returns:
        dict: on failure ``{"variant", "error"}``; on success ``{"variant",
        "genome_version", "chrom", "pos", "ref", "alt", "scores", "source"}``
        where each score string has its leading allele field removed and
        ``source`` is "lookup" or "computed".
    """
    try:
        chrom, pos, ref, alt = parse_variant(variant)
    except ValueError as e:
        return {
            "variant": variant,
            "error": f"ERROR: {e}",
        }

    if len(ref) > 1 and len(alt) > 1:
        return {
            "variant": variant,
            "error": f"ERROR: SpliceAI does not currently support complex InDels like {chrom}-{pos}-{ref}-{alt}",
        }

    # generate error message if variant falls outside annotated exons or introns
    OTHER_GENOME_VERSION = {"37": "38", "38": "37"}
    chrom_without_chr = chrom.replace("chr", "")
    if not ANNOTATION_INTERVAL_TREES[genome_version][chrom_without_chr].at(pos):
        other_genome_version = OTHER_GENOME_VERSION[genome_version]
        other_genome_overlapping_intervals = ANNOTATION_INTERVAL_TREES[other_genome_version][chrom_without_chr].at(pos)
        if other_genome_overlapping_intervals:
            # Interval data is stringified and split on "---"; the first
            # piece is what gets reported (presumably the gene name).
            other_genome_genes = " and ".join(sorted(
                {str(i.data).split("---")[0] for i in other_genome_overlapping_intervals}))
            return {
                "variant": variant,
                "error": f"ERROR: In GRCh{genome_version}, {chrom}-{pos}-{ref}-{alt} falls outside all gencode exons and introns. "
                         f"SpliceAI only works for variants within known exons or introns. However, in GRCh{other_genome_version}, "
                         f"{chrom}:{pos} falls within {other_genome_genes}, so perhaps GRCh{genome_version} is not the correct genome version?"
            }
        else:
            return {
                "variant": variant,
                "error": f"ERROR: {chrom}-{pos}-{ref}-{alt} falls outside all Gencode exons and introns on "
                         f"GRCh{genome_version}. SpliceAI only works for variants that are within known exons or introns.",
            }

    # NOTE: The reason SpliceAI currently works only for variants within
    # annotated exons or introns is that, although the SpliceAI neural net
    # takes any arbitrary nucleotide sequence as input, SpliceAI needs
    # 1) the transcript strand to determine whether to reverse-complement the
    # reference genome sequence before passing it to the neural net, and
    # 2) transcript start and end positions to determine where to truncate
    # the reference genome sequence.

    source = None
    scores = []
    if (len(ref) <= 5 or len(alt) <= 2) and str(spliceai_distance) == str(SPLICEAI_DEFAULT_DISTANCE) and str(use_precomputed_scores) == "1":
        # examples: ("masked", "snv", "hg19")  ("raw", "indel", "hg38")
        key = (
            "masked" if str(spliceai_mask) == "1" else ("raw" if str(spliceai_mask) == "0" else None),
            "snv" if len(ref) == 1 and len(alt) == 1 else "indel",
            "hg19" if genome_version == "37" else ("hg38" if genome_version == "38" else None),
        )
        # Best effort: any failure here (unknown key, fetch error, malformed
        # line) is only reported, and the scores are computed below instead.
        try:
            results = SPLICEAI_CACHE_FILES[key].fetch(chrom, pos-1, pos+1)
            for line in results:
                # ['1', '739023', '.', 'C', 'CT', '.', '.', 'SpliceAI=CT|AL669831.1|0.00|0.00|0.00|0.00|-1|-37|-48|-37']
                fields = line.split("\t")
                if fields[0] == chrom and int(fields[1]) == pos and fields[3] == ref and fields[4] == alt:
                    scores.append(fields[7])
            if scores:
                source = "lookup"
        except Exception as e:
            print(f"ERROR: couldn't retrieve scores using tabix: {type(e)}: {e}", flush=True)

    if not scores:
        if exceeds_rate_limit(request.remote_addr, request_type="SpliceAI: computed"):
            return {
                "variant": variant,
                "error": f"ERROR: Rate limit reached. To prevent a user from overwhelming the server and making it "
                         f"unavailable to other users, this tool allows no more than "
                         f"{RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE['SpliceAI: computed']} computed requests per minute per user.",
            }

        record = VariantRecord(chrom, pos, ref, alt)
        try:
            scores = get_delta_scores(
                record,
                SPLICEAI_ANNOTATOR[genome_version],
                spliceai_distance,
                spliceai_mask)
            source = "computed"
        except Exception as e:
            return {
                "variant": variant,
                "error": f"ERROR: {type(e)}: {e}",
            }

    if not scores:
        return {
            "variant": variant,
            "error": f"ERROR: The SpliceAI model did not return any scores for {variant}. This is typically due to the "
                     f"variant falling outside of all Gencode exons and introns.",
        }

    scores = [s[s.index("|")+1:] for s in scores]  # drop allele field

    return {
        "variant": variant,
        "genome_version": genome_version,
        "chrom": chrom,
        "pos": pos,
        "ref": ref,
        "alt": alt,
        "scores": scores,
        "source": source,
    }