def main(params):
    """Compare two call sets and print one result section per variant type.

    Parses ``params``, loads both call-set VCFs into ``Variants``
    collections, optionally loads the reference genome, evaluates the two
    sets against each other and prints the statistics for every variant
    type in a fixed order.
    """
    options = parse_args(params)

    def load_callset(path):
        # Variants is built while the file is still open, as the reader
        # is consumed from the handle.
        with open(path) as handle:
            return Variants(vcf.Reader(handle), MAX_INDEL_LEN)

    one_vars = load_callset(options.callset1_vcf)
    two_vars = load_callset(options.callset2_vcf)

    # Without a reference there is neither a genome nor a window.
    genome = None
    window = None
    if options.reference:
        genome = Genome(options.reference,
                        abbreviate=lambda ctig: ctig.split()[0])
        window = options.window

    stat_reporter, _errors = evaluate_variants(
        one_vars, two_vars, options.sv_eps, options.sv_eps, genome, window)

    sections = (
        (VARIANT_TYPE.SNP, "SNP"),
        (VARIANT_TYPE.INDEL_DEL, "INDEL DELETIONS"),
        (VARIANT_TYPE.INDEL_INS, "INDEL INSERTIONS"),
        (VARIANT_TYPE.INDEL_OTH, "INDEL OTHER"),
        (VARIANT_TYPE.SV_DEL, "SV DELETIONS"),
        (VARIANT_TYPE.SV_INS, "SV INSERTIONS"),
        (VARIANT_TYPE.SV_OTH, "SV OTHER"),
    )
    for variant_type, heading in sections:
        print_results(stat_reporter(variant_type), heading)
def main(params):
    """Evaluate a predicted VCF against a truth VCF and report statistics.

    Parses ``params``, optionally loads the reference genome, reads the
    truth / predicted / known-false-positive VCFs (normalizing the first
    two when requested), evaluates them, and writes the report to stdout
    as TSV or as text depending on ``args.output``.  When
    ``args.output_vcf`` is set the annotated variants are written there.

    Exits with status 1 if normalization is requested without a
    reference, since ``normalize`` would otherwise receive ``None``.
    """
    args = parse_args(params)

    if args.normalize and not args.reference:
        print("Normalization requires a reference file.", file=sys.stderr)
        # Previously execution continued and normalize() was called with
        # ref=None; stop here instead.
        sys.exit(1)

    if args.reference:
        ref = Genome(args.reference, abbreviate=lambda ctig: ctig.split()[0])
        window = args.window
    else:
        ref = None
        window = None

    with open(args.true_vcf) as f:
        true_vcf = vcf.Reader(f)
        if args.normalize:
            true_vcf = normalize(ref, true_vcf)
        true_vars = Variants(true_vcf, MAX_INDEL_LEN)

    with open(args.predicted_vcf) as f:
        pred_vcf = vcf.Reader(f)
        if args.normalize:
            pred_vcf = normalize(ref, pred_vcf)
        pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)

    if args.knownFP:
        with open(args.knownFP) as f:
            known_fp_vcf = vcf.Reader(f)
            known_fp_vars = Variants(known_fp_vcf, MAX_INDEL_LEN, knownFP=True)
    else:
        known_fp_vars = None

    sv_eps = args.sv_eps
    stat_reporter, annotated_vars = evaluate_variants(
        true_vars,
        pred_vars,
        sv_eps,  # tolerance for SV len
        sv_eps,  # tolerance for SV breakpoints
        ref,
        window,
        known_fp_vars,
    )

    if args.output == "tsv":
        print(get_text_header(params), file=sys.stdout)
        tsvwriter = csv.writer(sys.stdout, delimiter='\t')
        tsvwriter.writerow(get_tsv_header(args.knownFP))
        tsv_rows = (
            ("SNP", VARIANT_TYPE.SNP, args.snp_err_rate),
            ("Indel Deletions", VARIANT_TYPE.INDEL_DEL, args.indel_err_rate),
            ("Indel Insertions", VARIANT_TYPE.INDEL_INS, args.indel_err_rate),
            ("Indel Inversions", VARIANT_TYPE.INDEL_INV, args.indel_err_rate),
            ("Indel Other", VARIANT_TYPE.INDEL_OTH, args.indel_err_rate),
            ("SV Deletions", VARIANT_TYPE.SV_DEL, args.sv_err_rate),
            ("SV Insertions", VARIANT_TYPE.SV_INS, args.sv_err_rate),
            ("SV Other", VARIANT_TYPE.SV_OTH, args.sv_err_rate),
        )
        for label, var_type, err_rate in tsv_rows:
            tsvwriter.writerow(tsv_row(label, stat_reporter(var_type),
                                       err_rate, args.knownFP, args.hideFP))
    else:
        print(get_text_header(params), file=sys.stdout)
        snp_stats = stat_reporter(VARIANT_TYPE.SNP)
        print_snp_stats(snp_stats, args.snp_err_rate, known_fp_vars,
                        args.hideFP)

        def print_sv(var_type, description, args):
            # Indel types use the indel error rate, SV types the SV one.
            assert 'INDEL' in var_type or 'SV' in var_type
            err_rate = (args.indel_err_rate if 'INDEL' in var_type
                        else args.sv_err_rate)
            print_sv_stats(description, stat_reporter(var_type), err_rate,
                           args)

        def print_oth(var_type, description):
            # "Other" categories only report the raw counts.
            counts = stat_reporter(var_type)
            print_sv_other_results(description, counts['num_true'],
                                   counts['num_pred'])

        print_sv(VARIANT_TYPE.INDEL_DEL, 'INDEL DELETION', args)
        print_sv(VARIANT_TYPE.INDEL_INS, 'INDEL INSERTION', args)
        print_sv(VARIANT_TYPE.INDEL_INV, 'INDEL INVERSION', args)
        print_oth(VARIANT_TYPE.INDEL_OTH, 'INDEL OTHER')
        print_sv(VARIANT_TYPE.SV_DEL, 'SV DELETION', args)
        print_sv(VARIANT_TYPE.SV_INS, 'SV INSERTION', args)
        print_oth(VARIANT_TYPE.SV_OTH, 'SV OTHER')

    if args.output_vcf:
        contigs = ref.keys() if ref is not None else None
        # Use a context manager so the output VCF is flushed and closed.
        with open(args.output_vcf, 'w') as out_vcf:
            output_annotated_variants(annotated_vars, contigs, out_vcf,
                                      get_vcf_header_lines(params))
def main(params):
    """Evaluate a predicted VCF against a truth VCF and print text statistics.

    Parses ``params``, loads the truth, predicted and (optional)
    known-false-positive VCFs into ``Variants`` collections, optionally
    loads the reference genome, evaluates the call sets and prints the
    per-type statistics.  When ``args.err_vcf`` is set the errors returned
    by the evaluation are written to that file.
    """
    args = parse_args(params)

    with open(args.true_vcf) as f:
        true_vcf = vcf.Reader(f)
        true_vars = Variants(true_vcf, MAX_INDEL_LEN)

    with open(args.predicted_vcf) as f:
        pred_vcf = vcf.Reader(f)
        pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)

    if args.reference:
        ref = Genome(args.reference, abbreviate=lambda ctig: ctig.split()[0])
        window = args.window
    else:
        ref = None
        window = None

    if args.knownFP:
        with open(args.knownFP) as f:
            known_fp_vcf = vcf.Reader(f)
            # NOTE(review): keyword is spelled `knownFp` here -- confirm it
            # matches the Variants constructor signature.
            known_fp_vars = Variants(known_fp_vcf, MAX_INDEL_LEN, knownFp=True)
    else:
        known_fp_vars = None

    # Estimated total number of errors in validation data for SNPs, indels
    # and SVs.
    snp_err = get_snp_err(true_vars, args.snp_err_rate)
    indel_err = get_indel_err(true_vars, args.indel_err_rate)
    sv_err = get_sv_err(true_vars, args.sv_err_rate)

    sv_eps = args.sv_eps
    stat_reporter, errors = evaluate_variants(
        true_vars, pred_vars, sv_eps, sv_eps, ref, window, known_fp_vars)

    snp_stats = stat_reporter(VARIANT_TYPE.SNP)
    print_snp_stats(snp_stats, snp_err, known_fp_vars)

    def print_sv(var_type, description):
        # Indel types use the indel error estimate, SV types the SV one.
        assert 'INDEL' in var_type or 'SV' in var_type
        err = indel_err if 'INDEL' in var_type else sv_err
        print_sv_stats(description, stat_reporter(var_type), err)

    def print_oth(var_type, description):
        # "Other" categories only report the raw counts.
        counts = stat_reporter(var_type)
        print_sv_other_results(description, counts['num_true'],
                               counts['num_pred'])

    print_sv(VARIANT_TYPE.INDEL_DEL, 'INDEL DELETION')
    print_sv(VARIANT_TYPE.INDEL_INS, 'INDEL INSERTION')
    print_oth(VARIANT_TYPE.INDEL_OTH, 'INDEL OTHER')
    print_sv(VARIANT_TYPE.SV_DEL, 'SV DELETION')
    print_sv(VARIANT_TYPE.SV_INS, 'SV INSERTION')
    print_oth(VARIANT_TYPE.SV_OTH, 'SV OTHER')

    if args.err_vcf:
        contigs = ref.keys() if ref is not None else None
        # Use a context manager so the error VCF is flushed and closed.
        with open(args.err_vcf, 'w') as err_file:
            output_errors(errors, contigs, err_file)
def main(params):
    """Evaluate a predicted VCF against a truth VCF and report statistics.

    With ``args.knownFP`` set, all variants are loaded into ``Variants``
    collections and compared with ``evaluate_variants``; otherwise the two
    readers are handed to ``evaluate_low_memory``.  The report goes to
    stdout as TSV, JSON or text depending on ``args.output``.

    Every file opened here is closed before returning.  Exits with status
    1 if normalization is requested without a reference, since
    ``normalize`` would otherwise receive ``None``.
    """
    args = parse_args(params)

    if args.normalize and not args.reference:
        print("Normalization requires a reference file.", file=sys.stderr)
        # Previously execution continued and normalize() was called with
        # ref=None; stop here instead.
        sys.exit(1)

    if args.reference:
        ref = Genome(args.reference, abbreviate=lambda ctig: ctig.split()[0])
        window = args.window
    else:
        ref = None
        window = None

    contig_lookup = get_contig_lookup(args.refindex)
    sv_eps = args.sv_eps

    # The readers are consumed later (possibly streamed by the low-memory
    # evaluation), so the handles are tracked and closed once at the end.
    opened = []

    def open_tracked(path, mode='r'):
        handle = open(path, mode)
        opened.append(handle)
        return handle

    try:
        true_vcf = vcf.Reader(open_tracked(args.true_vcf))
        if args.normalize:
            true_vcf = normalize(ref, true_vcf)
        pred_vcf = vcf.Reader(open_tracked(args.predicted_vcf))
        if args.normalize:
            pred_vcf = normalize(ref, pred_vcf)

        if args.knownFP:
            true_vars = Variants(true_vcf, MAX_INDEL_LEN)
            pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)
            known_fp_vcf = vcf.Reader(open_tracked(args.knownFP, 'r'))
            known_fp_vars = Variants(known_fp_vcf, MAX_INDEL_LEN,
                                     knownFP=True)
            stat_reporter, annotated_vars = evaluate_variants(
                true_vars, pred_vars, sv_eps, sv_eps, ref, window,
                known_fp_vars)
            if args.output_vcf:
                output_annotated_variants(
                    annotated_vars,
                    ref.keys() if ref is not None else None,
                    open_tracked(args.output_vcf, 'w'),
                    get_vcf_header_lines(params))
        else:
            if args.output_vcf:
                outVCF = vcf.Writer(open_tracked(args.output_vcf, 'w'),
                                    true_vcf)
            else:
                outVCF = None
            stat_reporter = evaluate_low_memory(
                true_vcf, pred_vcf, sv_eps, sv_eps, ref, window,
                MAX_INDEL_LEN, contig_lookup, outVCF)

        if args.output == "tsv":
            print(get_text_header(params), file=sys.stdout)
            tsvwriter = csv.writer(sys.stdout, delimiter='\t')
            tsvwriter.writerow(get_tsv_header(args.knownFP))
            tsv_rows = (
                ("SNP", VARIANT_TYPE.SNP, args.snp_err_rate),
                ("Indel Deletions", VARIANT_TYPE.INDEL_DEL,
                 args.indel_err_rate),
                ("Indel Insertions", VARIANT_TYPE.INDEL_INS,
                 args.indel_err_rate),
                ("Indel Inversions", VARIANT_TYPE.INDEL_INV,
                 args.indel_err_rate),
                ("Indel Other", VARIANT_TYPE.INDEL_OTH, args.indel_err_rate),
                ("SV Deletions", VARIANT_TYPE.SV_DEL, args.sv_err_rate),
                ("SV Insertions", VARIANT_TYPE.SV_INS, args.sv_err_rate),
                ("SV Other", VARIANT_TYPE.SV_OTH, args.sv_err_rate),
            )
            for label, var_type, err_rate in tsv_rows:
                tsvwriter.writerow(
                    tsv_row(label, stat_reporter(var_type), err_rate,
                            args.knownFP, args.hideFP))
        elif args.output == "json":
            output_dict = add_json_header(params, {})
            # NOTE(review): the key "SV Deleitions" is misspelled and there
            # is no "SV Other" entry; both are kept as-is because the keys
            # are part of the emitted JSON -- confirm before changing.
            json_entries = (
                ("SNP", VARIANT_TYPE.SNP, args.snp_err_rate),
                ("Indel Deletions", VARIANT_TYPE.INDEL_DEL,
                 args.indel_err_rate),
                ("Indel Insertions", VARIANT_TYPE.INDEL_INS,
                 args.indel_err_rate),
                ("Indel Inversions", VARIANT_TYPE.INDEL_INV,
                 args.indel_err_rate),
                ("Indel Other", VARIANT_TYPE.INDEL_OTH, args.indel_err_rate),
                ("SV Deleitions", VARIANT_TYPE.SV_DEL, args.sv_err_rate),
                ("SV Insertions", VARIANT_TYPE.SV_INS, args.sv_err_rate),
            )
            for key, var_type, err_rate in json_entries:
                output_dict[key] = json_dict(stat_reporter(var_type),
                                             err_rate, args.knownFP,
                                             args.hideFP)
            json.dump(output_dict, sys.stdout)
        else:
            print(get_text_header(params), file=sys.stdout)
            snp_stats = stat_reporter(VARIANT_TYPE.SNP)
            print_snp_stats(snp_stats, args.snp_err_rate, args.knownFP,
                            args.hideFP)

            def print_sv(var_type, description, args):
                # Indel types use the indel error rate, SV types the SV one.
                assert 'INDEL' in var_type or 'SV' in var_type
                err_rate = (args.indel_err_rate if 'INDEL' in var_type
                            else args.sv_err_rate)
                print_sv_stats(description, stat_reporter(var_type),
                               err_rate, args)

            def print_oth(var_type, description):
                # "Other" categories only report the raw counts.
                counts = stat_reporter(var_type)
                print_sv_other_results(description, counts['num_true'],
                                       counts['num_pred'])

            print_sv(VARIANT_TYPE.INDEL_DEL, 'INDEL DELETION', args)
            print_sv(VARIANT_TYPE.INDEL_INS, 'INDEL INSERTION', args)
            print_sv(VARIANT_TYPE.INDEL_INV, 'INDEL INVERSION', args)
            print_oth(VARIANT_TYPE.INDEL_OTH, 'INDEL OTHER')
            print_sv(VARIANT_TYPE.SV_DEL, 'SV DELETION', args)
            print_sv(VARIANT_TYPE.SV_INS, 'SV INSERTION', args)
            print_oth(VARIANT_TYPE.SV_OTH, 'SV OTHER')
    finally:
        # Closing also flushes anything still buffered for the output VCF.
        for handle in opened:
            handle.close()
def main(params):
    """Evaluate a predicted VCF against a truth VCF and report statistics.

    Parses ``params``, optionally loads the reference genome, reads the
    truth / predicted / known-false-positive VCFs (normalizing the first
    two when requested), derives the error estimates for SNPs, indels and
    SVs, evaluates the call sets and writes the report to stdout as TSV or
    text depending on ``args.output``.  When ``args.output_vcf`` is set
    the annotated variants are written there.

    Exits with status 1 if normalization is requested without a
    reference, since ``normalize`` would otherwise receive ``None``.
    """
    args = parse_args(params)

    if args.normalize and not args.reference:
        print("Normalization requires a reference file.", file=sys.stderr)
        # Previously execution continued and normalize() was called with
        # ref=None; stop here instead.
        sys.exit(1)

    if args.reference:
        ref = Genome(args.reference, abbreviate=lambda ctig: ctig.split()[0])
        window = args.window
    else:
        ref = None
        window = None

    with open(args.true_vcf) as f:
        true_vcf = vcf.Reader(f)
        if args.normalize:
            true_vcf = normalize(ref, true_vcf)
        true_vars = Variants(true_vcf, MAX_INDEL_LEN)

    with open(args.predicted_vcf) as f:
        pred_vcf = vcf.Reader(f)
        if args.normalize:
            pred_vcf = normalize(ref, pred_vcf)
        pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)

    if args.knownFP:
        with open(args.knownFP) as f:
            known_fp_vcf = vcf.Reader(f)
            # NOTE(review): keyword is spelled `knownFp` here -- confirm it
            # matches the Variants constructor signature.
            known_fp_vars = Variants(known_fp_vcf, MAX_INDEL_LEN, knownFp=True)
    else:
        known_fp_vars = None

    # Estimated total number of errors in validation data for SNPs, indels
    # and SVs.
    snp_err = get_snp_err(true_vars, args.snp_err_rate)
    indel_err = get_indel_err(true_vars, args.indel_err_rate)
    sv_err = get_sv_err(true_vars, args.sv_err_rate)

    sv_eps = args.sv_eps
    stat_reporter, annotated_vars = evaluate_variants(
        true_vars, pred_vars, sv_eps, sv_eps, ref, window, known_fp_vars)

    if args.output == "tsv":
        print(get_text_header(params), file=sys.stdout)
        tsvwriter = csv.writer(sys.stdout, delimiter='\t')
        tsvwriter.writerow(tsv_header)
        tsv_rows = (
            ("SNP", VARIANT_TYPE.SNP, snp_err),
            ("Indel Deletions", VARIANT_TYPE.INDEL_DEL, indel_err),
            ("Indel Insertions", VARIANT_TYPE.INDEL_INS, indel_err),
            ("Indel Inversions", VARIANT_TYPE.INDEL_INV, indel_err),
            ("Indel Other", VARIANT_TYPE.INDEL_OTH, indel_err),
            ("SV Deletions", VARIANT_TYPE.SV_DEL, sv_err),
            ("SV Insertions", VARIANT_TYPE.SV_INS, sv_err),
            ("SV Other", VARIANT_TYPE.SV_OTH, sv_err),
        )
        for label, var_type, err in tsv_rows:
            tsvwriter.writerow(tsv_row(label, stat_reporter(var_type), err))
    else:
        print(get_text_header(params), file=sys.stdout)
        snp_stats = stat_reporter(VARIANT_TYPE.SNP)
        print_snp_stats(snp_stats, snp_err, known_fp_vars)

        def print_sv(var_type, description):
            # Indel types use the indel error estimate, SV types the SV one.
            assert 'INDEL' in var_type or 'SV' in var_type
            err = indel_err if 'INDEL' in var_type else sv_err
            print_sv_stats(description, stat_reporter(var_type), err)

        def print_oth(var_type, description):
            # "Other" categories only report the raw counts.
            counts = stat_reporter(var_type)
            print_sv_other_results(description, counts['num_true'],
                                   counts['num_pred'])

        print_sv(VARIANT_TYPE.INDEL_DEL, 'INDEL DELETION')
        print_sv(VARIANT_TYPE.INDEL_INS, 'INDEL INSERTION')
        print_sv(VARIANT_TYPE.INDEL_INV, 'INDEL INVERSION')
        print_oth(VARIANT_TYPE.INDEL_OTH, 'INDEL OTHER')
        print_sv(VARIANT_TYPE.SV_DEL, 'SV DELETION')
        print_sv(VARIANT_TYPE.SV_INS, 'SV INSERTION')
        print_oth(VARIANT_TYPE.SV_OTH, 'SV OTHER')

    if args.output_vcf:
        contigs = ref.keys() if ref is not None else None
        # Use a context manager so the output VCF is flushed and closed.
        with open(args.output_vcf, 'w') as out_vcf:
            output_annotated_variants(annotated_vars, contigs, out_vcf,
                                      get_vcf_header_lines(params))
def main(params):
    """Run the benchmark of a predicted VCF against a truth VCF.

    Two evaluation paths exist: when ``args.knownFP`` is given, variants
    are loaded into ``Variants`` collections and passed to
    ``evaluate_variants``; otherwise the VCF readers go straight to
    ``evaluate_low_memory``.  Results are printed to stdout in the format
    selected by ``args.output`` ("tsv", "json", or text for anything else).

    All input and output files opened by this function are closed on the
    way out.  If ``args.normalize`` is set without ``args.reference`` the
    process exits with status 1 after printing the error, instead of
    passing a ``None`` reference to ``normalize``.
    """
    args = parse_args(params)

    if args.normalize and not args.reference:
        print("Normalization requires a reference file.", file=sys.stderr)
        # Do not fall through: normalize() below needs a real reference.
        sys.exit(1)

    if args.reference:
        ref = Genome(args.reference, abbreviate=lambda ctig: ctig.split()[0])
        window = args.window
    else:
        ref = None
        window = None

    contig_lookup = get_contig_lookup(args.refindex)
    sv_eps = args.sv_eps

    # Handles must outlive the readers built on them, so they are
    # collected here and released together in the ``finally`` clause.
    file_handles = []

    def tracked_open(path, mode='r'):
        fh = open(path, mode)
        file_handles.append(fh)
        return fh

    def row_specs(include_sv_other):
        # (label, variant type, error rate) for each reported category.
        specs = [
            ("SNP", VARIANT_TYPE.SNP, args.snp_err_rate),
            ("Indel Deletions", VARIANT_TYPE.INDEL_DEL, args.indel_err_rate),
            ("Indel Insertions", VARIANT_TYPE.INDEL_INS, args.indel_err_rate),
            ("Indel Inversions", VARIANT_TYPE.INDEL_INV, args.indel_err_rate),
            ("Indel Other", VARIANT_TYPE.INDEL_OTH, args.indel_err_rate),
        ]
        if include_sv_other:
            specs.extend([
                ("SV Deletions", VARIANT_TYPE.SV_DEL, args.sv_err_rate),
                ("SV Insertions", VARIANT_TYPE.SV_INS, args.sv_err_rate),
                ("SV Other", VARIANT_TYPE.SV_OTH, args.sv_err_rate),
            ])
        else:
            # NOTE(review): the JSON report uses the misspelled key
            # "SV Deleitions" and has no "SV Other" entry; kept unchanged
            # because the keys are emitted output -- confirm before fixing.
            specs.extend([
                ("SV Deleitions", VARIANT_TYPE.SV_DEL, args.sv_err_rate),
                ("SV Insertions", VARIANT_TYPE.SV_INS, args.sv_err_rate),
            ])
        return specs

    try:
        true_vcf = vcf.Reader(tracked_open(args.true_vcf))
        if args.normalize:
            true_vcf = normalize(ref, true_vcf)
        pred_vcf = vcf.Reader(tracked_open(args.predicted_vcf))
        if args.normalize:
            pred_vcf = normalize(ref, pred_vcf)

        if args.knownFP:
            true_vars = Variants(true_vcf, MAX_INDEL_LEN)
            pred_vars = Variants(pred_vcf, MAX_INDEL_LEN)
            known_fp_vcf = vcf.Reader(tracked_open(args.knownFP, 'r'))
            known_fp_vars = Variants(known_fp_vcf, MAX_INDEL_LEN,
                                     knownFP=True)
            stat_reporter, annotated_vars = evaluate_variants(
                true_vars, pred_vars, sv_eps, sv_eps, ref, window,
                known_fp_vars)
            if args.output_vcf:
                output_annotated_variants(
                    annotated_vars,
                    ref.keys() if ref is not None else None,
                    tracked_open(args.output_vcf, 'w'),
                    get_vcf_header_lines(params))
        else:
            outVCF = None
            if args.output_vcf:
                outVCF = vcf.Writer(tracked_open(args.output_vcf, 'w'),
                                    true_vcf)
            stat_reporter = evaluate_low_memory(
                true_vcf, pred_vcf, sv_eps, sv_eps, ref, window,
                MAX_INDEL_LEN, contig_lookup, outVCF)

        if args.output == "tsv":
            print(get_text_header(params), file=sys.stdout)
            tsvwriter = csv.writer(sys.stdout, delimiter='\t')
            tsvwriter.writerow(get_tsv_header(args.knownFP))
            for label, var_type, err_rate in row_specs(True):
                tsvwriter.writerow(
                    tsv_row(label, stat_reporter(var_type), err_rate,
                            args.knownFP, args.hideFP))
        elif args.output == "json":
            output_dict = add_json_header(params, {})
            for key, var_type, err_rate in row_specs(False):
                output_dict[key] = json_dict(stat_reporter(var_type),
                                             err_rate, args.knownFP,
                                             args.hideFP)
            json.dump(output_dict, sys.stdout)
        else:
            print(get_text_header(params), file=sys.stdout)
            snp_stats = stat_reporter(VARIANT_TYPE.SNP)
            print_snp_stats(snp_stats, args.snp_err_rate, args.knownFP,
                            args.hideFP)

            def print_sv(var_type, description, args):
                # Indel types use the indel error rate, SV types the SV one.
                assert 'INDEL' in var_type or 'SV' in var_type
                err_rate = (args.indel_err_rate if 'INDEL' in var_type
                            else args.sv_err_rate)
                print_sv_stats(description, stat_reporter(var_type),
                               err_rate, args)

            def print_oth(var_type, description):
                # "Other" categories only report the raw counts.
                counts = stat_reporter(var_type)
                print_sv_other_results(description, counts['num_true'],
                                       counts['num_pred'])

            print_sv(VARIANT_TYPE.INDEL_DEL, 'INDEL DELETION', args)
            print_sv(VARIANT_TYPE.INDEL_INS, 'INDEL INSERTION', args)
            print_sv(VARIANT_TYPE.INDEL_INV, 'INDEL INVERSION', args)
            print_oth(VARIANT_TYPE.INDEL_OTH, 'INDEL OTHER')
            print_sv(VARIANT_TYPE.SV_DEL, 'SV DELETION', args)
            print_sv(VARIANT_TYPE.SV_INS, 'SV INSERTION', args)
            print_oth(VARIANT_TYPE.SV_OTH, 'SV OTHER')
    finally:
        # Closing flushes any buffered output-VCF data as well.
        for fh in file_handles:
            fh.close()