def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest sample-list',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('test_sample_list', type=str)
    parser.add_argument(
        '--valid-sample-list', type=str, default=None,
        help='Sample ids not found in this list will cause an error')
    parser.add_argument('--prefix', type=str, default=None,
                        help='Prefix to add to metric names')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    samples = iou.read_samples_list(args.test_sample_list, fail_on_empty=False)
    if args.valid_sample_list is not None:
        valid_samples = iou.read_samples_list(args.valid_sample_list)
    else:
        valid_samples = None

    # Get metrics
    metrics = get_metrics(samples, valid_samples, args.prefix)

    # Write metrics
    write_metrics(metrics)

def get_metrics(file, sample_list):
    samples = iou.read_samples_list(sample_list)
    samples_set = set(samples)

    data = [0, 0, 0, 0]  # ++, --, +-, -+
    for line in file:
        tokens = line.decode().strip().split('\t')
        test_record(tokens, samples_set)
        first = tokens[2]
        second = tokens[5]
        val = first + second
        if val == '++':
            data[0] += 1
        elif val == '--':
            data[1] += 1
        elif val == '+-':
            data[2] += 1
        elif val == '-+':
            data[3] += 1
        else:
            raise ValueError("Unrecognized orientation: %s / %s" % (first, second))

    if len(samples) == 1:
        metric_suffix = "_" + samples[0]
    else:
        metric_suffix = "_merged"
    return {
        PLUS_PLUS_KEY + metric_suffix: data[0],
        MINUS_MINUS_KEY + metric_suffix: data[1],
        PLUS_MINUS_KEY + metric_suffix: data[2],
        MINUS_PLUS_KEY + metric_suffix: data[3]
    }

def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest ped-file',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('test_ped_file', type=str)
    parser.add_argument(
        '--sample-list', type=str, default=None,
        help='Sample ids not found in this list will cause an error')
    parser.add_argument('--prefix', type=str, default=None,
                        help='Prefix to add to metric names')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.sample_list is not None:
        samples = iou.read_samples_list(args.sample_list)
    else:
        samples = None

    # Get metrics
    df = pd.read_csv(args.test_ped_file, sep='\t', names=range(6))
    metrics = get_metrics(df, valid_samples=samples, metric_prefix=args.prefix)

    # Write metrics
    write_metrics(metrics)

def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest medcov',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('test_file', type=str)
    parser.add_argument('sample_list', type=str)
    parser.add_argument('--baseline-file', type=str, default=None)

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    samples = iou.read_samples_list(args.sample_list)

    # Read file
    with open(args.test_file, mode='r') as ftest:
        if args.baseline_file is None:
            metrics = get_metrics(ftest, None, samples)
        else:
            with open(args.baseline_file, mode='r') as fbase:
                metrics = get_metrics(ftest, fbase, samples)

    # Write metrics
    write_metrics(metrics)

def get_metrics(matrix_file, sample_list, low_mem_mode):
    samples = iou.read_samples_list(sample_list)
    samples_set = set(samples)

    header = matrix_file.readline().decode().strip().split('\t')
    header_samples_set = set(header[3:])
    tu.test_sets_equal(header_samples_set, samples_set, item_str="sample",
                       name_a="header", name_b="samples list")

    data = []
    interval_size = None
    num_records = 0
    for line in matrix_file:
        num_records += 1
        tokens = line.decode().strip().split('\t')
        tu.test_is_int(tokens, 1)
        tu.test_is_int(tokens, 2)
        if interval_size is None:
            interval_size = int(tokens[2]) - int(tokens[1])
        else:
            if interval_size != int(tokens[2]) - int(tokens[1]):
                raise ValueError(
                    "Interval not of size {:d}: {:s}:{:d}-{:d}".format(
                        interval_size, tokens[0], int(tokens[1]), int(tokens[2])))
        counts = tokens[3:]
        test_record(counts, len(samples_set))
        if not low_mem_mode:
            data.append([int(x) for x in counts])

    if not low_mem_mode:
        arr = np.asarray(data)
        quantiles = np.quantile(arr, [0.25, 0.50, 0.75])
        max_over_samples = arr.max(axis=1)
        num_zero_in_all = len(max_over_samples[max_over_samples == 0])
        min_over_samples = arr.min(axis=1)
        num_zero_in_one = len(min_over_samples[min_over_samples == 0])
        metrics = {
            Q25_KEY: quantiles[0],
            Q50_KEY: quantiles[1],
            Q75_KEY: quantiles[2],
            INTERVALS_KEY: num_records,
            ALL_ZERO_KEY: num_zero_in_all,
            ONE_ZERO_KEY: num_zero_in_one
        }
        column_means = arr.mean(axis=0)
        for col, sample in enumerate(header[3:]):
            metrics[SAMPLE_MEAN_KEY + "_" + sample] = column_means[col]
    else:
        metrics = {INTERVALS_KEY: num_records}
    return metrics

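# Illustrative sketch (not part of the original module): how the axis
# reductions above count zero-coverage intervals. Rows of the matrix are
# intervals and columns are samples, so a zero row-max means the interval is
# zero in every sample, while a zero row-min means it is zero in at least one
# sample. The function name and array values below are made up for
# demonstration only.
def _example_zero_interval_counts():
    import numpy as np
    arr = np.asarray([[0, 0, 0],   # zero in all samples
                      [5, 0, 7],   # zero in one sample
                      [3, 4, 2]])  # no zeros
    num_zero_in_all = int((arr.max(axis=1) == 0).sum())  # -> 1
    num_zero_in_one = int((arr.min(axis=1) == 0).sum())  # -> 2
    return num_zero_in_all, num_zero_in_one
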
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest merged-depth',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('test_bed', type=str)
    parser.add_argument('contig_list', type=str)
    parser.add_argument('type', type=str)
    parser.add_argument('--baseline-bed', type=str, default=None,
                        help="Baseline bed file to evaluate against")
    parser.add_argument(
        '--test-hits', type=str,
        help="List of test record ids that overlap baseline set (required if using --baseline-bed)")
    parser.add_argument(
        '--baseline-hits', type=str,
        help="List of baseline record ids that overlap test set (required if using --baseline-bed)")

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if (bool(args.baseline_bed) ^ bool(args.test_hits)) or \
            (bool(args.baseline_bed) ^ bool(args.baseline_hits)) or \
            (bool(args.test_hits) ^ bool(args.baseline_hits)):
        raise ValueError(
            "Inconsistent arguments specified: --baseline-bed, --test-hits, and "
            "--baseline-hits must be specified together.")

    contigs = iou.read_contig_list(args.contig_list)

    # Read file
    with gzip.open(args.test_bed, mode='rb') as ftest:
        if args.baseline_bed is None:
            metrics = get_metrics(ftest, None, contigs, args.type,
                                  args.test_hits, args.baseline_hits)
        else:
            with gzip.open(args.baseline_bed, mode='rb') as fbase:
                metrics = get_metrics(ftest, fbase, contigs, args.type,
                                      args.test_hits, args.baseline_hits)

    # Write metrics
    write_metrics(metrics)

def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest plot-metrics',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('metrics_a', type=str)
    parser.add_argument('metrics_b', type=str)
    parser.add_argument('pdf_out', type=str)
    parser.add_argument('--sample-list', type=str, default=None)
    parser.add_argument('--changes-only', action='store_true',
                        help='Only plot values that are different')
    parser.add_argument('--metrics-out', type=str, default=None,
                        help='Write plotted metrics to tsv')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Read metric tables and join
    df_a = get_metrics(args.metrics_a)
    df_b = get_metrics(args.metrics_b)
    df = df_a.join(df_b, how='outer', lsuffix='_a', rsuffix='_b', sort=True)

    # If sample ids are provided, consolidate sample-specific metrics
    if args.sample_list is not None:
        samples = iou.read_samples_list(args.sample_list)
        df = consolidate_sample_metrics(df, samples)

    # Only plot changed metrics
    if args.changes_only:
        df = df[df["value_a"] != df["value_b"]]

    # Write raw data to file
    if args.metrics_out is not None:
        df.to_csv(args.metrics_out, sep='\t')

    # Plot
    plot_data(df, args.pdf_out)

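# Illustrative sketch (not part of the original module): assuming get_metrics
# returns a single-column DataFrame named 'value' indexed by metric name, the
# outer join above produces the 'value_a' and 'value_b' columns compared by
# the --changes-only filter. The function name and values below are made up
# for demonstration only.
def _example_metrics_join():
    import pandas as pd
    df_a = pd.DataFrame({'value': [1.0, 2.0]}, index=['metric_x', 'metric_y'])
    df_b = pd.DataFrame({'value': [1.0, 3.0]}, index=['metric_x', 'metric_z'])
    df = df_a.join(df_b, how='outer', lsuffix='_a', rsuffix='_b', sort=True)
    # Keep rows where the values differ; since NaN != NaN evaluates True,
    # metrics present in only one table are also retained by this filter.
    return df[df['value_a'] != df['value_b']]
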
def get_metrics(baf_file, sample_list):
    samples = iou.read_samples_list(sample_list)
    samples_set = set(samples)

    data = []
    for line in baf_file:
        tokens = line.decode().strip().split('\t')
        test_record(tokens, samples_set)
        baf = float(tokens[2])
        data.append(baf)

    arr = np.asarray(data)
    quantiles = np.quantile(arr, [0.25, 0.50, 0.75])
    if len(samples) == 1:
        metric_suffix = "_" + samples[0]
    else:
        metric_suffix = "_merged"
    return {
        Q25_KEY + metric_suffix: quantiles[0],
        Q50_KEY + metric_suffix: quantiles[1],
        Q75_KEY + metric_suffix: quantiles[2],
        COUNT_KEY + metric_suffix: len(arr)
    }

def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest metrics-file',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('metrics_file', type=str)
    parser.add_argument('contig_list', type=str)
    parser.add_argument('--common', action='store_true')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    contigs = iou.read_contig_list(args.contig_list)
    tu.test_is_not_empty(contigs, "contigs")

    # Read file
    df = pd.read_csv(args.metrics_file, sep='\t')
    metrics = get_metrics(df, contigs, args.common)

    # Write metrics
    write_metrics(metrics)

def get_metrics(sr_file, sample_list):
    samples = iou.read_samples_list(sample_list)
    samples_set = set(samples)

    side_metrics = [0, 0]
    for line in sr_file:
        tokens = line.decode().strip().split('\t')
        test_record(tokens, samples_set)
        side = tokens[2]
        if side == 'left':
            side_metrics[0] += 1
        elif side == 'right':
            side_metrics[1] += 1
        else:
            raise ValueError("Unrecognized orientation: %s" % side)

    if len(samples) == 1:
        metric_suffix = "_" + samples[0]
    else:
        metric_suffix = "_merged"
    return {
        LEFT_KEY + metric_suffix: side_metrics[0],
        RIGHT_KEY + metric_suffix: side_metrics[1]
    }

def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest vcf',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('test_vcf', type=str)
    parser.add_argument('contig_list', type=str)
    parser.add_argument('sample_list', type=str)
    parser.add_argument(
        'types', type=str,
        help='Comma-delimited list of variant types (case-sensitive)')
    parser.add_argument('metric_prefix', type=str)
    parser.add_argument(
        '--baseline-vcf', type=str,
        help='Baseline vcf to provide evaluation metrics against')
    parser.add_argument(
        '--baseline-bed', type=str,
        help='Baseline bed file to provide evaluation metrics against. Must have '
             'header beginning with "' + BED_FILE_HEADER_CHAR + '" and the following columns: "' +
             '", "'.join([BED_FILE_CHROM_COL, BED_FILE_START_COL,
                          BED_FILE_END_COL, BED_FILE_SVTYPE_COL]) + '"')
    parser.add_argument(
        '--min-reciprocal-overlap', type=float, default=0.5,
        help='Minimum reciprocal overlap for validation metrics [0.5]')
    parser.add_argument('--padding', type=int, default=50,
                        help='Interval padding for validation metrics [50]')
    parser.add_argument(
        '--max-warnings', type=int, default=50,
        help='Maximum number of records to print warnings for [50]')
    parser.add_argument('--fp-file', type=str, default=None,
                        help='Write false positives to file')
    parser.add_argument('--fn-file', type=str, default=None,
                        help='Write false negatives to file')
    parser.add_argument('--fp-pass-file', type=str, default=None,
                        help='Write PASS false positives to file')
    parser.add_argument('--fn-pass-file', type=str, default=None,
                        help='Write PASS false negatives to file')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if (args.baseline_vcf is None and args.baseline_bed is None) \
            and (args.fp_file is not None or args.fn_file is not None):
        raise ValueError(
            "FP and FN files cannot be generated if --baseline-vcf and "
            "--baseline-bed aren't specified")
    if args.baseline_vcf is not None and args.baseline_bed is not None:
        raise ValueError("Cannot specify both --baseline-vcf and --baseline-bed")

    types_list = args.types.split(',')
    contigs = iou.read_contig_list(args.contig_list)
    samples = iou.read_samples_list(args.sample_list)

    metrics, fp_intervals, fn_intervals, fp_intervals_pass, fn_intervals_pass = get_metrics(
        args.test_vcf, args.baseline_vcf, args.baseline_bed, contigs, types_list,
        args.min_reciprocal_overlap, args.padding, samples, args.metric_prefix,
        args.max_warnings)

    # Write metrics
    write_metrics(metrics)
    if args.fp_file is not None and fp_intervals is not None:
        write_intervals(args.fp_file, fp_intervals)
    if args.fn_file is not None and fn_intervals is not None:
        write_intervals(args.fn_file, fn_intervals)
    if args.fp_pass_file is not None and fp_intervals_pass is not None:
        write_intervals(args.fp_pass_file, fp_intervals_pass)
    if args.fn_pass_file is not None and fn_intervals_pass is not None:
        write_intervals(args.fn_pass_file, fn_intervals_pass)

def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtest vcf',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('test_vcf', type=str)
    parser.add_argument('contig_list', type=str)
    parser.add_argument('sample_list', type=str)
    parser.add_argument(
        'types', type=str,
        help='Comma-delimited list of variant types (case-sensitive)')
    parser.add_argument('metric_prefix', type=str)
    parser.add_argument(
        '--baseline-vcf', type=str,
        help='Baseline vcf to provide evaluation metrics against')
    parser.add_argument(
        '--min-reciprocal-overlap', type=float, default=0.5,
        help='Minimum reciprocal overlap for validation metrics [0.5]')
    parser.add_argument('--padding', type=int, default=50,
                        help='Interval padding for validation metrics [50]')
    parser.add_argument(
        '--max-warnings', type=int, default=50,
        help='Maximum number of records to print warnings for [50]')
    parser.add_argument('--fp-file', type=str, default=None,
                        help='Write false positives to file')
    parser.add_argument('--fn-file', type=str, default=None,
                        help='Write false negatives to file')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.baseline_vcf is None and (args.fp_file is not None or args.fn_file is not None):
        raise ValueError(
            "FP and FN files cannot be generated if --baseline-vcf isn't specified")

    types_list = args.types.split(',')
    contigs = iou.read_contig_list(args.contig_list)
    samples = iou.read_samples_list(args.sample_list)

    metrics, fp_intervals, fn_intervals = get_metrics(
        args.test_vcf, args.baseline_vcf, contigs, types_list,
        args.min_reciprocal_overlap, args.padding, samples, args.metric_prefix,
        args.max_warnings)

    # Write metrics
    write_metrics(metrics)
    if args.fp_file is not None and fp_intervals is not None:
        write_intervals(args.fp_file, fp_intervals)
    if args.fn_file is not None and fn_intervals is not None:
        write_intervals(args.fn_file, fn_intervals)

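# Illustrative sketch (not part of the original module): a common definition
# of reciprocal overlap, which is presumably what --min-reciprocal-overlap
# controls when matching test records against the baseline. Two intervals
# pass at threshold ro if their intersection covers at least fraction ro of
# each interval. The helper name and signature are hypothetical.
def _example_reciprocal_overlap(a_start, a_end, b_start, b_end, ro=0.5):
    # Length of the intersection; non-positive means no overlap at all
    intersection = min(a_end, b_end) - max(a_start, b_start)
    if intersection <= 0:
        return False
    return (intersection >= ro * (a_end - a_start)
            and intersection >= ro * (b_end - b_start))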