def test_fromFiles_TwoFilesReturnsValidPrecisionReport(self):
    contents_1 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;	>GT_CONF=1;	unmapped
CFT073	>CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;	>CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133;	primary_correct
CFT073	>CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;	>GT_CONF=3;	unmapped
"""
    contents_2 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1260;IV=[50,60);PVID=4;NB_ALL=4;ALL_ID=4;NB_DIFF_ALL_SEQ=4;ALL_SEQ_ID=4;	>CHROM=GC00000578_3;SAMPLE=CFT073;POS=165;IV=[25,29);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=3;GT_CONF=3.22199;	primary_incorrect
CFT073	>CHROM=1;POS=1262;IV=[60,70);PVID=5;NB_ALL=5;ALL_ID=5;NB_DIFF_ALL_SEQ=5;ALL_SEQ_ID=5;	>GT_CONF=5;	unmapped
CFT073	>CHROM=1;POS=1281;IV=[70,80);PVID=6;NB_ALL=6;ALL_ID=6;NB_DIFF_ALL_SEQ=6;ALL_SEQ_ID=6;	>GT_CONF=6;	unmapped
"""
    path_1 = create_tmp_file(contents_1)
    path_2 = create_tmp_file(contents_2)

    contents_1_input = StringIO(contents_1)
    contents_2_input = StringIO(contents_2)
    dataframes = [
        pd.read_csv(contents_1_input, sep="\t", keep_default_na=False),
        pd.read_csv(contents_2_input, sep="\t", keep_default_na=False),
    ]

    actual = PrecisionReport.from_files([path_1, path_2])
    expected = PrecisionReport(dataframes)

    path_1.unlink()
    path_2.unlink()

    assert actual == expected
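# The test above relies on a `create_tmp_file` helper that is not shown in
# this excerpt. A minimal sketch of what such a helper could look like,
# assuming it only needs to write the string to a temporary file and return a
# pathlib.Path (the behaviour is inferred from how the test calls it, not
# taken from the project):
import tempfile
from pathlib import Path


def create_tmp_file(contents: str) -> Path:
    """Write `contents` to a temporary file and return its path."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh:
        fh.write(contents)
    return Path(fh.name)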
def test_calculatePrecision_OneReportWithThreeRowsTwoPartiallyCorrectOneBelowThreshold(
        self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    df = pd.DataFrame(
        data=[
            create_precision_report_row(0.4, gt_conf=100),
            create_precision_report_row(0.8, gt_conf=20),
            create_precision_report_row(0.3, gt_conf=100),
        ],
        columns=columns,
    )
    report = PrecisionReport([df])
    calculator = PrecisionCalculator(report)
    confidence_threshold = 80

    actual = calculator._calculate_precision_for_a_given_confidence(
        confidence_threshold)

    assert actual.precision == 0.7 / 2.0
    assert actual.true_positives == 0.7
    assert actual.total == 2.0
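# `create_precision_report_row` is another helper that is not shown here. A
# hedged reconstruction based on the column layout of the TSV in the first
# test: the first argument is treated as the row's correctness score and
# gt_conf is embedded in the ref probe header. Both choices are assumptions,
# not the project's actual implementation.
import pandas as pd


def create_precision_report_row(correctness: float, gt_conf: float = 0) -> pd.Series:
    return pd.Series(
        data={
            "sample": "CFT073",
            "query_probe_header": ">CHROM=1;POS=1;IV=[0,10);",
            "ref_probe_header": f">GT_CONF={gt_conf};",
            "classification": correctness,
        }
    )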
def test_calculatePrecision_NoReportsRaisesEmptyReportError(self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    df = pd.DataFrame(columns=columns)
    report = PrecisionReport([df])
    calculator = PrecisionCalculator(report)

    with pytest.raises(EmptyReportError):
        calculator._calculate_precision_for_a_given_confidence()
def test_calculatePrecision_OneReportWithOneRowCompletelyCorrectBelowConfThresholdRaisesEmptyReportError(
        self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    df = pd.DataFrame(
        data=[create_precision_report_row(1.0, gt_conf=10)], columns=columns
    )
    report = PrecisionReport([df])
    calculator = PrecisionCalculator(report)
    confidence_threshold = 60

    with pytest.raises(EmptyReportError):
        calculator._calculate_precision_for_a_given_confidence(
            confidence_threshold)
def test_calculatePrecision_OneReportWithOneRowCompletelyIncorrectReturnsZero(
        self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    df = pd.DataFrame(
        data=[create_precision_report_row(0.0, gt_conf=100)], columns=columns
    )
    report = PrecisionReport([df])
    calculator = PrecisionCalculator(report)

    actual = calculator._calculate_precision_for_a_given_confidence()

    assert actual.precision == 0.0
    assert actual.true_positives == 0.0
    assert actual.total == 1.0
def test_calculatePrecision_OneReportWithOneRowCompletelyCorrectEqualConfThresholdReturnsOne(
        self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    df = pd.DataFrame(
        data=[create_precision_report_row(1.0, gt_conf=60)], columns=columns
    )
    report = PrecisionReport([df])
    calculator = PrecisionCalculator(report)
    confidence_threshold = 60

    actual = calculator._calculate_precision_for_a_given_confidence(
        confidence_threshold)

    assert actual.precision == 1.0
    assert actual.true_positives == 1.0
    assert actual.total == 1.0
def test_init_gtconfIsExtractedCorrectly(self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    dfs = pd.DataFrame(
        data=[
            create_precision_report_row(0.0, gt_conf=100),
            create_precision_report_row(0.0, gt_conf=100),
            create_precision_report_row(0.0, gt_conf=10),
            create_precision_report_row(0.0, gt_conf=100),
        ],
        columns=columns,
    )
    report = PrecisionReport([dfs])

    actual = report.report.GT_CONF
    expected = pd.Series([100.0, 100.0, 10.0, 100.0])

    assert actual.equals(expected)
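# The test above expects a numeric GT_CONF column to be derived from probe
# headers such as ">...;GT_CONF=60.1133;". A minimal sketch of how that value
# could be parsed from a header string; the regex and the idea of reading it
# from the header text are assumptions based on the sample data above, not the
# project's code.
import re


def extract_gt_conf(probe_header: str) -> float:
    match = re.search(r"GT_CONF=([0-9]*\.?[0-9]+)", probe_header)
    return float(match.group(1)) if match else 0.0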
def test_calculatePrecision_OneReportWithTwoRowsPartiallyCorrect(self):
    columns = [
        "sample", "query_probe_header", "ref_probe_header", "classification"
    ]
    df = pd.DataFrame(
        data=[
            create_precision_report_row(0.5, gt_conf=100),
            create_precision_report_row(0.7, gt_conf=100),
        ],
        columns=columns,
    )
    report = PrecisionReport([df])
    calculator = PrecisionCalculator(report)

    actual = calculator._calculate_precision_for_a_given_confidence()

    assert actual.precision == 1.2 / 2
    assert actual.true_positives == 1.2
    assert actual.total == 2.0
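# Taken together, the calculator tests pin down the behaviour of
# `_calculate_precision_for_a_given_confidence`: drop rows whose GT_CONF is
# below the threshold, raise EmptyReportError if nothing survives, sum the
# per-row correctness scores as true positives, and divide by the number of
# surviving rows. The sketch below illustrates that behaviour; the
# `PrecisionInfo` container and the function name are illustrative
# assumptions, not the project's implementation.
from dataclasses import dataclass

import pandas as pd


class EmptyReportError(Exception):
    """Raised when no rows remain after filtering on confidence."""


@dataclass
class PrecisionInfo:
    true_positives: float
    total: float

    @property
    def precision(self) -> float:
        return self.true_positives / self.total


def calculate_precision(report: pd.DataFrame, conf_threshold: float = 0) -> PrecisionInfo:
    # Keep only calls whose genotype confidence meets the threshold.
    confident = report[report["GT_CONF"] >= conf_threshold]
    if confident.empty:
        raise EmptyReportError("no rows at or above the confidence threshold")
    # Each row contributes its (possibly partial) correctness score.
    return PrecisionInfo(
        true_positives=float(confident["classification"].sum()),
        total=float(len(confident)),
    )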
import logging
from pathlib import Path

import pandas as pd

# PrecisionReport and PrecisionCalculator come from the project's evaluation
# library; their import statement is not shown in this excerpt.

# setup
precision_report_files_for_all_samples = (
    snakemake.input.precision_report_files_for_all_samples)
output = Path(snakemake.output.precision_file_for_all_samples)
gt_conf_percentiles = snakemake.params.gt_conf_percentiles
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold

# API usage
logging.info("Loading report")
precision_report = PrecisionReport.from_files(
    precision_report_files_for_all_samples)

logging.info("Creating calculator")
precision_calculator = PrecisionCalculator(precision_report)

logging.info("Calculating precision")
precision_df = precision_calculator.get_precision_report(gt_conf_percentiles)

metadata_df = pd.DataFrame(
    data={
        "tool": [tool] * len(precision_df),
        "coverage": [coverage] * len(precision_df),
        "coverage_threshold": [coverage_threshold] * len(precision_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(precision_df),
        "gaps_threshold": [gaps_threshold] * len(precision_df),
    })
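# Both Snakemake scripts in this excerpt stop before anything is written to
# `output`. Presumably the metadata and precision frames are combined and
# saved; a hedged sketch of that final step, assuming a simple column-wise
# concat and tab-separated output (the real output format may differ):
output_df = pd.concat([metadata_df, precision_df], axis=1)
output_df.to_csv(output, sep="\t", index=False)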
import logging
from pathlib import Path

import pandas as pd

# setup
precision_report_files_for_one_sample = (
    snakemake.input.precision_report_files_for_one_sample)
output = Path(snakemake.output.precision_file_for_one_sample)
sample = snakemake.wildcards.sample
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold
gt_conf_percentiles = [0]

# API usage
logging.info("Loading report")
precision_report = PrecisionReport.from_files(
    precision_report_files_for_one_sample)

logging.info("Creating calculator")
precision_calculator = PrecisionCalculator(precision_report)

logging.info("Calculating precision")
precision_df = precision_calculator.get_precision_report(gt_conf_percentiles)

metadata_df = pd.DataFrame(
    data={
        "sample": [sample] * len(precision_df),
        "tool": [tool] * len(precision_df),
        "coverage": [coverage] * len(precision_df),
        "coverage_threshold": [coverage_threshold] * len(precision_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(precision_df),
        "gaps_threshold": [gaps_threshold] * len(precision_df),
    })