def test_fromFiles_TwoFilesReturnsValidRecallReport(self):
        """Check that ``PrecisionReport.from_files`` matches an in-memory report.

        Two tab-separated report files are written to temporary paths and
        loaded with ``PrecisionReport.from_files``. The result must equal a
        ``PrecisionReport`` built from the same text parsed by
        ``pd.read_csv`` (tab separator, ``keep_default_na=False``).

        NOTE: the test name says "RecallReport" although the class under test
        is ``PrecisionReport``; the name is kept so existing test selections
        keep working.
        """
        contents_1 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;	>GT_CONF=1;	unmapped
CFT073	>CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;	>CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133;	primary_correct
CFT073	>CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;	>GT_CONF=3;	unmapped
"""
        contents_2 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1260;IV=[50,60);PVID=4;NB_ALL=4;ALL_ID=4;NB_DIFF_ALL_SEQ=4;ALL_SEQ_ID=4;	>CHROM=GC00000578_3;SAMPLE=CFT073;POS=165;IV=[25,29);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=3;GT_CONF=3.22199;	primary_incorrect
CFT073	>CHROM=1;POS=1262;IV=[60,70);PVID=5;NB_ALL=5;ALL_ID=5;NB_DIFF_ALL_SEQ=5;ALL_SEQ_ID=5;	>GT_CONF=5;	unmapped
CFT073	>CHROM=1;POS=1281;IV=[70,80);PVID=6;NB_ALL=6;ALL_ID=6;NB_DIFF_ALL_SEQ=6;ALL_SEQ_ID=6;	>GT_CONF=6;	unmapped
"""
        # The expected report only needs the in-memory text, so build it
        # before any temporary file exists.
        expected = PrecisionReport([
            pd.read_csv(StringIO(contents), sep="\t", keep_default_na=False)
            for contents in (contents_1, contents_2)
        ])

        paths = []
        try:
            for contents in (contents_1, contents_2):
                paths.append(create_tmp_file(contents))
            actual = PrecisionReport.from_files(paths)
        finally:
            # Remove every file that was created, even if loading raised,
            # so a failing run does not leave temporary files behind.
            for path in paths:
                path.unlink()

        assert actual == expected
    def test_calculatePrecision_OneReportWithThreeRowsTwoPartiallyCorrectOneBelowThreshold(
            self):
        """Three rows, one of them below the confidence threshold.

        With a threshold of 80 the test expects only the two ``gt_conf=100``
        rows (built with 0.4 and 0.3) to be counted, giving 0.7 true
        positives out of a total of 2; the ``gt_conf=20`` row is expected to
        be left out.
        """
        rows = [
            create_precision_report_row(score, gt_conf=conf)
            for score, conf in ((0.4, 100), (0.8, 20), (0.3, 100))
        ]
        header = [
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ]
        calculator = PrecisionCalculator(
            PrecisionReport([pd.DataFrame(data=rows, columns=header)]))

        result = calculator._calculate_precision_for_a_given_confidence(80)

        expected_precision = 0.7 / 2.0
        expected_true_positives = 0.7
        expected_total = 2.0
        assert result.precision == expected_precision
        assert result.true_positives == expected_true_positives
        assert result.total == expected_total
    def test_calculatePrecision_NoReportsRaisesEmptyReportError(self):
        """A report with the expected columns but no rows must raise.

        The precision calculation is called without an explicit confidence
        threshold and is expected to raise ``EmptyReportError``.
        """
        empty_frame = pd.DataFrame(columns=[
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ])
        calculator = PrecisionCalculator(PrecisionReport([empty_frame]))

        with pytest.raises(EmptyReportError):
            calculator._calculate_precision_for_a_given_confidence()
    def test_calculatePrecision_OneReportWithOneRowCompletelyCorrectBelowConfThreasholdRaisesEmptyReportError(
            self):
        """A single row under the confidence threshold must raise.

        The only row has ``gt_conf=10`` and the threshold is 60, so the
        calculation is expected to raise ``EmptyReportError``.
        """
        header = [
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ]
        single_row = [create_precision_report_row(1.0, gt_conf=10)]
        frame = pd.DataFrame(data=single_row, columns=header)
        calculator = PrecisionCalculator(PrecisionReport([frame]))

        with pytest.raises(EmptyReportError):
            calculator._calculate_precision_for_a_given_confidence(60)
    def test_calculatePrecision_OneReportWithOneRowCompletelyIncorrectReturnsZero(
            self):
        """A single row built with 0.0 must yield a precision of zero.

        The calculation is called without an explicit confidence threshold;
        the test expects zero true positives out of a total of one.
        """
        header = [
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ]
        single_row = [create_precision_report_row(0.0, gt_conf=100)]
        frame = pd.DataFrame(data=single_row, columns=header)
        calculator = PrecisionCalculator(PrecisionReport([frame]))

        result = calculator._calculate_precision_for_a_given_confidence()

        assert result.precision == 0.0
        assert result.true_positives == 0.0
        assert result.total == 1.0
    def test_calculatePrecision_OneReportWithOneRowCompletelyCorrectEqualConfThreasholdReturnsOne(
            self):
        """A row whose gt_conf equals the threshold is expected to be counted.

        The only row has ``gt_conf=60`` and the threshold is also 60; the
        test expects one true positive out of a total of one.
        """
        header = [
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ]
        single_row = [create_precision_report_row(1.0, gt_conf=60)]
        frame = pd.DataFrame(data=single_row, columns=header)
        calculator = PrecisionCalculator(PrecisionReport([frame]))

        result = calculator._calculate_precision_for_a_given_confidence(60)

        assert result.precision == 1.0
        assert result.true_positives == 1.0
        assert result.total == 1.0
    def test_init_gtconfIsExtractedCorrectly(self):
        """The report's ``GT_CONF`` column must hold the per-row gt_conf values.

        Four rows are built with gt_conf 100, 100, 10 and 100; the test
        expects ``report.report.GT_CONF`` to equal the float series
        ``[100.0, 100.0, 10.0, 100.0]``.
        """
        header = [
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ]
        rows = [
            create_precision_report_row(0.0, gt_conf=conf)
            for conf in (100, 100, 10, 100)
        ]
        report = PrecisionReport([pd.DataFrame(data=rows, columns=header)])

        extracted = report.report.GT_CONF
        wanted = pd.Series([100.0, 100.0, 10.0, 100.0])

        assert extracted.equals(wanted)
    def test_calculatePrecision_OneReportWithTwoRowsPartiallyCorrect(self):
        """Two rows built with 0.5 and 0.7 are both expected to be counted.

        The calculation is called without an explicit confidence threshold;
        the test expects 1.2 true positives out of a total of two.
        """
        header = [
            "sample",
            "query_probe_header",
            "ref_probe_header",
            "classification",
        ]
        rows = [
            create_precision_report_row(score, gt_conf=100)
            for score in (0.5, 0.7)
        ]
        calculator = PrecisionCalculator(
            PrecisionReport([pd.DataFrame(data=rows, columns=header)]))

        result = calculator._calculate_precision_for_a_given_confidence()

        expected_precision = 1.2 / 2
        assert result.precision == expected_precision
        assert result.true_positives == 1.2
        assert result.total == 2.0
import pandas as pd

# setup: read this rule's inputs, outputs, params and wildcards from snakemake
precision_report_files_for_all_samples = snakemake.input.precision_report_files_for_all_samples
output = Path(snakemake.output.precision_file_for_all_samples)
gt_conf_percentiles = snakemake.params.gt_conf_percentiles
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold

# API usage: load the reports, wrap them in a calculator, compute precision
logging.info("Loading report")
precision_report = PrecisionReport.from_files(precision_report_files_for_all_samples)

logging.info("Creating calculator")
precision_calculator = PrecisionCalculator(precision_report)

logging.info("Calculating precision")
precision_df = precision_calculator.get_precision_report(gt_conf_percentiles)

# One metadata row per precision row: each column repeats a single wildcard
# value len(precision_df) times.
metadata_df = pd.DataFrame(
    data={
        column: [value] * len(precision_df)
        for column, value in (
            ("tool", tool),
            ("coverage", coverage),
            ("coverage_threshold", coverage_threshold),
            ("strand_bias_threshold", strand_bias_threshold),
            ("gaps_threshold", gaps_threshold),
        )
    }
)
# Example #10
# 0
# setup: read this rule's inputs, outputs and wildcards from snakemake
precision_report_files_for_one_sample = snakemake.input.precision_report_files_for_one_sample
output = Path(snakemake.output.precision_file_for_one_sample)
sample = snakemake.wildcards.sample
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold
# A single fixed value is used here instead of a snakemake parameter.
gt_conf_percentiles = [0]

# API usage: load the reports, wrap them in a calculator, compute precision
logging.info("Loading report")
precision_report = PrecisionReport.from_files(precision_report_files_for_one_sample)

logging.info("Creating calculator")
precision_calculator = PrecisionCalculator(precision_report)

logging.info("Calculating precision")
precision_df = precision_calculator.get_precision_report(gt_conf_percentiles)

metadata_df = pd.DataFrame(
    data={
        "sample": [sample] * len(precision_df),
        "tool": [tool] * len(precision_df),
        "coverage": [coverage] * len(precision_df),
        "coverage_threshold": [coverage_threshold] * len(precision_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(precision_df),
        "gaps_threshold": [gaps_threshold] * len(precision_df),