def test___get_proportion_of_alleles_found_for_each_variant_with_nb_of_samples(
            self, *mocks):
        """Check the per-PVID allele proportions reported with NB_OF_SAMPLES.

        A report is built from an inline CSV covering PVIDs 0-3, and the frame
        returned by
        ``get_proportion_of_alleles_found_for_each_variant_with_nb_of_samples``
        is compared against a reference table indexed by ``PVID``.

        ``*mocks`` absorbs extra positional arguments (presumably injected by
        patch decorators that are not visible in this excerpt).
        """
        raw_report = """sample,query_probe_header,PVID,ALL_SEQ_ID,NB_DIFF_ALL_SEQ,good_eval,ALL_ID,NB_ALL,NB_OF_SAMPLES
            S1,1,2,0,10,True,4,5,20
            S2,2,0,2,1,False,0,1,1
            S3,3,1,1,3,True,5,10,10
            S4,4,0,2,1,True,0,1,1
            S5,5,1,1,3,False,4,10,10
            S6,6,1,2,3,False,9,10,10
            S7,7,2,1,10,True,3,5,20
            S8,8,1,2,3,True,8,10,10
            S1,9,2,2,10,True,2,5,20
            S1,10,0,2,1,False,0,1,1
            S1,11,2,3,10,True,1,5,20
            S1,12,1,3,3,False,7,10,10
            S1,13,2,4,10,True,3,5,20
            S1,14,2,5,10,True,1,5,20
            S1,15,2,6,10,True,2,5,20
            S1,16,3,0,2,False,0,3,30
            S1,17,3,1,2,False,0,3,30
            """
        reference_csv = """PVID,proportion_of_alleles_found,NB_OF_SAMPLES
            0,1.0,1
            1,0.2,10
            2,0.8,20
            3,0.0,30
            """

        input_frame = pd.read_csv(StringIO(raw_report))
        recall_report = RecallReport([input_frame], False)

        observed = recall_report.get_proportion_of_alleles_found_for_each_variant_with_nb_of_samples()
        reference = pd.read_csv(StringIO(reference_csv), index_col="PVID")

        assert observed.equals(reference)
    def test____get_id_to_nb_of_different_allele_sequences(self, *mocks):
        contents = StringIO(
            """sample,query_probe_header,PVID,NB_DIFF_ALL_SEQ,good_eval
S1,0,2,10,True
S2,1,0,1,False
S3,2,1,3,True
S4,3,0,1,True
S5,4,1,3,False
S6,5,1,3,False
S7,6,2,10,True
S8,7,1,3,True
S1,8,2,10,True
S1,9,0,1,False
S1,10,2,10,True
S1,11,1,3,False
S1,12,2,10,False
S1,13,2,10,False
S1,14,2,10,False
S1,15,3,2,False
S1,16,3,2,False
""")
        report = RecallReport([pd.read_csv(contents)], False)
        actual = report._get_id_to_nb_of_different_allele_sequences()
        expected = pd.read_csv(StringIO("""PVID,NB_DIFF_ALL_SEQ
0,1
1,3
2,10
3,2
"""),
                               index_col="PVID")

        assert actual.equals(expected)
    def test___get_proportion_of_alleles_found_for_each_variant(self, *mocks):
        contents = StringIO(
            """sample,query_probe_header,PVID,ALL_ID,NB_ALL,good_eval,ALL_SEQ_ID,NB_DIFF_ALL_SEQ
S1,0,2,0,10,True,0,0
S2,1,0,2,1,False,0,0
S3,2,1,1,3,True,0,0
S4,3,0,2,1,True,0,0
S5,4,1,1,3,False,0,0
S6,5,1,2,3,False,0,0
S7,6,2,1,10,True,0,0
S8,7,1,2,3,True,0,0
S1,8,2,2,10,True,0,0
S1,9,0,2,1,False,0,0
S1,10,2,3,10,True,0,0
S1,11,1,3,3,False,0,0
S1,12,2,4,10,False,0,0
S1,13,2,5,10,False,0,0
S1,14,2,6,10,False,0,0
S1,15,3,0,2,False,0,0
S1,16,3,1,2,False,0,0
""")
        report = RecallReport([pd.read_csv(contents)], False)
        actual = report.get_proportion_of_alleles_found_for_each_variant()
        expected = [1 / 1, 2 / 3, 4 / 10, 0 / 2]

        assert actual == expected
    def test_fromFiles_TwoFilesReturnsValidRecallReport(self):
        contents_1 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;	>GT_CONF=1;	unmapped
CFT073	>CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;	>CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133;	primary_correct
CFT073	>CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;	>GT_CONF=3;	unmapped
"""
        contents_2 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1260;IV=[50,60);PVID=4;NB_ALL=4;ALL_ID=4;NB_DIFF_ALL_SEQ=4;ALL_SEQ_ID=4;	>CHROM=GC00000578_3;SAMPLE=CFT073;POS=165;IV=[25,29);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=3;GT_CONF=3.22199;	primary_incorrect
CFT073	>CHROM=1;POS=1262;IV=[60,70);PVID=5;NB_ALL=5;ALL_ID=5;NB_DIFF_ALL_SEQ=5;ALL_SEQ_ID=5;	>GT_CONF=5;	unmapped
CFT073	>CHROM=1;POS=1281;IV=[70,80);PVID=6;NB_ALL=6;ALL_ID=6;NB_DIFF_ALL_SEQ=6;ALL_SEQ_ID=6;	>GT_CONF=6;	unmapped
"""
        path_1 = create_tmp_file(contents_1)
        path_2 = create_tmp_file(contents_2)

        contents_1_input = StringIO(contents_1)
        contents_2_input = StringIO(contents_2)
        dataframes = [
            pd.read_csv(contents_1_input, sep="\t", keep_default_na=False),
            pd.read_csv(contents_2_input, sep="\t", keep_default_na=False),
        ]

        actual = RecallReport.from_files([path_1, path_2])
        expected = RecallReport(dataframes)

        path_1.unlink()
        path_2.unlink()

        assert actual == expected
    def test___get_proportion_of_alleles_found_for_each_variant___duplicated_evaluation_is_disregarded(
            self, *mocks):
        """Check that repeated (PVID, ALL_ID) evaluations are not double counted.

        All five rows belong to PVID 0 with NB_ALL=5 and only use ALL_ID 0 and
        1; the reference result is the single fraction 2/5.

        ``*mocks`` absorbs extra positional arguments (presumably injected by
        patch decorators that are not visible in this excerpt).
        """
        raw_report = """sample,query_probe_header,PVID,ALL_ID,NB_ALL,good_eval,ALL_SEQ_ID,NB_DIFF_ALL_SEQ
            S1,1,0,0,5,True,0,0
            S1,2,0,1,5,True,0,0
            S1,3,0,0,5,True,0,0
            S1,4,0,0,5,True,0,0
            S1,5,0,1,5,True,0,0
            """
        input_frame = pd.read_csv(StringIO(raw_report))
        recall_report = RecallReport([input_frame], False)

        observed = recall_report.get_proportion_of_alleles_found_for_each_variant()
        reference = [2 / 5]

        assert observed == reference
 def test____calculate_info_wrt_variants(self, *mocks):
     """Check the three values returned by _calculate_info_wrt_variants.

     The report wraps an empty DataFrame, so the expected numbers (6.3, 4.6
     and 20) presumably come from mocked report methods set up by decorators
     that are not visible in this excerpt.
     """
     empty_report = RecallReport([pd.DataFrame()], False)

     variants_info = RecallCalculator._calculate_info_wrt_variants(empty_report)
     all_seqs_found, found_wrt_alleles, total = variants_info

     # Checked one at a time, in the same order as the original conjunction.
     assert all_seqs_found == 6.3
     assert found_wrt_alleles == 4.6
     assert total == 20
    def test____calculate_recall_for_a_given_confidence(
            self, calculate_info_wrt_variants_mock,
            calculate_info_wrt_truth_probes_mock,
            get_report_satisfying_confidence_threshold_mock, *other_mocks):
        """Check how _calculate_recall_for_a_given_confidence wires its helpers.

        The confidence threshold (100) must be forwarded to the
        report-filtering mock, and the filtered report it returns must be the
        sole argument of both info-calculation mocks. The numeric expectations
        depend on the return values configured on those mocks, which are set up
        outside this excerpt.
        """
        # setup
        filtered_report = Mock()
        get_report_satisfying_confidence_threshold_mock.return_value = filtered_report
        calculator = RecallCalculator(RecallReport([pd.DataFrame()], False))

        recall_info = calculator._calculate_recall_for_a_given_confidence(100)

        # Each collaborator is called exactly once with the expected argument.
        get_report_satisfying_confidence_threshold_mock.assert_called_once_with(100)
        calculate_info_wrt_truth_probes_mock.assert_called_once_with(filtered_report)
        calculate_info_wrt_variants_mock.assert_called_once_with(filtered_report)

        # Raw counts.
        assert recall_info.truth_probes_true_positives == 5
        assert recall_info.truth_probes_total == 10
        assert recall_info.nb_variants_where_all_allele_seqs_were_found == 4
        assert recall_info.nb_variants_found_wrt_alleles == 8
        assert recall_info.variants_total == 10
        # Derived recall values.
        assert recall_info.recall_wrt_truth_probes == 0.5
        assert recall_info.recall_wrt_variants_where_all_allele_seqs_were_found == 0.4
        assert recall_info.recall_wrt_variants_found_wrt_alleles == 0.8
    def test____get_id_to_nb_of_samples(self, *mocks):
        """Check the PVID -> NB_OF_SAMPLES table derived from a report.

        The input lists PVIDs in the order 2, 0, 1; the reference table is
        indexed by ``PVID`` and lists them as 0, 1, 2.

        ``*mocks`` absorbs extra positional arguments (presumably injected by
        patch decorators that are not visible in this excerpt).
        """
        raw_report = """sample,query_probe_header,PVID,NB_OF_SAMPLES
            S1,0,2,3
            S2,1,0,4
            S3,2,1,10
            """
        reference_csv = """PVID,NB_OF_SAMPLES
            0,4
            1,10
            2,3
            """

        input_frame = pd.read_csv(StringIO(raw_report))
        recall_report = RecallReport([input_frame], False)

        observed = recall_report._get_id_to_nb_of_samples()
        reference = pd.read_csv(StringIO(reference_csv), index_col="PVID")

        assert observed.equals(reference)
    def test_checkIfOnlyBestMappingIsKept_hasNoCorrectMapping_ChoosesTheOneWithHighestGTConf(
            self):
        """Check which row survives when no mapping of a probe is correct.

        Five rows for ``truth_probe_1`` with non-correct classifications and
        distinct GT_CONF values are given to ``RecallReport``; the resulting
        ``report`` must contain only the row with GT_CONF 150
        (PRIMARY_INCORRECT).
        """
        candidate_mappings = [
            (AlignmentAssessment.UNMAPPED, 100),
            (AlignmentAssessment.PARTIALLY_MAPPED, 140),
            (AlignmentAssessment.PRIMARY_INCORRECT, 150),
            (AlignmentAssessment.SECONDARY_INCORRECT, 110),
            (AlignmentAssessment.SUPPLEMENTARY_INCORRECT, 120),
        ]
        input_rows = [
            create_recall_report_row("truth_probe_1",
                                     assessment,
                                     gt_conf=confidence,
                                     with_gt_conf=True)
            for assessment, confidence in candidate_mappings
        ]
        input_frame = pd.DataFrame(data=input_rows)

        observed = RecallReport([input_frame]).report

        best_row = create_recall_report_row(
            "truth_probe_1",
            AlignmentAssessment.PRIMARY_INCORRECT,
            gt_conf=150,
            with_gt_conf=True)
        reference = pd.DataFrame(data=[best_row])

        assert_frame_equal(observed, reference, check_dtype=False)
    def test___get_recall_allele_seqs_vs_nb_of_samples_report___return_only_the_samples_given_in_parameter(
            self, *mocks):
        """Check that only the requested NB_OF_SAMPLES values are reported.

        The calculator is asked for sample counts 2 and 5, and the reference
        table has exactly those two rows with a ``recall_PVR`` column. The
        report wraps an empty DataFrame, so the recall values presumably come
        from mocks configured outside this excerpt.
        """
        reference_csv = """NB_OF_SAMPLES,recall_PVR
            2,0.0
            5,0.25
            """
        calculator = RecallCalculator(RecallReport([pd.DataFrame()], False))

        requested_sample_counts = [2, 5]
        observed = calculator.get_recall_allele_seqs_vs_nb_of_samples_report(
            requested_sample_counts)
        reference = pd.read_csv(StringIO(reference_csv))

        assert observed.equals(reference)
    def test___get_recall_alleles_vs_nb_of_samples_report(self, *mocks):
        """Check the recall_AvgAR table for sample counts 2 through 7.

        The report wraps an empty DataFrame, so the recall values in the
        reference table presumably come from mocks configured outside this
        excerpt.
        """
        reference_csv = """NB_OF_SAMPLES,recall_AvgAR
            2,0.0
            3,0.95
            4,0.0
            5,0.55
            6,0.0
            7,0.6
            """
        calculator = RecallCalculator(RecallReport([pd.DataFrame()], False))

        requested_sample_counts = list(range(2, 8))
        observed = calculator.get_recall_alleles_vs_nb_of_samples_report(
            requested_sample_counts)
        reference = pd.read_csv(StringIO(reference_csv))

        assert observed.equals(reference)
Exemplo n.º 12
0
    def test_checkIfOnlyBestMappingIsKept_hasPrimaryMapping_and_several_dfs(
            self):
        """Check which row survives when mappings are spread over several frames.

        Six rows for ``truth_probe_1``, all with GT_CONF 100 and one per
        classification, are split over three DataFrames; the resulting
        ``report`` must contain only the PRIMARY_CORRECT row.
        """

        def build_frame(*assessments):
            # One row per classification, all for the same probe and GT_CONF.
            return pd.DataFrame(data=[
                create_recall_report_row("truth_probe_1",
                                         assessment,
                                         gt_conf=100,
                                         with_gt_conf=True)
                for assessment in assessments
            ])

        first_frame = build_frame(AlignmentAssessment.UNMAPPED,
                                  AlignmentAssessment.PARTIALLY_MAPPED)
        second_frame = build_frame(AlignmentAssessment.PRIMARY_INCORRECT,
                                   AlignmentAssessment.SECONDARY_INCORRECT)
        third_frame = build_frame(AlignmentAssessment.SUPPLEMENTARY_INCORRECT,
                                  AlignmentAssessment.PRIMARY_CORRECT)

        observed = RecallReport([first_frame, second_frame, third_frame]).report
        reference = build_frame(AlignmentAssessment.PRIMARY_CORRECT)

        assert_frame_equal(observed, reference, check_dtype=False)
Exemplo n.º 13
0
    def test_init(self):
        contents_1 = """sample	query_probe_header	ref_probe_header	classification
CFT073	>CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;NB_OF_SAMPLES=10;	>GT_CONF=1;	unmapped
CFT073	>CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;NB_OF_SAMPLES=20;	>CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133;	primary_correct
CFT073	>CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;NB_OF_SAMPLES=30;	>GT_CONF=3;	unmapped
"""
        contents_1_input = StringIO(contents_1)
        dataframes = [
            pd.read_csv(contents_1_input, sep="\t", keep_default_na=False)
        ]
        report = RecallReport(dataframes)
        actual_df = report.report
        expected_df = pd.read_csv(StringIO(
            """sample	query_probe_header	ref_probe_header	classification	GT_CONF	PVID	NB_ALL	ALL_ID	NB_DIFF_ALL_SEQ	ALL_SEQ_ID	NB_OF_SAMPLES	good_eval
CFT073	>CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;NB_OF_SAMPLES=10;	>GT_CONF=1;	unmapped	1.0	1	1	1	1	1	10	False
CFT073	>CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;NB_OF_SAMPLES=20;	>CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133;	primary_correct	60.1133	2	2	2	2	2	20	True
CFT073	>CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;NB_OF_SAMPLES=30;	>GT_CONF=3;	unmapped	3.0	3	3	3	3	3	30	False
"""),
                                  sep="\t")

        assert actual_df.equals(expected_df)
# Inputs and parameters handed over by Snakemake.
recall_report_per_sample_for_calculator = snakemake.input.recall_report_per_sample_for_calculator
gt_conf_percentiles = snakemake.params.gt_conf_percentiles

# Wildcards identifying the run these reports belong to.
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold

# Output path declared by the Snakemake rule.
recall_file_for_all_samples_and_all_gt_conf_percentile = Path(
    snakemake.output.recall_file_for_all_samples_and_all_gt_conf_percentile
)

# API usage
logging.info("Loading report")
# The per-sample reports are concatenated as they are (flag set to False).
recall_report = RecallReport.from_files(
    recall_report_per_sample_for_calculator,
    concatenate_dfs_one_by_one_keeping_only_best_mappings=False,
)

logging.info("Creating calculator")
recall_calculator = RecallCalculator(recall_report)

logging.info("Calculating recall")
recall_df = recall_calculator.get_recall_report(gt_conf_percentiles)

metadata_df = pd.DataFrame(
    data={
        "tool": [tool] * len(recall_df),
        "coverage": [coverage] * len(recall_df),
        "coverage_threshold": [coverage_threshold] * len(recall_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(recall_df),
        "gaps_threshold": [gaps_threshold] * len(recall_df),
 def test____calculate_info_wrt_truth_probes___some_duplicated_classifications(
         self, *mocks):
     """Check the (true positives, total) pair from _calculate_info_wrt_truth_probes.

     The report wraps an empty DataFrame, so the expected counts (6 and 19)
     presumably come from mocked report methods set up by decorators that are
     not visible in this excerpt.
     """
     empty_report = RecallReport([pd.DataFrame()], False)

     truth_probes_info = RecallCalculator._calculate_info_wrt_truth_probes(empty_report)
     true_positive_count, truth_probe_count = truth_probes_info

     # Checked one at a time, in the same order as the original conjunction.
     assert true_positive_count == 6
     assert truth_probe_count == 19
Exemplo n.º 16
0
    def test_simple_concatenation_with_several_dfs(self):
        """Check plain concatenation when best-mapping filtering is disabled.

        Six rows for six distinct probes, all with GT_CONF 100, are split over
        three DataFrames and given to ``RecallReport`` with
        ``concatenate_dfs_one_by_one_keeping_only_best_mappings=False``; the
        resulting ``report`` must hold all six rows in their input order.
        """
        probe_assessments = [
            ("truth_probe_1", AlignmentAssessment.UNMAPPED),
            ("truth_probe_2", AlignmentAssessment.PARTIALLY_MAPPED),
            ("truth_probe_3", AlignmentAssessment.PRIMARY_INCORRECT),
            ("truth_probe_4", AlignmentAssessment.SECONDARY_INCORRECT),
            ("truth_probe_5", AlignmentAssessment.SUPPLEMENTARY_INCORRECT),
            ("truth_probe_6", AlignmentAssessment.PRIMARY_CORRECT),
        ]

        def build_frame(pairs):
            # One row per (probe, classification) pair, all with GT_CONF 100.
            return pd.DataFrame(data=[
                create_recall_report_row(probe,
                                         assessment,
                                         gt_conf=100,
                                         with_gt_conf=True)
                for probe, assessment in pairs
            ])

        # Two consecutive rows go into each of the three input frames.
        input_frames = [
            build_frame(probe_assessments[0:2]),
            build_frame(probe_assessments[2:4]),
            build_frame(probe_assessments[4:6]),
        ]

        recall_report = RecallReport(
            input_frames,
            concatenate_dfs_one_by_one_keeping_only_best_mappings=False)
        observed = recall_report.report
        reference = build_frame(probe_assessments)

        assert_frame_equal(observed, reference, check_dtype=False)
 def test____calculate_info_wrt_truth_probes___one_classification_of_each(
         self, *mocks):
     """Check the (true positives, total) pair from _calculate_info_wrt_truth_probes.

     The report wraps an empty DataFrame, so the expected counts (3 and 8)
     presumably come from mocked report methods set up by decorators that are
     not visible in this excerpt.
     """
     empty_report = RecallReport([pd.DataFrame()], False)

     truth_probes_info = RecallCalculator._calculate_info_wrt_truth_probes(empty_report)
     true_positive_count, truth_probe_count = truth_probes_info

     # Checked one at a time, in the same order as the original conjunction.
     assert true_positive_count == 3
     assert truth_probe_count == 8
Exemplo n.º 18
0
# Input handed over by Snakemake.
all_recall_reports_for_one_sample_with_no_gt_conf_filter = snakemake.input.all_recall_reports_for_one_sample_with_no_gt_conf_filter

# Wildcards identifying the sample and run these reports belong to.
sample = snakemake.wildcards.sample_id
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold

# Output path declared by the Snakemake rule.
recall_file_for_one_sample_with_no_gt_conf_filter = Path(
    snakemake.output.recall_file_for_one_sample_with_no_gt_conf_filter
)


# API usage
logging.info("Loading report")
# Frames are merged one by one, keeping only the best mappings (flag set to True).
recall_report = RecallReport.from_files(
    all_recall_reports_for_one_sample_with_no_gt_conf_filter,
    concatenate_dfs_one_by_one_keeping_only_best_mappings=True,
)

logging.info("Creating calculator")
recall_calculator = RecallCalculator(recall_report)

logging.info("Calculating recall")
# A single threshold of 0 is passed to get_recall_report.
recall_df = recall_calculator.get_recall_report([0])

metadata_df = pd.DataFrame(
    data={
        "tool": [tool] * len(recall_df),
        "coverage": [coverage] * len(recall_df),
        "coverage_threshold": [coverage_threshold] * len(recall_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(recall_df),
        "gaps_threshold": [gaps_threshold] * len(recall_df),
        "sample": [sample] * len(recall_df)
# Make the current working directory importable before loading project modules.
sys.path.append(str(Path().absolute()))
import logging

# Send log records to the Snakemake log file, overwriting any previous content.
log_level = "INFO"
logging.basicConfig(
    filename=str(snakemake.log),
    filemode="w",
    level=log_level,
    format="[%(asctime)s]:%(levelname)s: %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
)
from evaluate.report import RecallReport

# setup
recall_report_files_for_one_sample_and_all_gt_conf_percentiles = snakemake.input.recall_report_files_for_one_sample_and_all_gt_conf_percentiles
recall_report_per_sample_for_calculator = snakemake.output.recall_report_per_sample_for_calculator

# API usage
logging.info("Loading report")
# Frames are merged one by one, keeping only the best mappings (flag set to True).
recall_report = RecallReport.from_files(
    recall_report_files_for_one_sample_and_all_gt_conf_percentiles,
    concatenate_dfs_one_by_one_keeping_only_best_mappings=True,
)

# Write the merged report to the rule's output file.
with open(
    recall_report_per_sample_for_calculator, "w"
) as recall_report_per_sample_for_calculator_filehandler:
    recall_report.save_report(recall_report_per_sample_for_calculator_filehandler)

logging.info("Done")