def test___get_proportion_of_alleles_found_for_each_variant_with_nb_of_samples(
        self, *mocks):
    """The per-variant proportion of alleles found is joined with NB_OF_SAMPLES.

    Only rows flagged good_eval == True count towards a variant's found
    alleles; the result is indexed by PVID.
    """
    # One row per evaluated probe; PVID groups rows into variants.
    raw_report = """sample,query_probe_header,PVID,ALL_SEQ_ID,NB_DIFF_ALL_SEQ,good_eval,ALL_ID,NB_ALL,NB_OF_SAMPLES
S1,1,2,0,10,True,4,5,20
S2,2,0,2,1,False,0,1,1
S3,3,1,1,3,True,5,10,10
S4,4,0,2,1,True,0,1,1
S5,5,1,1,3,False,4,10,10
S6,6,1,2,3,False,9,10,10
S7,7,2,1,10,True,3,5,20
S8,8,1,2,3,True,8,10,10
S1,9,2,2,10,True,2,5,20
S1,10,0,2,1,False,0,1,1
S1,11,2,3,10,True,1,5,20
S1,12,1,3,3,False,7,10,10
S1,13,2,4,10,True,3,5,20
S1,14,2,5,10,True,1,5,20
S1,15,2,6,10,True,2,5,20
S1,16,3,0,2,False,0,3,30
S1,17,3,1,2,False,0,3,30
"""
    recall_report = RecallReport([pd.read_csv(StringIO(raw_report))], False)

    actual = (
        recall_report
        .get_proportion_of_alleles_found_for_each_variant_with_nb_of_samples()
    )

    expected = pd.read_csv(
        StringIO("""PVID,proportion_of_alleles_found,NB_OF_SAMPLES
0,1.0,1
1,0.2,10
2,0.8,20
3,0.0,30
"""),
        index_col="PVID",
    )
    assert actual.equals(expected)
def test____get_id_to_nb_of_different_allele_sequences(self, *mocks):
    """Each PVID maps to its (constant per-variant) NB_DIFF_ALL_SEQ value."""
    # NB_DIFF_ALL_SEQ repeats within a PVID; the helper must deduplicate it.
    raw_report = """sample,query_probe_header,PVID,NB_DIFF_ALL_SEQ,good_eval
S1,0,2,10,True
S2,1,0,1,False
S3,2,1,3,True
S4,3,0,1,True
S5,4,1,3,False
S6,5,1,3,False
S7,6,2,10,True
S8,7,1,3,True
S1,8,2,10,True
S1,9,0,1,False
S1,10,2,10,True
S1,11,1,3,False
S1,12,2,10,False
S1,13,2,10,False
S1,14,2,10,False
S1,15,3,2,False
S1,16,3,2,False
"""
    recall_report = RecallReport([pd.read_csv(StringIO(raw_report))], False)

    actual = recall_report._get_id_to_nb_of_different_allele_sequences()

    expected = pd.read_csv(
        StringIO("""PVID,NB_DIFF_ALL_SEQ
0,1
1,3
2,10
3,2
"""),
        index_col="PVID",
    )
    assert actual.equals(expected)
def test___get_proportion_of_alleles_found_for_each_variant(self, *mocks):
    """Proportion = distinct good_eval ALL_IDs / NB_ALL, per PVID, in order."""
    # PVID 0: 1 of 1 allele found; PVID 1: 2 of 3; PVID 2: 4 of 10; PVID 3: 0 of 2.
    raw_report = """sample,query_probe_header,PVID,ALL_ID,NB_ALL,good_eval,ALL_SEQ_ID,NB_DIFF_ALL_SEQ
S1,0,2,0,10,True,0,0
S2,1,0,2,1,False,0,0
S3,2,1,1,3,True,0,0
S4,3,0,2,1,True,0,0
S5,4,1,1,3,False,0,0
S6,5,1,2,3,False,0,0
S7,6,2,1,10,True,0,0
S8,7,1,2,3,True,0,0
S1,8,2,2,10,True,0,0
S1,9,0,2,1,False,0,0
S1,10,2,3,10,True,0,0
S1,11,1,3,3,False,0,0
S1,12,2,4,10,False,0,0
S1,13,2,5,10,False,0,0
S1,14,2,6,10,False,0,0
S1,15,3,0,2,False,0,0
S1,16,3,1,2,False,0,0
"""
    recall_report = RecallReport([pd.read_csv(StringIO(raw_report))], False)

    actual = recall_report.get_proportion_of_alleles_found_for_each_variant()

    expected = [1 / 1, 2 / 3, 4 / 10, 0 / 2]
    assert actual == expected
def test_fromFiles_TwoFilesReturnsValidRecallReport(self):
    """RecallReport.from_files on two on-disk report files must equal a
    RecallReport built from the same two dataframes read directly.
    """
    # Tab-separated recall reports; unmapped rows carry a minimal
    # ref_probe_header holding only GT_CONF.
    contents_1 = """sample query_probe_header ref_probe_header classification
CFT073 >CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1; >GT_CONF=1; unmapped
CFT073 >CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2; >CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133; primary_correct
CFT073 >CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3; >GT_CONF=3; unmapped
"""
    contents_2 = """sample query_probe_header ref_probe_header classification
CFT073 >CHROM=1;POS=1260;IV=[50,60);PVID=4;NB_ALL=4;ALL_ID=4;NB_DIFF_ALL_SEQ=4;ALL_SEQ_ID=4; >CHROM=GC00000578_3;SAMPLE=CFT073;POS=165;IV=[25,29);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=3;GT_CONF=3.22199; primary_incorrect
CFT073 >CHROM=1;POS=1262;IV=[60,70);PVID=5;NB_ALL=5;ALL_ID=5;NB_DIFF_ALL_SEQ=5;ALL_SEQ_ID=5; >GT_CONF=5; unmapped
CFT073 >CHROM=1;POS=1281;IV=[70,80);PVID=6;NB_ALL=6;ALL_ID=6;NB_DIFF_ALL_SEQ=6;ALL_SEQ_ID=6; >GT_CONF=6; unmapped
"""
    # Write the same contents to temporary files for the from_files path...
    path_1 = create_tmp_file(contents_1)
    path_2 = create_tmp_file(contents_2)

    # ...and read them in-memory for the expected report.
    contents_1_input = StringIO(contents_1)
    contents_2_input = StringIO(contents_2)
    dataframes = [
        pd.read_csv(contents_1_input, sep="\t", keep_default_na=False),
        pd.read_csv(contents_2_input, sep="\t", keep_default_na=False),
    ]
    actual = RecallReport.from_files([path_1, path_2])
    expected = RecallReport(dataframes)

    # Clean up the temp files before asserting.
    path_1.unlink()
    path_2.unlink()

    assert actual == expected
def test___get_proportion_of_alleles_found_for_each_variant___duplicated_evaluation_is_disregarded(
        self, *mocks):
    """Repeated good evaluations of the same ALL_ID count only once.

    ALL_ID 0 is evaluated three times and ALL_ID 1 twice, yet only two
    distinct alleles (out of NB_ALL=5) are found.
    """
    raw_report = """sample,query_probe_header,PVID,ALL_ID,NB_ALL,good_eval,ALL_SEQ_ID,NB_DIFF_ALL_SEQ
S1,1,0,0,5,True,0,0
S1,2,0,1,5,True,0,0
S1,3,0,0,5,True,0,0
S1,4,0,0,5,True,0,0
S1,5,0,1,5,True,0,0
"""
    recall_report = RecallReport([pd.read_csv(StringIO(raw_report))], False)

    actual = recall_report.get_proportion_of_alleles_found_for_each_variant()

    assert actual == [2 / 5]
def test____calculate_info_wrt_variants(self, *mocks):
    """The three variant-level figures come straight from the patched
    helpers (see *mocks), so exact values are asserted."""
    empty_report = RecallReport([pd.DataFrame()], False)

    (nb_variants_where_all_allele_seqs_were_found,
     nb_variants_found_wrt_alleles,
     variants_total) = RecallCalculator._calculate_info_wrt_variants(empty_report)

    assert nb_variants_where_all_allele_seqs_were_found == 6.3
    assert nb_variants_found_wrt_alleles == 4.6
    assert variants_total == 20
def test____calculate_recall_for_a_given_confidence(
        self, calculate_info_wrt_variants_mock,
        calculate_info_wrt_truth_probes_mock,
        get_report_satisfying_confidence_threshold_mock, *other_mocks):
    """Wiring test for _calculate_recall_for_a_given_confidence.

    The report filtered at the threshold must be handed to both
    info-gathering helpers, and their (mocked) numbers must be combined
    into the expected counts and recall ratios.
    """
    # The filtered-report mock is what both helpers must receive.
    filtered_report = Mock()
    get_report_satisfying_confidence_threshold_mock.return_value = filtered_report
    calculator = RecallCalculator(RecallReport([pd.DataFrame()], False))

    recall_info_actual = calculator._calculate_recall_for_a_given_confidence(100)

    # Interaction checks: one filtering pass, both helpers called on its result.
    get_report_satisfying_confidence_threshold_mock.assert_called_once_with(100)
    calculate_info_wrt_truth_probes_mock.assert_called_once_with(filtered_report)
    calculate_info_wrt_variants_mock.assert_called_once_with(filtered_report)

    # Raw counts as returned by the mocked helpers.
    assert recall_info_actual.truth_probes_true_positives == 5
    assert recall_info_actual.truth_probes_total == 10
    assert recall_info_actual.nb_variants_where_all_allele_seqs_were_found == 4
    assert recall_info_actual.nb_variants_found_wrt_alleles == 8
    assert recall_info_actual.variants_total == 10
    # Derived recall ratios.
    assert recall_info_actual.recall_wrt_truth_probes == 0.5
    assert recall_info_actual.recall_wrt_variants_where_all_allele_seqs_were_found == 0.4
    assert recall_info_actual.recall_wrt_variants_found_wrt_alleles == 0.8
def test____get_id_to_nb_of_samples(self, *mocks):
    """Each PVID maps to its NB_OF_SAMPLES, indexed and sorted by PVID."""
    raw_report = """sample,query_probe_header,PVID,NB_OF_SAMPLES
S1,0,2,3
S2,1,0,4
S3,2,1,10
"""
    recall_report = RecallReport([pd.read_csv(StringIO(raw_report))], False)

    actual = recall_report._get_id_to_nb_of_samples()

    expected = pd.read_csv(
        StringIO("""PVID,NB_OF_SAMPLES
0,4
1,10
2,3
"""),
        index_col="PVID",
    )
    assert actual.equals(expected)
def test_checkIfOnlyBestMappingIsKept_hasNoCorrectMapping_ChoosesTheOneWithHighestGTConf(
        self):
    """With no correct mapping for the probe, the single kept row is the
    one with the highest GT_CONF (PRIMARY_INCORRECT at 150 here)."""
    assessments_and_confs = [
        (AlignmentAssessment.UNMAPPED, 100),
        (AlignmentAssessment.PARTIALLY_MAPPED, 140),
        (AlignmentAssessment.PRIMARY_INCORRECT, 150),
        (AlignmentAssessment.SECONDARY_INCORRECT, 110),
        (AlignmentAssessment.SUPPLEMENTARY_INCORRECT, 120),
    ]
    rows = [
        create_recall_report_row("truth_probe_1", assessment,
                                 gt_conf=conf, with_gt_conf=True)
        for assessment, conf in assessments_and_confs
    ]
    report = RecallReport([pd.DataFrame(data=rows)])

    expected = pd.DataFrame(data=[
        create_recall_report_row("truth_probe_1",
                                 AlignmentAssessment.PRIMARY_INCORRECT,
                                 gt_conf=150, with_gt_conf=True)
    ])
    assert_frame_equal(report.report, expected, check_dtype=False)
def test___get_recall_allele_seqs_vs_nb_of_samples_report___return_only_the_samples_given_in_parameter(
        self, *mocks):
    """Only the NB_OF_SAMPLES values requested appear in the PVR report."""
    calculator = RecallCalculator(RecallReport([pd.DataFrame()], False))

    actual = calculator.get_recall_allele_seqs_vs_nb_of_samples_report([2, 5])

    expected = pd.read_csv(StringIO("""NB_OF_SAMPLES,recall_PVR
2,0.0
5,0.25
"""))
    assert actual.equals(expected)
def test___get_recall_alleles_vs_nb_of_samples_report(self, *mocks):
    """Allele-level recall (AvgAR) per NB_OF_SAMPLES for the requested range."""
    calculator = RecallCalculator(RecallReport([pd.DataFrame()], False))

    actual = calculator.get_recall_alleles_vs_nb_of_samples_report(
        list(range(2, 8)))

    expected = pd.read_csv(StringIO("""NB_OF_SAMPLES,recall_AvgAR
2,0.0
3,0.95
4,0.0
5,0.55
6,0.0
7,0.6
"""))
    assert actual.equals(expected)
def test_checkIfOnlyBestMappingIsKept_hasPrimaryMapping_and_several_dfs(self):
    """With the probe's mappings spread over several dataframes, the single
    PRIMARY_CORRECT row is the one kept after concatenation."""
    assessment_pairs = [
        (AlignmentAssessment.UNMAPPED, AlignmentAssessment.PARTIALLY_MAPPED),
        (AlignmentAssessment.PRIMARY_INCORRECT,
         AlignmentAssessment.SECONDARY_INCORRECT),
        (AlignmentAssessment.SUPPLEMENTARY_INCORRECT,
         AlignmentAssessment.PRIMARY_CORRECT),
    ]
    dfs = [
        pd.DataFrame(data=[
            create_recall_report_row("truth_probe_1", first,
                                     gt_conf=100, with_gt_conf=True),
            create_recall_report_row("truth_probe_1", second,
                                     gt_conf=100, with_gt_conf=True),
        ])
        for first, second in assessment_pairs
    ]
    report = RecallReport(dfs)

    expected = pd.DataFrame(data=[
        create_recall_report_row("truth_probe_1",
                                 AlignmentAssessment.PRIMARY_CORRECT,
                                 gt_conf=100, with_gt_conf=True)
    ])
    assert_frame_equal(report.report, expected, check_dtype=False)
def test_init(self):
    """Constructing a RecallReport parses the probe headers into columns.

    GT_CONF, PVID, NB_ALL, ALL_ID, NB_DIFF_ALL_SEQ, ALL_SEQ_ID and
    NB_OF_SAMPLES must be extracted from the header strings, and good_eval
    must be True only for the primary_correct classification.
    """
    # Input: raw tab-separated recall report as produced upstream.
    contents_1 = """sample query_probe_header ref_probe_header classification
CFT073 >CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;NB_OF_SAMPLES=10; >GT_CONF=1; unmapped
CFT073 >CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;NB_OF_SAMPLES=20; >CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133; primary_correct
CFT073 >CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;NB_OF_SAMPLES=30; >GT_CONF=3; unmapped
"""
    contents_1_input = StringIO(contents_1)
    dataframes = [
        pd.read_csv(contents_1_input, sep="\t", keep_default_na=False)
    ]
    report = RecallReport(dataframes)
    actual_df = report.report

    # Expected: original columns plus the fields parsed out of the headers.
    expected_df = pd.read_csv(StringIO(
        """sample query_probe_header ref_probe_header classification GT_CONF PVID NB_ALL ALL_ID NB_DIFF_ALL_SEQ ALL_SEQ_ID NB_OF_SAMPLES good_eval
CFT073 >CHROM=1;POS=1246;IV=[20,30);PVID=1;NB_ALL=1;ALL_ID=1;NB_DIFF_ALL_SEQ=1;ALL_SEQ_ID=1;NB_OF_SAMPLES=10; >GT_CONF=1; unmapped 1.0 1 1 1 1 1 10 False
CFT073 >CHROM=1;POS=1248;IV=[30,40);PVID=2;NB_ALL=2;ALL_ID=2;NB_DIFF_ALL_SEQ=2;ALL_SEQ_ID=2;NB_OF_SAMPLES=20; >CHROM=GC00005358_3;SAMPLE=CFT073;POS=1;IV=[0,17);SVTYPE=PH_SNPs;MEAN_FWD_COVG=3;MEAN_REV_COVG=6;GT_CONF=60.1133; primary_correct 60.1133 2 2 2 2 2 20 True
CFT073 >CHROM=1;POS=1252;IV=[40,50);PVID=3;NB_ALL=3;ALL_ID=3;NB_DIFF_ALL_SEQ=3;ALL_SEQ_ID=3;NB_OF_SAMPLES=30; >GT_CONF=3; unmapped 3.0 3 3 3 3 3 30 False
"""), sep="\t")

    assert actual_df.equals(expected_df)
# Snakemake rule body: compute recall at every GT_CONF percentile across
# all samples and attach the rule's wildcard metadata to the result.
# NOTE(review): this fragment is truncated (the metadata_df dict literal is
# not closed here) — the remainder lives outside this view.

# Inputs, params and wildcards injected by Snakemake for this rule.
recall_report_per_sample_for_calculator = (
    snakemake.input.recall_report_per_sample_for_calculator)
gt_conf_percentiles = snakemake.params.gt_conf_percentiles
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold
recall_file_for_all_samples_and_all_gt_conf_percentile = Path(
    snakemake.output.recall_file_for_all_samples_and_all_gt_conf_percentile)

# API usage
logging.info(f"Loading report")
# Per-sample reports were already reduced to best mappings upstream, so no
# best-mapping concatenation is needed here.
recall_report = RecallReport.from_files(
    recall_report_per_sample_for_calculator,
    concatenate_dfs_one_by_one_keeping_only_best_mappings=False)

logging.info(f"Creating calculator")
recall_calculator = RecallCalculator(recall_report)

logging.info(f"Calculating recall")
recall_df = recall_calculator.get_recall_report(gt_conf_percentiles)

# One metadata value repeated per recall row, ready to concat column-wise.
metadata_df = pd.DataFrame(
    data={
        "tool": [tool] * len(recall_df),
        "coverage": [coverage] * len(recall_df),
        "coverage_threshold": [coverage_threshold] * len(recall_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(recall_df),
        "gaps_threshold": [gaps_threshold] * len(recall_df),
def test____calculate_info_wrt_truth_probes___some_duplicated_classifications(
        self, *mocks):
    """Counts come from the patched report accessors (see *mocks):
    duplicated classifications inflate both figures as asserted."""
    empty_report = RecallReport([pd.DataFrame()], False)

    true_positives, number_of_truth_probes = (
        RecallCalculator._calculate_info_wrt_truth_probes(empty_report))

    assert true_positives == 6
    assert number_of_truth_probes == 19
def test_simple_concatenation_with_several_dfs(self):
    """With best-mapping filtering disabled, the report is the plain
    row-order concatenation of all input dataframes."""
    probe_to_assessment = [
        ("truth_probe_1", AlignmentAssessment.UNMAPPED),
        ("truth_probe_2", AlignmentAssessment.PARTIALLY_MAPPED),
        ("truth_probe_3", AlignmentAssessment.PRIMARY_INCORRECT),
        ("truth_probe_4", AlignmentAssessment.SECONDARY_INCORRECT),
        ("truth_probe_5", AlignmentAssessment.SUPPLEMENTARY_INCORRECT),
        ("truth_probe_6", AlignmentAssessment.PRIMARY_CORRECT),
    ]
    all_rows = [
        create_recall_report_row(probe, assessment,
                                 gt_conf=100, with_gt_conf=True)
        for probe, assessment in probe_to_assessment
    ]
    # Split the six rows into three two-row input dataframes.
    dfs = [pd.DataFrame(data=all_rows[i:i + 2]) for i in range(0, 6, 2)]
    report = RecallReport(
        dfs, concatenate_dfs_one_by_one_keeping_only_best_mappings=False)

    expected = pd.DataFrame(data=all_rows)
    assert_frame_equal(report.report, expected, check_dtype=False)
def test____calculate_info_wrt_truth_probes___one_classification_of_each(
        self, *mocks):
    """Counts come from the patched report accessors (see *mocks):
    one probe per classification yields 3 true positives out of 8."""
    empty_report = RecallReport([pd.DataFrame()], False)

    true_positives, number_of_truth_probes = (
        RecallCalculator._calculate_info_wrt_truth_probes(empty_report))

    assert true_positives == 3
    assert number_of_truth_probes == 8
# Snakemake rule body: compute a single recall value (gt_conf percentile 0,
# i.e. no confidence filter) for one sample and attach wildcard metadata.
# NOTE(review): this fragment is truncated (the metadata_df dict literal is
# not closed here) — the remainder lives outside this view.

# Inputs and wildcards injected by Snakemake for this rule.
all_recall_reports_for_one_sample_with_no_gt_conf_filter = (
    snakemake.input.all_recall_reports_for_one_sample_with_no_gt_conf_filter
)
sample = snakemake.wildcards.sample_id
tool = snakemake.wildcards.tool
coverage = snakemake.wildcards.coverage
coverage_threshold = snakemake.wildcards.coverage_threshold
strand_bias_threshold = snakemake.wildcards.strand_bias_threshold
gaps_threshold = snakemake.wildcards.gaps_threshold
recall_file_for_one_sample_with_no_gt_conf_filter = Path(snakemake.output.recall_file_for_one_sample_with_no_gt_conf_filter)

# API usage
logging.info(f"Loading report")
# Several raw reports for the same sample: keep only the best mapping per
# truth probe while concatenating.
recall_report = RecallReport.from_files(all_recall_reports_for_one_sample_with_no_gt_conf_filter,
                                        concatenate_dfs_one_by_one_keeping_only_best_mappings=True)

logging.info(f"Creating calculator")
recall_calculator = RecallCalculator(recall_report)

logging.info(f"Calculating recall")
# Percentile 0 == no GT_CONF filtering.
recall_df = recall_calculator.get_recall_report([0])

# One metadata value repeated per recall row, ready to concat column-wise.
metadata_df = pd.DataFrame(
    data={
        "tool": [tool] * len(recall_df),
        "coverage": [coverage] * len(recall_df),
        "coverage_threshold": [coverage_threshold] * len(recall_df),
        "strand_bias_threshold": [strand_bias_threshold] * len(recall_df),
        "gaps_threshold": [gaps_threshold] * len(recall_df),
        "sample": [sample] * len(recall_df)
# Snakemake rule body: merge one sample's per-percentile recall reports into
# a single report file consumed later by the recall calculator.

# Make the project root importable when Snakemake runs this script.
sys.path.append(str(Path().absolute()))

import logging
log_level = "INFO"
# Log to the rule's Snakemake-provided log file, overwriting previous runs.
logging.basicConfig(
    filename=str(snakemake.log),
    filemode="w",
    level=log_level,
    format="[%(asctime)s]:%(levelname)s: %(message)s",
    datefmt="%d/%m/%Y %I:%M:%S %p",
)

from evaluate.report import RecallReport

# setup
recall_report_files_for_one_sample_and_all_gt_conf_percentiles = (
    snakemake.input.
    recall_report_files_for_one_sample_and_all_gt_conf_percentiles)
recall_report_per_sample_for_calculator = snakemake.output.recall_report_per_sample_for_calculator

# API usage
logging.info(f"Loading report")
# Concatenate the per-percentile reports keeping only the best mapping per
# truth probe, then persist the merged report for the downstream rule.
recall_report = RecallReport.from_files(
    recall_report_files_for_one_sample_and_all_gt_conf_percentiles,
    concatenate_dfs_one_by_one_keeping_only_best_mappings=True)

with open(recall_report_per_sample_for_calculator, "w") as recall_report_per_sample_for_calculator_filehandler:
    recall_report.save_report(
        recall_report_per_sample_for_calculator_filehandler)

logging.info(f"Done")