def test_task2_system_level(self):
    """Spot-check system-level aggregates of the task 2 metrics file.

    Reads the summary-level metrics from ``_task2_metrics_file_path``,
    aggregates them per system and compares a handful of values against
    the figures reported in the NIST result files named in the comments.
    """
    per_summary = JsonlReader(_task2_metrics_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    expectations = [
        # updateEval/ROUGE/rouge2.jk.m.avg
        ('D', ('rouge-2_jk', 'recall'), 14.499, 1e-2),
        ('C', ('rouge-2_jk', 'recall'), 14.330, 1e-2),
        ('G', ('rouge-2_jk', 'recall'), 13.942, 1e-2),
        ('40', ('rouge-2_jk', 'recall'), 11.189, 1e-2),
        ('55', ('rouge-2_jk', 'recall'), 9.851, 1e-2),
        # updateEval/ROUGE/rougeSU4.jk.m.avg
        ('D', ('rouge-su4_jk', 'recall'), 17.998, 1e-2),
        ('C', ('rouge-su4_jk', 'recall'), 17.923, 1e-2),
        ('E', ('rouge-su4_jk', 'recall'), 17.689, 1e-2),
        ('40', ('rouge-su4_jk', 'recall'), 14.306, 1e-2),
        ('44', ('rouge-su4_jk', 'recall'), 13.607, 1e-2),
        # updateEval/manual/Responsiveness/avg_content.all
        ('D', ('content_responsiveness',), 4.833, 1e-2),
        ('C', ('content_responsiveness',), 4.833, 1e-2),
        ('E', ('content_responsiveness',), 4.750, 1e-2),
        ('40', ('content_responsiveness',), 2.967, 1e-2),
        ('36', ('content_responsiveness',), 2.800, 1e-2),
        # updateEval/BE/simple.jk.m.hm.avg
        ('D', ('rouge-be-hm_jk', 'recall'), 10.687, 1e-2),
        ('C', ('rouge-be-hm_jk', 'recall'), 10.214, 1e-2),
        ('E', ('rouge-be-hm_jk', 'recall'), 10.177, 1e-2),
        ('40', ('rouge-be-hm_jk', 'recall'), 7.219, 1e-2),
        ('44', ('rouge-be-hm_jk', 'recall'), 5.544, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_task1_system_level(self):
    """Spot-check system-level aggregates of the task 1 metrics file.

    Reads the summary-level metrics from ``_task1_metrics_file_path``,
    aggregates them per system and compares a handful of values against
    the figures reported in the NIST result files named in the comments.
    """
    per_summary = JsonlReader(_task1_metrics_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    expectations = [
        # mainEval/ROUGE/rouge2.jk.m.avg
        ('D', ('rouge-2_jk', 'recall'), 17.528, 1e-2),
        ('C', ('rouge-2_jk', 'recall'), 15.055, 1e-2),
        ('B', ('rouge-2_jk', 'recall'), 13.992, 1e-2),
        ('15', ('rouge-2_jk', 'recall'), 12.448, 1e-2),
        ('29', ('rouge-2_jk', 'recall'), 12.028, 1e-2),
        # mainEval/ROUGE/rougeSU4.jk.m.avg
        ('D', ('rouge-su4_jk', 'recall'), 21.892, 1e-2),
        ('C', ('rouge-su4_jk', 'recall'), 19.921, 1e-2),
        ('E', ('rouge-su4_jk', 'recall'), 19.396, 1e-2),
        ('15', ('rouge-su4_jk', 'recall'), 17.711, 1e-2),
        ('24', ('rouge-su4_jk', 'recall'), 17.593, 1e-2),
        # mainEval/manual/avg_content
        ('D', ('content_responsiveness',), 4.944, 1e-2),
        ('I', ('content_responsiveness',), 4.889, 1e-2),
        ('G', ('content_responsiveness',), 4.889, 1e-2),
        ('4', ('content_responsiveness',), 3.400, 1e-2),
        ('23', ('content_responsiveness',), 3.311, 1e-2),
        # mainEval/BE/simple.jk.m.hm.avg
        ('D', ('rouge-be-hm_jk', 'recall'), 12.284, 1e-2),
        ('C', ('rouge-be-hm_jk', 'recall'), 9.593, 1e-2),
        ('B', ('rouge-be-hm_jk', 'recall'), 9.146, 1e-2),
        ('15', ('rouge-be-hm_jk', 'recall'), 6.632, 1e-2),
        ('24', ('rouge-be-hm_jk', 'recall'), 6.577, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level(self):
    """Spot-check system-level ROUGE aggregates against the NIST figures.

    Reads the summary-level metrics from ``_metrics_file_path``, aggregates
    them per system and compares selected recall/precision values with the
    expected numbers listed below.
    """
    per_summary = JsonlReader(_metrics_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    expectations = [
        # ROUGE/rouge2.jk.m.avg
        # NOTE(review): the first two rows check 'rouge-1_jk', presumably
        # sourced from the matching ROUGE-1 file -- confirm.
        ('32', ('rouge-1_jk', 'recall'), 32.4835488888889, 1e-2),
        ('32', ('rouge-1_jk', 'precision'), 33.3402216666667, 1e-2),
        ('21', ('rouge-2_jk', 'recall'), 5.72633166666666, 1e-2),
        ('21', ('rouge-2_jk', 'precision'), 5.70379111111111, 1e-2),
        ('E', ('rouge-2_jk', 'recall'), 10.5482258064516, 1e-2),
        ('E', ('rouge-2_jk', 'precision'), 11.9319677419355, 1e-2),
        # ROUGE/rougeSU4.jk.m.avg
        ('H', ('rouge-su4_jk', 'recall'), 14.8429, 1e-2),
        ('F', ('rouge-su4_jk', 'recall'), 15.8717666666667, 1e-2),
        ('E', ('rouge-su4_jk', 'recall'), 15.9369677419355, 1e-2),
        ('23', ('rouge-su4_jk', 'recall'), 5.56854166666667, 1e-2),
        ('1', ('rouge-su4_jk', 'recall'), 8.71616333333333, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level(self):
    """Spot-check system-level aggregates against the NIST figures.

    Reads the summary-level metrics from ``_metrics_file_path``, aggregates
    them per system and compares ROUGE, BE and responsiveness values with
    the numbers from the NIST result files named in the comments.
    """
    per_summary = JsonlReader(_metrics_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    expectations = [
        # NISTeval2/ROUGE/rouge2.jk.m.avg
        ('C', ('rouge-2_jk', 'recall'), 13.260, 1e-2),
        ('D', ('rouge-2_jk', 'recall'), 12.380, 1e-2),
        ('B', ('rouge-2_jk', 'recall'), 11.788, 1e-2),
        ('24', ('rouge-2_jk', 'recall'), 9.558, 1e-2),
        ('15', ('rouge-2_jk', 'recall'), 9.097, 1e-2),
        # NISTeval2/ROUGE/rougeSU4.jk.m.avg
        ('C', ('rouge-su4_jk', 'recall'), 18.385, 1e-2),
        ('D', ('rouge-su4_jk', 'recall'), 17.814, 1e-2),
        ('B', ('rouge-su4_jk', 'recall'), 17.665, 1e-2),
        ('24', ('rouge-su4_jk', 'recall'), 15.529, 1e-2),
        ('12', ('rouge-su4_jk', 'recall'), 14.755, 1e-2),
        # NISTeval/responsiveness/avg_content
        ('D', ('content_responsiveness',), 4.9, 1e-2),
        ('C', ('content_responsiveness',), 4.9, 1e-2),
        ('B', ('content_responsiveness',), 4.9, 1e-2),
        ('27', ('content_responsiveness',), 3.08, 1e-2),
        ('23', ('content_responsiveness',), 3.0, 1e-2),
        # NISTeval/responsiveness/avg_overall
        ('E', ('overall_responsiveness',), 4.9, 1e-2),
        ('D', ('overall_responsiveness',), 4.9, 1e-2),
        ('I', ('overall_responsiveness',), 4.8, 1e-2),
        ('27', ('overall_responsiveness',), 2.84, 1e-2),
        ('23', ('overall_responsiveness',), 2.76, 1e-2),
        # NISTeval2/BE/simple.jk.m.hm.avg
        ('C', ('rouge-be-hm_jk', 'recall'), 9.905, 1e-2),
        ('B', ('rouge-be-hm_jk', 'recall'), 7.847, 1e-2),
        ('D', ('rouge-be-hm_jk', 'recall'), 7.466, 1e-2),
        ('24', ('rouge-be-hm_jk', 'recall'), 5.107, 1e-2),
        ('23', ('rouge-be-hm_jk', 'recall'), 5.049, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level(self):
    """Spot-check system-level aggregates against the NIST figures.

    Reads the summary-level metrics from ``_metrics_file_path``, aggregates
    them per system and compares ROUGE, BE and manual-evaluation values
    with the numbers from the NIST result files named in the comments.
    """
    per_summary = JsonlReader(_metrics_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    expectations = [
        # ROUGE/rouge2.m.avg
        ('43', ('rouge-2', 'recall'), 10.382, 1e-2),
        ('13', ('rouge-2', 'recall'), 9.900, 1e-2),
        ('14', ('rouge-2', 'recall'), 9.773, 1e-2),
        ('2', ('rouge-2', 'recall'), 9.610, 1e-2),
        ('65', ('rouge-2', 'recall'), 9.558, 1e-2),
        # ROUGE/rouge2.jk.m.avg
        ('D', ('rouge-2_jk', 'recall'), 13.197, 1e-2),
        ('F', ('rouge-2_jk', 'recall'), 12.896, 1e-2),
        ('H', ('rouge-2_jk', 'recall'), 12.010, 1e-2),
        ('43', ('rouge-2_jk', 'recall'), 10.395, 1e-2),
        ('13', ('rouge-2_jk', 'recall'), 9.901, 1e-2),
        # ROUGE/rougeSU4.m.avg
        ('43', ('rouge-su4', 'recall'), 13.625, 1e-2),
        ('37', ('rouge-su4', 'recall'), 13.574, 1e-2),
        ('60', ('rouge-su4', 'recall'), 13.570, 1e-2),
        ('2', ('rouge-su4', 'recall'), 13.419, 1e-2),
        ('14', ('rouge-su4', 'recall'), 13.283, 1e-2),
        # ROUGE/rougeSU4.jk.m.avg
        ('D', ('rouge-su4_jk', 'recall'), 16.878, 1e-2),
        ('F', ('rouge-su4_jk', 'recall'), 16.490, 1e-2),
        ('H', ('rouge-su4_jk', 'recall'), 15.565, 1e-2),
        ('43', ('rouge-su4_jk', 'recall'), 13.646, 1e-2),
        ('37', ('rouge-su4_jk', 'recall'), 13.592, 1e-2),
        # manual/manual.model.avg
        ('A', ('num_scus_jk',), 8.021, 1e-2),
        ('B', ('num_scus_jk',), 8.479, 1e-2),
        ('C', ('num_scus_jk',), 8.208, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.608, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.625, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.651, 1e-2),
        ('A', ('linguistic_quality',), 4.833, 1e-2),
        ('B', ('linguistic_quality',), 4.812, 1e-2),
        ('C', ('linguistic_quality',), 4.604, 1e-2),
        ('A', ('overall_responsiveness',), 4.688, 1e-2),
        ('B', ('overall_responsiveness',), 4.583, 1e-2),
        ('C', ('overall_responsiveness',), 4.500, 1e-2),
        # manual/manual.peer.avg
        ('0', ('modified_pyramid_score',), 0.166, 1e-2),
        ('1', ('modified_pyramid_score',), 0.265, 1e-2),
        ('2', ('modified_pyramid_score',), 0.280, 1e-2),
        ('0', ('num_scus',), 2.635, 1e-2),
        ('1', ('num_scus',), 3.854, 1e-2),
        ('2', ('num_scus',), 4.000, 1e-2),
        ('0', ('num_repetitions',), 0.688, 1e-2),
        ('1', ('num_repetitions',), 0.885, 1e-2),
        ('2', ('num_repetitions',), 1.156, 1e-2),
        ('0', ('modified_pyramid_score_jk',), 0.163, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.261, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.276, 1e-2),
        ('0', ('linguistic_quality',), 3.333, 1e-2),
        ('1', ('linguistic_quality',), 2.719, 1e-2),
        ('2', ('linguistic_quality',), 2.354, 1e-2),
        ('0', ('overall_responsiveness',), 2.073, 1e-2),
        ('1', ('overall_responsiveness',), 2.427, 1e-2),
        ('2', ('overall_responsiveness',), 2.385, 1e-2),
        # BE/simple.m.hm.avg
        ('14', ('rouge-be-hm', 'recall'), 6.462, 1e-2),
        ('65', ('rouge-be-hm', 'recall'), 6.276, 1e-2),
        ('43', ('rouge-be-hm', 'recall'), 6.257, 1e-2),
        ('49', ('rouge-be-hm', 'recall'), 6.247, 1e-2),
        ('60', ('rouge-be-hm', 'recall'), 6.198, 1e-2),
        # BE/simple.jk.m.hm.avg
        ('D', ('rouge-be-hm_jk', 'recall'), 9.959, 1e-2),
        ('F', ('rouge-be-hm_jk', 'recall'), 9.553, 1e-2),
        ('G', ('rouge-be-hm_jk', 'recall'), 9.154, 1e-2),
        ('14', ('rouge-be-hm_jk', 'recall'), 6.480, 1e-2),
        ('65', ('rouge-be-hm_jk', 'recall'), 6.293, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level_B(self):
    """Spot-check system-level aggregates of the "B" metrics file.

    Reads the summary-level metrics from ``_metrics_B_file_path``,
    aggregates them per system and compares ROUGE, BE, manual-evaluation
    and AESOP values with the numbers from the NIST result files named in
    the comments.
    """
    per_summary = JsonlReader(_metrics_B_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    # A few rows use a looser 1e-1 tolerance; those are kept exactly as-is.
    expectations = [
        # ROUGE/rouge2_B.m.avg
        ('43', ('rouge-2', 'recall'), 9.581, 1e-2),
        ('25', ('rouge-2', 'recall'), 9.259, 1e-2),
        ('17', ('rouge-2', 'recall'), 8.855, 1e-2),
        # ROUGE/rouge2_B.jk.m.avg
        ('E', ('rouge-2_jk', 'recall'), 11.474, 1e-2),
        ('H', ('rouge-2_jk', 'recall'), 10.865, 1e-1),
        ('43', ('rouge-2_jk', 'recall'), 9.589, 1e-2),
        # ROUGE/rougeSU4_B.m.avg
        ('43', ('rouge-su4', 'recall'), 13.080, 1e-2),
        ('24', ('rouge-su4', 'recall'), 12.803, 1e-2),
        ('17', ('rouge-su4', 'recall'), 12.792, 1e-2),
        # ROUGE/rougeSU4_B.jk.m.avg
        ('E', ('rouge-su4_jk', 'recall'), 14.941, 1e-2),
        ('D', ('rouge-su4_jk', 'recall'), 14.368, 1e-2),
        ('43', ('rouge-su4_jk', 'recall'), 13.086, 1e-2),
        # manual/manual.model.B.avg
        ('A', ('num_scus_jk',), 6.682, 1e-2),
        ('B', ('num_scus_jk',), 5.409, 1e-2),
        ('C', ('num_scus_jk',), 5.864, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.663, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.554, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.565, 1e-2),
        ('A', ('linguistic_quality',), 4.909, 1e-2),
        ('B', ('linguistic_quality',), 4.909, 1e-2),
        ('C', ('linguistic_quality',), 4.955, 1e-2),
        ('A', ('overall_responsiveness',), 4.773, 1e-2),
        ('B', ('overall_responsiveness',), 4.500, 1e-2),
        ('C', ('overall_responsiveness',), 4.682, 1e-2),
        # manual/manual.peer.B.avg
        ('1', ('modified_pyramid_score',), 0.237, 1e-2),
        ('2', ('modified_pyramid_score',), 0.284, 1e-2),
        ('3', ('modified_pyramid_score',), 0.327, 1e-2),
        ('1', ('num_scus',), 2.636, 1e-2),
        ('2', ('num_scus',), 3.136, 1e-2),
        ('3', ('num_scus',), 3.682, 1e-2),
        ('1', ('num_repetitions',), 0.364, 1e-2),
        ('2', ('num_repetitions',), 0.568, 1e-2),
        ('3', ('num_repetitions',), 0.727, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.234, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.280, 1e-2),
        ('3', ('modified_pyramid_score_jk',), 0.322, 1e-2),
        ('1', ('linguistic_quality',), 3.455, 1e-2),
        ('2', ('linguistic_quality',), 2.841, 1e-2),
        ('3', ('linguistic_quality',), 2.886, 1e-2),
        ('1', ('overall_responsiveness',), 2.091, 1e-2),
        ('2', ('overall_responsiveness',), 2.114, 1e-2),
        ('3', ('overall_responsiveness',), 2.500, 1e-2),
        # BE/simple_B.m.hm.avg
        ('43', ('rouge-be-hm', 'recall'), 6.473, 1e-2),
        ('25', ('rouge-be-hm', 'recall'), 5.937, 1e-2),
        ('26', ('rouge-be-hm', 'recall'), 5.717, 1e-1),
        # BE/simplejk_B.m.hm.avg
        ('E', ('rouge-be-hm_jk', 'recall'), 7.970, 1e-2),
        ('D', ('rouge-be-hm_jk', 'recall'), 7.341, 1e-1),
        ('43', ('rouge-be-hm_jk', 'recall'), 6.480, 1e-1),
        # aesop_allpeers_B
        ('B', ('aesop', '2'), 0.1278890909, 1e-2),
        ('E', ('aesop', '4'), 0.4831818182, 1e-2),
        ('6', ('aesop', '8'), 1.003988068, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level_A(self):
    """Spot-check system-level aggregates of the "A" metrics file.

    Reads the summary-level metrics from ``_metrics_A_file_path``,
    aggregates them per system and compares ROUGE, BE, manual-evaluation
    and AESOP values with the numbers from the NIST result files named in
    the comments.
    """
    per_summary = JsonlReader(_metrics_A_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    expectations = [
        # ROUGE/rouge2_A.m.avg
        ('43', ('rouge-2', 'recall'), 13.440, 1e-2),
        ('17', ('rouge-2', 'recall'), 12.994, 1e-2),
        ('25', ('rouge-2', 'recall'), 12.821, 1e-2),
        # ROUGE/rouge2_A.jk.m.avg
        ('D', ('rouge-2_jk', 'recall'), 12.820, 1e-2),
        ('43', ('rouge-2_jk', 'recall'), 13.447, 1e-2),
        ('17', ('rouge-2_jk', 'recall'), 12.993, 1e-2),
        # ROUGE/rougeSU4_A.m.avg
        ('43', ('rouge-su4', 'recall'), 16.519, 1e-2),
        ('17', ('rouge-su4', 'recall'), 15.984, 1e-2),
        ('24', ('rouge-su4', 'recall'), 15.975, 1e-2),
        # ROUGE/rougeSU4_A.jk.m.avg
        ('D', ('rouge-su4_jk', 'recall'), 16.412, 1e-2),
        ('A', ('rouge-su4_jk', 'recall'), 16.118, 1e-2),
        ('43', ('rouge-su4_jk', 'recall'), 16.519, 1e-2),
        # manual/manual.model.A.avg
        ('A', ('num_scus_jk',), 10.227, 1e-2),
        ('B', ('num_scus_jk',), 9.773, 1e-2),
        ('C', ('num_scus_jk',), 9.818, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.771, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.781, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.752, 1e-2),
        ('A', ('linguistic_quality',), 4.864, 1e-2),
        ('B', ('linguistic_quality',), 4.818, 1e-2),
        ('C', ('linguistic_quality',), 5.000, 1e-2),
        ('A', ('overall_responsiveness',), 4.818, 1e-2),
        ('B', ('overall_responsiveness',), 4.727, 1e-2),
        ('C', ('overall_responsiveness',), 4.955, 1e-2),
        # manual/manual.peer.A.avg
        ('1', ('modified_pyramid_score',), 0.304, 1e-2),
        ('2', ('modified_pyramid_score',), 0.362, 1e-2),
        ('3', ('modified_pyramid_score',), 0.439, 1e-2),
        ('1', ('num_scus',), 3.909, 1e-2),
        ('2', ('num_scus',), 4.614, 1e-2),
        ('3', ('num_scus',), 5.750, 1e-2),
        ('1', ('num_repetitions',), 0.455, 1e-2),
        ('2', ('num_repetitions',), 1.432, 1e-2),
        ('3', ('num_repetitions',), 1.409, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.300, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.358, 1e-2),
        ('3', ('modified_pyramid_score_jk',), 0.433, 1e-2),
        ('1', ('linguistic_quality',), 3.205, 1e-2),
        ('2', ('linguistic_quality',), 2.818, 1e-2),
        ('3', ('linguistic_quality',), 2.705, 1e-2),
        ('1', ('overall_responsiveness',), 2.500, 1e-2),
        ('2', ('overall_responsiveness',), 2.841, 1e-2),
        ('3', ('overall_responsiveness',), 3.045, 1e-2),
        # BE/simple_A.m.hm.avg
        ('43', ('rouge-be-hm', 'recall'), 8.565, 1e-2),
        ('17', ('rouge-be-hm', 'recall'), 8.153, 1e-2),
        ('25', ('rouge-be-hm', 'recall'), 8.012, 1e-2),
        # BE/simplejk_A.m.hm.avg
        ('D', ('rouge-be-hm_jk', 'recall'), 9.085, 1e-2),
        ('E', ('rouge-be-hm_jk', 'recall'), 8.628, 1e-2),
        ('43', ('rouge-be-hm_jk', 'recall'), 8.553, 1e-2),
        # aesop_allpeers_A
        ('A', ('aesop', '1'), 0.1191786364, 1e-2),
        ('C', ('aesop', '8'), 3.853212409, 1e-2),
        ('4', ('aesop', '13'), 0.4008335416, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level_B(self):
    """Spot-check system-level aggregates of the "B" metrics file.

    Reads the summary-level metrics from ``_metrics_B_file_path``,
    aggregates them per system and compares ROUGE, BE, manual-evaluation
    and AESOP values with the numbers from the NIST result files named in
    the comments.
    """
    per_summary = JsonlReader(_metrics_B_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    # expected value, relative tolerance handed to pytest.approx).
    # A few rows use a looser 1e-1 tolerance; those are kept exactly as-is.
    expectations = [
        # ROUGE/rouge2_B.m.avg
        ('16', ('rouge-2', 'recall'), 8.024, 1e-2),
        ('13', ('rouge-2', 'recall'), 7.913, 1e-2),
        ('36', ('rouge-2', 'recall'), 7.311, 1e-2),
        ('8', ('rouge-2', 'recall'), 7.251, 1e-2),
        ('4', ('rouge-2', 'recall'), 7.058, 1e-2),
        # ROUGE/rouge2_B.jk.m.avg
        ('D', ('rouge-2_jk', 'recall'), 13.021, 1e-2),
        ('E', ('rouge-2_jk', 'recall'), 10.196, 1e-1),
        ('F', ('rouge-2_jk', 'recall'), 9.777, 1e-2),
        ('16', ('rouge-2_jk', 'recall'), 7.993, 1e-2),
        ('13', ('rouge-2_jk', 'recall'), 7.902, 1e-2),
        # ROUGE/rougeSU4_B.m.avg
        ('16', ('rouge-su4', 'recall'), 12.006, 1e-2),
        ('13', ('rouge-su4', 'recall'), 11.878, 1e-2),
        ('6', ('rouge-su4', 'recall'), 11.198, 1e-2),
        ('22', ('rouge-su4', 'recall'), 11.107, 1e-2),
        ('8', ('rouge-su4', 'recall'), 11.039, 1e-2),
        # ROUGE/rougeSU4_B.jk.m.avg
        ('D', ('rouge-su4_jk', 'recall'), 16.193, 1e-2),
        ('E', ('rouge-su4_jk', 'recall'), 13.978, 1e-2),
        ('G', ('rouge-su4_jk', 'recall'), 13.573, 1e-2),
        ('16', ('rouge-su4_jk', 'recall'), 11.979, 1e-2),
        ('13', ('rouge-su4_jk', 'recall'), 11.869, 1e-2),
        # manual/manual.model.B.avg
        ('A', ('num_scus_jk',), 6.609, 1e-2),
        ('B', ('num_scus_jk',), 7.696, 1e-2),
        ('C', ('num_scus_jk',), 5.913, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.629, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.729, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.551, 1e-2),
        ('A', ('linguistic_quality',), 4.913, 1e-2),
        ('B', ('linguistic_quality',), 4.826, 1e-2),
        ('C', ('linguistic_quality',), 4.870, 1e-2),
        ('A', ('overall_responsiveness',), 4.783, 1e-2),
        ('B', ('overall_responsiveness',), 4.783, 1e-2),
        ('C', ('overall_responsiveness',), 4.826, 1e-2),
        # manual/manual.peer.B.avg
        ('1', ('modified_pyramid_score',), 0.187, 1e-2),
        ('2', ('modified_pyramid_score',), 0.262, 1e-2),
        ('3', ('modified_pyramid_score',), 0.235, 1e-2),
        ('1', ('num_scus',), 2.065, 1e-2),
        ('2', ('num_scus',), 2.804, 1e-2),
        ('3', ('num_scus',), 2.609, 1e-2),
        ('1', ('num_repetitions',), 0.348, 1e-2),
        ('2', ('num_repetitions',), 0.522, 1e-2),
        ('3', ('num_repetitions',), 0.348, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.184, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.256, 1e-2),
        ('3', ('modified_pyramid_score_jk',), 0.228, 1e-2),
        ('1', ('linguistic_quality',), 3.739, 1e-2),
        ('2', ('linguistic_quality',), 2.696, 1e-2),
        ('3', ('linguistic_quality',), 2.957, 1e-2),
        ('1', ('overall_responsiveness',), 2.022, 1e-2),
        ('2', ('overall_responsiveness',), 2.478, 1e-2),
        ('3', ('overall_responsiveness',), 2.217, 1e-2),
        # BE/simple_B.m.hm.avg
        ('16', ('rouge-be-hm', 'recall'), 4.445, 1e-2),
        ('13', ('rouge-be-hm', 'recall'), 4.417, 1e-2),
        ('8', ('rouge-be-hm', 'recall'), 4.350, 1e-1),
        ('4', ('rouge-be-hm', 'recall'), 4.115, 1e-2),
        ('22', ('rouge-be-hm', 'recall'), 4.050, 1e-2),
        # BE/simplejk_B.m.hm.avg
        ('D', ('rouge-be-hm_jk', 'recall'), 8.842, 1e-2),
        ('F', ('rouge-be-hm_jk', 'recall'), 7.842, 1e-1),
        ('B', ('rouge-be-hm_jk', 'recall'), 7.081, 1e-1),
        ('16', ('rouge-be-hm_jk', 'recall'), 4.411, 1e-2),
        ('13', ('rouge-be-hm_jk', 'recall'), 4.402, 1e-2),
        # aesop_allpeers_B
        ('B', ('aesop', '2'), 0.1358091304, 1e-2),
        ('E', ('aesop', '4'), 0.1376682609, 1e-2),
        ('6', ('aesop', '7'), 0.2641304348, 1e-2),
        ('9', ('aesop', '20'), 0.09438347826, 1e-2),
        ('14', ('aesop', '22'), 0.3394478261, 1e-2),
    ]

    for system_id, key_path, expected, rel_tol in expectations:
        # Walk the nested mapping one key at a time.
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel_tol)
def test_system_level_A(self):
    """Spot-check the system-level aggregates built from the A metrics file.

    Reads the summary-level metrics from ``_metrics_A_file_path``, passes
    them through ``aggregate_metrics``, and compares a few of the resulting
    values with the numbers in the NIST files named in the comments below.
    """
    # NOTE(review): another ``test_system_level_A`` is defined later in this
    # file. If both live in the same class, the later definition replaces
    # this one and these checks never run -- confirm they are in different
    # classes.
    per_summary = JsonlReader(_metrics_A_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    #              expected value, relative tolerance).
    expectations = [
        # ROUGE/rouge2_A.m.avg
        ('22', ('rouge-2', 'recall'), 9.574, 1e-2),
        ('18', ('rouge-2', 'recall'), 9.418, 1e-2),
        ('23', ('rouge-2', 'recall'), 9.404, 1e-2),
        ('24', ('rouge-2', 'recall'), 9.196, 1e-2),
        ('36', ('rouge-2', 'recall'), 9.194, 1e-2),
        # ROUGE/rouge2_A.jk.m.avg
        ('D', ('rouge-2_jk', 'recall'), 12.862, 1e-2),
        ('H', ('rouge-2_jk', 'recall'), 12.841, 1e-1),
        ('F', ('rouge-2_jk', 'recall'), 12.556, 1e-2),
        ('22', ('rouge-2_jk', 'recall'), 9.620, 1e-2),
        ('18', ('rouge-2_jk', 'recall'), 9.451, 1e-2),
        # ROUGE/rougeSU4_A.m.avg
        ('22', ('rouge-su4', 'recall'), 13.014, 1e-2),
        ('23', ('rouge-su4', 'recall'), 12.963, 1e-2),
        ('24', ('rouge-su4', 'recall'), 12.829, 1e-2),
        ('18', ('rouge-su4', 'recall'), 12.407, 1e-2),
        ('34', ('rouge-su4', 'recall'), 12.283, 1e-2),
        # ROUGE/rougeSU4_A.jk.m.avg
        ('H', ('rouge-su4_jk', 'recall'), 16.294, 1e-2),
        ('F', ('rouge-su4_jk', 'recall'), 16.212, 1e-2),
        ('D', ('rouge-su4_jk', 'recall'), 16.200, 1e-2),
        ('22', ('rouge-su4_jk', 'recall'), 13.049, 1e-2),
        ('23', ('rouge-su4_jk', 'recall'), 12.978, 1e-2),
        # manual/manual.model.A.avg
        ('A', ('num_scus_jk',), 10.870, 1e-2),
        ('B', ('num_scus_jk',), 11.087, 1e-2),
        ('C', ('num_scus_jk',), 9.826, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.779, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.747, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.661, 1e-2),
        ('A', ('linguistic_quality',), 4.913, 1e-2),
        ('B', ('linguistic_quality',), 4.870, 1e-2),
        ('C', ('linguistic_quality',), 4.826, 1e-2),
        ('A', ('overall_responsiveness',), 4.783, 1e-2),
        ('B', ('overall_responsiveness',), 4.696, 1e-2),
        ('C', ('overall_responsiveness',), 4.565, 1e-2),
        # manual/manual.peer.A.avg
        ('1', ('modified_pyramid_score',), 0.233, 1e-2),
        ('2', ('modified_pyramid_score',), 0.296, 1e-2),
        ('3', ('modified_pyramid_score',), 0.399, 1e-2),
        ('1', ('num_scus',), 3.304, 1e-2),
        ('2', ('num_scus',), 4.217, 1e-2),
        ('3', ('num_scus',), 5.500, 1e-2),
        ('1', ('num_repetitions',), 0.522, 1e-2),
        ('2', ('num_repetitions',), 1.217, 1e-2),
        ('3', ('num_repetitions',), 1.413, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.229, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.291, 1e-2),
        ('3', ('modified_pyramid_score_jk',), 0.393, 1e-2),
        ('1', ('linguistic_quality',), 3.652, 1e-2),
        ('2', ('linguistic_quality',), 2.717, 1e-2),
        ('3', ('linguistic_quality',), 3.043, 1e-2),
        ('1', ('overall_responsiveness',), 2.174, 1e-2),
        ('2', ('overall_responsiveness',), 2.500, 1e-2),
        ('3', ('overall_responsiveness',), 2.978, 1e-2),
        # BE/simple_A.m.hm.avg
        ('22', ('rouge-be-hm', 'recall'), 5.937, 1e-2),
        ('23', ('rouge-be-hm', 'recall'), 5.809, 1e-2),
        ('18', ('rouge-be-hm', 'recall'), 5.749, 1e-2),
        ('13', ('rouge-be-hm', 'recall'), 5.553, 1e-2),
        ('16', ('rouge-be-hm', 'recall'), 5.497, 1e-2),
        # BE/simplejk_A.m.hm.avg
        ('F', ('rouge-be-hm_jk', 'recall'), 9.114, 1e-2),
        ('H', ('rouge-be-hm_jk', 'recall'), 8.690, 1e-1),
        ('D', ('rouge-be-hm_jk', 'recall'), 8.449, 1e-1),
        ('22', ('rouge-be-hm_jk', 'recall'), 5.973, 1e-2),
        ('23', ('rouge-be-hm_jk', 'recall'), 5.828, 1e-2),
        # aesop_allpeers_A
        ('A', ('aesop', '1'), 0.09517478261, 1e-2),
        # NOTE: the expected value is 0.0, so the relative tolerance
        # contributes nothing here; only pytest.approx's default absolute
        # tolerance applies.
        ('C', ('aesop', '8'), 0.0, 1e-2),
        ('4', ('aesop', '13'), 0.6150630435, 1e-2),
        ('8', ('aesop', '22'), 0.3684913043, 1e-2),
        ('16', ('aesop', '27'), 11.80434783, 1e-2),
    ]

    # Entries are checked in the order listed, so the first mismatch is the
    # one that gets reported.
    for system_id, key_path, expected, rel_tol in expectations:
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel=rel_tol)
def test_system_level_B(self):
    """Spot-check the system-level aggregates built from the B metrics file.

    Reads the summary-level metrics from ``_metrics_B_file_path``, passes
    them through ``aggregate_metrics``, and compares a few of the resulting
    values with the numbers in the NIST files named in the comments below.
    """
    per_summary = JsonlReader(_metrics_B_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    #              expected value, relative tolerance).
    expectations = [
        # ROUGE/rouge2_B.m.avg
        ('2', ('rouge-2', 'recall'), 31.956, 1e-2),
        ('34', ('rouge-2', 'recall'), 10.386, 1e-2),
        ('40', ('rouge-2', 'recall'), 10.373, 1e-2),
        ('35', ('rouge-2', 'recall'), 10.104, 1e-2),
        ('3', ('rouge-2', 'recall'), 9.820, 1e-2),
        # ROUGE/rouge2_B.jk.m.avg
        # C is off by a bit? Its check stays disabled:
        # ('C', ('rouge-2_jk', 'recall'), 12.550, 1e-2),
        ('H', ('rouge-2_jk', 'recall'), 12.436, 1e-2),
        ('E', ('rouge-2_jk', 'recall'), 11.001, 1e-2),
        ('2', ('rouge-2_jk', 'recall'), 31.932, 1e-2),
        ('34', ('rouge-2_jk', 'recall'), 10.417, 1e-2),
        # ROUGE/rougeSU4_B.m.avg
        ('2', ('rouge-su4', 'recall'), 33.688, 1e-2),
        ('40', ('rouge-su4', 'recall'), 13.948, 1e-2),
        ('34', ('rouge-su4', 'recall'), 13.851, 1e-2),
        ('35', ('rouge-su4', 'recall'), 13.839, 1e-2),
        ('51', ('rouge-su4', 'recall'), 13.650, 1e-2),
        # ROUGE/rougeSU4_B.jk.m.avg
        ('C', ('rouge-su4_jk', 'recall'), 16.386, 1e-2),
        ('H', ('rouge-su4_jk', 'recall'), 16.602, 1e-2),
        ('E', ('rouge-su4_jk', 'recall'), 15.152, 1e-2),
        ('2', ('rouge-su4_jk', 'recall'), 33.668, 1e-2),
        ('40', ('rouge-su4_jk', 'recall'), 13.959, 1e-2),
        # manual/manual.model.B.avg
        ('A', ('num_scus_jk',), 6.455, 1e-2),
        ('B', ('num_scus_jk',), 8.591, 1e-2),
        ('C', ('num_scus_jk',), 8.545, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.481, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.663, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.640, 1e-2),
        ('A', ('linguistic_quality',), 8.727, 1e-2),
        ('B', ('linguistic_quality',), 8.545, 1e-2),
        ('C', ('linguistic_quality',), 9.364, 1e-2),
        ('A', ('overall_responsiveness',), 8.364, 1e-2),
        ('B', ('overall_responsiveness',), 8.318, 1e-2),
        ('C', ('overall_responsiveness',), 9.136, 1e-2),
        # manual/manual.peer.B.avg
        ('1', ('modified_pyramid_score',), 0.160, 1e-2),
        ('2', ('modified_pyramid_score',), 0.690, 1e-2),
        ('3', ('modified_pyramid_score',), 0.329, 1e-2),
        ('1', ('num_scus',), 2.386, 1e-2),
        ('2', ('num_scus',), 9.886, 1e-2),
        ('3', ('num_scus',), 4.545, 1e-2),
        ('1', ('num_repetitions',), 0.841, 1e-2),
        ('2', ('num_repetitions',), 1.955, 1e-2),
        ('3', ('num_repetitions',), 0.955, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.158, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.677, 1e-2),
        ('3', ('modified_pyramid_score_jk',), 0.324, 1e-2),
        ('1', ('linguistic_quality',), 6.455, 1e-2),
        ('2', ('linguistic_quality',), 5.886, 1e-2),
        ('3', ('linguistic_quality',), 7.250, 1e-2),
        ('1', ('overall_responsiveness',), 4.318, 1e-2),
        ('2', ('overall_responsiveness',), 6.182, 1e-2),
        ('3', ('overall_responsiveness',), 6.114, 1e-2),
        # BE/simple_B.m.hm.avg
        ('2', ('rouge-be-hm', 'recall'), 25.041, 1e-2),
        ('24', ('rouge-be-hm', 'recall'), 6.389, 1e-2),
        ('40', ('rouge-be-hm', 'recall'), 6.162, 1e-2),
        ('34', ('rouge-be-hm', 'recall'), 6.118, 1e-2),
        ('35', ('rouge-be-hm', 'recall'), 5.813, 1e-2),
        # BE/simplejk_B.m.hm.avg
        # F is off by a little, hence its looser tolerance.
        ('C', ('rouge-be-hm_jk', 'recall'), 6.795, 1e-2),
        ('H', ('rouge-be-hm_jk', 'recall'), 7.040, 1e-2),
        ('F', ('rouge-be-hm_jk', 'recall'), 6.094, 1e-1),
        ('2', ('rouge-be-hm_jk', 'recall'), 25.042, 1e-2),
        ('34', ('rouge-be-hm_jk', 'recall'), 6.134, 1e-2),
        # aesop_allpeers_B
        ('B', ('aesop', '2'), 0.04890409091, 1e-2),
        ('E', ('aesop', '4'), 0.2740872727, 1e-2),
        ('6', ('aesop', '7'), 0.5850288957, 1e-2),
        ('9', ('aesop', '20'), 0.06261788636, 1e-2),
        ('14', ('aesop', '34'), 0.3664196656, 1e-2),
    ]

    # Entries are checked in the order listed, so the first mismatch is the
    # one that gets reported.
    for system_id, key_path, expected, rel_tol in expectations:
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel=rel_tol)
def test_system_level_A(self):
    """Spot-check the system-level aggregates built from the A metrics file.

    Reads the summary-level metrics from ``_metrics_A_file_path``, passes
    them through ``aggregate_metrics``, and compares a few of the resulting
    values with the numbers in the NIST files named in the comments below.
    """
    # NOTE(review): an earlier ``test_system_level_A`` is defined in this
    # file. If both live in the same class, this definition replaces the
    # earlier one and its checks never run -- confirm they are in different
    # classes.
    per_summary = JsonlReader(_metrics_A_file_path, Metrics).read()
    per_system = aggregate_metrics(per_summary)

    # Each entry: (system id, key path into that system's metrics,
    #              expected value, relative tolerance).
    expectations = [
        # ROUGE/rouge2_A.m.avg
        ('2', ('rouge-2', 'recall'), 33.165, 1e-2),
        ('34', ('rouge-2', 'recall'), 12.163, 1e-2),
        ('40', ('rouge-2', 'recall'), 12.089, 1e-2),
        ('35', ('rouge-2', 'recall'), 10.869, 1e-2),
        ('3', ('rouge-2', 'recall'), 10.655, 1e-2),
        # ROUGE/rouge2_A.jk.m.avg
        ('C', ('rouge-2_jk', 'recall'), 14.864, 1e-2),
        ('H', ('rouge-2_jk', 'recall'), 13.457, 1e-2),
        ('E', ('rouge-2_jk', 'recall'), 13.341, 1e-2),
        ('2', ('rouge-2_jk', 'recall'), 33.133, 1e-2),
        ('34', ('rouge-2_jk', 'recall'), 12.184, 1e-2),
        # ROUGE/rougeSU4_A.m.avg
        ('2', ('rouge-su4', 'recall'), 34.421, 1e-2),
        ('40', ('rouge-su4', 'recall'), 15.101, 1e-2),
        ('34', ('rouge-su4', 'recall'), 15.030, 1e-2),
        ('35', ('rouge-su4', 'recall'), 14.487, 1e-2),
        ('51', ('rouge-su4', 'recall'), 14.165, 1e-2),
        # ROUGE/rougeSU4_A.jk.m.avg
        ('C', ('rouge-su4_jk', 'recall'), 18.355, 1e-2),
        ('H', ('rouge-su4_jk', 'recall'), 17.199, 1e-2),
        ('E', ('rouge-su4_jk', 'recall'), 16.917, 1e-2),
        ('2', ('rouge-su4_jk', 'recall'), 34.399, 1e-2),
        ('40', ('rouge-su4_jk', 'recall'), 15.131, 1e-2),
        # manual/manual.model.A.avg
        ('A', ('num_scus_jk',), 10.364, 1e-2),
        ('B', ('num_scus_jk',), 9.5, 1e-2),
        ('C', ('num_scus_jk',), 12.364, 1e-2),
        ('A', ('modified_pyramid_score_jk',), 0.685, 1e-2),
        ('B', ('modified_pyramid_score_jk',), 0.616, 1e-2),
        ('C', ('modified_pyramid_score_jk',), 0.720, 1e-2),
        ('A', ('linguistic_quality',), 8.636, 1e-2),
        ('B', ('linguistic_quality',), 9.136, 1e-2),
        ('C', ('linguistic_quality',), 9.136, 1e-2),
        ('A', ('overall_responsiveness',), 8.455, 1e-2),
        ('B', ('overall_responsiveness',), 8.727, 1e-2),
        ('C', ('overall_responsiveness',), 9.318, 1e-2),
        # manual/manual.peer.A.avg
        ('1', ('modified_pyramid_score',), 0.175, 1e-2),
        ('2', ('modified_pyramid_score',), 0.646, 1e-2),
        ('3', ('modified_pyramid_score',), 0.358, 1e-2),
        ('1', ('num_scus',), 3.182, 1e-2),
        ('2', ('num_scus',), 11.977, 1e-2),
        ('3', ('num_scus',), 6.00, 1e-2),
        ('1', ('num_repetitions',), 1.318, 1e-2),
        ('2', ('num_repetitions',), 2.455, 1e-2),
        ('3', ('num_repetitions',), 1.568, 1e-2),
        ('1', ('modified_pyramid_score_jk',), 0.172, 1e-2),
        ('2', ('modified_pyramid_score_jk',), 0.635, 1e-2),
        ('3', ('modified_pyramid_score_jk',), 0.352, 1e-2),
        ('1', ('linguistic_quality',), 6.705, 1e-2),
        ('2', ('linguistic_quality',), 5.477, 1e-2),
        ('3', ('linguistic_quality',), 7.477, 1e-2),
        ('1', ('overall_responsiveness',), 3.636, 1e-2),
        ('2', ('overall_responsiveness',), 6.364, 1e-2),
        ('3', ('overall_responsiveness',), 6.341, 1e-2),
        # BE/simple_A.m.hm.avg
        ('2', ('rouge-be-hm', 'recall'), 24.820, 1e-2),
        ('34', ('rouge-be-hm', 'recall'), 6.356, 1e-2),
        ('40', ('rouge-be-hm', 'recall'), 6.321, 1e-2),
        ('45', ('rouge-be-hm', 'recall'), 5.899, 1e-2),
        ('4', ('rouge-be-hm', 'recall'), 5.843, 1e-2),
        # BE/simplejk_A.m.hm.avg
        ('C', ('rouge-be-hm_jk', 'recall'), 7.876, 1e-2),
        ('E', ('rouge-be-hm_jk', 'recall'), 6.909, 1e-2),
        ('F', ('rouge-be-hm_jk', 'recall'), 6.840, 1e-2),
        ('2', ('rouge-be-hm_jk', 'recall'), 24.830, 1e-2),
        ('34', ('rouge-be-hm_jk', 'recall'), 6.379, 1e-2),
        # aesop_allpeers_A
        ('A', ('aesop', '1'), 0.154895909090909, 1e-2),
        ('C', ('aesop', '8'), 0.0419939626932389, 1e-2),
        ('4', ('aesop', '13'), 0.2186197727, 1e-2),
        ('8', ('aesop', '22'), 0.1286081818, 1e-2),
        ('16', ('aesop', '30'), 0.2341865909, 1e-2),
    ]

    # Entries are checked in the order listed, so the first mismatch is the
    # one that gets reported.
    for system_id, key_path, expected, rel_tol in expectations:
        actual = per_system[system_id]
        for key in key_path:
            actual = actual[key]
        assert actual == pytest.approx(expected, rel=rel_tol)