def test_score_description_best_operation(self, description, best_operation):
    """Checks score bounds and that `best_operation` attains the top score.

    Args:
        description: Description text passed to `score_description`.
        best_operation: Key of the score dict expected to hold the maximum.
    """
    scores_by_name = tfidf_handler.TfidfDescriptionHandler().score_description(
        description)
    all_scores = list(scores_by_name.values())
    # Every score must fall within the closed interval [0, 1].
    self.assertTrue(all(0 <= score <= 1 for score in all_scores))
    highest_score = max(all_scores)
    self.assertEqual(scores_by_name[best_operation], highest_score)
def test_repr(self):
    """Checks that repr() of a configured handler equals the expected string."""
    constructor_kwargs = dict(
        max_num_prioritized=12, min_tfidf_score=0.34, multiplier=0.75)
    expected_repr = ('TfidfDescriptionHandler(max_num_prioritized=12, '
                     'min_tfidf_score=0.34, multiplier=0.75)')
    handler = tfidf_handler.TfidfDescriptionHandler(**constructor_kwargs)
    self.assertEqual(repr(handler), expected_repr)
def test_get_operation_multiplier_respects_max_num_prioritized(
        self, max_num_prioritized):
    """Checks that exactly `max_num_prioritized` multipliers are below 1.

    Args:
        max_num_prioritized: Value given to the handler and the expected number
            of multipliers strictly less than 1.
    """
    handler = tfidf_handler.TfidfDescriptionHandler(
        max_num_prioritized=max_num_prioritized, min_tfidf_score=0)
    example = benchmark_module.Example(inputs=[[10], [20]], output=[30])
    benchmark = benchmark_module.Benchmark(
        examples=[example],
        constants=[],
        description='Tile a tensor multiple times',
        target_program='',
        source='test',
        name='test_benchmark')
    default_settings = settings_module.default_settings()
    multipliers = handler.get_operation_multipliers(
        benchmark, settings=default_settings)
    multiplier_values = list(multipliers.values())

    num_below_one = sum(1 for value in multiplier_values if value < 1)
    self.assertEqual(num_below_one, max_num_prioritized)
    # Nothing may be deprioritized: each multiplier lies in (0, 1].
    self.assertTrue(all(0 < value <= 1 for value in multiplier_values))
def test_get_operation_multipliers_respects_min_tfidf_score(
        self, min_tfidf_score):
    """Checks that the prioritized names are those scoring >= the threshold.

    Args:
        min_tfidf_score: Threshold given to the handler and used to compute the
            expected set of prioritized operation names.
    """
    description = 'Tile a tensor multiple times'
    handler = tfidf_handler.TfidfDescriptionHandler(
        max_num_prioritized=1000000, min_tfidf_score=min_tfidf_score)
    scores = handler.score_description(description)

    example = benchmark_module.Example(inputs=[[10], [20]], output=[30])
    benchmark = benchmark_module.Benchmark(
        examples=[example],
        constants=[],
        description=description,
        target_program='',
        source='test',
        name='test_benchmark')
    default_settings = settings_module.default_settings()
    multipliers = handler.get_operation_multipliers(
        benchmark, settings=default_settings)

    prioritized_names = [
        name for name, value in multipliers.items() if value < 1
    ]
    expected_prioritized_names = [
        name for name, score in scores.items() if score >= min_tfidf_score
    ]
    self.assertCountEqual(prioritized_names, expected_prioritized_names)
    # Nothing may be deprioritized: each multiplier lies in (0, 1].
    self.assertTrue(all(0 < value <= 1 for value in multipliers.values()))
def test_get_listed_operations_finds_correct_ops(
        self, description, should_find, should_not_find):
    """Checks which names `_get_listed_operations` returns for a description.

    Args:
        description: Description text passed to `_get_listed_operations`.
        should_find: Names that must be present in the result.
        should_not_find: Names that must be absent from the result.
    """
    listed_operations = (
        tfidf_handler.TfidfDescriptionHandler()._get_listed_operations(
            description))
    for expected_name in should_find:
        self.assertIn(expected_name, listed_operations)
    for unexpected_name in should_not_find:
        self.assertNotIn(unexpected_name, listed_operations)
def test_get_operation_multiplier_respects_max_num_prioritized(
        self, max_num_prioritized):
    """Checks that exactly `max_num_prioritized` multipliers are below 1.

    Args:
        max_num_prioritized: Value given to the handler and the expected number
            of multipliers strictly less than 1.
    """
    handler = tfidf_handler.TfidfDescriptionHandler(
        max_num_prioritized=max_num_prioritized, min_tfidf_score=0)
    default_settings = settings_module.default_settings()
    multipliers = handler.get_operation_multipliers(
        'Tile a tensor multiple times', settings=default_settings)
    multiplier_values = list(multipliers.values())

    num_below_one = sum(1 for value in multiplier_values if value < 1)
    self.assertEqual(num_below_one, max_num_prioritized)
    # Nothing may be deprioritized: each multiplier lies in (0, 1].
    self.assertTrue(all(0 < value <= 1 for value in multiplier_values))
def test_get_operation_multiplier_respects_max_num_prioritized(
        self, max_num_prioritized):
    """Checks that exactly `max_num_prioritized` multipliers are below 1.

    Args:
        max_num_prioritized: Value given to the handler and the expected number
            of multipliers strictly less than 1.
    """
    # Note: Operations that are explicitly listed in the description may be
    # prioritized in addition to those selected by score.
    handler = tfidf_handler.TfidfDescriptionHandler(
        max_num_prioritized=max_num_prioritized, min_tfidf_score=0)
    example = benchmark_module.Example(inputs=[1], output=1)
    benchmark = benchmark_module.Benchmark(
        examples=[example], description='Tiling a tensor multiple times')
    default_settings = settings_module.default_settings()
    multipliers = handler.get_operation_multipliers(
        benchmark, settings=default_settings)
    multiplier_values = list(multipliers.values())

    num_below_one = sum(1 for value in multiplier_values if value < 1)
    self.assertEqual(num_below_one, max_num_prioritized)
    # Nothing may be deprioritized: each multiplier lies in (0, 1].
    self.assertTrue(all(0 < value <= 1 for value in multiplier_values))
def test_get_operation_multipliers_respects_min_tfidf_score(
        self, min_tfidf_score):
    """Checks that the prioritized names are those scoring >= the threshold.

    Args:
        min_tfidf_score: Threshold given to the handler and used to compute the
            expected set of prioritized operation names.
    """
    description = 'Tile a tensor multiple times'
    handler = tfidf_handler.TfidfDescriptionHandler(
        max_num_prioritized=1000000, min_tfidf_score=min_tfidf_score)
    scores = handler.score_description(description)
    default_settings = settings_module.default_settings()
    multipliers = handler.get_operation_multipliers(
        description, settings=default_settings)

    prioritized_names = [
        name for name, value in multipliers.items() if value < 1
    ]
    expected_prioritized_names = [
        name for name, score in scores.items() if score >= min_tfidf_score
    ]
    self.assertCountEqual(prioritized_names, expected_prioritized_names)
    # Nothing may be deprioritized: each multiplier lies in (0, 1].
    self.assertTrue(all(0 < value <= 1 for value in multipliers.values()))
def test_score_description_exact_values(self):
    """Compares description scores with values from a scikit-learn example."""
    # The term counts and the expected matrix are taken from:
    # https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
    vocabulary = ('apple', 'banana', 'clementine')
    counts = [[3, 0, 1],
              [2, 0, 0],
              [3, 0, 0],
              [4, 0, 0],
              [3, 2, 0],
              [3, 0, 2]]

    # One operation per row of counts; its docstring repeats each vocabulary
    # word as many times as the corresponding count.
    operations = []
    for index, row_counts in enumerate(counts):
        words = []
        for word, count in zip(vocabulary, row_counts):
            words.extend([word] * count)
        operations.append(
            _create_operation('operation_{}'.format(index), ' '.join(words)))

    handler = tfidf_handler.TfidfDescriptionHandler(operations=operations)

    # Word counts match the first docstring; "dragonfruit" is out of vocabulary.
    scores = handler.score_description(
        'apple clementine apple dragonfruit apple')
    sorted_scores = [scores[name] for name in sorted(scores)]

    # Keep the formatting as in the link above. pylint: disable=bad-whitespace
    expected_term_document_matrix = [[0.85151335, 0.,         0.52433293],
                                     [1.,         0.,         0.        ],
                                     [1.,         0.,         0.        ],
                                     [1.,         0.,         0.        ],
                                     [0.55422893, 0.83236428, 0.        ],
                                     [0.63035731, 0.,         0.77630514]]
    # pylint: enable=bad-whitespace

    # The expected score of each operation is the dot product of its row with
    # the first row of the matrix.
    query_row = expected_term_document_matrix[0]
    expected_scores = [
        np.dot(query_row, document_row)
        for document_row in expected_term_document_matrix
    ]

    self.assertLen(sorted_scores, len(expected_scores))
    for actual, expected in zip(sorted_scores, expected_scores):
        self.assertAlmostEqual(actual, expected)
def test_get_operation_multipliers_respects_min_tfidf_score(
        self, min_tfidf_score):
    """Checks that the prioritized names are those scoring >= the threshold.

    Args:
        min_tfidf_score: Threshold given to the handler and used to compute the
            expected set of prioritized operation names.
    """
    # Note: An operation that is explicitly listed in the description may be
    # chosen even if it violates the `min_tfidf_score` threshold.
    description = 'Tiling a tensor multiple times'
    handler = tfidf_handler.TfidfDescriptionHandler(
        max_num_prioritized=1000000, min_tfidf_score=min_tfidf_score)
    scores = handler.score_description(description)

    example = benchmark_module.Example(inputs=[1], output=1)
    benchmark = benchmark_module.Benchmark(
        examples=[example], description=description)
    default_settings = settings_module.default_settings()
    multipliers = handler.get_operation_multipliers(
        benchmark, settings=default_settings)

    prioritized_names = [
        name for name, value in multipliers.items() if value < 1
    ]
    expected_prioritized_names = [
        name for name, score in scores.items() if score >= min_tfidf_score
    ]
    self.assertCountEqual(prioritized_names, expected_prioritized_names)
    # Nothing may be deprioritized: each multiplier lies in (0, 1].
    self.assertTrue(all(0 < value <= 1 for value in multipliers.values()))