def test_init_invalid_features(self):
    """
    Constructing a frame must fail for a feature with a wrong signature
    and for a feature that returns something other than a float.

    ``assertRaises`` is used rather than ``try`` / ``self.fail`` / ``except Exception``:
    the ``AssertionError`` raised by ``self.fail`` is itself an ``Exception``, so the broad
    handler would catch it and report a misleading message mismatch instead.
    """
    # Feature that does not accept the content item as an argument
    features = [
        TestNumericFeaturesFrame.invalid_arguments
    ]
    with self.assertRaises(Exception, msg="NumericFeaturesFrame did not raise with invalid feature") as raised:
        NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
    self.assertEqual(
        str(raised.exception),
        "invalid_arguments feature: TypeError: invalid_arguments() takes 0 positional arguments but 1 was given"
    )

    # Feature that returns a string instead of a number
    features = [
        TestNumericFeaturesFrame.invalid_return
    ]
    with self.assertRaises(
        ValueError,
        msg="NumericFeaturesFrame did not raise with invalid feature return value"
    ) as raised:
        NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
    self.assertEqual(
        str(raised.exception),
        "invalid_return feature did not return float but <class 'str'>"
    )
def setUpClass(cls):
    """
    Disables the blacklisted features on MockRankProcessor and writes a feature frame to disk.

    Every name in ``cls.blacklist_features`` is copied from MockRankProcessor onto ``cls``
    and then replaced by None on MockRankProcessor. A NumericFeaturesFrame is built from
    the remaining features over ``cls.test_content`` and stored at ``cls.frame_path``.

    NOTE(review): the originals are stashed on ``cls``, presumably so they can be restored
    later; the restoring code is not visible here.
    NOTE(review): ``super().setUpClass()`` is not called; confirm the base class does not need it.
    """
    for feature_name in cls.blacklist_features:
        original_feature = getattr(MockRankProcessor, feature_name)
        # Keep the original on the test class before overwriting it on the processor
        setattr(cls, feature_name, original_feature)
        setattr(MockRankProcessor, feature_name, None)

    get_identifier = lambda content: content[cls.identifier_key]  # noqa: E731
    active_features = MockRankProcessor.get_features()
    get_content = lambda: cls.test_content  # noqa: E731

    frame = NumericFeaturesFrame(get_identifier, active_features, get_content)
    frame.to_disk(cls.frame_path)
def test_resetting_features_no_content(self):
    """
    Resetting the features of a frame that was created without content
    should leave an empty frame that only knows the new features.
    """
    features = [
        TestNumericFeaturesFrame.is_dutch
    ]
    # No content callable is passed, so there are no rows to compute
    frame = NumericFeaturesFrame(
        TestNumericFeaturesFrame.get_identifier,
        features
    )
    frame.reset(features=[
        TestNumericFeaturesFrame.value_number,
        TestNumericFeaturesFrame.is_english
    ])
    self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
    # The empty slice keeps the expected columns while dropping all rows
    assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
    sorted_feature_names = ["is_english", "value_number"]
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
    self.assertEqual(
        sorted(frame.features.keys()),
        sorted_feature_names
    )
def test_adding_features(self):
    """
    Loading extra features into a frame that started with a single feature
    should result in the same data and feature names as a frame created with all features.
    """
    features = [
        TestNumericFeaturesFrame.is_dutch
    ]
    frame = NumericFeaturesFrame(
        TestNumericFeaturesFrame.get_identifier,
        features,
        self.get_iterator
    )
    frame.load_features([
        TestNumericFeaturesFrame.value_number,
        TestNumericFeaturesFrame.is_english
    ])
    assert_frame_equal(frame.data, self.test_frame, check_like=True)
    sorted_feature_names = ["is_dutch", "is_english", "value_number"]
    # Inspect the local frame: self.frame from setUp always holds all three features,
    # so asserting on it would pass even when load_features registers nothing.
    self.assertEqual(
        sorted(frame.features.keys()),
        sorted_feature_names
    )
def __init__(self, config):
    """
    Creates the optional feature frame and text frame from configuration.

    ``feature_frame`` is a NumericFeaturesFrame when both "identifier_key" and
    "feature_frame_path" are configured, otherwise None.
    ``text_frame`` is a TextFeaturesFrame when "identifier_key", "text_frame_path"
    and "language" are all configured, otherwise None.

    :param config: configuration that gets passed on to the parent constructor
    """
    super().__init__(config)

    if "identifier_key" not in self.config or "feature_frame_path" not in self.config:
        self.feature_frame = None
    else:
        self.feature_frame = NumericFeaturesFrame(
            # The identifier key is looked up at call time, not when the frame is created
            identifier=lambda ind: ind[self.config.identifier_key],
            features=self.get_features(),
            file_path=self.config.feature_frame_path
        )

    text_frame_requirements = ("identifier_key", "text_frame_path", "language")
    if all(key in self.config for key in text_frame_requirements):
        self.text_frame = TextFeaturesFrame(
            get_identifier=lambda ind: ind[self.config.identifier_key],
            get_text=self.get_text,
            language=self.config.language,
            file_path=self.config.text_frame_path
        )
    else:
        self.text_frame = None
def test_init_file(self):
    """
    Creating a frame with a file_path should load the data through from_disk
    and still register all given features.
    """
    with patch(
        "core.utils.data.numeric_features.NumericFeaturesFrame.from_disk",
        return_value=self.test_frame
    ) as from_disk_patch:
        frame = NumericFeaturesFrame(
            self.get_identifier,
            self.features,
            file_path="test/path/to/frame.pkl"
        )
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )
        from_disk_patch.assert_called_once_with("test/path/to/frame.pkl")
def test_init_immutable_content(self):
    """
    A feature that modifies the content it receives should make
    the NumericFeaturesFrame constructor raise a ValueError.
    """
    # Materialise the content so every call of the content callable yields the same objects
    individuals = list(self.get_iterator())
    mutating_features = [TestNumericFeaturesFrame.set_language_to_fr]
    try:
        NumericFeaturesFrame(
            self.get_identifier,
            mutating_features,
            lambda: individuals
        )
    except ValueError:
        # Expected outcome: the mutation got rejected
        return
    self.fail("NumericFeaturesFrame did not raise when features modified content")
def test_resetting_features_and_content(self):
    """
    Resetting both features and content should leave a frame
    that only holds the new features for the new content.
    """
    features = [
        TestNumericFeaturesFrame.is_dutch
    ]
    frame = NumericFeaturesFrame(
        TestNumericFeaturesFrame.get_identifier,
        features,
        self.get_iterator
    )
    frame.reset(
        features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ],
        content=self.get_extra_iterator
    )
    # Expected data: only the rows of the extra content (9 and 10) without the replaced feature
    self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
    self.test_frame_extra = self.test_frame_extra.drop(labels="is_dutch", axis=1)
    assert_frame_equal(frame.data, self.test_frame_extra, check_like=True)
    sorted_feature_names = ["is_english", "value_number"]
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
    self.assertEqual(
        sorted(frame.features.keys()),
        sorted_feature_names
    )
def setUp(self):
    """
    Prepares the expected frames, a NumericFeaturesFrame over the fixture content
    and two extra Individuals (ids 9 and 10) that tests can load as additional content.
    """
    super().setUp()
    self.test_fixture = Collective.objects.get(id=2)

    def min_max_scale(frame):
        # Scales every column to the 0-1 range and replaces any NaN outcome with 0
        lowest = frame.min()
        highest = frame.max()
        return ((frame - lowest) / (highest - lowest)).fillna(0)

    columns = ("is_dutch", "is_english", "value_number")
    base_rows = (
        (1.0, 0.0, 1.0),
        (1.0, 0.0, 2.0),
        (1.0, 0.0, 1.0),
        (0.0, 1.0, 1.0),
        (0.0, 1.0, 2.0),
    )
    extra_rows = (
        (0.0, 0.0, 1.0),
        (0.0, 0.0, 2.0),
    )

    # Expected values for the content that is present in the fixtures
    self.test_records = [dict(zip(columns, row)) for row in base_rows]
    self.test_frame = min_max_scale(
        pd.DataFrame.from_records(self.test_records, index=[4, 5, 6, 7, 8])
    )

    # Expected values when the extra content gets added to the fixture content
    self.test_records_extra = [dict(zip(columns, row)) for row in extra_rows]
    self.test_frame_extra = min_max_scale(
        pd.DataFrame.from_records(
            self.test_records + self.test_records_extra,
            index=[4, 5, 6, 7, 8, 9, 10]
        )
    )

    self.features = [
        TestNumericFeaturesFrame.is_dutch,
        TestNumericFeaturesFrame.is_english,
        TestNumericFeaturesFrame.value_number
    ]
    self.frame = NumericFeaturesFrame(
        TestNumericFeaturesFrame.get_identifier,
        self.features,
        self.get_iterator
    )

    # Created after the frame, these only enter a frame through get_extra_iterator
    self.extra_individuals = [
        Individual.objects.create(
            id=identifier,
            properties={
                'country': 'FR',
                'language': 'fr',
                'value': value,
                'word': 'pension'
            },
            community=self.test_fixture.community,
            collective=self.test_fixture
        )
        for identifier, value in ((9, '1'), (10, '2'))
    ]
class TestNumericFeaturesFrame(TestCase):
    """
    Tests NumericFeaturesFrame with content from the "test-organisms" fixtures
    and a few extra Individuals that get created in setUp.
    """

    fixtures = ["test-organisms"]

    def setUp(self):
        """
        Prepares the expected frames, a NumericFeaturesFrame over the fixture content
        and two extra Individuals (ids 9 and 10) that tests can load as additional content.
        """
        super().setUp()
        self.test_fixture = Collective.objects.get(id=2)
        self.test_records = [
            {"is_dutch": 1.0, "is_english": 0.0, "value_number": 1.0},
            {"is_dutch": 1.0, "is_english": 0.0, "value_number": 2.0},
            {"is_dutch": 1.0, "is_english": 0.0, "value_number": 1.0},
            {"is_dutch": 0.0, "is_english": 1.0, "value_number": 1.0},
            {"is_dutch": 0.0, "is_english": 1.0, "value_number": 2.0}
        ]
        test_frame = pd.DataFrame.from_records(self.test_records, index=[4, 5, 6, 7, 8])
        # Expected values are min-max scaled per column; NaN outcomes are replaced by 0
        test_frame = (test_frame - test_frame.min()) / (test_frame.max() - test_frame.min())
        self.test_frame = test_frame.fillna(0)
        self.test_records_extra = [
            {"is_dutch": 0.0, "is_english": 0.0, "value_number": 1.0},
            {"is_dutch": 0.0, "is_english": 0.0, "value_number": 2.0}
        ]
        test_frame_extra = pd.DataFrame.from_records(
            self.test_records + self.test_records_extra,
            index=[4, 5, 6, 7, 8, 9, 10]
        )
        test_frame_extra = (test_frame_extra - test_frame_extra.min()) / \
            (test_frame_extra.max() - test_frame_extra.min())
        self.test_frame_extra = test_frame_extra.fillna(0)
        self.features = [
            TestNumericFeaturesFrame.is_dutch,
            TestNumericFeaturesFrame.is_english,
            TestNumericFeaturesFrame.value_number
        ]
        self.frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            self.features,
            self.get_iterator
        )
        self.extra_individuals = [
            Individual.objects.create(
                id=9,
                properties={
                    'country': 'FR',
                    'language': 'fr',
                    'value': '1',
                    'word': 'pension'
                },
                community=self.test_fixture.community,
                collective=self.test_fixture
            ),
            Individual.objects.create(
                id=10,
                properties={
                    'country': 'FR',
                    'language': 'fr',
                    'value': '2',
                    'word': 'pension'
                },
                community=self.test_fixture.community,
                collective=self.test_fixture
            )
        ]

    @staticmethod
    def get_identifier(test):
        """ Returns the id of the given content """
        return test.id

    def get_iterator(self):
        """
        Returns content that is already in fixtures
        """
        return self.test_fixture.individual_set.filter(id__lt=9).iterator()

    def get_extra_iterator(self):
        """
        Returns content that is created in setUp
        """
        return iter(self.extra_individuals)

    @staticmethod
    def is_dutch(test):
        """ Feature: 1.0 when the language of the content is "nl", otherwise 0.0 """
        return float(test["language"] == "nl")

    @staticmethod
    def is_english(test):
        """ Feature: 1 when the language of the content is "en", otherwise 0 """
        return int(test["language"] == "en")  # NB: features should return floats, but ints are allowed

    @staticmethod
    def value_number(test):
        """ Feature: the "value" of the content as is """
        return test["value"]

    @staticmethod
    def invalid_arguments():
        """ Invalid feature: does not accept the content as an argument """
        return 0.0

    @staticmethod
    def invalid_return(test):
        """ Invalid feature: returns a string instead of a number """
        return "invalid"

    @staticmethod
    def set_language_to_fr(test):
        """ Invalid feature: modifies the content it receives """
        test["language"] = "fr"
        return 0.0

    def test_init(self):
        """ The frame from setUp should hold all features, a content callable and the expected data """
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        self.assertEqual(
            sorted(self.frame.features.keys()),
            sorted_feature_names
        )
        self.assertTrue(callable(self.frame.content))
        assert_frame_equal(self.frame.data, self.test_frame, check_like=True)

    def test_init_invalid_features(self):
        """
        Constructing a frame must fail for a feature with a wrong signature
        and for a feature that returns something other than a float.
        """
        # assertRaises is used because an AssertionError from self.fail inside the try
        # would be caught by "except Exception" and reported as a message mismatch.
        features = [
            TestNumericFeaturesFrame.invalid_arguments
        ]
        with self.assertRaises(Exception, msg="NumericFeaturesFrame did not raise with invalid feature") as raised:
            NumericFeaturesFrame(
                TestNumericFeaturesFrame.get_identifier,
                features,
                self.get_iterator
            )
        self.assertEqual(
            str(raised.exception),
            "invalid_arguments feature: TypeError: invalid_arguments() takes 0 positional arguments but 1 was given"
        )
        features = [
            TestNumericFeaturesFrame.invalid_return
        ]
        with self.assertRaises(
            ValueError,
            msg="NumericFeaturesFrame did not raise with invalid feature return value"
        ) as raised:
            NumericFeaturesFrame(
                TestNumericFeaturesFrame.get_identifier,
                features,
                self.get_iterator
            )
        self.assertEqual(
            str(raised.exception),
            "invalid_return feature did not return float but <class 'str'>"
        )

    def test_init_immutable_content(self):
        """ A feature that modifies content should make the constructor raise a ValueError """
        content = list(self.get_iterator())
        features = [
            TestNumericFeaturesFrame.set_language_to_fr
        ]
        with self.assertRaises(
            ValueError,
            msg="NumericFeaturesFrame did not raise when features modified content"
        ):
            NumericFeaturesFrame(
                self.get_identifier,
                features,
                lambda: content
            )

    def test_init_file(self):
        """ Creating a frame with a file_path should load data through from_disk """
        with patch(
            "core.utils.data.numeric_features.NumericFeaturesFrame.from_disk",
            return_value=self.test_frame
        ) as from_disk_patch:
            frame = NumericFeaturesFrame(
                self.get_identifier,
                self.features,
                file_path="test/path/to/frame.pkl"
            )
            sorted_feature_names = ["is_dutch", "is_english", "value_number"]
            self.assertEqual(
                sorted(frame.features.keys()),
                sorted_feature_names
            )
            from_disk_patch.assert_called_once_with("test/path/to/frame.pkl")

    def test_to_disk(self):
        """ to_disk should pickle the data of the frame to the given path """
        self.frame.data.to_pickle = Mock()
        self.frame.to_disk("test/path/to/frame.pkl")
        self.frame.data.to_pickle.assert_called_once_with('test/path/to/frame.pkl')

    def test_from_disk(self):
        """ from_disk should read the pickle at the given path and use it as data """
        with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=self.test_frame) as pandas_patch:
            self.frame.from_disk("test/path/to/frame.pkl")
            pandas_patch.assert_called_once_with("test/path/to/frame.pkl")
            assert_frame_equal(self.frame.data, self.test_frame, check_like=True)

    def test_from_disk_invalid(self):
        """
        from_disk should raise DSFileLoadError when the columns of the loaded frame
        do not match the features: too many columns, wrong columns or too few columns.
        """
        # Too much data: all features plus an unknown "extra" column
        self.test_frame["extra"] = self.test_frame["is_dutch"]
        with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=self.test_frame) as pandas_patch:
            with self.assertRaises(
                DSFileLoadError,
                msg="NumericFeatureFrame.from_disk did not raise an assertion when loading too much data"
            ):
                self.frame.from_disk("test/path/to/frame.pkl")
            pandas_patch.assert_called_once_with("test/path/to/frame.pkl")

        # Wrong data: the right number of columns, but "extra" takes the place of "is_dutch".
        # DataFrame.drop returns a new frame, so the result has to be assigned to take effect.
        self.test_frame = self.test_frame.drop("is_dutch", axis=1)
        with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=self.test_frame) as pandas_patch:
            with self.assertRaises(
                DSFileLoadError,
                msg="NumericFeatureFrame.from_disk did not raise an assertion when loading wrong data"
            ):
                self.frame.from_disk("test/path/to/frame.pkl")
            pandas_patch.assert_called_once_with("test/path/to/frame.pkl")

        # Too little data: the "is_dutch" feature is missing
        self.test_frame = self.test_frame.drop("extra", axis=1)
        with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=self.test_frame) as pandas_patch:
            with self.assertRaises(
                DSFileLoadError,
                msg="NumericFeatureFrame.from_disk did not raise an assertion when loading too little data"
            ):
                self.frame.from_disk("test/path/to/frame.pkl")
            pandas_patch.assert_called_once_with("test/path/to/frame.pkl")

    def test_adding_features(self):
        """ Loading extra features should give the same result as creating a frame with all features """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.load_features([
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        assert_frame_equal(frame.data, self.test_frame, check_like=True)
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        # Inspect the local frame: self.frame always holds all three features,
        # so asserting on it would pass even when load_features registers nothing.
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_adding_content(self):
        """ Loading extra content should add rows for that content """
        self.frame.load_content(self.get_extra_iterator)
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_adding_content_mixed(self):
        """ Loading a mix of new and updated content (skipped because of a known bug) """
        self.skipTest("Bug: GH-109")
        old = list(self.get_iterator())[-2:]

        def update(ind):
            ind.properties["value"] = int(ind.properties["value"]) * 5
            return ind
        updated = list(map(update, old))
        self.frame.load_content(
            lambda: iter(list(self.get_extra_iterator()) + updated)
        )
        # A single .loc call, because chained indexing may modify a temporary copy
        self.test_frame_extra.loc[[7, 8], "value_number"] *= 5
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_resetting_features_and_content(self):
        """ Resetting features and content should leave only the new features for the new content """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.reset(
            features=[
                TestNumericFeaturesFrame.value_number,
                TestNumericFeaturesFrame.is_english
            ],
            content=self.get_extra_iterator
        )
        self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
        self.test_frame_extra = self.test_frame_extra.drop(labels="is_dutch", axis=1)
        assert_frame_equal(frame.data, self.test_frame_extra, check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_features(self):
        """ Resetting features should replace the features while keeping the content """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.reset(features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
        assert_frame_equal(frame.data, self.test_frame, check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_features_no_content(self):
        """ Resetting features of a frame without content should leave an empty frame """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features
        )
        frame.reset(features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
        # The empty slice keeps the expected columns while dropping all rows
        assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_content(self):
        """ Resetting content should leave only rows for the new content """
        self.frame.reset(content=self.get_extra_iterator)
        self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_resetting_content_no_features(self):
        """ Resetting content without any features should store the content and leave empty data """
        self.frame.features = None
        self.frame.reset(content=self.get_extra_iterator)
        self.assertEqual(self.frame.content.__name__, self.get_extra_iterator.__name__)  # TODO: better equality test
        # The builtin float is what the np.float alias pointed to before NumPy removed it
        assert_frame_equal(self.frame.data, pd.DataFrame(dtype=float), check_like=True)

    def test_clean_params(self):
        """
        clean_params should convert numeric values to floats and skip unknown or non-numeric params.
        It should raise a ValueError when a feature is given with and without the $ prefix.
        """
        test_params = {
            "is_dutch": "1",  # gets converted to float
            "is_french": 1.0,  # gets skipped
            "$is_french": 1.0,  # gets skipped (without errors)
            "value_number": None,  # gets skipped as a non-numeric
            "is_english": "test",  # gets skipped as a non-numeric
            "$value_number": 2.0
        }
        # The outcome should be the same for a str, an int and a float input value
        for function in [str, int, float]:
            test_params["is_dutch"] = function(test_params["is_dutch"])
            cleaned_params = self.frame.clean_params(test_params)
            self.assertEqual(cleaned_params, {"is_dutch": 1.0, "value_number": 2.0})
        test_error_params = {
            "is_dutch": "1",
            "$is_dutch": 1.0,
        }
        with self.assertRaises(ValueError, msg="Clean params should have raised for invalid params"):
            self.frame.clean_params(test_error_params)

    def test_rank_by_params(self):
        """ rank_by_params should return identifiers in the order of their weighted feature values """
        ranking = self.frame.rank_by_params({"is_dutch": 1, "value_number": 1})
        self.assertEqual(ranking, [5, 8, 6, 4, 7])
        ranking = self.frame.rank_by_params({"is_dutch": 0.5, "value_number": -1, "is_english": 2, "is_french": 100})
        self.assertEqual(ranking, [7, 8, 6, 4, 5])

    def test_get_content_hash(self):
        self.skipTest("not tested")

    def test_get_feature_value(self):
        self.skipTest("not tested")

    def test_get_feature_series(self):
        self.skipTest("not tested")
class RankProcessor(QuerySetProcessor):
    """
    Base processor that ranks the content of a query set.

    Subclasses define features as callables on the class. Names listed in
    ``contextual_features`` are excluded from ``get_features``; ``by_feature``
    computes them per call through ``feature_frame.get_feature_series``.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="rank_processor"
    )

    contextual_features = []

    def __init__(self, config):
        """
        Creates the optional feature frame and text frame from configuration.

        ``feature_frame`` is a NumericFeaturesFrame when both "identifier_key" and
        "feature_frame_path" are configured, otherwise None.
        ``text_frame`` is a TextFeaturesFrame when "identifier_key", "text_frame_path"
        and "language" are all configured, otherwise None.

        :param config: configuration that gets passed on to the parent constructor
        """
        super().__init__(config)
        if "identifier_key" in self.config and "feature_frame_path" in self.config:
            self.feature_frame = NumericFeaturesFrame(
                identifier=lambda ind: ind[self.config.identifier_key],
                features=self.get_features(),
                file_path=self.config.feature_frame_path
            )
        else:
            self.feature_frame = None
        if "identifier_key" in self.config and "text_frame_path" in self.config and "language" in self.config:
            self.text_frame = TextFeaturesFrame(
                get_identifier=lambda ind: ind[self.config.identifier_key],
                get_text=self.get_text,
                language=self.config.language,
                file_path=self.config.text_frame_path
            )
        else:
            self.text_frame = None

    @staticmethod
    def get_text(document):
        """
        Should return the text of a document; needs to be implemented by subclasses.

        :raises NotImplementedError: always
        """
        raise NotImplementedError("The get_text method should be implemented in its context")

    @classmethod
    def get_features(cls):
        """
        Returns the callables that ``cls`` defines on top of RankProcessor,
        leaving out the names listed in ``contextual_features``.

        :return: list of feature callables, ordered by attribute name
        """
        mother = set(dir(RankProcessor))
        own = set(dir(cls))
        # Sorting gives a stable feature order; iterating the bare set difference
        # yields an order that can differ between interpreter runs.
        return [
            getattr(cls, attr) for attr in sorted(own - mother)
            if callable(getattr(cls, attr)) and attr not in cls.contextual_features
        ]

    def get_ranking_results(self, ranking, query_set, series):
        """
        Yields the content of the highest ranking individuals with rank information attached.

        Every yielded content gets a "_rank" key that holds the overall "rank"
        and an entry with "rank", "value" and "weight" for each of the given series.

        :param ranking: rank values indexed by identity; accessed through ``.at`` and ``.index``
        :param query_set: the query set to take individuals from
        :param series: iterable of named series with the values that make up the ranking
        :return: generator of at most ``config.result_size`` content dictionaries
        """
        # TODO: assert identity? how?
        max_size = self.config.result_size
        if query_set.count() >= len(ranking):
            # Only fetch the individuals that can end up in the results
            results = list(query_set.filter(identity__in=ranking.index[:max_size]))
        else:
            results = list(query_set)
        results.sort(key=lambda entry: ranking.at[entry.identity], reverse=True)
        results = results[:max_size]
        for individual in results:
            ix = individual[self.config.identifier_key]
            content = individual.content
            content["_rank"] = {
                "rank": ranking.at[ix]
            }
            for serie in series:
                value = serie.at[ix]
                content["_rank"][serie.name] = {
                    "rank": value,  # TODO: rank value should be multiplied by weight
                    "value": value,
                    "weight": 1.0
                }
            yield content

    def default_ranking(self, query_set):
        """
        Should rank the query set in a default manner; needs to be implemented by subclasses.

        :raises NotImplementedError: always
        """
        raise NotImplementedError("The default_ranking method should be implemented in its context")

    def by_feature(self, query_set):
        """
        Ranks the query set by the single feature that is named in ``config.ranking_feature``.

        Regular features are read from the feature frame. Contextual features
        are computed for the query set with the configuration as context.

        :param query_set: the query set to rank
        :return: generator of ranked content (see get_ranking_results)
        :raises AssertionError: when ranking_feature is not configured, the feature frame
            is missing or the feature is unknown
        """
        assert "ranking_feature" in self.config, "RankProcessor.by_feature needs a ranking_feature from config"
        assert self.feature_frame, \
            "RankProcessor needs a identifier_key and feature_frame_path configuration " \
            "to perform RankProcessor.by_feature"
        ranking_feature = self.config.ranking_feature
        assert ranking_feature in self.feature_frame.features or ranking_feature in self.contextual_features, \
            "The non-contextual feature '{}' is not loaded in the feature frame".format(ranking_feature)
        if ranking_feature not in self.contextual_features:
            ranked_feature = self.feature_frame.data[ranking_feature]
        else:
            ranked_feature = self.feature_frame.get_feature_series(
                ranking_feature, getattr(self, ranking_feature),
                content_callable=query_set.iterator, context=self.config.to_dict()
            )
        # Missing values rank as 0 and the highest values come first
        ranked_feature = ranked_feature.fillna(0).sort_values(ascending=False)
        return self.get_ranking_results(ranked_feature, query_set, [ranked_feature])

    def by_params(self, individuals):
        """ Not implemented: does nothing and returns None """
        pass