Exemplo n.º 1
0
 def test_init_invalid_features(self):
     """
     Checks that NumericFeaturesFrame raises when it is constructed with invalid features.

     Two cases are covered: a feature that accepts no arguments and a feature
     that returns a str. The error messages are compared literally.
     """
     # assertRaises is used instead of try/fail/except Exception, because the broad
     # except clause would also catch the AssertionError raised by self.fail.
     # NOTE(review): Python 3.10+ reports the qualified function name in this TypeError text,
     # the expected message may need an update there - confirm against the implementation.
     with self.assertRaises(Exception, msg="NumericFeaturesFrame did not raise with invalid feature") as raised:
         NumericFeaturesFrame(
             TestNumericFeaturesFrame.get_identifier,
             [TestNumericFeaturesFrame.invalid_arguments],
             self.get_iterator
         )
     self.assertEqual(
         str(raised.exception),
         "invalid_arguments feature: TypeError: invalid_arguments() takes 0 positional arguments but 1 was given"
     )

     with self.assertRaises(
         ValueError,
         msg="NumericFeaturesFrame did not raise with invalid feature return value"
     ) as raised:
         NumericFeaturesFrame(
             TestNumericFeaturesFrame.get_identifier,
             [TestNumericFeaturesFrame.invalid_return],
             self.get_iterator
         )
     self.assertEqual(
         str(raised.exception),
         "invalid_return feature did not return float but <class 'str'>"
     )
Exemplo n.º 2
0
 def setUpClass(cls):
     """
     Writes a NumericFeaturesFrame for the test content to ``cls.frame_path``.

     Every name in ``cls.blacklist_features`` is first copied from MockRankProcessor
     onto this class and then replaced by None on MockRankProcessor.
     """
     for feature_name in cls.blacklist_features:
         original_feature = getattr(MockRankProcessor, feature_name)
         setattr(cls, feature_name, original_feature)
         # NOTE(review): presumably None makes get_features() skip this name - confirm against MockRankProcessor
         setattr(MockRankProcessor, feature_name, None)
     allowed_features = MockRankProcessor.get_features()
     frame = NumericFeaturesFrame(
         lambda content: content[cls.identifier_key],
         allowed_features,
         lambda: cls.test_content
     )
     frame.to_disk(cls.frame_path)
Exemplo n.º 3
0
 def test_resetting_features_no_content(self):
     """
     Resets the features of a frame that was constructed without content.

     The frame data should have the columns of the new features and no rows.
     """
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         [TestNumericFeaturesFrame.is_dutch]
     )
     frame.reset(features=[
         TestNumericFeaturesFrame.value_number,
         TestNumericFeaturesFrame.is_english
     ])
     self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
     # The [0:0] slice keeps the expected columns, but drops all rows
     assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12
     self.assertEqual(
         sorted(frame.features.keys()),
         ["is_english", "value_number"]
     )
Exemplo n.º 4
0
 def test_adding_features(self):
     """
     Loads two more features into a frame that started with a single feature.

     Afterwards the data and the feature names of that frame should cover all three features.
     """
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         [TestNumericFeaturesFrame.is_dutch],
         self.get_iterator
     )
     frame.load_features([
         TestNumericFeaturesFrame.value_number,
         TestNumericFeaturesFrame.is_english
     ])
     assert_frame_equal(frame.data, self.test_frame, check_like=True)
     # Inspect the frame that received the features. self.frame from setUp always
     # holds all three features and would make this assertion pass regardless.
     self.assertEqual(
         sorted(frame.features.keys()),
         ["is_dutch", "is_english", "value_number"]
     )
Exemplo n.º 5
0
 def __init__(self, config):
     """
     Creates the feature frame and the text frame when the configuration allows it.

     Each frame is only created when all configuration keys it needs are present,
     otherwise the corresponding attribute is set to None.
     """
     super().__init__(config)

     numeric_keys = ("identifier_key", "feature_frame_path")
     self.feature_frame = NumericFeaturesFrame(
         identifier=lambda individual: individual[self.config.identifier_key],
         features=self.get_features(),
         file_path=self.config.feature_frame_path
     ) if all(key in self.config for key in numeric_keys) else None

     text_keys = ("identifier_key", "text_frame_path", "language")
     self.text_frame = TextFeaturesFrame(
         get_identifier=lambda individual: individual[self.config.identifier_key],
         get_text=self.get_text,
         language=self.config.language,
         file_path=self.config.text_frame_path
     ) if all(key in self.config for key in text_keys) else None
Exemplo n.º 6
0
 def test_resetting_features_no_content(self):
     """
     Resets the features of a frame that was constructed without content.

     The frame data should have the columns of the new features and no rows.
     """
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         [TestNumericFeaturesFrame.is_dutch]
     )
     frame.reset(features=[
         TestNumericFeaturesFrame.value_number,
         TestNumericFeaturesFrame.is_english
     ])
     self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
     # The [0:0] slice keeps the expected columns, but drops all rows
     assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12
     self.assertEqual(
         sorted(frame.features.keys()),
         ["is_english", "value_number"]
     )
Exemplo n.º 7
0
 def test_adding_features(self):
     """
     Loads two more features into a frame that started with a single feature.

     Afterwards the data and the feature names of that frame should cover all three features.
     """
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         [TestNumericFeaturesFrame.is_dutch],
         self.get_iterator
     )
     frame.load_features([
         TestNumericFeaturesFrame.value_number,
         TestNumericFeaturesFrame.is_english
     ])
     assert_frame_equal(frame.data, self.test_frame, check_like=True)
     # Inspect the frame that received the features. self.frame from setUp always
     # holds all three features and would make this assertion pass regardless.
     self.assertEqual(
         sorted(frame.features.keys()),
         ["is_dutch", "is_english", "value_number"]
     )
Exemplo n.º 8
0
 def test_init_file(self):
     """
     Constructs a frame from a file path while NumericFeaturesFrame.from_disk is patched.

     Checks the registered feature names and that from_disk received the given path.
     """
     frame_path = "test/path/to/frame.pkl"
     from_disk_target = "core.utils.data.numeric_features.NumericFeaturesFrame.from_disk"
     with patch(from_disk_target, return_value=self.test_frame) as from_disk_patch:
         frame = NumericFeaturesFrame(
             self.get_identifier,
             self.features,
             file_path=frame_path
         )
         # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12
         self.assertEqual(
             sorted(frame.features.keys()),
             ["is_dutch", "is_english", "value_number"]
         )
         from_disk_patch.assert_called_once_with(frame_path)
Exemplo n.º 9
0
 def test_init_immutable_content(self):
     """
     Expects a ValueError when a feature modifies the content it receives.
     """
     # Materialise the content once, the content callable hands out these same objects
     individuals = list(self.get_iterator())
     mutating_features = [TestNumericFeaturesFrame.set_language_to_fr]
     try:
         NumericFeaturesFrame(self.get_identifier, mutating_features, lambda: individuals)
     except ValueError:
         return
     self.fail("NumericFeaturesFrame did not raise when features modified content")
Exemplo n.º 10
0
 def test_resetting_features_and_content(self):
     """
     Resets both the features and the content of a frame.

     Only the extra individuals (rows 9 and 10) and the two new features should remain.
     """
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         [TestNumericFeaturesFrame.is_dutch],
         self.get_iterator
     )
     frame.reset(
         features=[
             TestNumericFeaturesFrame.value_number,
             TestNumericFeaturesFrame.is_english
         ],
         content=self.get_extra_iterator
     )
     # Remove the fixture rows and the feature that was replaced from the expectation
     self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
     self.test_frame_extra = self.test_frame_extra.drop(labels="is_dutch", axis=1)
     assert_frame_equal(frame.data, self.test_frame_extra, check_like=True)
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12
     self.assertEqual(
         sorted(frame.features.keys()),
         ["is_english", "value_number"]
     )
Exemplo n.º 11
0
 def test_resetting_features_and_content(self):
     """
     Resets both the features and the content of a frame.

     Only the extra individuals (rows 9 and 10) and the two new features should remain.
     """
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         [TestNumericFeaturesFrame.is_dutch],
         self.get_iterator
     )
     frame.reset(
         features=[
             TestNumericFeaturesFrame.value_number,
             TestNumericFeaturesFrame.is_english
         ],
         content=self.get_extra_iterator
     )
     # Remove the fixture rows and the feature that was replaced from the expectation
     self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
     self.test_frame_extra = self.test_frame_extra.drop(labels="is_dutch", axis=1)
     assert_frame_equal(frame.data, self.test_frame_extra, check_like=True)
     # assertEqual replaces the deprecated assertEquals alias, which no longer exists in Python 3.12
     self.assertEqual(
         sorted(frame.features.keys()),
         ["is_english", "value_number"]
     )
Exemplo n.º 12
0
 def setUp(self):
     """
     Prepares the expected frames, the frame under test and two extra individuals.

     The expected frames are min-max scaled per column and any NaN that
     the division produces is replaced by 0.
     """
     super().setUp()
     self.test_fixture = Collective.objects.get(id=2)

     def scale(raw_frame):
         # Min-max scaling per column, followed by NaN replacement
         lowest, highest = raw_frame.min(), raw_frame.max()
         return ((raw_frame - lowest) / (highest - lowest)).fillna(0)

     column_names = ("is_dutch", "is_english", "value_number")
     self.test_records = [
         dict(zip(column_names, values))
         for values in (
             (1.0, 0.0, 1.0),
             (1.0, 0.0, 2.0),
             (1.0, 0.0, 1.0),
             (0.0, 1.0, 1.0),
             (0.0, 1.0, 2.0),
         )
     ]
     self.test_frame = scale(
         pd.DataFrame.from_records(self.test_records, index=[4, 5, 6, 7, 8])
     )
     self.test_records_extra = [
         dict(zip(column_names, values))
         for values in (
             (0.0, 0.0, 1.0),
             (0.0, 0.0, 2.0),
         )
     ]
     self.test_frame_extra = scale(
         pd.DataFrame.from_records(
             self.test_records + self.test_records_extra,
             index=[4, 5, 6, 7, 8, 9, 10]
         )
     )
     self.features = [
         TestNumericFeaturesFrame.is_dutch,
         TestNumericFeaturesFrame.is_english,
         TestNumericFeaturesFrame.value_number
     ]
     # The frame is built before the extra individuals exist in the database
     self.frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         self.features,
         self.get_iterator
     )
     self.extra_individuals = [
         Individual.objects.create(
             id=identifier,
             properties={
                 'country': 'FR',
                 'language': 'fr',
                 'value': value,
                 'word': 'pension'
             },
             community=self.test_fixture.community,
             collective=self.test_fixture
         )
         for identifier, value in ((9, '1'), (10, '2'))
     ]
Exemplo n.º 13
0
class TestNumericFeaturesFrame(TestCase):
    """
    Tests for NumericFeaturesFrame: construction, loading from and saving to disk,
    adding and resetting features and content, parameter cleaning and ranking.

    Expected frames are built from plain records and min-max scaled per column.
    """

    fixtures = ["test-organisms"]

    def setUp(self):
        """
        Prepares the expected frames, the frame under test and two extra individuals.
        """
        super().setUp()
        self.test_fixture = Collective.objects.get(id=2)
        self.test_records = [
            {
                "is_dutch": 1.0,
                "is_english": 0.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 1.0,
                "is_english": 0.0,
                "value_number": 2.0
            },
            {
                "is_dutch": 1.0,
                "is_english": 0.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 0.0,
                "is_english": 1.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 0.0,
                "is_english": 1.0,
                "value_number": 2.0
            }
        ]
        test_frame = pd.DataFrame.from_records(self.test_records, index=[4, 5, 6, 7, 8])
        # Min-max scaling per column, any NaN from the division becomes 0
        test_frame = (test_frame - test_frame.min()) / (test_frame.max() - test_frame.min())
        self.test_frame = test_frame.fillna(0)
        self.test_records_extra = [
            {
                "is_dutch": 0.0,
                "is_english": 0.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 0.0,
                "is_english": 0.0,
                "value_number": 2.0
            }
        ]
        test_frame_extra = pd.DataFrame.from_records(self.test_records + self.test_records_extra,
                                                     index=[4, 5, 6, 7, 8, 9, 10])
        test_frame_extra = (test_frame_extra - test_frame_extra.min()) / \
                           (test_frame_extra.max() - test_frame_extra.min())
        self.test_frame_extra = test_frame_extra.fillna(0)
        self.features = [
            TestNumericFeaturesFrame.is_dutch,
            TestNumericFeaturesFrame.is_english,
            TestNumericFeaturesFrame.value_number
        ]
        # The frame is built before the extra individuals exist in the database
        self.frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            self.features,
            self.get_iterator
        )
        self.extra_individuals = [
            Individual.objects.create(
                id=9,
                properties={
                    'country': 'FR',
                    'language': 'fr',
                    'value': '1',
                    'word': 'pension'
                },
                community=self.test_fixture.community,
                collective=self.test_fixture
            ),
            Individual.objects.create(
                id=10,
                properties={
                    'country': 'FR',
                    'language': 'fr',
                    'value': '2',
                    'word': 'pension'
                },
                community=self.test_fixture.community,
                collective=self.test_fixture
            )
        ]

    @staticmethod
    def get_identifier(test):
        """
        Returns the id of the given content, used as identifier in the frame
        """
        return test.id

    def get_iterator(self):
        """
        Returns content that is already in fixtures
        """
        return self.test_fixture.individual_set.filter(id__lt=9).iterator()

    def get_extra_iterator(self):
        """
        Returns content that is created in setUp
        """
        return iter(self.extra_individuals)

    @staticmethod
    def is_dutch(test):
        """
        Feature: 1.0 when the language of the content is "nl", 0.0 otherwise
        """
        return float(test["language"] == "nl")

    @staticmethod
    def is_english(test):
        """
        Feature: 1 when the language of the content is "en", 0 otherwise
        """
        return int(test["language"] == "en")  # NB: features should return floats, but ints are allowed

    @staticmethod
    def value_number(test):
        """
        Feature: the "value" of the content, returned as stored
        """
        return test["value"]

    @staticmethod
    def invalid_arguments():
        """
        Invalid feature: does not accept the content argument
        """
        return 0.0

    @staticmethod
    def invalid_return(test):
        """
        Invalid feature: returns a str
        """
        return "invalid"

    @staticmethod
    def set_language_to_fr(test):
        """
        Invalid feature: modifies the content it receives
        """
        test["language"] = "fr"
        return 0.0

    def test_init(self):
        """
        Checks the features, content and data of the frame that setUp created
        """
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        self.assertEqual(
            sorted(self.frame.features.keys()),
            sorted_feature_names
        )
        self.assertTrue(callable(self.frame.content))
        assert_frame_equal(self.frame.data, self.test_frame, check_like=True)

    def test_init_invalid_features(self):
        """
        Checks that construction raises for a feature without arguments and for a feature returning a str
        """
        # assertRaises is used instead of try/fail/except Exception, because the broad
        # except clause would also catch the AssertionError raised by self.fail.
        # NOTE(review): Python 3.10+ reports the qualified function name in this TypeError text,
        # the expected message may need an update there - confirm against the implementation.
        with self.assertRaises(Exception, msg="NumericFeaturesFrame did not raise with invalid feature") as raised:
            NumericFeaturesFrame(
                TestNumericFeaturesFrame.get_identifier,
                [TestNumericFeaturesFrame.invalid_arguments],
                self.get_iterator
            )
        self.assertEqual(
            str(raised.exception),
            "invalid_arguments feature: TypeError: invalid_arguments() takes 0 positional arguments but 1 was given"
        )

        with self.assertRaises(
            ValueError,
            msg="NumericFeaturesFrame did not raise with invalid feature return value"
        ) as raised:
            NumericFeaturesFrame(
                TestNumericFeaturesFrame.get_identifier,
                [TestNumericFeaturesFrame.invalid_return],
                self.get_iterator
            )
        self.assertEqual(
            str(raised.exception),
            "invalid_return feature did not return float but <class 'str'>"
        )

    def test_init_immutable_content(self):
        """
        Expects a ValueError when a feature modifies the content it receives
        """
        content = list(self.get_iterator())
        features = [
            TestNumericFeaturesFrame.set_language_to_fr
        ]
        with self.assertRaises(
            ValueError,
            msg="NumericFeaturesFrame did not raise when features modified content"
        ):
            NumericFeaturesFrame(
                self.get_identifier,
                features,
                lambda: content
            )

    def test_init_file(self):
        """
        Constructs a frame from a file path while from_disk is patched
        """
        frame_path = "test/path/to/frame.pkl"
        from_disk_target = "core.utils.data.numeric_features.NumericFeaturesFrame.from_disk"
        with patch(from_disk_target, return_value=self.test_frame) as from_disk_patch:
            frame = NumericFeaturesFrame(
                self.get_identifier,
                self.features,
                file_path=frame_path
            )
            sorted_feature_names = ["is_dutch", "is_english", "value_number"]
            self.assertEqual(
                sorted(frame.features.keys()),
                sorted_feature_names
            )
            from_disk_patch.assert_called_once_with(frame_path)

    def test_to_disk(self):
        """
        Checks that to_disk pickles the frame data to the given path
        """
        self.frame.data.to_pickle = Mock()
        self.frame.to_disk("test/path/to/frame.pkl")
        self.frame.data.to_pickle.assert_called_once_with('test/path/to/frame.pkl')

    def test_from_disk(self):
        """
        Checks that from_disk reads the pickle at the given path into the frame data
        """
        with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=self.test_frame) as pandas_patch:
            self.frame.from_disk("test/path/to/frame.pkl")
            pandas_patch.assert_called_once_with("test/path/to/frame.pkl")
            assert_frame_equal(self.frame.data, self.test_frame, check_like=True)

    def test_from_disk_invalid(self):
        """
        Expects DSFileLoadError when the loaded frame has too many, wrong or too few columns
        """
        frame_path = "test/path/to/frame.pkl"
        read_pickle_target = "core.utils.data.numeric_features.pd.read_pickle"

        # Too much data: all feature columns plus an unknown column
        self.test_frame["extra"] = self.test_frame["is_dutch"]
        with patch(read_pickle_target, return_value=self.test_frame) as pandas_patch:
            with self.assertRaises(
                DSFileLoadError,
                msg="NumericFeatureFrame.from_disk did not raise an assertion when loading too much data"
            ):
                self.frame.from_disk(frame_path)
            pandas_patch.assert_called_once_with(frame_path)

        # Wrong data: a feature column is missing while the unknown column remains.
        # DataFrame.drop returns a new frame, so its result has to be assigned for the scenario to differ.
        self.test_frame = self.test_frame.drop("is_dutch", axis=1)
        with patch(read_pickle_target, return_value=self.test_frame) as pandas_patch:
            with self.assertRaises(
                DSFileLoadError,
                msg="NumericFeatureFrame.from_disk did not raise an assertion when loading wrong data"
            ):
                self.frame.from_disk(frame_path)
            pandas_patch.assert_called_once_with(frame_path)

        # Too little data: only a subset of the feature columns
        self.test_frame = self.test_frame.drop("extra", axis=1)
        with patch(read_pickle_target, return_value=self.test_frame) as pandas_patch:
            with self.assertRaises(
                DSFileLoadError,
                msg="NumericFeatureFrame.from_disk did not raise an assertion when loading too little data"
            ):
                self.frame.from_disk(frame_path)
            pandas_patch.assert_called_once_with(frame_path)

    def test_adding_features(self):
        """
        Loads two more features into a frame that started with a single feature
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.load_features([
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        assert_frame_equal(frame.data, self.test_frame, check_like=True)
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        # Inspect the frame that received the features. self.frame from setUp always
        # holds all three features and would make this assertion pass regardless.
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_adding_content(self):
        """
        Loads the extra individuals into the frame that already holds the fixture content
        """
        self.frame.load_content(self.get_extra_iterator)
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_adding_content_mixed(self):
        """
        Loads a mix of new and updated content. Skipped because of a known bug.
        """
        self.skipTest("Bug: GH-109")
        old = list(self.get_iterator())[-2:]

        def update(ind):
            ind.properties["value"] = int(ind.properties["value"]) * 5
            return ind

        updated = list(map(update, old))
        self.frame.load_content(
            lambda: iter(list(self.get_extra_iterator()) + updated)
        )
        # A single .loc call avoids chained assignment, which may write to a temporary copy
        self.test_frame_extra.loc[[7, 8], "value_number"] *= 5
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_resetting_features_and_content(self):
        """
        Resets both features and content, only the extra rows and the new features should remain
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.reset(
            features=[
                TestNumericFeaturesFrame.value_number,
                TestNumericFeaturesFrame.is_english
            ],
            content=self.get_extra_iterator
        )
        self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
        self.test_frame_extra = self.test_frame_extra.drop(labels="is_dutch", axis=1)
        assert_frame_equal(frame.data, self.test_frame_extra, check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_features(self):
        """
        Resets only the features, the content of the frame stays the same
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.reset(features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
        assert_frame_equal(frame.data, self.test_frame, check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_features_no_content(self):
        """
        Resets the features of a frame without content, the data should have columns but no rows
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features
        )
        frame.reset(features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
        # The [0:0] slice keeps the expected columns, but drops all rows
        assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_content(self):
        """
        Resets only the content, the frame should hold just the extra individuals
        """
        self.frame.reset(content=self.get_extra_iterator)
        self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_resetting_content_no_features(self):
        """
        Resets the content of a frame without features, the data should be an empty float frame
        """
        self.frame.features = None
        self.frame.reset(content=self.get_extra_iterator)
        self.assertEqual(self.frame.content.__name__, self.get_extra_iterator.__name__)  # TODO: better equality test
        # The builtin float replaces np.float, an alias for it that NumPy 1.24 removed
        assert_frame_equal(self.frame.data, pd.DataFrame(dtype=float), check_like=True)

    def test_clean_params(self):
        """
        Checks which parameters clean_params keeps and that conflicting parameters raise a ValueError
        """
        test_params = {
            "is_dutch": "1",  # gets converted to float
            "is_french": 1.0,  # gets skipped
            "$is_french": 1.0,  # gets skipped (without errors)
            "value_number": None,  # gets skipped as a non-numeric
            "is_english": "test",  # gets skipped as a non-numeric
            "$value_number": 2.0
        }
        # The same outcome is expected for str, int and float input
        for function in [str, int, float]:
            test_params["is_dutch"] = function(test_params["is_dutch"])
            cleaned_params = self.frame.clean_params(test_params)
            self.assertEqual(cleaned_params, {"is_dutch": 1.0, "value_number": 2.0})

        test_error_params = {
            "is_dutch": "1",
            "$is_dutch": 1.0,
        }
        with self.assertRaises(ValueError, msg="Clean params should have raised for invalid params"):
            self.frame.clean_params(test_error_params)

    def test_rank_by_params(self):
        """
        Checks the identifier order that rank_by_params returns for two sets of parameters
        """
        ranking = self.frame.rank_by_params({"is_dutch": 1, "value_number": 1})
        self.assertEqual(ranking, [5, 8, 6, 4, 7])
        ranking = self.frame.rank_by_params({"is_dutch": 0.5, "value_number": -1, "is_english": 2, "is_french": 100})
        self.assertEqual(ranking, [7, 8, 6, 4, 5])

    def test_get_content_hash(self):
        self.skipTest("not tested")

    def test_get_feature_value(self):
        self.skipTest("not tested")

    def test_get_feature_series(self):
        self.skipTest("not tested")
Exemplo n.º 14
0
class RankProcessor(QuerySetProcessor):
    """
    Processor that ranks the content of a query set with the help of feature frames.

    Features are the callables that a subclass adds on top of RankProcessor, see get_features.
    Names listed in ``contextual_features`` are not loaded into the feature frame.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="rank_processor"
    )

    # Names of features that get_features leaves out and by_feature computes on demand
    contextual_features = []

    def __init__(self, config):
        """
        Creates the feature frame and the text frame when the configuration allows it.

        Each frame is only created when all configuration keys it needs are present,
        otherwise the corresponding attribute is set to None.
        """
        super().__init__(config)
        if "identifier_key" in self.config and "feature_frame_path" in self.config:
            self.feature_frame = NumericFeaturesFrame(
                identifier=lambda ind: ind[self.config.identifier_key],
                features=self.get_features(),
                file_path=self.config.feature_frame_path
            )
        else:
            self.feature_frame = None
        if "identifier_key" in self.config and "text_frame_path" in self.config and "language" in self.config:
            self.text_frame = TextFeaturesFrame(
                get_identifier=lambda ind: ind[self.config.identifier_key],
                get_text=self.get_text,
                language=self.config.language,
                file_path=self.config.text_frame_path
            )
        else:
            self.text_frame = None

    @staticmethod
    def get_text(document):
        """
        Should return the text of a document, subclasses have to implement this
        """
        raise NotImplementedError("The get_text method should be implemented in its context")

    @classmethod
    def get_features(cls):
        """
        Returns the callables that cls defines on top of RankProcessor,
        except for the ones listed in contextual_features.

        The features are ordered by attribute name. Iterating the set of names directly
        would yield an order that can differ between interpreter runs.
        """
        inherited_names = set(dir(RankProcessor))
        own_names = sorted(set(dir(cls)) - inherited_names)
        features = []
        for name in own_names:
            candidate = getattr(cls, name)
            if callable(candidate) and name not in cls.contextual_features:
                features.append(candidate)
        return features

    def get_ranking_results(self, ranking, query_set, series):
        """
        Yields the content of the highest ranked individuals, annotated with rank information.

        At most config.result_size results are yielded. Each content gets a "_rank" key with
        the overall rank and the value of every serie for that individual.

        NOTE(review): ranking and series are presumably pandas Series indexed by identifier,
        they are accessed through .index, .at and .name - confirm against callers.
        """

        # TODO: assert identity? how?
        max_size = self.config.result_size

        # Only filter in the database when the query set is at least as large as the ranking
        if query_set.count() >= len(ranking):
            results = list(query_set.filter(identity__in=ranking.index[:max_size]))
        else:
            results = list(query_set)
        results.sort(key=lambda entry: ranking.at[entry.identity], reverse=True)
        results = results[:max_size]

        for individual in results:
            ix = individual[self.config.identifier_key]
            content = individual.content
            content["_rank"] = {
                "rank": ranking.at[ix]
            }
            for serie in series:
                value = serie.at[ix]
                content["_rank"][serie.name] = {
                    "rank": value,  # TODO: rank value should be multiplied by weight
                    "value": value,
                    "weight": 1.0
                }
            yield content

    def default_ranking(self, query_set):
        """
        Should rank the query set when no specific ranking applies, subclasses have to implement this
        """
        raise NotImplementedError("The default_ranking method should be implemented in its context")

    def by_feature(self, query_set):
        """
        Ranks the query set by the single feature that config.ranking_feature names.

        Regular features are read from the feature frame. Contextual features are computed
        from the query set with the configuration as context. Missing values count as 0.
        Raises an AssertionError when the configuration or the feature frame is insufficient.
        """
        assert "ranking_feature" in self.config, "RankProcessor.by_feature needs a ranking_feature from config"
        assert self.feature_frame, \
            "RankProcessor needs a identifier_key and feature_frame_path configuration " \
            "to perform RankProcessor.by_feature"
        ranking_feature = self.config.ranking_feature
        assert ranking_feature in self.feature_frame.features or ranking_feature in self.contextual_features, \
            "The non-contextual feature '{}' is not loaded in the feature frame".format(ranking_feature)
        if ranking_feature not in self.contextual_features:
            ranked_feature = self.feature_frame.data[ranking_feature]
        else:
            ranked_feature = self.feature_frame.get_feature_series(
                ranking_feature, getattr(self, ranking_feature),
                content_callable=query_set.iterator, context=self.config.to_dict()
            )
        ranked_feature = ranked_feature.fillna(0).sort_values(ascending=False)
        return self.get_ranking_results(ranked_feature, query_set, [ranked_feature])

    def by_params(self, individuals):
        """
        Placeholder without implementation, returns None
        """
        pass
Exemplo n.º 15
0
 def setUp(self):
     """
     Prepares the expected frames, the frame under test and two extra individuals.

     The expected frames are min-max scaled per column and any NaN that
     the division produces is replaced by 0.
     """
     super().setUp()
     self.test_fixture = Collective.objects.get(id=2)

     def scale(raw_frame):
         # Min-max scaling per column, followed by NaN replacement
         lowest, highest = raw_frame.min(), raw_frame.max()
         return ((raw_frame - lowest) / (highest - lowest)).fillna(0)

     column_names = ("is_dutch", "is_english", "value_number")
     self.test_records = [
         dict(zip(column_names, values))
         for values in (
             (1.0, 0.0, 1.0),
             (1.0, 0.0, 2.0),
             (1.0, 0.0, 1.0),
             (0.0, 1.0, 1.0),
             (0.0, 1.0, 2.0),
         )
     ]
     self.test_frame = scale(
         pd.DataFrame.from_records(self.test_records, index=[4, 5, 6, 7, 8])
     )
     self.test_records_extra = [
         dict(zip(column_names, values))
         for values in (
             (0.0, 0.0, 1.0),
             (0.0, 0.0, 2.0),
         )
     ]
     self.test_frame_extra = scale(
         pd.DataFrame.from_records(
             self.test_records + self.test_records_extra,
             index=[4, 5, 6, 7, 8, 9, 10]
         )
     )
     self.features = [
         TestNumericFeaturesFrame.is_dutch,
         TestNumericFeaturesFrame.is_english,
         TestNumericFeaturesFrame.value_number
     ]
     # The frame is built before the extra individuals exist in the database
     self.frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         self.features,
         self.get_iterator
     )
     self.extra_individuals = [
         Individual.objects.create(
             id=identifier,
             properties={
                 'country': 'FR',
                 'language': 'fr',
                 'value': value,
                 'word': 'pension'
             },
             community=self.test_fixture.community,
             collective=self.test_fixture
         )
         for identifier, value in ((9, '1'), (10, '2'))
     ]
Exemplo n.º 16
0
class TestNumericFeaturesFrame(TestCase):
    """
    Tests NumericFeaturesFrame against the Individuals of the Collective with id 2 from the "test-organisms" fixture.

    The static methods on this class double as feature callables (and as the identifier callable)
    that get passed into NumericFeaturesFrame.
    The expected frames built in setUp are min-max scaled, because that is what frame.data gets compared to.
    """

    fixtures = ["test-organisms"]

    @staticmethod
    def _min_max_scale(frame):
        """
        Returns a copy of the given DataFrame where every column is min-max scaled.
        Any NaN in the result (for instance from the 0/0 division of a constant column) is replaced by 0.
        """
        scaled = (frame - frame.min()) / (frame.max() - frame.min())
        return scaled.fillna(0)

    def setUp(self):
        """
        Builds the expected frames, the frame under test and two extra Individuals (ids 9 and 10)
        """
        super().setUp()
        self.test_fixture = Collective.objects.get(id=2)
        # Expected raw feature values for the fixture content, indexed below with ids 4 to 8
        self.test_records = [
            {
                "is_dutch": 1.0,
                "is_english": 0.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 1.0,
                "is_english": 0.0,
                "value_number": 2.0
            },
            {
                "is_dutch": 1.0,
                "is_english": 0.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 0.0,
                "is_english": 1.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 0.0,
                "is_english": 1.0,
                "value_number": 2.0
            }
        ]
        self.test_frame = self._min_max_scale(
            pd.DataFrame.from_records(self.test_records, index=[4, 5, 6, 7, 8])
        )
        # Expected raw feature values for the Individuals created at the end of this method
        self.test_records_extra = [
            {
                "is_dutch": 0.0,
                "is_english": 0.0,
                "value_number": 1.0
            },
            {
                "is_dutch": 0.0,
                "is_english": 0.0,
                "value_number": 2.0
            }
        ]
        self.test_frame_extra = self._min_max_scale(
            pd.DataFrame.from_records(
                self.test_records + self.test_records_extra,
                index=[4, 5, 6, 7, 8, 9, 10]
            )
        )
        self.features = [
            TestNumericFeaturesFrame.is_dutch,
            TestNumericFeaturesFrame.is_english,
            TestNumericFeaturesFrame.value_number
        ]
        self.frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            self.features,
            self.get_iterator
        )
        # These are created after self.frame and get_iterator excludes ids of 9 and up,
        # so they are only available through get_extra_iterator
        self.extra_individuals = [
            Individual.objects.create(
                id=identifier,
                properties={
                    'country': 'FR',
                    'language': 'fr',
                    'value': value,
                    'word': 'pension'
                },
                community=self.test_fixture.community,
                collective=self.test_fixture
            )
            for identifier, value in ((9, '1'), (10, '2'))
        ]

    @staticmethod
    def get_identifier(test):
        """
        Identifier callable: returns the id attribute of the given content
        """
        return test.id

    def get_iterator(self):
        """
        Returns content that is already in fixtures
        """
        return self.test_fixture.individual_set.filter(id__lt=9).iterator()

    def get_extra_iterator(self):
        """
        Returns content that is created in setUp
        """
        return iter(self.extra_individuals)

    @staticmethod
    def is_dutch(test):
        """
        Feature: 1.0 when the language of the content is "nl" and 0.0 otherwise
        """
        return float(test["language"] == "nl")

    @staticmethod
    def is_english(test):
        """
        Feature: 1 when the language of the content is "en" and 0 otherwise
        """
        return int(test["language"] == "en")  # NB: features should return floats, but ints are allowed

    @staticmethod
    def value_number(test):
        """
        Feature: returns the "value" of the content as is
        """
        return test["value"]

    @staticmethod
    def invalid_arguments():
        """
        Invalid feature: does not accept the content argument
        """
        return 0.0

    @staticmethod
    def invalid_return(test):
        """
        Invalid feature: returns a str instead of a number
        """
        return "invalid"

    @staticmethod
    def set_language_to_fr(test):
        """
        Invalid feature: modifies the content it receives
        """
        test["language"] = "fr"
        return 0.0

    def test_init(self):
        """
        A frame created with features and content exposes the features by name and holds the expected data
        """
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        self.assertEqual(
            sorted(self.frame.features.keys()),
            sorted_feature_names
        )
        self.assertTrue(callable(self.frame.content))
        assert_frame_equal(self.frame.data, self.test_frame, check_like=True)

    def test_init_invalid_features(self):
        """
        Creating a frame raises when a feature has a wrong signature or returns a non-numeric value
        """
        features = [
            TestNumericFeaturesFrame.invalid_arguments
        ]
        # assertRaises keeps a missing exception separate from a wrong message.
        # The exception type is deliberately broad, because only the message is specified here.
        with self.assertRaises(Exception, msg="NumericFeaturesFrame did not raise with invalid feature") as raised:
            NumericFeaturesFrame(
                TestNumericFeaturesFrame.get_identifier,
                features,
                self.get_iterator
            )
        self.assertEqual(
            str(raised.exception),
            "invalid_arguments feature: TypeError: invalid_arguments() takes 0 positional arguments but 1 was given"
        )
        features = [
            TestNumericFeaturesFrame.invalid_return
        ]
        with self.assertRaises(
            ValueError,
            msg="NumericFeaturesFrame did not raise with invalid feature return value"
        ) as raised:
            NumericFeaturesFrame(
                TestNumericFeaturesFrame.get_identifier,
                features,
                self.get_iterator
            )
        self.assertEqual(
            str(raised.exception),
            "invalid_return feature did not return float but <class 'str'>"
        )

    def test_init_immutable_content(self):
        """
        Creating a frame raises a ValueError when a feature modifies the content
        """
        content = list(self.get_iterator())
        features = [
            TestNumericFeaturesFrame.set_language_to_fr
        ]
        with self.assertRaises(ValueError, msg="NumericFeaturesFrame did not raise when features modified content"):
            NumericFeaturesFrame(
                self.get_identifier,
                features,
                lambda: content
            )

    def test_init_file(self):
        """
        A frame created with a file_path loads through from_disk and still registers the features
        """
        with patch("core.utils.data.numeric_features.NumericFeaturesFrame.from_disk", return_value=self.test_frame) as \
                from_disk_patch:
            frame = NumericFeaturesFrame(
                self.get_identifier,
                self.features,
                file_path="test/path/to/frame.pkl"
            )
            sorted_feature_names = ["is_dutch", "is_english", "value_number"]
            self.assertEqual(
                sorted(frame.features.keys()),
                sorted_feature_names
            )
            from_disk_patch.assert_called_once_with("test/path/to/frame.pkl")

    def test_to_disk(self):
        """
        to_disk pickles the data to the given path
        """
        self.frame.data.to_pickle = Mock()
        self.frame.to_disk("test/path/to/frame.pkl")
        self.frame.data.to_pickle.assert_called_once_with('test/path/to/frame.pkl')

    def test_from_disk(self):
        """
        from_disk reads the pickle at the given path and uses it as data
        """
        with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=self.test_frame) as pandas_patch:
            self.frame.from_disk("test/path/to/frame.pkl")
            pandas_patch.assert_called_once_with("test/path/to/frame.pkl")
            assert_frame_equal(self.frame.data, self.test_frame, check_like=True)

    def test_from_disk_invalid(self):
        """
        from_disk raises DSFileLoadError when the columns of the loaded frame do not match the features
        """
        file_path = "test/path/to/frame.pkl"
        # DataFrame.drop returns a new frame, so every invalid variation is kept in its own variable
        too_much_data = self.test_frame.copy()
        too_much_data["extra"] = too_much_data["is_dutch"]
        wrong_data = too_much_data.drop("is_dutch", axis=1)
        too_little_data = wrong_data.drop("extra", axis=1)
        invalid_cases = [
            (
                too_much_data,
                "NumericFeatureFrame.from_disk did not raise an assertion when loading too much data"
            ),
            (
                wrong_data,
                "NumericFeatureFrame.from_disk did not raise an assertion when loading wrong data"
            ),
            (
                too_little_data,
                "NumericFeatureFrame.from_disk did not raise an assertion when loading too little data"
            ),
        ]
        for invalid_frame, failure_message in invalid_cases:
            with patch("core.utils.data.numeric_features.pd.read_pickle", return_value=invalid_frame) as pandas_patch:
                with self.assertRaises(DSFileLoadError, msg=failure_message):
                    self.frame.from_disk(file_path)
                pandas_patch.assert_called_once_with(file_path)

    def test_adding_features(self):
        """
        load_features adds features to a frame that already has a feature and content
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.load_features([
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        assert_frame_equal(frame.data, self.test_frame, check_like=True)
        sorted_feature_names = ["is_dutch", "is_english", "value_number"]
        # Checks the frame that got features added instead of the frame from setUp
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_adding_content(self):
        """
        load_content appends rows for the extra content to the existing data
        """
        self.frame.load_content(self.get_extra_iterator)
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_adding_content_mixed(self):
        """
        load_content with a mix of new and updated content (skipped until GH-109 is fixed)
        """
        self.skipTest("Bug: GH-109")
        old = list(self.get_iterator())[-2:]

        def update(ind):
            ind.properties["value"] = int(ind.properties["value"]) * 5
            return ind

        updated = list(map(update, old))
        self.frame.load_content(
            lambda: iter(list(self.get_extra_iterator()) + updated)
        )
        # A single .loc call, because chained assignment may update a temporary copy
        self.test_frame_extra.loc[[7, 8], "value_number"] *= 5
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_resetting_features_and_content(self):
        """
        reset with features and content replaces both
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.reset(
            features=[
                TestNumericFeaturesFrame.value_number,
                TestNumericFeaturesFrame.is_english
            ],
            content=self.get_extra_iterator
        )
        self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
        self.test_frame_extra = self.test_frame_extra.drop(labels="is_dutch", axis=1)
        assert_frame_equal(frame.data, self.test_frame_extra, check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_features(self):
        """
        reset with only features replaces the features and keeps the content
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features,
            self.get_iterator
        )
        frame.reset(features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
        assert_frame_equal(frame.data, self.test_frame, check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_features_no_content(self):
        """
        reset with only features on a frame without content results in empty data with the new features
        """
        features = [
            TestNumericFeaturesFrame.is_dutch
        ]
        frame = NumericFeaturesFrame(
            TestNumericFeaturesFrame.get_identifier,
            features
        )
        frame.reset(features=[
            TestNumericFeaturesFrame.value_number,
            TestNumericFeaturesFrame.is_english
        ])
        self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
        # The empty slice keeps the expected columns, but has no rows
        assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
        sorted_feature_names = ["is_english", "value_number"]
        self.assertEqual(
            sorted(frame.features.keys()),
            sorted_feature_names
        )

    def test_resetting_content(self):
        """
        reset with only content replaces the rows and keeps the features
        """
        self.frame.reset(content=self.get_extra_iterator)
        self.test_frame_extra = self.test_frame_extra.drop([4, 5, 6, 7, 8], axis=0)
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)

    def test_resetting_content_no_features(self):
        """
        reset with only content on a frame without features stores the content and leaves data empty
        """
        self.frame.features = None
        self.frame.reset(content=self.get_extra_iterator)
        self.assertEqual(self.frame.content.__name__, self.get_extra_iterator.__name__)  # TODO: better equality test
        # Uses the builtin float, because the np.float alias has been removed from NumPy
        assert_frame_equal(self.frame.data, pd.DataFrame(dtype=float), check_like=True)

    def test_clean_params(self):
        """
        clean_params keeps numeric values for known features and raises on conflicting parameters
        """
        test_params = {
            "is_dutch": "1",  # get converted to float
            "is_french": 1.0,  # gets skipped
            "$is_french": 1.0,  # gets skipped (without errors)
            "value_number": None,  # gets skipped as a non-numeric
            "is_english": "test",  # gets skipped as a non-numeric
            "$value_number": 2.0
        }
        # The same value should be accepted as str, int and float
        for cast in [str, int, float]:
            test_params["is_dutch"] = cast(test_params["is_dutch"])
            cleaned_params = self.frame.clean_params(test_params)
            self.assertEqual(cleaned_params, {"is_dutch": 1.0, "value_number": 2.0})

        # The same feature with and without a $ prefix is expected to be rejected
        test_error_params = {
            "is_dutch": "1",
            "$is_dutch": 1.0,
        }
        with self.assertRaises(ValueError, msg="Clean params should have raised for invalid params"):
            self.frame.clean_params(test_error_params)

    def test_rank_by_params(self):
        """
        rank_by_params returns the identifiers in the order that follows from the given weights
        """
        ranking = self.frame.rank_by_params({"is_dutch": 1, "value_number": 1})
        self.assertEqual(ranking, [5, 8, 6, 4, 7])
        # NOTE(review): "is_french" is not a feature of this frame and presumably gets ignored
        ranking = self.frame.rank_by_params({"is_dutch": 0.5, "value_number": -1, "is_english": 2, "is_french": 100})
        self.assertEqual(ranking, [7, 8, 6, 4, 5])

    def test_get_content_hash(self):
        self.skipTest("not tested")

    def test_get_feature_value(self):
        self.skipTest("not tested")

    def test_get_feature_series(self):
        self.skipTest("not tested")