def test_dataset_analysis_report(self):
    """Build a regression dataset-analysis report and print its renderings.

    This is a smoke test: it contains no assertions and only exercises
    report creation plus the ``to_dict``, ``to_dict_meaning`` and ``plot``
    calls on the resulting report.
    """
    thresholds = {
        'class_count_threshold': 10,
        'outliers_n_sigma': (3, 5),
        'nan_threshold_per_col': (0.1, 0.5, 0.9),
        'nan_threshold_per_sample': (0.1, 0.9),
    }
    analyzer = DatasetAnalysis(REGRESSION, **thresholds)
    report = analyzer.dataset_analysis_report(dataset=self.dataset)

    # Print each rendering of the report, one after the other.
    printable = report.to_dict(printable=True)
    pprint(printable)
    meaning = report.to_dict_meaning()
    pprint(meaning)
    plotted = report.plot()
    pprint(plotted)
Exemplo n.º 2
0
 def _create_dataset_analysis_report(self,
                                     **kwargs) -> DatasetAnalysisReport:
     """Create the analyzer for this object's problem type and run it.

     A new ``DatasetAnalysis`` is stored on ``self.dataset_analysis`` and
     then passed, together with the train data, the test data and the
     classification flag, to ``self.create_dataset_analysis_report``.

     Args:
         **kwargs: accepted but not used by this method.

     Returns:
         Whatever ``self.create_dataset_analysis_report`` returns.
     """
     # Pick the problem type from the classification flag.
     if self.is_classification:
         problem_type = CLASSIFICATION
     else:
         problem_type = REGRESSION
     self.dataset_analysis = DatasetAnalysis(problem_Type=problem_type)

     report = self.create_dataset_analysis_report(
         train=self.train,
         test=self.test,
         is_classification=self.is_classification,
         dataset_analysis=self.dataset_analysis)
     return report
    def test_count_outliers(self):
        """Check the outlier summary returned for key ``9`` and ``'target'``.

        The result of ``count_outliers`` is printed and then verified:

        * entry ``9`` has both a ``'3-sigma'`` and a ``'5-sigma'`` record,
          with the expected ``n_outliers`` / ``expected_outliers`` values;
        * entry ``'target'`` has only a ``'5-sigma'`` record.

        Dedicated unittest assertions (``assertIn``, ``assertNotIn``,
        ``assertEqual``) are used so that a failure reports the offending
        values instead of a bare "False is not true".
        """
        da = DatasetAnalysis(REGRESSION,
                             class_count_threshold=10,
                             outliers_n_sigma=(3, 5),
                             nan_threshold_per_col=(0.1, 0.5, 0.9),
                             nan_threshold_per_sample=(0.1, 0.5, 0.9))
        out = da.count_outliers(self.dataset)
        print("count_outliers:")
        pprint(out)

        # Entry 9: records expected at both sigma levels.
        self.assertIn(9, out)
        self.assertIn('3-sigma', out[9])
        self.assertIn('5-sigma', out[9])
        self.assertEqual(out[9]['5-sigma']['n_outliers'], 1)
        self.assertEqual(out[9]['3-sigma']['expected_outliers'], 2)
        self.assertEqual(out[9]['5-sigma']['expected_outliers'], 0)

        # Target: only the 5-sigma record is expected to be present.
        self.assertIn('target', out)
        self.assertNotIn('3-sigma', out['target'])
        self.assertIn('5-sigma', out['target'])
        self.assertEqual(out['target']['5-sigma']['n_outliers'], 1)
    def test_count_unique_classes(self):
        """Check the per-value counts returned by ``count_unique_classes``.

        The result is printed and then verified:

        * entry ``0`` and entry ``'target'`` each contain the value ``10.0``
          with a count of ``1``;
        * no reported count, for any entry, exceeds the analyzer's
          ``_class_count_threshold``.

        Dedicated unittest assertions are used throughout so that a failure
        reports the offending values.
        """
        da = DatasetAnalysis(CLASSIFICATION,
                             class_count_threshold=10,
                             outliers_n_sigma=(3, 5),
                             nan_threshold_per_col=(0.1, 0.5, 0.9),
                             nan_threshold_per_sample=(0.1, 0.5, 0.9))
        out = da.count_unique_classes(self.dataset)
        print("count_unique_classes:")
        pprint(out)

        self.assertIn(0, out)
        self.assertIn(10.0, out[0])
        self.assertEqual(out[0][10.0], 1)

        self.assertIn('target', out)
        self.assertIn(10.0, out['target'])
        self.assertEqual(out['target'][10.0], 1)

        # Every reported count must stay within the configured threshold.
        for counts in out.values():
            for value in counts.values():
                self.assertLessEqual(value, da._class_count_threshold)
Exemplo n.º 5
0
    def create_dataset_analysis_report(
            cls,
            train: DMD,
            is_classification,
            test: DMD = None,
            dataset_analysis: DatasetAnalysis = None,
            **kwargs) -> DatasetAnalysisReport:
        """
        Run a dataset analysis on the train data (and test data, if given).

        Args:
            train - train data
            is_classification - when truthy a CLASSIFICATION analyzer is
                built, otherwise a REGRESSION one; only consulted when
                dataset_analysis is not supplied (or is falsy)
            test - test data, if available; forwarded to the analyzer as-is
            dataset_analysis - analyzer to use; a new DatasetAnalysis is
                created when this is falsy
            kwargs - accepted but not used by this method

        Returns:
            the report returned by the analyzer's dataset_analysis_report
        """
        analyzer = dataset_analysis
        if not analyzer:
            # No usable analyzer was passed in: build one for the problem type.
            if is_classification:
                problem_type = CLASSIFICATION
            else:
                problem_type = REGRESSION
            analyzer = DatasetAnalysis(problem_Type=problem_type)
        return analyzer.dataset_analysis_report(train=train, test=test)
    def test_count_missing_values(self):
        """Check the per-column and per-row output of ``count_missing_values``.

        Both returned mappings are printed and then verified:

        * every threshold in ``da._nan_threshold_per_feature`` is a key of
          the column result, and every threshold in
          ``da._nan_threshold_per_sample`` is a key of the row result;
        * the column result equals the exact expected mapping;
        * the row result for ``0.9`` is ``{10: 1.0}`` and the row result for
          ``0.1`` has one entry per sample of ``self.dataset``;
        * for every row threshold, the largest reported value is at least
          the threshold itself.

        Dedicated unittest assertions (``assertIn``, ``assertLessEqual``)
        are used so that a failure reports the offending values.
        """
        da = DatasetAnalysis(REGRESSION,
                             class_count_threshold=10,
                             outliers_n_sigma=(3, 5),
                             nan_threshold_per_col=(0.1, 0.5, 0.9),
                             nan_threshold_per_sample=(0.1, 0.9))
        nan_cols, nan_rows = da.count_missing_values(self.dataset)
        print("count_missing_values cols:")
        pprint(nan_cols)

        print("count_missing_values rows:")
        pprint(nan_rows)

        for th in da._nan_threshold_per_feature:
            self.assertIn(th, nan_cols)

        for th in da._nan_threshold_per_sample:
            self.assertIn(th, nan_rows)

        self.assertEqual(nan_cols, {
            0.1: {
                1: 1.0,
                2: 0.7
            },
            0.5: {
                1: 1.0,
                2: 0.7
            },
            0.9: {
                1: 1.0
            }
        })
        self.assertEqual(nan_rows[0.9], {10: 1.0})
        self.assertEqual(len(nan_rows[0.1]), self.dataset.n_samples)
        for th, vdict in nan_rows.items():
            print(th, vdict)
            # At least one reported value must reach the threshold.
            self.assertLessEqual(th, max(vdict.values()))
Exemplo n.º 7
0
 def create_dataset_analysis_report(cls, train: DMD, is_classification,
                                    **kwargs) -> DatasetAnalysisReport:
     """Analyze the train data and return the resulting report.

     Args:
         train: data passed to the analyzer as ``dataset``.
         is_classification: when truthy a CLASSIFICATION analyzer is built,
             otherwise a REGRESSION one.
         **kwargs: accepted but not used by this method.

     Returns:
         The report returned by ``DatasetAnalysis.dataset_analysis_report``.
     """
     if is_classification:
         problem_type = CLASSIFICATION
     else:
         problem_type = REGRESSION
     analyzer = DatasetAnalysis(problem_Type=problem_type)
     return analyzer.dataset_analysis_report(dataset=train)