def test_raise_assertion_error_with_duplicate_features(self, tmp_path):
        """A config defining the same transforming feature twice must be rejected."""
        config_text = """
            raw_data_dir: "dummy"
            dataset_name: "dummy"
            base_features:
              - name: "TIME"
                dtype: DATETIME
            transforming_features:
              - name: "weekday"
                index: 1
                dtype: STRING
                dependencies:
                - "TIME"
              - name: "weekday"
                index: 2
                dtype: STRING
        """
        config_path = tmp_path / "tmp.yaml"
        write_str_to_file(config_text, config_path)
        with AssertRaises(AssertionError) as raised:
            feature_config_helper.FeatureConfigHelper(config_path)

        # The error message should name the duplicate right at the start.
        err = raised.expected_exception_found
        has_dup_prefix = err.args[0].startswith(
            "There are duplicate objects in the list: "
        )
        assert_eq(True, has_dup_prefix)
 def test_extract_config_1(self, tmp_path):
     """Extracting config for ["e"] should match the expected subset YAML."""
     wanted_features = ["e"]
     expected_yaml = """
         raw_data_dir: "dummy"
         dataset_name: "dummy"
         base_features:
           - name: "a"
             dtype: STRING
         transforming_features:
           - name: "b"
             index: 1
             dtype: STRING
             dependencies:
               - "a"
           - name: "c"
             index: 2
             dtype: STRING
             dependencies:
               - "a"
               - "b"
           - name: "e"
             index: 4
             dtype: STRING
             dependencies:
               - "c"
     """
     expected_path = str(tmp_path / "new_tmp.yaml")
     write_str_to_file(expected_yaml, expected_path)
     # Compare the extracted config against the expected subset written above.
     extracted = self.fm_helper.extract_config(selected_features=wanted_features)
     assert_eq(parse_feature_config(expected_path), extracted)
    def test_raise_value_error_with_invalid_indexes(self, tmp_path):
        """Repeated transforming-feature indexes must raise a ValueError."""
        bad_index_yaml = """
            # invalid config with indexes are not continuous
            raw_data_dir: "dummy"
            dataset_name: "dummy"
            base_features:
              - name: "TIME"
                dtype: "DATETIME"
            transforming_features:
              - name: "weekday"
                index: 1
                dtype: INT32
                dependencies:
                  - "TIME"
              - name: "hour"
                index: 1
                dtype: INT32
                dependencies:
                  - "TIME"
        """
        config_path = tmp_path / "tmp.yaml"
        write_str_to_file(bad_index_yaml, config_path)
        with AssertRaises(ValueError) as raised:
            feature_config_helper.FeatureConfigHelper(config_path)

        # The message should echo the offending index list.
        err = raised.expected_exception_found
        has_index_prefix = err.args[0].startswith(
            "Feature indexes must be a list of increasing positive integers. "
            "Got indexes = [1, 1]"
        )
        assert_eq(True, has_index_prefix)
 def setup_class(cls, tmp_path):
     """Build the shared FeatureConfigHelper fixture from a small YAML config."""
     # NOTE(review): pytest's setup_class does not normally receive fixtures
     # such as tmp_path — confirm this runs under a custom test harness.
     config_yaml = """
         raw_data_dir: "dummy"
         dataset_name: "dummy"
         base_features:
           - name: "a"
             dtype: STRING
         transforming_features:
           - name: "b"
             index: 1
             dtype: STRING
             dependencies:
               - "a"
           - name: "e"
             index: 4
             dtype: STRING
             dependencies:
               - "c"
           - name: "c"
             index: 2
             dtype: STRING
             dependencies:
               - "a"
               - "b"
           - name: "d"
             index: 3
             dtype: STRING
             dependencies:
               - "a"
     """
     config_path = tmp_path / "feature_config_str.yaml"
     write_str_to_file(config_yaml, config_path)
     cls.fm_helper = feature_config_helper.FeatureConfigHelper(config_path)
    def test_raise_value_error_with_invalid_dependencies(self, tmp_path):
        """Depending on an undefined feature must raise an AssertionError."""
        bad_dependency_yaml = """
            raw_data_dir: "dummy"
            dataset_name: "dummy"
            base_features:
              - name: "TIME"
                dtype: DATETIME
            transforming_features:
              - name: "weekday"
                index: 1
                dtype: STRING
                dependencies:
                  - "date"
        """
        config_path = tmp_path / "tmp.yaml"
        write_str_to_file(bad_dependency_yaml, config_path)
        with AssertRaises(AssertionError) as raised:
            feature_config_helper.FeatureConfigHelper(config_path)

        # The message should name both the feature and its missing dependency.
        err = raised.expected_exception_found
        has_dep_prefix = err.args[0].startswith(
            "Feature weekday depends on feature date that is undefined."
        )
        assert_eq(True, has_dep_prefix)
# ---- Example #6 ----
 def test_from_lines_in_txt(self, tmp_path):
     """Categories listed one per line in a txt file define the encoding."""
     lines_text = """a
         d
         c
     """
     txt_path = tmp_path / "foo.txt"
     write_str_to_file(lines_text, txt_path)
     encoder = data_processing.CategoryEncoder.from_lines_in_txt(txt_path)
     got = encoder.get_encoded(self.series)
     expected = pd.Series([0, 3, 2, 1])
     pd.testing.assert_series_equal(expected, got)
# ---- Example #7 ----
 def test_from_mapping_in_csv(self, tmp_path):
     """Explicit category,code pairs in a csv file define the encoding."""
     mapping_text = """
         a,0
         d, 2
         c,5
     """
     mapping_path = tmp_path / "bar.txt"
     write_str_to_file(mapping_text, mapping_path)
     encoder = data_processing.CategoryEncoder.from_mapping_in_csv(mapping_path)
     got = encoder.get_encoded(self.series)
     expected = pd.Series([0, 6, 5, 2])
     pd.testing.assert_series_equal(expected, got)
# ---- Example #8 ----
 def setup_class(self, tmp_path):
     """Write a minimal pipeline-config YAML and remember its path for tests."""
     pipeline_yaml = """
         config_name: "foo"
         data_loader:
           cls_name: "bar"
           feature_config_path: "rab"
           features_to_model: ["c"]
           label_col: "d"
           train_filters: []
           validation_filters: []
         model_wrapper:
           cls_name: "bobar"
         model_analysis:
           metrics: ["a"]
           by_features: ["b"]
     """
     self.pipeline_config_path = tmp_path / "pipeline_config.yaml"
     write_str_to_file(pipeline_yaml, self.pipeline_config_path)
# ---- Example #9 ----
 def setup_class(cls, tmp_path):
     """Initialize two dummy feature managers over the same YAML feature config."""
     config_yaml = """
         raw_data_dir: dummy
         dataset_name: dummy
         base_features:
           - name: a
             dtype: INT32
           - name: d1
             dtype: DATETIME
         transforming_features:
           - name: b
             index: 1
             dependencies:
               - a
             dtype: INT32
           - name: c
             index: 2
             dependencies:
               - b
             dtype: INT32
           - name: d
             index: 3
             dependencies:
               - a
             dtype: INT32
           - name: e
             index: 4
             dependencies:
               - c
             dtype: INT32
           - name: d2
             index: 5
             dependencies:
               - d1
             dtype: DATETIME
     """
     config_path = str(tmp_path / "tmp.yaml")
     write_str_to_file(config_yaml, config_path)
     # Both managers read the same config; each builds its own dataframe.
     cls.fm = _DummyFeatureManager(config_path)
     cls.fm2 = _DummyFeatureManager2(config_path)
     cls.fm.initialize_dataframe()
     cls.fm2.initialize_dataframe()
# ---- Example #10 ----
    def setup_class(self, tmp_path):
        """End-to-end fixture: feature config, fake dataset csv, pipeline config."""
        # Feature-manager config whose raw_data_dir points inside tmp_path.
        data_dir = tmp_path / "dataset"
        feature_yaml = f"""
        raw_data_dir: "{data_dir}"
        dataset_name: "dummy"
        base_features:
          - name: "a"
            dtype: INT32
        transforming_features:
          - name: "b"
            index: 1
            dtype: INT32
          - name: "label"
            index: 2
            dtype: INT32
          - name: "is_train"
            index: 3
            dtype: BOOL
          - name: "is_validation"
            index: 4
            dtype: BOOL
        """
        feature_cfg_path = tmp_path / "feature_config.yaml"
        write_str_to_file(feature_yaml, feature_cfg_path)

        # Fake dataset with complementary train/validation split flags.
        frame = pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5],
                "b": [6, 7, 8, 9, 0],
                "c": [-1, -1, -1, -1, -1],
                "label": [0, 1, 1, 0, 1],
                "is_train": [True, False, True, True, False],
                "is_validation": [False, True, False, False, True],
            })
        csv_path = BaseFeatureManager(feature_cfg_path).get_dataset_path()
        Path(csv_path).parent.mkdir(parents=True)
        frame.to_csv(csv_path, index=False)

        # Pipeline config referencing the feature config written above.
        pipeline_yaml = f"""
        config_name: "dummy"
        data_loader:
            cls_name: "tabml.data_loaders.BaseDataLoader"
            feature_config_path: "{feature_cfg_path}"
            label_col: "label"
            features_to_model: ["a", "b"]
            train_filters: ["is_train"]
            validation_filters: ["is_validation"]
        model_wrapper:
            cls_name: "a"
        model_analysis:
            metrics: ["foo"]
            by_features: ["bar"]
            by_label: "bar"
            training_size: 50
        """

        pipeline_cfg_path = tmp_path / "pipeline_config.yaml"
        write_str_to_file(pipeline_yaml, pipeline_cfg_path)
        self.config = parse_pipeline_config(pipeline_cfg_path)