Exemplo n.º 1
0
    def test_null_separation(self):
        """Rows holding a ``None`` value are separated from complete rows.

        Feeds three rows through ``PartitionRowsContainingNone`` and checks
        that the ``None``-indexed output holds only the fully populated row
        while the ``"contains_none"`` output holds the two rows that have a
        ``None`` field.
        """
        complete_row = {"a": 1, "b": 1, "c": 1}
        row_missing_a = {"a": None, "b": 1, "c": 1}
        row_missing_b = {"a": 1, "b": None, "c": 1}

        inputs = [complete_row, row_missing_a, row_missing_b]

        # Copies keep the expectations independent of the input objects.
        expected_default = [dict(complete_row)]
        expected_contains_none = [dict(row_missing_a), dict(row_missing_b)]

        with TestPipeline() as pipeline:
            created = pipeline | "Create Input" >> beam.Create(inputs)
            partitions = created | PartitionRowsContainingNone()

            assert_that(
                partitions[None],
                pprint_equal_to(expected_default),
                label="default",
            )
            assert_that(
                partitions["contains_none"],
                pprint_equal_to(expected_contains_none),
                label="contains none",
            )
Exemplo n.º 2
0
    def test_max_select_reverse(self):
        """Check ``MaxSelectPerKey`` with ``reverse=True`` on ``self.inputs``.

        The transform is keyed on ``("a", "b")`` and scores each row by its
        ``"c"`` field; the output is compared with three rows, one per
        distinct ``(a, b)`` pair, each with ``c == 1``.
        """
        expected = [
            {"a": 1, "b": 1, "c": 1},
            {"a": 2, "b": 1, "c": 1},
            {"a": 1, "b": 2, "c": 1},
        ]

        with TestPipeline() as pipeline:
            rows = pipeline | "Create Input" >> beam.Create(self.inputs)
            select_per_key = MaxSelectPerKey(
                ("a", "b"),
                lambda row: row["c"],
                reverse=True,
            )
            selected = rows | select_per_key
            assert_that(selected, pprint_equal_to(expected))
Exemplo n.º 3
0
    def test_creation(self):
        """Check the triples produced by ``CreateCategoricalDicts``.

        The expected output is a flat list of 3-tuples, listed per column
        below (presumably ``(column, value, index)`` -- confirm against the
        transform). Inputs, existing dictionary rows and the categorical
        column list come from fixtures defined outside this method.
        """
        # Per-column (value, number) pairs, in the order they are expected.
        entries_by_column = {
            "column1": [("A", 10), ("NOT IN INPUT", 11), ("B", 12), ("C", 13)],
            "column2": [("X", 2), ("Z", 3), ("Y", 4)],
            "column3": [("L", 1), ("M", 2), ("N", 3)],
        }
        expected = [
            (column, value, number)
            for column, entries in entries_by_column.items()
            for value, number in entries
        ]

        with TestPipeline() as pipeline:
            create_existing = beam.Create(existing_dict_rows_raw)
            known_rows = pipeline | "create existing dicts" >> create_existing

            rows = pipeline | "create inputs" >> beam.Create(inputs_raw)

            created_dicts = rows | CreateCategoricalDicts(cat_cols, known_rows)

            assert_that(created_dicts, pprint_equal_to(expected))
Exemplo n.º 4
0
    def test(self):
        """Check ``DigestCategoricalColumns`` on two categorical columns.

        Four rows with string values in ``"value1"`` and ``"value2"`` are
        expected to come out with those values replaced by the integers
        ``0``/``1`` and the ``"key"`` field left unchanged.
        """
        cat_cols = ["value1", "value2"]

        raw_values = [("A", "X"), ("A", "Y"), ("B", "X"), ("B", "Y")]
        inputs = [
            {"key": key, "value1": first, "value2": second}
            for key, (first, second) in enumerate(raw_values, start=1)
        ]

        # FIXME: Order of the indices can be arbitrary.
        #        example: A -> 1, B -> 0, X -> 1, Y -> 0
        #        This test assumes specific order.
        digested_values = [(0, 0), (0, 1), (1, 0), (1, 1)]
        expected = [
            {"key": key, "value1": first, "value2": second}
            for key, (first, second) in enumerate(digested_values, start=1)
        ]

        with TestPipeline() as pipeline:
            rows = pipeline | beam.Create(inputs)
            digested = rows | DigestCategoricalColumns(cat_cols)
            assert_that(digested, pprint_equal_to(expected, deepdiff=True))
Exemplo n.º 5
0
    def test_use(self):
        """Check ``ReplaceCategoricalColumns`` fed by ``CreateCategoricalDicts``.

        The dictionaries built from the inputs and the existing dictionary
        rows are applied back onto the same inputs. In the expected rows,
        ``column1``..``column3`` hold integers while ``column4`` keeps its
        string value (presumably because it is not in ``cat_cols`` --
        the fixture is defined outside this method).
        """
        expected = [
            # Seen values
            {"key": 1, "column1": 10, "column2": 2, "column3": 1,
             "column4": "O"},
            # Unseen values
            {"key": 2, "column1": 12, "column2": 4, "column3": 2,
             "column4": "P"},
            # Mixed
            {"key": 3, "column1": 13, "column2": 3, "column3": 3,
             "column4": "Q"},
            # Repeat Row
            {"key": 4, "column1": 13, "column2": 3, "column3": 3,
             "column4": "Q"},
        ]

        with TestPipeline() as pipeline:
            create_existing = beam.Create(existing_dict_rows_raw)
            known_rows = pipeline | "create existing dicts" >> create_existing

            rows = pipeline | "create inputs" >> beam.Create(inputs_raw)

            created_dicts = rows | CreateCategoricalDicts(cat_cols, known_rows)

            replaced = rows | ReplaceCategoricalColumns(cat_cols, created_dicts)

            assert_that(replaced, pprint_equal_to(expected))
Exemplo n.º 6
0
    def _check(self,
               left_inputs,
               right_inputs,
               expected,
               keys=None,
               columns=None):
        """Run ``DifferencePerKey`` on two inputs and compare with ``expected``.

        Args:
            left_inputs: Elements used to create the left PCollection.
            right_inputs: Elements used to create the right PCollection.
            expected: Rows the pipeline output is compared against, after
                the ``"detail"`` entry has been dropped from each row's
                ``"error"`` mapping.
            keys: Passed as the first argument to ``DifferencePerKey``.
                Defaults to ``["key"]``.
            columns: Passed as the second argument to ``DifferencePerKey``.
                Defaults to ``["value"]``.
        """
        # Use None sentinels rather than list defaults: a list default is
        # created once and shared by every call, so any mutation downstream
        # would leak into later tests.
        keys = ["key"] if keys is None else keys
        columns = ["value"] if columns is None else columns

        excluded = ("detail",)

        def filter_extra_info(row):
            # Strip the excluded entries from the row's "error" mapping so
            # that they do not take part in the comparison.
            error = {
                name: value
                for name, value in row["error"].items()
                if name not in excluded
            }
            return {**row, "error": error}

        with TestPipeline() as p:
            left = p | "create left" >> beam.Create(left_inputs)
            right = p | "create right" >> beam.Create(right_inputs)
            actual = ((left, right)
                      | DifferencePerKey(keys, columns)
                      | beam.Map(filter_extra_info))
            assert_that(actual, pprint_equal_to(expected))