示例#1
0
    def test_prop_person_denormalized(self):
        """Assert row counts for `icontains` / `not_icontains` person filters on `email`.

        One person (distinct id "some_id") and one "$pageview" event are
        created, then `materialize("person", "email")` is called before the
        queries run with `join_person_tables=True`. With the value "posthog"
        the test expects 1 row for `icontains` and 0 rows for `not_icontains`.
        """
        # NOTE(review): the email literal below looks redacted and does not
        # contain "posthog"; confirm the intended fixture value.
        _create_person(
            distinct_ids=["some_id"],
            team_id=self.team.pk,
            properties={"email": "*****@*****.**"},
        )
        _create_event(event="$pageview", team=self.team, distinct_id="some_id")

        materialize("person", "email")

        # (operator, expected number of rows returned by the query)
        expectations = (("icontains", 1), ("not_icontains", 0))
        for operator, expected_rows in expectations:
            email_property = {
                "key": "email",
                "type": "person",
                "value": "posthog",
                "operator": operator,
            }
            person_filter = Filter(data={"properties": [email_property]})
            rows = self._run_query(person_filter, join_person_tables=True)
            self.assertEqual(len(rows), expected_rows)
示例#2
0
def testdata(db, team):
    """Call `materialize("person", "email")` and create three persons for `team`.

    The persons get distinct ids "1", "2" and "3", share the same `email`
    literal, and differ in their `$os` / `$browser` properties.

    `db` is not referenced in the body (presumably a fixture dependency —
    confirm against the caller).
    """
    materialize("person", "email")

    # (distinct id, $os, $browser)
    people = (
        ("1", "windows", "chrome"),
        ("2", "Mac", "firefox"),
        ("3", "windows", "mozilla"),
    )
    for distinct_id, os_name, browser in people:
        _create_person(
            distinct_ids=[distinct_id],
            team_id=team.pk,
            properties={
                "email": "*****@*****.**",
                "$os": os_name,
                "$browser": browser,
            },
        )
示例#3
0
    def test_events_columns_in_inconsistent_state(self):
        """Run `materialize_session_and_window_id` after renaming only one column.

        Both `$session_id` and `$window_id` are materialized on `events`, then
        only `mat_$session_id` is renamed to `$session_id` before the function
        under test runs and `assert_desired_state` checks the outcome.
        """
        for property_name in ("$session_id", "$window_id"):
            materialize("events", property_name)

        # Rename just the session column; the window column keeps its name.
        rename_session_column = (
            "ALTER TABLE events RENAME COLUMN mat_$session_id TO $session_id")
        sync_execute(rename_session_column)

        materialize_session_and_window_id(CLICKHOUSE_DATABASE)
        self.assert_desired_state()
示例#4
0
def create_materialized_columns(database):
    """Call `materialize` for the `$session_id` and `$window_id` event properties.

    Each property name is passed twice: as the property and as the third
    positional argument. A `ValueError` from either call is swallowed and
    treated as "already materialized". `database` is not used in the body.
    """
    for property_name in ("$session_id", "$window_id"):
        try:
            materialize("events", property_name, property_name)
        except ValueError:
            # The column is already materialized, skip
            pass
    def test_create_missing_tables(self):
        """Check that `create_missing_tables` recreates `sharded_events` with `mat_some_property`.

        The create-table queries are captured from `analyze_cluster_tables`
        before `sharded_events` is dropped, then fed back into
        `create_missing_tables`; the resulting schema must mention the column.
        """
        column_name = "mat_some_property"

        self.recreate_database(create_tables=True)
        materialize("events", "some_property")
        # Only the middle element of the 3-tuple is used by this test.
        _first, create_table_queries, _third = Command().analyze_cluster_tables()
        sync_execute("DROP TABLE sharded_events SYNC")

        self.assertIn(column_name, create_table_queries["sharded_events"])
        missing_tables = {"test_host": {"sharded_events"}}
        Command().create_missing_tables(missing_tables, create_table_queries)

        show_create_rows = sync_execute("SHOW CREATE TABLE sharded_events")
        schema = show_create_rows[0][0]
        self.assertIn(column_name, schema)
示例#6
0
    def test_column_types(self):
        """Track what `_get_column_types("mat_myprop")` reports across three steps.

        Expected kind after each step, always with the same expression:
        `materialize` -> "MATERIALIZED", `backfill_materialized_columns`
        -> "DEFAULT", `mark_all_materialized` -> "MATERIALIZED".
        """
        expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), '^\"|\"$', '')"

        def assert_column_kind(kind):
            # The expression is expected to stay the same; only the kind changes.
            self.assertEqual((kind, expr),
                             self._get_column_types("mat_myprop"))

        materialize("events", "myprop")
        assert_column_kind("MATERIALIZED")

        backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
        assert_column_kind("DEFAULT")

        mark_all_materialized()
        assert_column_kind("MATERIALIZED")
示例#7
0
    def test_denormalised_props(self):
        """Assert the query text mentions `mat_test_prop` once `test_prop` is materialized.

        The filter applies `test_prop == "hi"` both on the "user signed up"
        entity and at the top level. Two persons ("p1", "p2") each get one
        "$pageview" event carrying `test_prop: "hi"`.
        """

        def hi_property():
            # Fresh dict per use so the two property lists never share an object.
            return {"key": "test_prop", "value": "hi"}

        signup_entity = {
            "id": "user signed up",
            "type": "events",
            "order": 0,
            "properties": [hi_property()],
        }
        filters = {
            "events": [signup_entity],
            "date_from": "2020-01-01",
            "properties": [hi_property()],
            "date_to": "2020-01-14",
        }

        materialize("events", "test_prop")

        # (distinct id, person properties)
        people = (
            ("p1", {"key": "value"}),
            ("p2", {"key_2": "value_2"}),
        )
        for distinct_id, person_properties in people:
            Person.objects.create(
                team_id=self.team.pk,
                distinct_ids=[distinct_id],
                properties=person_properties,
            )
            _create_event(
                team=self.team,
                event="$pageview",
                distinct_id=distinct_id,
                timestamp="2020-01-02T12:00:00Z",
                properties={"test_prop": "hi"},
            )

        # Only the second element of the result (the query) is inspected.
        _, query = self._run_query(Filter(data=filters))
        self.assertIn("mat_test_prop", query)
示例#8
0
    def test_caching_and_materializing(self):
        """Check `get_materialized_columns(..., use_cache=True)` across two frozen times.

        At 13:01:01 the cached `events` keys are expected to hold "$foo" and
        "$bar" plus the defaults, and to stay that way right after
        `materialize("events", "abc")`. At 14:00:01 "abc" is expected to show
        up as well (presumably because the cache entry has expired — confirm
        the cache lifetime in `get_materialized_columns`).
        """
        with freeze_time("2020-01-04T13:01:01Z"):
            for table, property_name in (
                ("events", "$foo"),
                ("events", "$bar"),
                ("person", "$zeta"),
            ):
                materialize(table, property_name)

            event_keys = get_materialized_columns("events", use_cache=True).keys()
            self.assertCountEqual(
                event_keys,
                ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
            )
            person_keys = get_materialized_columns("person", use_cache=True).keys()
            self.assertCountEqual(person_keys, ["$zeta"])

            materialize("events", "abc")

            # Same frozen time: "abc" is expected to be absent from the cached keys.
            event_keys = get_materialized_columns("events", use_cache=True).keys()
            self.assertCountEqual(
                event_keys,
                ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
            )

        with freeze_time("2020-01-04T14:00:01Z"):
            event_keys = get_materialized_columns("events", use_cache=True).keys()
            self.assertCountEqual(
                event_keys,
                [
                    "$foo",
                    "$bar",
                    "abc",
                    *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS,
                ],
            )
示例#9
0
def materialize_properties_task(
    columns_to_materialize: Optional[List[Suggestion]] = None,
    time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS,
    maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE,
    min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME,
    backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS,
    dry_run: bool = False,
) -> None:
    """
    Creates materialized columns for event and person properties based off of slow queries

    Args:
        columns_to_materialize: Suggestions as (table, property_name, cost)
            triples. When None, they are computed with
            `analyze(get_queries(time_to_analyze_hours, min_query_time))`.
        time_to_analyze_hours: Forwarded to `get_queries` when suggestions
            are computed here.
        maximum: Upper bound on how many pending suggestions are processed.
        min_query_time: Forwarded to `get_queries` when suggestions are
            computed here.
        backfill_period_days: When greater than 0 (and not a dry run),
            `backfill_materialized_columns` is called for "events" and
            "person" with this many days.
        dry_run: When True, everything is logged but neither `materialize`
            nor the backfill is called.
    """

    if columns_to_materialize is None:
        slow_queries = get_queries(time_to_analyze_hours, min_query_time)
        columns_to_materialize = analyze(slow_queries)

    # Keep only suggestions whose property is not materialized on its table yet.
    pending: List[Suggestion] = []
    for candidate in columns_to_materialize:
        table, property_name, _ = candidate
        if property_name in get_materialized_columns(table):
            continue
        pending.append(candidate)

    if not pending:
        logger.info("Found no columns to materialize.")
    else:
        logger.info(
            f"Calculated columns that could be materialized. count={len(pending)}"
        )

    properties: Dict[TableWithProperties, List[PropertyName]] = {
        "events": [],
        "person": [],
    }
    for table, property_name, cost in pending[:maximum]:
        logger.info(
            f"Materializing column. table={table}, property_name={property_name}, cost={cost}"
        )

        if not dry_run:
            materialize(table, property_name)
        # Recorded even on a dry run; only consumed by the backfill below.
        properties[table].append(property_name)

    if not (backfill_period_days > 0) or dry_run:
        return

    logger.info(
        f"Starting backfill for new materialized columns. period_days={backfill_period_days}"
    )
    backfill_period = timedelta(days=backfill_period_days)
    for table in ("events", "person"):
        backfill_materialized_columns(table, properties[table],
                                      backfill_period)
示例#10
0
    def test_column_types(self):
        """Track what `_get_column_types("events", "mat_myprop")` reports across three steps.

        Expected kind after each step, always with the same expression:
        `materialize` -> "MATERIALIZED", `backfill_materialized_columns`
        -> "DEFAULT", `mark_all_materialized` -> "MATERIALIZED".
        """
        # :KLUDGE: ClickHouse replaces our trim(BOTH '"' FROM properties) with this
        expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), concat('^[', regexpQuoteMeta('\"'), ']*|[', regexpQuoteMeta('\"'), ']*$'), '')"

        def assert_column_kind(kind):
            # The expression is expected to stay the same; only the kind changes.
            self.assertEqual((kind, expr),
                             self._get_column_types("events", "mat_myprop"))

        materialize("events", "myprop")
        assert_column_kind("MATERIALIZED")

        backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
        assert_column_kind("DEFAULT")

        mark_all_materialized()
        assert_column_kind("MATERIALIZED")
示例#11
0
    def test_prop_event_denormalized_ints(self):
        """Assert integer filters on `test_prop` each return one row after materializing.

        Two "$pageview" events carry `test_prop` 0 and 2. After materializing
        `test_prop` and `something_else`, the `gt 1`, `lt 1` and default
        (no operator) `0` filters are each expected to return exactly one row.
        """
        for value in (0, 2):
            _create_event(
                event="$pageview",
                team=self.team,
                distinct_id="whatever",
                properties={"test_prop": value},
            )

        for property_name in ("test_prop", "something_else"):
            materialize("events", property_name)

        property_filters = (
            {"key": "test_prop", "value": 1, "operator": "gt"},
            {"key": "test_prop", "value": 1, "operator": "lt"},
            {"key": "test_prop", "value": 0},
        )
        for property_filter in property_filters:
            event_filter = Filter(data={"properties": [property_filter]})
            self.assertEqual(len(self._run_query(event_filter)), 1)
示例#12
0
def test_prop_filter_json_extract_materialized(test_events, property,
                                               expected_event_indexes, team):
    """Check `prop_filter_json_extract` against materialized `attr` / `email` columns.

    The generated query fragment must not contain "JSONExtract", and the
    sorted uuids it selects for `team` must equal the sorted `test_events`
    entries at `expected_event_indexes`.
    """
    for property_name in ("attr", "email"):
        materialize("events", property_name)

    query, params = prop_filter_json_extract(property,
                                             0,
                                             allow_denormalized_props=True)

    assert "JSONExtract" not in query

    rows = sync_execute(
        f"SELECT uuid FROM events WHERE team_id = %(team_id)s {query}",
        {
            "team_id": team.pk,
            **params
        },
    )
    # Each row is unpacked as a 1-tuple holding the uuid.
    uuids = sorted(uuid for (uuid, ) in rows)
    expected = sorted(test_events[index] for index in expected_event_indexes)

    assert uuids == expected
示例#13
0
    def test_materialized_columns_checks(self):
        """Check the optimizer's columns-to-query before and after materializing.

        For both `FILTER_WITH_PROPERTIES` and `FILTER_WITH_GROUPS`, the event
        and person columns are expected to be {"properties"} at first, and
        {"mat_event_prop"} / {"pmat_person_prop"} once `event_prop` and
        `person_prop` have been materialized.
        """

        def optimizer():
            return EnterpriseColumnOptimizer(FILTER_WITH_PROPERTIES,
                                             self.team.id)

        def optimizer_groups():
            return EnterpriseColumnOptimizer(FILTER_WITH_GROUPS, self.team.id)

        def assert_columns(event_columns, person_columns):
            # A fresh optimizer is built for every single assertion.
            for build in (optimizer, optimizer_groups):
                self.assertEqual(build().event_columns_to_query,
                                 event_columns)
                self.assertEqual(build().person_columns_to_query,
                                 person_columns)

        assert_columns({"properties"}, {"properties"})

        materialize("events", "event_prop")
        materialize("person", "person_prop")

        assert_columns({"mat_event_prop"}, {"pmat_person_prop"})
示例#14
0
    def test_materialized_column_naming(self):
        """Check the column names reported for awkward property names.

        Three `events` properties that differ only in special characters are
        materialized in a fixed order; the first is expected to map to
        `mat_$foO_____sqlinject` and the other two to the same name plus a
        4-character suffix. The `person` property is expected to map to
        `pmat_SoMePrOp`.
        """
        # Seeded before materializing so the expected suffixes are fixed
        # (presumably they are drawn from `random` — confirm in materialize).
        random.seed(0)

        # Order matters: the expected suffixes below depend on it.
        for table, property_name in (
            ("events", "$foO();--sqlinject"),
            ("events", "$foO();ääsqlinject"),
            ("events", "$foO_____sqlinject"),
            ("person", "SoMePrOp"),
        ):
            materialize(table, property_name)

        expected_event_columns = {
            "$foO();--sqlinject": "mat_$foO_____sqlinject",
            "$foO();ääsqlinject": "mat_$foO_____sqlinject_yWAc",
            "$foO_____sqlinject": "mat_$foO_____sqlinject_qGFz",
        }
        event_columns = get_materialized_columns("events")
        # Subset check written with assertEqual: assertDictContainsSubset is
        # deprecated and was removed from unittest in Python 3.12.
        self.assertEqual(
            {name: event_columns.get(name) for name in expected_event_columns},
            expected_event_columns,
        )

        self.assertEqual(get_materialized_columns("person"),
                         {"SoMePrOp": "pmat_SoMePrOp"})
示例#15
0
    def test_columns_already_materialized_prior_to_migration(self):
        """Run `materialize_session_and_window_id` when both columns were materialized beforehand.

        `$session_id` and `$window_id` are materialized on `events` first;
        `assert_desired_state` then checks the outcome of the function under test.
        """
        for property_name in ("$session_id", "$window_id"):
            materialize("events", property_name)

        materialize_session_and_window_id(CLICKHOUSE_DATABASE)
        self.assert_desired_state()
示例#16
0
    def test_prop_event_denormalized(self):
        """Assert row counts for string filters on `test_prop` after materializing.

        Two "$pageview" events carry `test_prop` "some_other_val" and
        "some_val". `test_prop` and `something_else` are materialized, then
        each filter in the table below is run and its row count compared with
        the expected number.
        """
        for value in ("some_other_val", "some_val"):
            _create_event(
                event="$pageview",
                team=self.team,
                distinct_id="whatever",
                properties={"test_prop": value},
            )

        for property_name in ("test_prop", "something_else"):
            materialize("events", property_name)

        # (property filter, expected number of rows)
        cases = (
            ({"key": "test_prop", "value": "some_val"}, 1),
            ({"key": "test_prop", "value": "some_val", "operator": "is_not"}, 1),
            ({"key": "test_prop", "value": "some_val", "operator": "is_set"}, 2),
            ({"key": "test_prop", "value": "some_val", "operator": "is_not_set"}, 0),
            ({"key": "test_prop", "value": "_other_", "operator": "icontains"}, 1),
            ({"key": "test_prop", "value": "_other_", "operator": "not_icontains"}, 1),
        )
        for property_filter, expected_rows in cases:
            event_filter = Filter(data={"properties": [property_filter]})
            self.assertEqual(len(self._run_query(event_filter)), expected_rows)
示例#17
0
    def test_backfilling_data(self):
        """Check `mat_prop` / `mat_another` values after `backfill_materialized_columns`.

        Six events are created before the columns are materialized and a
        seventh right after the backfill call. Once the running-mutation count
        drops to 0 (or the polling limit is reached), every row is expected to
        expose its `prop` / `another` value as a string, or "" when absent.
        """
        # Start from a clean slate in case the columns are left over.
        sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_prop")
        sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_another")

        # Events created before materialization; timestamps are increasing so
        # the final ORDER BY timestamp matches creation order.
        _create_event(event="some_event",
                      distinct_id="1",
                      team=self.team,
                      timestamp="2020-01-01 00:00:00",
                      properties={"prop": 1})
        _create_event(
            event="some_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-02 00:00:00",
            properties={
                "prop": 2,
                "another": 5
            },
        )
        _create_event(event="some_event",
                      distinct_id="1",
                      team=self.team,
                      timestamp="2021-05-03 00:00:00",
                      properties={"prop": 3})
        # This event is created without a `properties` argument.
        _create_event(event="another_event",
                      distinct_id="1",
                      team=self.team,
                      timestamp="2021-05-04 00:00:00")
        _create_event(
            event="third_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-05 00:00:00",
            properties={"prop": 4},
        )
        _create_event(
            event="fourth_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-06 00:00:00",
            properties={"another": 6},
        )

        materialize("events", "prop")
        materialize("events", "another")

        # Right after materialize, the helper is expected to count 0 rows for
        # both columns.
        self.assertEqual(self._count_materialized_rows("mat_prop"), 0)
        self.assertEqual(self._count_materialized_rows("mat_another"), 0)

        with freeze_time("2021-05-10T14:00:01Z"):
            # NOTE(review): "mutations_sync": "0" presumably lets the backfill
            # mutation run asynchronously, hence the polling loop below — confirm.
            backfill_materialized_columns(
                "events", ["prop", "another"],
                timedelta(days=50),
                test_settings={"mutations_sync": "0"})

        # Created after the backfill call has been issued.
        _create_event(
            event="fifth_event",
            distinct_id="1",
            team=self.team,
            timestamp="2021-05-07 00:00:00",
            properties={"another": 7},
        )

        # Poll while mutations are reported as running, at most 100 times with
        # a 0.1s pause between checks.
        iterations = 0
        while self._get_count_of_mutations_running() > 0 and iterations < 100:
            sleep(0.1)
            iterations += 1

        self.assertGreaterEqual(self._count_materialized_rows("mat_prop"), 4)
        self.assertGreaterEqual(self._count_materialized_rows("mat_another"),
                                4)

        # One (mat_prop, mat_another) tuple per event, in timestamp order;
        # values come back as strings and missing properties as "".
        self.assertEqual(
            sync_execute(
                "SELECT mat_prop, mat_another FROM events ORDER BY timestamp"),
            [("1", ""), ("2", "5"), ("3", ""), ("", ""), ("4", ""), ("", "6"),
             ("", "7")],
        )