def test_column_types(self):
    """Follow the column definition of ``mat_myprop`` through its lifecycle.

    The value returned by ``self._get_column_types`` is expected to be
    ``("MATERIALIZED", expr)`` right after ``materialize``,
    ``("DEFAULT", expr)`` after ``backfill_materialized_columns``, and
    ``("MATERIALIZED", expr)`` again once ``mark_all_materialized`` has run.
    """
    column = "mat_myprop"
    expected_expression = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), '^\"|\"$', '')"

    # Step 1: a freshly materialized column.
    materialize("events", "myprop")
    observed = self._get_column_types(column)
    self.assertEqual(("MATERIALIZED", expected_expression), observed)

    # Step 2: after starting a 50-day backfill the column is reported as DEFAULT.
    backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
    observed = self._get_column_types(column)
    self.assertEqual(("DEFAULT", expected_expression), observed)

    # Step 3: marking everything materialized restores the MATERIALIZED kind.
    mark_all_materialized()
    observed = self._get_column_types(column)
    self.assertEqual(("MATERIALIZED", expected_expression), observed)
def materialize_properties_task(
    columns_to_materialize: Optional[List[Suggestion]] = None,
    time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS,
    maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE,
    min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME,
    backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS,
    dry_run: bool = False,
) -> None:
    """Create materialized columns for event and person properties based on slow queries.

    Args:
        columns_to_materialize: Suggestions, each unpacked as
            ``(table, property_name, cost)``. When ``None`` they are computed
            with ``analyze(get_queries(time_to_analyze_hours, min_query_time))``.
        time_to_analyze_hours: First argument forwarded to ``get_queries``.
        maximum: Only the first ``maximum`` not-yet-materialized suggestions
            are processed.
        min_query_time: Second argument forwarded to ``get_queries``.
        backfill_period_days: Length, in days, of the backfill window. No
            backfill is started unless this is greater than zero.
        dry_run: When true, columns are only logged: neither ``materialize``
            nor ``backfill_materialized_columns`` is called.
    """
    if columns_to_materialize is None:
        slow_queries = get_queries(time_to_analyze_hours, min_query_time)
        columns_to_materialize = analyze(slow_queries)

    # Keep only the suggestions whose property is not materialized yet.
    result = []
    for candidate in columns_to_materialize:
        candidate_table, candidate_property, _ = candidate
        if candidate_property in get_materialized_columns(candidate_table):
            continue
        result.append(candidate)

    if not result:
        logger.info("Found no columns to materialize.")
    else:
        logger.info(
            f"Calculated columns that could be materialized. count={len(result)}"
        )

    properties: Dict[TableWithProperties, List[PropertyName]] = {
        "events": [],
        "person": [],
    }
    for table, property_name, cost in result[:maximum]:
        logger.info(
            f"Materializing column. table={table}, property_name={property_name}, cost={cost}"
        )
        if dry_run:
            continue
        materialize(table, property_name)
        # Remember the column so it can be backfilled below.
        properties[table].append(property_name)

    if dry_run or backfill_period_days <= 0:
        return

    logger.info(
        f"Starting backfill for new materialized columns. period_days={backfill_period_days}"
    )
    backfill_window = timedelta(days=backfill_period_days)
    backfill_materialized_columns("events", properties["events"], backfill_window)
    backfill_materialized_columns("person", properties["person"], backfill_window)
def test_column_types(self):
    """Follow the column definition of ``events.mat_myprop`` through its lifecycle.

    The value returned by ``self._get_column_types`` is expected to be
    ``("MATERIALIZED", expr)`` right after ``materialize``,
    ``("DEFAULT", expr)`` after ``backfill_materialized_columns``, and
    ``("MATERIALIZED", expr)`` again once ``mark_all_materialized`` has run.
    """
    table, column = "events", "mat_myprop"
    # :KLUDGE: ClickHouse replaces our trim(BOTH '"' FROM properties) with
    # the expression below, so that is what the assertions compare against.
    expected_expression = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), concat('^[', regexpQuoteMeta('\"'), ']*|[', regexpQuoteMeta('\"'), ']*$'), '')"

    # Step 1: a freshly materialized column.
    materialize("events", "myprop")
    observed = self._get_column_types(table, column)
    self.assertEqual(("MATERIALIZED", expected_expression), observed)

    # Step 2: after starting a 50-day backfill the column is reported as DEFAULT.
    backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
    observed = self._get_column_types(table, column)
    self.assertEqual(("DEFAULT", expected_expression), observed)

    # Step 3: marking everything materialized restores the MATERIALIZED kind.
    mark_all_materialized()
    observed = self._get_column_types(table, column)
    self.assertEqual(("MATERIALIZED", expected_expression), observed)
def test_backfilling_data(self):
    """Backfill ``mat_prop`` / ``mat_another`` and check the resulting column values.

    Six events are inserted before the columns are materialized and a seventh
    one after the backfill has been started; the final query expects every row
    to expose the matching property values (or an empty string) in the
    materialized columns.
    """
    # Start from a clean slate in case the columns were left behind.
    for drop_statement in (
        "ALTER TABLE events DROP COLUMN IF EXISTS mat_prop",
        "ALTER TABLE events DROP COLUMN IF EXISTS mat_another",
    ):
        sync_execute(drop_statement)

    # Events that exist before the columns are materialized. The entry
    # without a "properties" key is created without that argument at all.
    preexisting_events = [
        {"event": "some_event", "timestamp": "2020-01-01 00:00:00", "properties": {"prop": 1}},
        {"event": "some_event", "timestamp": "2021-05-02 00:00:00", "properties": {"prop": 2, "another": 5}},
        {"event": "some_event", "timestamp": "2021-05-03 00:00:00", "properties": {"prop": 3}},
        {"event": "another_event", "timestamp": "2021-05-04 00:00:00"},
        {"event": "third_event", "timestamp": "2021-05-05 00:00:00", "properties": {"prop": 4}},
        {"event": "fourth_event", "timestamp": "2021-05-06 00:00:00", "properties": {"another": 6}},
    ]
    for event_kwargs in preexisting_events:
        _create_event(distinct_id="1", team=self.team, **event_kwargs)

    materialize("events", "prop")
    materialize("events", "another")

    materialized_columns = ("mat_prop", "mat_another")
    for column in materialized_columns:
        self.assertEqual(self._count_materialized_rows(column), 0)

    with freeze_time("2021-05-10T14:00:01Z"):
        backfill_materialized_columns(
            "events",
            ["prop", "another"],
            timedelta(days=50),
            test_settings={"mutations_sync": "0"},
        )

    # An event created after the backfill was started.
    _create_event(
        event="fifth_event",
        distinct_id="1",
        team=self.team,
        timestamp="2021-05-07 00:00:00",
        properties={"another": 7},
    )

    # Poll until no mutations are running, sleeping at most 100 times.
    sleeps_left = 100
    while self._get_count_of_mutations_running() > 0 and sleeps_left > 0:
        sleep(0.1)
        sleeps_left -= 1

    for column in materialized_columns:
        self.assertGreaterEqual(self._count_materialized_rows(column), 4)

    rows = sync_execute(
        "SELECT mat_prop, mat_another FROM events ORDER BY timestamp")
    expected_rows = [
        ("1", ""),
        ("2", "5"),
        ("3", ""),
        ("", ""),
        ("4", ""),
        ("", "6"),
        ("", "7"),
    ]
    self.assertEqual(rows, expected_rows)