def load(self, writer: BufferedWriterWrapper) -> None:
    logger = logging.getLogger("snuba.bulk-loader")

    # Refuse to load into a destination table that is missing or not empty.
    clickhouse_tables = clickhouse_ro.execute("show tables")
    if (self.__dest_table,) not in clickhouse_tables:
        raise ValueError("Destination table %s does not exist" % self.__dest_table)

    table_content = clickhouse_ro.execute("select count(*) from %s" % self.__dest_table)
    if table_content != [(0,)]:
        raise ValueError("Destination table %s is not empty" % self.__dest_table)

    descriptor = self.__source.get_descriptor()
    logger.info("Loading snapshot %s", descriptor.id)

    with self.__source.get_table_file(self.__source_table) as table:
        logger.info("Loading table %s from file", self.__source_table)
        row_count = 0
        with writer as buffer_writer:
            for row in table:
                clickhouse_data = self.__row_processor(row)
                buffer_writer.write(clickhouse_data)
                row_count += 1
        logger.info("Load complete: %d records loaded", row_count)

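# For illustration: a hedged sketch of the row-processor contract the loader
# above relies on. The JSONEachRow-style encoding and the helper name are
# assumptions for this sketch, not snuba's actual implementation.
import json
from typing import Mapping

def example_row_processor(row: Mapping[str, str]) -> bytes:
    # Maps one snapshot row (column name -> string value, as in the
    # from_bulk tests below) to the bytes the buffered writer consumes.
    return (json.dumps(dict(row)) + "\n").encode("utf-8")
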
def test_messages(self):
    processor = GroupedMessageProcessor("sentry_groupedmessage")
    message_filter = CdcTableNameMessageFilter(postgres_table=POSTGRES_TABLE)

    metadata = KafkaMessageMetadata(
        offset=42,
        partition=0,
    )

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.BEGIN_MSG, []))
    begin_msg = json.loads(self.BEGIN_MSG)
    ret = processor.process_message(begin_msg, metadata)
    assert ret is None

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.COMMIT_MSG, []))
    commit_msg = json.loads(self.COMMIT_MSG)
    ret = processor.process_message(commit_msg, metadata)
    assert ret is None

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.INSERT_MSG,
                        [("table", "sentry_groupedmessage".encode())]))
    insert_msg = json.loads(self.INSERT_MSG)
    ret = processor.process_message(insert_msg, metadata)
    assert ret.data == [self.PROCESSED]
    self.write_processed_records(ret.data)
    ret = clickhouse_ro.execute("SELECT * FROM test_groupedmessage_local;")
    assert ret[0] == (
        42,  # offset
        0,   # deleted
        2,   # project_id
        74,  # id
        0,   # status
        datetime(2019, 6, 19, 6, 46, 28),  # last_seen
        datetime(2019, 6, 19, 6, 45, 32),  # first_seen
        datetime(2019, 6, 19, 6, 45, 32),  # active_at
        None,  # first_release_id
    )

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.UPDATE_MSG,
                        [("table", "sentry_groupedmessage".encode())]))
    update_msg = json.loads(self.UPDATE_MSG)
    ret = processor.process_message(update_msg, metadata)
    assert ret.data == [self.PROCESSED]

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.DELETE_MSG, []))
    delete_msg = json.loads(self.DELETE_MSG)
    ret = processor.process_message(delete_msg, metadata)
    assert ret.data == [self.DELETED]

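# Hedged sketch of what a helper like __make_msg above could look like: a
# minimal stand-in for a consumed Kafka message that carries the ("table", ...)
# headers CdcTableNameMessageFilter inspects. The stub type and its field
# names are assumptions for illustration, not snuba's actual message types.
from typing import List, NamedTuple, Tuple

class FakeKafkaMessage(NamedTuple):
    partition: int
    offset: int
    payload: bytes
    headers: List[Tuple[str, bytes]]

def make_msg(partition: int, offset: int, payload: str,
             headers: List[Tuple[str, bytes]]) -> FakeKafkaMessage:
    return FakeKafkaMessage(partition, offset, payload.encode("utf-8"), headers)
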
def check_clickhouse() -> bool:
    """
    Checks if all the tables in all the enabled datasets exist in ClickHouse.
    """
    try:
        clickhouse_tables = clickhouse_ro.execute("show tables")
        for name in get_enabled_dataset_names():
            dataset = get_dataset(name)
            source = dataset.get_dataset_schemas().get_read_schema()
            if isinstance(source, TableSchema):
                table_name = source.get_table_name()
                if (table_name,) not in clickhouse_tables:
                    return False
        return True
    except Exception:
        return False

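# Hedged sketch of wiring check_clickhouse() into a health endpoint. The
# Flask app, route name, and response shape here are illustrative
# assumptions, not necessarily how snuba exposes its health check.
from flask import Flask, jsonify

application = Flask(__name__)

@application.route("/health")
def health():
    ok = check_clickhouse()
    return jsonify({"clickhouse_ok": ok}), (200 if ok else 502)
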
def test_messages(self):
    processor = GroupAssigneeProcessor("sentry_groupasignee")
    metadata = KafkaMessageMetadata(
        offset=42,
        partition=0,
    )

    begin_msg = json.loads(self.BEGIN_MSG)
    ret = processor.process_message(begin_msg, metadata)
    assert ret is None

    commit_msg = json.loads(self.COMMIT_MSG)
    ret = processor.process_message(commit_msg, metadata)
    assert ret is None

    insert_msg = json.loads(self.INSERT_MSG)
    ret = processor.process_message(insert_msg, metadata)
    assert ret.data == [self.PROCESSED]
    self.write_processed_records(ret.data)
    ret = clickhouse_ro.execute("SELECT * FROM test_groupassignee_local;")
    assert ret[0] == (
        42,    # offset
        0,     # deleted
        2,     # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),  # date_added
        1,     # user_id
        None,  # team_id
    )

    update_msg = json.loads(self.UPDATE_MSG_NO_KEY_CHANGE)
    ret = processor.process_message(update_msg, metadata)
    assert ret.data == [self.PROCESSED]

    # Tests an update with a key change, which becomes two inserts:
    # a deletion record and the insertion of the new row.
    update_msg = json.loads(self.UPDATE_MSG_WITH_KEY_CHANGE)
    ret = processor.process_message(update_msg, metadata)
    assert ret.data == [self.DELETED, self.PROCESSED_UPDATE]

    delete_msg = json.loads(self.DELETE_MSG)
    ret = processor.process_message(delete_msg, metadata)
    assert ret.data == [self.DELETED]

def test_bulk_load(self):
    row = GroupAssigneeRow.from_bulk({
        "project_id": "2",
        "group_id": "1359",
        "date_added": "2019-09-19 00:17:55+00",
        "user_id": "1",
        "team_id": "",
    })
    self.write_processed_records(row.to_clickhouse())
    ret = clickhouse_ro.execute("SELECT * FROM test_groupassignee_local;")
    assert ret[0] == (
        0,     # offset
        0,     # deleted
        2,     # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),  # date_added
        1,     # user_id
        None,  # team_id (empty string in the snapshot becomes NULL)
    )

def test_messages(self):
    processor = GroupedMessageProcessor("sentry_groupedmessage")
    metadata = KafkaMessageMetadata(
        offset=42,
        partition=0,
    )

    begin_msg = json.loads(self.BEGIN_MSG)
    ret = processor.process_message(begin_msg, metadata)
    assert ret is None

    commit_msg = json.loads(self.COMMIT_MSG)
    ret = processor.process_message(commit_msg, metadata)
    assert ret is None

    insert_msg = json.loads(self.INSERT_MSG)
    ret = processor.process_message(insert_msg, metadata)
    assert ret.data == [self.PROCESSED]
    self.write_processed_records(ret.data)
    ret = clickhouse_ro.execute("SELECT * FROM test_groupedmessage_local;")
    assert ret[0] == (
        42,  # offset
        0,   # deleted
        2,   # project_id
        74,  # id
        0,   # status
        datetime(2019, 6, 19, 6, 46, 28),  # last_seen
        datetime(2019, 6, 19, 6, 45, 32),  # first_seen
        datetime(2019, 6, 19, 6, 45, 32),  # active_at
        None,  # first_release_id
    )

    update_msg = json.loads(self.UPDATE_MSG)
    ret = processor.process_message(update_msg, metadata)
    assert ret.data == [self.PROCESSED]

    delete_msg = json.loads(self.DELETE_MSG)
    ret = processor.process_message(delete_msg, metadata)
    assert ret.data == [self.DELETED]

def test_bulk_load(self):
    row = GroupedMessageRow.from_bulk({
        "project_id": "2",
        "id": "10",
        "status": "0",
        "last_seen": "2019-06-28 17:57:32+00",
        "first_seen": "2019-06-28 06:40:17+00",
        "active_at": "2019-06-28 06:40:17+00",
        "first_release_id": "26",
    })
    self.write_processed_records(row.to_clickhouse())
    ret = clickhouse_ro.execute("SELECT * FROM test_groupedmessage_local;")
    assert ret[0] == (
        0,   # offset
        0,   # deleted
        2,   # project_id
        10,  # id
        0,   # status
        datetime(2019, 6, 28, 17, 57, 32),  # last_seen
        datetime(2019, 6, 28, 6, 40, 17),   # first_seen
        datetime(2019, 6, 28, 6, 40, 17),   # active_at
        26,  # first_release_id
    )

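# Hedged illustration of the conversions the two bulk-load tests above
# exercise: "+00"-suffixed snapshot timestamps become naive datetimes, and
# empty strings (e.g. team_id) become NULL. These are standalone helpers,
# not snuba's GroupedMessageRow/GroupAssigneeRow implementation.
from datetime import datetime
from typing import Optional

def parse_bulk_datetime(value: str) -> datetime:
    # "2019-06-28 17:57:32+00" -> datetime(2019, 6, 28, 17, 57, 32)
    return datetime.strptime(value.split("+")[0], "%Y-%m-%d %H:%M:%S")

def parse_bulk_int(value: str) -> Optional[int]:
    # "" -> None, matching the NULL team_id asserted above
    return int(value) if value else None
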
def test_messages(self):
    processor = GroupAssigneeProcessor("sentry_groupasignee")
    message_filter = CdcTableNameMessageFilter(postgres_table=POSTGRES_TABLE)

    metadata = KafkaMessageMetadata(
        offset=42,
        partition=0,
    )

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.BEGIN_MSG, []))
    begin_msg = json.loads(self.BEGIN_MSG)
    ret = processor.process_message(begin_msg, metadata)
    assert ret is None

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.COMMIT_MSG, []))
    commit_msg = json.loads(self.COMMIT_MSG)
    ret = processor.process_message(commit_msg, metadata)
    assert ret is None

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.INSERT_MSG,
                        [("table", POSTGRES_TABLE.encode())]))
    insert_msg = json.loads(self.INSERT_MSG)
    ret = processor.process_message(insert_msg, metadata)
    assert ret.data == [self.PROCESSED]
    self.write_processed_records(ret.data)
    ret = clickhouse_ro.execute("SELECT * FROM test_groupassignee_local;")
    assert ret[0] == (
        42,    # offset
        0,     # deleted
        2,     # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),  # date_added
        1,     # user_id
        None,  # team_id
    )

    assert not message_filter.should_drop(
        self.__make_msg(
            0,
            42,
            self.UPDATE_MSG_NO_KEY_CHANGE,
            [("table", POSTGRES_TABLE.encode())],
        ))
    update_msg = json.loads(self.UPDATE_MSG_NO_KEY_CHANGE)
    ret = processor.process_message(update_msg, metadata)
    assert ret.data == [self.PROCESSED]

    # Tests an update with a key change, which becomes two inserts:
    # a deletion record and the insertion of the new row.
    assert not message_filter.should_drop(
        self.__make_msg(
            0,
            42,
            self.UPDATE_MSG_WITH_KEY_CHANGE,
            [("table", POSTGRES_TABLE.encode())],
        ))
    update_msg = json.loads(self.UPDATE_MSG_WITH_KEY_CHANGE)
    ret = processor.process_message(update_msg, metadata)
    assert ret.data == [self.DELETED, self.PROCESSED_UPDATE]

    assert not message_filter.should_drop(
        self.__make_msg(0, 42, self.DELETE_MSG, []))
    delete_msg = json.loads(self.DELETE_MSG)
    ret = processor.process_message(delete_msg, metadata)
    assert ret.data == [self.DELETED]