class InsertStatementGenerator(object): def __init__(self, profile): # QueryProfile-like object self.profile = profile # used to generate SELECT queries for INSERT ... SELECT statements; # to ensure state is completely reset, this is created anew with each call to # generate_statement() self.select_stmt_generator = None def generate_statement(self, tables, dml_table): """ Return a randomly generated INSERT statement. tables should be a list of Table objects. A typical source of such a list comes from db_connection.DbCursor.describe_common_tables(). This list describes the possible "sources" of the INSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT will be into this table. """ if not (isinstance(tables, list) and len(tables) > 0 and all( (isinstance(t, Table) for t in tables))): raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') self.select_stmt_generator = QueryGenerator(self.profile) if dml_table.primary_keys: insert_statement = InsertStatement( conflict_action=InsertStatement.CONFLICT_ACTION_IGNORE) else: insert_statement = InsertStatement( conflict_action=InsertStatement.CONFLICT_ACTION_DEFAULT) insert_statement.execution = StatementExecutionMode.DML_TEST # Choose whether this is a # INSERT INTO table SELECT/VALUES # or # INSERT INTO table (col1, col2, ...) SELECT/VALUES # If the method returns None, it's the former. insert_column_list = self.profile.choose_insert_column_list(dml_table) insert_statement.insert_clause = InsertClause( dml_table, column_list=insert_column_list) # We still need to internally track the columns we're inserting. Keep in mind None # means "all" without an explicit column list. Since we've already created the # InsertClause object though, we can fill this in for ourselves. if insert_column_list is None: insert_column_list = dml_table.cols insert_item_data_types = [col.type for col in insert_column_list] # Decide whether this is INSERT VALUES or INSERT SELECT insert_source_clause = self.profile.choose_insert_source_clause() if issubclass(insert_source_clause, Query): # Use QueryGenerator()'s public interface to generate the SELECT. select_query = self.select_stmt_generator.generate_statement( tables, select_item_data_types=insert_item_data_types) # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The # generator's type system is not near sophisticated enough to know how random # expressions will be implicitly casted in the databases. This requires less work # to implement. IMPALA-4693 considers alternative approaches. self._cast_select_items(select_query, insert_column_list) insert_statement.with_clause = deepcopy(select_query.with_clause) select_query.with_clause = None insert_statement.select_query = select_query elif issubclass(insert_source_clause, ValuesClause): insert_statement.values_clause = self._generate_values_clause( insert_column_list) else: raise Exception('unsupported INSERT source clause: {0}'.format( insert_source_clause)) return insert_statement def _generate_values_clause(self, columns): """ Return a VALUES clause containing a variable number of rows. The values corresponding to primary keys will be non-null constants. Any other columns could be null, constants, or function trees that may or may not evaluate to null. """ values_rows = [] for _ in xrange(self.profile.choose_insert_values_row_count()): values_row = [] for col in columns: if col.is_primary_key: val = self.profile.choose_constant( return_type=col.exact_type, allow_null=False) elif 'constant' == self.profile.choose_values_item_expr(): val = self.profile.choose_constant( return_type=col.exact_type, allow_null=True) else: func_tree = self.select_stmt_generator.create_func_tree( col.type, allow_subquery=False) val = self.select_stmt_generator.populate_func_with_vals( func_tree) # Only the generic type, not the exact type, of the value will be known. To # avoid a lot of failed queries due to precision errors, we cast the val to # the exact type of the column. This will still not prevent "out of range" # conditions, as we don't try to evaluate the random expressions. val = CastFunc(val, col.exact_type) values_row.append(val) values_rows.append(ValuesRow(values_row)) return ValuesClause(values_rows) def _cast_select_items(self, select_query, column_list): """ For a given Query select_query and a column_list (list of Columns), cast each select item in select_query to the exact type of the column. A Query may have a UNION, recursively do this down the line. """ for col_idx, select_item in enumerate( select_query.select_clause.items): cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type) select_item.val_expr = cast_val_expr if select_query.union_clause: self._cast_select_items(select_query.union_clause.query, column_list)
class InsertStatementGenerator(object): def __init__(self, profile): # QueryProfile-like object self.profile = profile # used to generate SELECT queries for INSERT/UPSERT ... SELECT statements; # to ensure state is completely reset, this is created anew with each call to # generate_statement() self.select_stmt_generator = None def generate_statement(self, tables, dml_table): """ Return a randomly generated INSERT or UPSERT statement. Note that UPSERTs are very similar to INSERTs, which is why this generator handles both. tables should be a list of Table objects. A typical source of such a list comes from db_connection.DbCursor.describe_common_tables(). This list describes the possible "sources" of the INSERT/UPSERT's WITH and FROM/WHERE clauses. dml_table is a required Table object. The INSERT/UPSERT will be into this table. """ if not (isinstance(tables, list) and len(tables) > 0 and all((isinstance(t, Table) for t in tables))): raise Exception('tables must be a not-empty list of Table objects') if not isinstance(dml_table, Table): raise Exception('dml_table must be a Table') self.select_stmt_generator = QueryGenerator(self.profile) insert_statement = InsertStatement(execution=StatementExecutionMode.DML_TEST) # Choose whether this is a # INSERT/UPSERT INTO table SELECT/VALUES # or # INSERT/UPSERT INTO table (col1, col2, ...) SELECT/VALUES # If the method returns None, it's the former. insert_column_list = self.profile.choose_insert_column_list(dml_table) if dml_table.primary_keys: # Having primary keys implies the table is a Kudu table, which makes it subject to # both INSERTs (with automatic ignoring of primary key duplicates) and UPSERTs. conflict_action = self.profile.choose_insert_vs_upsert() else: conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT insert_statement.insert_clause = InsertClause( dml_table, column_list=insert_column_list, conflict_action=conflict_action) # We still need to internally track the columns we're inserting. Keep in mind None # means "all" without an explicit column list. Since we've already created the # InsertClause object though, we can fill this in for ourselves. if insert_column_list is None: insert_column_list = dml_table.cols insert_item_data_types = [col.type for col in insert_column_list] # Decide whether this is INSERT/UPSERT VALUES or INSERT/UPSERT SELECT insert_source_clause = self.profile.choose_insert_source_clause() if issubclass(insert_source_clause, Query): # Use QueryGenerator()'s public interface to generate the SELECT. select_query = self.select_stmt_generator.generate_statement( tables, select_item_data_types=insert_item_data_types) # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The # generator's type system is not near sophisticated enough to know how random # expressions will be implicitly casted in the databases. This requires less work # to implement. IMPALA-4693 considers alternative approaches. self._cast_select_items(select_query, insert_column_list) insert_statement.with_clause = deepcopy(select_query.with_clause) select_query.with_clause = None insert_statement.select_query = select_query elif issubclass(insert_source_clause, ValuesClause): insert_statement.values_clause = self._generate_values_clause(insert_column_list) else: raise Exception('unsupported INSERT/UPSERT source clause: {0}'.format( insert_source_clause)) return insert_statement def _generate_values_clause(self, columns): """ Return a VALUES clause containing a variable number of rows. The values corresponding to primary keys will be non-null constants. Any other columns could be null, constants, or function trees that may or may not evaluate to null. """ values_rows = [] for _ in xrange(self.profile.choose_insert_values_row_count()): values_row = [] for col in columns: if col.is_primary_key: val = self.profile.choose_constant(return_type=col.exact_type, allow_null=False) elif 'constant' == self.profile.choose_values_item_expr(): val = self.profile.choose_constant(return_type=col.exact_type, allow_null=True) else: func_tree = self.select_stmt_generator.create_func_tree( col.type, allow_subquery=False) val = self.select_stmt_generator.populate_func_with_vals(func_tree) # Only the generic type, not the exact type, of the value will be known. To # avoid a lot of failed queries due to precision errors, we cast the val to # the exact type of the column. This will still not prevent "out of range" # conditions, as we don't try to evaluate the random expressions. val = CastFunc(val, col.exact_type) values_row.append(val) values_rows.append(ValuesRow(values_row)) return ValuesClause(values_rows) def _cast_select_items(self, select_query, column_list): """ For a given Query select_query and a column_list (list of Columns), cast each select item in select_query to the exact type of the column. A Query may have a UNION, recursively do this down the line. """ for col_idx, select_item in enumerate(select_query.select_clause.items): cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type) select_item.val_expr = cast_val_expr if select_query.union_clause: self._cast_select_items(select_query.union_clause.query, column_list)