def __init__(self, catalog_table_name=None, table_override_map=None,
             left_join_list=None, inner_join_list=None, implicit_join=False):
    """
      Initialise every lookup table of the column map to its empty state.

      catalog_table_name
        Stored as-is in self.catalog_table_name.
      table_override_map
        Stored in self.table_override_map; any false value is replaced by a
        new empty dict.
      left_join_list
        Stored as-is in self.left_join_list. Must be empty when
        implicit_join is true, and must not share any entry with
        inner_join_list.
      inner_join_list
        Stored as-is in self.inner_join_list.
      implicit_join
        Stored as-is in self.implicit_join.

      Raises AssertionError when the join parameters are inconsistent.
    """
    self.catalog_table_name = catalog_table_name
    # Key: group
    # Value: set of column names
    self.registry = {}
    # Key: group
    # Value: dict
    #  Key: column name
    #  Value: set of SimpleQuery
    self.simple_query_dict = {}
    # Key: (group, column name)
    # Value: table name
    self.column_map = {}
    # Key: (group, table name)
    # Value: table alias
    self.table_alias_dict = {}
    # Key: related key name
    # Value: (group, column name)
    self.related_key_dict = {}
    # Key: related_column
    # Value: last used alias order
    self.related_key_order_dict = {}
    # Key: group
    # Value: relate_key
    self.related_group_dict = {}
    # Key: table alias
    # Value: table name
    self.table_map = {}
    # Key: raw column
    # Value: (function, column)
    self.raw_column_dict = {}
    # Entries: column name
    self.column_ignore_set = set()
    self.join_table_map = {}
    # BBB: Remove join_query_list and its uses when all RelatedKey
    # methods have been converted to properly return each Join
    # condition separately, and all uses of catalog's from_expression
    # have been removed.
    self.join_query_list = []
    self.table_override_map = table_override_map or {}
    self.table_definition = PlaceHolderTableDefinition()
    # We need to keep track of the original definition to do inner joins on it
    self._inner_table_definition = self.table_definition
    self.left_join_list = left_join_list
    self.implicit_join = implicit_join
    assert not (self.implicit_join and self.left_join_list), (
        "Cannot do left_joins while forcing implicit join"
    )
    self.inner_join_list = inner_join_list
    # Both lists default to None: compare them as empty collections so that
    # constructing the object with default arguments does not raise TypeError.
    assert not set(left_join_list or ()).intersection(inner_join_list or ()), (
        "left_join_list and inner_join_list intersect"
    )
def __init__(self, catalog_table_name=None, table_override_map=None,
             left_join_list=None, implicit_join=False):
    """
      Set up an empty column map.

      Every parameter is stored on the instance under its own name;
      a false table_override_map is replaced by a new empty dict.
      Raises AssertionError if implicit_join is true while left_join_list
      is non-empty.
    """
    # Caller-provided configuration.
    self.catalog_table_name = catalog_table_name
    self.table_override_map = table_override_map if table_override_map else {}
    self.left_join_list = left_join_list
    self.implicit_join = implicit_join

    # Column bookkeeping.
    # registry: group -> set of column names
    self.registry = {}
    # simple_query_dict: group -> {column name -> set of SimpleQuery}
    self.simple_query_dict = {}
    # column_map: (group, column name) -> table name
    self.column_map = {}
    # raw_column_dict: raw column -> (function, column)
    self.raw_column_dict = {}
    # column_ignore_set: column names
    self.column_ignore_set = set()

    # Table bookkeeping.
    # table_alias_dict: (group, table name) -> table alias
    self.table_alias_dict = {}
    # table_map: table alias -> table name
    self.table_map = {}

    # Related key bookkeeping.
    # related_key_dict: related key name -> (group, column name)
    self.related_key_dict = {}
    # related_key_order_dict: related_column -> last used alias order
    self.related_key_order_dict = {}
    # related_group_dict: group -> relate_key
    self.related_group_dict = {}

    # Join bookkeeping.
    self.join_table_map = {}
    # BBB: Remove join_query_list and its uses when all RelatedKey
    # methods have been converted to properly return each Join
    # condition separately, and all uses of catalog's from_expression
    # have been removed.
    self.join_query_list = []

    # The original definition is kept under a second name because inner
    # joins have to be done on it.
    placeholder_definition = PlaceHolderTableDefinition()
    self.table_definition = placeholder_definition
    self._inner_table_definition = placeholder_definition

    assert not self.implicit_join or not self.left_join_list, (
        "Cannot do left_joins while forcing implicit join"
    )
class ColumnMap(object):
    """
      Bookkeeping of the columns, tables, table aliases and joins used by a
      catalog query.

      Columns are registered per group, then "build" maps every registered
      column to a table and every table to an alias, and computes the joins
      needed to reach those tables from the catalog table.
    """
    implements(IColumnMap)

    @profiler_decorator
    def __init__(self, catalog_table_name=None, table_override_map=None,
                 left_join_list=None, implicit_join=False):
        """
          catalog_table_name
            Name of the catalog table. When None, "build" does nothing and
            "asSQLColumn" returns raw columns unchanged.
          table_override_map
            Dictionary mapping table names to replacement table definitions
            (see _getTableOverride). Any false value becomes an empty dict.
          left_join_list
            Column names for which a left join is requested. May be None.
          implicit_join
            When true, backward compatibility (implicit join) mode is forced;
            cannot be combined with a non-empty left_join_list.
        """
        self.catalog_table_name = catalog_table_name
        # Key: group
        # Value: set of column names
        self.registry = {}
        # Key: group
        # Value: dict
        #  Key: column name
        #  Value: set of SimpleQuery
        self.simple_query_dict = {}
        # Key: (group, column name)
        # Value: table name
        self.column_map = {}
        # Key: (group, table name)
        # Value: table alias
        self.table_alias_dict = {}
        # Key: related key name
        # Value: (group, column name)
        self.related_key_dict = {}
        # Key: related_column
        # Value: last used alias order
        self.related_key_order_dict = {}
        # Key: group
        # Value: relate_key
        self.related_group_dict = {}
        # Key: table alias
        # Value: table name
        self.table_map = {}
        # Key: raw column
        # Value: (function, column)
        self.raw_column_dict = {}
        # Entries: column name
        self.column_ignore_set = set()
        # Key: (group, table name)
        # Value: set of columns requiring that table to be joined
        self.join_table_map = dict()
        # BBB: Remove join_query_list and its uses when all RelatedKey
        # methods have been converted to properly return each Join
        # condition separately, and all uses of catalog's from_expression
        # have been removed.
        self.join_query_list = []
        self.table_override_map = table_override_map or {}
        self.table_definition = PlaceHolderTableDefinition()
        # We need to keep track of the original definition to do inner joins on it
        self._inner_table_definition = self.table_definition
        self.left_join_list = left_join_list
        self.implicit_join = implicit_join
        assert not (self.implicit_join and self.left_join_list), (
            "Cannot do left_joins while forcing implicit join")

    @profiler_decorator
    def registerColumn(self, raw_column, group=DEFAULT_GROUP_ID, simple_query=None):
        """
          Register a column as being used in given group.

          raw_column (string)
            Column expression, optionally wrapped in a single function call
            and optionally prefixed by a table name. Must not contain " as ".
          group (string)
            Group in which the column is registered.
          simple_query
            Added to the set of queries recorded for this column.
        """
        assert ' as ' not in raw_column.lower()
        # Sanitize input: extract column from raw column (might contain COUNT, ...).
        # XXX This is not enough to parse something like:
        # GROUP_CONCAT(DISTINCT foo ORDER BY bar)
        if '(' in raw_column:
            function, column = raw_column.split('(')
            column = column.strip()
            assert column[-1] == ')'
            column = column[:-1].strip()
        else:
            function = None
            column = raw_column
        # Remove 'DISTINCT ' etc. from column.
        column = column.split()[-1]
        # Remove '`' from column.
        column = column.replace('`', '')
        # Extract table name from column, if any.
        if '.' in column:
            # Assuming the part before the dot is a real table name, not an alias.
            table, column = column.split('.')
        else:
            table = None

        self.raw_column_dict[raw_column] = (function, column)
        self.registry.setdefault(group, set()).add(column)
        self.simple_query_dict.setdefault(group, {}).setdefault(
            column, set()).add(simple_query)
        if table is not None:
            # Register table alias and mark column as resolved.
            self.registerTable(table, alias=table, group=group)
            self.resolveColumn(column, table, group=group)
            if group is DEFAULT_GROUP_ID and table != self.catalog_table_name:
                # When a column is registered in default group and is explicitely
                # mapped to a table, we must mark its table as requiring a join with
                # catalog table (unless it's the catalog table, of course).
                self._addJoinTableForColumn(table, table + "." + column, group)

    def ignoreColumn(self, column):
        """
          Exclude given column from mapping: "build" skips it and
          "asSQLColumn" returns it unchanged.
        """
        self.column_ignore_set.add(column)

    @profiler_decorator
    def registerRelatedKey(self, related_column, column):
        """
          Register a new use of a related key and return the name of the
          group created for it.

          Each call for the same related_column gets a new, numbered group.
        """
        # XXX: should we store the group, or directly the table on which the column is mapped ?
        # The former avoids duplicating data, but requires one more lookup (group + column -> table)
        # The latter makes it harder (?) to split the mapping in multiple queries (if splitting by groups turns out to be a good idea)
        real_related_column = related_column
        order = self.related_key_order_dict.get(real_related_column, 0) + 1
        related_column = '%s_%s' % (related_column, order)
        group = 'related_%s' % (related_column, )
        assert group not in self.registry
        assert group not in self.related_group_dict
        self.related_key_order_dict[real_related_column] = order
        self.related_key_dict[real_related_column] = (group, column)
        self.registerColumn(column, group=group)
        self.related_group_dict[group] = related_column
        return group

    @profiler_decorator
    def registerCatalog(self):
        """
          Register catalog as being in use in query, and aliased with its own
          name.

          This is used by SearchKey/RelatedKey.py: there is no way to reliably
          detect if catalog table is used in a related key, so the catalog
          table might be absent from final table mapping.
          DO NOT USE IT ANYWHERE ELSE, this will go away...

          This must be changed by designing a new related key API, which must:
          - state *all* tables they use in their definition
          - return Query instances instead of raw SQL code
          This will allow chaining related keys and consequently allow
          simplifying redundant code.
        """
        assert self.catalog_table_name is not None
        self.registerTable(self.catalog_table_name)
        self.resolveTable(self.catalog_table_name, self.catalog_table_name)

    @profiler_decorator
    def registerRelatedKeyColumn(self, related_column, position, group):
        """
          Declare a sub-group of an existing related key group, for the
          column at given position, and return the sub-group name.
        """
        assert group in self.related_group_dict
        group = self.getRelatedKeyGroup(position, group)
        assert group not in self.related_group_dict
        self.related_group_dict[group] = related_column
        return group

    def getRelatedKeyGroup(self, position, group):
        """
          Return the sub-group name for given position in given group.
        """
        return '%s_column_%s' % (group, position)

    @profiler_decorator
    def registerTable(self, table_name, alias=None, group=DEFAULT_GROUP_ID):
        """
          Declare a table as being used in given group, optionally with a
          fixed alias.

          Raises ValueError if the table is already aliased differently in
          that group.
        """
        table_alias_dict = self.table_alias_dict
        table_alias_key = (group, table_name)
        existing_value = table_alias_dict.get(table_alias_key)
        # alias = None, existing = None -> store
        # alias = None, existing ! None -> skip
        # alias ! None, existing = None -> store & resolve
        # alias ! None, existing ! None -> skip if alias = existing, raise otherwise
        if existing_value is None:
            table_alias_dict[table_alias_key] = alias
            if alias is not None:
                self.resolveTable(table_name, alias, group=group)
        elif alias is not None and alias != existing_value:
            raise ValueError(
                'Table %r for group %r is aliased as %r, can\'t alias it now as %r' % (
                    table_name, group, existing_value, alias))

    @profiler_decorator
    def _mapColumns(self, column_table_map, table_usage_dict, column_name_set,
                    group, vote_result_dict):
        """
          Choose a table for each column of column_name_set, then register
          and resolve the chosen tables and columns in given group.

          column_table_map
            Mapping of column name to the tables providing that column.
          table_usage_dict
            Mapping of table name to the set of used columns it provides.
            Modified in place.
          column_name_set
            Set of column names still to be mapped. Emptied in place.
          vote_result_dict
            Mapping of column name to a dict of table name -> score.

          Raises ValueError if some columns could not be mapped.
        """
        mapping_dict = {}
        catalog_table_name = self.catalog_table_name

        # Map all columns to tables decided by vote.
        for column_name, candidate_dict in vote_result_dict.iteritems():
            # candidate_dict is never empty
            max_score = 0
            for table_name, score in candidate_dict.iteritems():
                if score > max_score:
                    max_score = score
                    best_count = 0
                    best_choice = table_name
                elif score == max_score:
                    best_count += 1
            if best_count:
                LOG('ColumnMap', WARNING,
                    'Mapping vote led to a tie. Mapping to %r' % (best_choice, ))
            if MAPPING_TRACE:
                LOG('ColumnMap', INFO,
                    'Mapping by vote %r to %r' % (column_name, best_choice))
            mapping_dict[column_name] = best_choice
            column_name_set.remove(column_name)
            for table_name, column_set in table_usage_dict.iteritems():
                if table_name != best_choice:
                    column_set.discard(column_name)

        # Map all remaning columns.
        def table_weight(a):
            """
              Compute each table weight.
            """
            if (group, a[0]) in self.table_alias_dict:
                result = (2, )
            elif a[0] == catalog_table_name:
                result = (1, )
            else:
                result = (0, len(a[1]))
            return result
        # Sort table name list, first has the most required columns
        weighted_table_list = sorted(table_usage_dict.iteritems(), key=table_weight)
        while len(weighted_table_list):
            table_name, column_set = weighted_table_list.pop()
            if len(column_set):
                common_column_set = column_name_set.intersection(column_set)
                if len(common_column_set):
                    # Only allow usage of this table if any of those is true:
                    # - current table is the catalog (if any catalog was provided)
                    # - there are column used on that table which are already mapped
                    #   (this does not include columns mapped by this code)
                    #   If columns are mapped to this table in current group, then using
                    #   it will not require a new join, so it should be allowed.
                    #   Note: it would be good to take indexes into account when there
                    #   are multiple candidate tables.
                    # - any of those columns belongs exclusively to this table
                    #   Although the list of tables those columns belong to is known
                    #   earlier (in "build"), mapping them here
                    #   - avoids code duplication (registerTable, resolveColumn,
                    #     _addJoinTableForColumn)
                    #   - offers user to vote for an unknown table, overriding this
                    #     forced mapping.
                    use_allowed = table_name == catalog_table_name or \
                        len(common_column_set) < len(column_set)
                    if not use_allowed:
                        for column_name in column_set:
                            if len(column_table_map.get(column_name, [])) == 1:
                                # There is no alternative, mark as required
                                use_allowed = True
                                break
                    if use_allowed:
                        for column_name in common_column_set:
                            if MAPPING_TRACE:
                                LOG('ColumnMap', INFO, 'Mapping by default %r to %r' %
                                    (column_name, table_name))
                            mapping_dict[column_name] = table_name
                            # This column must not be resolved any longer
                            column_name_set.remove(column_name)
                            # Remove this column from sets containing it. This prevents from
                            # giving a high score to a table which columns would already have
                            # been mapped to another table.
                            for ignored, other_column_set in weighted_table_list:
                                other_column_set.discard(column_name)
                        weighted_table_list.sort(key=table_weight)
                    else:
                        # All column which are mappable on that table are to-be-mapped
                        # columns. This means that this table was not explicitely used, and
                        # as each table contain a different amount of lines, we should not
                        # join with any non-explicit table. Hence, we skip this mapping.
                        LOG('ColumnMap', INFO,
                            'Skipping possible map of %r on %r as that table'
                            ' is not explicitely used.' % (common_column_set, table_name))

        # Detect incomplete mappings
        if len(column_name_set):
            raise ValueError('Could not map those columns: %r' % (column_name_set, ))

        # Do the actual mapping
        for column_name, table_name in mapping_dict.iteritems():
            # Mark this column as resolved
            if MAPPING_TRACE:
                LOG('ColumnMap', INFO,
                    'Mapping column %s to table %s' % (column_name, table_name))
            self.registerTable(table_name, group=group)
            self.resolveColumn(column_name, table_name, group=group)
            if table_name != catalog_table_name:
                self._addJoinTableForColumn(table_name, column_name, group)

    @profiler_decorator
    def build(self, sql_catalog):
        """
          Map every registered column to a table, give every registered
          table an alias and compute missing joins.

          Does nothing when no catalog table name was provided.

          sql_catalog
            Object providing getColumnMap, getRelatedKeyDefinition,
            getSearchKey, the sql_catalog_table_vote_scripts name list and
            the vote methods it names.
        """
        join_query_to_build_list = []
        catalog_table_name = self.catalog_table_name
        if catalog_table_name is None:
            return

        column_table_map = sql_catalog.getColumnMap()
        table_vote_method_list = [
            getattr(sql_catalog, x)
            for x in sql_catalog.sql_catalog_table_vote_scripts
        ]

        # Generate missing joins from default group (this is required to allow using related keys outside of queries: order_by, sort_on, ...)
        column_set = self.registry.get(DEFAULT_GROUP_ID, [])
        for column_name in column_set:
            if column_name not in column_table_map and column_name not in self.related_key_dict:
                related_key_definition = sql_catalog.getRelatedKeyDefinition(
                    column_name)
                if related_key_definition is not None:
                    join_query = sql_catalog.getSearchKey(
                        column_name, 'RelatedKey').buildQuery(
                            sql_catalog=sql_catalog,
                            related_key_definition=related_key_definition)
                    join_query.registerColumnMap(sql_catalog, self)
                    join_query_to_build_list.append(join_query)

        # List all possible tables, with all used column for each
        for group, column_set in self.registry.iteritems():
            # unique needed column name set
            column_name_set = set()
            # table -> column_set, including alternatives
            table_usage_dict = {}

            for column_name in column_set:
                if column_name == '*' or column_name in self.column_ignore_set:
                    continue
                table_name_list = column_table_map.get(column_name, [])
                if len(table_name_list) == 0:
                    if not (group is DEFAULT_GROUP_ID and column_name in self.related_key_dict):
                        LOG('ColumnMap', WARNING,
                            'Not a known column name: %r' % (column_name, ))
                    continue
                column_map_key = (group, column_name)
                if column_map_key in self.column_map:
                    # Column is already mapped, so we must count this column as being available only on that table. Its mapping will not change, and it will impact table schema choice.
                    table_name = self.column_map[column_map_key]
                    assert table_name in table_name_list, '%r not in %r' % (
                        table_name, table_name_list)
                    table_name_list = [table_name]
                else:
                    # Mark this column as requiring to be mapped.
                    column_name_set.add(column_name)
                for table_name in table_name_list:
                    table_usage_dict.setdefault(table_name, set()).add(column_name)
            # XXX: mutable datatypes are provided to vote method. if it modifies
            # them, it can introduce mapping bugs. Copying them might be costly,
            # especialy if done before each call, since they also contain mutable
            # types.
            # XXX: the API of vote methods is not stable yet. Parameters should
            # always be passed and expected by name, to make it less painful to
            # change API.
            # XXX: there is no check that the table voted for contains mapped
            # column. It is up to the user not to do stupid things.
            vote_result_dict = {}
            simple_query_dict = self.simple_query_dict[group]
            for table_vote_method in table_vote_method_list:
                vote_dict = table_vote_method(
                    column_name_set=column_name_set,
                    simple_query_dict=simple_query_dict,
                    table_usage_dict=table_usage_dict,
                    group=group)
                if isinstance(vote_dict, dict):
                    for column, table in vote_dict.iteritems():
                        if column in column_name_set:
                            column_vote_dict = vote_result_dict.setdefault(
                                column, {})
                            column_vote_dict[table] = column_vote_dict.get(
                                table, 0) + 1
                        else:
                            LOG('ColumnMap', WARNING,
                                'Vote script %r voted for a '
                                'non-candidate column: %r, candidates are: %r. Ignored.' %
                                (table_vote_method, column, column_name_set))
                else:
                    LOG('ColumnMap', WARNING,
                        'Vote script %r returned invalid data: %r. '
                        'Ignored.' % (table_vote_method, vote_dict))
            self._mapColumns(column_table_map, table_usage_dict, column_name_set,
                             group, vote_result_dict)

        table_alias_number_dict = {}

        for (group, table_name), alias in self.table_alias_dict.iteritems():
            if alias is None:
                if group in self.related_group_dict:
                    alias_table_name = 'related_%s_%s' % (
                        self.related_group_dict[group], table_name)
                else:
                    alias_table_name = table_name
                table_alias_number = table_alias_number_dict.get(
                    alias_table_name, 0)
                # Try successive numbered aliases until one is unused.
                while True:
                    if table_alias_number == 0:
                        alias = alias_table_name
                    else:
                        alias = '%s_%s' % (alias_table_name, table_alias_number)
                    table_alias_number += 1
                    if alias not in self.table_map:
                        break
                table_alias_number_dict[alias_table_name] = table_alias_number
            self.resolveTable(table_name, alias, group=group)

        # now that we have all aliases, calculate missing joins comming from
        # non-RelatedKey relationships (like full_text).
        self.registerCatalog()
        self._calculateMissingJoins()
        # and all left joins that did not come from explicit queries
        # (i.e. joins comming from 'sort_on', 'select_dict', etc.)
        for join_query in join_query_to_build_list:
            # XXX ugly use of inner attribute of join_query. Please Refactor:
            # search_keys don't actually return SQLExpressions, but they add
            # join definitions in the column_map
            join_query.search_key.buildSQLExpression(
                sql_catalog=sql_catalog,
                column_map=self,
                only_group_columns=False,
                group=join_query.group,
            )

        if MAPPING_TRACE:
            # Key: group
            # Value: 2-tuple
            #  dict
            #   Key: column
            #   Value: table name
            #  dict
            #   Key: table name
            #   Value: table alias
            summary_dict = {}
            for (group, column), table_name in self.column_map.iteritems():
                column_dict = summary_dict.setdefault(group, ({}, {}))[0]
                assert column not in column_dict, '%r in %r' % (column, column_dict)
                column_dict[column] = table_name
            for (group, table_name), table_alias in self.table_alias_dict.iteritems():
                table_dict = summary_dict.setdefault(group, ({}, {}))[1]
                assert table_name not in table_dict, '%r in %r' % (table_name, table_dict)
                table_dict[table_name] = table_alias
            for group, (column_dict, table_dict) in summary_dict.iteritems():
                LOG('ColumnMap', INFO, 'Group %r:' % (group, ))
                LOG('ColumnMap', INFO, ' Columns:')
                for column, table_name in column_dict.iteritems():
                    LOG('ColumnMap', INFO, ' %r from table %r' % (column, table_name))
                LOG('ColumnMap', INFO, ' Tables:')
                for table_name, table_alias in table_dict.iteritems():
                    LOG('ColumnMap', INFO, ' %r as %r' % (table_name, table_alias))

    def asSQLColumn(self, raw_column, group=DEFAULT_GROUP_ID):
        """
          Return the SQL expression for given raw column: the column prefixed
          by the alias of the table it is mapped to, wrapped in its function
          if any.

          The raw column is returned unchanged when there is no catalog
          table, when it is ignored, or when it contains '.' or '*'.
        """
        if self.catalog_table_name is None or raw_column in self.column_ignore_set or \
                '.' in raw_column or '*' in raw_column:
            result = raw_column
        else:
            function, column = self.raw_column_dict.get(
                raw_column, (None, raw_column))
            if group is DEFAULT_GROUP_ID:
                # NOTE(review): the fallback is raw_column, not the parsed
                # column. For a function-wrapped raw column which is not a
                # related key, the lookup below then uses the unparsed text.
                # Confirm whether this is intended.
                group, column = self.related_key_dict.get(
                    column, (group, raw_column))
            alias = self.table_alias_dict[(group, self.column_map[(group, column)])]
            result = '`%s`.`%s`' % (alias, column)
            if function is not None:
                result = '%s(%s)' % (function, result)
        return result

    def getCatalogTableAlias(self, group=DEFAULT_GROUP_ID):
        """
          Return the alias of the catalog table in given group.
          Raises KeyError if the catalog table is not registered in it.
        """
        return self.table_alias_dict[(group, self.catalog_table_name)]

    def _isBackwardCompatibilityRequired(self):
        """
          Tell whether joins must be rendered implicitly rather than through
          the table definition.
        """
        return bool(
            # if they explicitly ask for implicit
            self.implicit_join or
            # if they don't pass a catalog alias, we cannot do explicit joins
            not self._setMinimalTableDefinition() or
            # If one or more RelatedKey methods weren't converted, we'll get
            # queries for an implicit inner join, so we have to do all joins
            # as implicit.
            self.join_query_list or
            # for now, work in BW compat mode if a table_override
            # is passed. It only works for simple subselect
            # definitions anyway, and it's being used primarily
            # for writing left-joins manually.
            self.table_override_map)

    def getTableAliasDict(self):
        """
          Return a copy of the alias -> table name mapping in backward
          compatibility mode, None otherwise.
        """
        if self._isBackwardCompatibilityRequired():
            # BBB: Using implicit joins or explicit from_expression
            return self.table_map.copy()
        else:
            return None

    @profiler_decorator
    def resolveColumn(self, column, table_name, group=DEFAULT_GROUP_ID):
        """
          Map a registered column to a registered table in given group.

          Raises ValueError when the column is already mapped to another
          table, except for 'uid' where the new mapping is ignored with a
          warning.
        """
        assert group in self.registry
        assert column in self.registry[group]
        column_map_key = (group, column)
        column_map = self.column_map
        assert (group, table_name) in self.table_alias_dict
        previous_value = column_map.get(column_map_key)
        if previous_value is None:
            column_map[column_map_key] = table_name
        elif previous_value != table_name:
            if column == 'uid':
                LOG('ColumnMap', WARNING,
                    'Attempt to remap uid from %r to %r ignored.' % (
                        previous_value, table_name))
            else:
                raise ValueError(
                    'Cannot remap a column to another table. column_map[%r] = %r, new = %r' % (
                        column_map_key, column_map.get(column_map_key), table_name))

    @profiler_decorator
    def resolveTable(self, table_name, alias, group=DEFAULT_GROUP_ID):
        """
          Set the alias of a registered table in given group, and record
          which table the alias stands for.
        """
        table_alias_key = (group, table_name)
        assert table_alias_key in self.table_alias_dict
        assert self.table_alias_dict[table_alias_key] in (None, alias)
        self.table_alias_dict[table_alias_key] = alias
        assert self.table_map.get(alias) in (None, table_name)
        self.table_map[alias] = table_name

    def getTableAlias(self, table_name, group=DEFAULT_GROUP_ID):
        """
          Return the alias of given table in given group.
          Raises KeyError if the table is not registered in it.
        """
        return self.table_alias_dict[(group, table_name)]

    def _addJoinQueryForColumn(self, column, query):
        """
          Record a join query to be rendered as an implicit join.

          Raises RuntimeError if a left join was requested for given column.
        """
        # BBB: This is a backward compatibility method that will be
        # removed in the future, when all related key methods have been adapted
        # to provide all Join conditions separately
        # left_join_list may be None (constructor default): treat as empty.
        if column in (self.left_join_list or ()):
            raise RuntimeError(
                'Left Join requested for column: %r, but rendered '
                'join query is not compatible and would result in an '
                'Implicit Inner Join:\n%s' % (
                    column,
                    query,
                ))
        self.join_query_list.append(query)

    def iterJoinQueryList(self):
        """
          Return an iterable over the join queries to render as implicit
          joins; empty unless backward compatibility mode is required.
        """
        if self._isBackwardCompatibilityRequired():
            # Return all join queries for implicit join, and all the other
            # queries we were using to build explicit joins, but won't be able to.
            return itertools.chain(
                self.join_query_list,
                self.table_definition.getJoinConditionQueryList())
        return []

    @profiler_decorator
    def _addJoinTableForColumn(self, table_name, column, group=DEFAULT_GROUP_ID):
        """
          Declare given table as requiring to be joined with catalog table on uid.

          table_name (string)
            Table name.
          column (string)
            Column for which the join is needed.
          group (string)
            Group id of given table.
        """
        catalog_table = self.catalog_table_name
        if catalog_table is not None:
            # Only join tables when there is a catalog table
            # Register unconditionaly catalog table
            self.registerTable(catalog_table)
            if 'uid' not in self.registry.get(DEFAULT_GROUP_ID, ()):
                # Register uid column if it is not already
                self.registerColumn('uid')
                self.resolveColumn('uid', catalog_table)
            self.join_table_map.setdefault((group, table_name), set()).add(column)

    def getJoinTableAliasList(self):
        """
          Return the aliases of all tables declared as requiring a join with
          the catalog table.
        """
        return [
            self.getTableAlias(table_name, group=group)
            for (group, table_name) in self.join_table_map.keys()
        ]

    def _getTableOverride(self, table_name):
        """
          Return the override registered for given table with its trailing
          alias removed, or the table name itself when there is none.
        """
        # self.table_override_map is a dictionary mapping table names to
        # strings containing aliases of arbitrary table definitions
        # (including subselects). So we split the alias and discard it
        # since we do our own aliasing.
        table_override_w_alias = self.table_override_map.get(table_name)
        if table_override_w_alias is None:
            return table_name
        # XXX move the cleanup of table alias overrides to EntireQuery
        # class or ZSQLCatalog, so we don't need SQL syntax knowledge in
        # ColumnMap.
        #
        # Normalise the AS sql keyword to remove the last
        # aliasing in the string if present. E.g.:
        #
        # '(SELECT sub_catalog.*
        #   FROM catalog AS sub_catalog
        #   WHERE sub_catalog.parent_uid=183) AS catalog'
        #
        # becomes:
        #
        # '(SELECT sub_catalog.*
        #   FROM catalog AS sub_catalog
        #   WHERE sub_catalog.parent_uid=183)'
        table_override, removed = re_sql_as.subn('', table_override_w_alias)
        assert removed < 2, (
            'More than one table aliasing was removed from %r' %
            table_override_w_alias)
        if removed:
            LOG('ColumnMap', WARNING,
                'Table overrides should not contain aliasing: %r' % table_override)
        return table_override

    def makeTableAliasDefinition(self, table_name, table_alias):
        """Make a table alias, giving a chance to ColumnMap to override
        the original table definition with another expression"""
        table_name = self._getTableOverride(table_name)
        assert table_name and table_alias, (
            "table_name (%r) and table_alias (%r) "
            "must both be defined" % (table_name, table_alias))
        return TableAlias(table_name, table_alias)

    def _setMinimalTableDefinition(self):
        """
          Set a minimal table definition: the main catalog alias

          We don't do this at __init__ because we have neither the catalog
          table name nor its intended alias at that point.

          Return False if the catalog table has no alias yet, True otherwise.
        """
        inner_def = self._inner_table_definition
        if inner_def.table_definition is None:
            try:
                catalog_table_alias = self.getCatalogTableAlias()
            except KeyError:
                LOG('ColumnMap', WARNING,
                    '_setMinimalTableDefinition called but the main catalog has not '
                    'yet received an alias!')
                return False
            inner_def.replace(
                self.makeTableAliasDefinition(self.catalog_table_name,
                                              catalog_table_alias))
        return True

    def getTableDefinition(self):
        """
          Return the table definition after checking its aliases, or None in
          backward compatibility mode.
        """
        if self._isBackwardCompatibilityRequired():
            # BBB: One of the RelatedKeys registered an implicit join, do
            # not return a table definition, self.getTableAliasDict() should
            # be used instead
            return None
        self.table_definition.checkTableAliases()
        return self.table_definition

    def addRelatedKeyJoin(self, column, right_side, condition):
        """
          Wraps the current table_definition in the left-side of a new join.

          A LeftJoin is used when the column is in left_join_list, or when
          implicit_join is false and the column is registered in the default
          group. An InnerJoin is used otherwise.
        """
        # XXX: to fix TestERP5Catalog.test_52_QueryAndTableAlias, create
        # here a list of joins and try to merge each new entry into one of
        # the pre-existing entries by comparing their right-sides.
        #
        # XXX 2: This is the place were we could do ordering of inner and left
        # joins so as to get better performance. For instance, a quick win is to
        # add all inner-joins first, and all left-joins later. We could also decide
        # on the order of left-joins based on the order of self.left_join_list or
        # even a catalog property/configuration/script.
        #
        # XXX 3: This is also the place where we could check if explicit
        # table aliases should cause some of these table definitions to be
        # collapsed into others.

        # Keep the call out of the assert statement: asserts are removed when
        # running with "python -O", which would skip the call altogether.
        is_table_definition_set = self._setMinimalTableDefinition()
        assert is_table_definition_set
        # left_join_list may be None (constructor default): treat as empty.
        use_left_join = (
            column in (self.left_join_list or ()) or
            (not self.implicit_join and
             column in self.registry.get(DEFAULT_GROUP_ID, ())))
        Join = LeftJoin if use_left_join else InnerJoin
        join_definition = Join(self.table_definition, right_side,
                               condition=condition)
        self.table_definition = join_definition

    # def getFinalTableDefinition(self):
    #   self._calculateMissingJoins()
    #   return self.getTableDefinition()

    def _calculateMissingJoins(self):
        """
          Join every table declared in join_table_map with the catalog table
          on uid, inside the innermost table definition.
        """
        # left_join_list may be None (constructor default): treat as empty.
        left_join_set = set(self.left_join_list or ())
        self._setMinimalTableDefinition()
        catalog_table_alias = self.getCatalogTableAlias()
        for (group, table_name), column_set in self.join_table_map.items():
            # if any of the columns for this implicit join was requested as a
            # left-join, then all columns will be subject to a left-join.
            # XXX What if one of the columns was an actual query, as opposed to a
            # sort column or select_dict? This would cause results in the main
            # catalog that don't match the query to be present as well. We expect
            # the user which passes a left_join_list to know what he is doing.
            if column_set.intersection(left_join_set):
                Join = LeftJoin
            else:
                Join = InnerJoin
            table_alias = self.getTableAlias(table_name, group=group)
            table_alias_def = self.makeTableAliasDefinition(
                table_name, table_alias)
            # XXX: perhaps refactor some of the code below to do:
            # self._inner_table_definition.addInnerJoin(TableAlias(...),
            #                                           condition=(...))
            self._inner_table_definition.replace(
                Join(
                    self._inner_table_definition.table_definition,
                    table_alias_def,
                    # XXX ColumnMap shouldn't have SQL knowledge
                    condition=('`%s`.`uid` = `%s`.`uid`' %
                               (table_alias, catalog_table_alias)),
                ))
class ColumnMap(object):
    """
    Registry of the columns and tables used by a catalog query.

    Columns are registered per group, mapped to tables (explicitly, by
    vote scripts or by default heuristics in "build"), tables receive
    aliases, and the joins with the catalog table are accumulated in
    self.table_definition.
    """

    implements(IColumnMap)

    def __init__(self, catalog_table_name=None, table_override_map=None, left_join_list=None, implicit_join=False):
        """
        catalog_table_name (string or None)
          Name of the main catalog table.
        table_override_map (dict or None)
          Maps table names to replacement table definitions, see
          _getTableOverride.
        left_join_list (list or None)
          Columns for which a left join is requested. None is handled
          like an empty list by the methods reading it.
        implicit_join (bool)
          Force implicit joins. Incompatible with a non-empty
          left_join_list.
        """
        self.catalog_table_name = catalog_table_name
        # Key: group
        # Value: set of column names
        self.registry = {}
        # Key: group
        # Value: dict
        #  Key: column name
        #  Value: set of SimpleQuery
        self.simple_query_dict = {}
        # Key: (group, column name)
        # Value: table name
        self.column_map = {}
        # Key: (group, table name)
        # Value: table alias
        self.table_alias_dict = {}
        # Key: related key name
        # Value: (group, column name)
        self.related_key_dict = {}
        # Key: related_column
        # Value: last used alias order
        self.related_key_order_dict = {}
        # Key: group
        # Value: relate_key
        self.related_group_dict = {}
        # Key: table alias
        # Value: table name
        self.table_map = {}
        # Key: raw column
        # Value: (function, column)
        self.raw_column_dict = {}
        # Entries: column name
        self.column_ignore_set = set()
        # Key: (group, table name)
        # Value: set of columns requiring a join of that table with catalog
        self.join_table_map = {}
        # BBB: Remove join_query_list and its uses when all RelatedKey
        # methods have been converted to properly return each Join
        # condition separately, and all uses of catalog's from_expression
        # have been removed.
        self.join_query_list = []
        self.table_override_map = table_override_map or {}
        self.table_definition = PlaceHolderTableDefinition()
        # We need to keep track of the original definition to do inner joins on it
        self._inner_table_definition = self.table_definition
        self.left_join_list = left_join_list
        self.implicit_join = implicit_join
        assert not (self.implicit_join and self.left_join_list), "Cannot do left_joins while forcing implicit join"

    def registerColumn(self, raw_column, group=DEFAULT_GROUP_ID, simple_query=None):
        """
        Register a column as being used in given group.

        raw_column (string)
          Column as found in the query. May be wrapped in a function call
          ("COUNT(foo)"), quoted with backquotes and prefixed by a table
          name ("table.column"). Must not contain " as ".
        group (string)
          Group the column belongs to.
        simple_query (SimpleQuery or None)
          Query using that column, stored in simple_query_dict.
        """
        assert " as " not in raw_column.lower()
        # Sanitize input: extract column from raw column (might contain COUNT, ...).
        # XXX This is not enough to parse something like:
        # GROUP_CONCAT(DISTINCT foo ORDER BY bar)
        if "(" in raw_column:
            function, column = raw_column.split("(")
            column = column.strip()
            assert column[-1] == ")"
            column = column[:-1].strip()
        else:
            function = None
            column = raw_column
        # Remove 'DISTINCT ' etc. from column.
        column = column.split()[-1]
        # Remove '`' from column.
        column = column.replace("`", "")
        # Extract table name from column, if any.
        if "." in column:
            # Assuming the part before the dot is a real table name, not an alias.
            table, column = column.split(".")
        else:
            table = None

        self.raw_column_dict[raw_column] = (function, column)
        self.registry.setdefault(group, set()).add(column)
        self.simple_query_dict.setdefault(group, {}).setdefault(column, set()).add(simple_query)
        if table is not None:
            # Register table alias and mark column as resolved.
            self.registerTable(table, alias=table, group=group)
            self.resolveColumn(column, table, group=group)
            if group is DEFAULT_GROUP_ID and table != self.catalog_table_name:
                # When a column is registered in default group and is explicitely
                # mapped to a table, we must mark its table as requiring a join with
                # catalog table (unless it's the catalog table, of course).
                self._addJoinTableForColumn(table, table + "." + column, group)

    def ignoreColumn(self, column):
        """
        Exclude given column from mapping ("build") and from rendering
        ("asSQLColumn" returns it unchanged).
        """
        self.column_ignore_set.add(column)

    def registerRelatedKey(self, related_column, column):
        """
        Register a new use of a related key and return the name of the
        group created for it.

        Each call for the same related_column gets a distinct, numbered
        group, and "column" is registered in that group.
        """
        # XXX: should we store the group, or directly the table on which the column is mapped ?
        # The former avoids duplicating data, but requires one more lookup (group + column -> table)
        # The latter makes it harder (?) to split the mapping in multiple queries (if splitting by groups turns out to be a good idea)
        real_related_column = related_column
        order = self.related_key_order_dict.get(real_related_column, 0) + 1
        related_column = "%s_%s" % (related_column, order)
        group = "related_%s" % (related_column,)
        assert group not in self.registry
        assert group not in self.related_group_dict
        self.related_key_order_dict[real_related_column] = order
        self.related_key_dict[real_related_column] = (group, column)
        self.registerColumn(column, group=group)
        self.related_group_dict[group] = related_column
        return group

    def registerCatalog(self):
        """
        Register catalog as being in use in query, and aliased with its own
        name.

        This is used by SearchKey/RelatedKey.py: there is no way to
        reliably detect if catalog table is used in a related key, so the
        catalog table might be absent from final table mapping.
        DO NOT USE IT ANYWHERE ELSE, this will go away...

        This must be changed by designing a new related key API, which must:
        - state *all* tables they use in their definition
        - return Query instances instead of raw SQL code
        This will allow chaining related keys and consequently allow
        simplifying redundant code.
        """
        assert self.catalog_table_name is not None
        self.registerTable(self.catalog_table_name)
        self.resolveTable(self.catalog_table_name, self.catalog_table_name)

    def registerRelatedKeyColumn(self, related_column, position, group):
        """
        Register the sub-group used by the column at given position of an
        already registered related key group, and return its name.
        """
        assert group in self.related_group_dict
        group = self.getRelatedKeyGroup(position, group)
        assert group not in self.related_group_dict
        self.related_group_dict[group] = related_column
        return group

    def getRelatedKeyGroup(self, position, group):
        """
        Return the name of the sub-group for given position in given
        related key group.
        """
        return "%s_column_%s" % (group, position)

    def registerTable(self, table_name, alias=None, group=DEFAULT_GROUP_ID):
        """
        Declare a table as used in given group, optionally with an alias.

        Raises ValueError if the table is already aliased differently in
        that group.
        """
        table_alias_dict = self.table_alias_dict
        table_alias_key = (group, table_name)
        existing_value = table_alias_dict.get(table_alias_key)
        # alias = None, existing = None -> store
        # alias = None, existing ! None -> skip
        # alias ! None, existing = None -> store & resolve
        # alias ! None, existing ! None -> skip if alias = existing, raise otherwise
        if existing_value is None:
            table_alias_dict[table_alias_key] = alias
            if alias is not None:
                self.resolveTable(table_name, alias, group=group)
        elif alias is not None and alias != existing_value:
            raise ValueError(
                "Table %r for group %r is aliased as %r, can't alias it now as %r"
                % (
                    table_name,
                    group,
                    existing_value,
                    alias,
                )
            )

    def _mapColumns(self, column_table_map, table_usage_dict, column_name_set, group, vote_result_dict):
        """
        Map the columns of column_name_set to tables for given group.

        Columns which received votes are mapped to the best-scored table,
        the remaining ones are mapped by weighting candidate tables.
        column_name_set and the sets in table_usage_dict are modified in
        place.

        Raises ValueError if some columns could not be mapped.
        """
        mapping_dict = {}
        catalog_table_name = self.catalog_table_name

        # Map all columns to tables decided by vote.
        for column_name, candidate_dict in vote_result_dict.iteritems():
            # candidate_dict is never empty
            max_score = 0
            for table_name, score in candidate_dict.iteritems():
                if score > max_score:
                    max_score = score
                    best_count = 0
                    best_choice = table_name
                elif score == max_score:
                    best_count += 1
            if best_count:
                LOG("ColumnMap", WARNING, "Mapping vote led to a tie. Mapping to %r" % (best_choice,))
            if MAPPING_TRACE:
                LOG("ColumnMap", INFO, "Mapping by vote %r to %r" % (column_name, best_choice))
            mapping_dict[column_name] = best_choice
            column_name_set.remove(column_name)
            for table_name, column_set in table_usage_dict.iteritems():
                if table_name != best_choice:
                    column_set.discard(column_name)

        # Map all remaning columns.
        def table_weight(a):
            """
            Compute each table weight.
            """
            if (group, a[0]) in self.table_alias_dict:
                result = (2,)
            elif a[0] == catalog_table_name:
                result = (1,)
            else:
                result = (0, len(a[1]))
            return result

        # Sort table name list, first has the most required columns
        weighted_table_list = sorted(table_usage_dict.iteritems(), key=table_weight)
        while weighted_table_list:
            # Highest weight is at the end of the list.
            table_name, column_set = weighted_table_list.pop()
            if not column_set:
                continue
            common_column_set = column_name_set.intersection(column_set)
            if not common_column_set:
                continue
            # Only allow usage of this table if any of those is true:
            # - current table is the catalog (if any catalog was provided)
            # - there are column used on that table which are already mapped
            #   (this does not include columns mapped by this code)
            #   If columns are mapped to this table in current group, then using
            #   it will not require a new join, so it should be allowed.
            #   Note: it would be good to take indexes into account when there
            #   are multiple candidate tables.
            # - any of those columns belongs exclusively to this table
            #   Although the list of tables those columns belong to is known
            #   earlier (in "build"), mapping them here
            #   - avoids code duplication (registerTable, resolveColumn,
            #     _addJoinTableForColumn)
            #   - offers user to vote for an unknown table, overriding this
            #     forced mapping.
            use_allowed = table_name == catalog_table_name or len(common_column_set) < len(column_set)
            if not use_allowed:
                for column_name in column_set:
                    if len(column_table_map.get(column_name, [])) == 1:
                        # There is no alternative, mark as required
                        use_allowed = True
                        break
            if use_allowed:
                for column_name in common_column_set:
                    if MAPPING_TRACE:
                        LOG("ColumnMap", INFO, "Mapping by default %r to %r" % (column_name, table_name))
                    mapping_dict[column_name] = table_name
                    # This column must not be resolved any longer
                    column_name_set.remove(column_name)
                    # Remove this column from sets containing it. This prevents from
                    # giving a high score to a table which columns would already have
                    # been mapped to another table.
                    for ignored, other_column_set in weighted_table_list:
                        other_column_set.discard(column_name)
                weighted_table_list.sort(key=table_weight)
            else:
                # All column which are mappable on that table are to-be-mapped
                # columns. This means that this table was not explicitely used, and
                # as each table contain a different amount of lines, we should not
                # join with any non-explicit table. Hence, we skip this mapping.
                LOG(
                    "ColumnMap",
                    INFO,
                    "Skipping possible map of %r on %r as that table"
                    " is not explicitely used." % (common_column_set, table_name),
                )

        # Detect incomplete mappings
        if column_name_set:
            raise ValueError("Could not map those columns: %r" % (column_name_set,))

        # Do the actual mapping
        for column_name, table_name in mapping_dict.iteritems():
            # Mark this column as resolved
            if MAPPING_TRACE:
                LOG("ColumnMap", INFO, "Mapping column %s to table %s" % (column_name, table_name))
            self.registerTable(table_name, group=group)
            self.resolveColumn(column_name, table_name, group=group)
            if table_name != catalog_table_name:
                self._addJoinTableForColumn(table_name, column_name, group)

    def build(self, sql_catalog):
        """
        Map all registered columns to tables, give an alias to each table
        and compute the joins with the catalog table.

        Does nothing when no catalog table name was provided.

        sql_catalog
          Catalog providing the column map, the table vote scripts and the
          related key definitions.
        """
        join_query_to_build_list = []
        catalog_table_name = self.catalog_table_name
        if catalog_table_name is None:
            return

        column_table_map = sql_catalog.getColumnMap()
        table_vote_method_list = [getattr(sql_catalog, x) for x in sql_catalog.sql_catalog_table_vote_scripts]

        # Generate missing joins from default group (this is required to allow using related keys outside of queries: order_by, sort_on, ...)
        column_set = self.registry.get(DEFAULT_GROUP_ID, [])
        for column_name in column_set:
            if column_name not in column_table_map and column_name not in self.related_key_dict:
                related_key_definition = sql_catalog.getRelatedKeyDefinition(column_name)
                if related_key_definition is not None:
                    join_query = sql_catalog.getSearchKey(column_name, "RelatedKey").buildQuery(
                        sql_catalog=sql_catalog, related_key_definition=related_key_definition
                    )
                    join_query.registerColumnMap(sql_catalog, self)
                    join_query_to_build_list.append(join_query)

        # List all possible tables, with all used column for each
        for group, column_set in self.registry.iteritems():
            # unique needed column name set
            column_name_set = set()
            # table -> column_set, including alternatives
            table_usage_dict = {}

            for column_name in column_set:
                if column_name == "*" or column_name in self.column_ignore_set:
                    continue
                table_name_list = column_table_map.get(column_name, [])
                if len(table_name_list) == 0:
                    if not (group is DEFAULT_GROUP_ID and column_name in self.related_key_dict):
                        LOG("ColumnMap", WARNING, "Not a known column name: %r" % (column_name,))
                    continue
                column_map_key = (group, column_name)
                if column_map_key in self.column_map:
                    # Column is already mapped, so we must count this column as being available only on that table. Its mapping will not change, and it will impact table schema choice.
                    table_name = self.column_map[column_map_key]
                    assert table_name in table_name_list, "%r not in %r" % (table_name, table_name_list)
                    table_name_list = [table_name]
                else:
                    # Mark this column as requiring to be mapped.
                    column_name_set.add(column_name)
                for table_name in table_name_list:
                    table_usage_dict.setdefault(table_name, set()).add(column_name)
            # XXX: mutable datatypes are provided to vote method. if it modifies
            # them, it can introduce mapping bugs. Copying them might be costly,
            # especialy if done before each call, since they also contain mutable
            # types.
            # XXX: the API of vote methods is not stable yet. Parameters should
            # always be passed and expected by name, to make it less painful to
            # change API.
            # XXX: there is no check that the table voted for contains mapped
            # column. It is up to the user not to do stupid things.
            vote_result_dict = {}
            simple_query_dict = self.simple_query_dict[group]
            for table_vote_method in table_vote_method_list:
                vote_dict = table_vote_method(
                    column_name_set=column_name_set,
                    simple_query_dict=simple_query_dict,
                    table_usage_dict=table_usage_dict,
                    group=group,
                )
                if isinstance(vote_dict, dict):
                    for column, table in vote_dict.iteritems():
                        if column in column_name_set:
                            column_vote_dict = vote_result_dict.setdefault(column, {})
                            column_vote_dict[table] = column_vote_dict.get(table, 0) + 1
                        else:
                            LOG(
                                "ColumnMap",
                                WARNING,
                                "Vote script %r voted for a "
                                "non-candidate column: %r, candidates are: %r. Ignored."
                                % (table_vote_method, column, column_name_set),
                            )
                else:
                    LOG(
                        "ColumnMap",
                        WARNING,
                        "Vote script %r returned invalid data: %r. "
                        "Ignored." % (table_vote_method, vote_dict),
                    )
            self._mapColumns(column_table_map, table_usage_dict, column_name_set, group, vote_result_dict)

        table_alias_number_dict = {}

        for (group, table_name), alias in self.table_alias_dict.iteritems():
            if alias is None:
                if group in self.related_group_dict:
                    alias_table_name = "related_%s_%s" % (self.related_group_dict[group], table_name)
                else:
                    alias_table_name = table_name
                table_alias_number = table_alias_number_dict.get(alias_table_name, 0)
                # Pick the first numbered alias which is not in use yet.
                while True:
                    if table_alias_number == 0:
                        alias = alias_table_name
                    else:
                        alias = "%s_%s" % (alias_table_name, table_alias_number)
                    table_alias_number += 1
                    if alias not in self.table_map:
                        break
                table_alias_number_dict[alias_table_name] = table_alias_number
            self.resolveTable(table_name, alias, group=group)

        # now that we have all aliases, calculate missing joins comming from
        # non-RelatedKey relationships (like full_text).
        self.registerCatalog()
        self._calculateMissingJoins()
        # and all left joins that did not come from explicit queries
        # (i.e. joins comming from 'sort_on', 'select_dict', etc.)
        for join_query in join_query_to_build_list:
            # XXX ugly use of inner attribute of join_query. Please Refactor:
            # search_keys don't actually return SQLExpressions, but they add
            # join definitions in the column_map
            join_query.search_key.buildSQLExpression(
                sql_catalog=sql_catalog, column_map=self, only_group_columns=False, group=join_query.group
            )

        if MAPPING_TRACE:
            # Key: group
            # Value: 2-tuple
            #  dict
            #   Key: column
            #   Value: table name
            #  dict
            #   Key: table name
            #   Value: table alias
            summary_dict = {}
            for (group, column), table_name in self.column_map.iteritems():
                column_dict = summary_dict.setdefault(group, ({}, {}))[0]
                assert column not in column_dict, "%r in %r" % (column, column_dict)
                column_dict[column] = table_name
            for (group, table_name), table_alias in self.table_alias_dict.iteritems():
                table_dict = summary_dict.setdefault(group, ({}, {}))[1]
                assert table_name not in table_dict, "%r in %r" % (table_name, table_dict)
                table_dict[table_name] = table_alias
            for group, (column_dict, table_dict) in summary_dict.iteritems():
                LOG("ColumnMap", INFO, "Group %r:" % (group,))
                LOG("ColumnMap", INFO, " Columns:")
                for column, table_name in column_dict.iteritems():
                    LOG("ColumnMap", INFO, " %r from table %r" % (column, table_name))
                LOG("ColumnMap", INFO, " Tables:")
                for table_name, table_alias in table_dict.iteritems():
                    LOG("ColumnMap", INFO, " %r as %r" % (table_name, table_alias))

    def asSQLColumn(self, raw_column, group=DEFAULT_GROUP_ID):
        """
        Return the SQL rendering of given raw column: the backquoted
        "alias.column", wrapped in its function if any.

        The raw column is returned unchanged when there is no catalog
        table, when the column is ignored, or when it contains "." or "*".
        """
        if (
            self.catalog_table_name is None
            or raw_column in self.column_ignore_set
            or "." in raw_column
            or "*" in raw_column
        ):
            result = raw_column
        else:
            score_suffix = "__score__"
            if raw_column.endswith(score_suffix):
                # Look the column up without its suffix, and put the suffix
                # back on the rendered column name.
                raw_column = raw_column[: -len(score_suffix)]
                column_suffix = score_suffix
            else:
                column_suffix = ""
            function, column = self.raw_column_dict.get(raw_column, (None, raw_column))
            if group is DEFAULT_GROUP_ID:
                group, column = self.related_key_dict.get(column, (group, raw_column))
            alias = self.table_alias_dict[(group, self.column_map[(group, column)])]
            result = "`%s`.`%s%s`" % (alias, column, column_suffix)
            if function is not None:
                result = "%s(%s)" % (function, result)
        return result

    def getCatalogTableAlias(self, group=DEFAULT_GROUP_ID):
        """
        Return the alias of the catalog table in given group.

        Raises KeyError if the catalog table is not registered in it.
        """
        return self.table_alias_dict[(group, self.catalog_table_name)]

    def _isBackwardCompatibilityRequired(self):
        """
        Tell whether joins have to be rendered as implicit joins.
        """
        return bool(
            # if they explicitly ask for implicit
            self.implicit_join
            or
            # if they don't pass a catalog alias, we cannot do explicit joins
            not self._setMinimalTableDefinition()
            or
            # If one or more RelatedKey methods weren't converted, we'll get
            # queries for an implicit inner join, so we have to do all joins
            # as implicit.
            self.join_query_list
            or
            # for now, work in BW compat mode if a table_override
            # is passed. It only works for simple subselect
            # definitions anyway, and it's being used primarily
            # for writing left-joins manually.
            self.table_override_map
        )

    def getTableAliasDict(self):
        """
        Return a copy of the alias -> table name mapping when backward
        compatibility mode is required, None otherwise.
        """
        if self._isBackwardCompatibilityRequired():
            # BBB: Using implicit joins or explicit from_expression
            return self.table_map.copy()
        else:
            return None

    def resolveColumn(self, column, table_name, group=DEFAULT_GROUP_ID):
        """
        Map a registered column to a registered table in given group.

        Attempts to remap "uid" to another table are logged and ignored.
        Raises ValueError when trying to remap any other column.
        """
        assert group in self.registry
        assert column in self.registry[group]
        column_map_key = (group, column)
        column_map = self.column_map
        assert (group, table_name) in self.table_alias_dict
        previous_value = column_map.get(column_map_key)
        if previous_value is None:
            column_map[column_map_key] = table_name
        elif previous_value != table_name:
            if column == "uid":
                LOG(
                    "ColumnMap",
                    WARNING,
                    "Attempt to remap uid from %r to %r ignored." % (previous_value, table_name),
                )
            else:
                raise ValueError(
                    "Cannot remap a column to another table. column_map[%r] = %r, new = %r"
                    % (
                        column_map_key,
                        column_map.get(column_map_key),
                        table_name,
                    )
                )

    def resolveTable(self, table_name, alias, group=DEFAULT_GROUP_ID):
        """
        Set the alias of a registered table in given group.

        The table must not already have another alias, and the alias must
        not already be used for another table.
        """
        table_alias_key = (group, table_name)
        assert table_alias_key in self.table_alias_dict
        assert self.table_alias_dict[table_alias_key] in (None, alias)
        self.table_alias_dict[table_alias_key] = alias
        assert self.table_map.get(alias) in (None, table_name)
        self.table_map[alias] = table_name

    def getTableAlias(self, table_name, group=DEFAULT_GROUP_ID):
        """
        Return the alias of given table in given group.

        Raises KeyError if the table is not registered in that group.
        """
        return self.table_alias_dict[(group, table_name)]

    def _addJoinQueryForColumn(self, column, query):
        """
        Store a join query to be rendered as an implicit join.

        Raises RuntimeError if a left join was requested for that column.
        """
        # BBB: This is a backward compatibility method that will be
        # removed in the future, when all related key methods have been adapted
        # to provide all Join conditions separately
        # left_join_list defaults to None, which means no left join.
        if column in (self.left_join_list or ()):
            raise RuntimeError(
                "Left Join requested for column: %r, but rendered "
                "join query is not compatible and would result in an "
                "Implicit Inner Join:\n%s" % (column, query)
            )
        self.join_query_list.append(query)

    def iterJoinQueryList(self):
        """
        Return an iterable over the queries to use as implicit join
        conditions. It is empty unless backward compatibility mode is
        required.
        """
        if self._isBackwardCompatibilityRequired():
            # Return all join queries for implicit join, and all the other
            # queries we were using to build explicit joins, but won't be able to.
            return itertools.chain(self.join_query_list, self.table_definition.getJoinConditionQueryList())
        return []

    def _addJoinTableForColumn(self, table_name, column, group=DEFAULT_GROUP_ID):
        """
        Declare given table as requiring to be joined with catalog table on
        uid.

        table_name (string)
          Table name.
        column (string)
          Column which requires the join.
        group (string)
          Group id of given table.
        """
        catalog_table = self.catalog_table_name
        if catalog_table is not None:
            # Only join tables when there is a catalog table
            # Register unconditionaly catalog table
            self.registerTable(catalog_table)
            if "uid" not in self.registry.get(DEFAULT_GROUP_ID, ()):
                # Register uid column if it is not already
                self.registerColumn("uid")
                self.resolveColumn("uid", catalog_table)
            self.join_table_map.setdefault((group, table_name), set()).add(column)

    def getJoinTableAliasList(self):
        """
        Return the aliases of all tables requiring a join with the catalog
        table.
        """
        return [self.getTableAlias(table_name, group=group) for (group, table_name) in self.join_table_map.keys()]

    def _getTableOverride(self, table_name):
        """
        Return the table definition overriding given table name, with its
        trailing alias removed, or the table name itself when it is not
        overridden.
        """
        # self.table_override_map is a dictionary mapping table names to
        # strings containing aliases of arbitrary table definitions
        # (including subselects). So we split the alias and discard it
        # since we do our own aliasing.
        table_override_w_alias = self.table_override_map.get(table_name)
        if table_override_w_alias is None:
            return table_name
        # XXX move the cleanup of table alias overrides to EntireQuery
        # class or ZSQLCatalog, so we don't need SQL syntax knowledge in
        # ColumnMap.
        #
        # Normalise the AS sql keyword to remove the last
        # aliasing in the string if present. E.g.:
        #
        # '(SELECT sub_catalog.*
        #   FROM catalog AS sub_catalog
        #   WHERE sub_catalog.parent_uid=183) AS catalog'
        #
        # becomes:
        #
        # '(SELECT sub_catalog.*
        #   FROM catalog AS sub_catalog
        #   WHERE sub_catalog.parent_uid=183)'
        table_override, removed = re_sql_as.subn("", table_override_w_alias)
        assert removed < 2, "More than one table aliasing was removed from %r" % table_override_w_alias
        if removed:
            LOG("ColumnMap", WARNING, "Table overrides should not contain aliasing: %r" % table_override)
        return table_override

    def makeTableAliasDefinition(self, table_name, table_alias):
        """Make a table alias, giving a change to ColumnMap to override
        the original table definition with another expression"""
        table_name = self._getTableOverride(table_name)
        assert table_name and table_alias, "table_name (%r) and table_alias (%r) " "must both be defined" % (
            table_name,
            table_alias,
        )
        return TableAlias(table_name, table_alias)

    def _setMinimalTableDefinition(self):
        """
        Set a minimal table definition: the main catalog alias

        We don't do this at __init__ because we have neither the catalog
        table name nor its intended alias at that point.

        Return False if the catalog table has no alias yet, True otherwise.
        """
        inner_def = self._inner_table_definition
        if inner_def.table_definition is None:
            try:
                catalog_table_alias = self.getCatalogTableAlias()
            except KeyError:
                LOG(
                    "ColumnMap",
                    WARNING,
                    "_setMinimalTableDefinition called but the main catalog has not "
                    "yet received an alias!",
                )
                return False
            inner_def.replace(self.makeTableAliasDefinition(self.catalog_table_name, catalog_table_alias))
        return True

    def getTableDefinition(self):
        """
        Return the table definition holding all explicit joins, or None
        when backward compatibility mode is required.
        """
        if self._isBackwardCompatibilityRequired():
            # BBB: One of the RelatedKeys registered an implicit join, do
            # not return a table definition, self.getTableAliasDict() should
            # be used instead
            return None
        self.table_definition.checkTableAliases()
        return self.table_definition

    def addRelatedKeyJoin(self, column, right_side, condition):
        """
        Wraps the current table_definition in the left-side of a new
        join. Use an InnerJoin or a LeftJoin depending on whether the
        column is in the left_join_list or not.
        """
        # XXX: to fix TestERP5Catalog.test_52_QueryAndTableAlias, create
        # here a list of joins and try to merge each new entry into one of
        # the pre-existing entries by comparing their right-sides.
        #
        # XXX 2: This is the place were we could do ordering of inner and left
        # joins so as to get better performance. For instance, a quick win is to
        # add all inner-joins first, and all left-joins later. We could also decide
        # on the order of left-joins based on the order of self.left_join_list or
        # even a catalog property/configuration/script.
        #
        # XXX 3: This is also the place where we could check if explicit
        # table aliases should cause some of these table definitions to be
        # collapsed into others.

        # Call outside of the assert statement: the call has side effects and
        # must still happen when assertions are disabled (python -O).
        has_minimal_table_definition = self._setMinimalTableDefinition()
        assert has_minimal_table_definition
        # left_join_list defaults to None, which means no left join.
        use_left_join = column in (self.left_join_list or ()) or (
            not self.implicit_join and column in self.registry.get(DEFAULT_GROUP_ID, ())
        )
        Join = LeftJoin if use_left_join else InnerJoin
        join_definition = Join(self.table_definition, right_side, condition=condition)
        self.table_definition = join_definition

    # def getFinalTableDefinition(self):
    #   self._calculateMissingJoins()
    #   return self.getTableDefinition()

    def _calculateMissingJoins(self):
        """
        Join with the catalog table, on uid, every table listed in
        join_table_map.
        """
        # left_join_list defaults to None, which means no left join.
        left_join_set = set(self.left_join_list or ())
        self._setMinimalTableDefinition()
        catalog_table_alias = self.getCatalogTableAlias()
        for (group, table_name), column_set in self.join_table_map.items():
            # if any of the columns for this implicit join was requested as a
            # left-join, then all columns will be subject to a left-join.
            # XXX What if one of the columns was an actual query, as opposed to a
            # sort column or select_dict? This would cause results in the main
            # catalog that don't match the query to be present as well. We expect
            # the user which passes a left_join_list to know what he is doing.
            if column_set.intersection(left_join_set):
                Join = LeftJoin
            else:
                Join = InnerJoin
            table_alias = self.getTableAlias(table_name, group=group)
            table_alias_def = self.makeTableAliasDefinition(table_name, table_alias)
            # XXX: perhaps refactor some of the code below to do:
            # self._inner_table_definition.addInnerJoin(TableAlias(...),
            #                                           condition=(...))
            self._inner_table_definition.replace(
                Join(
                    self._inner_table_definition.table_definition,
                    table_alias_def,
                    # XXX ColumnMap shouldn't have SQL knowledge
                    condition=("`%s`.`uid` = `%s`.`uid`" % (table_alias, catalog_table_alias)),
                )
            )