예제 #1
0
    def __init__(self):
        # Bounds are (min, max) values, the actual value used will be selected from the
        # bounds and each value within the range has an equal probability of being selected.
        self._bounds = {
            'MAX_NESTED_QUERY_COUNT': (0, 2),
            'MAX_NESTED_EXPR_COUNT': (0, 2),
            'SELECT_ITEM_COUNT': (1, 5),
            'WITH_TABLE_COUNT': (1, 3),
            'TABLE_COUNT': (1, 2),
            'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
            'ANALYTIC_WINDOW_OFFSET': (1, 100)
        }

        # Below are interdependent weights used to determine probabilities. The probability
        # of any item being selected should be (item weight) / sum(weights). A weight of
        # zero means the item will never be selected.
        self._weights = {
            'SELECT_ITEM_CATEGORY': {
                'AGG': 3,
                'ANALYTIC': 1,
                'BASIC': 10
            },
            'TYPES': {
                Boolean: 1,
                Char: 1,
                Decimal: 1,
                Float: 1,
                Int: 10,
                Timestamp: 1
            },
            'ANALYTIC_WINDOW': {
                ('ROWS', UNBOUNDED_PRECEDING, None): 1,
                ('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
                ('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
                ('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
                ('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ('ROWS', PRECEDING, None): 1,
                ('ROWS', PRECEDING, PRECEDING): 2,
                ('ROWS', PRECEDING, CURRENT_ROW): 1,
                ('ROWS', PRECEDING, FOLLOWING): 2,
                ('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ('ROWS', CURRENT_ROW, None): 1,
                ('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
                ('ROWS', CURRENT_ROW, FOLLOWING): 2,
                ('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
                ('ROWS', FOLLOWING, FOLLOWING): 2,
                ('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
                # Ranges not yet supported
                ('RANGE', UNBOUNDED_PRECEDING, None): 0,
                ('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
                ('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
                ('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
                ('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ('RANGE', PRECEDING, None): 0,
                ('RANGE', PRECEDING, PRECEDING): 0,
                ('RANGE', PRECEDING, CURRENT_ROW): 0,
                ('RANGE', PRECEDING, FOLLOWING): 0,
                ('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ('RANGE', CURRENT_ROW, None): 0,
                ('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
                ('RANGE', CURRENT_ROW, FOLLOWING): 0,
                ('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
                ('RANGE', FOLLOWING, FOLLOWING): 0,
                ('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0
            },
            'JOIN': {
                'INNER': 90,
                'LEFT': 30,
                'RIGHT': 10,
                'FULL_OUTER': 3,
                'CROSS': 1
            },
            'SUBQUERY_PREDICATE': {
                ('Exists', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('Exists', 'AGG', 'UNCORRELATED'): 1,
                ('Exists', 'NON_AGG', 'CORRELATED'): 1,
                ('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
                ('NotExists', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('NotExists', 'AGG', 'UNCORRELATED'): 0,  # Not supported
                ('NotExists', 'NON_AGG', 'CORRELATED'): 1,
                ('NotExists', 'NON_AGG', 'UNCORRELATED'): 0,  # Not supported
                ('In', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('In', 'AGG', 'UNCORRELATED'): 0,  # Not supported
                ('In', 'NON_AGG', 'CORRELATED'): 1,
                ('In', 'NON_AGG', 'UNCORRELATED'): 1,
                ('NotIn', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('NotIn', 'AGG', 'UNCORRELATED'): 1,
                ('NotIn', 'NON_AGG', 'CORRELATED'): 1,
                ('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
                ('Scalar', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('Scalar', 'AGG', 'UNCORRELATED'): 1,
                ('Scalar', 'NON_AGG', 'CORRELATED'): 0,  # Not supported
                ('Scalar', 'NON_AGG', 'UNCORRELATED'): 1
            },
            'QUERY_EXECUTION': {  # Used by the discrepancy searcher
                'CREATE_TABLE_AS': 1,
                'RAW': 10,
                'VIEW': 1
            }
        }

        # On/off switches
        self._flags = {
            'ANALYTIC_DESIGNS': {
                'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
                'DETERMINISTIC_ORDER_BY': True,
                'NO_ORDER_BY': True,
                'ONLY_SELECT_ITEM': True,
                'UNBOUNDED_WINDOW': True,
                'RANK_FUNC': True
            }
        }

        # Independent probabilities where 1 means 100%. These values may be ignored depending
        # on the context. For example, GROUP_BY is almost always ignored and instead
        # determined by the SELECT item weights above, since mixing aggregate and
        # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
        # only applied if all of the SELECT items are non-aggregate.
        self._probabilities = {
            'OPTIONAL_QUERY_CLAUSES': {
                'WITH': 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                'FROM': 1,
                'WHERE': 0.5,
                'GROUP_BY':
                0.1,  # special case, doesn't really do much, see comment above
                'HAVING': 0.25,
                'UNION': 0.1,
                'ORDER_BY': 0.1
            },
            'OPTIONAL_ANALYTIC_CLAUSES': {
                'PARTITION_BY': 0.5,
                'ORDER_BY': 0.5,
                'WINDOW': 0.5
            },  # will only be used if ORDER BY is chosen
            'MISC': {
                'INLINE_VIEW':
                0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                'SELECT_DISTINCT': 0.1,
                'SCALAR_SUBQUERY': 0.1,
                'UNION_ALL': 0.5
            }
        }  # Determines use of "ALL" but not "UNION"

        self.__type_weights = {}

        self.constant_generator = RandomValGenerator()
예제 #2
0
class DefaultProfile(object):
    def __init__(self):
        # Bounds are (min, max) values, the actual value used will be selected from the
        # bounds and each value within the range has an equal probability of being selected.
        self._bounds = {
            'MAX_NESTED_QUERY_COUNT': (0, 2),
            'MAX_NESTED_EXPR_COUNT': (0, 2),
            'SELECT_ITEM_COUNT': (1, 5),
            'WITH_TABLE_COUNT': (1, 3),
            'TABLE_COUNT': (1, 2),
            'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
            'ANALYTIC_WINDOW_OFFSET': (1, 100)
        }

        # Below are interdependent weights used to determine probabilities. The probability
        # of any item being selected should be (item weight) / sum(weights). A weight of
        # zero means the item will never be selected.
        self._weights = {
            'SELECT_ITEM_CATEGORY': {
                'AGG': 3,
                'ANALYTIC': 1,
                'BASIC': 10
            },
            'TYPES': {
                Boolean: 1,
                Char: 1,
                Decimal: 1,
                Float: 1,
                Int: 10,
                Timestamp: 1
            },
            'ANALYTIC_WINDOW': {
                ('ROWS', UNBOUNDED_PRECEDING, None): 1,
                ('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
                ('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
                ('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
                ('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ('ROWS', PRECEDING, None): 1,
                ('ROWS', PRECEDING, PRECEDING): 2,
                ('ROWS', PRECEDING, CURRENT_ROW): 1,
                ('ROWS', PRECEDING, FOLLOWING): 2,
                ('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ('ROWS', CURRENT_ROW, None): 1,
                ('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
                ('ROWS', CURRENT_ROW, FOLLOWING): 2,
                ('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
                ('ROWS', FOLLOWING, FOLLOWING): 2,
                ('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
                # Ranges not yet supported
                ('RANGE', UNBOUNDED_PRECEDING, None): 0,
                ('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
                ('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
                ('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
                ('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ('RANGE', PRECEDING, None): 0,
                ('RANGE', PRECEDING, PRECEDING): 0,
                ('RANGE', PRECEDING, CURRENT_ROW): 0,
                ('RANGE', PRECEDING, FOLLOWING): 0,
                ('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ('RANGE', CURRENT_ROW, None): 0,
                ('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
                ('RANGE', CURRENT_ROW, FOLLOWING): 0,
                ('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
                ('RANGE', FOLLOWING, FOLLOWING): 0,
                ('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0
            },
            'JOIN': {
                'INNER': 90,
                'LEFT': 30,
                'RIGHT': 10,
                'FULL_OUTER': 3,
                'CROSS': 1
            },
            'SUBQUERY_PREDICATE': {
                ('Exists', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('Exists', 'AGG', 'UNCORRELATED'): 1,
                ('Exists', 'NON_AGG', 'CORRELATED'): 1,
                ('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
                ('NotExists', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('NotExists', 'AGG', 'UNCORRELATED'): 0,  # Not supported
                ('NotExists', 'NON_AGG', 'CORRELATED'): 1,
                ('NotExists', 'NON_AGG', 'UNCORRELATED'): 0,  # Not supported
                ('In', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('In', 'AGG', 'UNCORRELATED'): 0,  # Not supported
                ('In', 'NON_AGG', 'CORRELATED'): 1,
                ('In', 'NON_AGG', 'UNCORRELATED'): 1,
                ('NotIn', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('NotIn', 'AGG', 'UNCORRELATED'): 1,
                ('NotIn', 'NON_AGG', 'CORRELATED'): 1,
                ('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
                ('Scalar', 'AGG', 'CORRELATED'): 0,  # Not supported
                ('Scalar', 'AGG', 'UNCORRELATED'): 1,
                ('Scalar', 'NON_AGG', 'CORRELATED'): 0,  # Not supported
                ('Scalar', 'NON_AGG', 'UNCORRELATED'): 1
            },
            'QUERY_EXECUTION': {  # Used by the discrepancy searcher
                'CREATE_TABLE_AS': 1,
                'RAW': 10,
                'VIEW': 1
            }
        }

        # On/off switches
        self._flags = {
            'ANALYTIC_DESIGNS': {
                'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
                'DETERMINISTIC_ORDER_BY': True,
                'NO_ORDER_BY': True,
                'ONLY_SELECT_ITEM': True,
                'UNBOUNDED_WINDOW': True,
                'RANK_FUNC': True
            }
        }

        # Independent probabilities where 1 means 100%. These values may be ignored depending
        # on the context. For example, GROUP_BY is almost always ignored and instead
        # determined by the SELECT item weights above, since mixing aggregate and
        # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
        # only applied if all of the SELECT items are non-aggregate.
        self._probabilities = {
            'OPTIONAL_QUERY_CLAUSES': {
                'WITH': 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                'FROM': 1,
                'WHERE': 0.5,
                'GROUP_BY':
                0.1,  # special case, doesn't really do much, see comment above
                'HAVING': 0.25,
                'UNION': 0.1,
                'ORDER_BY': 0.1
            },
            'OPTIONAL_ANALYTIC_CLAUSES': {
                'PARTITION_BY': 0.5,
                'ORDER_BY': 0.5,
                'WINDOW': 0.5
            },  # will only be used if ORDER BY is chosen
            'MISC': {
                'INLINE_VIEW':
                0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                'SELECT_DISTINCT': 0.1,
                'SCALAR_SUBQUERY': 0.1,
                'UNION_ALL': 0.5
            }
        }  # Determines use of "ALL" but not "UNION"

        self.__type_weights = {}

        self.constant_generator = RandomValGenerator()

    def _get_config_value(self, start_config, *keys):
        value = start_config
        for key in keys:
            value = value[key]
        return value

    def weights(self, *keys):
        '''Convenience method for getting the values of named weights'''
        return self._get_config_value(self._weights, *keys)

    def bounds(self, *keys):
        '''Convenience method for getting the values of named bounds'''
        return self._get_config_value(self._bounds, *keys)

    def probability(self, *keys):
        '''Convenience method for getting the value of named probabilities'''
        return self._get_config_value(self._probabilities, *keys)

    def _choose_from_bounds(self, *bounds):
        '''Returns a value that is within the given bounds. Each value has an equal chance
       of being chosen.
    '''
        if isinstance(bounds[0], str):
            lower, upper = self.bounds(*bounds)
        else:
            lower, upper = bounds
        return randint(lower, upper)

    def _choose_from_weights(self, *weights):
        '''Returns a value that is selected from the keys of weights with the probability
       determined by the values of weights.
    '''
        if isinstance(weights[0], str):
            weights = self.weights(*weights)
        else:
            weights = weights[0]
        total_weight = sum(weights.itervalues())
        numeric_choice = randint(1, total_weight)
        for choice_, weight in weights.iteritems():
            if weight <= 0:
                continue
            if numeric_choice <= weight:
                return choice_
            numeric_choice -= weight

    def _choose_from_filtered_weights(self, filter, *weights):
        '''Convenience method, apply the given filter before choosing a value.'''
        if isinstance(weights[0], str):
            weights = self.weights(*weights)
        else:
            weights = weights[0]
        return self._choose_from_weights(
            dict((choice_, weight) for choice_, weight in weights.iteritems()
                 if filter(choice_)))

    def _decide_from_probability(self, *keys):
        return random() < self.probability(*keys)

    def get_max_nested_query_count(self):
        '''Return the maximum number of queries the top level query may contain.'''
        return self._choose_from_bounds('MAX_NESTED_QUERY_COUNT')

    def use_with_clause(self):
        return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WITH')

    def get_with_clause_table_ref_count(self):
        '''Return the number of table ref entries a WITH clause should contain.'''
        return self._choose_from_bounds('WITH_TABLE_COUNT')

    def get_select_item_count(self):
        return self._choose_from_bounds('SELECT_ITEM_COUNT')

    def choose_nested_expr_count(self):
        return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')

    def allowed_analytic_designs(self):
        return [
            design for design, is_enabled in
            self._flags['ANALYTIC_DESIGNS'].iteritems() if is_enabled
        ]

    def use_partition_by_clause_in_analytic(self):
        return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES',
                                             'PARTITION_BY')

    def use_order_by_clause_in_analytic(self):
        return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES',
                                             'ORDER_BY')

    def use_window_in_analytic(self):
        return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES',
                                             'WINDOW')

    def choose_window_type(self):
        return self._choose_from_weights('ANALYTIC_WINDOW')

    def get_window_offset(self):
        return self._choose_from_bounds('ANALYTIC_WINDOW_OFFSET')

    def get_offset_for_analytic_lead_or_lag(self):
        return self._choose_from_bounds('ANALYTIC_LEAD_LAG_OFFSET')

    def get_table_count(self):
        return self._choose_from_bounds('TABLE_COUNT')

    def use_inline_view(self):
        return self._decide_from_probability('MISC', 'INLINE_VIEW')

    def choose_table(self, table_exprs):
        return choice(table_exprs)

    def choose_join_type(self, join_types):
        return self._choose_from_filtered_weights(
            lambda join_type: join_type in join_types, 'JOIN')

    def get_join_condition_count(self):
        return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')

    def use_where_clause(self):
        return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WHERE')

    def use_scalar_subquery(self):
        return self._decide_from_probability('MISC', 'SCALAR_SUBQUERY')

    def choose_subquery_predicate_category(self, func_name, allow_correlated):
        weights = self.weights('SUBQUERY_PREDICATE')
        func_names = set(name for name, _, _ in weights.iterkeys())
        if func_name not in func_names:
            func_name = 'Scalar'
        allow_agg = self.weights('SELECT_ITEM_CATEGORY').get('AGG', 0)
        if allow_correlated and self.bounds('TABLE_COUNT')[1] == 0:
            allow_correlated = False
        weights = dict(((name, use_agg, use_correlated), weight)
                       for (name, use_agg, use_correlated), weight in weights.iteritems()
                       if name == func_name \
                           and (allow_agg or use_agg == 'NON_AGG') \
                           and weight)
        if weights:
            return self._choose_from_weights(weights)

    def use_distinct(self):
        return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

    def use_distinct_in_func(self):
        return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

    def use_group_by_clause(self):
        return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES',
                                             'GROUP_BY')

    def use_having_clause(self):
        return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES',
                                             'HAVING')

    def use_union_clause(self):
        return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'UNION')

    def use_union_all(self):
        return self._decide_from_probability('MISC', 'UNION_ALL')

    def get_query_execution(self):
        return self._choose_from_weights('QUERY_EXECUTION')

    def use_having_without_groupby(self):
        return True

    def use_nested_with(self):
        return True

    # Workaround for Hive null ordering differences, and lack of 'NULL FIRST', 'NULL LAST'
    # specifications. The ref db will order nulls as specified for ASC sorting to make it
    # identifical to Hive. Valid return values are: 'BEFORE', 'AFTER', or 'DEFAULT',
    # the latter means no specification needed.
    def nulls_order_asc(self):
        return 'DEFAULT'

    def choose_val_expr(self, val_exprs, types=TYPES):
        if not val_exprs:
            raise Exception('At least on value is required')
        if not types:
            raise Exception('At least one type is required')
        available_types = set(types) & set(val_exprs.by_type)
        if not available_types:
            raise Exception(
                'None of the provided values return any of the required types')
        val_type = self.choose_type(available_types)
        return choice(val_exprs.by_type[val_type])

    def choose_constant(self, return_type=None, allow_null=True):
        if not return_type:
            return_type = self.choose_type()
        while True:
            val = self.constant_generator.generate_val(return_type)
            if val is None and not allow_null:
                continue
            return return_type(val)

    def choose_type(self, types=TYPES):
        type_weights = self.weights('TYPES')
        weights = dict((type_, type_weights[type_]) for type_ in types)
        if not weights:
            raise Exception('None of the requested types are enabled')
        return self._choose_from_weights(weights)

    def choose_func_signature(self, signatures):
        '''Return a signature chosen from "signatures".'''
        if not signatures:
            raise Exception('At least one signature is required')

        type_weights = self.weights('TYPES')
        # First a function will be chosen then a signature. This is done so that the number
        # of signatures a function has doesn't influence its likelihood of being chosen.
        # Functions will be weighted based on the weight of the types in their arguments.
        # The weights will be normalized by the number of arguments in the signature. The
        # weight of a function will be the maximum weight out of all of it's signatures.
        # If any signature has a type with a weight of zero, the signature will not be used.
        #
        # Example: type_weights = {Int: 10, Float: 1},
        #          funcs = [foo(Int), foo(Float), bar(Int, Float)]
        #
        #          max signature length = 2   # from bar(Int, Float)
        #          weight of foo(Int) = (10 * 2)
        #          weight of foo(Float) = (1 * 2)
        #          weight of bar(Int, Float) = ((10 + 1) * 1)
        #          func_weights = {foo: 20, bar: 11}
        #
        # Note that this only selects a function, the function signature will be selected
        # later. This is done to prevent function with a greater number of signatures from
        # being selected more frequently.
        func_weights = dict()
        # The length of the signature in func_weights
        signature_length_by_func = dict()
        for signature in signatures:
            signature_weight = type_weights[signature.return_type]
            signature_length = 1
            for arg in signature.args:
                if arg.is_subquery:
                    for subtype in arg.type:
                        signature_weight *= type_weights[subtype]
                        signature_length += 1
                else:
                    signature_weight *= type_weights[arg.type]
                    signature_length += 1
            if not signature_weight:
                continue
            if not signature.func in func_weights \
                or signature_weight > func_weights[signature.func]:
                func_weights[signature.func] = signature_weight
                signature_length_by_func[signature.func] = signature_length
        if not func_weights:
            raise Exception(
                'All functions disallowed based on signature types')
        distinct_signature_lengths = set(signature_length_by_func.values())
        for func, weight in func_weights.iteritems():
            signature_length = signature_length_by_func[func]
            func_weights[func] = reduce(
                lambda x, y: x * y,
                distinct_signature_lengths - set([signature_length]),
                func_weights[func])
        func = self._choose_from_weights(func_weights)

        # Same idea as above but for the signatures of the selected function.
        signature_weights = dict()
        signature_lengths = dict()
        for idx, signature in enumerate(func.signatures()):
            if signature not in signatures:
                continue
            signature_weight = type_weights[signature.return_type]
            signature_length = 1
            for arg in signature.args:
                if arg.is_subquery:
                    for subtype in arg.type:
                        signature_weight *= type_weights[subtype]
                        signature_length += 1
                else:
                    signature_weight *= type_weights[arg.type]
                    signature_length += 1
            if signature_weight:
                signature_weights[idx] = signature_weight
                signature_lengths[idx] = signature_length
        distinct_signature_lengths = set(signature_lengths.values())
        for idx, weight in signature_weights.iteritems():
            signature_length = signature_lengths[idx]
            signature_weights[idx] = reduce(
                lambda x, y: x * y,
                distinct_signature_lengths - set([signature_length]),
                signature_weights[idx])
        idx = self._choose_from_weights(signature_weights)
        return func.signatures()[idx]

    def allow_func_signature(self, signature):
        weights = self.weights('TYPES')
        if not weights[signature.return_type]:
            return False
        for arg in signature.args:
            if arg.is_subquery:
                if not all(weights[subtype] for subtype in arg.type):
                    return False
            elif not weights[arg.type]:
                return False
        return True
예제 #3
0
    def __init__(self):
        # Bounds are (min, max) values, the actual value used will be selected from the
        # bounds and each value within the range has an equal probability of being selected.
        self._bounds = {
            "MAX_NESTED_QUERY_COUNT": (0, 2),
            "MAX_NESTED_EXPR_COUNT": (0, 2),
            "SELECT_ITEM_COUNT": (1, 5),
            "WITH_TABLE_COUNT": (1, 3),
            "TABLE_COUNT": (1, 2),
            "ANALYTIC_LEAD_LAG_OFFSET": (1, 100),
            "ANALYTIC_WINDOW_OFFSET": (1, 100),
        }

        # Below are interdependent weights used to determine probabilities. The probability
        # of any item being selected should be (item weight) / sum(weights). A weight of
        # zero means the item will never be selected.
        self._weights = {
            "SELECT_ITEM_CATEGORY": {"AGG": 3, "ANALYTIC": 1, "BASIC": 10},
            "TYPES": {Boolean: 1, Char: 1, Decimal: 1, Float: 1, Int: 10, Timestamp: 1},
            "ANALYTIC_WINDOW": {
                ("ROWS", UNBOUNDED_PRECEDING, None): 1,
                ("ROWS", UNBOUNDED_PRECEDING, PRECEDING): 2,
                ("ROWS", UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
                ("ROWS", UNBOUNDED_PRECEDING, FOLLOWING): 2,
                ("ROWS", UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ("ROWS", PRECEDING, None): 1,
                ("ROWS", PRECEDING, PRECEDING): 2,
                ("ROWS", PRECEDING, CURRENT_ROW): 1,
                ("ROWS", PRECEDING, FOLLOWING): 2,
                ("ROWS", PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ("ROWS", CURRENT_ROW, None): 1,
                ("ROWS", CURRENT_ROW, CURRENT_ROW): 1,
                ("ROWS", CURRENT_ROW, FOLLOWING): 2,
                ("ROWS", CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
                ("ROWS", FOLLOWING, FOLLOWING): 2,
                ("ROWS", FOLLOWING, UNBOUNDED_FOLLOWING): 2,
                # Ranges not yet supported
                ("RANGE", UNBOUNDED_PRECEDING, None): 0,
                ("RANGE", UNBOUNDED_PRECEDING, PRECEDING): 0,
                ("RANGE", UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
                ("RANGE", UNBOUNDED_PRECEDING, FOLLOWING): 0,
                ("RANGE", UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ("RANGE", PRECEDING, None): 0,
                ("RANGE", PRECEDING, PRECEDING): 0,
                ("RANGE", PRECEDING, CURRENT_ROW): 0,
                ("RANGE", PRECEDING, FOLLOWING): 0,
                ("RANGE", PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ("RANGE", CURRENT_ROW, None): 0,
                ("RANGE", CURRENT_ROW, CURRENT_ROW): 0,
                ("RANGE", CURRENT_ROW, FOLLOWING): 0,
                ("RANGE", CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
                ("RANGE", FOLLOWING, FOLLOWING): 0,
                ("RANGE", FOLLOWING, UNBOUNDED_FOLLOWING): 0,
            },
            "JOIN": {"INNER": 90, "LEFT": 30, "RIGHT": 10, "FULL_OUTER": 3, "CROSS": 1},
            "SUBQUERY_PREDICATE": {
                ("Exists", "AGG", "CORRELATED"): 0,  # Not supported
                ("Exists", "AGG", "UNCORRELATED"): 1,
                ("Exists", "NON_AGG", "CORRELATED"): 1,
                ("Exists", "NON_AGG", "UNCORRELATED"): 1,
                ("NotExists", "AGG", "CORRELATED"): 0,  # Not supported
                ("NotExists", "AGG", "UNCORRELATED"): 0,  # Not supported
                ("NotExists", "NON_AGG", "CORRELATED"): 1,
                ("NotExists", "NON_AGG", "UNCORRELATED"): 0,  # Not supported
                ("In", "AGG", "CORRELATED"): 0,  # Not supported
                ("In", "AGG", "UNCORRELATED"): 0,  # Not supported
                ("In", "NON_AGG", "CORRELATED"): 1,
                ("In", "NON_AGG", "UNCORRELATED"): 1,
                ("NotIn", "AGG", "CORRELATED"): 0,  # Not supported
                ("NotIn", "AGG", "UNCORRELATED"): 1,
                ("NotIn", "NON_AGG", "CORRELATED"): 1,
                ("NotIn", "NON_AGG", "UNCORRELATED"): 1,
                ("Scalar", "AGG", "CORRELATED"): 0,  # Not supported
                ("Scalar", "AGG", "UNCORRELATED"): 1,
                ("Scalar", "NON_AGG", "CORRELATED"): 0,  # Not supported
                ("Scalar", "NON_AGG", "UNCORRELATED"): 1,
            },
            "QUERY_EXECUTION": {"CREATE_TABLE_AS": 1, "RAW": 10, "VIEW": 1},  # Used by the discrepancy searcher
        }

        # On/off switches
        self._flags = {
            "ANALYTIC_DESIGNS": {
                "TOP_LEVEL_QUERY_WITHOUT_LIMIT": True,
                "DETERMINISTIC_ORDER_BY": True,
                "NO_ORDER_BY": True,
                "ONLY_SELECT_ITEM": True,
                "UNBOUNDED_WINDOW": True,
                "RANK_FUNC": True,
            }
        }

        # Independent probabilities where 1 means 100%. These values may be ignored depending
        # on the context. For example, GROUP_BY is almost always ignored and instead
        # determined by the SELECT item weights above, since mixing aggregate and
        # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
        # only applied if all of the SELECT items are non-aggregate.
        self._probabilities = {
            "OPTIONAL_QUERY_CLAUSES": {
                "WITH": 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                "FROM": 1,
                "WHERE": 0.5,
                "GROUP_BY": 0.1,  # special case, doesn't really do much, see comment above
                "HAVING": 0.25,
                "UNION": 0.1,
                "ORDER_BY": 0.1,
            },
            "OPTIONAL_ANALYTIC_CLAUSES": {
                "PARTITION_BY": 0.5,
                "ORDER_BY": 0.5,
                "WINDOW": 0.5,
            },  # will only be used if ORDER BY is chosen
            "MISC": {
                "INLINE_VIEW": 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                "SELECT_DISTINCT": 0.1,
                "SCALAR_SUBQUERY": 0.1,
                "UNION_ALL": 0.5,
            },
        }  # Determines use of "ALL" but not "UNION"

        self.__type_weights = {}

        self.constant_generator = RandomValGenerator()
예제 #4
0
class DefaultProfile(object):
    def __init__(self):
        # Bounds are (min, max) values, the actual value used will be selected from the
        # bounds and each value within the range has an equal probability of being selected.
        self._bounds = {
            "MAX_NESTED_QUERY_COUNT": (0, 2),
            "MAX_NESTED_EXPR_COUNT": (0, 2),
            "SELECT_ITEM_COUNT": (1, 5),
            "WITH_TABLE_COUNT": (1, 3),
            "TABLE_COUNT": (1, 2),
            "ANALYTIC_LEAD_LAG_OFFSET": (1, 100),
            "ANALYTIC_WINDOW_OFFSET": (1, 100),
        }

        # Below are interdependent weights used to determine probabilities. The probability
        # of any item being selected should be (item weight) / sum(weights). A weight of
        # zero means the item will never be selected.
        self._weights = {
            "SELECT_ITEM_CATEGORY": {"AGG": 3, "ANALYTIC": 1, "BASIC": 10},
            "TYPES": {Boolean: 1, Char: 1, Decimal: 1, Float: 1, Int: 10, Timestamp: 1},
            "ANALYTIC_WINDOW": {
                ("ROWS", UNBOUNDED_PRECEDING, None): 1,
                ("ROWS", UNBOUNDED_PRECEDING, PRECEDING): 2,
                ("ROWS", UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
                ("ROWS", UNBOUNDED_PRECEDING, FOLLOWING): 2,
                ("ROWS", UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ("ROWS", PRECEDING, None): 1,
                ("ROWS", PRECEDING, PRECEDING): 2,
                ("ROWS", PRECEDING, CURRENT_ROW): 1,
                ("ROWS", PRECEDING, FOLLOWING): 2,
                ("ROWS", PRECEDING, UNBOUNDED_FOLLOWING): 2,
                ("ROWS", CURRENT_ROW, None): 1,
                ("ROWS", CURRENT_ROW, CURRENT_ROW): 1,
                ("ROWS", CURRENT_ROW, FOLLOWING): 2,
                ("ROWS", CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
                ("ROWS", FOLLOWING, FOLLOWING): 2,
                ("ROWS", FOLLOWING, UNBOUNDED_FOLLOWING): 2,
                # Ranges not yet supported
                ("RANGE", UNBOUNDED_PRECEDING, None): 0,
                ("RANGE", UNBOUNDED_PRECEDING, PRECEDING): 0,
                ("RANGE", UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
                ("RANGE", UNBOUNDED_PRECEDING, FOLLOWING): 0,
                ("RANGE", UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ("RANGE", PRECEDING, None): 0,
                ("RANGE", PRECEDING, PRECEDING): 0,
                ("RANGE", PRECEDING, CURRENT_ROW): 0,
                ("RANGE", PRECEDING, FOLLOWING): 0,
                ("RANGE", PRECEDING, UNBOUNDED_FOLLOWING): 0,
                ("RANGE", CURRENT_ROW, None): 0,
                ("RANGE", CURRENT_ROW, CURRENT_ROW): 0,
                ("RANGE", CURRENT_ROW, FOLLOWING): 0,
                ("RANGE", CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
                ("RANGE", FOLLOWING, FOLLOWING): 0,
                ("RANGE", FOLLOWING, UNBOUNDED_FOLLOWING): 0,
            },
            "JOIN": {"INNER": 90, "LEFT": 30, "RIGHT": 10, "FULL_OUTER": 3, "CROSS": 1},
            "SUBQUERY_PREDICATE": {
                ("Exists", "AGG", "CORRELATED"): 0,  # Not supported
                ("Exists", "AGG", "UNCORRELATED"): 1,
                ("Exists", "NON_AGG", "CORRELATED"): 1,
                ("Exists", "NON_AGG", "UNCORRELATED"): 1,
                ("NotExists", "AGG", "CORRELATED"): 0,  # Not supported
                ("NotExists", "AGG", "UNCORRELATED"): 0,  # Not supported
                ("NotExists", "NON_AGG", "CORRELATED"): 1,
                ("NotExists", "NON_AGG", "UNCORRELATED"): 0,  # Not supported
                ("In", "AGG", "CORRELATED"): 0,  # Not supported
                ("In", "AGG", "UNCORRELATED"): 0,  # Not supported
                ("In", "NON_AGG", "CORRELATED"): 1,
                ("In", "NON_AGG", "UNCORRELATED"): 1,
                ("NotIn", "AGG", "CORRELATED"): 0,  # Not supported
                ("NotIn", "AGG", "UNCORRELATED"): 1,
                ("NotIn", "NON_AGG", "CORRELATED"): 1,
                ("NotIn", "NON_AGG", "UNCORRELATED"): 1,
                ("Scalar", "AGG", "CORRELATED"): 0,  # Not supported
                ("Scalar", "AGG", "UNCORRELATED"): 1,
                ("Scalar", "NON_AGG", "CORRELATED"): 0,  # Not supported
                ("Scalar", "NON_AGG", "UNCORRELATED"): 1,
            },
            "QUERY_EXECUTION": {"CREATE_TABLE_AS": 1, "RAW": 10, "VIEW": 1},  # Used by the discrepancy searcher
        }

        # On/off switches
        self._flags = {
            "ANALYTIC_DESIGNS": {
                "TOP_LEVEL_QUERY_WITHOUT_LIMIT": True,
                "DETERMINISTIC_ORDER_BY": True,
                "NO_ORDER_BY": True,
                "ONLY_SELECT_ITEM": True,
                "UNBOUNDED_WINDOW": True,
                "RANK_FUNC": True,
            }
        }

        # Independent probabilities where 1 means 100%. These values may be ignored depending
        # on the context. For example, GROUP_BY is almost always ignored and instead
        # determined by the SELECT item weights above, since mixing aggregate and
        # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
        # only applied if all of the SELECT items are non-aggregate.
        self._probabilities = {
            "OPTIONAL_QUERY_CLAUSES": {
                "WITH": 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                "FROM": 1,
                "WHERE": 0.5,
                "GROUP_BY": 0.1,  # special case, doesn't really do much, see comment above
                "HAVING": 0.25,
                "UNION": 0.1,
                "ORDER_BY": 0.1,
            },
            "OPTIONAL_ANALYTIC_CLAUSES": {
                "PARTITION_BY": 0.5,
                "ORDER_BY": 0.5,
                "WINDOW": 0.5,
            },  # will only be used if ORDER BY is chosen
            "MISC": {
                "INLINE_VIEW": 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
                "SELECT_DISTINCT": 0.1,
                "SCALAR_SUBQUERY": 0.1,
                "UNION_ALL": 0.5,
            },
        }  # Determines use of "ALL" but not "UNION"

        self.__type_weights = {}

        self.constant_generator = RandomValGenerator()

    def _get_config_value(self, start_config, *keys):
        value = start_config
        for key in keys:
            value = value[key]
        return value

    def weights(self, *keys):
        """Convenience method for getting the values of named weights"""
        return self._get_config_value(self._weights, *keys)

    def bounds(self, *keys):
        """Convenience method for getting the values of named bounds"""
        return self._get_config_value(self._bounds, *keys)

    def probability(self, *keys):
        """Convenience method for getting the value of named probabilities"""
        return self._get_config_value(self._probabilities, *keys)

    def _choose_from_bounds(self, *bounds):
        """Returns a value that is within the given bounds. Each value has an equal chance
       of being chosen.
    """
        if isinstance(bounds[0], str):
            lower, upper = self.bounds(*bounds)
        else:
            lower, upper = bounds
        return randint(lower, upper)

    def _choose_from_weights(self, *weights):
        """Returns a value that is selected from the keys of weights with the probability
       determined by the values of weights.
    """
        if isinstance(weights[0], str):
            weights = self.weights(*weights)
        else:
            weights = weights[0]
        total_weight = sum(weights.itervalues())
        numeric_choice = randint(1, total_weight)
        for choice_, weight in weights.iteritems():
            if weight <= 0:
                continue
            if numeric_choice <= weight:
                return choice_
            numeric_choice -= weight

    def _choose_from_filtered_weights(self, filter, *weights):
        """Convenience method, apply the given filter before choosing a value."""
        if isinstance(weights[0], str):
            weights = self.weights(*weights)
        else:
            weights = weights[0]
        return self._choose_from_weights(
            dict((choice_, weight) for choice_, weight in weights.iteritems() if filter(choice_))
        )

    def _decide_from_probability(self, *keys):
        return random() < self.probability(*keys)

    def get_max_nested_query_count(self):
        """Return the maximum number of queries the top level query may contain."""
        return self._choose_from_bounds("MAX_NESTED_QUERY_COUNT")

    def use_with_clause(self):
        return self._decide_from_probability("OPTIONAL_QUERY_CLAUSES", "WITH")

    def get_with_clause_table_ref_count(self):
        """Return the number of table ref entries a WITH clause should contain."""
        return self._choose_from_bounds("WITH_TABLE_COUNT")

    def get_select_item_count(self):
        return self._choose_from_bounds("SELECT_ITEM_COUNT")

    def choose_nested_expr_count(self):
        return self._choose_from_bounds("MAX_NESTED_EXPR_COUNT")

    def allowed_analytic_designs(self):
        return [design for design, is_enabled in self._flags["ANALYTIC_DESIGNS"].iteritems() if is_enabled]

    def use_partition_by_clause_in_analytic(self):
        return self._decide_from_probability("OPTIONAL_ANALYTIC_CLAUSES", "PARTITION_BY")

    def use_order_by_clause_in_analytic(self):
        return self._decide_from_probability("OPTIONAL_ANALYTIC_CLAUSES", "ORDER_BY")

    def use_window_in_analytic(self):
        return self._decide_from_probability("OPTIONAL_ANALYTIC_CLAUSES", "WINDOW")

    def choose_window_type(self):
        return self._choose_from_weights("ANALYTIC_WINDOW")

    def get_window_offset(self):
        return self._choose_from_bounds("ANALYTIC_WINDOW_OFFSET")

    def get_offset_for_analytic_lead_or_lag(self):
        return self._choose_from_bounds("ANALYTIC_LEAD_LAG_OFFSET")

    def get_table_count(self):
        return self._choose_from_bounds("TABLE_COUNT")

    def use_inline_view(self):
        return self._decide_from_probability("MISC", "INLINE_VIEW")

    def choose_table(self, table_exprs):
        return choice(table_exprs)

    def choose_join_type(self, join_types):
        return self._choose_from_filtered_weights(lambda join_type: join_type in join_types, "JOIN")

    def get_join_condition_count(self):
        return self._choose_from_bounds("MAX_NESTED_EXPR_COUNT")

    def use_where_clause(self):
        return self._decide_from_probability("OPTIONAL_QUERY_CLAUSES", "WHERE")

    def use_scalar_subquery(self):
        return self._decide_from_probability("MISC", "SCALAR_SUBQUERY")

    def choose_subquery_predicate_category(self, func_name, allow_correlated):
        weights = self.weights("SUBQUERY_PREDICATE")
        func_names = set(name for name, _, _ in weights.iterkeys())
        if func_name not in func_names:
            func_name = "Scalar"
        allow_agg = self.weights("SELECT_ITEM_CATEGORY").get("AGG", 0)
        if allow_correlated and self.bounds("TABLE_COUNT")[1] == 0:
            allow_correlated = False
        weights = dict(
            ((name, use_agg, use_correlated), weight)
            for (name, use_agg, use_correlated), weight in weights.iteritems()
            if name == func_name and (allow_agg or use_agg == "NON_AGG") and weight
        )
        if weights:
            return self._choose_from_weights(weights)

    def use_distinct(self):
        return self._decide_from_probability("MISC", "SELECT_DISTINCT")

    def use_distinct_in_func(self):
        return self._decide_from_probability("MISC", "SELECT_DISTINCT")

    def use_group_by_clause(self):
        return self._decide_from_probability("OPTIONAL_QUERY_CLAUSES", "GROUP_BY")

    def use_having_clause(self):
        return self._decide_from_probability("OPTIONAL_QUERY_CLAUSES", "HAVING")

    def use_union_clause(self):
        return self._decide_from_probability("OPTIONAL_QUERY_CLAUSES", "UNION")

    def use_union_all(self):
        return self._decide_from_probability("MISC", "UNION_ALL")

    def get_query_execution(self):
        return self._choose_from_weights("QUERY_EXECUTION")

    def choose_val_expr(self, val_exprs, types=TYPES):
        if not val_exprs:
            raise Exception("At least on value is required")
        if not types:
            raise Exception("At least one type is required")
        available_types = set(types) & set(val_exprs.by_type)
        if not available_types:
            raise Exception("None of the provided values return any of the required types")
        val_type = self.choose_type(available_types)
        return choice(val_exprs.by_type[val_type])

    def choose_constant(self, return_type=None, allow_null=True):
        if not return_type:
            return_type = self.choose_type()
        while True:
            val = self.constant_generator.generate_val(return_type)
            if val is None and not allow_null:
                continue
            return return_type(val)

    def choose_type(self, types=TYPES):
        type_weights = self.weights("TYPES")
        weights = dict((type_, type_weights[type_]) for type_ in types)
        if not weights:
            raise Exception("None of the requested types are enabled")
        return self._choose_from_weights(weights)

    def choose_func_signature(self, signatures):
        """Return a signature chosen from "signatures"."""
        if not signatures:
            raise Exception("At least one signature is required")

        type_weights = self.weights("TYPES")
        # First a function will be chosen then a signature. This is done so that the number
        # of signatures a function has doesn't influence its likelihood of being chosen.
        # Functions will be weighted based on the weight of the types in their arguments.
        # The weights will be normalized by the number of arguments in the signature. The
        # weight of a function will be the maximum weight out of all of it's signatures.
        # If any signature has a type with a weight of zero, the signature will not be used.
        #
        # Example: type_weights = {Int: 10, Float: 1},
        #          funcs = [foo(Int), foo(Float), bar(Int, Float)]
        #
        #          max signature length = 2   # from bar(Int, Float)
        #          weight of foo(Int) = (10 * 2)
        #          weight of foo(Float) = (1 * 2)
        #          weight of bar(Int, Float) = ((10 + 1) * 1)
        #          func_weights = {foo: 20, bar: 11}
        #
        # Note that this only selects a function, the function signature will be selected
        # later. This is done to prevent function with a greater number of signatures from
        # being selected more frequently.
        func_weights = dict()
        # The length of the signature in func_weights
        signature_length_by_func = dict()
        for signature in signatures:
            signature_weight = type_weights[signature.return_type]
            signature_length = 1
            for arg in signature.args:
                if arg.is_subquery:
                    for subtype in arg.type:
                        signature_weight *= type_weights[subtype]
                        signature_length += 1
                else:
                    signature_weight *= type_weights[arg.type]
                    signature_length += 1
            if not signature_weight:
                continue
            if not signature.func in func_weights or signature_weight > func_weights[signature.func]:
                func_weights[signature.func] = signature_weight
                signature_length_by_func[signature.func] = signature_length
        if not func_weights:
            raise Exception("All functions disallowed based on signature types")
        distinct_signature_lengths = set(signature_length_by_func.values())
        for func, weight in func_weights.iteritems():
            signature_length = signature_length_by_func[func]
            func_weights[func] = reduce(
                lambda x, y: x * y, distinct_signature_lengths - set([signature_length]), func_weights[func]
            )
        func = self._choose_from_weights(func_weights)

        # Same idea as above but for the signatures of the selected function.
        signature_weights = dict()
        signature_lengths = dict()
        for idx, signature in enumerate(func.signatures()):
            if signature not in signatures:
                continue
            signature_weight = type_weights[signature.return_type]
            signature_length = 1
            for arg in signature.args:
                if arg.is_subquery:
                    for subtype in arg.type:
                        signature_weight *= type_weights[subtype]
                        signature_length += 1
                else:
                    signature_weight *= type_weights[arg.type]
                    signature_length += 1
            if signature_weight:
                signature_weights[idx] = signature_weight
                signature_lengths[idx] = signature_length
        distinct_signature_lengths = set(signature_lengths.values())
        for idx, weight in signature_weights.iteritems():
            signature_length = signature_lengths[idx]
            signature_weights[idx] = reduce(
                lambda x, y: x * y, distinct_signature_lengths - set([signature_length]), signature_weights[idx]
            )
        idx = self._choose_from_weights(signature_weights)
        return func.signatures()[idx]

    def allow_func_signature(self, signature):
        weights = self.weights("TYPES")
        if not weights[signature.return_type]:
            return False
        for arg in signature.args:
            if arg.is_subquery:
                if not all(weights[subtype] for subtype in arg.type):
                    return False
            elif not weights[arg.type]:
                return False
        return True
예제 #5
0
  def __init__(self):
    # Bounds are (min, max) values, the actual value used will be selected from the
    # bounds and each value within the range has an equal probability of being selected.
    self._bounds = {
        'MAX_NESTED_QUERY_COUNT': (0, 2),
        'MAX_NESTED_EXPR_COUNT': (0, 2),
        'SELECT_ITEM_COUNT': (1, 5),
        'WITH_TABLE_COUNT': (1, 3),
        'TABLE_COUNT': (1, 2),
        'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
        'ANALYTIC_WINDOW_OFFSET': (1, 100)}

    # Below are interdependent weights used to determine probabilities. The probability
    # of any item being selected should be (item weight) / sum(weights). A weight of
    # zero means the item will never be selected.
    self._weights = {
        'SELECT_ITEM_CATEGORY': {
            'AGG': 3,
            'ANALYTIC': 1,
            'BASIC': 10},
        'TYPES': {
            Boolean: 1,
            Char: 1,
            Decimal: 1,
            Float: 1,
            Int: 10,
            Timestamp: 1},
        'ANALYTIC_WINDOW': {
            ('ROWS', UNBOUNDED_PRECEDING, None): 1,
            ('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
            ('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', PRECEDING, None): 1,
            ('ROWS', PRECEDING, PRECEDING): 2,
            ('ROWS', PRECEDING, CURRENT_ROW): 1,
            ('ROWS', PRECEDING, FOLLOWING): 2,
            ('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, None): 1,
            ('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
            ('ROWS', CURRENT_ROW, FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', FOLLOWING, FOLLOWING): 2,
            ('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
            # Ranges not yet supported
            ('RANGE', UNBOUNDED_PRECEDING, None): 0,
            ('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
            ('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', PRECEDING, None): 0,
            ('RANGE', PRECEDING, PRECEDING): 0,
            ('RANGE', PRECEDING, CURRENT_ROW): 0,
            ('RANGE', PRECEDING, FOLLOWING): 0,
            ('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, None): 0,
            ('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
            ('RANGE', CURRENT_ROW, FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', FOLLOWING, FOLLOWING): 0,
            ('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0},
        'JOIN': {
            'INNER': 90,
            'LEFT': 30,
            'RIGHT': 10,
            'FULL_OUTER': 3,
            'CROSS': 1},
        'SUBQUERY_PREDICATE': {
            ('Exists', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('Exists', 'AGG', 'UNCORRELATED'): 1,
            ('Exists', 'NON_AGG', 'CORRELATED'): 1,
            ('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotExists', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('NotExists', 'AGG', 'UNCORRELATED'): 0,   # Not supported
            ('NotExists', 'NON_AGG', 'CORRELATED'): 1,
            ('NotExists', 'NON_AGG', 'UNCORRELATED'): 0,   # Not supported
            ('In', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('In', 'AGG', 'UNCORRELATED'): 0,   # Not supported
            ('In', 'NON_AGG', 'CORRELATED'): 1,
            ('In', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('NotIn', 'AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'CORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('Scalar', 'AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'NON_AGG', 'CORRELATED'): 0,   # Not supported
            ('Scalar', 'NON_AGG', 'UNCORRELATED'): 1},
        'QUERY_EXECUTION': {   # Used by the discrepancy searcher
            'CREATE_TABLE_AS': 1,
            'RAW': 10,
            'VIEW': 1}}

    # On/off switches
    self._flags = {
        'ANALYTIC_DESIGNS': {
            'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
            'DETERMINISTIC_ORDER_BY': True,
            'NO_ORDER_BY': True,
            'ONLY_SELECT_ITEM': True,
            'UNBOUNDED_WINDOW': True,
            'RANK_FUNC': True}}

    # Independent probabilities where 1 means 100%. These values may be ignored depending
    # on the context. For example, GROUP_BY is almost always ignored and instead
    # determined by the SELECT item weights above, since mixing aggregate and
    # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
    # only applied if all of the SELECT items are non-aggregate.
    self._probabilities = {
        'OPTIONAL_QUERY_CLAUSES': {
            'WITH': 0.1,   # MAX_NESTED_QUERY_COUNT bounds take precedence
            'FROM': 1,
            'WHERE': 0.5,
            'GROUP_BY': 0.1,   # special case, doesn't really do much, see comment above
            'HAVING': 0.25,
            'UNION': 0.1,
            'ORDER_BY': 0.1},
        'OPTIONAL_ANALYTIC_CLAUSES': {
            'PARTITION_BY': 0.5,
            'ORDER_BY': 0.5,
            'WINDOW': 0.5},   # will only be used if ORDER BY is chosen
        'MISC': {
            'INLINE_VIEW': 0.1,   # MAX_NESTED_QUERY_COUNT bounds take precedence
            'SELECT_DISTINCT': 0.1,
            'SCALAR_SUBQUERY': 0.1,
            'UNION_ALL': 0.5}}   # Determines use of "ALL" but not "UNION"

    self.__type_weights = {}

    self.constant_generator = RandomValGenerator()
예제 #6
0
class DefaultProfile(object):

  def __init__(self):
    # Bounds are (min, max) values, the actual value used will be selected from the
    # bounds and each value within the range has an equal probability of being selected.
    self._bounds = {
        'MAX_NESTED_QUERY_COUNT': (0, 2),
        'MAX_NESTED_EXPR_COUNT': (0, 2),
        'SELECT_ITEM_COUNT': (1, 5),
        'WITH_TABLE_COUNT': (1, 3),
        'TABLE_COUNT': (1, 2),
        'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
        'ANALYTIC_WINDOW_OFFSET': (1, 100)}

    # Below are interdependent weights used to determine probabilities. The probability
    # of any item being selected should be (item weight) / sum(weights). A weight of
    # zero means the item will never be selected.
    self._weights = {
        'SELECT_ITEM_CATEGORY': {
            'AGG': 3,
            'ANALYTIC': 1,
            'BASIC': 10},
        'TYPES': {
            Boolean: 1,
            Char: 1,
            Decimal: 1,
            Float: 1,
            Int: 10,
            Timestamp: 1},
        'ANALYTIC_WINDOW': {
            ('ROWS', UNBOUNDED_PRECEDING, None): 1,
            ('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
            ('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', PRECEDING, None): 1,
            ('ROWS', PRECEDING, PRECEDING): 2,
            ('ROWS', PRECEDING, CURRENT_ROW): 1,
            ('ROWS', PRECEDING, FOLLOWING): 2,
            ('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, None): 1,
            ('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
            ('ROWS', CURRENT_ROW, FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', FOLLOWING, FOLLOWING): 2,
            ('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
            # Ranges not yet supported
            ('RANGE', UNBOUNDED_PRECEDING, None): 0,
            ('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
            ('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', PRECEDING, None): 0,
            ('RANGE', PRECEDING, PRECEDING): 0,
            ('RANGE', PRECEDING, CURRENT_ROW): 0,
            ('RANGE', PRECEDING, FOLLOWING): 0,
            ('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, None): 0,
            ('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
            ('RANGE', CURRENT_ROW, FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', FOLLOWING, FOLLOWING): 0,
            ('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0},
        'JOIN': {
            'INNER': 90,
            'LEFT': 30,
            'RIGHT': 10,
            'FULL_OUTER': 3,
            'CROSS': 1},
        'SUBQUERY_PREDICATE': {
            ('Exists', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('Exists', 'AGG', 'UNCORRELATED'): 1,
            ('Exists', 'NON_AGG', 'CORRELATED'): 1,
            ('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotExists', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('NotExists', 'AGG', 'UNCORRELATED'): 0,   # Not supported
            ('NotExists', 'NON_AGG', 'CORRELATED'): 1,
            ('NotExists', 'NON_AGG', 'UNCORRELATED'): 0,   # Not supported
            ('In', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('In', 'AGG', 'UNCORRELATED'): 0,   # Not supported
            ('In', 'NON_AGG', 'CORRELATED'): 1,
            ('In', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('NotIn', 'AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'CORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'AGG', 'CORRELATED'): 0,   # Not supported
            ('Scalar', 'AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'NON_AGG', 'CORRELATED'): 0,   # Not supported
            ('Scalar', 'NON_AGG', 'UNCORRELATED'): 1},
        'QUERY_EXECUTION': {   # Used by the discrepancy searcher
            'CREATE_TABLE_AS': 1,
            'RAW': 10,
            'VIEW': 1}}

    # On/off switches
    self._flags = {
        'ANALYTIC_DESIGNS': {
            'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
            'DETERMINISTIC_ORDER_BY': True,
            'NO_ORDER_BY': True,
            'ONLY_SELECT_ITEM': True,
            'UNBOUNDED_WINDOW': True,
            'RANK_FUNC': True}}

    # Independent probabilities where 1 means 100%. These values may be ignored depending
    # on the context. For example, GROUP_BY is almost always ignored and instead
    # determined by the SELECT item weights above, since mixing aggregate and
    # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
    # only applied if all of the SELECT items are non-aggregate.
    self._probabilities = {
        'OPTIONAL_QUERY_CLAUSES': {
            'WITH': 0.1,   # MAX_NESTED_QUERY_COUNT bounds take precedence
            'FROM': 1,
            'WHERE': 0.5,
            'GROUP_BY': 0.1,   # special case, doesn't really do much, see comment above
            'HAVING': 0.25,
            'UNION': 0.1,
            'ORDER_BY': 0.1},
        'OPTIONAL_ANALYTIC_CLAUSES': {
            'PARTITION_BY': 0.5,
            'ORDER_BY': 0.5,
            'WINDOW': 0.5},   # will only be used if ORDER BY is chosen
        'MISC': {
            'INLINE_VIEW': 0.1,   # MAX_NESTED_QUERY_COUNT bounds take precedence
            'SELECT_DISTINCT': 0.1,
            'SCALAR_SUBQUERY': 0.1,
            'UNION_ALL': 0.5}}   # Determines use of "ALL" but not "UNION"

    self.__type_weights = {}

    self.constant_generator = RandomValGenerator()

  def _get_config_value(self, start_config, *keys):
    value = start_config
    for key in keys:
      value = value[key]
    return value

  def weights(self, *keys):
    '''Convenience method for getting the values of named weights'''
    return self._get_config_value(self._weights, *keys)

  def bounds(self, *keys):
    '''Convenience method for getting the values of named bounds'''
    return self._get_config_value(self._bounds, *keys)

  def probability(self, *keys):
    '''Convenience method for getting the value of named probabilities'''
    return self._get_config_value(self._probabilities, *keys)

  def _choose_from_bounds(self, *bounds):
    '''Returns a value that is within the given bounds. Each value has an equal chance
       of being chosen.
    '''
    if isinstance(bounds[0], str):
      lower, upper = self.bounds(*bounds)
    else:
      lower, upper = bounds
    return randint(lower, upper)

  def _choose_from_weights(self, *weights):
    '''Returns a value that is selected from the keys of weights with the probability
       determined by the values of weights.
    '''
    if isinstance(weights[0], str):
      weights = self.weights(*weights)
    else:
      weights = weights[0]
    total_weight = sum(weights.itervalues())
    numeric_choice = randint(1, total_weight)
    for choice_, weight in weights.iteritems():
      if weight <= 0:
        continue
      if numeric_choice <= weight:
        return choice_
      numeric_choice -= weight

  def _choose_from_filtered_weights(self, filter, *weights):
    '''Convenience method, apply the given filter before choosing a value.'''
    if isinstance(weights[0], str):
      weights = self.weights(*weights)
    else:
      weights = weights[0]
    return self._choose_from_weights(dict(
      (choice_, weight) for choice_, weight in weights.iteritems() if filter(choice_)))

  def _decide_from_probability(self, *keys):
    return random() < self.probability(*keys)

  def get_max_nested_query_count(self):
    '''Return the maximum number of queries the top level query may contain.'''
    return self._choose_from_bounds('MAX_NESTED_QUERY_COUNT')

  def use_with_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WITH')

  def get_with_clause_table_ref_count(self):
    '''Return the number of table ref entries a WITH clause should contain.'''
    return self._choose_from_bounds('WITH_TABLE_COUNT')

  def get_select_item_count(self):
    return self._choose_from_bounds('SELECT_ITEM_COUNT')

  def choose_nested_expr_count(self):
    return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')

  def allowed_analytic_designs(self):
    return [design for design, is_enabled in self._flags['ANALYTIC_DESIGNS'].iteritems()
            if is_enabled]

  def use_partition_by_clause_in_analytic(self):
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'PARTITION_BY')

  def use_order_by_clause_in_analytic(self):
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'ORDER_BY')

  def use_window_in_analytic(self):
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'WINDOW')

  def choose_window_type(self):
    return self._choose_from_weights('ANALYTIC_WINDOW')

  def get_window_offset(self):
    return self._choose_from_bounds('ANALYTIC_WINDOW_OFFSET')

  def get_offset_for_analytic_lead_or_lag(self):
    return self._choose_from_bounds('ANALYTIC_LEAD_LAG_OFFSET')

  def get_table_count(self):
    return self._choose_from_bounds('TABLE_COUNT')

  def use_inline_view(self):
    return self._decide_from_probability('MISC', 'INLINE_VIEW')

  def choose_table(self, table_exprs):
    return choice(table_exprs)

  def choose_join_type(self, join_types):
    return self._choose_from_filtered_weights(
        lambda join_type: join_type in join_types, 'JOIN')

  def get_join_condition_count(self):
    return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')

  def use_where_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WHERE')

  def use_scalar_subquery(self):
    return self._decide_from_probability('MISC', 'SCALAR_SUBQUERY')

  def choose_subquery_predicate_category(self, func_name, allow_correlated):
    weights = self.weights('SUBQUERY_PREDICATE')
    func_names = set(name for name, _, _ in weights.iterkeys())
    if func_name not in func_names:
      func_name = 'Scalar'
    allow_agg = self.weights('SELECT_ITEM_CATEGORY').get('AGG', 0)
    if allow_correlated and self.bounds('TABLE_COUNT')[1] == 0:
      allow_correlated = False
    weights = dict(((name, use_agg, use_correlated), weight)
                   for (name, use_agg, use_correlated), weight in weights.iteritems()
                   if name == func_name \
                       and (allow_agg or use_agg == 'NON_AGG') \
                       and weight)
    if weights:
      return self._choose_from_weights(weights)

  def use_distinct(self):
    return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

  def use_distinct_in_func(self):
    return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

  def use_group_by_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'GROUP_BY')

  def use_having_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'HAVING')

  def use_union_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'UNION')

  def use_union_all(self):
    return self._decide_from_probability('MISC', 'UNION_ALL')

  def get_query_execution(self):
    return self._choose_from_weights('QUERY_EXECUTION')

  def use_having_without_groupby(self):
    return True

  def use_nested_with(self):
    return True

  # Workaround for Hive null ordering differences, and lack of 'NULL FIRST', 'NULL LAST'
  # specifications. The ref db will order nulls as specified for ASC sorting to make it
  # identifical to Hive. Valid return values are: 'BEFORE', 'AFTER', or 'DEFAULT',
  # the latter means no specification needed.
  def nulls_order_asc(self):
    return 'DEFAULT'

  def choose_val_expr(self, val_exprs, types=TYPES):
    if not val_exprs:
      raise Exception('At least on value is required')
    if not types:
      raise Exception('At least one type is required')
    available_types = set(types) & set(val_exprs.by_type)
    if not available_types:
      raise Exception('None of the provided values return any of the required types')
    val_type = self.choose_type(available_types)
    return choice(val_exprs.by_type[val_type])

  def choose_constant(self, return_type=None, allow_null=True):
    if not return_type:
      return_type = self.choose_type()
    while True:
      val = self.constant_generator.generate_val(return_type)
      if val is None and not allow_null:
        continue
      return return_type(val)

  def choose_type(self, types=TYPES):
    type_weights = self.weights('TYPES')
    weights = dict((type_, type_weights[type_]) for type_ in types)
    if not weights:
      raise Exception('None of the requested types are enabled')
    return self._choose_from_weights(weights)

  def choose_func_signature(self, signatures):
    '''Return a signature chosen from "signatures".'''
    if not signatures:
      raise Exception('At least one signature is required')

    type_weights = self.weights('TYPES')
    # First a function will be chosen then a signature. This is done so that the number
    # of signatures a function has doesn't influence its likelihood of being chosen.
    # Functions will be weighted based on the weight of the types in their arguments.
    # The weights will be normalized by the number of arguments in the signature. The
    # weight of a function will be the maximum weight out of all of it's signatures.
    # If any signature has a type with a weight of zero, the signature will not be used.
    #
    # Example: type_weights = {Int: 10, Float: 1},
    #          funcs = [foo(Int), foo(Float), bar(Int, Float)]
    #
    #          max signature length = 2   # from bar(Int, Float)
    #          weight of foo(Int) = (10 * 2)
    #          weight of foo(Float) = (1 * 2)
    #          weight of bar(Int, Float) = ((10 + 1) * 1)
    #          func_weights = {foo: 20, bar: 11}
    #
    # Note that this only selects a function, the function signature will be selected
    # later. This is done to prevent function with a greater number of signatures from
    # being selected more frequently.
    func_weights = dict()
    # The length of the signature in func_weights
    signature_length_by_func = dict()
    for signature in signatures:
      signature_weight = type_weights[signature.return_type]
      signature_length = 1
      for arg in signature.args:
        if arg.is_subquery:
          for subtype in arg.type:
            signature_weight *= type_weights[subtype]
            signature_length += 1
        else:
          signature_weight *= type_weights[arg.type]
          signature_length += 1
      if not signature_weight:
        continue
      if not signature.func in func_weights \
          or signature_weight > func_weights[signature.func]:
        func_weights[signature.func] = signature_weight
        signature_length_by_func[signature.func] = signature_length
    if not func_weights:
      raise Exception('All functions disallowed based on signature types')
    distinct_signature_lengths = set(signature_length_by_func.values())
    for func, weight in func_weights.iteritems():
      signature_length = signature_length_by_func[func]
      func_weights[func] = reduce(
          lambda x, y: x * y,
          distinct_signature_lengths - set([signature_length]),
          func_weights[func])
    func = self._choose_from_weights(func_weights)

    # Same idea as above but for the signatures of the selected function.
    signature_weights = dict()
    signature_lengths = dict()
    for idx, signature in enumerate(func.signatures()):
      if signature not in signatures:
        continue
      signature_weight = type_weights[signature.return_type]
      signature_length = 1
      for arg in signature.args:
        if arg.is_subquery:
          for subtype in arg.type:
            signature_weight *= type_weights[subtype]
            signature_length += 1
        else:
          signature_weight *= type_weights[arg.type]
          signature_length += 1
      if signature_weight:
        signature_weights[idx] = signature_weight
        signature_lengths[idx] = signature_length
    distinct_signature_lengths = set(signature_lengths.values())
    for idx, weight in signature_weights.iteritems():
      signature_length = signature_lengths[idx]
      signature_weights[idx] = reduce(
          lambda x, y: x * y,
          distinct_signature_lengths - set([signature_length]),
          signature_weights[idx])
    idx = self._choose_from_weights(signature_weights)
    return func.signatures()[idx]

  def allow_func_signature(self, signature):
    weights = self.weights('TYPES')
    if not weights[signature.return_type]:
      return False
    for arg in signature.args:
      if arg.is_subquery:
        if not all(weights[subtype] for subtype in arg.type):
          return False
      elif not weights[arg.type]:
        return False
    return True