def measures_for_unit(self, unit_type):
    unit_type = self.identifier_for_unit(unit_type)
    if unit_type is None:
        return self.measures
    measures = SequenceMap()
    for measure in self.measures:
        if self._unit_has_measure(unit_type, measure):
            measures.append(measure)
    return measures
def dimensions_for_unit(self, unit_type, include_partitions=True):
    unit_type = self.identifier_for_unit(unit_type)
    if unit_type is None:
        return self.dimensions
    dimensions = SequenceMap()
    for dimension in self.dimensions:
        if (
            self._unit_has_dimension(unit_type, dimension)
            and (include_partitions or not dimension.partition)
        ):
            dimensions.append(dimension)
    return dimensions
def foreign_keys_for_unit(self, unit_type):
    unit_type = self.identifier_for_unit(unit_type)
    if unit_type is None:
        return self.identifiers
    foreign_keys = SequenceMap()
    for foreign_key in self.identifiers:
        if self._unit_has_foreign_key(unit_type, foreign_key):
            if unit_type.name == foreign_key:
                foreign_key = foreign_key.with_mask(unit_type.mask)
            foreign_keys.append(foreign_key)
    return foreign_keys
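A minimal usage sketch of these per-unit lookups. It assumes `provider` is an already-configured `MeasureProvider` exposing a 'user' identifier; the provider construction and feature names are illustrative and not taken from this module.

unit_type = 'user'  # hypothetical unit type registered on the provider

available_measures = provider.measures_for_unit(unit_type)
available_dimensions = provider.dimensions_for_unit(unit_type, include_partitions=False)
available_foreign_keys = provider.foreign_keys_for_unit(unit_type)

# Each call returns a SequenceMap of features scoped to the given unit type;
# a unit type that resolves to None returns the provider's full collections instead.
print(list(available_measures), list(available_dimensions), list(available_foreign_keys))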
def _get_dimensions_from_specs(self, cls, specs):
    dims = SequenceMap()
    if specs is None:
        return dims
    for spec in specs:
        dim = cls.from_spec(spec, provider=self)
        dims[dim] = dim
    return dims
def resolve(self, unit_type, features, role=None, with_attrs=None):
    """
    This method resolves one or more features optionally associated with a
    unit_type and a role. Note that this method is concerned with *functional*
    resolution, so if `role='dimension'` both identifiers and measures will be
    resolved, since they can be used as dimensions.

    Parameters:
        unit_type (str, None): A unit type for which the resolution should be
            done.
        features (str, list<str>): A name or list of names to resolve.
        role (str, None): One of 'measure', 'dimension', 'identifier' or `None`.
        with_attrs (dict, None): Attributes to set on the returned feature(s).
            Note that these are *additive* to any attributes already inherited
            from the feature type (which are otherwise preserved).

    Returns:
        _Dimension, _Measure, _StatisticalUnitIdentifier: The resolved object,
        or a SequenceMap of resolved objects if a list of features was passed.
    """
    return_one = False

    if not isinstance(features, (list, SequenceMap)):
        return_one = True
        features = [features]

    unresolvable = []
    resolved = SequenceMap()
    for feature in features:
        try:
            attrs = with_attrs.copy() if with_attrs else {}
            if isinstance(feature, tuple):
                feature = FeatureSpec(feature[0], **feature[1])
            if isinstance(feature, dict):
                feature = FeatureSpec(**feature)
            if isinstance(feature, FeatureSpec):
                feature, extra_attrs = feature.as_source_with_attrs(unit_type)
                attrs.update(extra_attrs)
            r = self._resolve(unit_type=unit_type, feature=feature, role=role)._with_attrs(**attrs)
            resolved[r] = r
        except ValueError:
            unresolvable.append(feature)

    if len(unresolvable):
        raise ValueError(
            "Could not resolve {}(s) associated with unit_type '{}' for: '{}'".format(
                role or 'feature',
                unit_type.__repr__(),
                "', '".join(str(dim) for dim in unresolvable)
            )
        )

    if return_one:
        return resolved.first
    return resolved
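A hedged sketch of how `resolve` might be called. It assumes `provider` is a configured `MeasureProvider` (the same pattern applies to the registry defined further below, which inherits this method); the unit type and feature names ('user', 'age', 'geography/country') are purely illustrative.

# A single name resolves to a single feature (the return_one path).
age = provider.resolve(unit_type='user', features='age', role='dimension')

# A list resolves to a SequenceMap; any unresolvable names are reported together
# in a single ValueError. '/'-separated names are routed through the named
# foreign key when resolved by the registry (see MetaMeasureProvider._resolve).
resolved = provider.resolve(
    unit_type='user',
    features=['age', 'geography/country'],
    role='dimension',
)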
def wrapped(self, unit_type, measures=None, segment_by=None, where=None,
            joins=None, stats=True, covariates=False, context=None,
            stats_registry=None, **opts):
    unit_type = self.identifier_for_unit(unit_type)

    if isinstance(measures, (str, _ProvidedFeature)):
        measures = [measures]
    measures = (
        SequenceMap() if measures is None
        else self.resolve(unit_type=unit_type, features=measures, role='measure')
    )

    if isinstance(segment_by, (str, _ProvidedFeature)):
        segment_by = [segment_by]
    segment_by = (
        SequenceMap() if segment_by is None
        else self.resolve(unit_type=unit_type, features=segment_by, role='dimension')
    )

    where = Constraint.from_spec(where)
    joins = joins or []
    stats_registry = stats_registry or global_stats_registry
    context = context or {}

    # opts = self.opts.process(**opts)

    return f(
        self, unit_type,
        measures=measures,
        segment_by=segment_by,
        where=where,
        joins=joins,
        stats=stats,
        covariates=covariates,
        context=context,
        stats_registry=stats_registry,
        **opts
    )
def _evaluate(self, unit_type, measures, segment_by, where, joins, stats,
              covariates, context, stats_registry, **opts):
    assert stats_registry is not None
    assert not any(measure.external for measure in measures)
    assert not any(dimension.external for dimension in segment_by)

    rebase_agg = not unit_type.is_unique

    raw_data = self.data.assign(count=1)

    where_dims = SequenceMap([
        self.dimensions[dim]
        for dim in where.dimensions
        if dim not in segment_by
    ])

    df = (
        pd.DataFrame()
        .assign(**{
            dimension.fieldname(role='dimension', unit_type=unit_type if not rebase_agg else None): raw_data.eval(dimension.expr)
            for dimension in itertools.chain(segment_by, where_dims)
        })
        .assign(**{
            measure.fieldname(role='measure', unit_type=unit_type if not rebase_agg else None): raw_data.eval(measure.expr)
            for measure in measures
        })
    )

    return self._finalise_dataframe(
        df,
        unit_type=unit_type,
        measures=measures,
        segment_by=segment_by,
        where=where,
        stats_registry=stats_registry,
        stats=stats,
        rebase_agg=rebase_agg
    )
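`_evaluate` builds its working frame by evaluating each feature's expression against the raw data with pandas' `eval`, rather than selecting columns directly. The underlying pandas pattern, isolated as a self-contained sketch (the column names and expressions below are illustrative, not the fieldnames generated above):

import pandas as pd

raw_data = pd.DataFrame({
    'user_id': [1, 1, 2],
    'value': [10.0, 5.0, 3.0],
}).assign(count=1)  # mirrors the implicit 'count' measure added above

# Each feature carries an expression string; DataFrame.eval turns it into a
# Series, and the results are collected into a fresh frame keyed by field name.
df = pd.DataFrame().assign(**{
    'user': raw_data.eval('user_id'),
    'value': raw_data.eval('value'),
    'count': raw_data.eval('count'),
})
print(df)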
class MetaMeasureProvider(MeasureProvider):
    """
    A `MeasureProvider` subclass that acts as a host for other `MeasureProvider`
    instances, allowing evaluations of measures that span multiple providers.

    Instances of this class generate a graph of relationships between all of
    the identifiers, measures and dimensions provided by all hosted providers.
    Relationships between these features can then be extracted and used in
    various tasks, chief among them the evaluation of measures for a
    statistical unit type segmented by various dimensions. The logic for the
    evaluation is handled by the `mensor.measures.evaluation.EvaluationStrategy`
    class.

    The graph formed by registering `MeasureProvider` instances has the
    following relationships:

    - unit_type -> foreign_key
    - unit_type <- foreign_key [-> reverse_foreign_key]
    - unit_type -> dimension
    - unit_type -> measure
    """

    class GraphCache:
        """
        The internal representation of the relationships between features
        across multiple MeasureProviders.
        """

        def __init__(self, providers=None, identifiers=None, foreign_keys=None,
                     reverse_foreign_keys=None, dimensions=None, measures=None):
            self.providers = providers or {}
            self.identifiers = identifiers or {}
            self.foreign_keys = foreign_keys or {}
            self.reverse_foreign_keys = reverse_foreign_keys or {}
            self.dimensions = dimensions or {}
            self.measures = measures or {}

        def copy(self):
            return MetaMeasureProvider.GraphCache(**{
                key: nested_dict_copy(getattr(self, key))
                for key in [
                    'providers', 'identifiers', 'foreign_keys',
                    'reverse_foreign_keys', 'dimensions', 'measures'
                ]
            })

        def register(self, provider):
            # TODO: Enforce that measures and dimensions share same namespace,
            # and never conflict with stat types
            # TODO: Ensure no contradictory key types (e.g. two identifiers
            # primary on one table and not both primary on a secondary table)
            # Require that each provider have at least one primary key and a
            # measure "count".
            # TODO: Uncomment these checks and retain compatibility with nested
            # MetaMeasureProvider instances.
            # if len(list(identifier for identifier in provider.identifiers if identifier.is_unique)) == 0:
            #     raise RuntimeError("MeasureProvider '{}' does not have at least one unique identifier.".format(provider))
            # if 'count' not in provider.measures:
            #     raise RuntimeError("MeasureProvider '{}' does not provide a 'count' measure.".format(provider))

            for identifier in provider.identifiers:
                self.register_identifier(identifier)

                for unit_type in provider.identifiers:
                    self.register_foreign_key(identifier, unit_type)

                for dimension in provider.dimensions_for_unit(identifier):
                    self.register_dimension(identifier, dimension)

                for measure in provider.measures_for_unit(identifier):
                    self.register_measure(identifier, measure)

        def _handled_resolved_features(f):
            def wrapped(self, unit_type, *args):
                assert len(args) in (0, 1)
                if args:
                    if isinstance(unit_type, _ResolvedFeature):
                        unit_type = unit_type.from_provider(list(unit_type._providers.values())[0])
                    if isinstance(args[0], _ResolvedFeature):
                        for provider in args[0]._providers:
                            f(self, unit_type, args[0].from_provider(provider))
                    else:
                        f(self, unit_type, args[0])
                else:
                    if isinstance(unit_type, _ResolvedFeature):
                        for provider in unit_type._providers:
                            f(self, unit_type.from_provider(provider))
                    else:
                        f(self, unit_type)
            return wrapped

        @_handled_resolved_features
        def register_identifier(self, unit_type):
            if isinstance(unit_type, _ResolvedFeature):
                for provider in unit_type._providers:
                    provided = unit_type.from_provider(provider)
                    self._append(self.identifiers, [provided], provided)
            else:
                self._append(self.identifiers, [unit_type], unit_type)

        @_handled_resolved_features
        def register_foreign_key(self, unit_type, foreign_key):
            if unit_type.is_unique:
                self._append(self.foreign_keys, [unit_type, foreign_key], foreign_key)
            elif foreign_key.is_unique:
                self._append(self.reverse_foreign_keys, [unit_type, foreign_key], foreign_key)

        @_handled_resolved_features
        def register_dimension(self, unit_type, dimension):
            self._append(self.dimensions, [unit_type, dimension], dimension)

        @_handled_resolved_features
        def register_measure(self, unit_type, measure):
            self._append(self.measures, [unit_type, measure], measure)

        @staticmethod
        def _extract(store, keys):
            for key in keys:
                if key not in store:
                    return []
                store = store[key]
            assert isinstance(store, list)
            return store

        @staticmethod
        def _append(store, keys, value):
            for i, key in enumerate(keys):
                if key not in store:
                    if i == len(keys) - 1:
                        store[key] = []
                    else:
                        store[key] = {}
                store = store[key]
            assert isinstance(store, list)
            if store and not (value.shared and all([d.shared for d in store])):
                raise RuntimeError(
                    "Attempted to add duplicate non-shared feature '{}'.".format(value)
                )
            store.append(value)

    # Initialisation methods

    def __init__(self, name=None):
        MeasureProvider.__init__(self, name)
        self._providers = SequenceMap()
        self._stats_registry = StatsRegistry(fallback=global_stats_registry)
        self._cache = MetaMeasureProvider.GraphCache()

    # MeasureProvider registration

    @property
    def providers(self):
        """A SequenceMap of all of the providers hosted by this registry."""
        return self._providers

    def register(self, provider):
        """
        This method atomically registers a provider, and extends the graph to
        include it. Once registered, its features will be immediately available
        to all evaluations.
        """
        if provider.name in self._providers:
            raise ValueError(
                "A MeasureProvider named '{}' has already been registered.".format(provider.name)
            )
        self._providers[provider.name] = provider

        cache = self._cache.copy()
        cache.register(provider)

        # Committing cache
        self._cache = cache

        return self

    def register_from_yaml(self, path_or_yaml):
        if '\n' in path_or_yaml or not os.path.isdir(os.path.expanduser(path_or_yaml)):
            return self.register(MeasureProvider.from_yaml(path_or_yaml))
        else:
            for dirpath, dirnames, filenames in os.walk(os.path.expanduser(path_or_yaml)):
                for filename in filenames:
                    if filename.endswith('.yml'):
                        try:
                            provider = MeasureProvider.from_yaml(os.path.join(dirpath, filename))
                            self.register(provider)
                        except AssertionError:
                            pass

    def unregister(self, provider):
        """
        Remove a nominated provider from this registry.

        Args:
            provider (MeasureProvider, str): The provider to be removed.

        Returns:
            MeasureProvider: The removed provider.
        """
        provider = self._providers.pop(provider)
        self._cache_refresh()
        return provider

    def _cache_refresh(self):
        self._cache = MetaMeasureProvider.GraphCache()
        for provider in self._providers.values():
            self._cache.register(provider)

    # Transform registration

    def register_transform(self, transform, name=None, backend=None):
        return self._stats_registry.transforms.register(transform=transform, name=name, backend=backend)

    def register_agg(self, agg, name=None, backend=None):
        return self._stats_registry.aggregations.register(agg=agg, name=name, backend=backend)

    @property
    def identifiers(self):
        return SequenceMap(
            self.identifier_for_unit(ut)
            for ut in self._cache.identifiers.keys()
        )

    # MeasureEvaluator methods

    def identifier_for_unit(self, unit_type):
        return _ResolvedFeature(
            name=unit_type if isinstance(unit_type, str) else unit_type.name,
            providers=[d.provider for d in self._cache.identifiers[unit_type]],
            kind='identifier'
        )

    def _features_lookup(self, unit_type, kind, attr_filter=None):
        assert kind in ('foreign_key', 'reverse_foreign_key', 'dimension', 'measure')
        unit_type = self.identifier_for_unit(unit_type)
        feature_source = getattr(self._cache, kind + 's')

        features = SequenceMap()
        for avail_unit_type in feature_source:
            if avail_unit_type.matches(unit_type):
                for feature, instances in feature_source[avail_unit_type].items():
                    if feature not in features and (not attr_filter or attr_filter(feature)):
                        mask = None
                        if kind in ('foreign_key', 'reverse_foreign_key') and avail_unit_type == feature.name:
                            mask = unit_type.name
                        features.append(
                            _ResolvedFeature(
                                feature.name,
                                providers=[d.provider for d in instances],
                                unit_type=unit_type,
                                mask=mask,
                                kind=kind
                            )
                        )
        return features

    def foreign_keys_for_unit(self, unit_type):
        return self._features_lookup(unit_type, 'foreign_key')

    def reverse_foreign_keys_for_unit(self, unit_type):
        return self._features_lookup(unit_type, 'reverse_foreign_key')

    def dimensions_for_unit(self, unit_type, include_partitions=True):
        return self._features_lookup(
            unit_type, 'dimension',
            attr_filter=None if include_partitions else lambda feature: not feature.partition
        )

    def partitions_for_unit(self, unit_type):
        return self._features_lookup(
            unit_type, 'dimension',
            attr_filter=lambda feature: feature.partition
        )

    def measures_for_unit(self, unit_type):
        return self._features_lookup(unit_type, 'measure')

    def _resolve(self, unit_type, feature, role=None):
        unit_type = self.identifier_for_unit(unit_type)

        via = ''
        attrs = {}
        eff_unit_type = unit_type

        if isinstance(feature, (_ResolvedFeature, _ProvidedFeature)):
            attrs = feature.attrs
            del attrs['name']
            feature = feature.via_name
            # Re-resolve any resolved feature, since resolved features are
            # currently not deeply resolved

        if isinstance(feature, str):
            s = feature.split('/')
            # assert len(s) == 1, '/'.join([str(unit_type), str(feature)])
            if len(s) > 1 and s[0] == unit_type.name:  # Remove reference to current unit_type
                s = s[1:]
            via_suffix = '/'.join(s[:-1])
            feature = s[-1]
            if via_suffix:
                eff_unit_type = self.identifier_for_unit(s[-2])
                via += ('/' + via_suffix) if via else via_suffix

        attrs['unit_type'] = unit_type

        return MeasureProvider._resolve(self, eff_unit_type, feature, role=role)._with_attrs(**attrs).as_via(via)

    def _find_primary_key_for_unit_type(self, unit_type):
        for identifier in sorted(self._cache.identifiers, key=lambda x: len(x.name), reverse=True):
            if identifier.matches(unit_type) and any(i.is_primary for i in self._cache.identifiers[identifier]):
                return identifier
        raise RuntimeError("No primary key exists for unit_type `{}`.".format(unit_type))

    def _find_optimal_provision(self, unit_type, measures, dimensions, require_primary=True):
        """
        This method takes a set of measures and dimensions for a given
        unit_type, and generates a somewhat optimised sequence of `Provision`
        instances, which indicate the MeasureProvider instance from which
        measures and dimensions should be extracted. This is primarily useful
        for the generation of an `EvaluationStrategy`.

        Args:
            unit_type (str, _StatisticalUnitIdentifier): The statistical unit
                type for which indicated measures and dimensions should be
                extracted.
            measures (list<str, _Measure>): A set of measures to be extracted.
            dimensions (list<str, _Dimension>): A set of dimensions to be
                extracted.
            require_primary (bool): Whether to require the first `Provision` to
                be from a `MeasureProvider` with `unit_type` as a primary
                identifier.

        Returns:
            list<Provision>: A list of `Provision` instances which optimally
            supply the requested measures and dimensions.
        """
        # TODO: Handle relation case, where ...
        # [Provision(provider, measures, dimensions), ...]
        unit_type = self.identifier_for_unit(unit_type)
        measures = {
            self.resolve(unit_type, measure, role='measure'): self.resolve(unit_type, measure, role='measure')
            for measure in measures
        }
        dimensions = {
            self.resolve(unit_type, dimension, role='dimension'): self.resolve(unit_type, dimension, role='dimension')
            for dimension in dimensions
        }

        def get_next_provider(unit_type, measures, dimensions, primary=False):
            provider_count = Counter()
            provider_count.update(
                provider
                for measure in measures.values()
                for provider in measure.providers.values()
            )
            provider_count.update(
                provider
                for dimension in dimensions.values()
                for provider in dimension.providers.values()
            )

            provider = None
            if primary:
                primary_unit_type = self._find_primary_key_for_unit_type(unit_type)
                # Try to extract primary provider from used providers, or locate
                # one in the unit_type registry.
                for p, _ in provider_count.most_common() + [
                    (ut.provider, 0)
                    for ut in self._cache.identifiers[primary_unit_type.name]
                    if ut.is_primary
                ]:
                    if p.identifiers.get(primary_unit_type) and p.identifiers.get(primary_unit_type).is_primary:
                        provider = p
                        break
                if provider is None:
                    raise ValueError("No primary key for {}.".format(unit_type.name))
            else:
                provider = provider_count.most_common(1)[0][0]

            return provider

        provisions = []
        dimension_count = len(measures) + len(dimensions)
        while dimension_count > 0:
            p = get_next_provider(
                unit_type, measures, dimensions,
                primary=True if require_primary and len(provisions) == 0 else False
            )
            join_prefix = unit_type.name
            provisions.append(
                Provision(
                    p, join_prefix,
                    measures=[
                        measures.pop(measure).from_provider(p)
                        for measure in measures.copy()
                        if measure in p.measures_for_unit(unit_type)
                    ],
                    dimensions=[
                        dimensions.pop(dimension).from_provider(p)
                        for dimension in dimensions.copy()
                        if dimension in p.dimensions_for_unit(unit_type)
                        or dimension in p.foreign_keys_for_unit(unit_type)
                        or dimension in p.measures_for_unit(unit_type)
                    ]  # TODO: Use p.resolve?
                )
            )
            if len(measures) + len(dimensions) == dimension_count and not (require_primary is True and len(provisions) == 1):
                raise RuntimeError(
                    "Could not provide provisions for: measures={}, dimensions={}. This is a bug."
                    .format(list(measures), list(dimensions))
                )
            dimension_count = len(measures) + len(dimensions)

        return provisions

    def evaluate(self, unit_type, measures=None, segment_by=None, where=None,
                 joins=None, stats=True, covariates=False, context=None,
                 stats_registry=None, **opts):
        strategy = self.get_strategy(unit_type, measures=measures, segment_by=segment_by, where=where, context=context)
        return strategy.execute(stats=stats, covariates=covariates, context=context, **opts)

    def get_ir(self, unit_type, measures=None, segment_by=None, where=None,
               joins=None, stats=True, covariates=False, context=None,
               stats_registry=None, **opts):
        strategy = self.get_strategy(unit_type, measures=measures, segment_by=segment_by, where=where, context=context)
        return strategy.execute(stats=stats, covariates=covariates, ir_only=True, context=context, **opts)

    def get_strategy(self, unit_type, measures=None, segment_by=None, where=None, context=None):
        # TODO: incorporate context into strategy evaluation
        # TODO: Add support for joins to meta measure provider
        # TODO: Add support for stats_registry
        return EvaluationStrategy.from_spec(self, unit_type, measures=measures, segment_by=segment_by, where=where)
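End to end, the registry defined above is used by registering providers and then evaluating measures against them. A hedged sketch, assuming `users_provider` and `transactions_provider` are pre-built `MeasureProvider` instances that share a 'user' identifier; all unit type and feature names here are illustrative.

registry = MetaMeasureProvider()
registry.register(users_provider)
registry.register(transactions_provider)

# Providers can also be loaded in bulk from a directory of .yml definitions:
# registry.register_from_yaml('~/measure_providers')

# Evaluate the implicit 'count' measure per user, segmented by a dimension
# contributed by either provider; the joins are planned by EvaluationStrategy.
result = registry.evaluate('user', measures=['count'], segment_by=['country'])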
def reverse_foreign_keys_for_unit(self, unit_type):
    return SequenceMap()
class EvaluationStrategy(object):

    class Type(Enum):
        REGULAR = 1
        UNIT_REBASE = 2

    @classmethod
    def from_spec(cls, registry, unit_type, measures=None, segment_by=None, where=None, **opts):

        # Step 0: Resolve applicable measures and dimensions
        unit_type = registry.identifier_for_unit(unit_type)
        measures = [] if measures is None else measures
        segment_by = [] if segment_by is None else segment_by

        if not isinstance(measures, list):
            measures = [measures]
        if not isinstance(segment_by, list):
            segment_by = [segment_by]

        measures = [
            registry.resolve(unit_type, measure, role='measure')
            for measure in measures
        ]
        segment_by = [
            registry.resolve(unit_type, dimension, role='dimension')
            for dimension in segment_by
        ]

        where = Constraint.from_spec(where)
        where_dimensions = [
            registry.resolve(unit_type, dimension, role='dimension').as_implicit
            for dimension in where.scoped_for_unit_type(unit_type).dimensions
            if dimension not in segment_by
        ]

        # Step 1: Collect measures and dimensions into groups based on current unit_type
        # and next unit_type
        current_evaluation = FeatureBundle(unit_type=unit_type, dimensions=[], measures=[])
        next_evaluations = {}

        def collect_dimensions(dimensions, kind='measures', for_constraint=False):
            for dimension in dimensions:
                if not dimension.via:
                    current_evaluation._asdict()[kind].append(dimension)
                elif (
                    # Handle reverse foreign key joins
                    dimension.next_unit_type in registry.reverse_foreign_keys_for_unit(unit_type)
                ):
                    next_unit_type = registry.resolve(unit_type, dimension.next_unit_type, role='reverse_foreign_key')
                    if next_unit_type not in next_evaluations:
                        next_evaluations[next_unit_type] = FeatureBundle(unit_type=unit_type, dimensions=[], measures=[])
                    next_evaluations[next_unit_type]._asdict()[kind].append(dimension.via_next)
                else:
                    next_unit_type = registry.resolve(unit_type, dimension.next_unit_type, role='foreign_key')
                    if next_unit_type not in next_evaluations:
                        next_evaluations[next_unit_type] = FeatureBundle(unit_type=next_unit_type, dimensions=[], measures=[])
                    next_evaluations[next_unit_type]._asdict()[kind].append(dimension.via_next)

        collect_dimensions(measures, kind='measures')
        collect_dimensions(segment_by, kind='dimensions')
        collect_dimensions(where_dimensions, kind='dimensions', for_constraint=True)

        # Add required dimension for joining in next unit_types
        for dimension_bundle in next_evaluations.values():
            fk = registry.resolve(unit_type, dimension_bundle.unit_type, role='foreign_key')
            if fk not in current_evaluation.dimensions:
                current_evaluation.dimensions.append(fk.as_private)

        # Step 2: Create optimal joins for current unit_type
        provisions = registry._find_optimal_provision(
            unit_type=unit_type,
            measures=current_evaluation.measures,
            dimensions=current_evaluation.dimensions
        )

        evaluations = []
        for provision in provisions:
            generic_constraints = where.generic_for_provider(provision.provider)
            generic_constraint_dimensions = [
                provision.provider.resolve(unit_type, dimension).as_private
                for dimension in generic_constraints.dimensions
                if not provision.dimensions or dimension not in provision.dimensions
            ]
            evaluations.append(
                cls(
                    registry=registry,
                    provider=provision.provider,
                    unit_type=unit_type,
                    measures=provision.measures,
                    segment_by=provision.dimensions + generic_constraint_dimensions,
                    where=generic_constraints,
                    join_prefix=provision.join_prefix
                )
            )

        # Step 3: For each next unit_type, recurse problem and join into above query
        for foreign_key, dim_bundle in next_evaluations.items():
            foreign_strategy = cls.from_spec(
                registry=registry,
                unit_type=foreign_key,
                measures=dim_bundle.measures,
                segment_by=dim_bundle.dimensions,
                where=where.via_next(foreign_key.name),
                **opts
            )
            if foreign_key != dim_bundle.unit_type:  # Reverse foreign key join
                foreign_key = dim_bundle.unit_type
                foreign_strategy.unit_type = dim_bundle.unit_type

            added = False
            for sub_strategy in evaluations:
                for dimension in sub_strategy.segment_by:
                    if isinstance(dimension, _StatisticalUnitIdentifier) and dimension.matches(foreign_key):
                        sub_strategy.add_join(foreign_key, foreign_strategy)
                        added = True
                        break
            if not added:
                raise RuntimeError("Could not add foreign strategy: {}".format(foreign_strategy))

        strategy = evaluations[0]
        for sub_strategy in evaluations[1:]:
            strategy.add_join(unit_type, sub_strategy)

        strategy.where = And.from_operands(strategy.where, where.scoped_for_unit_type(unit_type).scoped_applicable)

        # Step 4: Mark any resolved where dependencies as private, unless otherwise
        # requested in `segment_by`
        for dimension in strategy.segment_by:
            if dimension.implicit and dimension in where.scoped_for_unit_type(unit_type).dimensions:
                strategy.segment_by[dimension] = strategy.segment_by[dimension].as_private

        # Step 5: Return EvaluationStrategy, and profit.
        return strategy

    def __init__(self, registry, provider, unit_type, measures, segment_by=None,
                 where=None, join_on_left=None, join_on_right=None,
                 join_prefix=None, joins=None):
        self.registry = registry
        self.provider = provider

        # Statistical unit used for evaluation
        self.unit_type = unit_type

        # Anticipated measures, segmentations and constraints
        self.measures = SequenceMap(measures or [])
        self.segment_by = SequenceMap(segment_by or [])
        self.where = where

        # Join parameters
        self.is_joined = False
        self.join_is_compatible = True
        self.join_on_left = join_on_left
        self.join_on_right = join_on_right or [self.matched_unit_type.name]
        self.joins = joins or []
        self.join_prefix = join_prefix or self.unit_type.name

    def _check_constraints(self, prefix=None, raise_on_unconstrained=True):
        """
        This method checks whether dimensions that require constraints have
        been constrained.
""" unconstrained = [] constrained_dimensions = self.where.dimensions constrained_dimensions.extend(self.join_on_right) for dimension in self.provider.dimensions_for_unit(self.unit_type): if dimension.requires_constraint and dimension not in constrained_dimensions: unconstrained.append('{}/{}'.format(prefix, dimension.name) if prefix else dimension.name) for join in self.joins: unconstrained.extend(join._check_constraints(prefix='{}/{}'.format(prefix, join.unit_type.name) if prefix else join.unit_type.name, raise_on_unconstrained=False)) if raise_on_unconstrained and len(unconstrained) > 0: raise RuntimeError("The following dimensions require and lack constraints: {}.".format(unconstrained)) return unconstrained @property def matched_unit_type(self): return self.provider.identifier_for_unit(self.unit_type) @property def strategy_type(self): if not self.matched_unit_type.is_unique: return self.Type.UNIT_REBASE else: return self.Type.REGULAR @property def joins_all_compatible(self): for join in self.joins: if ( not self.provider.is_compatible_with(join.provider) or not join.joins_all_compatible ): return False return True def __repr__(self): class StrategyEncoder(json.JSONEncoder): def default(self, o): if isinstance(o, EvaluationStrategy): d = OrderedDict([ ('provider', o.provider), ('unit_type', o.unit_type) ]) d['strategy_type'] = o.strategy_type if o.measures: d['measures'] = o.measures if o.segment_by: d['segment_by'] = o.segment_by if o.where: d['where'] = o.where if o.is_joined: d['join_on_left'] = o.join_on_left d['join_on_right'] = o.join_on_right d['join_type'] = o.join_type if o.join_prefix != o.unit_type.name: d['join_prefix'] = o.join_prefix d['join_is_compatible'] = o.join_is_compatible if o.joins: d['joins'] = o.joins d['joins_all_compatible'] = o.joins_all_compatible return d return o.__repr__() return 'EvaluationStrategy(' + json.dumps(self, indent=4, cls=StrategyEncoder, ensure_ascii=False) + ')' def add_join(self, unit_type, strategy): # TODO: Make atomic assert isinstance(strategy, EvaluationStrategy) # Add primary join key if missing and set join self_unit_type = self.provider.identifier_for_unit(unit_type.name).with_mask(unit_type.name) join_unit_type = strategy.provider.identifier_for_unit(unit_type.name) if self_unit_type not in self.segment_by: self.segment_by.prepend(self_unit_type.as_private) if join_unit_type not in strategy.segment_by: strategy.segment_by.prepend(join_unit_type) else: strategy.segment_by[join_unit_type].private = False strategy.join_on_left = [self_unit_type.fieldname(role='dimension')] strategy.join_on_right = [join_unit_type.fieldname(role='dimension')] # Add common partitions to join keys common_partitions = list( set(self.provider.partitions_for_unit(self_unit_type.fieldname(role='dimension'))) .intersection(strategy.provider.partitions_for_unit(join_unit_type.fieldname(role='dimension'))) ) for partition in common_partitions: if partition not in self.segment_by: self.segment_by.append(self.provider.resolve(self.unit_type, partition, role='dimension').as_private) if partition not in strategy.segment_by: strategy.segment_by.append(strategy.provider.resolve(strategy.unit_type, partition, role='dimension')) else: strategy.segment_by[partition].private = False strategy.join_on_left.extend([p.fieldname(role='dimension') for p in common_partitions]) strategy.join_on_right.extend([p.fieldname(role='dimension') for p in common_partitions]) # Add measures and segmentations in parent from join self.measures.extend( ( 
measure.as_external.as_via(strategy.join_prefix) if strategy.join_prefix != self.unit_type else measure.as_external ) for measure in strategy.measures if not measure.private ) self.segment_by.extend( ( dimension.as_external.as_via(strategy.join_prefix) if strategy.join_prefix != self.unit_type else dimension.as_external ) for dimension in strategy.segment_by if ( not dimension.private and ( dimension not in strategy.join_on_right or dimension.implicit ) ) ) # Set join metadata on incoming strategy strategy.is_joined = True strategy.join_is_compatible = ( self.provider.is_compatible_with(strategy.provider) and strategy.joins_all_compatible ) if strategy.join_prefix == self.join_prefix: strategy.join_prefix = None self.joins.append(strategy) return self @property def join_type(self): if self.strategy_type == self.Type.UNIT_REBASE: return 'left' if len(self.where.dimensions) > 0: return 'inner' for join in self.joins: if join.join_type == 'inner': return 'inner' return 'left' def execute(self, stats=True, ir_only=False, as_join=False, compatible=False, **opts): self._check_constraints() # Step 1: Build joins stats = stats and not self.is_joined joins = [] for join in self.joins: joins.append(join.execute( as_join=True, compatible=self.provider.is_compatible_with(join.provider), **opts )) # Step 2: Evaluate provider if as_join and compatible: try: return Join( provider=self.provider, unit_type=self.unit_type, join_prefix=self.join_prefix, left_on=self.join_on_left, right_on=self.join_on_right, measures=self.measures, dimensions=self.segment_by, object=self.provider.get_ir( unit_type=self.unit_type, measures=self.measures, segment_by=self.segment_by, where=self.where, joins=joins, stats_registry=self.registry._stats_registry, stats=stats, **opts ), how=self.join_type, compatible=True ) except NotImplementedError: pass if ir_only: return self.provider.get_ir( unit_type=self.unit_type, measures=self.measures, segment_by=self.segment_by, where=self.where, joins=joins, stats_registry=self.registry._stats_registry, stats=stats, **opts ) else: evaluated = self.provider.evaluate( unit_type=self.unit_type, measures=self.measures, segment_by=self.segment_by, where=self.where, joins=joins, stats_registry=self.registry._stats_registry, stats=stats, **opts ) if as_join: if self.join_prefix: evaluated = evaluated.add_prefix('{}/'.format(self.join_prefix)) right_on = ['{}/{}'.format(self.join_prefix, j) for j in self.join_on_right] else: right_on = self.join_on_right return Join( provider=self.provider, unit_type=self.unit_type, join_prefix=self.join_prefix, left_on=self.join_on_left, right_on=right_on, measures=self.measures, dimensions=self.segment_by, how=self.join_type, object=evaluated, compatible=False ) return evaluated
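For completeness, a hedged sketch of driving `EvaluationStrategy` directly rather than through `MetaMeasureProvider.evaluate`; the registry and feature names are assumed to be set up as in the earlier sketches.

strategy = EvaluationStrategy.from_spec(
    registry,
    'user',
    measures=['count'],
    segment_by=['country'],
    where=None,
)

# repr() renders the nested plan (providers, joins, join types) as indented JSON.
print(repr(strategy))

# Executing the strategy evaluates each provider and performs the planned joins.
result = strategy.execute(stats=False)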