def _cohort_stream_transform(source, stream, start, end,
                             transform, grouping_key, unit):
  start_stream = KronosSource(source, stream, start, end)
  if transform:
    transformed = transform(start_stream)
  else:
    transformed = start_stream
  projected = Project(transformed,
                      [Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
                       Property(grouping_key, alias=grouping_key),
                       Floor([Property(TIMESTAMP_FIELD),
                              Constant(DateUnit.unit_to_kronos_time(unit)),
                              Constant(start)],
                             alias='date')])
  # This leaves us with a single event per (user, unit time) pair.
  aggregated = Aggregate(
    projected,
    GroupBy([Property(grouping_key, alias=grouping_key),
             Property('date', alias='date')]),
    # The first time the user performed the event in that bucket.
    [Min([Property(TIMESTAMP_FIELD)], alias=TIMESTAMP_FIELD)])
  return aggregated
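# A minimal usage sketch of the helper above. The 'kronos' source name,
# the 'signups' stream, the time range, and the DateUnit.DAYS constant
# are hypothetical stand-ins, not values this module defines.
signup_cohorts = _cohort_stream_transform(source='kronos',
                                          stream='signups',
                                          start=0,
                                          end=1000,
                                          transform=None,
                                          grouping_key='user',
                                          unit=DateUnit.DAYS)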
def test_aggregate(self):
  # Insert 200 events with random values for `a`, tracking the expected
  # sum of `a` within each bucket of 50 timestamps.
  sums = defaultdict(int)
  for i in xrange(200):
    a = random.randint(0, 2)
    self.kronos_client.put({self.stream: [{constants.TIMESTAMP_FIELD: i,
                                           'a': a}]})
    sums[50 * (i / 50)] += a

  # Bucket events into 50-unit windows by flooring the timestamp in a
  # Project, then group on the floored timestamp.
  events = self.query(
    Aggregate(
      Project(KronosSource('kronos', self.stream, 0, 1000),
              [Floor([Property(constants.TIMESTAMP_FIELD), Constant(50)],
                     alias=constants.TIMESTAMP_FIELD)],
              merge=True),
      GroupBy(Property(constants.TIMESTAMP_FIELD,
                       alias=constants.TIMESTAMP_FIELD)),
      [Count([], alias='count'),
       Sum([Property('a')], alias='sum'),
       Min([Property('a')], alias='min'),
       Max([Property('a')], alias='max'),
       Avg([Property('a')], alias='avg')]).to_dict())
  self.assertEqual(len(events), 200 / 50)
  for event in events:
    self.assertEqual(event[constants.TIMESTAMP_FIELD] % 50, 0)
    self.assertEqual(event['count'], 50)
    self.assertEqual(event['min'], 0)
    self.assertEqual(event['max'], 2)
    self.assertEqual(event['sum'], sums[event[constants.TIMESTAMP_FIELD]])
    self.assertTrue(event['avg'] * 50 > event['sum'] - 0.1)
    self.assertTrue(event['avg'] * 50 < event['sum'] + 0.1)

  # Flooring inside the GroupBy (rather than in a Project) should
  # produce the same bucketing.
  events = self.query(
    Aggregate(
      KronosSource('kronos', self.stream, 0, 1000),
      GroupBy(Floor([Property(constants.TIMESTAMP_FIELD), Constant(50)],
                    alias=constants.TIMESTAMP_FIELD)),
      [Count([], alias='count')]).to_dict())
  self.assertEqual(len(events), 200 / 50)
def aggregate(query_plan, operands):
  # Build the aggregate functions and grouping fields from the request
  # operands, then wrap the query plan in an Aggregate.
  aggregates = []
  for agg in operands['aggregates']:
    aggregates.append(agg_op(agg['agg_type'],
                             cpf(agg['agg_on']),
                             agg['alias']))
  groups = []
  for group in operands['groups']:
    groups.append(cpf(group['field'], group['alias']))
  group_by = GroupBy(groups)
  return Aggregate(query_plan, group_by, aggregates)
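# A sketch of the operands this function expects. The field names are
# hypothetical, and the cpf dict shape (cpf_type/property_name) is
# inferred from the variant of this function further below.
operands = {'aggregates': [{'agg_type': 'sum',
                            'agg_on': {'cpf_type': 'property',
                                       'property_name': 'a'},
                            'alias': 'total'}],
            'groups': [{'field': {'cpf_type': 'property',
                                  'property_name': 'user'},
                        'alias': 'user'}]}
plan = aggregate(query_plan, operands)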
def test_parsing_and_to_dict(self):
  _property = {'type': 'property', 'name': 'x'}
  constant = {'type': 'constant', 'value': 1}
  function = {'type': 'function', 'name': 'add',
              'arguments': [deepcopy(_property), deepcopy(constant)]}
  self.assertEqual(Value.parse(deepcopy(constant)).to_dict(), constant)
  self.assertEqual(Value.parse(deepcopy(_property)).to_dict(), _property)
  self.assertEqual(Value.parse(deepcopy(function)).to_dict(), function)

  kronos = {'type': 'kronos', 'host': 'localhost', 'stream': 'mystream',
            'start_time': 100, 'end_time': 200}
  self.assertEqual(Stream.parse(deepcopy(kronos)).to_dict(), kronos)

  condition_lt = {'op': 'lt', 'left': deepcopy(_property),
                  'right': deepcopy(constant)}
  condition_eq = {'op': 'eq', 'left': deepcopy(function),
                  'right': deepcopy(_property)}
  condition_or = {'type': 'or', 'conditions': [deepcopy(condition_lt),
                                               deepcopy(condition_eq)]}
  self.assertEqual(Condition.parse(deepcopy(condition_lt)).to_dict(),
                   condition_lt)
  self.assertEqual(Condition.parse(deepcopy(condition_eq)).to_dict(),
                   condition_eq)
  self.assertEqual(Condition.parse(deepcopy(condition_or)).to_dict(),
                   condition_or)

  avg = {'op': 'avg', 'arguments': [deepcopy(_property)], 'alias': 'myavg'}
  count = {'op': 'count', 'alias': 'mycount'}
  self.assertEqual(Aggregator.parse(deepcopy(avg)).to_dict(), avg)
  self.assertEqual(Aggregator.parse(deepcopy(count)).to_dict(), count)

  group_by = deepcopy(function)
  group_by['alias'] = 'mygroup'
  group_by = [group_by]
  self.assertEqual(GroupBy.parse(deepcopy(group_by)).to_dict(), group_by)

  project = {'type': 'project', 'fields': [deepcopy(_property)],
             'stream': deepcopy(kronos)}
  _filter = {'type': 'filter', 'condition': condition_lt,
             'stream': deepcopy(project)}
  aggregate = {'type': 'aggregate', 'group_by': deepcopy(group_by),
               'aggregates': [deepcopy(avg), deepcopy(count)],
               'stream': deepcopy(_filter)}
  join = {'type': 'join', 'left': deepcopy(aggregate),
          'right': deepcopy(project), 'condition': deepcopy(condition_or)}
  self.assertEqual(Transform.parse(deepcopy(project)).to_dict(), project)
  self.assertEqual(Transform.parse(deepcopy(_filter)).to_dict(), _filter)
  self.assertEqual(Transform.parse(deepcopy(aggregate)).to_dict(), aggregate)
  self.assertEqual(Transform.parse(deepcopy(join)).to_dict(), join)
def test_parsing_and_to_dict(self):
  _property = {'type': 'property', 'name': 'x'}
  constant = {'type': 'constant', 'value': 1}
  function = {'type': 'function', 'name': 'add',
              'arguments': [deepcopy(_property), deepcopy(constant)]}
  self.assertEqual(Value.parse(deepcopy(constant)).to_dict(), constant)
  self.assertEqual(Value.parse(deepcopy(_property)).to_dict(), _property)
  self.assertEqual(Value.parse(deepcopy(function)).to_dict(), function)

  kronos = {'type': 'data_access', 'source': 'kronos', 'stream': 'mystream',
            'start_time': 100, 'end_time': 200}
  self.assertEqual(Operator.parse(deepcopy(kronos)).to_dict(), kronos)

  condition_lt = {'op': 'lt', 'left': deepcopy(_property),
                  'right': deepcopy(constant)}
  condition_eq = {'op': 'eq', 'left': deepcopy(function),
                  'right': deepcopy(_property)}
  condition_or = {'type': 'or', 'conditions': [deepcopy(condition_lt),
                                               deepcopy(condition_eq)]}
  self.assertEqual(Condition.parse(deepcopy(condition_lt)).to_dict(),
                   condition_lt)
  self.assertEqual(Condition.parse(deepcopy(condition_eq)).to_dict(),
                   condition_eq)
  self.assertEqual(Condition.parse(deepcopy(condition_or)).to_dict(),
                   condition_or)

  avg = {'op': 'avg', 'arguments': [deepcopy(_property)], 'alias': 'myavg'}
  count = {'op': 'count', 'alias': 'mycount'}
  self.assertEqual(Aggregator.parse(deepcopy(avg)).to_dict(), avg)
  self.assertEqual(Aggregator.parse(deepcopy(count)).to_dict(), count)

  group_by = deepcopy(function)
  group_by['alias'] = 'mygroup'
  group_by = [group_by]
  self.assertEqual(GroupBy.parse(deepcopy(group_by)).to_dict(), group_by)

  project = {'type': 'project', 'fields': [deepcopy(_property)],
             'source': deepcopy(kronos)}
  _filter = {'type': 'filter', 'condition': condition_lt,
             'source': deepcopy(project)}
  aggregate = {'type': 'aggregate', 'group_by': deepcopy(group_by),
               'aggregates': [deepcopy(avg), deepcopy(count)],
               'source': deepcopy(_filter)}
  join = {'type': 'join', 'left': deepcopy(aggregate),
          'right': deepcopy(project), 'condition': deepcopy(condition_or)}
  self.assertEqual(Operator.parse(deepcopy(project)).to_dict(), project)
  self.assertEqual(Operator.parse(deepcopy(_filter)).to_dict(), _filter)
  self.assertEqual(Operator.parse(deepcopy(aggregate)).to_dict(), aggregate)
  self.assertEqual(Operator.parse(deepcopy(join)).to_dict(), join)
def aggregate(query_plan, operands):
  aggregates = []
  for agg in operands['aggregates']:
    cpf_type = agg['agg_on']['cpf_type']
    property_name = agg['agg_on'].get('property_name')
    constant_value = agg['agg_on'].get('constant_value')
    # Aggregates like `count` take no argument, which arrives as a CPF
    # with no property name (or no constant value).
    empty = ((cpf_type == 'property' and not property_name) or
             (cpf_type == 'constant' and not constant_value))
    if empty:
      agg_on_cpf = None
    else:
      agg_on_cpf = cpf(agg['agg_on'])
    aggregates.append(agg_op(agg['agg_type'], agg_on_cpf, agg['alias']))
  groups = []
  for group in operands['groups']:
    groups.append(cpf(group['field'], group['alias']))
  group_by = GroupBy(groups)
  return Aggregate(query_plan, group_by, aggregates)
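# Hypothetical operands exercising the empty-CPF branch above: a
# `count` aggregate carries no property to aggregate over, so
# agg_on_cpf falls through to None. The literal 'count' agg_type is an
# assumption based on the Count aggregator used elsewhere in this code.
operands = {'aggregates': [{'agg_type': 'count',
                            'agg_on': {'cpf_type': 'property',
                                       'property_name': None},
                            'alias': 'mycount'}],
            'groups': [{'field': {'cpf_type': 'property',
                                  'property_name': 'user'},
                        'alias': 'user'}]}
plan = aggregate(query_plan, operands)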
def cohort_queryplan(plan):
  """
  Input:
  {
   'source': 'kronos',  # Name of data source from settings.
   'cohort':
    {'stream': CohortTest.EMAIL_STREAM,  # Kronos stream to define cohort from.
     'transform': lambda x: x,  # Transformations on the kstream.
     'start': date.today(),     # The day of the first cohort.
     'unit': DateUnit.XX,       # Users are in the same cohort if they
                                # are in the same day/week.
     'cohorts': 5,              # How many cohorts (days/weeks/months)
                                # to track.
     'grouping_key': 'user'},   # What key in an event should we tie
                                # to a key in the action stream?
   'action':
    {'stream': CohortTest.FRONTPAGE_STREAM,  # Stream users take actions on.
     'transform': lambda x: x,  # Transformations on the stream.
     'unit': DateUnit.XX,       # Track events in days/weeks/months.
     'repetitions': 14,         # How many days/weeks/months to track.
     'grouping_key': 'user_id'} # What key in an event should we tie
                                # to a key in the cohort stream?
  }

  Output: A metis-compatible query plan to return a cohort analysis.
  """
  cohort = plan['cohort']
  action = plan['action']
  source = plan['source']

  # Calculate the start and end dates, in Kronos time, of the
  # beginning and end of the cohort and action streams that will be
  # relevant.
  cohort_start = datetime_to_kronos_time(_date_to_datetime(cohort['start']))
  cohort_span = timedelta(**{cohort['unit']: cohort['cohorts']})
  cohort_end = cohort['start'] + cohort_span
  action_span = timedelta(**{action['unit']: action['repetitions']})
  action_end = cohort_end + action_span
  cohort_end = datetime_to_kronos_time(_date_to_datetime(cohort_end)) + 1
  action_end = datetime_to_kronos_time(_date_to_datetime(action_end)) + 1

  left = _cohort_stream_transform(source,
                                  cohort['stream'], cohort_start, cohort_end,
                                  cohort.get('transform'),
                                  cohort['grouping_key'], cohort['unit'])
  right = _cohort_stream_transform(source,
                                   action['stream'], cohort_start, action_end,
                                   action.get('transform'),
                                   action['grouping_key'], action['unit'])

  additional_action_time = (DateUnit.unit_to_kronos_time(action['unit']) *
                            action['repetitions'])

  left.alias = 'cohort'
  right.alias = 'action'

  # Keep only (cohort event, action event) pairs where the same user
  # performed the action within `additional_action_time` of joining
  # the cohort.
  joined = Join(left,
                right,
                (Condition(Condition.Op.EQ,
                           Property('cohort.%s' % cohort['grouping_key']),
                           Property('action.%s' % action['grouping_key'])) &
                 Condition(Condition.Op.GTE,
                           Property('action.%s' % TIMESTAMP_FIELD),
                           Property('cohort.%s' % TIMESTAMP_FIELD)) &
                 Condition(Condition.Op.LT,
                           Property('action.%s' % TIMESTAMP_FIELD),
                           Add([Property('cohort.%s' % TIMESTAMP_FIELD),
                                Constant(additional_action_time)]))))

  # First aggregation: one event per (cohort date, user, action step).
  user_aggregated = Aggregate(
    joined,
    GroupBy([Property('cohort.date', alias=TIMESTAMP_FIELD),
             Property('cohort.%s' % cohort['grouping_key'], alias='group'),
             Floor([Subtract([Property('action.%s' % TIMESTAMP_FIELD),
                              Property('cohort.%s' % TIMESTAMP_FIELD)]),
                    Constant(DateUnit.unit_to_kronos_time(action['unit']))],
                   alias='action_step')]),
    [Count([], alias='count')])

  # Second aggregation: count how many users in each cohort performed
  # an action at each step.
  aggregated = Aggregate(
    user_aggregated,
    GroupBy([Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
             Property('action_step', alias='action_step')]),
    [Count([], alias='cohort_actions')])

  # TODO(marcua): Also sum up the cohort sizes, join with the plan.
  return aggregated.to_dict()
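# An example plan mirroring the docstring above: track 5 daily signup
# cohorts over 14 days of frontpage visits. The stream names and the
# DateUnit.DAYS constant are hypothetical placeholders.
plan = cohort_queryplan(
  {'source': 'kronos',
   'cohort': {'stream': 'signups',
              'transform': None,
              'start': date.today(),
              'unit': DateUnit.DAYS,
              'cohorts': 5,
              'grouping_key': 'user'},
   'action': {'stream': 'frontpage_visits',
              'transform': None,
              'unit': DateUnit.DAYS,
              'repetitions': 14,
              'grouping_key': 'user_id'}})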
def parse(self, _dict):
  # Parse the nested aggregator and group-by specs before constructing
  # the Aggregate.
  _dict['aggregates'] = map(Aggregator.parse, _dict['aggregates'])
  _dict['group_by'] = GroupBy.parse(_dict['group_by'])
  return Aggregate(**_dict)
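# A hedged round-trip sketch in the spirit of the parsing tests above,
# assuming Operator.parse dispatches on the 'type' key down to this
# method; the source and group_by values here are illustrative.
aggregate_dict = {
  'type': 'aggregate',
  'source': {'type': 'data_access', 'source': 'kronos',
             'stream': 'mystream', 'start_time': 100, 'end_time': 200},
  'group_by': [{'type': 'property', 'name': 'x', 'alias': 'mygroup'}],
  'aggregates': [{'op': 'count', 'alias': 'mycount'}]}
assert Operator.parse(deepcopy(aggregate_dict)).to_dict() == aggregate_dict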