def _date_trunc(value, timeframe):
    """
    Floor a point in time to the start of the `timeframe` it falls in.

    `value` is either a date string or a Kronos time. A string is parsed
    with `parse` and the floored result is returned as an ISO-8601 string
    (via `isoformat()`); anything else is converted with
    `kronos_time_to_datetime` and the floored result is returned as a
    Kronos time. For example, flooring '2014-08-13 05:00:00' to
    DateTrunc.Unit.MONTH gives '2014-08-01T00:00:00'.

    `timeframe` must be one of the DateTrunc.Unit members handled below;
    any other value raises KeyError. Weeks start on Monday, because
    `weekday()` is 0 on Mondays.

    NOTE(review): DAY and WEEK produce `date` objects while every other
    unit produces a `datetime`. For string input that means the result has
    no time component for those two units; for Kronos-time input a `date`
    is handed to `datetime_to_kronos_time` -- confirm that it accepts one.
    """
    came_in_as_string = isinstance(value, types.StringTypes)
    if came_in_as_string:
        moment = parse(value)
    else:
        moment = kronos_time_to_datetime(value)

    def to_second(dt):
        return dt - timedelta(microseconds=dt.microsecond)

    def to_minute(dt):
        return dt - timedelta(seconds=dt.second,
                              microseconds=dt.microsecond)

    def to_hour(dt):
        return dt - timedelta(minutes=dt.minute,
                              seconds=dt.second,
                              microseconds=dt.microsecond)

    def to_day(dt):
        return dt.date()

    def to_week(dt):
        # Step back to the Monday of the week `dt` falls in.
        return dt.date() - timedelta(days=dt.weekday())

    def to_month(dt):
        return datetime(dt.year, dt.month, 1)

    def to_year(dt):
        return datetime(dt.year, 1, 1)

    floor_by_unit = {
        DateTrunc.Unit.SECOND: to_second,
        DateTrunc.Unit.MINUTE: to_minute,
        DateTrunc.Unit.HOUR: to_hour,
        DateTrunc.Unit.DAY: to_day,
        DateTrunc.Unit.WEEK: to_week,
        DateTrunc.Unit.MONTH: to_month,
        DateTrunc.Unit.YEAR: to_year,
    }
    floored = floor_by_unit[timeframe](moment)

    # Hand the result back in the same representation the caller used.
    if came_in_as_string:
        return floored.isoformat()
    return datetime_to_kronos_time(floored)
def cohort_queryplan(plan):
    """
    Build a metis-compatible query plan for a cohort analysis.

    Input:
    {
     'source': 'kronos',              # Name of data source from settings.
     'cohort':
      {'stream': CohortTest.EMAIL_STREAM,  # Kronos stream that defines
                                           # the cohorts.
       'transform': lambda x: x,      # Optional transformation of that
                                      # stream (read with `.get`).
       'start': date.now(),           # The day of the first cohort.
       'unit': DateUnit.XX,           # Users share a cohort if they fall
                                      # in the same unit (day/week).
       'cohorts': 5,                  # How many cohorts to track.
       'grouping_key': 'user'},       # Event key tied to the action
                                      # stream's grouping key.
     'action':
      {'stream': CohortTest.FRONTPAGE_STREAM,  # Stream users act on.
       'transform': lambda x: x,      # Optional transformation of that
                                      # stream (read with `.get`).
       'unit': DateUnit.XX,           # Granularity of the action steps.
       'repetitions': 14,             # How many units of actions to track.
       'grouping_key': 'user_id'}     # Event key tied to the cohort
                                      # stream's grouping key.
    }

    Each `unit` is used as a keyword-argument name for `timedelta`.
    NOTE(review): `timedelta` has no `months` keyword, so a month unit
    would presumably fail here -- confirm against DateUnit.

    Output:
    The dictionary form (`to_dict()`) of the aggregate plan, which counts
    rows per cohort date and action step under the alias `cohort_actions`.
    """
    cohort_spec = plan['cohort']
    action_spec = plan['action']
    source = plan['source']

    # Work out, in Kronos time, how far the cohort stream and the action
    # stream have to be read. The end points are first computed as
    # date-like values and only then converted.
    first_cohort_day = cohort_spec['start']
    cohort_start = datetime_to_kronos_time(_date_to_datetime(first_cohort_day))
    cohort_window = timedelta(
        **{cohort_spec['unit']: cohort_spec['cohorts']})
    last_cohort_day = first_cohort_day + cohort_window
    action_window = timedelta(
        **{action_spec['unit']: action_spec['repetitions']})
    last_action_day = last_cohort_day + action_window
    # NOTE(review): the `+ 1` presumably makes the final instant part of
    # the range -- confirm against the stream reader's end semantics.
    cohort_end = datetime_to_kronos_time(
        _date_to_datetime(last_cohort_day)) + 1
    action_end = datetime_to_kronos_time(
        _date_to_datetime(last_action_day)) + 1

    cohort_events = _cohort_stream_transform(
        source,
        cohort_spec['stream'],
        cohort_start,
        cohort_end,
        cohort_spec.get('transform'),
        cohort_spec['grouping_key'],
        cohort_spec['unit'])
    # The action stream is read from the same start as the cohort stream.
    action_events = _cohort_stream_transform(
        source,
        action_spec['stream'],
        cohort_start,
        action_end,
        action_spec.get('transform'),
        action_spec['grouping_key'],
        action_spec['unit'])

    # Length, in Kronos time, of the window after a cohort event during
    # which actions are attributed to it.
    attribution_window = (
        DateUnit.unit_to_kronos_time(action_spec['unit']) *
        action_spec['repetitions'])

    cohort_events.alias = 'cohort'
    action_events.alias = 'action'

    def cohort_field(name, **kwargs):
        return Property('cohort.%s' % name, **kwargs)

    def action_field(name, **kwargs):
        return Property('action.%s' % name, **kwargs)

    # An action joins a cohort event when it belongs to the same user, is
    # not earlier than the cohort event, and falls inside the window.
    same_user = Condition(
        Condition.Op.EQ,
        cohort_field(cohort_spec['grouping_key']),
        action_field(action_spec['grouping_key']))
    not_before_cohort = Condition(
        Condition.Op.GTE,
        action_field(TIMESTAMP_FIELD),
        cohort_field(TIMESTAMP_FIELD))
    inside_window = Condition(
        Condition.Op.LT,
        action_field(TIMESTAMP_FIELD),
        Add([cohort_field(TIMESTAMP_FIELD), Constant(attribution_window)]))
    joined = Join(cohort_events,
                  action_events,
                  same_user & not_before_cohort & inside_window)

    # First pass: one group per (cohort date, user, action step).
    cohort_date = Property('cohort.date', alias=TIMESTAMP_FIELD)
    user = cohort_field(cohort_spec['grouping_key'], alias='group')
    action_step = Floor(
        [Subtract([action_field(TIMESTAMP_FIELD),
                   cohort_field(TIMESTAMP_FIELD)]),
         Constant(DateUnit.unit_to_kronos_time(action_spec['unit']))],
        alias='action_step')
    per_user = Aggregate(
        joined,
        GroupBy([cohort_date, user, action_step]),
        [Count([], alias='count')])

    # Second pass: count those groups per (cohort date, action step).
    per_cohort = Aggregate(
        per_user,
        GroupBy([Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
                 Property('action_step', alias='action_step')]),
        [Count([], alias='cohort_actions')])

    # TODO(marcua): Also sum up the cohort sizes, join with the plan.
    return per_cohort.to_dict()
def generate_data(self):
    """
    Fill the e-mail and front-page streams with randomized events.

    Every user gets two e-mail events (see `EMAIL_WEEKS`) and, for each
    e-mail, a random selection of front-page actions on the following
    days. All events are written through `self.kronos_client.put`.

    Returns the expected cohort results, shaped as
    {cohort_date_str: {'cohort_size': number of e-mails in that cohort,
                       'action_dates': {day_offset: number_of_users}}}
    where `day_offset` is the whole number of days between the e-mail and
    the earliest action on the action's calendar day.
    """
    group_count = len(CohortTestCase.EMAIL_WEEKS)
    user_ids = range(CohortTestCase.NUM_USERS)

    emails_by_user = {}
    expected = defaultdict(
        lambda: {'cohort_size': 0, 'action_dates': defaultdict(set)})

    # Email stream: each user's group picks the two weeks in which the
    # user is e-mailed; the exact time is up to 72 hours into that week.
    for user_id in user_ids:
        first_week, second_week = CohortTestCase.EMAIL_WEEKS[
            user_id % group_count]
        emails = []
        for week_offset in (first_week, second_week):
            week_start = (CohortTestCase.START_DATETIME +
                          timedelta(weeks=week_offset))
            sent_at = week_start + timedelta(hours=randint(0, 72))
            emails.append({'cohort': datetime_to_date_str(week_start),
                           'cohort_date': week_start,
                           'precise_date': sent_at})
        emails_by_user[user_id] = tuple(emails)
        for email in emails:
            expected[email['cohort']]['cohort_size'] += 1
        self.kronos_client.put({
            CohortTestCase.EMAIL_STREAM: [
                {'user': user_id,
                 constants.TIMESTAMP_FIELD:
                     datetime_to_kronos_time(email['precise_date'])}
                for email in emails
            ]
        })

    # Action stream: a user in group g (1-based) acts with probability
    # g / group_count, scaled by 1/N on the N-th day after an e-mail.
    attribution_window = timedelta(
        days=CohortTestCase.ACTION_REPETITION_DAYS)
    for user_id in user_ids:
        group_probability = ((user_id % group_count) + 1.0) / group_count
        # Only ever read via `.get` or right after a write, so the default
        # factory is never triggered.
        earliest_by_date = defaultdict(dict)
        for email in emails_by_user[user_id]:
            for day in xrange(CohortTestCase.ACTION_REPETITION_DAYS):
                if random() >= group_probability * (1.0 / (day + 1)):
                    continue
                action_dt = email['precise_date'] + timedelta(days=day)
                action_date = action_dt.date()
                # The cohort plan compares against the earliest action
                # seen so far on this calendar day.
                earliest_by_date[action_date] = min(
                    action_dt,
                    earliest_by_date.get(action_date,
                                         CohortTestCase.MAX_DT))
                self.kronos_client.put({
                    CohortTestCase.FRONTPAGE_STREAM: [
                        {'user_id': user_id,
                         '@time': datetime_to_kronos_time(action_dt)}]
                })
                earliest = earliest_by_date[action_date]
                # Credit every e-mail of this user whose window contains
                # that earliest action.
                for candidate in emails_by_user[user_id]:
                    candidate_sent = candidate['precise_date']
                    if (candidate_sent <= earliest <
                            candidate_sent + attribution_window):
                        elapsed_days = (earliest - candidate_sent).days
                        # A set, so each user is counted only once.
                        expected[candidate['cohort']]['action_dates'][
                            elapsed_days].add(user_id)

    # Collapse each set of users into its size.
    for summary in expected.values():
        users_by_day = summary['action_dates']
        for elapsed_days in list(users_by_day):
            users_by_day[elapsed_days] = len(users_by_day[elapsed_days])

    return expected
def generate_data(self):
    """
    Write randomized e-mail and front-page events and return what a
    cohort analysis over them is expected to report.

    Events go to `CohortTestCase.EMAIL_STREAM` and
    `CohortTestCase.FRONTPAGE_STREAM` through `self.kronos_client.put`.

    The return value maps each cohort date string to a dict with
    'cohort_size' (how many e-mails fell in that cohort) and
    'action_dates' (day offset from the e-mail -> number of distinct
    users whose earliest action that calendar day landed on that offset).
    """
    num_groups = len(CohortTestCase.EMAIL_WEEKS)
    user_ids = range(CohortTestCase.NUM_USERS)
    expected = defaultdict(lambda: {
        'cohort_size': 0,
        'action_dates': defaultdict(set),
    })

    # Email stream: the user's group selects a pair of week offsets (see
    # `EMAIL_WEEKS`); each e-mail is sent up to 72 hours into its week.
    user_emails = {}
    for user_id in user_ids:
        offset_a, offset_b = CohortTestCase.EMAIL_WEEKS[user_id % num_groups]

        week_a = CohortTestCase.START_DATETIME + timedelta(weeks=offset_a)
        sent_a = week_a + timedelta(hours=randint(0, 72))
        week_b = CohortTestCase.START_DATETIME + timedelta(weeks=offset_b)
        sent_b = week_b + timedelta(hours=randint(0, 72))

        first_email = {
            'cohort': datetime_to_date_str(week_a),
            'cohort_date': week_a,
            'precise_date': sent_a,
        }
        second_email = {
            'cohort': datetime_to_date_str(week_b),
            'cohort_date': week_b,
            'precise_date': sent_b,
        }
        user_emails[user_id] = (first_email, second_email)

        expected[first_email['cohort']]['cohort_size'] += 1
        expected[second_email['cohort']]['cohort_size'] += 1

        email_events = [
            {'user': user_id,
             constants.TIMESTAMP_FIELD: datetime_to_kronos_time(sent_a)},
            {'user': user_id,
             constants.TIMESTAMP_FIELD: datetime_to_kronos_time(sent_b)},
        ]
        self.kronos_client.put({CohortTestCase.EMAIL_STREAM: email_events})

    # Action stream: group g (1-based) hits the front page with chance
    # g / num_groups, and on the N-th day that chance is scaled by 1/N.
    for user_id in user_ids:
        # Values are datetimes; lookups use `.get` or follow a write, so
        # the `dict` default factory never fires.
        first_action_on = defaultdict(dict)
        for email in user_emails[user_id]:
            for day in xrange(CohortTestCase.ACTION_REPETITION_DAYS):
                group_probability = (
                    ((user_id % num_groups) + 1.0) / num_groups)
                day_probability = 1.0 / (day + 1)
                acts_today = (
                    random() < group_probability * day_probability)
                if not acts_today:
                    continue

                action_dt = email['precise_date'] + timedelta(days=day)
                calendar_day = action_dt.date()
                # The cohort plan compares against the earliest action
                # recorded so far for this calendar day.
                previous_first = first_action_on.get(
                    calendar_day, CohortTestCase.MAX_DT)
                first_action_on[calendar_day] = min(action_dt,
                                                    previous_first)

                action_event = {
                    'user_id': user_id,
                    '@time': datetime_to_kronos_time(action_dt),
                }
                self.kronos_client.put(
                    {CohortTestCase.FRONTPAGE_STREAM: [action_event]})

                compare_dt = first_action_on[calendar_day]
                for other in user_emails[user_id]:
                    window_opens = other['precise_date']
                    window_closes = window_opens + timedelta(
                        CohortTestCase.ACTION_REPETITION_DAYS)
                    too_early = window_opens > compare_dt
                    too_late = window_closes <= compare_dt
                    if too_early or too_late:
                        continue
                    days_since_email = (compare_dt - window_opens).days
                    # Sets guarantee each user is counted only once.
                    bucket = expected[other['cohort']]['action_dates']
                    bucket[days_since_email].add(user_id)

    # Replace every set of user ids with its cardinality.
    for cohort_name in expected:
        per_day = expected[cohort_name]['action_dates']
        for offset in list(per_day.keys()):
            per_day[offset] = len(per_day[offset])

    return expected
def cohort_queryplan(plan):
    """
    Build a metis-compatible query plan for a cohort analysis.

    Input:
    {
     'kronos_url': 'http://...',      # Optional; falls back to
                                      # app.config['KRONOS_SERVER'].
     'cohort':
      {'stream': CohortTest.EMAIL_STREAM,  # Kronos stream that defines
                                           # the cohorts.
       'transform': lambda x: x,      # Optional transformation of that
                                      # stream (read with `.get`).
       'start': date.now(),           # The day of the first cohort.
       'unit': DateUnit.XX,           # Users share a cohort if they fall
                                      # in the same unit (day/week).
       'cohorts': 5,                  # How many cohorts to track.
       'grouping_key': 'user'},       # Event key tied to the action
                                      # stream's grouping key.
     'action':
      {'stream': CohortTest.FRONTPAGE_STREAM,  # Stream users act on.
       'transform': lambda x: x,      # Optional transformation of that
                                      # stream (read with `.get`).
       'unit': DateUnit.XX,           # Granularity of the action steps.
       'repetitions': 14,             # How many units of actions to track.
       'grouping_key': 'user_id'}     # Event key tied to the cohort
                                      # stream's grouping key.
    }

    Each `unit` is used as a keyword-argument name for `timedelta`.
    NOTE(review): `timedelta` has no `months` keyword, so a month unit
    would presumably fail here -- confirm against DateUnit.

    Output:
    The aggregate plan object, which counts rows per cohort date and
    action step under the alias `cohort_actions`.
    """
    cohort_spec = plan['cohort']
    action_spec = plan['action']
    kronos_url = plan.get('kronos_url', app.config['KRONOS_SERVER'])

    # Work out, in Kronos time, how far the cohort stream and the action
    # stream have to be read. The end points are first computed as
    # date-like values and only then converted.
    first_cohort_day = cohort_spec['start']
    cohort_start = datetime_to_kronos_time(_date_to_datetime(first_cohort_day))
    cohort_window = timedelta(
        **{cohort_spec['unit']: cohort_spec['cohorts']})
    last_cohort_day = first_cohort_day + cohort_window
    action_window = timedelta(
        **{action_spec['unit']: action_spec['repetitions']})
    last_action_day = last_cohort_day + action_window
    # NOTE(review): the `+ 1` presumably makes the final instant part of
    # the range -- confirm against the stream reader's end semantics.
    cohort_end = datetime_to_kronos_time(
        _date_to_datetime(last_cohort_day)) + 1
    action_end = datetime_to_kronos_time(
        _date_to_datetime(last_action_day)) + 1

    cohort_events = _cohort_stream_transform(
        kronos_url,
        cohort_spec['stream'],
        cohort_start,
        cohort_end,
        cohort_spec.get('transform'),
        cohort_spec['grouping_key'],
        cohort_spec['unit'])
    # The action stream is read from the same start as the cohort stream.
    action_events = _cohort_stream_transform(
        kronos_url,
        action_spec['stream'],
        cohort_start,
        action_end,
        action_spec.get('transform'),
        action_spec['grouping_key'],
        action_spec['unit'])

    # Length, in Kronos time, of the window after a cohort event during
    # which actions are attributed to it.
    attribution_window = (
        DateUnit.unit_to_kronos_time(action_spec['unit']) *
        action_spec['repetitions'])

    def cohort_field(name):
        return p('cohort.%s' % name)

    def action_field(name):
        return p('action.%s' % name)

    # An action joins a cohort event when it belongs to the same user, is
    # not earlier than the cohort event, and falls inside the window.
    same_user = cond(cohort_field(cohort_spec['grouping_key']),
                     action_field(action_spec['grouping_key']),
                     ConditionOpType.EQ)
    not_before_cohort = cond(action_field(TIMESTAMP_FIELD),
                             cohort_field(TIMESTAMP_FIELD),
                             ConditionOpType.GTE)
    window_close = f(FunctionType.ADD,
                     [cohort_field(TIMESTAMP_FIELD), c(attribution_window)])
    inside_window = cond(action_field(TIMESTAMP_FIELD),
                         window_close,
                         ConditionOpType.LT)
    joined = join(cohort_events,
                  action_events,
                  cond_and(same_user, not_before_cohort, inside_window),
                  left_alias='cohort',
                  right_alias='action')

    # First pass: one group per (cohort date, user, action step), with no
    # aggregate operations attached.
    elapsed = f(FunctionType.SUBTRACT,
                [action_field(TIMESTAMP_FIELD),
                 cohort_field(TIMESTAMP_FIELD)])
    per_user_groups = {
        TIMESTAMP_FIELD: p('cohort.date'),
        'group': cohort_field(cohort_spec['grouping_key']),
        'action_step': f(
            FunctionType.FLOOR,
            [elapsed, c(DateUnit.unit_to_kronos_time(action_spec['unit']))]),
    }
    per_user = agg(joined, per_user_groups, [])

    # Second pass: count those groups per (cohort date, action step).
    per_cohort_groups = {
        TIMESTAMP_FIELD: p(TIMESTAMP_FIELD),
        'action_step': p('action_step'),
    }
    per_cohort = agg(
        per_user,
        per_cohort_groups,
        [agg_op(AggregateType.COUNT, [], alias='cohort_actions')])

    # TODO(marcua): Also sum up the cohort sizes, join with the plan.
    return per_cohort