def handle(self, *args, **options):
    """Print new-user and old-user retention curves built from the fact log.

    Positional ``args`` are all optional, are parsed with ``int`` and
    default to 1:

    * ``args[0]`` -- signup window, in days
    * ``args[1]`` -- retention window, in days
    * ``args[2]`` -- new user window, in days

    The windows are laid end to end and finish at the current UTC time, so
    ``new_user_start < signup_start < retention_start < stop``.  Rows come
    from ``fact_query.iterator(start=new_user_start, stop=stop)`` and are
    read through the keys ``"dt"``, ``"ts"``, ``"user"``, ``"type"`` and
    ``"metric"``.

    Returns None; all results are written to stdout.  ``options`` is unused.
    """
    # NOTE(review): "ts" is floor-divided by this to get a day index, so it
    # is presumably a timestamp in seconds -- confirm against fact_query.
    SECONDS_PER_DAY = 24 * 60 * 60

    SIGNUP_WINDOW = int(args[0]) if len(args) > 0 else 1
    RETENTION_WINDOW = int(args[1]) if len(args) > 1 else 1
    NEW_USER_WINDOW = int(args[2]) if len(args) > 2 else 1

    # print(<one string>) parses and prints identically as a Python 2 print
    # statement and as the Python 3 print function.
    window_lines = (
        "Signup window: %s days" % SIGNUP_WINDOW,
        "Retention window: %s days" % RETENTION_WINDOW,
        "New user window: %s days" % NEW_USER_WINDOW,
    )
    for line in window_lines:
        print(line)

    stop = datetime.datetime.utcnow()
    retention_start = stop - datetime.timedelta(days=RETENTION_WINDOW)
    signup_start = retention_start - datetime.timedelta(days=SIGNUP_WINDOW)
    new_user_start = signup_start - datetime.timedelta(days=NEW_USER_WINDOW)

    def ratio(numerator, denominator):
        # An empty cohort used to abort the whole report with
        # ZeroDivisionError; report it as "n/a" instead.
        if not denominator:
            return "n/a"
        return float(numerator) / denominator

    users = collections.defaultdict(User)
    old_user_max = float("inf")
    retained_first_ts = None

    for row in fact_query.iterator(start=new_user_start, stop=stop):
        # Timestamp of the first row inside the signup window; day indexes
        # for the old-user curve are measured from it.
        if not retained_first_ts and row["dt"] >= signup_start:
            retained_first_ts = row["ts"]

        key = row.get("type")
        if key == "metric":
            key = row.get("metric")

        user = users[row["user"]] if "user" in row else None
        if not user:
            continue

        if key == "signup":
            # Smallest user id that signed up inside the queried range;
            # ids at or below it are treated as pre-existing users.
            old_user_max = min(old_user_max, row["user"])
            user.signup_ts = row["ts"]
            user.signup_dt = row["dt"]
        elif key == "view":
            if not user.first_view_ts:
                user.first_view_ts = row["ts"]
                user.first_view_dt = row["dt"]
            # NOTE(review): the original indentation was lost; this is
            # recorded for every view (not only the first) so the curve has
            # an entry per day since signup -- confirm.
            user.came_back[(row["ts"] - user.signup_ts) // SECONDS_PER_DAY] = 1
            if row["dt"] >= signup_start and row["user"] <= old_user_max:
                day = (row["ts"] - retained_first_ts) // SECONDS_PER_DAY
                user.retained_visits[day] = 1

    # Every recorded count is 1, so Counter.update() yields the same totals
    # as repeated Counter addition without building a new Counter per user.
    user_retention = collections.Counter()
    retained_visits = collections.Counter()
    for user in users.values():
        retained_visits.update(user.retained_visits)
        if user.signup_dt and signup_start <= user.signup_dt <= retention_start:
            user_retention.update(user.came_back)

    print("")
    print("")
    for line in window_lines:
        print(line)
    print("")

    print("New user retention curve")
    for day in sorted(user_retention):
        if day < NEW_USER_WINDOW:
            print("%s : %s" % (day, user_retention[day]))
    print("")

    print("Old user retention curve")
    retained_visits_list = [
        retained_visits[day] for day in range(RETENTION_WINDOW + SIGNUP_WINDOW)
    ]
    for day, value in enumerate(retained_visits_list):
        print("%s : %s" % (day, value))
    print("")
    print("")

    print("Old user retention curve averaged over signup window days")
    retained_average = [
        sum(retained_visits_list[day:day + SIGNUP_WINDOW]) / float(SIGNUP_WINDOW)
        for day in range(RETENTION_WINDOW)
    ]
    for day, value in enumerate(retained_average):
        print("%s : %s" % (day, value))
    print("")

    print("=" * 100)
    print("")

    n = user_retention[NEW_USER_WINDOW - 1]
    d = user_retention[0]
    print("New user activation %% over new user window: %s / %s = %s" % (n, d, ratio(n, d)))

    d = retained_average[0]
    n = retained_average[-1]
    print("Retained %% over retained window: %s / %s = %s" % (n, d, ratio(n, d)))
def handle(self, *args, **options):
    """Print new-user and old-user retention curves built from the fact log.

    Positional ``args`` are all optional, are parsed with ``int`` and
    default to 1:

    * ``args[0]`` -- signup window, in days
    * ``args[1]`` -- retention window, in days
    * ``args[2]`` -- new user window, in days

    The windows are laid end to end and finish at the current UTC time, so
    ``new_user_start < signup_start < retention_start < stop``.  Rows come
    from ``fact_query.iterator(start=new_user_start, stop=stop)`` and are
    read through the keys ``'dt'``, ``'ts'``, ``'user'``, ``'type'`` and
    ``'metric'``.

    Returns None; all results are written to stdout.  ``options`` is unused.
    """
    # NOTE(review): 'ts' is floor-divided by this to get a day index, so it
    # is presumably a timestamp in seconds -- confirm against fact_query.
    SECONDS_PER_DAY = 24 * 60 * 60

    SIGNUP_WINDOW = int(args[0]) if len(args) > 0 else 1
    RETENTION_WINDOW = int(args[1]) if len(args) > 1 else 1
    NEW_USER_WINDOW = int(args[2]) if len(args) > 2 else 1

    # print(<one string>) parses and prints identically as a Python 2 print
    # statement and as the Python 3 print function.
    window_lines = (
        "Signup window: %s days" % SIGNUP_WINDOW,
        "Retention window: %s days" % RETENTION_WINDOW,
        "New user window: %s days" % NEW_USER_WINDOW,
    )
    for line in window_lines:
        print(line)

    stop = datetime.datetime.utcnow()
    retention_start = stop - datetime.timedelta(days=RETENTION_WINDOW)
    signup_start = retention_start - datetime.timedelta(days=SIGNUP_WINDOW)
    new_user_start = signup_start - datetime.timedelta(days=NEW_USER_WINDOW)

    def ratio(numerator, denominator):
        # An empty cohort used to abort the whole report with
        # ZeroDivisionError; report it as "n/a" instead.
        if not denominator:
            return "n/a"
        return float(numerator) / denominator

    users = collections.defaultdict(User)
    old_user_max = float("inf")
    retained_first_ts = None

    for row in fact_query.iterator(start=new_user_start, stop=stop):
        # Timestamp of the first row inside the signup window; day indexes
        # for the old-user curve are measured from it.
        if not retained_first_ts and row['dt'] >= signup_start:
            retained_first_ts = row['ts']

        key = row.get('type')
        if key == 'metric':
            key = row.get('metric')

        user = users[row['user']] if 'user' in row else None
        if not user:
            continue

        if key == 'signup':
            # Smallest user id that signed up inside the queried range;
            # ids at or below it are treated as pre-existing users.
            old_user_max = min(old_user_max, row['user'])
            user.signup_ts = row['ts']
            user.signup_dt = row['dt']
        elif key == 'view':
            if not user.first_view_ts:
                user.first_view_ts = row['ts']
                user.first_view_dt = row['dt']
            # NOTE(review): the original indentation was lost; this is
            # recorded for every view (not only the first) so the curve has
            # an entry per day since signup -- confirm.
            user.came_back[(row['ts'] - user.signup_ts) // SECONDS_PER_DAY] = 1
            if row['dt'] >= signup_start and row['user'] <= old_user_max:
                day = (row['ts'] - retained_first_ts) // SECONDS_PER_DAY
                user.retained_visits[day] = 1

    # Every recorded count is 1, so Counter.update() yields the same totals
    # as repeated Counter addition without building a new Counter per user.
    user_retention = collections.Counter()
    retained_visits = collections.Counter()
    for user in users.values():
        retained_visits.update(user.retained_visits)
        if user.signup_dt and signup_start <= user.signup_dt <= retention_start:
            user_retention.update(user.came_back)

    print("")
    print("")
    for line in window_lines:
        print(line)
    print("")

    print("New user retention curve")
    for day in sorted(user_retention):
        if day < NEW_USER_WINDOW:
            print("%s : %s" % (day, user_retention[day]))
    print("")

    print("Old user retention curve")
    retained_visits_list = [
        retained_visits[day] for day in range(RETENTION_WINDOW + SIGNUP_WINDOW)
    ]
    for day, value in enumerate(retained_visits_list):
        print("%s : %s" % (day, value))
    print("")
    print("")

    print("Old user retention curve averaged over signup window days")
    retained_average = [
        sum(retained_visits_list[day:day + SIGNUP_WINDOW]) / float(SIGNUP_WINDOW)
        for day in range(RETENTION_WINDOW)
    ]
    for day, value in enumerate(retained_average):
        print("%s : %s" % (day, value))
    print("")

    print("=" * 100)
    print("")

    n = user_retention[NEW_USER_WINDOW - 1]
    d = user_retention[0]
    print("New user activation %% over new user window: %s / %s = %s" % (n, d, ratio(n, d)))

    d = retained_average[0]
    n = retained_average[-1]
    print("Retained %% over retained window: %s / %s = %s" % (n, d, ratio(n, d)))
def handle(self, *args, **options):
    """Print virality stats: shares by recent signups and what they led to.

    Positional ``args`` are both optional, are parsed with ``float`` and
    default to 0.1:

    * ``args[0]`` -- signup window, in days
    * ``args[1]`` -- virality window, in days

    The signup window is immediately followed by the virality window, which
    ends at the current UTC time.  Rows come from
    ``fact_query.iterator(start=signup_start, stop=stop)``; rows whose
    ``'ip'`` is flagged by ``is_bot`` are ignored.

    Returns None; all results are written to stdout.  ``options`` is unused.
    """
    SIGNUP_WINDOW = float(args[0]) if len(args) > 0 else 0.1
    VIRALITY_WINDOW = float(args[1]) if len(args) > 1 else 0.1

    # print(<one string>) parses and prints identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("Signup window: %s days" % SIGNUP_WINDOW)
    print("Virality window: %s days" % VIRALITY_WINDOW)

    stop = datetime.datetime.utcnow()
    # Built once here instead of once per row inside the loop.
    virality_window = datetime.timedelta(days=VIRALITY_WINDOW)
    virality_start = stop - virality_window
    signup_start = virality_start - datetime.timedelta(days=SIGNUP_WINDOW)

    sessions = collections.defaultdict(Session)
    users = collections.defaultdict(User)
    all_shares = 0

    for row in fact_query.iterator(start=signup_start, stop=stop):
        session = None
        if 'ip' in row:
            if is_bot(row['ip']):
                continue
            if 'utma' in row:
                if row['utma'] not in sessions:
                    # First time we've seen this google session.
                    # Tie any previous requests from this IP to this utma
                    # session by re-keying the IP's session object.
                    sessions[row['utma']] = sessions[row['ip']]
                    del sessions[row['ip']]
                session = sessions[row['utma']]
            else:
                # We don't have any google session, use the ip instead.
                session = sessions[row['ip']]

        user = users[row['user']] if 'user' in row else None

        key = row.get('type')
        if key == 'metric':
            key = row.get('metric')

        # A signup row without 'user' (or without 'ip') used to raise
        # AttributeError on None; record whatever part is available.
        if key == 'signup' and user is not None:
            user.id = row['user']
            user.signup_dt = row['dt']
            user.in_signup_window = signup_start <= user.signup_dt < virality_start
            if session is not None:
                session.users.add(user.id)

        if key == 'create_share_url':
            all_shares += 1
            # Only shares made within the virality window after the user's
            # own signup are attributed to that user.
            if user and user.signup_dt and row['dt'] < user.signup_dt + virality_window:
                user.shares += 1

        if session and key == 'share_redirect':
            session.share_views += 1
            session.sharers.add(row['sharer'])

    signups_initial = set(user for user in users.values() if user.in_signup_window)

    # NOTE(review): session.sharers holds row['sharer'] values while
    # signups_initial holds User objects; the intersection below only
    # matches if User hashes/compares equal to those values -- confirm.
    viral_sessions = set()
    outside_viral_sessions = set()
    for session in sessions.values():
        if session.sharers & signups_initial:
            viral_sessions.add(session)
        elif session.sharers:
            print(session.sharers)
            outside_viral_sessions.add(session)

    viral_signups = set()
    for session in viral_sessions:
        viral_signups |= session.users

    outside_signups = set()
    for session in outside_viral_sessions:
        outside_signups |= session.users

    print("")
    print("")
    print("CORE STATS:")
    print("Initial Signups (IS): %s" % len(signups_initial))
    print("Shares from IS: %s" % sum(user.shares for user in signups_initial))
    print("Sessions from shares: %s" % len(viral_sessions))
    print("Signups from share sessions: %s" % len(viral_signups))
    print("")
    print("")
    print("JUST FOR REFERENCE:")
    print("All shares (new and old users): %s" % all_shares)
    print("Outside sessions from shares: %s" % len(outside_viral_sessions))
    print("Outside signups from shares: %s" % len(outside_signups))
def handle(self, *args, **options):
    """Print virality stats: shares by recent signups and what they led to.

    Positional ``args`` are both optional, are parsed with ``float`` and
    default to 0.1:

    * ``args[0]`` -- signup window, in days
    * ``args[1]`` -- virality window, in days

    The signup window is immediately followed by the virality window, which
    ends at the current UTC time.  Rows come from
    ``fact_query.iterator(start=signup_start, stop=stop)``; rows whose
    ``"ip"`` is flagged by ``is_bot`` are ignored.

    Returns None; all results are written to stdout.  ``options`` is unused.
    """
    SIGNUP_WINDOW = float(args[0]) if len(args) > 0 else 0.1
    VIRALITY_WINDOW = float(args[1]) if len(args) > 1 else 0.1

    # print(<one string>) parses and prints identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("Signup window: %s days" % SIGNUP_WINDOW)
    print("Virality window: %s days" % VIRALITY_WINDOW)

    stop = datetime.datetime.utcnow()
    # Built once here instead of once per row inside the loop.
    virality_window = datetime.timedelta(days=VIRALITY_WINDOW)
    virality_start = stop - virality_window
    signup_start = virality_start - datetime.timedelta(days=SIGNUP_WINDOW)

    sessions = collections.defaultdict(Session)
    users = collections.defaultdict(User)
    all_shares = 0

    for row in fact_query.iterator(start=signup_start, stop=stop):
        session = None
        if "ip" in row:
            if is_bot(row["ip"]):
                continue
            if "utma" in row:
                if row["utma"] not in sessions:
                    # First time we've seen this google session.
                    # Tie any previous requests from this IP to this utma
                    # session by re-keying the IP's session object.
                    sessions[row["utma"]] = sessions[row["ip"]]
                    del sessions[row["ip"]]
                session = sessions[row["utma"]]
            else:
                # We don't have any google session, use the ip instead.
                session = sessions[row["ip"]]

        user = users[row["user"]] if "user" in row else None

        key = row.get("type")
        if key == "metric":
            key = row.get("metric")

        # A signup row without "user" (or without "ip") used to raise
        # AttributeError on None; record whatever part is available.
        if key == "signup" and user is not None:
            user.id = row["user"]
            user.signup_dt = row["dt"]
            user.in_signup_window = signup_start <= user.signup_dt < virality_start
            if session is not None:
                session.users.add(user.id)

        if key == "create_share_url":
            all_shares += 1
            # Only shares made within the virality window after the user's
            # own signup are attributed to that user.
            if user and user.signup_dt and row["dt"] < user.signup_dt + virality_window:
                user.shares += 1

        if session and key == "share_redirect":
            session.share_views += 1
            session.sharers.add(row["sharer"])

    signups_initial = set(user for user in users.values() if user.in_signup_window)

    # NOTE(review): session.sharers holds row["sharer"] values while
    # signups_initial holds User objects; the intersection below only
    # matches if User hashes/compares equal to those values -- confirm.
    viral_sessions = set()
    outside_viral_sessions = set()
    for session in sessions.values():
        if session.sharers & signups_initial:
            viral_sessions.add(session)
        elif session.sharers:
            print(session.sharers)
            outside_viral_sessions.add(session)

    viral_signups = set()
    for session in viral_sessions:
        viral_signups |= session.users

    outside_signups = set()
    for session in outside_viral_sessions:
        outside_signups |= session.users

    print("")
    print("")
    print("CORE STATS:")
    print("Initial Signups (IS): %s" % len(signups_initial))
    print("Shares from IS: %s" % sum(user.shares for user in signups_initial))
    print("Sessions from shares: %s" % len(viral_sessions))
    print("Signups from share sessions: %s" % len(viral_signups))
    print("")
    print("")
    print("JUST FOR REFERENCE:")
    print("All shares (new and old users): %s" % all_shares)
    print("Outside sessions from shares: %s" % len(outside_viral_sessions))
    print("Outside signups from shares: %s" % len(outside_signups))
def __init__(self, **kwargs):
    """Store the query window and install a lazy fact iterator factory.

    Every keyword argument is forwarded unchanged to
    ``fact_query.iterator`` each time ``self.fact_iter()`` is called, and
    the result is passed through ``_wrapped_iter``.  The ``start`` and
    ``stop`` entries are also kept on the instance (``None`` when absent),
    and ``self._phase`` starts at 1.
    """
    def make_iter():
        # Nothing is queried until the factory is actually called.
        return _wrapped_iter(fact_query.iterator(**kwargs))

    self.fact_iter = make_iter
    self._phase = 1
    self.start, self.stop = kwargs.get('start'), kwargs.get('stop')