def test_index_latest(self):
    """Check that index_latest_stats reindexes from the newest indexed day.

    An UpdateCount dated five days ago is indexed; the cron is then expected
    to call the ``index_stats`` command with a ``<that day>:<today>`` range.
    """
    self.create_switch("local-statistics-processing")

    newest_indexed = datetime.date.today() - datetime.timedelta(days=5)
    UpdateCount.index({"date": newest_indexed})
    self.refresh("stats")

    iso_day = "%Y-%m-%d"
    range_start = newest_indexed.strftime(iso_day)
    range_end = datetime.date.today().strftime(iso_day)
    expected_range = "%s:%s" % (range_start, range_end)

    with mock.patch("olympia.stats.cron.call_command") as fake_call_command:
        cron.index_latest_stats()
        fake_call_command.assert_called_with(
            "index_stats", addons=None, date=expected_range)
def test_index_latest(self):
    """index_latest_stats should ask for a reindex up to today.

    The range passed to ``index_stats`` starts at the date of the most
    recent indexed UpdateCount (five days ago here) and ends today.
    """
    self.create_switch('local-statistics-processing')

    five_days = datetime.timedelta(days=5)
    last_indexed_day = datetime.date.today() - five_days
    UpdateCount.index({'date': last_indexed_day})
    self.refresh('stats')

    first = last_indexed_day.strftime('%Y-%m-%d')
    last = datetime.date.today().strftime('%Y-%m-%d')

    patcher = mock.patch('olympia.stats.cron.call_command')
    with patcher as patched_call_command:
        cron.index_latest_stats()
        patched_call_command.assert_called_with(
            'index_stats', addons=None, date='%s:%s' % (first, last))
def test_trim_field(self):
    """Exercise trim_field on empty, small, oversized and borderline dicts."""
    trim = self.command.trim_field
    max_size = 2 ** 16
    update_count = UpdateCount(addon_id=3615, count=1, date='2015-01-11')

    # Empty field.
    trim(update_count.versions)
    assert not update_count.versions

    # Small enough to fit in the db: must come back unchanged.
    update_count.versions = {'3.6': 123, '3.7': 321}
    trim(update_count.versions)
    assert update_count.versions == {'3.6': 123, '3.7': 321}

    # Too big, must be trimmed: the least used key goes, the most used stay.
    oversized_key = 'x' * max_size
    update_count.versions[oversized_key] = 1
    trim(update_count.versions)
    assert update_count.versions == {'3.6': 123, '3.7': 321}

    # Same oversized key, but this time it is the most used one. The least
    # used keys are removed first, the field is still too big, so every key
    # ends up being removed.
    update_count.versions[oversized_key] = 1000
    trim(update_count.versions)
    assert update_count.versions == {}

    # Make sure we can store a very large field in the database.
    # This key length makes the serialized dict barely fit in the db.
    borderline_key = 'x' * 65528
    update_count.versions[borderline_key] = 1
    assert len(json.dumps(update_count.versions)) == max_size - 1
    update_count.save()

    # Reload: it fits in the database, so no truncation.
    reloaded = UpdateCount.objects.get(pk=update_count.pk)
    assert len(json.dumps(reloaded.versions)) == max_size - 1
def setUp(self):
    """Create four add-ons that all get the same 21 days of update counts."""
    self.persona = addon_factory(type=amo.ADDON_PERSONA)
    self.extension = addon_factory()
    self.static_theme = addon_factory(type=amo.ADDON_STATICTHEME)
    self.awaiting_review = addon_factory(status=amo.STATUS_NOMINATED)

    today = datetime.date.today()
    # One count per day, starting yesterday and going back in time.
    daily_counts = (
        827080, 787930, 995860, 1044260, 105431, 106065, 980930,
        817100, 78843, 993830, 104431, 105943, 105039, 100183,
        82265, 100183, 82265, 100183, 82265, 100183, 82265,
    )
    stats = [
        (today - datetime.timedelta(days=age), total)
        for age, total in enumerate(daily_counts, 1)
    ]

    addons = (
        self.persona,
        self.extension,
        self.static_theme,
        self.awaiting_review,
    )
    for addon in addons:
        rows = [
            UpdateCount(addon=addon, date=day, count=total)
            for day, total in stats
        ]
        UpdateCount.objects.bulk_create(rows)
def test_13_day_window(self):
    """Check the average daily users computed from recent update counts.

    Fifteen days of counts are stored; the expected average only covers the
    twelve most recent ones.
    """
    addon = Addon.objects.get(pk=3615)

    # can't use a fixed date since we are relying on
    # mysql to get us the `CURDATE()`
    today = datetime.date.today()

    # data is coming from `tab groups` add-on from
    # jun 11 till may 29th 2017 (most recent day first)
    daily_counts = [
        82708, 78793, 99586, 104426, 105431, 106065, 98093, 81710,
        78843, 99383, 104431, 105943, 105039, 100183, 82265,
    ]
    rows = []
    for age, total in enumerate(daily_counts, 1):
        day = today - datetime.timedelta(days=age)
        rows.append(UpdateCount(addon=addon, date=day, count=total))
    UpdateCount.objects.bulk_create(rows)

    addon.update(average_daily_users=0)
    cron.update_addon_average_daily_users()
    addon.refresh_from_db()

    # Sanity check of the expected value before comparing with the model.
    assert sum(daily_counts[:12]) / 12 == 95451
    assert addon.average_daily_users == 95451
def test_update_version(self):
    """update_version stores a count per version, trimming long versions."""
    # Initialize the known addons and their versions.
    self.command.addons_versions = {3615: ['3.5', '3.6']}
    update_count = UpdateCount(addon_id=3615)

    self.command.update_version(update_count, '3.6', 123)
    assert update_count.versions == {'3.6': 123}

    # Test very long version: 33 characters in, 32 characters stored.
    overlong_version = '1' * 33
    trimmed_version = '1' * 32
    self.command.update_version(update_count, overlong_version, 1)
    assert update_count.versions == {'3.6': 123, trimmed_version: 1}
def migrate_theme_update_count(lwt, static_theme, **kw):
    """Create UpdateCount instances from ThemeUpdateCount instances.

    Every ThemeUpdateCount belonging to ``lwt`` (the lightweight theme) is
    copied, date and count, into a new UpdateCount attached to
    ``static_theme``. Any extra ``**kw`` are forwarded to the queryset
    filter, for example to restrict the copy to a given day or day range.
    """
    matching = ThemeUpdateCount.objects.filter(addon_id=lwt.id, **kw)

    copies = []
    for theme_count in matching.iterator():
        copies.append(
            UpdateCount(
                addon_id=static_theme.id,
                date=theme_count.date,
                count=theme_count.count,
            )
        )

    # Insert in batches of 100 rows.
    UpdateCount.objects.bulk_create(copies, 100)
def test_stats_from_model_update_count():
    """serialize_stats must emit every UpdateCount field as JSON."""
    update_count = UpdateCount(
        addon_id=321,
        date='2016-01-18',
        count=123,
        versions={u'3.8': 2, u'3.7': 3},
        statuses={u'userEnabled': 5},
        applications={
            u'{ec8030f7-c20a-464f-9b0e-13a3a9e97384}': {u'3.6': 18},
        },
        oses={u'WINNT': 5},
        locales={u'en-us': 1, u'en-US': 4},
    )

    expected = {
        'date': '2016-01-18',
        'addon': 321,
        'count': 123,
        'versions': {'3.7': 3, '3.8': 2},
        'oses': {'WINNT': 5},
        'applications': {
            '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}': {'3.6': 18},
        },
        'locales': {'en-US': 4, 'en-us': 1},
        'statuses': {'userEnabled': 5},
    }

    serialized = serialize_stats(update_count)
    assert json.loads(serialized) == expected
def test_update_app(self):
    """update_app ignores unknown apps and malformed versions."""
    firefox_guid = '{ec8030f7-c20a-464f-9b0e-13a3a9e97384}'
    update_count = UpdateCount(addon_id=3615)

    # Non-existent app.
    self.command.update_app(update_count, 'foobar', '1.0', 123)
    assert not update_count.applications

    # Malformed versions.
    for bad_version in ('3.0.1.2', '3.0123', '3.0c2', 'a.b.c'):
        self.command.update_app(
            update_count, firefox_guid, bad_version, 123)
    assert not update_count.applications

    # Well formed versions.
    well_formed = (
        ('1.0', 123),
        ('1.0.1', 124),
        ('1.0a1', 125),
        ('1.0b2', 126),
    )
    for app_version, total in well_formed:
        self.command.update_app(
            update_count, firefox_guid, app_version, total)
    assert update_count.applications == {firefox_guid: dict(well_formed)}
def test_update_locale(self):
    """update_locale rejects unknown locales and accepts the known ones."""
    # Taken from the language pack index.
    current_locales = [
        'ach', 'af', 'ak', 'an', 'ar', 'as', 'ast', 'ast-ES', 'az',
        'bb-BK', 'be', 'bg', 'bn-BD', 'bn-IN', 'br', 'bs', 'ca',
        'ca-valencia', 'cs', 'csb', 'cy', 'cy-GB', 'da', 'de', 'dsb',
        'el', 'en-GB', 'en-ZA', 'eo', 'es-AR', 'es-CL', 'es-ES', 'es-MX',
        'et', 'eu', 'fa', 'ff', 'fi', 'fj-FJ', 'fr', 'fur-IT', 'fy-NL',
        'ga-IE', 'gd', 'gl', 'gu-IN', 'he', 'hi', 'hi-IN', 'hr', 'hsb',
        'hu', 'hy-AM', 'id', 'is', 'it', 'ja', 'kk', 'km', 'kn', 'ko',
        'ku', 'lg', 'lij', 'lt', 'lv', 'mai', 'mg', 'mk', 'ml', 'mr',
        'ms', 'nb-NO', 'nl', 'nn-NO', 'nr', 'nso', 'or', 'pa-IN', 'pl',
        'pt-BR', 'pt-PT', 'rm', 'ro', 'ru', 'si', 'sk', 'sl', 'son',
        'sq', 'sr', 'ss', 'st', 'sv-SE', 'sw', 'sw-TZ', 'ta', 'ta-IN',
        'ta-LK', 'te', 'th', 'tn', 'tr', 'ts', 'uk', 'ur', 've', 'vi',
        'wa', 'wo-SN', 'xh', 'zap-MX-diiste', 'zh-CN', 'zh-TW', 'zu',
    ]
    update_count = UpdateCount(addon_id=3615)

    # Non-existent locale.
    self.command.update_locale(update_count, 'foobar', 123)
    assert not update_count.locales

    # Every locale from the list above must be recorded.
    for known_locale in current_locales:
        self.command.update_locale(update_count, known_locale, 1)
    assert len(update_count.locales) == len(current_locales)
def test_update_status(self):
    """update_status ignores unknown statuses and records known ones."""
    update_count = UpdateCount(addon_id=3615)
    record_status = self.command.update_status

    # Non-existent status.
    record_status(update_count, 'foobar', 123)
    assert not update_count.statuses

    record_status(update_count, 'userEnabled', 123)
    assert update_count.statuses == {'userEnabled': 123}
def test_update_os(self):
    """update_os ignores unknown OSes and records known ones."""
    update_count = UpdateCount(addon_id=3615)
    record_os = self.command.update_os

    # Non-existent OS.
    record_os(update_count, 'foobar', 123)
    assert not update_count.oses

    record_os(update_count, 'WINNT', 123)
    assert update_count.oses == {'WINNT': 123}
def handle(self, *args, **options):
    """Import one day of theme update counts from a hive export.

    Reads the ``separator``, ``date``, ``stats_source`` and (for the
    ``file`` source) ``folder_name`` options. When no date is given,
    yesterday is used.

    The input file is read line by line; each well-formed line has four
    fields (day, id, source, count). Counts are accumulated into one
    ThemeUpdateCount per known persona add-on, and, for personas that were
    migrated to static themes, into an UpdateCount for the static theme.

    Raises CommandError when ``get_date`` reports a different day for the
    file than the requested one.
    """
    sep = options['separator']

    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    # NOTE(review): `filepath` is only bound for the 's3' and 'file'
    # sources; presumably the argument parser restricts the choices -
    # confirm.
    if options['stats_source'] == 's3':
        filepath = 's3://' + '/'.join([settings.AWS_STATS_S3_BUCKET,
                                       'amo_stats',
                                       'theme_update_counts',
                                       day, '000000_0'])

    elif options['stats_source'] == 'file':
        folder = options['folder_name']
        folder = path.join(settings.TMP_PATH, folder, day)
        filepath = path.join(folder, 'theme_update_counts.hive')

    # Make sure we're not trying to update with mismatched data.
    if get_date(filepath, sep) != day:
        raise CommandError('%s file contains data for another day' %
                           filepath)

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    ThemeUpdateCount.objects.filter(date=day).delete()

    # Both dicts are keyed by add-on id and hold unsaved model instances
    # that are bulk-created after the loop.
    theme_update_counts = {}
    new_stheme_update_counts = {}

    # Preload a set containing the ids of all the persona Add-on objects
    # that we care about. When looping, if we find an id that is not in
    # that set, we'll reject it.
    addons = set(Addon.objects.filter(type=amo.ADDON_PERSONA,
                                      status=amo.STATUS_PUBLIC,
                                      persona__isnull=False)
                 .values_list('id', flat=True))
    # Preload a dict of persona to static theme ids that are migrated.
    migrated_personas = dict(
        MigratedLWT.objects.values_list(
            'lightweight_theme_id', 'static_theme_id')
    )
    # NOTE(review): this queryset is not restricted to `day`, so when a
    # static theme has several UpdateCount rows, the one kept per addon_id
    # is whichever is iterated last - confirm this is intended.
    existing_stheme_update_counts = {
        uc.addon_id: uc for uc in UpdateCount.objects.filter(
            addon_id__in=migrated_personas.values())}
    # Preload all the Personas once and for all. This builds a dict where
    # each key (the persona_id we get from the hive query) has the addon_id
    # as value.
    persona_to_addon = dict(Persona.objects.values_list('persona_id',
                                                        'addon_id'))

    count_file = get_stats_data(filepath)
    for index, line in enumerate(count_file):
        # Progress log every million lines.
        if index and (index % 1000000) == 0:
            log.info('Processed %s lines' % index)

        # Drop the last character of the line (presumably the trailing
        # newline) before splitting.
        splitted = line[:-1].split(sep)

        if len(splitted) != 4:
            log.debug('Badly formatted row: %s' % line)
            continue

        # Note: this rebinds `day` to the value found in the row, which is
        # what the model instances created below are dated with.
        day, id_, src, count = splitted
        try:
            id_, count = int(id_), int(count)
        except ValueError:  # Badly formatted? Drop.
            continue

        if src:
            src = src.strip()

        # If src is 'gp', it's an old request for the persona id.
        if id_ not in persona_to_addon and src == 'gp':
            continue  # No such persona.
        addon_id = persona_to_addon[id_] if src == 'gp' else id_

        # Is the persona already migrated to static theme?
        if addon_id in migrated_personas:
            mig_addon_id = migrated_personas[addon_id]
            if mig_addon_id in existing_stheme_update_counts:
                # Already in the database: saved right away, once per
                # matching line.
                existing_stheme_update_counts[mig_addon_id].count += count
                existing_stheme_update_counts[mig_addon_id].save()
            elif mig_addon_id in new_stheme_update_counts:
                new_stheme_update_counts[mig_addon_id].count += count
            else:
                new_stheme_update_counts[mig_addon_id] = UpdateCount(
                    addon_id=mig_addon_id, date=day, count=count)

        # Does this addon exist?
        if addon_id not in addons:
            continue

        # Memoize the ThemeUpdateCount.
        if addon_id in theme_update_counts:
            tuc = theme_update_counts[addon_id]
        else:
            tuc = ThemeUpdateCount(addon_id=addon_id, date=day, count=0)
            theme_update_counts[addon_id] = tuc

        # We can now fill the ThemeUpdateCount object.
        tuc.count += count

    # Create in bulk: this is much faster.
    ThemeUpdateCount.objects.bulk_create(theme_update_counts.values(), 100)
    UpdateCount.objects.bulk_create(new_stheme_update_counts.values(), 100)

    # `index` comes from the loop above, so this relies on the file having
    # at least one line.
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up file.
    if options['stats_source'] == 'file':
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
def handle(self, *args, **options):
    """Import one day of add-on update counts from the hive exports.

    Reads the ``separator``, ``date``, ``stats_source`` and (for the
    ``file`` source) ``folder_name`` options. When no date is given,
    yesterday is used.

    One file per group ('app', 'locale', 'os', 'status', 'version') is
    read. Each valid row is folded into a single UpdateCount per add-on
    guid, which is then bulk-created. Existing UpdateCount rows for the day
    are deleted first.

    Raises CommandError when ``get_date`` reports a different day for one
    of the files than the requested one.
    """
    sep = options['separator']

    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

    groups = ('app', 'locale', 'os', 'status', 'version')
    group_filepaths = []
    # Make sure we're not trying to update with mismatched data.
    for group in groups:
        if options['stats_source'] == 's3':
            filepath = 's3://' + '/'.join([
                settings.AWS_STATS_S3_BUCKET,
                settings.AWS_STATS_S3_PREFIX,
                'update_counts_by_%s' % group,
                day, '000000_0'
            ])

        elif options['stats_source'] == 'file':
            folder = options['folder_name']
            folder = path.join(settings.TMP_PATH, folder, day)
            filepath = path.join(folder,
                                 'update_counts_by_%s.hive' % group)

        if get_date(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)

        group_filepaths.append((group, filepath))

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    UpdateCount.objects.filter(date=day).delete()

    # Memoize the addons and the UpdateCounts.
    update_counts = {}
    # Perf: preload all the addons once and for all.
    # This builds a dict where each key (the addon guid we get from the
    # hive query) has the addon_id as value.
    guids_to_addon = dict(
        Addon.unfiltered
        .exclude(status=amo.STATUS_NULL)
        .exclude(guid__isnull=True)
        # Shouldn't be necessary to exclude _ADDON_PERSONA now but we've
        # still got a huge number of deleted LWT in the database.
        .exclude(type=9)
        .values_list('guid', 'id'))

    for group, filepath in group_filepaths:
        count_file = get_stats_data(filepath)
        for index, line in enumerate(count_file):
            # Progress log every million lines.
            if index and (index % 1000000) == 0:
                log.info('Processed %s lines' % index)

            splitted = line[:-1].split(sep)

            # 'app' rows carry one more field (app id + app version
            # instead of a single data field).
            if ((group == 'app' and len(splitted) != 6) or
                    (group != 'app' and len(splitted) != 5)):
                log.debug('Badly formatted row: %s' % line)
                continue

            # Note: this rebinds `day` to the value found in the row.
            if group == 'app':
                day, addon_guid, app_id, app_ver, count, \
                    update_type = splitted
            else:
                day, addon_guid, data, count, update_type = splitted

            addon_guid = addon_guid.strip()
            if update_type:
                # str.strip() returns a new string: the result has to be
                # assigned, otherwise padded "empty-like" values miss the
                # check below and the row is dropped by int().
                update_type = update_type.strip()

            # Old versions of Firefox don't provide the update type.
            # All the following are "empty-like" values.
            if update_type in ['0', 'NULL', 'None', '', '\\N',
                               '%UPDATE_TYPE%']:
                update_type = None

            try:
                count = int(count)
                if update_type:
                    update_type = int(update_type)
            except ValueError:  # Badly formatted? Drop.
                continue

            # The following is magic that I don't understand. I've just
            # been told that this is the way we can make sure a request
            # is valid:
            # > the lower bits for updateType (eg 112) should add to
            # > 16, if not, ignore the request.
            # > updateType & 31 == 16 == valid request.
            if update_type and update_type & 31 != 16:
                log.debug("Update type doesn't add to 16: %s" %
                          update_type)
                continue

            # Does this addon exist?
            if addon_guid and addon_guid in guids_to_addon:
                addon_id = guids_to_addon[addon_guid]
            else:
                # addon_guid has already been stripped above.
                log.debug(u"Addon {guid} doesn't exist.".format(
                    guid=addon_guid))
                continue

            # Memoize the UpdateCount.
            if addon_guid in update_counts:
                uc = update_counts[addon_guid]
            else:
                uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                update_counts[addon_guid] = uc

            # We can now fill the UpdateCount object.
            if group == 'version':
                self.update_version(uc, data, count)
            elif group == 'status':
                self.update_status(uc, data, count)
                if data == UPDATE_COUNT_TRIGGER:
                    # Use this count to compute the global number
                    # of daily users for this addon.
                    uc.count += count
            elif group == 'app':
                self.update_app(uc, app_id, app_ver, count)
            elif group == 'os':
                self.update_os(uc, data, count)
            elif group == 'locale':
                self.update_locale(uc, data, count)

    # Make sure the locales and versions fields aren't too big to fit in
    # the database. Those two fields are the only ones that are not fully
    # validated, so we could end up with just anything in there (spam,
    # buffer overflow attempts and the like).
    # We don't care that they will increase the numbers, but we do not want
    # those to break the process because of a "Data too long for column
    # 'version'" error.
    # The database field (TEXT), can hold up to 2^16 = 64k characters.
    # If the field is longer than that, we drop the least used items
    # (with the lower count) until the field fits.
    for update_count in update_counts.values():
        self.trim_field(update_count.locales)
        self.trim_field(update_count.versions)

    # Create in bulk: this is much faster.
    UpdateCount.objects.bulk_create(update_counts.values(), 100)

    # `index` is the loop variable of the last file read above.
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up files.
    if options['stats_source'] == 'file':
        for _, filepath in group_filepaths:
            log.debug('Deleting {path}'.format(path=filepath))
            unlink(filepath)
def setUp(self):
    """Create add-ons with popular, unpopular and barely popular stats.

    Three series of 21 daily update counts are stored: the first for the
    extension, the static theme and the add-on awaiting review; the second
    for the two unpopular add-ons; the third for the barely popular theme
    and the add-on sharing its stats.
    """
    self.extension = addon_factory()
    self.static_theme = addon_factory(type=amo.ADDON_STATICTHEME)
    self.unpopular_extension = addon_factory()
    self.unpopular_theme = addon_factory(type=amo.ADDON_STATICTHEME)
    self.barely_popular_theme = addon_factory(type=amo.ADDON_STATICTHEME)
    self.same_stats_as_barely_popular_theme = addon_factory()
    self.awaiting_review = addon_factory(status=amo.STATUS_NOMINATED)

    today = datetime.date.today()

    def dated(daily_counts):
        # Pair each count with its day: first count is yesterday's, the
        # second is from two days ago, and so on.
        return [
            (today - datetime.timedelta(days=age), total)
            for age, total in enumerate(daily_counts, 1)
        ]

    stats = dated((
        827080, 787930, 995860, 1044260, 105431, 106065, 980930,
        817100, 78843, 993830, 104431, 105943, 105039, 100183,
        82265, 100183, 82265, 100183, 82265, 100183, 82265,
    ))
    unpopular_stats = dated((
        99, 76, 25, 32, 289, 34, 45,
        25, 78, 36, 25, 100, 156, 24,
        9, 267, 176, 16, 156, 187, 149,
    ))
    barely_popular_stats = dated((
        399, 276, 215, 312, 289, 234, 345,
        205, 178, 336, 325, 400, 456, 324,
        290, 267, 276, 216, 256, 287, 249,
    ))

    fixtures = (
        ((self.extension, self.static_theme, self.awaiting_review),
         stats),
        ((self.unpopular_extension, self.unpopular_theme),
         unpopular_stats),
        ((self.barely_popular_theme,
          self.same_stats_as_barely_popular_theme),
         barely_popular_stats),
    )
    for addons, series in fixtures:
        for addon in addons:
            UpdateCount.objects.bulk_create([
                UpdateCount(addon=addon, date=day, count=total)
                for day, total in series
            ])