def handle(self, *args, **options):
    """Load one day of hive update-count results into ``UpdateCount`` rows.

    Reads one ``update_counts_by_<group>.hive`` file per group (version,
    status, app, os, locale) from ``settings.TMP_PATH/<folder>/<day>``,
    aggregates the counts per add-on guid, replaces the ``UpdateCount`` rows
    stored for that day and finally deletes the processed files.

    Args:
        *args: optional; ``args[0]`` is the results folder name and
            defaults to ``'hive_results'``.
        **options: must provide ``'date'`` (a ``YYYY-MM-DD`` string; a falsy
            value means yesterday) and ``'separator'`` (column separator).

    Raises:
        CommandError: when ``get_date_from_file`` reports, for any of the
            files, a date different from the day being processed.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    folder = args[0] if args else 'hive_results'
    folder = path.join(settings.TMP_PATH, folder, day)
    sep = options['separator']
    groups = ('version', 'status', 'app', 'os', 'locale')
    group_filepaths = []
    # Make sure we're not trying to update with mismatched data.
    for group in groups:
        filepath = path.join(folder, 'update_counts_by_%s.hive' % group)
        if get_date_from_file(filepath, sep) != day:
            raise CommandError('%s file contains data for another day' %
                               filepath)
        group_filepaths.append((group, filepath))

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    UpdateCount.objects.filter(date=day).delete()

    # Memoize the UpdateCounts, keyed by add-on guid.
    update_counts = {}

    # Perf: preload all the addons once and for all.
    # This builds a dict where each key (the addon guid we get from the
    # hive query) has the addon_id as value.
    guids_to_addon = dict(
        Addon.objects.exclude(guid__isnull=True)
        .filter(type=amo.ADDON_EXTENSION)
        .values_list('guid', 'id'))

    # "Empty-like" values seen for the update type; old versions of Firefox
    # don't provide it. '\\N' is a literal backslash followed by N (spelled
    # with an escaped backslash so the literal is valid on every Python).
    empty_update_types = ('0', 'NULL', 'None', '', '\\N', '%UPDATE_TYPE%')

    index = -1
    for group, filepath in group_filepaths:
        with open(filepath) as results_file:
            for line in results_file:
                index += 1
                if index and (index % 1000000) == 0:
                    log.info('Processed %s lines' % index)

                splitted = line[:-1].split(sep)

                if ((group == 'app' and len(splitted) != 6) or
                        (group != 'app' and len(splitted) != 5)):
                    log.debug('Badly formatted row: %s' % line)
                    continue

                # NOTE(review): the date stored on the UpdateCount comes
                # from the row itself, not from the --date option; only
                # get_date_from_file() ties the two together. Confirm this
                # is intended.
                if group == 'app':
                    (row_day, addon_guid, app_id, app_ver, count,
                     update_type) = splitted
                else:
                    row_day, addon_guid, data, count, update_type = splitted

                # str.strip() returns a new string: keep the result,
                # otherwise padded "empty-like" values are not recognised
                # below and the row gets dropped by the int() conversion.
                if update_type:
                    update_type = update_type.strip()

                if update_type in empty_update_types:
                    update_type = None

                try:
                    count = int(count)
                    if update_type:
                        update_type = int(update_type)
                except ValueError:  # Badly formatted? Drop.
                    continue

                # The following is magic that I don't understand. I've just
                # been told that this is the way we can make sure a request
                # is valid:
                # > the lower bits for updateType (eg 112) should add to
                # > 16, if not, ignore the request.
                # > updateType & 31 == 16 == valid request.
                if update_type and update_type & 31 != 16:
                    log.debug("Update type doesn't add to 16: %s" %
                              update_type)
                    continue

                # Does this addon exist?
                if addon_guid.strip() and addon_guid in guids_to_addon:
                    addon_id = guids_to_addon[addon_guid]
                else:
                    continue

                # Memoize the UpdateCount.
                if addon_guid in update_counts:
                    uc = update_counts[addon_guid]
                else:
                    uc = UpdateCount(date=row_day, addon_id=addon_id,
                                     count=0)
                    update_counts[addon_guid] = uc

                # We can now fill the UpdateCount object.
                if group == 'version':
                    # Take this count as the global number of daily users.
                    uc.count += count
                    uc.versions = update_inc(uc.versions, data, count)
                elif group == 'status':
                    uc.statuses = update_inc(uc.statuses, data, count)
                elif group == 'app':
                    # Applications is a dict of dicts, eg:
                    # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":
                    #       {"10.0": 2, "21.0": 1, ....},
                    #  "some other application guid": ...
                    # }
                    if uc.applications is None:
                        uc.applications = {}
                    app = uc.applications.get(app_id, {})
                    # Now overwrite this application's dict with
                    # incremented counts for its versions.
                    uc.applications.update(
                        {app_id: update_inc(app, app_ver, count)})
                elif group == 'os':
                    uc.oses = update_inc(uc.oses, data, count)
                elif group == 'locale':
                    # Drop incorrect locales sizes.
                    if len(data) > 10:
                        continue
                    # Collapse locales to `xx_yy` if possible.
                    data = data.strip().lower().replace('-', '_')
                    uc.locales = update_inc(uc.locales, data, count)

    # Create in bulk: this is much faster.
    UpdateCount.objects.bulk_create(update_counts.values(), 100)
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))

    # Clean up files.
    for _, filepath in group_filepaths:
        log.debug('Deleting {path}'.format(path=filepath))
        unlink(filepath)
def handle(self, *args, **options):
    """Import one day of hive update counts into ``UpdateCount`` rows.

    One ``update_counts_by_<group>.hive`` file per group (version, status,
    app, os, locale) is read from
    ``settings.NETAPP_STORAGE/tmp/<folder>/<day>``. Counts are accumulated
    per add-on guid and written with a single bulk insert, after removing
    the ``UpdateCount`` rows already stored for that day.

    Args:
        *args: optional; ``args[0]`` names the results folder (default
            ``'hive_results'``).
        **options: must provide ``'date'`` (``YYYY-MM-DD`` string, falsy
            meaning yesterday) and ``'separator'`` (column separator).

    Raises:
        CommandError: when ``get_date_from_file`` returns, for any file, a
            date other than the day being processed.
    """
    started_at = datetime.now()  # Used to report the total run time.

    target_day = options['date']
    if not target_day:
        yesterday = datetime.now() - timedelta(days=1)
        target_day = yesterday.strftime('%Y-%m-%d')

    subfolder = args[0] if args else 'hive_results'
    base_dir = path.join(
        settings.NETAPP_STORAGE, 'tmp', subfolder, target_day)
    separator = options['separator']

    # Refuse to run on mismatched data: every file must be for target_day.
    sources = []
    for group in ('version', 'status', 'app', 'os', 'locale'):
        hive_file = path.join(base_dir, 'update_counts_by_%s.hive' % group)
        if get_date_from_file(hive_file, separator) != target_day:
            raise CommandError(
                '%s file contains data for another day' % hive_file)
        sources.append((group, hive_file))

    # Wipe the counts already stored for that day, otherwise the same data
    # would simply be added on top of them.
    UpdateCount.objects.filter(date=target_day).delete()

    # UpdateCount objects being built, keyed by add-on guid.
    counts_by_guid = {}

    # Perf: load the guid -> add-on id mapping once, up front.
    addon_ids_by_guid = dict(
        Addon.objects
        .exclude(guid__isnull=True)
        .filter(type=amo.ADDON_EXTENSION)
        .values_list('guid', 'id'))

    # Groups whose value is simply accumulated on one UpdateCount attribute.
    plain_attrs = {'status': 'statuses', 'os': 'oses'}

    line_no = -1
    for group, hive_file in sources:
        is_app = group == 'app'
        # "app" rows carry one more column (application id + version).
        expected_fields = 6 if is_app else 5

        with open(hive_file) as rows:
            for raw in rows:
                line_no += 1
                if line_no and line_no % 1000000 == 0:
                    log.info('Processed %s lines' % line_no)

                fields = raw[:-1].split(separator)
                if len(fields) != expected_fields:
                    log.debug('Badly formatted row: %s' % raw)
                    continue

                if is_app:
                    (row_day, guid, app_id, app_ver,
                     count, update_type) = fields
                else:
                    row_day, guid, data, count, update_type = fields

                try:
                    count = int(count)
                    update_type = int(update_type)
                except ValueError:
                    # Non numeric values: drop the row.
                    continue

                # Validity rule as it was handed down, not derived here:
                # > the lower bits for updateType (eg 112) should add to
                # > 16, if not, ignore the request.
                # > updateType & 31 == 16 == valid request.
                if (update_type & 31) != 16:
                    log.debug("Update type doesn't add to 16: %s" %
                              update_type)
                    continue

                # Ignore blank guids and add-ons we don't know about.
                if not guid.strip() or guid not in addon_ids_by_guid:
                    continue

                # Reuse the UpdateCount already started for this add-on.
                try:
                    uc = counts_by_guid[guid]
                except KeyError:
                    uc = UpdateCount(date=row_day,
                                     addon_id=addon_ids_by_guid[guid],
                                     count=0)
                    counts_by_guid[guid] = uc

                if is_app:
                    # Applications is a dict of dicts, eg:
                    # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":
                    #       {"10.0": 2, "21.0": 1, ....},
                    #  "some other application guid": ...
                    # }
                    if uc.applications is None:
                        uc.applications = {}
                    per_version = uc.applications.get(app_id, {})
                    # Replace this application's entry with its versions'
                    # incremented counts.
                    bumped = update_inc(per_version, app_ver, count)
                    uc.applications.update({app_id: bumped})
                elif group == 'version':
                    # This count doubles as the global number of daily
                    # users.
                    uc.count += count
                    uc.versions = update_inc(uc.versions, data, count)
                elif group == 'locale':
                    # Over-long locales are considered incorrect: skip.
                    if len(data) <= 10:
                        # Collapse locales to `xx_yy` if possible.
                        locale = data.strip().lower().replace('-', '_')
                        uc.locales = update_inc(uc.locales, locale, count)
                elif group in plain_attrs:
                    attr = plain_attrs[group]
                    setattr(uc, attr,
                            update_inc(getattr(uc, attr), data, count))

    # A single bulk insert is much faster than saving one by one.
    UpdateCount.objects.bulk_create(counts_by_guid.values(), 100)
    log.info('Processed a total of %s lines' % (line_no + 1))
    log.debug('Total processing time: %s' % (datetime.now() - started_at))
def handle(self, *args, **options):
    """Load the update counts of one day from a single separated file.

    Each well-formed row holds nine columns: counter, add-on guid, add-on
    version, status (possibly several, comma separated), application id,
    application version, os, locale and update type. Rows are aggregated
    per add-on guid into ``UpdateCount`` objects which replace the ones
    already stored for that day.

    Args:
        *args: ``args[0]`` is the path of the file to read.
        **options: must provide ``'date'`` (the day the counts belong to)
            and ``'separator'`` (column separator).

    Raises:
        CommandError: when no date or no file name is given.
    """
    start = datetime.now()  # Measure the time it takes to run the script.
    day = options['date']
    if not day:
        raise CommandError('You must specify a --date parameter in the '
                           ' YYYY-MM-DD format.')
    sep = options['separator']
    if not args:
        # Fail with a readable message rather than a bare IndexError.
        raise CommandError('You must specify the file to load the update '
                           'counts from.')
    filename = args[0]

    # First, make sure we don't have any existing counts for the same day,
    # or it would just increment again the same data.
    UpdateCount.objects.filter(date=day).delete()

    # Memoize the UpdateCounts, keyed by add-on guid.
    update_counts = {}

    # Perf: preload all the addons once and for all.
    # This builds a dict where each key (the addon guid we get from the
    # hive query) has the addon_id as value.
    guids_to_addon = dict(Addon.objects.values_list('guid', 'id'))

    # Stays at -1 when the file is empty, so the final total reads 0
    # instead of failing on an unbound loop variable.
    index = -1
    with open(filename) as count_file:
        for index, line in enumerate(count_file):
            if index and (index % 10000) == 0:
                log.info('Processed %s lines' % index)

            splitted = line[:-1].split(sep)

            # Nine columns are unpacked below, so nine are required here.
            if len(splitted) != 9:
                log.debug('Badly formatted row: %s' % line)
                continue

            # NOTE(review): the sixth column is presumably the application
            # version (it follows the application id) -- confirm against
            # the query producing the file. It needs its own name so it
            # does not overwrite the add-on version.
            (counter, addon_guid, version, status, app_id, app_version,
             app_os, locale, update_type) = splitted
            try:
                counter = int(counter)
                update_type = int(update_type)
            except ValueError:  # Badly formatted? Drop.
                continue

            # The following is magic that I don't understand. I've just
            # been told that this is the way we can make sure a request is
            # valid:
            # > the lower bits for updateType=112 should add to 16, if not,
            # > ignore the request. updateType & 31 == 16 == valid request.
            # The last column is the updateType this quote is talking
            # about. Only the requests that are NOT valid are skipped.
            if update_type & 31 != 16:
                continue

            # We may have several statuses in the same field.
            statuses = status.split(',')

            # Does this addon exist?
            if addon_guid in guids_to_addon:
                addon_id = guids_to_addon[addon_guid]
            else:
                log.info('Addon with guid: %s not found' % addon_guid)
                continue

            # Memoize the UpdateCount.
            if addon_guid in update_counts:
                uc = update_counts[addon_guid]
            else:
                uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                update_counts[addon_guid] = uc

            # We can now fill the UpdateCount object.
            uc.count += counter
            uc.versions = update_inc(uc.versions, version, counter)

            # Applications is a dict of dicts, eg:
            # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":  # Firefox.
            #       {"10.0": 2, "21.0": 1, ....},
            #  "some other application guid": ...
            # }
            if uc.applications is None:
                uc.applications = {}
            app = uc.applications.get(app_id, {})
            # Now overwrite this application's dict with incremented
            # counts for its versions.
            uc.applications.update(
                {app_id: update_inc(app, app_version, counter)})

            uc.oses = update_inc(uc.oses, app_os, counter)
            uc.locales = update_inc(uc.locales, locale, counter)

            # We may have received a list of more than one status.
            for single_status in statuses:
                uc.statuses = update_inc(uc.statuses, single_status,
                                         counter)

    # Create in bulk: this is much faster.
    UpdateCount.objects.bulk_create(update_counts.values(), 100)
    log.info('Processed a total of %s lines' % (index + 1))
    log.debug('Total processing time: %s' % (datetime.now() - start))