def update_addon_average_daily_users(chunk_size=250):
    """Update add-ons ADU totals."""
    if not waffle.switch_is_active('local-statistics-processing'):
        return False

    counts = dict(
        # In order to reset the `average_daily_users` values of add-ons that
        # don't exist in BigQuery, we prepare a set of `(guid, 0)` for most
        # add-ons.
        Addon.objects.filter(type__in=amo.ADDON_TYPES_WITH_STATS)
        .exclude(guid__isnull=True)
        .exclude(guid__exact='')
        .exclude(average_daily_users=0)
        .annotate(count=Value(0, IntegerField()))
        .values_list('guid', 'count')
        # Just to make order predictable in tests, we order by id. This
        # matches the GROUP BY being generated so it should be safe.
        .order_by('id')
    )
    # Update the `counts` with values from BigQuery.
    counts.update(get_addons_and_average_daily_users_from_bigquery())
    counts = list(counts.items())

    log.info('Preparing update of `average_daily_users` for %s add-ons.',
             len(counts))

    create_chunked_tasks_signatures(
        _update_addon_average_daily_users, counts, chunk_size
    ).apply_async()
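
# Every job in this section delegates chunking to
# `create_chunked_tasks_signatures`. As a rough, hedged sketch (not the
# actual olympia implementation), the helper splits the items into
# fixed-size chunks and wraps each chunk in an immutable task signature,
# returning a celery group:
from celery import group


def create_chunked_tasks_signatures_sketch(task, items, chunk_size,
                                           task_args=None):
    """Illustrative only: chunk `items` and build one signature per chunk."""
    task_args = task_args or ()
    chunks = [
        items[i:i + chunk_size] for i in range(0, len(items), chunk_size)
    ]
    # `.si()` builds an immutable signature, so a chunk task does not receive
    # the result of a preceding task when the group is used inside a chain.
    return group(task.si(chunk, *task_args) for chunk in chunks)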
def gather_index_stats_tasks(index, addons=None, dates=None):
    """
    Return the list of task groups to execute to index statistics for the
    given index/dates/addons.
    """
    queries = [
        (UpdateCount.objects, index_update_counts, {'date': 'date'}),
        (DownloadCount.objects, index_download_counts, {'date': 'date'}),
    ]

    jobs = []

    for qs, task, fields in queries:
        date_field = fields['date']

        if dates or addons:
            qs = qs.order_by('-%s' % date_field)

        qs = qs.values_list('id', flat=True)

        if addons:
            pks = [int(a.strip()) for a in addons.split(',')]
            qs = qs.filter(addon__in=pks)

        if dates:
            if ':' in dates:
                qs = qs.filter(**{
                    '%s__range' % date_field: dates.split(':')})
            else:
                qs = qs.filter(**{date_field: dates})

        if not (dates or addons):
            # We're loading the whole world. Do it in stages so we get most
            # recent stats first and don't do huge queries.
            limits = (
                qs.model.objects.filter(**{'%s__isnull' % date_field: False})
                .extra(where=['%s <> "0000-00-00"' % date_field])
                .aggregate(min=Min(date_field), max=Max(date_field)))
            # If there isn't any data at all, skip over.
            if not (limits['max'] or limits['min']):
                continue

            num_days = (limits['max'] - limits['min']).days
            for start in range(0, num_days, STEP):
                stop = start + STEP - 1
                date_range = (limits['max'] - timedelta(days=stop),
                              limits['max'] - timedelta(days=start))
                data = list(qs.filter(**{
                    '%s__range' % date_field: date_range}))
                if data:
                    jobs.append(create_chunked_tasks_signatures(
                        task, data, CHUNK_SIZE, task_args=(index,)))
        else:
            jobs.append(create_chunked_tasks_signatures(
                task, list(qs), CHUNK_SIZE, task_args=(index,)))

    return jobs
def run_yara_query_rule(query_rule_pk):
    """
    Run a specific ScannerQueryRule on multiple Versions.
    """
    # We're not forcing this task to happen on primary db to let the replicas
    # handle the Version query below, but we want to fetch the rule using the
    # primary db in all cases.
    rule = ScannerQueryRule.objects.using('default').get(pk=query_rule_pk)
    # Build a huge list of all pks we're going to run the tasks on.
    pks = Version.unfiltered.all().filter(
        addon__type=amo.ADDON_EXTENSION,
        files__is_webextension=True,
    ).exclude(
        addon__status=amo.STATUS_DISABLED,
    ).filter(
        Q(channel=amo.RELEASE_CHANNEL_UNLISTED) |
        Q(channel=amo.RELEASE_CHANNEL_LISTED,
          pk=F('addon___current_version'))
    ).values_list('id', flat=True).order_by('pk')
    rule.update(state=RUNNING)
    # Build the workflow using a group of tasks dealing with 250 files at a
    # time, chained to a task that marks the query as completed.
    chunk_size = 250
    workflow = (
        create_chunked_tasks_signatures(
            run_yara_query_rule_on_versions_chunk, list(pks), chunk_size,
            task_args=(query_rule_pk,)) |
        mark_yara_query_rule_as_completed.si(query_rule_pk))
    # Fire it up.
    workflow.apply_async()
def handle(self, *args, **options):
    ids = AddonGUID.objects.filter(hashed_guid=None).values_list(
        'id', flat=True)
    chunked_tasks = create_chunked_tasks_signatures(
        backfill_hashed_guids, items=list(ids), chunk_size=100)
    chunked_tasks.apply_async()
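
# `backfill_hashed_guids` itself is not shown above. A minimal sketch of what
# one chunk of ids might do, assuming `hashed_guid` is the hex SHA-256 digest
# of the guid (an assumption, not confirmed by the code in this section):
import hashlib


def backfill_hashed_guids_sketch(ids):
    for addon_guid in AddonGUID.objects.filter(id__in=ids):
        # Hypothetical hashing scheme; adjust to match the real model logic.
        addon_guid.hashed_guid = hashlib.sha256(
            addon_guid.guid.encode()).hexdigest()
        addon_guid.save()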
def get_indexing_tasks_for_qs(qs):
    index_data_tasks = create_chunked_tasks_signatures(
        cls.get_indexing_task(), qs, cls.CHUNK_SIZE,
        task_args=(index_name,))
    # Unwrap the tasks from the group create_chunked_tasks_signatures()
    # returned, we'll create our own flat group with all the tasks,
    # no need to create unnecessary nesting.
    return index_data_tasks.tasks
def reindex_tasks_group(index_name):
    """
    Return the group of tasks to execute for a full reindex of addons on the
    index called `index_name` (which is not an alias but the real index
    name).
    """
    from olympia.addons.models import Addon
    from olympia.addons.tasks import index_addons

    ids = Addon.unfiltered.values_list('id', flat=True).order_by('id')
    chunk_size = 150
    return create_chunked_tasks_signatures(index_addons, list(ids),
                                           chunk_size)
def run_yara_query_rule(query_rule_pk):
    """
    Run a specific ScannerQueryRule on multiple Versions.

    Needs the rule to be in the SCHEDULED state, otherwise does nothing.
    """
    # We're not forcing this task to happen on primary db to let the replicas
    # handle the Version query below, but we want to fetch the rule using the
    # primary db in all cases.
    rule = ScannerQueryRule.objects.using('default').get(pk=query_rule_pk)
    try:
        rule.change_state_to(RUNNING)
    except ImproperScannerQueryRuleStateError:
        log.error(
            'Not proceeding with run_yara_query_rule on rule %s because '
            'its state is %s', rule.pk, rule.get_state_display())
        return
    log.info('Fetching versions for run_yara_query_rule on rule %s', rule.pk)
    # Build a huge list of all pks we're going to run the tasks on.
    qs = Version.unfiltered.filter(
        addon__type=amo.ADDON_EXTENSION,
        files__is_webextension=True,
    )
    if not rule.run_on_disabled_addons:
        qs = qs.exclude(addon__status=amo.STATUS_DISABLED)
    qs = qs.values_list('id', flat=True).order_by('pk')
    # Build the workflow using a group of tasks dealing with 250 files at a
    # time, chained to a task that marks the query as completed.
    chunk_size = 250
    chunked_tasks = create_chunked_tasks_signatures(
        run_yara_query_rule_on_versions_chunk, list(qs), chunk_size,
        task_args=(query_rule_pk,),
    )
    # Force the group id to be generated for those tasks, and store it in the
    # result backend.
    group_result = chunked_tasks.freeze()
    group_result.save()
    rule.update(
        task_count=len(chunked_tasks),
        celery_group_result_id=uuid.UUID(group_result.id),
    )
    workflow = (
        chunked_tasks |
        mark_yara_query_rule_as_completed_or_aborted.si(query_rule_pk)
    )
    log.info(
        'Running workflow of %s tasks for run_yara_query_rule on rule %s',
        len(chunked_tasks), rule.pk,
    )
    # Fire it up.
    workflow.apply_async()
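
# Because the group id is persisted on the rule, progress can be checked
# later through celery's standard GroupResult API. A hedged sketch (this
# monitoring code is not part of the source above):
from celery.result import GroupResult


def yara_query_rule_progress_sketch(rule):
    group_result = GroupResult.restore(str(rule.celery_group_result_id))
    if group_result is None:
        return None
    # completed_count() returns the number of tasks that finished
    # successfully, which can be compared against rule.task_count.
    return group_result.completed_count(), rule.task_count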
def handle(self, *args, **options):
    log = olympia.core.logger.getLogger('z.files')
    files = File.objects.filter(
        is_webextension=True,
        version__addon__type=amo.ADDON_EXTENSION).order_by('pk')
    pks = files.values_list('pk', flat=True)
    log.info('Using %s file pks to extract permissions' % pks.count())
    if pks:
        chunked_tasks = create_chunked_tasks_signatures(
            extract_optional_permissions, list(pks), chunk_size=100)
        chunked_tasks.apply_async()
def deliver_hotness(chunk_size=300):
    """
    Calculate hotness of all add-ons.

    a = avg(users this week)
    b = avg(users three weeks before this week)
    threshold = 250 if addon type is theme, else 1000
    hotness = (a-b) / b if a > threshold and b > 1 else 0
    """
    frozen_guids = list(
        set(fa.addon.guid for fa in FrozenAddon.objects.all()))
    averages = get_averages_by_addon_from_bigquery(
        today=date.today(), exclude=frozen_guids)

    create_chunked_tasks_signatures(
        update_addon_hotness, averages.items(), chunk_size).apply_async()

    # Reset add-ons that won't be returned by BigQuery.
    addon_ids = (
        Addon.objects.filter(status__in=amo.REVIEWED_STATUSES)
        .filter(hotness__gt=0)
        .exclude(guid__in=averages.keys())
        .values_list('id', flat=True))
    create_chunked_tasks_signatures(
        reset_addon_hotness, addon_ids, chunk_size).apply_async()
def update_addon_hotness(chunk_size=300):
    """
    Calculate hotness of all add-ons.

    a = avg(users this week)
    b = avg(users three weeks before this week)
    threshold = 250 if addon type is theme, else 1000
    hotness = (a-b) / b if a > threshold and b > 1 else 0
    """
    frozen_guids = list(
        {fa.addon.guid for fa in FrozenAddon.objects.all() if fa.addon.guid}
    )
    log.info('Found %s frozen add-on GUIDs.', len(frozen_guids))

    amo_guids = (
        Addon.objects.exclude(guid__in=frozen_guids)
        .exclude(guid__isnull=True)
        .exclude(guid__exact='')
        .exclude(hotness=0)
        .values_list('guid', flat=True)
    )
    averages = {
        guid: {'avg_this_week': 1, 'avg_three_weeks_before': 1}
        for guid in amo_guids
    }
    log.info('Found %s add-on GUIDs in AMO DB.', len(averages))

    bq_averages = get_averages_by_addon_from_bigquery(
        today=date.today(), exclude=frozen_guids
    )
    log.info('Found %s add-on GUIDs with averages in BigQuery.',
             len(bq_averages))

    averages.update(bq_averages)
    log.info('Preparing update of `hotness` for %s add-ons.', len(averages))

    create_chunked_tasks_signatures(
        _update_addon_hotness, averages.items(), chunk_size
    ).apply_async()
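
# `_update_addon_hotness` is not shown above. A minimal sketch that applies
# the formula from the docstring to one chunk of (guid, averages) pairs; the
# thresholds come from the docstring, while the "theme" type check and the
# update call are assumptions for illustration only:
def _update_addon_hotness_sketch(averages):
    for guid, average in averages:
        addon = Addon.objects.filter(guid=guid).first()
        if addon is None:
            continue
        a = average['avg_this_week']
        b = average['avg_three_weeks_before']
        # Assumed mapping of "theme" to the static theme add-on type.
        threshold = 250 if addon.type == amo.ADDON_STATICTHEME else 1000
        # hotness = (a-b) / b if a > threshold and b > 1 else 0
        hotness = (a - b) / b if a > threshold and b > 1 else 0
        addon.update(hotness=hotness)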
def update_addon_weekly_downloads(chunk_size=250):
    """
    Update 7-day add-on download counts.
    """
    counts = dict(
        # In order to reset the `weekly_downloads` values of add-ons that
        # don't exist in BigQuery, we prepare a set of `(hashed_guid, 0)`
        # for most add-ons.
        Addon.objects.filter(type__in=amo.ADDON_TYPES_WITH_STATS)
        .exclude(guid__isnull=True)
        .exclude(guid__exact='')
        .exclude(weekly_downloads=0)
        .annotate(count=Value(0, IntegerField()))
        .values_list('addonguid__hashed_guid', 'count')
    )
    # Update the `counts` with values from BigQuery.
    counts.update(get_addons_and_weekly_downloads_from_bigquery())
    counts = list(counts.items())

    log.info('Preparing update of `weekly_downloads` for %s add-ons.',
             len(counts))

    create_chunked_tasks_signatures(
        _update_addon_weekly_downloads, counts, chunk_size
    ).apply_async()
def update_addon_weekly_downloads(chunk_size=250):
    """
    Update 7-day add-on download counts.
    """
    if waffle.switch_is_active('use-bigquery-for-download-stats-cron'):
        counts = dict(
            # In order to reset the `weekly_downloads` values of add-ons
            # that don't exist in BigQuery, we prepare a set of `(guid, 0)`
            # for most add-ons.
            Addon.objects.filter(type__in=amo.ADDON_TYPES_WITH_STATS)
            .exclude(guid__isnull=True)
            .exclude(guid__exact='')
            .exclude(weekly_downloads=0)
            .annotate(count=Value(0, IntegerField()))
            .values_list('guid', 'count')
        )
        # Update the `counts` with values from BigQuery.
        counts.update(get_addons_and_weekly_downloads_from_bigquery())
        counts = list(counts.items())

        log.info('Preparing update of `weekly_downloads` for %s add-ons.',
                 len(counts))

        create_chunked_tasks_signatures(
            _update_addon_weekly_downloads, counts, chunk_size
        ).apply_async()
    else:
        raise_if_reindex_in_progress('amo')

        with connection.cursor() as cursor:
            cursor.execute("""
                SELECT addon_id, SUM(count) AS weekly_count
                FROM download_counts
                WHERE `date` >= DATE_SUB(CURDATE(), INTERVAL 7 DAY)
                GROUP BY addon_id
                ORDER BY addon_id""")
            counts = cursor.fetchall()

        addon_ids = [r[0] for r in counts]
        if not addon_ids:
            return

        with connection.cursor() as cursor:
            cursor.execute("""
                SELECT id, 0
                FROM addons
                WHERE id NOT IN %s""", (addon_ids,))
            counts += cursor.fetchall()

            cursor.execute("""
                CREATE TEMPORARY TABLE tmp_wd
                (addon_id INT PRIMARY KEY, count INT)""")
            cursor.execute(
                'INSERT INTO tmp_wd VALUES %s' %
                ','.join(['(%s,%s)'] * len(counts)),
                list(itertools.chain(*counts)))

            cursor.execute("""
                UPDATE addons INNER JOIN tmp_wd
                    ON addons.id = tmp_wd.addon_id
                SET weeklydownloads = tmp_wd.count""")
            cursor.execute("DROP TABLE IF EXISTS tmp_wd")
def gather_index_stats_tasks(index, addons=None, dates=None):
    """
    Return the list of task groups to execute to index statistics for the
    given index/dates/addons.
    """
    queries = [
        (UpdateCount.objects, index_update_counts, {'date': 'date'}),
        (DownloadCount.objects, index_download_counts, {'date': 'date'}),
        (ThemeUserCount.objects, index_theme_user_counts, {'date': 'date'}),
    ]

    jobs = []

    for qs, task, fields in queries:
        date_field = fields['date']

        if dates or addons:
            qs = qs.order_by('-%s' % date_field)

        qs = qs.values_list('id', flat=True)

        if addons:
            pks = [int(a.strip()) for a in addons.split(',')]
            qs = qs.filter(addon__in=pks)

        if dates:
            if ':' in dates:
                qs = qs.filter(**{
                    '%s__range' % date_field: dates.split(':')})
            else:
                qs = qs.filter(**{date_field: dates})

        if not (dates or addons):
            # We're loading the whole world. Do it in stages so we get most
            # recent stats first and don't do huge queries.
            limits = (
                qs.model.objects.filter(**{'%s__isnull' % date_field: False})
                .extra(where=['%s <> "0000-00-00"' % date_field])
                .aggregate(min=Min(date_field), max=Max(date_field)))
            # If there isn't any data at all, skip over.
            if not (limits['max'] or limits['min']):
                continue

            num_days = (limits['max'] - limits['min']).days
            for start in range(0, num_days, STEP):
                stop = start + STEP - 1
                date_range = (limits['max'] - timedelta(days=stop),
                              limits['max'] - timedelta(days=start))
                data = list(qs.filter(**{
                    '%s__range' % date_field: date_range}))
                if data:
                    jobs.append(create_chunked_tasks_signatures(
                        task, data, CHUNK_SIZE, task_args=(index,)))
        else:
            jobs.append(create_chunked_tasks_signatures(
                task, list(qs), CHUNK_SIZE, task_args=(index,)))

    return jobs