Example #1
def run_sweep_and_sleep(sweep_id: str = None):
    """
    Like run_sweep, but actually sleeps for the suggested amount of time before quitting.

    This is used to internalize the management of the pause between consecutive sweep runs.
    It is a crude way of spacing out sweep runs. An alternative would be to turn the
    runner back into a Celery task and use Celery's timed-delay API for recursive
    self-scheduling.
    """
    delay_next_sweep_start_by = run_sweep(sweep_id=sweep_id)
    _measurement_name_base = __name__ + '.run_sweep_and_sleep.'  # <- function name. adjust if changed
    _measurement_tags = {'sweep_id': sweep_id}
    Measure.gauge(_measurement_name_base + 'delay_next_sweep_start_by',
                  tags=_measurement_tags)(int(delay_next_sweep_start_by))
    logger.info(
        f"Done with main sweep run. Waiting for {delay_next_sweep_start_by} seconds before quitting"
    )
    time.sleep(delay_next_sweep_start_by)
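
A minimal sketch of how this function might be driven: since run_sweep_and_sleep internalizes the pause between runs, an outer runner (hypothetical, not part of the source) can simply call it in a loop. The sweep-id format below is an assumption.

import uuid

def sweep_runner_loop():
    # Hypothetical outer loop: run_sweep_and_sleep handles the pause itself,
    # so the caller needs no scheduling logic of its own.
    while True:
        sweep_id = uuid.uuid4().hex  # assumed id format; the real code may derive ids differently
        run_sweep_and_sleep(sweep_id=sweep_id)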
Example #2
def _send_measurement_task_runtime(job_scope: JobScope, bucket: int):
    _measurement_base_name = f'{__name__}.report_tasks_outcome'
    _measurement_tags = {
        'ad_account_id': job_scope.ad_account_id,
        'sweep_id': job_scope.sweep_id,
        'report_type': job_scope.report_type,
        'report_variant': job_scope.report_variant,
        'bucket': bucket,
        'job_type': job_scope.job_type,
    }
    if job_scope.datapoint_count and job_scope.datapoint_count > 0:
        Measure.counter(f'{_measurement_base_name}.data_points',
                        tags=_measurement_tags).increment(
                            job_scope.datapoint_count)
        Measure.histogram(f'{_measurement_base_name}.data_points',
                          tags=_measurement_tags)(job_scope.datapoint_count)

    Measure.gauge(f'{_measurement_base_name}.running_time',
                  tags=_measurement_tags)(job_scope.running_time)
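
Measure here is a project-internal metrics facade whose implementation is not shown. Purely as an assumption, for running these snippets locally, a stand-in consistent with how it is called in the examples (gauges/histograms called with a value, counters with increment() that also work as in-place-addable context managers) could look roughly like this:

from contextlib import AbstractContextManager

class _StubMetric(AbstractContextManager):
    """Stand-in metric: callable like a gauge/histogram, incrementable like a counter."""

    def __init__(self, kind, name, tags=None):
        self.kind, self.name, self.tags = kind, name, dict(tags or {})
        self._pending = 0

    def __call__(self, value):
        # gauge / histogram style: Measure.gauge(name, tags=...)(value)
        print(f'{self.kind} {self.name} {self.tags} -> {value}')

    def increment(self, n=1):
        # counter style: Measure.counter(name, tags=...).increment(n)
        self._pending += n

    def __iadd__(self, n):
        # counter as a context manager: `with Measure.counter(...) as cntr: cntr += n`
        self.increment(n)
        return self

    def __exit__(self, *exc_info):
        # flush whatever accumulated while used as a context manager
        print(f'{self.kind} {self.name} {self.tags} -> {self._pending}')
        return False


class Measure:
    @staticmethod
    def gauge(name, tags=None):
        return _StubMetric('gauge', name, tags)

    @staticmethod
    def counter(name, tags=None):
        return _StubMetric('counter', name, tags)

    @staticmethod
    def histogram(name, tags=None):
        return _StubMetric('histogram', name, tags)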
Example #3
    def ooze_task(self, task: CeleryTask, job_scope: JobScope,
                  job_context: JobContext, score: int):
        """Blocking task oozing function."""
        if OOZER_ENABLE_LEARNING and self.should_review_oozer_rate:
            pulse = self.sweep_status_tracker.get_pulse()
            old_rate = self.oozing_rate
            logger.warning(
                f'Completed {self._tasks_since_review} tasks in {self.secs_since_oozer_rate_review} seconds'
            )
            self.oozing_rate = self.calculate_rate(old_rate, pulse)
            self._rate_review_time = self.current_time()
            self._tasks_since_review = 0
            logger.warning(
                f'Updated oozing rate from {old_rate:.2f} to {self.oozing_rate:.2f}'
            )
            Measure.gauge(f'{__name__}.oozing_rate',
                          tags={'sweep_id': self.sweep_id})(self.oozing_rate)

        if self._tasks_since_review > self.expected_tasks_since_oozer_rate_review:
            gevent.sleep(self.wait_interval)

        self._ooze_task(task, job_scope, job_context, score)
        self._tasks_since_review += 1
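
The rate-adjustment policy itself lives in calculate_rate, which is not shown in this excerpt. Purely as an illustration of the shape such a policy could take (hypothetical, not the project's actual logic), a pulse-driven adjustment might back off sharply when failures dominate and creep up otherwise:

def calculate_rate_sketch(old_rate: float, pulse) -> float:
    # Hypothetical AIMD-style policy, not the project's calculate_rate:
    # halve the rate when the pulse reports a large share of failures,
    # otherwise increase it additively. The `Failure` attribute is assumed.
    failure_share = getattr(pulse, 'Failure', 0.0)
    if failure_share > 0.2:
        return max(1.0, old_rate * 0.5)
    return old_rate + 10.0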
Example #4
def build_sweep(sweep_id: str):
    from sweep_builder.init_tokens import init_tokens
    from sweep_builder.pipeline import iter_pipeline
    from sweep_builder.reality_inferrer.reality import iter_reality_base

    try:
        _measurement_name_base = __name__ + '.' + build_sweep.__name__ + '.'
        _measurement_tags = {'sweep_id': sweep_id}

        # In the jobs persister we purposefully avoid persisting
        # anything besides the Job ID. This means that things like tokens
        # and other data on the *Claim objects are lost.
        # As long as we do that, we need to leave the tokens somewhere
        # for workers to pick up.
        logger.info(f"#{sweep_id} Prepositioning platform tokens")
        init_tokens(sweep_id)

        logger.info(f"#{sweep_id} Starting sweep building")

        # task_group = TaskGroup()
        delayed_tasks = []

        cnt = 0
        with Measure.counter(_measurement_name_base + 'outer_loop',
                             tags=_measurement_tags) as cntr:

            for reality_claim in iter_reality_base():
                # What we get here are Scope and AdAccount objects.
                # Children of AdAccount reality claims are to be processed
                # in separate Celery tasks. But we still have jobs
                # associated with Scope objects, so we need to rate and
                # store those jobs before chipping off a separate task
                # for each AdAccount.
                if reality_claim.entity_type == Entity.AdAccount:

                    # child_task_id = task_group.generate_task_id()
                    # task_group.report_task_active(child_task_id)

                    delayed_tasks.append(
                        # We are using a Celery chord to process AdAccounts in parallel.
                        # For very large numbers of AdAccounts (hundreds of thousands),
                        # chord management becomes very memory-expensive, as the
                        # chord timer/controller looks at the entire list on each tick.
                        # In that case, it is probably better to switch to
                        # a callback per handler plus a mutex/counter somewhere.
                        build_sweep_slice_per_ad_account_task.si(
                            sweep_id,
                            reality_claim,
                            # task_id=child_task_id
                        ))
                elif reality_claim.entity_type == Entity.Page:
                    delayed_tasks.append(
                        build_sweep_slice_per_page.si(sweep_id, reality_claim))
                else:
                    cnt = 1
                    _step = 1000
                    for _ in iter_pipeline(sweep_id, [reality_claim]):
                        cnt += 1
                        if cnt % _step == 0:
                            cntr += _step
                            logger.info(
                                f'#{sweep_id}-root: Queueing up #{cnt}')

                    # Because the counter above reports only whole increments of _step,
                    # we need to report the remainder (the amount under _step).
                    cntr += cnt % _step

        logger.info(f"#{sweep_id}-root: Queued up a total of {cnt} tasks")

        # Here we fan out the actual work to Celery workers
        # and wait for all tasks to finish before returning.
        group_result = group(delayed_tasks).delay()

        # In case the workers crash, go away (scaling), or are otherwise
        # non-responsive, the following would wait indefinitely.
        # Since that's not desirable, and the total sweep build time is minutes
        # at most, we add a reasonable timeout.
        # Because we are not joining on the results, but periodically asking
        # "are you done yet?", we can exit if this threshold is busted and
        # let the next run recover from the situation.
        should_be_done_by = time.time() + (60 * 20)

        Measure.gauge(f'{_measurement_name_base}per_account_sweep.total',
                      tags=_measurement_tags)(len(group_result.results))

        # Monitor the progress. Although this obviously can be achieved with
        # group_result.join(), we need to "see" into the task group progress
        with Measure.gauge(f'{_measurement_name_base}per_account_sweep.done',
                           tags=_measurement_tags) as measure_done:
            while True:
                done_counter = 0
                for result in group_result.results:
                    logger.debug(f'{result}: {result.state}')
                    if result.ready():
                        done_counter += 1

                logger.debug(
                    f"TOTAL: {done_counter}/{len(group_result.results)}")
                logger.debug("=" * 20)

                logger.debug("Checking group result")

                measure_done(done_counter)
                if group_result.ready():
                    logger.debug(f"#{sweep_id}-root: Sweep build complete")
                    break

                # Important: if we don't sleep, the native join in the Celery
                # context switches all the time and we end up at 100% CPU,
                # eventually somehow deadlocking the process. 5 seconds is a
                # somewhat arbitrary number, but it does what we need, and the
                # impact of a (potential) extra delay is minimal.
                time.sleep(5)

                # The last line of defense: the workers did not finish in the
                # time we expected, so there is no point in waiting; kill it.
                if time.time() > should_be_done_by:
                    Measure.gauge(
                        f'{_measurement_name_base}per_account_sweep.early_exits',
                        tags=_measurement_tags)(1)
                    logger.warning(
                        "Exiting incomplete sweep build, it's taking too long")
                    return

        logger.info("Waiting on results join")
        if group_result.supports_native_join:
            group_result.join_native()
        else:
            # Eager mode does not support native join.
            group_result.join()

        # # alternative to Celery's native group_result.join()
        # # our manual task tracking code + join()
        # task_group.join()
        logger.info("Join complete, sweep build ended")
    except Exception as ex:
        ErrorInspector.inspect(ex, None, {'sweep_id': sweep_id})
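
Stripped of the metrics and logging, the dispatch-and-poll pattern in build_sweep reduces to the following generic sketch (plain Celery API, not project code):

import time

from celery import group


def run_group_with_deadline(signatures, deadline_seconds=20 * 60, poll_every=5):
    """Dispatch a Celery group, poll for completion, and give up past the deadline."""
    group_result = group(signatures).delay()
    deadline = time.time() + deadline_seconds

    while not group_result.ready():
        if time.time() > deadline:
            # Workers did not finish in time; let the next run recover.
            return None
        time.sleep(poll_every)

    # Native join is faster, but unavailable in eager mode.
    if group_result.supports_native_join:
        return group_result.join_native()
    return group_result.join()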