def analyze_shadow_schedulers(push: mozci.push.Push) -> dict:
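    """Compare what each shadow scheduler would have run against the
    regressions mozci identified for this push, at both the group and
    config/group granularities."""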
    schedulers = []

    group_regressions = push.get_likely_regressions("group")
    config_group_regressions = push.get_likely_regressions("config_group")

    for name, config_groups in push.generate_all_shadow_scheduler_config_groups():
        if isinstance(config_groups, mozci.errors.TaskNotFound):
            continue

        groups = set(group for config, group in config_groups)

        schedulers.append(
            {
                "name": name,
                "num_group_scheduled": len(groups),
                "num_group_regressions": len(group_regressions),
                "num_group_caught": len(group_regressions & groups),
                "num_config_group_scheduled": len(config_groups),
                "num_config_group_regressions": len(config_group_regressions),
                "num_config_group_caught": len(
                    config_group_regressions & config_groups
                ),
            }
        )

    return {
        "id": push.rev,
        "date": push.date,
        "schedulers": schedulers,
    }
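# A minimal, hypothetical usage sketch (the revision is made up; mozci.push.Push
# accepts a revision and a branch):
# push = mozci.push.Push("0123abcd", branch="autoland")
# summary = analyze_shadow_schedulers(push)
# print(summary["schedulers"])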
        def generate() -> Generator[PushResult, None, None]:
            num_cached = 0

            for push in tqdm(pushes):
                key = cache_key(push)

                if adr.config.cache.has(key):
                    num_cached += 1
                    cached = adr.config.cache.get(key)
                    if cached:
                        value, mozci_version = cached
                        yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()

                        value = (
                            push.revs,
                            list(runnables),
                            list(push.get_possible_regressions(granularity)),
                            list(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                    except Exception:
                        traceback.print_exc()
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")
        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            nonlocal reretrieve
            num_cached = 0
            num_pushes = len(pushes)

            for _ in tqdm(range(num_pushes)):
                push = pushes.pop(0)
                cached = futures.pop(0).result()

                semaphore.release()

                # Regenerating a large amount of data when we update the mozci regression detection
                # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
                # run.
                if cached:
                    value, mozci_version = cached

                    # Regenerate results which were generated with an older version of mozci.
                    if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                        cached = None
                        reretrieve -= 1

                    # Regenerate results which don't contain the fix revision.
                    elif len(value) != 5:
                        cached = None

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    assert len(value) == 5
                    yield value
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            tuple(push.revs),
                            push.backedoutby or push.bustage_fixed_by,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        mozci.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )
                        assert len(value) == 5
                        yield value
                    except mozci.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")
    def generate_push_data(self, runnable):
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60
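        # 10 days expressed in minutes; adr cache retention values appear to
        # be given in minutes, as the 10-day comment above suggests.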

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2
        )
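        # For example, TRAINING_MONTHS[runnable] == 6 yields from_months == 9:
        # 6 months of training data plus 3 extra months for failure statistics.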

        pushes = mozci.push.make_push_objects(
            from_date=f"today-{from_months}month",
            to_date="today-3day",
            branch="autoland",
        )

        start_time = time.monotonic()

        num_cached = 0

        push_data = []

        def cache_key(push):
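            # e.g. "push_data.group.<rev>", where <rev> is the push's revision hash.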
            return f"push_data.{runnable}.{push.rev}"

        # XXX: Some of the old pushes were stored without the mozci version, we
        # need to handle that until all have the version stored alongside them.
        for push in pushes:
            key = cache_key(push)
            cached = adr.config.cache.get(key)
            if not cached or isinstance(cached, tuple):
                continue

            adr.config.cache.forever(key, (cached, 0))

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = set()
        for push in pushes[::-1]:
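            # Iterate newest-first (make_push_objects presumably returns pushes
            # oldest-first), so the most recent stale results get refreshed.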
            cached = adr.config.cache.get(cache_key(push))
            if not cached:
                continue

            value, mozci_version = cached
            if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
                to_regenerate.add(value[0][0])

        for push in tqdm(pushes):
            key = cache_key(push)

            if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
                num_cached += 1
                cached = adr.config.cache.get(key)
                if cached:
                    value, mozci_version = cached
                    push_data.append(value)
            else:
                logger.info(f"Analyzing {push.rev} at the {runnable} level...")

                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.forever(key, (value, MOZCI_VERSION))
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            # Every 10800 seconds (3 hours), upload the adr cache so partial
            # progress survives an interruption.
            if time.monotonic() - start_time >= 10800:
                self.upload_adr_cache()
                start_time = time.monotonic()

        logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
            # run.
            to_regenerate = int(os.environ.get("OLD_RESULTS_TO_REGENERATE", 0))
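            # e.g. running with OLD_RESULTS_TO_REGENERATE=1000 refreshes up to
            # 1000 stale cached results on this run.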

            for _ in tqdm(range(num_pushes)):
                push = pushes.pop(0)
                cached = futures.pop(0).result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if granularity == "group" and any(
                        runnable.startswith("/") for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    # Regenerate results which were generated when we didn't get a correct
                    # configuration for test-verify tasks.
                    elif granularity == "config_group" and any(
                        "test-verify" in runnable[0] for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    # Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION:
                        cached = None
                        to_regenerate -= 1

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")
Example #6
    def generate_push_data(self, runnable):
        def upload_adr_cache():
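            # Sanity check that the directory we are about to archive is the
            # file store adr is actually configured to use.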
            cache_path = os.path.splitext(ADR_CACHE_DB)[0]
            assert os.path.abspath(
                adr.config["cache"]["stores"]["file"]["path"]
            ) == os.path.abspath(cache_path)

            with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
                tar.add(cache_path)

            db.upload(ADR_CACHE_DB)

        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2)

        pushes = mozci.push.make_push_objects(
            from_date=f"today-{from_months}month",
            to_date="today-3day",
            branch="autoland",
        )

        start_time = time.monotonic()

        num_cached = 0

        push_data = []

        for push in tqdm(pushes):
            key = f"push_data.{runnable}.{push.rev}"

            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            if adr.config.cache.has(key):
                num_cached += 1
                cached = adr.config.cache.get(key)
                if cached:
                    # XXX: We have to support items in the cache that were added
                    # before the mozci version was stored. We can drop the if
                    # when all items have been switched over.
                    value = cached[0] if isinstance(cached, tuple) else cached
                    push_data.append(value)
            else:
                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.forever(key, (value, MOZCI_VERSION))
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            if time.monotonic() - start_time >= 3600:
                upload_adr_cache()
                start_time = time.monotonic()

        logger.info(
            f"{num_cached} pushes were already cached out of {len(pushes)}")

        upload_adr_cache()

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
    def generate_push_data(self, runnable):
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2)

        # We use the actual date instead of 'today-X' aliases to avoid adr caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        num_cached = 0

        push_data = []

        def cache_key(push):
            return f"push_data.{runnable}.{push.rev}"

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = set()
        """for push in pushes[::-1]:
            cached = adr.config.cache.get(cache_key(push))
            if not cached:
                continue

            value, mozci_version = cached
            if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
                to_regenerate.add(value[0][0])"""

        def periodically_upload_adr_cache():
            start_time = time.monotonic()
            while not upload_thread_stop.is_set():
                # Upload the adr cache every 10800 seconds (3 hours).
                if time.monotonic() - start_time >= 10800:
                    self.upload_adr_cache()
                    start_time = time.monotonic()

                # Wake every 7 seconds so a stop request is noticed promptly.
                upload_thread_stop.wait(timeout=7)

        upload_thread = threading.Thread(target=periodically_upload_adr_cache)
        upload_thread_stop = threading.Event()
        upload_thread.start()

        # Mirror cache entries to an S3 store, presumably so later runs and
        # other machines can reuse them; cached entries are re-put below as
        # they are read.
        s3_store = adr.util.cache_stores.S3Store({
            "bucket": "communitytc-bugbug",
            "prefix": "data/adr_cache/",
        })

        s3_store.set_serializer(CompressedPickleSerializer())

        for push in tqdm(pushes):
            key = cache_key(push)

            if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
                num_cached += 1
                cached = adr.config.cache.get(key)
                if cached:
                    s3_store.put(key, cached, adr.config["cache"]["retention"])
                    value, mozci_version = cached
                    push_data.append(value)
            else:
                logger.info(f"Analyzing {push.rev} at the {runnable} level...")

                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.put(key, (value, MOZCI_VERSION),
                                         adr.config["cache"]["retention"])
                    s3_store.put(key, (value, MOZCI_VERSION),
                                 adr.config["cache"]["retention"])
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        upload_thread_stop.set()
        upload_thread.join()

        logger.info(
            f"{num_cached} pushes were already cached out of {len(pushes)}")

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
    def generate_push_data(self, runnable):
        def upload_adr_cache():
            cache_path = os.path.splitext(ADR_CACHE_DB)[0]
            assert os.path.abspath(
                adr.config["cache"]["stores"]["file"]["path"]
            ) == os.path.abspath(cache_path)

            with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
                tar.add(cache_path)

            db.upload(ADR_CACHE_DB)

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2
        )

        pushes = mozci.push.make_push_objects(
            from_date=f"today-{from_months}month",
            to_date="today-3day",
            branch="autoland",
        )

        start_time = time.monotonic()

        num_cached = 0

        push_data = []

        for push in tqdm(pushes):
            key = f"push_data.{runnable}.{push.rev}"

            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            if adr.config.cache.has(key):
                num_cached += 1
                push_data.append(adr.config.cache.get(key))
            else:
                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.forever(key, value)
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                except Exception:
                    traceback.print_exc()

            if time.monotonic() - start_time >= 3600:
                upload_adr_cache()
                start_time = time.monotonic()

        logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

        upload_adr_cache()

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
Example #9
        def generate(
            progress_bar: tqdm,
            pushes: list[mozci.push.Push],
            futures: list[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            nonlocal reretrieve
            num_cached = 0
            num_pushes = len(pushes)
            num_errors = 0

            for push, future in zip(pushes, futures):
                cached = future.result()

                # Regenerating a large amount of data when we update the mozci regression detection
                # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
                # run.
                if cached:
                    value, mozci_version = cached

                    # Regenerate results which were generated with an older version of mozci.
                    if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                        cached = None
                        reretrieve -= 1

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    if value != "ERROR":
                        # Note: "ERROR" is five characters long, so asserting
                        # len(value) == 5 before this check would not catch it.
                        assert len(value) == 5
                        yield value
                    else:
                        num_errors += 1
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.label_summaries.keys()
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            tuple(push.revs),
                            push.backedoutby or push.bustage_fixed_by,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        mozci.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )
                        assert len(value) == 5
                        yield value
                    except mozci.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        num_errors += 1
                        traceback.print_exc()
                        mozci.config.cache.put(
                            key,
                            ("ERROR", MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )

                progress_bar.update(1)

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")
            logger.info(f"There were errors in {num_errors} pushes")
        def generate(executor) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
            # run.
            to_regenerate = 1000

            semaphore = threading.BoundedSemaphore(256)

            def retrieve_from_cache(push):
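                # Blocks until the consumer loop below releases a slot, so at
                # most 256 cache lookups run ahead of consumption.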
                semaphore.acquire()
                return adr.config.cache.get(cache_key(push))

            futures = tuple(
                executor.submit(retrieve_from_cache, push) for push in pushes)

            for push, future in zip(tqdm(pushes), futures):
                exc = future.exception()
                if exc is not None:
                    logger.info(f"Exception {exc} while getting {push.rev}")
                    # Cancel the remaining lookups; the failed future's
                    # result() call below will re-raise the exception.
                    for f in futures:
                        f.cancel()

                cached = future.result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if any(runnable.startswith("/") for runnable in value[1]):
                        cached = None
                        to_regenerate -= 1
                    """# Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION and to_regenerate > 0:
                        cached = None
                        to_regenerate -= 1"""

                if cached is not None:
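                    # `cached` may be an empty tuple, the marker stored when a
                    # push previously failed to analyze: count it as cached,
                    # but yield nothing for it.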
                    num_cached += 1
                    if cached:
                        value, mozci_version = cached
                        yield value
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                    except Exception:
                        traceback.print_exc()
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")