Example #1
    def test_raises_on_no_replay_dir(self):
        config = _get_config()
        with pytest.raises(ValueError):
            capture_har_in_replay_server("https://www.cs.ucla.edu", config,
                                         self.client_env)

        config = _get_config(
            EnvironmentConfig(request_url="https://www.cs.ucla.edu",
                              replay_dir=""))
        with pytest.raises(ValueError):
            capture_har_in_replay_server("https://www.cs.ucla.edu", config,
                                         self.client_env)
Example #2
    def test_writes_mahimahi_files_correctly(self, mock_run, mock_open,
                                             mock_tmpdir):
        tmp_dir = "/tmp/blaze_test_123"
        mock_run.return_value = subprocess.CompletedProcess(args=[],
                                                            returncode=0)
        mock_tmpdir.return_value.__enter__.return_value = tmp_dir
        config = _get_config(
            EnvironmentConfig(request_url="https://www.cs.ucla.edu",
                              replay_dir=tmp_dir))

        capture_har_in_replay_server("https://www.cs.ucla.edu", config,
                                     self.client_env)

        assert mock_open.call_args_list[0][0][0].startswith(tmp_dir)
        assert mock_open.call_args_list[1][0][0].startswith(tmp_dir)
        assert mock_open.call_args_list[0][0][1] == "w"
        assert mock_open.call_args_list[1][0][1] == "w"
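
The mock arguments are injected by stacked mock.patch decorators: decorators apply bottom-up, so the bottom-most patch becomes the first parameter after self. A plausible arrangement (the exact patch targets are assumptions):

from unittest import mock

# Hypothetical decorator stack for the test above; the patch targets are assumptions.
@mock.patch("tempfile.TemporaryDirectory")                  # -> mock_tmpdir (last parameter)
@mock.patch("builtins.open", new_callable=mock.mock_open)   # -> mock_open
@mock.patch("subprocess.run")                               # -> mock_run (first parameter)
def test_writes_mahimahi_files_correctly(self, mock_run, mock_open, mock_tmpdir):
    ...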
Example #3
    def test_calls_capture_har_with_correct_arguments(self, mock_run,
                                                      mock_open):
        mock_run.return_value = subprocess.CompletedProcess(args=[],
                                                            returncode=0)

        config = _get_config(
            EnvironmentConfig(request_url="https://www.cs.ucla.edu",
                              replay_dir="/tmp/dir"))
        har = capture_har_in_replay_server("https://www.cs.ucla.edu", config,
                                           self.client_env)

        run_args = mock_run.call_args_list[0][0][0]
        assert run_args[0] == "docker"
        assert run_args[-1] == "https://www.cs.ucla.edu"
        assert har == self.har
Example #4
def preprocess(args):
    """
    Preprocesses a website for training. Automatically discovers linked pages up to a certain depth
    and finds the stable set of page dependencies. The page load is recorded and stored, and a
    training manifest is written.
    """
    domain = Url.parse(args.website).domain
    train_domain_globs = args.train_domain_globs or ["*{}*".format(domain)]
    log.info("preprocessing website",
             website=args.website,
             record_dir=args.record_dir,
             train_domain_globs=train_domain_globs)

    config = get_config(env_config=EnvironmentConfig(
        replay_dir=args.record_dir, request_url=args.website))
    client_env = get_default_client_environment()
    log.debug("using configuration", **config._asdict())

    log.info("capturing execution")
    har_resources = har_entries_to_resources(
        capture_har_in_replay_server(args.website, config, client_env))

    log.info("finding dependency stable set...")
    res_list = find_url_stable_set(args.website, config)

    log.info("found total dependencies", total=len(res_list))
    push_groups = resource_list_to_push_groups(
        res_list, train_domain_globs=train_domain_globs)

    if args.extract_critical_requests:
        log.info("extracting critical requests")
        push_groups = annotate_critical_requests(args.website, config,
                                                 client_env, push_groups)
        critical_resources = set(res.url for group in push_groups
                                 for res in group.resources if res.critical)
        log.debug("critical resources", resources=critical_resources)

    log.info("finding cacheable objects")
    push_groups = annotate_cacheable_objects(args.record_dir, push_groups)

    log.info("generating configuration...")
    env_config = EnvironmentConfig(replay_dir=args.record_dir,
                                   request_url=args.website,
                                   push_groups=push_groups,
                                   har_resources=har_resources)
    env_config.save_file(args.output)
    log.info("successfully prepared website for training", output=args.output)
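
A sketch of invoking preprocess directly, with types.SimpleNamespace standing in for the argparse namespace (the attribute names mirror the accesses in the function above; the URL and paths are illustrative):

import types

# Hypothetical arguments; in the real CLI these come from argparse.
args = types.SimpleNamespace(
    website="https://www.cs.ucla.edu",
    record_dir="/tmp/record_dir",        # Mahimahi record directory
    train_domain_globs=None,             # falls back to ["*<domain>*"]
    extract_critical_requests=False,
    output="/tmp/manifest.json",         # where the training manifest is saved
)
preprocess(args)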
Example #5
def find_url_stable_set(url: str, config: Config) -> List[Resource]:
    """
    Loads the given URL `STABLE_SET_NUM_RUNS` times back-to-back and records the HAR file
    generated by Chrome. It then finds the common URLs across the page loads, computes their
    relative ordering, and returns the corresponding list of Resources with their parent-child
    relationships computed.
    """
    log = logger.with_namespace("find_url_stable_set")
    hars: List[Har] = []
    resource_sets: List[Set[Resource]] = []
    pos_dict = collections.defaultdict(lambda: collections.defaultdict(int))
    for n in range(STABLE_SET_NUM_RUNS):
        log.debug("capturing HAR...", run=n + 1, url=url)
        har = capture_har_in_replay_server(url, config,
                                           get_default_client_environment())
        resource_list = har_entries_to_resources(har)
        if not resource_list:
            log.warn("no response received", run=n + 1)
            continue
        log.debug("received resources", total=len(resource_list))

        for i in range(len(resource_list)):  # pylint: disable=consider-using-enumerate
            for j in range(i + 1, len(resource_list)):
                pos_dict[resource_list[i].url][resource_list[j].url] += 1

        resource_sets.append(set(resource_list))
        hars.append(har)

    log.debug("resource set lengths",
              resource_lens=list(map(len, resource_sets)))
    if not resource_sets:
        return []

    common_res = list(set.intersection(*resource_sets))
    common_res.sort(key=functools.cmp_to_key(
        lambda a, b: -pos_dict[a.url][b.url] + (len(resource_sets) // 2)))

    # Hackily reorder the combined resource sets so that compute_parent_child_relationships works
    common_res = [
        Resource(**{
            **r._asdict(), "order": i
        }) for (i, r) in enumerate(common_res)
    ]
    return compute_parent_child_relationships(common_res, hars[0].timings)
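
To illustrate the ordering step in isolation: pos_dict[a][b] counts the runs in which resource a appeared before resource b, and the comparator puts a ahead of b whenever that count exceeds half the runs. A self-contained toy version of the same intersection-plus-majority-vote sort (no blaze imports; URLs replaced by letters):

import collections
import functools

# Three toy "runs"; the common resources a, b, c appear in the same relative
# order every time, while x and y only show up in some runs.
runs = [["a", "b", "x", "c"], ["a", "b", "c"], ["a", "y", "b", "c"]]

pos = collections.defaultdict(lambda: collections.defaultdict(int))
for run in runs:
    for i in range(len(run)):
        for j in range(i + 1, len(run)):
            pos[run[i]][run[j]] += 1  # run[i] was seen before run[j]

common = set.intersection(*map(set, runs))  # drops x and y
ordered = sorted(common, key=functools.cmp_to_key(
    lambda a, b: -pos[a][b] + (len(runs) // 2)))
print(ordered)  # ['a', 'b', 'c']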
Example #6
def annotate_critical_requests(
        website, config, client_env,
        push_groups: List[PushGroup]) -> List[PushGroup]:
    """
    Modifies the passed-in push groups by capturing another HAR, extracting the critical
    requests, and marking the corresponding resources in the push groups as critical.
    """

    har = capture_har_in_replay_server(website,
                                       config,
                                       client_env,
                                       extract_critical_requests=True)
    critical_requests = set(h.request.url for h in har.log.entries
                            if h.critical)

    for group in push_groups:
        for i, res in enumerate(group.resources):
            if res.url in critical_requests:
                group.resources[i] = res._replace(critical=True)

    return push_groups
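
Note that Resource appears to be an immutable NamedTuple (see the _asdict/_replace usage elsewhere on this page), so _replace returns a new object rather than mutating in place, which is why the loop assigns back into group.resources[i]. A minimal illustration with a stand-in type:

from typing import NamedTuple

# Stand-in for blaze's Resource; the fields here are illustrative only.
class FakeResource(NamedTuple):
    url: str
    critical: bool = False

resources = [FakeResource("https://www.cs.ucla.edu/style.css")]
# _replace copies the tuple with the given field changed; the original is untouched.
resources[0] = resources[0]._replace(critical=True)
assert resources[0].critical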
Example #7
def get_page_load_time_in_replay_server(
    request_url: str,
    client_env: ClientEnvironment,
    config: Config,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
):
    """
    Returns the median page load time (in milliseconds) across `EXECUTION_CAPTURE_RUNS` loads of
    the page in the given Mahimahi record directory, along with the list of individual page
    load times.
    """
    log = logger.with_namespace("get_page_load_time_in_replay_server")
    log.debug("using client environment", **client_env._asdict())
    hars = []
    for i in range(EXECUTION_CAPTURE_RUNS):
        log.debug("recording page execution in Mahimahi",
                  run=(i + 1),
                  total_runs=EXECUTION_CAPTURE_RUNS)
        har = capture_har_in_replay_server(
            url=request_url,
            config=config,
            client_env=client_env,
            policy=policy,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )
        hars.append(har)
        log.debug("captured page execution",
                  page_load_time=har.page_load_time_ms)

    hars.sort(key=lambda h: h.page_load_time_ms)
    plt_ms = [h.page_load_time_ms for h in hars]
    median_har = hars[len(hars) // 2]
    log.debug("recorded execution times", plt_ms=plt_ms)

    return median_har.page_load_time_ms, plt_ms
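
A hedged usage sketch: with a config and client environment built as in preprocess above, the median page load time over EXECUTION_CAPTURE_RUNS runs could be obtained with:

# Hypothetical call; config and client_env are assumed to be built as in preprocess().
median_plt_ms, all_plt_ms = get_page_load_time_in_replay_server(
    request_url="https://www.cs.ucla.edu",
    client_env=client_env,
    config=config,
)
# median_plt_ms is the middle value of the sorted per-run page load times in all_plt_ms
print(median_plt_ms, all_plt_ms)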