def test_raises_on_no_replay_dir(self):
    config = _get_config()
    with pytest.raises(ValueError):
        capture_har_in_replay_server("https://www.cs.ucla.edu", config, self.client_env)
    config = _get_config(EnvironmentConfig(request_url="https://www.cs.ucla.edu", replay_dir=""))
    with pytest.raises(ValueError):
        capture_har_in_replay_server("https://www.cs.ucla.edu", config, self.client_env)

def test_writes_mahimahi_files_correctly(self, mock_run, mock_open, mock_tmpdir):
    tmp_dir = "/tmp/blaze_test_123"
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0)
    mock_tmpdir.return_value.__enter__.return_value = tmp_dir
    config = _get_config(EnvironmentConfig(request_url="https://www.cs.ucla.edu", replay_dir=tmp_dir))
    capture_har_in_replay_server("https://www.cs.ucla.edu", config, self.client_env)
    assert mock_open.call_args_list[0][0][0].startswith(tmp_dir)
    assert mock_open.call_args_list[1][0][0].startswith(tmp_dir)
    assert mock_open.call_args_list[0][0][1] == "w"
    assert mock_open.call_args_list[1][0][1] == "w"

def test_calls_capture_har_with_correct_arguments(self, mock_run, mock_open):
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0)
    config = _get_config(EnvironmentConfig(request_url="https://www.cs.ucla.edu", replay_dir="/tmp/dir"))
    har = capture_har_in_replay_server("https://www.cs.ucla.edu", config, self.client_env)
    run_args = mock_run.call_args_list[0][0][0]
    assert run_args[0] == "docker"
    assert run_args[-1] == "https://www.cs.ucla.edu"
    assert har == self.har

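# The three tests above receive their mocks via @mock.patch decorators, which are
# elided here. A hypothetical decorator stack consistent with the argument order
# (patch decorators apply bottom-up, so the bottom-most patch becomes the first
# mock argument); the patch targets below are assumptions, not the real ones:
#
# @mock.patch("tempfile.TemporaryDirectory")
# @mock.patch("builtins.open", new_callable=mock.mock_open)
# @mock.patch("subprocess.run")
# def test_writes_mahimahi_files_correctly(self, mock_run, mock_open, mock_tmpdir):
#     ...
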
def preprocess(args):
    """
    Preprocesses a website for training. Automatically discovers linked pages up to a certain
    depth and finds the stable set of page dependencies. The page load is recorded and stored,
    and a training manifest is written.
    """
    domain = Url.parse(args.website).domain
    train_domain_globs = args.train_domain_globs or ["*{}*".format(domain)]
    log.info("preprocessing website", website=args.website, record_dir=args.record_dir,
             train_domain_globs=train_domain_globs)

    config = get_config(env_config=EnvironmentConfig(replay_dir=args.record_dir, request_url=args.website))
    client_env = get_default_client_environment()
    log.debug("using configuration", **config._asdict())

    log.info("capturing execution")
    har_resources = har_entries_to_resources(capture_har_in_replay_server(args.website, config, client_env))

    log.info("finding dependency stable set...")
    res_list = find_url_stable_set(args.website, config)

    log.info("found total dependencies", total=len(res_list))
    push_groups = resource_list_to_push_groups(res_list, train_domain_globs=train_domain_globs)

    if args.extract_critical_requests:
        log.info("extracting critical requests")
        push_groups = annotate_critical_requests(args.website, config, client_env, push_groups)
        critical_resources = set(res.url for group in push_groups for res in group.resources if res.critical)
        log.debug("critical resources", resources=critical_resources)

    log.info("finding cacheable objects")
    push_groups = annotate_cacheable_objects(args.record_dir, push_groups)

    log.info("generating configuration...")
    env_config = EnvironmentConfig(
        replay_dir=args.record_dir, request_url=args.website, push_groups=push_groups, har_resources=har_resources
    )
    env_config.save_file(args.output)
    log.info("successfully prepared website for training", output=args.output)

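def _preprocess_example() -> None:
    # A minimal sketch (not part of the real CLI) of driving preprocess() directly
    # with an argparse namespace exposing exactly the attributes read above. The
    # values are hypothetical placeholders for however the real command builds `args`.
    import argparse

    args = argparse.Namespace(
        website="https://www.cs.ucla.edu",
        record_dir="/tmp/record_dir",       # placeholder replay directory
        output="/tmp/env_config.json",      # placeholder manifest output path
        train_domain_globs=None,            # defaults to ["*<domain>*"]
        extract_critical_requests=False,
    )
    preprocess(args)
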
def find_url_stable_set(url: str, config: Config) -> List[Resource]:
    """
    Loads the given URL `STABLE_SET_NUM_RUNS` times back-to-back and records the HAR file
    generated by chrome. It then finds the common URLs across the page loads, computes their
    relative ordering, and returns the stable set as a list of Resources with parent-child
    relationships computed.
    """
    log = logger.with_namespace("find_url_stable_set")
    hars: List[Har] = []
    resource_sets: List[Set[Resource]] = []
    pos_dict = collections.defaultdict(lambda: collections.defaultdict(int))

    for n in range(STABLE_SET_NUM_RUNS):
        log.debug("capturing HAR...", run=n + 1, url=url)
        har = capture_har_in_replay_server(url, config, get_default_client_environment())
        resource_list = har_entries_to_resources(har)
        if not resource_list:
            log.warn("no response received", run=n + 1)
            continue
        log.debug("received resources", total=len(resource_list))

        # Count, across runs, how often each URL appeared before every other URL.
        for i in range(len(resource_list)):  # pylint: disable=consider-using-enumerate
            for j in range(i + 1, len(resource_list)):
                pos_dict[resource_list[i].url][resource_list[j].url] += 1

        resource_sets.append(set(resource_list))
        hars.append(har)

    log.debug("resource set lengths", resource_lens=list(map(len, resource_sets)))
    if not resource_sets:
        return []

    common_res = list(set.intersection(*resource_sets))
    # Majority-vote ordering: a sorts before b if a preceded b in more than half the runs.
    common_res.sort(key=functools.cmp_to_key(lambda a, b: -pos_dict[a.url][b.url] + (len(resource_sets) // 2)))

    # Hackily reorder the combined resource sets so that compute_parent_child_relationships works
    common_res = [Resource(**{**r._asdict(), "order": i}) for (i, r) in enumerate(common_res)]
    return compute_parent_child_relationships(common_res, hars[0].timings)

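def _majority_order_example() -> None:
    # Toy check of the comparator used above (made-up counts, not real data):
    # pos_dict[a][b] counts the runs in which `a` preceded `b`. With 5 runs and
    # "a" before "b" in 4 of them, the comparator value -4 + 5 // 2 = -2 is
    # negative, so "a" sorts first; the reverse comparison gives -1 + 2 = 1.
    num_runs = 5
    pos = {"a": {"b": 4}, "b": {"a": 1}}
    assert -pos["a"]["b"] + num_runs // 2 < 0  # "a" before "b"
    assert -pos["b"]["a"] + num_runs // 2 > 0  # "b" after "a"
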
def annotate_critical_requests(website, config, client_env, push_groups: List[PushGroup]) -> List[PushGroup]:
    """
    Modifies the given push groups by capturing another HAR, checking which requests were
    critical, and marking the corresponding resources in the push groups as critical
    """
    har = capture_har_in_replay_server(website, config, client_env, extract_critical_requests=True)
    critical_requests = set(h.request.url for h in har.log.entries if h.critical)
    for group in push_groups:
        for i, res in enumerate(group.resources):
            if res.url in critical_requests:
                group.resources[i] = res._replace(critical=True)
    return push_groups

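def _replace_idiom_example() -> None:
    # Toy illustration (with assumed fields) of the _replace() idiom used above:
    # NamedTuples are immutable, so annotate_critical_requests reassigns the list
    # slot with a modified copy rather than mutating the resource in place.
    from typing import NamedTuple

    class _ToyResource(NamedTuple):
        url: str
        critical: bool = False

    resources = [_ToyResource(url="https://www.cs.ucla.edu/a.css")]
    resources[0] = resources[0]._replace(critical=True)
    assert resources[0].critical
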
def get_page_load_time_in_replay_server(
    request_url: str,
    client_env: ClientEnvironment,
    config: Config,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
):
    """
    Loads the page in the given mahimahi record directory `EXECUTION_CAPTURE_RUNS` times and
    returns the median page load time along with the list of all recorded page load times
    """
    log = logger.with_namespace("get_page_load_time_in_replay_server")
    log.debug("using client environment", **client_env._asdict())

    hars = []
    for i in range(EXECUTION_CAPTURE_RUNS):
        log.debug("recording page execution in Mahimahi", run=(i + 1), total_runs=EXECUTION_CAPTURE_RUNS)
        har = capture_har_in_replay_server(
            url=request_url,
            config=config,
            client_env=client_env,
            policy=policy,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )
        hars.append(har)
        log.debug("captured page execution", page_load_time=har.page_load_time_ms)

    # Use the median run to reduce noise across repeated captures.
    hars.sort(key=lambda h: h.page_load_time_ms)
    plt_ms = [h.page_load_time_ms for h in hars]
    median_har = hars[len(hars) // 2]
    log.debug("recorded execution times", plt_ms=plt_ms)
    return median_har.page_load_time_ms, plt_ms

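def _median_plt_example() -> float:
    # A minimal usage sketch, assuming the same helpers used by the preprocessing
    # code above (get_config, EnvironmentConfig, get_default_client_environment);
    # the replay_dir path is a placeholder for a previously recorded page.
    config = get_config(env_config=EnvironmentConfig(
        replay_dir="/tmp/record_dir", request_url="https://www.cs.ucla.edu"))
    median_plt_ms, _all_plt_ms = get_page_load_time_in_replay_server(
        "https://www.cs.ucla.edu", get_default_client_environment(), config)
    return median_plt_ms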