Example #1
def get_cache_times(file_dir: str) -> Dict[str, int]:
    """
    :return: a dictionary mapping each file name to its freshness using `findcacheable`
    """

    def demote():
        if os.path.isfile("/opt/entrypoint.sh"):
            os.setgid(27)
            os.setuid(103)

    path = os.environ.get("FINDCACHEABLE_BIN", os.path.join(os.path.dirname(__file__), "findcacheable"))
    proc = subprocess.run(
        f"{path} '{file_dir}/' | awk -F'/' '{{print $NF'}} | grep freshness",
        shell=True,
        preexec_fn=demote,
        stdout=subprocess.PIPE,
    )
    if proc.returncode != 0:
        log.with_namespace("get_cache_times").warn("failed to run findcacheable", code=proc.returncode)
    d = {}
    for line in proc.stdout.decode("utf-8").strip().split("\n"):
        try:
            fname, time = line.strip().split(" freshness=")
            d[fname] = int(time)
        except ValueError:
            continue
    return d
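For context on the parsing above: the loop assumes each filtered line has the form `<file_name> freshness=<seconds>`. A minimal, hypothetical round trip (the directory path and output values are illustrative, not taken from the source):

# Hypothetical output of the findcacheable pipeline after the awk/grep filter:
#   index.html freshness=86400
#   logo.png freshness=3600
cache_times = get_cache_times("/tmp/recorded_site")
# cache_times == {"index.html": 86400, "logo.png": 3600}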
Example #2
    def use_action(self, action: Action):
        """ Marks the action as used in both the push and preload spaces """
        self.push_space.use_action(action)
        self.preload_space.use_action(action)
        logger.with_namespace("action_space").info(
            "used_action",
            action=repr(action),
            new_push_size=len(self.push_space),
            new_preload_size=len(self.preload_space),
        )
Example #3
def get_speed_index_in_replay_server(
    request_url: str,
    client_env: ClientEnvironment,
    config: Config,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
):
    """
    Return the page speed index
    """
    log = logger.with_namespace("get_speed_index_in_replay_server")
    log.debug("using client environment", **client_env._asdict())
    speed_indices = []
    for i in range(EXECUTION_CAPTURE_RUNS):
        log.debug("recording page execution in Mahimahi",
                  run=(i + 1),
                  total_runs=EXECUTION_CAPTURE_RUNS)
        speed_index = capture_si_in_replay_server(
            url=request_url,
            config=config,
            client_env=client_env,
            policy=policy,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )
        speed_indices.append(speed_index)
        log.debug("captured page execution", speed_index=speed_index)

    speed_indices.sort()
    median_speedindex = speed_indices[len(speed_indices) // 2]
    return median_speedindex
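One detail worth noting: with an even EXECUTION_CAPTURE_RUNS, the indexing above returns the upper of the two middle values rather than averaging them. A toy illustration (the numbers are made up):

speed_indices = sorted([1900, 1200, 1500, 1300])
median_speedindex = speed_indices[len(speed_indices) // 2]  # -> 1500, the upper middle value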
Example #4
def capture_har_in_replay_server(
    url: str,
    config: Config,
    client_env: ClientEnvironment,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
) -> Har:
    """
    capture_har spawns a headless chrome instance and connects to its remote debugger
    in order to extract the HAR file generated by loading the given URL. The har capturer
    is launched inside a replay shell using the specified Mahimahi config, which means
    that the webpage needs to have been recorded before calling this method
    """
    log = logger.with_namespace("capture_har_in_replay_server")

    if not config.env_config or not config.env_config.replay_dir:
        raise ValueError("replay_dir must be specified")

    policy = policy or Policy.from_dict({})
    mahimahi_config = MahiMahiConfig(config=config,
                                     policy=policy,
                                     client_environment=client_env)

    with tempfile.TemporaryDirectory() as temp_dir:
        policy_file = os.path.join(temp_dir, "policy.json")
        output_file = os.path.join(temp_dir, "har.json")
        trace_file = os.path.join(temp_dir, "trace_file")

        with open(policy_file, "w") as f:
            log.debug("writing push policy file", policy_file=policy_file)
            f.write(json.dumps(policy.as_dict))
        with open(trace_file, "w") as f:
            log.debug("writing trace file", trace_file=trace_file)
            f.write(mahimahi_config.formatted_trace_file)

        # configure the HAR capturer
        cmd = mahimahi_config.har_capture_cmd(
            share_dir=temp_dir,
            har_output_file_name="har.json",
            policy_file_name="policy.json",
            link_trace_file_name="trace_file",
            capture_url=url,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )

        # spawn the HAR capturer process
        log.debug("spawning har capturer", url=url, cmd=cmd)
        har_capture_proc = subprocess.run(cmd,
                                          stdout=sys.stderr,
                                          stderr=sys.stderr,
                                          timeout=300)
        har_capture_proc.check_returncode()

        with open(output_file, "r") as f:
            return har_from_json(f.read())
Example #5
    def files(self) -> List[File]:
        """
        :return: A list of File objects corresponding to the files in self.path
        """
        if not self._files:
            self._cache_times = get_cache_times(self.path)
            self._files = list(map(File.read, glob.iglob(f"{self.path}/*")))
            for f in self._files:
                cache_time = self._cache_times.get(f.file_name, 0)
                if self.cache_time is None and cache_time > 0:
                    f.set_cache_time(cache_time)
                elif self.cache_time is not None and cache_time > self.cache_time:
                    f.set_cache_time(cache_time)
                else:
                    log.with_namespace("filestore").debug(
                        "skipping setting cache", url=f.url, actual_cache_time=cache_time, cache_time=self.cache_time
                    )

        return self._files
Example #6
    def decode_action(self, action: ActionIDType) -> Action:
        """ Decodes the given action ID into an Action object """
        # Temporary for compatibility:
        if len(action) == 6:
            action = (action[0], tuple(action[1:4]), tuple(action[4:]))

        (action_type, push_id, preload_id) = action
        action_type_id = 0 if action_type == 0 else ((action_type // 5) + 1)
        if action_type_id == 0:
            return Action()

        is_push = action_type_id == 1 and not self.disable_push
        try:
            if is_push:
                return self.push_space.decode_action_id(push_id)
            return self.preload_space.decode_action_id(preload_id)
        except KeyError:
            logger.with_namespace("action_space").warn(
                "picked out of bounds action", action=action)
            return Action()
Example #7
def record_webpage(url: str, save_dir: str, config: Config):
    """
    Given a URL and runtime configuration, record_webpage creates a Mahimahi record
    shell and records the web page load in Chrome. It saves the result to the given
    save directory, which is expected to be empty. A subprocess.CalledProcessError
    is raised if an error occurs
    """
    with tempfile.TemporaryDirectory(prefix="blaze_record",
                                     dir="/tmp") as tmp_dir:
        chrome_flags = get_chrome_flags(tmp_dir)
        chrome_cmd = get_chrome_command(url, chrome_flags, config)

        mm_config = MahiMahiConfig(config)
        cmd = mm_config.record_shell_with_cmd(save_dir, chrome_cmd)

        logger.with_namespace("record_webpage").debug("spawning web recorder",
                                                      url=url,
                                                      cmd=cmd)

        proc = subprocess.run(cmd, stdout=sys.stderr, stderr=sys.stderr)
        proc.check_returncode()
Example #8
def get_page_links(url: str, max_depth: int = 1) -> List[str]:
    """
    Performs DFS with the given max_depth on the given URL to discover all
    <a href="..."> links in the page
    """
    if max_depth == 0:
        return []

    log = logger.with_namespace("get_page_links").with_context(
        depth_left=max_depth)
    try:
        log.info("fetching page", url=url)
        page = requests.get(url)
        page.raise_for_status()
        page_text = page.text
    except requests.exceptions.RequestException as err:
        log.warn("failed to fetch page", error=repr(err))
        return []

    try:
        log.debug("parsing http response", length=len(page_text))
        root = BeautifulSoup(page_text, "html.parser")
    except Exception as err:
        log.verbose(page_text)
        log.warn("failed to parse response", error=repr(err))
        return []

    parsed_links = root.find_all("a")
    log.info("found links", url=url, n_links=len(parsed_links))

    links = []
    domain = Url.parse(url).domain
    scheme = Url.parse(url).scheme
    for link in parsed_links:
        link_url = link.get("href")
        # skip anchors without an href attribute, which would otherwise raise on startswith
        if not link_url:
            continue
        if link_url.startswith("http"):
            link_domain = Url.parse(link_url).domain
            if link_domain != domain:
                log.debug("ignoring found link (bad domain)", link=link_url)
                continue
        elif link_url.startswith("/"):
            link_url = f"{scheme}://{domain}{link_url}"
        else:
            log.debug("ignoring found link (bad prefix)", link=link_url)
            continue

        links.append(link_url)
        links.extend(get_page_links(link_url, max_depth - 1))
    return ordered_uniq(links)
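A hedged usage sketch of the crawler above (the URL is a placeholder): fetch the landing page, keep only same-domain links, and recurse one extra level:

# Hypothetical invocation; returns deduplicated links in discovery order
links = get_page_links("https://example.com", max_depth=2)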
Example #9
def cluster(args):
    """ Cluster the given folder of pages """
    log = logger.with_namespace("cluster")
    log.info("clustering pages", folder=args.folder)

    def read_file(fpath):
        log.debug("reading file...", file=fpath)
        return EnvironmentConfig.load_file(fpath)

    files = list(map(read_file, glob.iglob(f"{args.folder}/*")))
    distance_func = create_apted_distance_function(args.apted_port)
    c = AgglomerativeCluster(distance_func)
    mapping = c.cluster(files)
    print(
        json.dumps({f.request_url: int(i)
                    for f, i in zip(files, mapping)},
                   indent=4))
Example #10
    def __init__(
        self,
        config: Config,
        reward_func_num: int = 0,
        use_aft: bool = False,
        client_environment: Optional[ClientEnvironment] = None,
        cached_urls: Optional[Set[str]] = None,
    ):
        self.config = config
        self.use_aft = use_aft
        self.cached_urls = cached_urls
        self.client_environment = client_environment
        self.simulator = Simulator(config.env_config)
        self.reward_func_num = reward_func_num
        self.reward_func = REWARD_FUNCTIONS[self.reward_func_num](
            self.simulator, self.client_environment, self.cached_urls,
            self.use_aft)
        self.log = logger.with_namespace("analyzer")
Example #11
def find_url_stable_set(url: str, config: Config) -> List[Resource]:
    """
    Loads the given URL `STABLE_SET_NUM_RUNS` times back-to-back and records the HAR file
    generated by chrome. It then finds the common URLs across the page loads, computes their
    relative ordering, and returns a list of PushGroups for the webpage
    """
    log = logger.with_namespace("find_url_stable_set")
    hars: List[Har] = []
    resource_sets: List[Set[Resource]] = []
    pos_dict = collections.defaultdict(lambda: collections.defaultdict(int))
    for n in range(STABLE_SET_NUM_RUNS):
        log.debug("capturing HAR...", run=n + 1, url=url)
        har = capture_har_in_replay_server(url, config,
                                           get_default_client_environment())
        resource_list = har_entries_to_resources(har)
        if not resource_list:
            log.warn("no response received", run=n + 1)
            continue
        log.debug("received resources", total=len(resource_list))

        for i in range(len(resource_list)):  # pylint: disable=consider-using-enumerate
            for j in range(i + 1, len(resource_list)):
                pos_dict[resource_list[i].url][resource_list[j].url] += 1

        resource_sets.append(set(resource_list))
        hars.append(har)

    log.debug("resource set lengths",
              resource_lens=list(map(len, resource_sets)))
    if not resource_sets:
        return []

    common_res = list(set.intersection(*resource_sets))
    # A resource sorts earlier if it appeared before the other one in more than half of the runs
    common_res.sort(key=functools.cmp_to_key(
        lambda a, b: -pos_dict[a.url][b.url] + (len(resource_sets) // 2)))

    # Hackily reorder the combined resource sets so that compute_parent_child_relationships works
    common_res = [
        Resource(**{
            **r._asdict(), "order": i
        }) for (i, r) in enumerate(common_res)
    ]
    return compute_parent_child_relationships(common_res, hars[0].timings)
Example #12
    def __init__(self, env_config: EnvironmentConfig):
        self.env_config = env_config
        self.log = logger.with_namespace("simulator")

        self.root = None
        self.node_map = {}
        self.url_to_node_map = {}
        self.create_execution_graph(env_config)

        self.pq: Optional[PriorityQueue] = None
        self.request_queue: Optional[RequestQueue] = None
        self.completed_nodes = {}
        self.pushed_nodes = {}
        self.total_time_ms = 0
        self.cached_urls = set()

        self.no_push: Optional[Simulator] = None
        self.client_env: Optional[ClientEnvironment] = None
        self.policy: Optional[Policy] = None
Example #13
def get_page_load_time_in_replay_server(
    request_url: str,
    client_env: ClientEnvironment,
    config: Config,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
):
    """
    Return the page load time, the HAR resources captured, and the push groups detected
    by loading the page in the given mahimahi record directory
    """
    log = logger.with_namespace("get_page_load_time_in_replay_server")
    log.debug("using client environment", **client_env._asdict())
    hars = []
    for i in range(EXECUTION_CAPTURE_RUNS):
        log.debug("recording page execution in Mahimahi",
                  run=(i + 1),
                  total_runs=EXECUTION_CAPTURE_RUNS)
        har = capture_har_in_replay_server(
            url=request_url,
            config=config,
            client_env=client_env,
            policy=policy,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )
        hars.append(har)
        log.debug("captured page execution",
                  page_load_time=har.page_load_time_ms)

    hars.sort(key=lambda h: h.page_load_time_ms)
    plt_ms = [h.page_load_time_ms for h in hars]
    median_har = hars[len(hars) // 2]
    log.debug("recorded execution times", plt_ms=plt_ms)

    return median_har.page_load_time_ms, plt_ms
Example #14
def stop_condition():
    """
    Implements a stateful stopping condition to automatically stop the training based on analyzing the running
    episode reward mean over a certain window size. It also stops automatically if the number of training iterations
    exceeds some maximum, but not before it exceeds some minimum.
    """

    log = logger.with_namespace("stop_condition")
    num_iters = 0
    past_rewards = deque()

    def stopper(trial_id, result):
        nonlocal num_iters, past_rewards
        num_iters += 1

        if "time_since_restore" in result and result[
                "time_since_restore"] >= MAX_TIME_SECONDS:
            log.info("auto stopping",
                     time_seconds=result["time_since_restore"],
                     iters=num_iters)
            return True

        if "episode_reward_max" in result and "episode_reward_min" in result and "episode_reward_mean" in result:
            rewards = (result["episode_reward_min"],
                       result["episode_reward_mean"],
                       result["episode_reward_max"])
            log.debug("recording trial result",
                      trial_id=trial_id,
                      num_iters=num_iters,
                      rewards=rewards)
            past_rewards.append(rewards)
        else:
            log.warn("unable to record episode result",
                     result=result,
                     trial_id=trial_id)
            return False

        # truncate the rewards list to past `WINDOW_SIZE` iterations only
        if len(past_rewards) > WINDOW_SIZE:
            past_rewards.popleft()

        if num_iters > MIN_ITERATIONS:
            stdev_min, stdev_mean, stdev_max = tuple(
                map(stdev, zip(*past_rewards)))
            log.debug("reward stats",
                      stdev_min=stdev_min,
                      stdev_mean=stdev_mean,
                      stdev_max=stdev_max)
            relative_stdev_based_stop = stdev_mean <= 0.05 * abs(
                past_rewards[-1][1])

            if num_iters > MAX_ITERATIONS or relative_stdev_based_stop:
                log.info("auto stopping",
                         time_seconds=result.get("time_since_restore", 0),
                         iters=num_iters)
                return True

        return False

    stopper.past_rewards = past_rewards
    return stopper
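Because the returned stopper has the `(trial_id, result)` signature that Ray Tune accepts for its `stop` argument, it could plausibly be wired into training like this (the trainable name and config are assumptions, not taken from the source):

from ray import tune

# minimal sketch: "PPO" and the config dict are placeholders for the project's real setup
tune.run("PPO", config={"env": "Blaze-v0"}, stop=stop_condition())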
Example #15
    def __init__(self, hosts: List[str]):
        """
        :param hosts: The hosts to manage IP addresses for
        """
        self.hosts = hosts
        self.log = logger.with_namespace("interface")
Example #16
def start_server(replay_dir: str,
                 cert_path: Optional[str] = None,
                 key_path: Optional[str] = None,
                 policy: Optional[Policy] = None,
                 per_resource_latency: Optional[str] = None,
                 cache_time: Optional[int] = None,
                 extract_critical_requests: Optional[bool] = False,
                 enable_http2: Optional[bool] = False):
    """
    Reads the given replay directory and sets up the NGINX server to replay it. This function also
    creates the DNS servers, Interfaces, and writes all necessary temporary files.

    :param replay_dir: The directory to replay (should be mahimahi-recorded)
    :param cert_path: The path to the SSL certificate for the HTTP/2 NGINX server
    :param key_path: The path to the SSL key for the HTTP/2 NGINX server
    :param policy: The push/preload policy to apply on the replay server
    """
    log = logger.with_namespace("replay_server")
    push_policy = policy.as_dict["push"] if policy else {}
    preload_policy = policy.as_dict["preload"] if policy else {}
    res_latency_map = {}
    if per_resource_latency:
        with open(per_resource_latency, "r") as f:
            res_latency_map = json.load(f)

    # Load the file store into memory
    if not os.path.isdir(replay_dir):
        raise NotADirectoryError(f"{replay_dir} is not a directory")
    filestore = FileStore(replay_dir, cache_time=cache_time)

    # Create host-ip mapping
    hosts = filestore.hosts
    interfaces = Interfaces(hosts)
    host_ip_map = interfaces.mapping

    # Save files and create nginx configuration
    config = Config()
    with tempfile.TemporaryDirectory() as file_dir:
        log.debug("storing temporary files in", file_dir=file_dir)

        for host, files in filestore.files_by_host.items():
            log.info("creating host", host=host, address=host_ip_map[host])
            uris_served = set()
            host_res_lmap = res_latency_map.get(host, {})
            http2_directive = (
                "ssl http2"
                if enable_http2 and host not in ("archive.org", "analytics.archive.org")
                else "ssl"
            )

            # Create a server block for this host
            server = config.http_block.add_server(
                server_name=host,
                server_addr=host_ip_map[host],
                cert_path=cert_path,
                key_path=key_path,
                root=file_dir,
                res_latency_map=host_res_lmap,
                enable_http2=http2_directive)

            for file in files:
                # Handles the case where we may have duplicate URIs for a single host
                # or where URIs in nginx cannot be too long
                if (file.uri + file.scheme) in uris_served or len(file.uri) > 3600 or len(
                        file.headers.get("location", "")) > 3600:
                    continue

                uris_served.add(file.uri + file.scheme)
                log.debug(
                    "serve",
                    file_name=file.file_name,
                    status=file.status,
                    method=file.method,
                    uri=file.uri,
                    host=file.host,
                )

                # Create entry for this resource
                if file.status < 300 or file.status >= 400:
                    loc = server.add_location_block(
                        uri=file.uri,
                        scheme=file.scheme,
                        file_name=file.file_name,
                        content_type=file.headers.get("content-type", None))
                elif "location" in file.headers:
                    loc = server.add_location_block(
                        uri=file.uri,
                        scheme=file.scheme,
                        redirect_uri=file.headers["location"])
                else:
                    log.warn("skipping",
                             file_name=file.file_name,
                             method=file.method,
                             uri=file.uri,
                             host=file.host)
                    continue

                backup_file_body = file.body
                try:
                    if extract_critical_requests and "text/html" in file.headers.get(
                            "content-type", ""):
                        file.body = inject_extract_critical_requests_javascript(
                            file)
                    file_path = os.path.join(file_dir, file.file_name)
                    with open(
                            os.open(file_path, os.O_CREAT | os.O_WRONLY,
                                    0o644), "wb") as f:
                        f.write(file.body)
                except TypeError as e:
                    file_path = os.path.join(file_dir, file.file_name)
                    with open(os.open(file_path, os.O_CREAT | os.O_WRONLY,
                                      0o644),
                              mode="w",
                              encoding="utf8") as f:
                        f.write(file.body)
                except UnicodeEncodeError as e:
                    # file.body somehow became corrupted, which messes up the encoding.
                    # Restore the file's original body instead; this happens if a file is a
                    # bytestream but does not have a gzip header, or vice versa
                    log.warn("unable to inject critical requests for file",
                             uri=file.uri,
                             error=e)
                    file.body = backup_file_body
                    file_path = os.path.join(file_dir, file.file_name)
                    with open(
                            os.open(file_path, os.O_CREAT | os.O_WRONLY,
                                    0o644), "wb") as f:
                        f.write(file.body)

                # Add headers
                for key, value in file.headers.items():
                    loc.add_header(key, value)

                # Look up push and preload policy
                full_source = f"https://{file.host}{file.uri}"
                push_res_list = push_policy.get(
                    full_source, push_policy.get(full_source + "/", []))
                preload_res_list = preload_policy.get(
                    full_source, preload_policy.get(full_source + "/", []))

                for res in push_res_list:
                    path = urlparse(res["url"]).path
                    log.debug("create push rule", source=file.uri, push=path)
                    loc.add_push(path)
                for res in preload_res_list:
                    log.debug("create preload rule",
                              source=file.uri,
                              preload=res["url"],
                              type=res["type"])
                    loc.add_preload(res["url"], res["type"])

        # Save the nginx configuration
        conf_file = os.path.join(file_dir, "nginx.conf")
        log.debug("writing nginx config", conf_file=conf_file)
        with open(conf_file, "w") as f:
            f.write(str(config))

        # with open("/tmp/dummy", "w") as f:
        #     f.write(str(config))
        # log.debug("contents of nginx config", config=str(config))

        # time.sleep(10000)
        # Create the interfaces, start the DNS server, and start the NGINX server
        with interfaces:
            with DNSServer(host_ip_map) as dns:
                # If wait lasts for more than 0.5 seconds, a TimeoutError will be raised, which is okay since it
                # means that nginx is running successfully. If it finishes sooner, it means it crashed and
                # we should raise an exception
                try:
                    proc = subprocess.Popen([
                        "/usr/local/openresty/nginx/sbin/nginx", "-c",
                        conf_file
                    ],
                                            stdout=sys.stderr,
                                            stderr=sys.stderr)
                    proc.wait(0.5)
                    raise RuntimeError("nginx exited unsuccessfully")
                except subprocess.TimeoutExpired:
                    yield
                finally:
                    log.info("Killing dns server and nginx server")
                    # subprocess.call(["sudo","kill",str(proc.pid)])
                    # subprocess.call(["sudo","kill","-SIGKILL",str(proc.pid)])
                    # dns.proc.terminate()
                    proc.terminate()