Example #1
def _RequestsSubcommand(args):
    """`loading_trace_analyzer.py requests` Command line tool entry point.

  Example:
    Lists all request with timing:
      ... requests --output-format "{timing} {url}"

    Lists  HTTP/HTTPS requests that have used the cache:
      ... requests --where "{protocol} {from_disk_cache}" "https?\S* True"
  """
    where_format = None
    where_statement = None
    if args.where_statement:
        where_format = args.where_statement[0]
        try:
            where_statement = re.compile(args.where_statement[1])
        except re.error as e:
            sys.stderr.write("Invalid where statement REGEX: {}\n{}\n".format(
                args.where_statement[1], str(e)))
            return 1

    loading_trace = LoadingTrace.FromJsonDict(json.load(args.loading_trace))
    for request_event in loading_trace.request_track.GetEvents():
        request_event_json = request_event.ToJsonDict()

        if where_statement is not None:
            where_in = where_format.format(**request_event_json)
            if not where_statement.match(where_in):
                continue

        args.output.write(
            args.output_format.format(**request_event_json) + '\n')
    return 0
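
# A minimal sketch of how the `requests` subcommand above could be wired up
# with argparse. The flag names mirror the attributes the function reads
# (args.loading_trace, args.output, args.output_format, args.where_statement)
# but are assumptions, not the original command-line definition.
import argparse
import sys

def _BuildArgumentParser():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    requests_parser = subparsers.add_parser('requests')
    requests_parser.add_argument('loading_trace', type=argparse.FileType('r'))
    requests_parser.add_argument('--output', type=argparse.FileType('w'),
                                 default=sys.stdout)
    requests_parser.add_argument('--output-format', default='{url}')
    requests_parser.add_argument('--where', nargs=2, dest='where_statement')
    requests_parser.set_defaults(func=_RequestsSubcommand)
    return parser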
Example #2
def ListRequests(loading_trace_path,
                 output_format='{url}',
                 where_format='{url}',
                 where_statement=None):
    """`loading_trace_analyzer.py requests` Command line tool entry point.

  Args:
    loading_trace_path: Path of the loading trace.
    output_format: Output format of the generated strings.
    where_format: String formated to be regex tested with <where_statement>
    where_statement: Regex for selecting request event.

  Yields:
    Formated string of the selected request event.

  Example:
    Lists all request with timing:
      ... requests --output-format "{timing} {url}"

    Lists  HTTP/HTTPS requests that have used the cache:
      ... requests --where "{protocol} {from_disk_cache}" "https?\S* True"
  """
    if where_statement:
        where_statement = re.compile(where_statement)
    loading_trace = LoadingTrace.FromJsonFile(loading_trace_path)
    for request_event in loading_trace.request_track.GetEvents():
        request_event_json = request_event.ToJsonDict()
        if where_statement is not None:
            where_in = where_format.format(**request_event_json)
            if not where_statement.match(where_in):
                continue
        yield output_format.format(**request_event_json)
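
# Hypothetical usage of the ListRequests() generator above, mirroring the
# docstring's second example: print the URL of every HTTP/HTTPS request that
# was served from the disk cache. The trace path is a placeholder.
for line in ListRequests('loading_trace.json',
                         output_format='{url}',
                         where_format='{protocol} {from_disk_cache}',
                         where_statement=r'https?\S* True'):
    print(line)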
Example #3
def LoadRemoteTrace(storage_accessor, remote_trace_path, logger):
    """Loads and returns the LoadingTrace located at the remote trace path.

  Args:
    storage_accessor: (GoogleStorageAccessor) Used to download the trace from
                                              CloudStorage.
    remote_trace_path: (str) Path to the trace file.
  """

    # Cut the gs://<bucket_name> prefix from trace paths if needed.
    prefix = 'gs://%s/' % storage_accessor.BucketName()
    prefix_length = len(prefix)
    if remote_trace_path.startswith(prefix):
        remote_trace_path = remote_trace_path[prefix_length:]

    trace_string = storage_accessor.DownloadAsString(remote_trace_path)
    if not trace_string:
        logger.error('Failed to download: ' + remote_trace_path)
        return None

    trace_dict = json.loads(trace_string)
    if not trace_dict:
        logger.error('Failed to parse: ' + remote_trace_path)
        return None

    trace = LoadingTrace.FromJsonDict(trace_dict)
    if not trace:
        logger.error('Invalid format for: ' + remote_trace_path)
        return None

    return trace
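
# The gs:// prefix stripping above can be exercised in isolation. This
# standalone sketch mirrors that logic; the bucket name and paths are
# made-up examples.
def _StripBucketPrefix(remote_trace_path, bucket_name):
    prefix = 'gs://%s/' % bucket_name
    if remote_trace_path.startswith(prefix):
        return remote_trace_path[len(prefix):]
    return remote_trace_path

assert _StripBucketPrefix('gs://my-bucket/traces/0/trace.json',
                          'my-bucket') == 'traces/0/trace.json'
assert _StripBucketPrefix('traces/0/trace.json',
                          'my-bucket') == 'traces/0/trace.json'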
def ReadSubresourceMapFromBenchmarkOutput(benchmark_output_directory_path):
    """Extracts a map URL-to-subresources for each navigation in benchmark
  directory.

  Args:
    benchmark_output_directory_path: Path of the benchmark output directory to
        verify.

  Returns:
    {url -> [URLs of sub-resources]}
  """
    url_subresources = {}
    run_id = -1
    while True:
        run_id += 1
        run_path = os.path.join(benchmark_output_directory_path, str(run_id))
        if not os.path.isdir(run_path):
            break
        trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME)
        if not os.path.isfile(trace_path):
            continue
        trace = LoadingTrace.FromJsonFile(trace_path)
        if trace.url in url_subresources:
            continue
        logging.info('lists resources of %s from %s', trace.url, trace_path)
        urls_set = set()
        for request_event in _FilterOutDataAndIncompleteRequests(
                trace.request_track.GetEvents()):
            if request_event.url not in urls_set:
                logging.info('  %s', request_event.url)
                urls_set.add(request_event.url)
        url_subresources[trace.url] = list(urls_set)
    return url_subresources
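
# Hypothetical usage: the loop above expects numbered run directories (0/,
# 1/, 2/, ...) under the output directory, each holding a trace file, and
# stops at the first missing run directory. The path below is a placeholder.
subresources = ReadSubresourceMapFromBenchmarkOutput('/tmp/benchmark_output')
for url, resource_urls in subresources.items():
    print('%s: %d sub-resources' % (url, len(resource_urls)))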
def PatchCacheArchive(cache_archive_path, loading_trace_path,
                      cache_archive_dest_path):
    """Patch the cache archive.

  Note: This method update the raw response headers of cache entries' to store
    the ones such as Set-Cookie that were pruned by the
    net::HttpCacheTransaction, and remove the stream index 2 holding resource's
    compile meta data.

  Args:
    cache_archive_path: Input archive's path to patch.
    loading_trace_path: Path of the loading trace that have recorded the cache
        archive <cache_archive_path>.
    cache_archive_dest_path: Archive destination's path.
  """
    trace = LoadingTrace.FromJsonFile(loading_trace_path)
    with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
        cache_path = os.path.join(tmp_path, 'cache')
        chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path)
        cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
        cache_entries = set(cache_backend.ListKeys())
        logging.info('Original cache size: %d bytes', cache_backend.GetSize())
        for request in _FilterOutDataAndIncompleteRequests(
                trace.request_track.GetEvents()):
            # On requests having an upload data stream such as POST requests,
            # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
            # the upload data stream's session unique identifier.
            #
            # It is fine to not patch these requests since when reopening Chrome,
            # there is no way the entry can be reused since the upload data stream's
            # identifier will be different.
            #
            # The fact that these entries are kept in the cache after closing Chrome
            # properly by closing the Chrome tab, as ChromeControler.SetSlowDeath()
            # does, is a known Chrome bug (crbug.com/610725).
            #
            # TODO(gabadie): Add support in ValidateCacheArchiveContent() and in
            #   VerifyBenchmarkOutputDirectory() for POST requests to be known as
            #   impossible to use from cache.
            if request.url not in cache_entries:
                if request.method != 'POST':
                    raise RuntimeError(
                        'Unexpected {} request that is not found in '
                        'the cache.'.format(request.method))
                continue
            # Chrome prunes Set-Cookie from response headers before storing them
            # in the disk cache. It also adds an implicit "Vary: cookie" header to
            # all redirect response headers. Sandwich manages the cache, but the
            # cookie jar is invalidated between recording the cache and
            # benchmarking, which invalidates all cacheable redirects.
            raw_headers = request.GetRawResponseHeaders()
            cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
            # NoState-Prefetch would only fetch the resources, but not parse them.
            cache_backend.DeleteStreamForKey(request.url, 2)
        chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
        logging.info('Patched cache size: %d bytes', cache_backend.GetSize())
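
# Hypothetical invocation of PatchCacheArchive(); all three paths are
# placeholders. The input archive is left untouched and the patched copy is
# written to the destination path.
PatchCacheArchive(cache_archive_path='original_cache.zip',
                  loading_trace_path='loading_trace.json',
                  cache_archive_dest_path='patched_cache.zip')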
def ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer):
    """Extracts discoverable resource urls from a loading trace according to a
  sub-resource discoverer.

  Args:
    loading_trace_path: The loading trace's path.
    subresource_discoverer: The sub-resources discoverer that should white-list
      the resources to keep in cache for the NoState-Prefetch benchmarks.

  Returns:
    A set of urls.
  """
    assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, \
        'unknown prefetch simulation {}'.format(subresource_discoverer)

    # Load trace and related infos.
    logging.info('loading %s', loading_trace_path)
    trace = LoadingTrace.FromJsonFile(loading_trace_path)
    dependencies_lens = RequestDependencyLens(trace)
    first_resource_request = trace.request_track.GetFirstResourceRequest()

    # Build the list of discovered requests according to the desired simulation.
    discovered_requests = []
    if subresource_discoverer == EMPTY_CACHE_DISCOVERER:
        pass
    elif subresource_discoverer == FULL_CACHE_DISCOVERER:
        discovered_requests = trace.request_track.GetEvents()
    elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER:
        discovered_requests = \
            [dependencies_lens.GetRedirectChain(first_resource_request)[-1]]
    elif subresource_discoverer == PARSER_DISCOVERER:
        discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests(
            first_resource_request, dependencies_lens)
    elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER:
        discovered_requests = PrefetchSimulationView.PreloadedRequests(
            first_resource_request, dependencies_lens, trace)
    else:
        assert False

    whitelisted_urls = set()
    logging.info('white-listing %s', first_resource_request.url)
    for request in _FilterOutDataAndIncompleteRequests(discovered_requests):
        logging.info('white-listing %s', request.url)
        whitelisted_urls.add(request.url)
    return whitelisted_urls
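
# Hypothetical usage: compare the URL sets white-listed by two discoverers.
# The trace path is a placeholder; the discoverer constants come from the
# surrounding module (see SUBRESOURCE_DISCOVERERS above).
parser_urls = ExtractDiscoverableUrls('loading_trace.json', PARSER_DISCOVERER)
preload_urls = ExtractDiscoverableUrls('loading_trace.json',
                                       HTML_PRELOAD_SCANNER_DISCOVERER)
print('%d URLs discovered by the parser but not the preload scanner' %
      len(parser_urls - preload_urls))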
  @classmethod
  def CreateLoadingTrace(cls, trace_events=None):
    # This creates a set of requests with the following dependency structure.
    #
    # 1234.redirect.1 -> 1234.redirect.2
    # 1234.redirect.2 -> 1234.1
    # 1234.1 -> 1234.12
    # 1234.1 -> 1234.42
    # 1234.1 -> 1234.56
    # 1234.12 -> 1234.13

    trace = test_utils.LoadingTraceFromEvents(
        [cls.FIRST_REDIRECT_REQUEST, cls.SECOND_REDIRECT_REQUEST,
         cls.REDIRECTED_REQUEST, cls.REQUEST, cls.JS_REQUEST, cls.JS_REQUEST_2,
         cls.JS_REQUEST_OTHER_FRAME, cls.JS_REQUEST_UNRELATED_FRAME],
        cls.PAGE_EVENTS, trace_events)
    # Serialize and deserialize so that clients can change events without
    # affecting future tests.
    return LoadingTrace.FromJsonDict(trace.ToJsonDict())
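
# The ToJsonDict()/FromJsonDict() round-trip above hands each caller an
# independent copy, so tests can mutate the returned trace without affecting
# the shared class-level events. A minimal sketch of the same isolation idea
# using a plain JSON round-trip (the dict is a made-up stand-in for a trace):
import json

_template = {'events': [{'url': 'http://a.com/'}]}
_copy = json.loads(json.dumps(_template))
_copy['events'][0]['url'] = 'http://b.com/'
assert _template['events'][0]['url'] == 'http://a.com/'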
def VerifyBenchmarkOutputDirectory(benchmark_setup_path,
                                   benchmark_output_directory_path):
    """Verifies that all run inside the run_output_directory worked as expected.

  Args:
    benchmark_setup_path: Path of the JSON of the benchmark setup.
    benchmark_output_directory_path: Path of the benchmark output directory to
        verify.
  """
    # TODO(gabadie): What's the best way of propagating errors happening in here?
    with open(benchmark_setup_path) as benchmark_setup_file:
        benchmark_setup = json.load(benchmark_setup_file)
    cache_whitelist = set(benchmark_setup['cache_whitelist'])
    original_requests = set(benchmark_setup['url_resources'])
    original_cached_requests = original_requests.intersection(cache_whitelist)
    original_uncached_requests = original_requests.difference(cache_whitelist)
    all_sent_url_requests = set()

    # Verify requests from traces.
    run_id = -1
    while True:
        run_id += 1
        run_path = os.path.join(benchmark_output_directory_path, str(run_id))
        if not os.path.isdir(run_path):
            break
        trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME)
        if not os.path.isfile(trace_path):
            logging.error('missing trace %s', trace_path)
            continue
        trace = LoadingTrace.FromJsonFile(trace_path)
        logging.info('verifying %s from %s', trace.url, trace_path)

        effective_requests = ListUrlRequests(trace, RequestOutcome.All)
        effective_cached_requests = \
            ListUrlRequests(trace, RequestOutcome.ServedFromCache)
        effective_uncached_requests = \
            ListUrlRequests(trace, RequestOutcome.NotServedFromCache)

        missing_requests = original_requests.difference(effective_requests)
        unexpected_requests = effective_requests.difference(original_requests)
        expected_cached_requests = \
            original_cached_requests.difference(missing_requests)
        missing_cached_requests = \
            expected_cached_requests.difference(effective_cached_requests)
        expected_uncached_requests = original_uncached_requests.union(
            unexpected_requests).union(missing_cached_requests)
        all_sent_url_requests.update(effective_uncached_requests)

        _PrintUrlSetComparison(original_requests, effective_requests,
                               'All resources')
        _PrintUrlSetComparison(expected_cached_requests,
                               effective_cached_requests, 'Cached resources')
        _PrintUrlSetComparison(expected_uncached_requests,
                               effective_uncached_requests,
                               'Non-cached resources')

    # Verify requests from WPR.
    wpr_log_path = os.path.join(benchmark_output_directory_path,
                                sandwich_runner.WPR_LOG_FILENAME)
    logging.info('verifying requests from %s', wpr_log_path)
    all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path)
    all_wpr_urls = set()
    unserved_wpr_urls = set()
    wpr_command_colliding_urls = set()

    for request in all_wpr_requests:
        if request.is_wpr_host:
            continue
        if urlparse(request.url).path.startswith('/web-page-replay'):
            wpr_command_colliding_urls.add(request.url)
        elif request.is_served is False:
            unserved_wpr_urls.add(request.url)
        all_wpr_urls.add(request.url)

    _PrintUrlSetComparison(set(), unserved_wpr_urls,
                           'Distinct unserved resources from WPR')
    _PrintUrlSetComparison(set(), wpr_command_colliding_urls,
                           'Distinct resources colliding to WPR commands')
    _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests,
                           'Distinct resource requests to WPR')
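
# Hypothetical invocation of VerifyBenchmarkOutputDirectory(); both paths are
# placeholders. The function logs URL-set comparisons rather than returning a
# value, so discrepancies must be read from the logging output.
VerifyBenchmarkOutputDirectory('benchmark_setup.json', '/tmp/benchmark_output')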