def _extract_meta(minfo, timestamp):
    """ Build the meta data dictionary for one request/response pair.

        Keys produced (all lowercase; lookups are case-sensitive):

        uri - canonicalized request path; when longer than g_maxuri it is
              cut to g_maxuri characters and '...' is appended
        date - the timestamp argument, stored as given
        etag or last-modified - from the response headers; etag is preferred.
              When neither header has a value, etag is set to "W/<clen>",
              using the content length as a weak validator
        content-type - from the response headers (for its charset), '' if absent
        referer - from the request headers, '' if absent
    """

    canonical = httputil.canonicalize(minfo.req_path)
    if len(canonical) > g_maxuri:
        canonical = canonical[:g_maxuri] + '...'

    meta = {
        'uri': canonical,
        'date': timestamp,
    }

    # Record the first validator header that carries a value; etag is tried
    # before last-modified, and the second header is only read when needed.
    for header in ('etag', 'last-modified'):
        validator = minfo.rsp_headers.get(header, '')
        if validator:
            meta[header] = validator
            break
    else:
        # Neither header present: fall back to content-length as a weak etag.
        meta['etag'] = "W/%s" % minfo.clen

    meta['content-type'] = minfo.rsp_headers.get('content-type', '')
    meta['referer'] = minfo.req_headers.get('referer', '')

    return meta
예제 #2
0
 def _append(self, parent, baseuri, uri, ctype, tag):
     """ Register the resource referenced by uri and queue it for fetching.

         uri is resolved against baseuri with urlparse.urljoin and then
         canonicalized. If the resulting absolute URI is already in
         self.uri_set, nothing happens (repeated resources are skipped
         silently). Otherwise a Resource is created, attached to
         parent.children when parent is truthy, appended to
         self.resource_list, recorded in self.uri_set and handed to
         self.fetcher.queue().
     """
     target = httputil.canonicalize(urlparse.urljoin(baseuri, uri))    # TODO: test
     if target not in self.uri_set:
         resource = Resource(parent, uri=target, ctype=ctype, tag=tag)
         if parent:
             parent.children.append(resource)
         self.resource_list.append(resource)
         self.uri_set.add(target)
         self.fetcher.queue(resource)