def _extract_meta(minfo, timestamp):
    """
    Build the meta data dictionary describing one request/response pair.

    Entries (keys are lowercase and lookups are case-sensitive):
      uri           - canonicalized request path; when longer than
                      g_maxuri it is cut to g_maxuri characters and
                      '...' is appended
      date          - the timestamp argument, stored as given
      etag          - response 'etag' header, when non-empty
      last-modified - response 'last-modified' header, only used when
                      there is no etag
      content-type  - response 'content-type' header (for its charset),
                      '' when absent
      referer       - request 'referer' header, '' when absent

    When the response carries neither an etag nor a last-modified
    header, the content length (minfo.clen) is stored under 'etag' as a
    weak validator of the form "W/<clen>".
    """
    target = httputil.canonicalize(minfo.req_path)
    if len(target) > g_maxuri:
        target = target[:g_maxuri] + '...'
    info = {'uri': target, 'date': timestamp}

    headers = minfo.rsp_headers

    # Pick exactly one validator: etag first, then last-modified, and
    # finally the content length as a weak etag.  last-modified is only
    # looked up when there is no etag.
    etag = headers.get('etag', '')
    modified = '' if etag else headers.get('last-modified', '')
    if etag:
        info['etag'] = etag
    elif modified:
        info['last-modified'] = modified
    else:
        info['etag'] = "W/%s" % minfo.clen

    info['content-type'] = headers.get('content-type', '')
    # referer comes from the request headers, not the response
    info['referer'] = minfo.req_headers.get('referer', '')
    return info
def _append(self, parent, baseuri, uri, ctype, tag):
    """
    Register a resource and queue it for fetching, unless already known.

    uri is joined onto baseuri and canonicalized; the result is the key
    checked against (and later added to) self.uri_set.  A URI that is
    already in the set is skipped silently and nothing else happens.

    Otherwise a Resource is created with the given parent, ctype and
    tag, appended to parent.children when parent is truthy, appended to
    self.resource_list, and handed to self.fetcher.queue().

    Always returns None.
    """
    # TODO: test
    target = httputil.canonicalize(urlparse.urljoin(baseuri, uri))

    if target not in self.uri_set:
        node = Resource(parent, uri=target, ctype=ctype, tag=tag)
        if parent:
            parent.children.append(node)
        self.resource_list.append(node)
        self.uri_set.add(target)
        self.fetcher.queue(node)