def links(self):
    """Yield a Link for each URL-carrying tag in the parsed document.

    Every node produced by ``self._bs.recursiveChildGenerator()`` is
    visited.  Only tags whose name is a key of ``self.urltags`` (a mapping
    of tag name to the attribute that holds the URL) or ``base`` are
    considered.

    A ``base`` tag with an ``href`` attribute replaces the base URL used
    for every link yielded after it; the ``base`` tag itself is never
    yielded.  Tags whose URL attribute is missing or empty are skipped.

    The text passed to ``Link`` is ``""`` for an ``a`` tag without text
    and ``None`` for any other tag without text.  Otherwise the text
    fragments are joined with single spaces, stripped, and every match of
    ``self.compress_re`` is replaced by one space.
    """
    import _beautifulsoup

    bs = self._bs
    base_url = self._base_url
    encoding = self._encoding
    urltags = self.urltags
    # Build the set of interesting tag names once, instead of
    # concatenating a fresh list for every node of the tree.
    wanted = set(urltags)
    wanted.add("base")
    for ch in bs.recursiveChildGenerator():
        if not (isinstance(ch, _beautifulsoup.Tag) and ch.name in wanted):
            continue
        link = ch
        attrs = bs.unescape_attrs(link.attrs)
        attrs_dict = dict(attrs)
        if link.name == "base":
            base_href = attrs_dict.get("href")
            if base_href is not None:
                base_url = base_href
            continue
        url_attr = urltags[link.name]
        url = attrs_dict.get(url_attr)
        if not url:
            continue
        url = _rfc3986.clean_url(url, encoding)
        text = link.fetchText(lambda t: True)
        if not text:
            # follow _pullparser's weird behaviour rigidly
            if link.name == "a":
                text = ""
            else:
                text = None
        else:
            text = self.compress_re.sub(" ", " ".join(text).strip())
        yield Link(base_url, url, text, link.name, attrs)
def links(self, urltags=None):
    """Yield a Link for each URL-carrying tag in the parsed document.

    ``urltags`` maps a tag name to the attribute that holds its URL; when
    it is None, ``self.urltags`` is used.

    Every node produced by ``self._bs.recursiveChildGenerator()`` is
    visited.  Only tags named in ``urltags``, plus ``base``, are
    considered.  A ``base`` tag with an ``href`` attribute replaces the
    base URL used for every link yielded after it; the ``base`` tag itself
    is never yielded.  Tags whose URL attribute is missing or empty are
    skipped.

    The text passed to ``Link`` is ``""`` for an ``a`` tag without text
    and ``None`` for any other tag without text.  Otherwise the text
    fragments are joined with single spaces, stripped, and every match of
    ``self.compress_re`` is replaced by one space.
    """
    if urltags is None:
        urltags = self.urltags
    bs = self._bs
    base_url = self._base_url
    encoding = self._encoding
    # Build the set of interesting tag names once, instead of
    # concatenating a fresh list for every node of the tree.
    wanted = set(urltags)
    wanted.add("base")
    for ch in bs.recursiveChildGenerator():
        if not (isinstance(ch, _beautifulsoup.Tag) and ch.name in wanted):
            continue
        link = ch
        attrs = bs.unescape_attrs(link.attrs)
        attrs_dict = dict(attrs)
        if link.name == "base":
            base_href = attrs_dict.get("href")
            if base_href is not None:
                base_url = base_href
            continue
        url_attr = urltags[link.name]
        url = attrs_dict.get(url_attr)
        if not url:
            continue
        url = _rfc3986.clean_url(url, encoding)
        text = link.fetchText(lambda t: True)
        if not text:
            # follow _pullparser's weird behaviour rigidly
            if link.name == "a":
                text = ""
            else:
                text = None
        else:
            text = self.compress_re.sub(" ", " ".join(text).strip())
        yield Link(base_url, url, text, link.name, attrs)
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a redirect response and return the result of opening it.

    The target is taken from the first ``location`` header, or failing
    that the first ``uri`` header.  Returns None when neither header is
    present or when ``self.redirect_request`` declines the redirect by
    returning None.

    Raises HTTPError when the target has already been visited
    ``self.max_repeats`` times, or when ``self.max_redirections`` distinct
    URLs have been recorded for this redirect chain.
    """
    # Servers sometimes (wrongly) send several Location headers, and the
    # same presumably holds for URI; only the first value is honoured.
    for header_name in ("location", "uri"):
        if header_name in headers:
            newurl = headers.getheaders(header_name)[0]
            break
    else:
        return
    newurl = _rfc3986.clean_url(newurl, "latin-1")
    newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

    # XXX Probably want to forget about the state of the current
    # request, although that might interact poorly with other
    # handlers that also use handler-specific request attributes
    new = self.redirect_request(newurl, req, fp, code, msg, headers)
    if new is None:
        return

    # Loop detection: .redirect_dict maps every URL visited so far in
    # this chain to its visit count, and is shared by all requests in
    # the chain.
    if not hasattr(req, "redirect_dict"):
        visited = new.redirect_dict = req.redirect_dict = {}
    else:
        visited = new.redirect_dict = req.redirect_dict
        if (visited.get(newurl, 0) >= self.max_repeats
                or len(visited) >= self.max_redirections):
            raise HTTPError(
                req.get_full_url(), code, self.inf_msg + msg, headers, fp)
    visited[newurl] = visited.get(newurl, 0) + 1

    # The fp is only drained and closed here, once it is certain that
    # it will not be handed to HTTPError.
    fp.read()
    fp.close()

    return self.parent.open(new)
def links(self, urltags=None):
    """Return an iterator that provides links of the document.

    ``urltags`` maps a tag name to the attribute that holds its URL; when
    it is None, ``self.urltags`` is used.

    The response is rewound to its start and tokenised with
    ``self.link_parser_class``.  A ``base`` tag with an ``href`` attribute
    replaces the base URL used for every link yielded after it.  Tags
    without a (non-empty) URL attribute are skipped.  Only ``a`` tags that
    are not self-closing get link text; every other link has text None.

    Raises _form.ParseError when the parser raises
    sgmllib.SGMLParseError.
    """
    if urltags is None:
        urltags = self.urltags
    response = self._response
    encoding = self._encoding
    base_url = self._base_url
    response.seek(0)
    p = self.link_parser_class(response, encoding=encoding)
    try:
        # list(urltags) rather than urltags.keys() + [...]: the latter
        # only works where keys() returns a real list.
        for token in p.tags(*(list(urltags) + ["base"])):
            if token.type == "endtag":
                continue
            attrs = dict(token.attrs)
            if token.data == "base":
                base_href = attrs.get("href")
                if base_href is not None:
                    base_url = base_href
                continue
            tag = token.data
            text = None
            # XXX use attr_encoding for ref'd doc if that doc does not
            #  provide one by other means
            # attr_encoding = attrs.get("charset")
            url = attrs.get(urltags[tag])  # XXX is "" a valid URL?
            if not url:
                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                # For our purposes a link is something with a URL, so
                # ignore this.
                continue

            url = _rfc3986.clean_url(url, encoding)
            if tag == "a":
                if token.type != "startendtag":
                    # hmm, this'd break if end tag is missing
                    text = p.get_compressed_text(("endtag", tag))
                # but this doesn't work for e.g.
                # <a href="blah"><b>Andy</b></a>
                # text = p.get_compressed_text()

            yield Link(base_url, url, text, tag, token.attrs)
    except sgmllib.SGMLParseError as exc:
        raise _form.ParseError(exc)
def links(self):
    """Return an iterator that provides links of the document.

    The response is tokenised with ``self.link_parser_class``, asking for
    every tag named in ``self.urltags`` (a mapping of tag name to the
    attribute that holds its URL) plus ``base``.  A ``base`` tag with an
    ``href`` attribute replaces the base URL used for every link yielded
    after it.  Tags without a (non-empty) URL attribute are skipped.  Only
    ``a`` tags that are not self-closing get link text; every other link
    has text None.

    Raises ParseError when the parser raises sgmllib.SGMLParseError.
    """
    response = self._response
    encoding = self._encoding
    base_url = self._base_url
    urltags = self.urltags
    p = self.link_parser_class(response, encoding=encoding)
    try:
        # list(urltags) rather than urltags.keys() + [...]: the latter
        # only works where keys() returns a real list.
        for token in p.tags(*(list(urltags) + ["base"])):
            if token.type == "endtag":
                continue
            attrs = dict(token.attrs)
            if token.data == "base":
                base_href = attrs.get("href")
                if base_href is not None:
                    base_url = base_href
                continue
            tag = token.data
            text = None
            # XXX use attr_encoding for ref'd doc if that doc does not
            #  provide one by other means
            # attr_encoding = attrs.get("charset")
            url = attrs.get(urltags[tag])  # XXX is "" a valid URL?
            if not url:
                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                # For our purposes a link is something with a URL, so
                # ignore this.
                continue

            url = _rfc3986.clean_url(url, encoding)
            if tag == "a":
                if token.type != "startendtag":
                    # hmm, this'd break if end tag is missing
                    text = p.get_compressed_text(("endtag", tag))
                # but this doesn't work for eg.
                # <a href="blah"><b>Andy</b></a>
                # text = p.get_compressed_text()

            yield Link(base_url, url, text, tag, token.attrs)
    except sgmllib.SGMLParseError as exc:
        raise ParseError(exc)
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a redirect response and return the result of opening it.

    The target is taken from the first ``location`` header, or failing
    that the first ``uri`` header.  Returns None when neither header is
    present or when ``self.redirect_request`` declines the redirect by
    returning None.

    Raises HTTPError when the target has already been visited
    ``self.max_repeats`` times, or when ``self.max_redirections`` distinct
    URLs have been recorded for this redirect chain.
    """
    # Some servers (incorrectly) return multiple Location headers
    # (so probably same goes for URI).  Use first header.
    # Membership is tested with "in" rather than the deprecated
    # has_key(), which plain mappings no longer provide on Python 3.
    if "location" in headers:
        newurl = headers.getheaders("location")[0]
    elif "uri" in headers:
        newurl = headers.getheaders("uri")[0]
    else:
        return
    newurl = _rfc3986.clean_url(newurl, "latin-1")
    newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

    # XXX Probably want to forget about the state of the current
    # request, although that might interact poorly with other
    # handlers that also use handler-specific request attributes
    new = self.redirect_request(newurl, req, fp, code, msg, headers)
    if new is None:
        return

    # loop detection
    # .redirect_dict has a key url if url was previously visited.
    if hasattr(req, "redirect_dict"):
        visited = new.redirect_dict = req.redirect_dict
        if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
            raise HTTPError(req.get_full_url(), code,
                            self.inf_msg + msg, headers, fp)
    else:
        visited = new.redirect_dict = req.redirect_dict = {}
    visited[newurl] = visited.get(newurl, 0) + 1

    # Don't close the fp until we are sure that we won't use it
    # with HTTPError.
    fp.read()
    fp.close()

    return self.parent.open(new)
def clean_refresh_url(url):
    """Strip one pair of matching surrounding quotes, then clean the URL.

    A URL that both starts and ends with the same quote character (single
    or double) has its first and last characters removed; e.g. Firefox 1.5
    does (something like) this.  The result is passed through
    ``_rfc3986.clean_url`` with the "latin-1" encoding.
    """
    first, last = url[:1], url[-1:]
    if first == last and first in ('"', "'"):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding