示例#1
0
 def title(self):
     import _pullparser
     p = _pullparser.TolerantPullParser(self._response,
                                        encoding=self._encoding)
     try:
         try:
             p.get_tag("title")
         except _pullparser.NoMoreTokensError:
             return None
         else:
             return self._get_title_text(p)
     except sgmllib.SGMLParseError, exc:
         raise _form.ParseError(exc)
示例#2
0
    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            for token in p.tags(*(self.urltags.keys() + ["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)