Exemplo n.º 1
0
    def get_text(self, endat=None):
        """Collect text from the token stream up to a tag.

        endat: optional (type, name) tuple naming the tag to stop at, where
         type is "starttag", "endtag" or "startendtag" and name is the
         element name (compared for equality against the token's data, so it
         must be spelled exactly as the tokenizer reports it).

        Tokens are read with .get_token() until one of these happens:

        * endat is None and any start, end or start-end tag is read;
        * endat is given and a tag equal to it is read;
        * .get_token() raises NoMoreTokensError (the exception is swallowed).

        The tag that stops the scan is pushed back with .unget_token(), but
        its text representation (if any) has already been added to the
        result.  Calling .get_text() again without consuming that tag first
        will therefore stop immediately on the same tag.

        Token handling:

        * "data" tokens contribute their data unchanged;
        * "entityref" tokens are rendered as "&name;" and passed through
          unescape() with self._entitydefs and self.encoding;
        * "charref" tokens are passed through unescape_charref() with
          self.encoding;
        * opening tags ("starttag"/"startendtag") are looked up by name in
          self.textify.  A callable entry is called with the token and its
          result is added.  Any other non-None entry is treated as an
          attribute name: when the token has attributes, the value of every
          attribute with that name is added, followed by "[TAGNAME]" (the
          element name upper-cased).
        * all other token types are skipped.

        Returns the collected pieces joined into one string.

        """
        tag_types = ("starttag", "endtag", "startendtag")
        opening_types = ("starttag", "startendtag")
        pieces = []
        token = None
        while True:
            try:
                token = self.get_token()
            except NoMoreTokensError:
                # The stream is exhausted: push back the most recently read
                # token (there is no "failed" token to push back).
                if token:
                    self.unget_token(token)
                break

            kind = token.type
            if kind == "data":
                pieces.append(token.data)
                continue
            if kind == "entityref":
                pieces.append(unescape("&%s;" % token.data,
                                       self._entitydefs, self.encoding))
                continue
            if kind == "charref":
                pieces.append(unescape_charref(token.data, self.encoding))
                continue
            if kind not in tag_types:
                # Not text and not a tag (e.g. comments): nothing to add.
                continue

            name = token.data
            if kind in opening_types:
                substitute = self.textify.get(name)
                if substitute is not None:
                    if callable(substitute):
                        pieces.append(substitute(token))
                    elif token.attrs is not None:
                        # substitute names an attribute whose value stands in
                        # for the tag; the [TAGNAME] marker always follows.
                        pieces.extend(
                            value for key, value in token.attrs
                            if key == substitute)
                        pieces.append("[%s]" % name.upper())

            if endat is None or endat == (kind, name):
                # Leave the terminating tag in the stream for the caller.
                self.unget_token(token)
                break
        return "".join(pieces)
Exemplo n.º 2
0
    def get_text(self, endat=None):
        """Return the text found before the next (or a specific) tag.

        endat: when given, a (type, name) tuple identifying the tag at which
         reading stops; type is "starttag", "endtag" or "startendtag" and
         name is the element name as it appears in the token's data.  When
         omitted, reading stops at the first start, end or start-end tag.

        Reading also stops, without raising, when .get_token() raises
        NoMoreTokensError.

        The tag that ends the scan is returned to the stream with
        .unget_token() after any text representation of it has been added to
        the result, so a second .get_text() call made without first
        consuming that tag stops on it again straight away.

        How each token contributes to the result:

        * "data": its data, verbatim;
        * "entityref": unescape("&name;", self._entitydefs, self.encoding);
        * "charref": unescape_charref(data, self.encoding);
        * "starttag"/"startendtag": the self.textify entry for the element
          name decides.  A callable is invoked with the token and its return
          value is used.  Otherwise the entry is an attribute name: if the
          token carries attributes, each matching attribute value is added
          and then "[TAGNAME]" (upper-cased element name);
        * anything else: ignored.

        """
        chunks = []
        current = None
        while True:
            try:
                current = self.get_token()
            except NoMoreTokensError:
                # Out of tokens: put back the last token that was read
                # successfully, then finish with what has been gathered.
                if current:
                    self.unget_token(current)
                return "".join(chunks)

            if current.type == "data":
                chunks.append(current.data)
            elif current.type == "entityref":
                entity = "&%s;" % current.data
                chunks.append(
                    unescape(entity, self._entitydefs, self.encoding))
            elif current.type == "charref":
                chunks.append(unescape_charref(current.data, self.encoding))
            elif current.type in ("starttag", "endtag", "startendtag"):
                element = current.data
                if current.type in ("starttag", "startendtag"):
                    replacement = self.textify.get(element)
                    if replacement is None:
                        pass
                    elif callable(replacement):
                        chunks.append(replacement(current))
                    elif current.attrs is not None:
                        # replacement is an attribute name here.
                        for attr_name, attr_value in current.attrs:
                            if attr_name == replacement:
                                chunks.append(attr_value)
                        chunks.append("[%s]" % element.upper())
                stop_here = endat is None or endat == (current.type, element)
                if stop_here:
                    # Hand the terminating tag back to the stream.
                    self.unget_token(current)
                    return "".join(chunks)
Exemplo n.º 3
0
 def unescape_attr(self, name):
     """Return *name* run through unescape().

     The call uses this object's entity definitions (self._entitydefs) and
     encoding (self._encoding).
     """
     entitydefs = self._entitydefs
     encoding = self._encoding
     return unescape(name, entitydefs, encoding)
Exemplo n.º 4
0
 def handle_entityref(self, name):
     """Forward a named entity reference to handle_data() as text.

     The reference is rebuilt as "&name;", passed through unescape() with
     self._entitydefs and self._encoding, and the result is handed to
     self.handle_data().
     """
     reference = '&%s;' % name
     text = unescape(reference, self._entitydefs, self._encoding)
     self.handle_data(text)
Exemplo n.º 5
0
 def unescape_attr(self, name):
     """Return *name* run through unescape().

     The call uses this object's entity definitions (self._entitydefs) and
     encoding (self.encoding).
     """
     entitydefs = self._entitydefs
     encoding = self.encoding
     return unescape(name, entitydefs, encoding)
Exemplo n.º 6
0
 def unescape_attr(self, name):
     """Unescape *name* and return the result.

     Delegates to unescape(), supplying self._entitydefs as the entity
     mapping and self.encoding as the encoding.
     """
     unescaped = unescape(name, self._entitydefs, self.encoding)
     return unescaped