def get_text(self, endat=None):
    """Collect text from the token stream up to a stopping tag.

    endat: tag at which to stop reading, given as a tuple (type, name)
     where type is "starttag", "endtag" or "startendtag" and name is the
     element name of the tag (element names must be given in lower case).
     Any text representation of that tag is included in the result.

    If endat is not given, reading stops at the next opening or closing
    tag, or when there are no more tokens (no exception is raised).

    The tag that stops the scan is pushed back with .unget_token() after
    its text representation (if any) has been added to the result.  As a
    result, to call .get_text() again you need to call .get_tag() first
    (unless you want an empty string returned by the next .get_text()).

    "entityref" tokens are translated by unescape() using self._entitydefs
    and self.encoding; "charref" tokens are translated by
    unescape_charref() using self.encoding.

    Opening tags ("starttag" / "startendtag") are turned into text through
    the textify mapping, keyed by tag name:
     - a callable value is called with the token and its result is used;
     - any other value is treated as an attribute name: when the token has
       an attribute list, the values of the matching attributes are added,
       followed by the upper-cased tag name in square brackets.

    Returns the collected pieces joined into a single string.

    """
    pieces = []
    token = None
    while True:
        try:
            token = self.get_token()
        except NoMoreTokensError:
            # The stream is exhausted: push back the last token that was
            # read successfully (not the one we just failed to get).
            if token:
                self.unget_token(token)
            break

        kind = token.type
        if kind == "data":
            pieces.append(token.data)
            continue
        if kind == "entityref":
            pieces.append(
                unescape("&%s;" % token.data, self._entitydefs, self.encoding))
            continue
        if kind == "charref":
            pieces.append(unescape_charref(token.data, self.encoding))
            continue
        if kind not in ("starttag", "endtag", "startendtag"):
            # Any other token type contributes no text.
            continue

        tag_name = token.data
        if kind != "endtag":
            # Only opening tags have a textual representation.
            alt = self.textify.get(tag_name)
            if alt is not None:
                if callable(alt):
                    pieces.append(alt(token))
                elif token.attrs is not None:
                    pieces.extend(
                        value for key, value in token.attrs if key == alt)
                    pieces.append("[%s]" % tag_name.upper())

        if endat is None or endat == (kind, tag_name):
            # Leave the stopping tag on the stack for the caller.
            self.unget_token(token)
            break

    return "".join(pieces)
def get_text(self, endat=None):
    """Get some text.

    endat: stop reading text at this tag (the tag is included in the
     returned text); endat is a tuple (type, name) where type is
     "starttag", "endtag" or "startendtag", and name is the element name
     of the tag (element names must be given in lower case)

    Without endat, reading stops at the next opening or closing tag, or
    when the tokens run out (no exception is raised in that case).

    The text representation (if any) of the tag that ends the scan is part
    of the returned string, but the tag itself is pushed back onto the
    stack.  Call .get_tag() before calling .get_text() again, otherwise
    the next call returns an empty string.

    Entity references go through unescape() with self._entitydefs and
    self.encoding; character references go through unescape_charref()
    with self.encoding.

    The textify attribute maps tag names to the way an opening tag is
    rendered as text: either a callable taking the token, or the name of
    an attribute whose value(s) are emitted followed by "[TAGNAME]".

    """
    opening_types = ("starttag", "startendtag")
    tag_types = opening_types + ("endtag",)
    chunks = []
    current = None
    while True:
        try:
            current = self.get_token()
        except NoMoreTokensError:
            # Out of tokens: restore the last token actually obtained
            # (not the one we just failed to get) and finish.
            if current:
                self.unget_token(current)
            return "".join(chunks)

        token_type = current.type
        if token_type in tag_types:
            element = current.data
            if token_type in opening_types:
                rendering = self.textify.get(element)
                if rendering is None:
                    pass
                elif callable(rendering):
                    chunks.append(rendering(current))
                elif current.attrs is not None:
                    for attr_name, attr_value in current.attrs:
                        if attr_name == rendering:
                            chunks.append(attr_value)
                    chunks.append("[%s]" % element.upper())
            if endat is None or endat == (token_type, element):
                # Put the stopping tag back for the caller to consume.
                self.unget_token(current)
                return "".join(chunks)
        elif token_type == "data":
            chunks.append(current.data)
        elif token_type == "entityref":
            translated = unescape(
                "&%s;" % current.data, self._entitydefs, self.encoding)
            chunks.append(translated)
        elif token_type == "charref":
            translated = unescape_charref(current.data, self.encoding)
            chunks.append(translated)
def unescape_attr(self, name):
    """Return the result of unescape() applied to *name*.

    The call is made with this object's ``_entitydefs`` mapping and
    ``_encoding`` value.
    """
    entitydefs = self._entitydefs
    encoding = self._encoding
    return unescape(name, entitydefs, encoding)
def handle_entityref(self, name):
    """Forward a named entity reference to handle_data() as text.

    The reference is rebuilt as ``&name;``, passed through unescape() with
    this object's ``_entitydefs`` and ``_encoding``, and the result is
    handed to ``self.handle_data``.
    """
    reference = '&%s;' % name
    text = unescape(reference, self._entitydefs, self._encoding)
    self.handle_data(text)
def unescape_attr(self, name):
    """Return the result of unescape() applied to *name*.

    The call is made with this object's ``_entitydefs`` mapping and its
    ``encoding`` attribute.
    """
    entitydefs = self._entitydefs
    encoding = self.encoding
    return unescape(name, entitydefs, encoding)