def unknown_starttag(self, tag, attrs):
    """Re-emit an otherwise unhandled start tag into ``self.pieces``.

    ``attrs`` is a list of ``(attr, value)`` tuples; for example
    ``<pre class="screen">`` arrives as ``tag="pre"`` and
    ``attrs=[("class", "screen")]``.  The parser has already lower-cased
    the attribute names, stripped the quotes around the values and
    replaced character and entity references (starting with Python 2.6,
    all entity references from htmlentitydefs are replaced in the
    attribute values), so each value is passed through ``escape_html``
    again before it is written back out.
    """
    rendered_attrs = "".join(
        ' %s="%s"' % (name, escape_html(raw_value))
        for name, raw_value in attrs
    )

    # Only void elements get the self-closing form, and only in XHTML mode.
    if not self.xhtml_mode or tag not in self.void_elements:
        markup = "<%s%s>" % (tag, rendered_attrs)
    else:
        markup = "<%s%s />" % (tag, rendered_attrs)
    self.pieces.append(markup)
def get_entry_content(entry):
    """Select the content to use for a feed entry.

    The candidates are the items of ``entry['content']`` (when present)
    followed by ``entry.summary_detail`` (when present).  The first
    candidate is the one that is returned.

    Returns a ``(type, value)`` tuple.  When the candidate's type does not
    contain ``'html'`` the value is passed through ``escape_html``.  When
    there is no candidate at all, ``('', '')`` is returned.
    """
    # Work on a copy: entry.get() hands back the entry's own list, and
    # appending the summary to it would grow entry['content'] on every call.
    candidates = list(entry.get('content') or [])
    if candidates:
        log.debug('content found for entry %s', entry.link)
    if 'summary_detail' in entry:
        log.debug('summary found for entry %s', entry.link)
        candidates.append(entry.summary_detail)

    if not candidates:
        log.debug('no content found for entry %s', entry.link)
        return '', ''

    chosen = candidates[0]
    if 'html' in chosen.type:  # Match text/html, application/xhtml+xml
        return chosen.type, chosen.value
    # If the content is declared to be (or is determined to be) text/plain,
    # it will not be sanitized by Feedparser. This is to avoid data loss.
    return chosen.type, escape_html(chosen.value)