示例#1
0
 def unknown_starttag(self, tag, attrs):
     """
     Re-serialize an unhandled start tag and append it to self.pieces.

     attrs is a list of (attr, value) tuples, e.g. for <pre class="screen">
     tag="pre" and attrs=[("class", "screen")]. By the time this is called
     the attr name has been translated to lower case, quotes around the
     value have been removed and character and entity references have been
     replaced (starting with Python 2.6 this covers all entity references
     from htmlentitydefs), so every value is escaped again before output.
     """
     # Build the ' name="value"' fragments first, one per attribute
     rendered_attrs = "".join(
         ' %s="%s"' % (name, escape_html(raw)) for name, raw in attrs
     )
     # Void elements get the self-closing form, but only in XHTML mode
     self_closing = self.xhtml_mode and (tag in self.void_elements)
     template = "<%s%s />" if self_closing else "<%s%s>"
     self.pieces.append(template % (tag, rendered_attrs))
示例#2
0
文件: fetcher.py 项目: tewe/coldsweat
def get_entry_content(entry):
    """
    Select the best content from an entry

    Candidates are the items of the entry 'content' list (if any) followed
    by entry.summary_detail (if present). The first candidate whose type
    contains 'html' is preferred and returned as is; when there is none,
    the first candidate is returned with its value HTML-escaped.

    Returns a (type, value) tuple, or ('', '') when the entry carries
    neither content nor summary.
    """

    # Work on a copy: appending the summary to the list handed back by
    #   entry.get() would alter the entry itself, and calling this function
    #   again would then see the summary duplicated among the content items
    candidates = list(entry.get('content') or [])
    if candidates:
        log.debug('content found for entry %s' % entry.link)
    if 'summary_detail' in entry:
        log.debug('summary found for entry %s' % entry.link)
        candidates.append(entry.summary_detail)

    if not candidates:
        log.debug('no content found for entry %s' % entry.link)
        return '', ''

    # Look at every candidate before giving up on HTML, so that an HTML
    #   item is picked even when a non-HTML one precedes it
    for c in candidates:
        if 'html' in c.type: # Match text/html, application/xhtml+xml
            return c.type, c.value

    # No HTML flavour available, fall back to the first candidate.
    # If the content is declared to be (or is determined to be) text/plain,
    #   it will not be sanitized by Feedparser. This is to avoid data loss.
    fallback = candidates[0]
    return fallback.type, escape_html(fallback.value)