Пример #1
0
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0: tag = 'language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        now = time.time()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        data.entries = [
            entry for entry in data.entries
            if (not entry.has_key('published_parsed')
                or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
             or entry['updated_parsed'] <= now)
        ]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if not 'html' in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            if node['value']:
                # Run this through HTML5's sanitizer
                doc = None
                if 'xhtml' in node['type']:
                    try:
                        from xml.dom import minidom
                        doc = minidom.parseString(node['value'])
                    except:
                        node['type'] = 'text/html'

                if not doc:
                    from html5lib import html5parser, treebuilders, sanitizer
                    p = html5parser.HTMLParser(
                        tree=treebuilders.getTreeBuilder('dom'),
                        tokenizer=sanitizer.HTMLSanitizer)
                    doc = p.parseFragment(node['value'], encoding='utf-8')

                from html5lib import treewalkers, serializer
                walker = treewalkers.getTreeWalker('dom')(doc)
                xhtml = serializer.HTMLSerializer(inject_meta_charset=False)
                tree = xhtml.serialize(walker, encoding='utf-8')
                node['value'] = ''.join([str(token) for token in tree])
Пример #2
0
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0: tag = 'language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        now = time.time()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        data.entries = [
            entry for entry in data.entries
            if (not entry.has_key('published_parsed')
                or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
             or entry['updated_parsed'] <= now)
        ]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if not 'html' in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            node['value'] = feedparser._sanitizeHTML(node.value, 'utf-8',
                                                     node.type)
Пример #3
0
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang')>=0: tag='language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html')>=0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
      now = time.gmtime()
      if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
        if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
      for entry in data.entries:
        if entry.has_key('published_parsed') and entry['published_parsed']:
          if entry['published_parsed'] > now:
            del entry['published_parsed']
            del entry['published']
        if entry.has_key('updated_parsed') and entry['updated_parsed']:
          if entry['updated_parsed'] > now:
            del entry['updated_parsed']
            del entry['updated']
    elif future_dates == 'ignore_entry':
      now = time.time()
      if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
        if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
      data.entries = [entry for entry in data.entries if 
        (not entry.has_key('published_parsed') or not entry['published_parsed']
          or entry['published_parsed'] <= now) and
        (not entry.has_key('updated_parsed') or not entry['updated_parsed']
          or entry['updated_parsed'] <= now)]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content'and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if not 'html' in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            if node['value']:
                # Run this through HTML5's sanitizer
                doc = None
                if 'xhtml' in node['type']:
                    try:
                        from xml.dom import minidom
                        doc = minidom.parseString(node['value'])
                    except:
                        node['type']='text/html'

                if not doc:
                    from html5lib import html5parser, treebuilders, sanitizer
                    p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'), tokenizer=sanitizer.HTMLSanitizer)
                    doc = p.parseFragment(node['value'], encoding='utf-8')

                from html5lib import treewalkers, serializer
                walker = treewalkers.getTreeWalker('dom')(doc)
                xhtml = serializer.HTMLSerializer(inject_meta_charset = False)
                tree = xhtml.serialize(walker, encoding='utf-8')
                node['value'] = ''.join([str(token) for token in tree])
Пример #4
0
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang')>=0: tag='language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html')>=0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
      now = time.gmtime()
      if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
        if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
      for entry in data.entries:
        if entry.has_key('published_parsed') and entry['published_parsed']:
          if entry['published_parsed'] > now:
            del entry['published_parsed']
            del entry['published']
        if entry.has_key('updated_parsed') and entry['updated_parsed']:
          if entry['updated_parsed'] > now:
            del entry['updated_parsed']
            del entry['updated']
    elif future_dates == 'ignore_entry':
      now = time.time()
      if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
        if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
      data.entries = [entry for entry in data.entries if 
        (not entry.has_key('published_parsed') or not entry['published_parsed']
          or entry['published_parsed'] <= now) and
        (not entry.has_key('updated_parsed') or not entry['updated_parsed']
          or entry['updated_parsed'] <= now)]
Пример #5
0
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang')>=0: tag='language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html')>=0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
      now = time.gmtime()
      if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
        if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
      for entry in data.entries:
        if entry.has_key('published_parsed') and entry['published_parsed']:
          if entry['published_parsed'] > now:
            del entry['published_parsed']
            del entry['published']
        if entry.has_key('updated_parsed') and entry['updated_parsed']:
          if entry['updated_parsed'] > now:
            del entry['updated_parsed']
            del entry['updated']
    elif future_dates == 'ignore_entry':
      now = time.time()
      if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
        if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
      data.entries = [entry for entry in data.entries if 
        (not entry.has_key('published_parsed') or not entry['published_parsed']
          or entry['published_parsed'] <= now) and
        (not entry.has_key('updated_parsed') or not entry['updated_parsed']
          or entry['updated_parsed'] <= now)]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content'and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if not 'html' in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            node['value'] = feedparser._sanitizeHTML(
                node.value, 'utf-8', node.type)
Пример #6
0
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find("lang") >= 0:
            tag = "language"
        if data.feed.has_key(tag):
            del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag):
                del entry[tag]
            if entry.has_key(tag + "_detail"):
                del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"):
                del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith("_detail"):
                    continue
                for detail in entry[key].copy():
                    if detail == tag:
                        del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key("title_detail"):
                entry.title_detail["type"] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key("summary_detail"):
                entry.summary_detail["type"] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key("content"):
                entry.content[0]["type"] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find("html") >= 0:
        from shell.tmpl import stripHtml

        if data.feed.has_key("author_detail") and data.feed.author_detail.has_key("name"):
            data.feed.author_detail["name"] = str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key("author_detail") and entry.author_detail.has_key("name"):
                entry.author_detail["name"] = str(stripHtml(entry.author_detail.name))
            if entry.has_key("source"):
                source = entry.source
                if source.has_key("author_detail") and source.author_detail.has_key("name"):
                    source.author_detail["name"] = str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == "ignore_date":
        now = time.gmtime()
        if data.feed.has_key("updated_parsed") and data.feed["updated_parsed"]:
            if data.feed["updated_parsed"] > now:
                del data.feed["updated_parsed"]
        for entry in data.entries:
            if entry.has_key("published_parsed") and entry["published_parsed"]:
                if entry["published_parsed"] > now:
                    del entry["published_parsed"]
                    del entry["published"]
            if entry.has_key("updated_parsed") and entry["updated_parsed"]:
                if entry["updated_parsed"] > now:
                    del entry["updated_parsed"]
                    del entry["updated"]
    elif future_dates == "ignore_entry":
        now = time.time()
        if data.feed.has_key("updated_parsed") and data.feed["updated_parsed"]:
            if data.feed["updated_parsed"] > now:
                del data.feed["updated_parsed"]
        data.entries = [
            entry
            for entry in data.entries
            if (
                not entry.has_key("published_parsed")
                or not entry["published_parsed"]
                or entry["published_parsed"] <= now
            )
            and (not entry.has_key("updated_parsed") or not entry["updated_parsed"] or entry["updated_parsed"] <= now)
        ]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == "content" and not entry.has_key("content_detail"):
                node = entry.content[0]
            elif key.endswith("_detail"):
                node = entry[key]
            else:
                continue

            if not node.has_key("type"):
                continue
            if not "html" in node["type"]:
                continue
            if not node.has_key("value"):
                continue

            if node.has_key("base"):
                if scrub_xmlbase:
                    if scrub_xmlbase == "feed_alternate":
                        if entry.has_key("source") and entry.source.has_key("link"):
                            node["base"] = entry.source.link
                        elif data.feed.has_key("link"):
                            node["base"] = data.feed.link
                    elif scrub_xmlbase == "entry_alternate":
                        if entry.has_key("link"):
                            node["base"] = entry.link
                    else:
                        node["base"] = feedparser._urljoin(node["base"], scrub_xmlbase)

                node["value"] = feedparser._resolveRelativeURIs(node.value, node.base, "utf-8", node.type)

            # Run this through HTML5's sanitizer
            doc = None
            if "xhtml" in node["type"]:
                try:
                    from xml.dom import minidom

                    doc = minidom.parseString(node["value"])
                except:
                    node["type"] = "text/html"

            if not doc:
                from html5lib import html5parser, treebuilders

                p = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
                doc = p.parseFragment(node["value"], encoding="utf-8")

            from html5lib import treewalkers, serializer
            from html5lib.filters import sanitizer

            walker = sanitizer.Filter(treewalkers.getTreeWalker("dom")(doc))
            xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
            tree = xhtml.serialize(walker, encoding="utf-8")

            node["value"] = "".join([str(token) for token in tree])