# Module-level imports these methods rely on. domutils and Cache are
# project-local helpers not shown here; hedged sketches of the interfaces
# this code assumes appear further down.
import time

import BeautifulSoup  # BeautifulSoup 3 API
import domutils


def process(self, dom):
    channel = domutils.getUniqueChildbyTagName(dom, 'rss', 'channel')
    for child in channel.childNodes:
        if child.nodeType != child.ELEMENT_NODE:
            continue

        # Rebrand the channel: new title, link pointing at this project.
        if child.nodeName == 'title':
            domutils.replaceChildren(child, dom.createTextNode('Meneame (Directo)'))

        if child.nodeName == 'link':
            domutils.replaceChildren(
                child, dom.createTextNode('https://github.com/ldotlopez/feedfilter'))

        # Drop the upstream feed's self/hub links, they no longer apply.
        if child.nodeName == 'atom:link' and child.hasAttribute('rel') \
                and child.getAttribute('rel') in ('self', 'hub'):
            child.parentNode.removeChild(child)

    for item in dom.getElementsByTagName('item'):
        self.process_item(item)
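# domutils is not shown in this excerpt. A minimal sketch of what the
# calls above assume it provides, written against xml.dom.minidom; the
# bodies are guesses from usage, not the project's actual code:
def getUniqueChildbyTagName(node, *tag_names):
    # Descend one tag per argument: ('rss', 'channel') returns the
    # <channel> under <rss>. The real helper presumably also asserts
    # that each match is unique.
    for tag_name in tag_names:
        node = node.getElementsByTagName(tag_name)[0]
    return node

def replaceChildren(node, new_child):
    # Remove every existing child of `node`, leaving `new_child` as the
    # only one.
    while node.firstChild is not None:
        node.removeChild(node.firstChild)
    node.appendChild(new_child)

def getDomFromNode(node):
    # Climb to the owning Document so new nodes can be created from it.
    return node.ownerDocument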
def process_item(self, node):
    if not self.is_valid_item(node):
        return

    dom = domutils.getDomFromNode(node)

    # In Meneame's feed, <link> holds the meneame.net page and
    # <meneame:url> holds the real destination, so the variables below
    # are named after their *content*, not the tag they come from.
    # They aren't textNodes yet.
    link_node = node.getElementsByTagName('link')[0]
    meneame_url_node = node.getElementsByTagName('meneame:url')[0]
    guid_node = node.getElementsByTagName('guid')[0]

    meneame_url = link_node.childNodes[0].data
    dest_url = meneame_url_node.childNodes[0].data

    # Now swap them on the DOM: <link> and <guid> point at the destination.
    domutils.replaceChildren(link_node, dom.createTextNode(dest_url))
    domutils.replaceChildren(guid_node, dom.createTextNode(dest_url))

    # Rewrite the description: its 'noticia original' link becomes an
    # 'enlace meneame' link pointing back at meneame.net.
    description_node = node.getElementsByTagName('description')[0]
    description = description_node.childNodes[0].data
    description = description.replace('noticia original', 'enlace meneame')
    description = description.replace(dest_url, meneame_url)
    domutils.replaceChildren(description_node, dom.createCDATASection(description))
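# The same swap spelled out standalone with xml.dom.minidom, so the
# before/after is visible. Sample URLs and the meneame namespace URI are
# made up for the demo:
def _demo_swap():
    from xml.dom import minidom
    sample = ('<item xmlns:meneame="http://example.org/ns">'
              '<link>http://www.meneame.net/story/example</link>'
              '<meneame:url>http://example.com/article</meneame:url>'
              '<guid>http://www.meneame.net/story/example</guid>'
              '</item>')
    item = minidom.parseString(sample).documentElement

    link_node = item.getElementsByTagName('link')[0]
    meneame_url = link_node.childNodes[0].data
    dest_url = item.getElementsByTagName('meneame:url')[0].childNodes[0].data

    # After the swap, <link> and <guid> carry the destination URL.
    link_node.childNodes[0].data = dest_url
    item.getElementsByTagName('guid')[0].childNodes[0].data = dest_url
    print item.toxml()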
def process(self, dom):
    cache = Cache(debug=False)

    items = dom.getElementsByTagName('item')
    for i in xrange(len(items)):
        print "Item %d of %d" % (i + 1, len(items))
        item = items[i]

        link_node = item.getElementsByTagName('link')[0]
        link_url = link_node.childNodes[0].data

        # Fetch the reddit page for this item, through the cache.
        try:
            (buff, cached) = cache.fetch_url(link_url)
            soup = BeautifulSoup.BeautifulSoup(buff)
        except IOError as e:
            print "Unable to load url %s: %s" % (link_url, e)
            continue

        # Rate-limited: evict the error page from the cache and move on.
        if soup.title.text == 'Too Many Requests':
            cache.delete(link_url)
            print "Reddit is angry"
            continue

        # The submission's real target is the <a class="title"> inside
        # <p class="title">.
        try:
            real_link_url = soup.find('p', 'title').find('a', 'title').get('href')
        except AttributeError:
            cache.delete(link_url)
            print "Unable to retrieve original link for '%s'" % soup.title
            continue

        self._debug("Got real link on '%s' (%s), replacing" % (soup.title.text, real_link_url))
        domutils.replaceChildren(link_node, dom.createTextNode(real_link_url))
        domutils.replaceChildren(item.getElementsByTagName('guid')[0],
                                 dom.createTextNode(real_link_url))

        # Be polite to reddit when we actually hit the network.
        if not cached:
            time.sleep(2)
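# Cache is not shown in this excerpt either. A minimal in-memory sketch
# of the interface the method above relies on: fetch_url returns
# (body, was_cached) and raises IOError on failure, delete evicts bad
# entries. The real class presumably persists entries to disk:
import urllib2

class Cache(object):
    def __init__(self, debug=False):
        self.debug = debug
        self._store = {}

    def fetch_url(self, url):
        # Hit: return the stored body. Miss: fetch, store, return fresh.
        # urllib2.URLError subclasses IOError, so failures propagate as
        # the IOError the caller catches.
        if url in self._store:
            return (self._store[url], True)
        buff = urllib2.urlopen(url).read()
        self._store[url] = buff
        return (buff, False)

    def delete(self, url):
        # Evict a poisoned entry (rate-limit pages, parse failures).
        self._store.pop(url, None)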