def trans():
    """Return the URL list stored under ``sln_news_scraper_settings_key``.

    An empty list is returned when no settings entity is found for that key.
    """
    stored = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
    return stored.urls if stored else []
def __init__(self, service_user=None):
    """Prepare the scraper state for one service user.

    Args:
        service_user: when truthy it is stored on the instance; otherwise
            whatever ``self.service_user`` already resolves to is used.
            NOTE(review): presumably a class-level default - confirm.

    Raises:
        NewsScraperException: if ``BROADCAST_TYPE_NEWS`` is not one of the
            broadcast types of the solution settings.
    """
    if service_user:
        self.service_user = service_user

    user = self.service_user
    self.scraper_settings_key = SolutionNewsScraperSettings.create_key(user)
    self.sln_settings = get_solution_settings(user)

    available_types = self.sln_settings.broadcast_types
    if BROADCAST_TYPE_NEWS not in available_types:
        # NOTE(review): the format arguments are handed to the exception
        # un-interpolated; confirm NewsScraperException formats them itself.
        raise NewsScraperException(
            'Cannot check for news in %s because no broadcast type with name \'%s\' is found',
            self.sln_settings.name,
            BROADCAST_TYPE_NEWS)

    self.broadcast_type = transl(BROADCAST_TYPE_NEWS,
                                 self.sln_settings.main_language)
def trans():
    """Remember ``permalink`` as scraped and enqueue creation of its news item.

    The settings entity is created on first use. Nothing is written or
    enqueued when the permalink is already recorded. The deferred task is
    enqueued with ``_transactional=True``.
    """
    key = self.scraper_settings_key
    settings = SolutionNewsScraperSettings.get(key)
    if not settings:
        settings = SolutionNewsScraperSettings(key=key)
        settings.urls = []

    if permalink in settings.urls:
        return

    settings.urls.append(permalink)
    settings.put()
    deferred.defer(create_news_item, self.sln_settings, broadcast_type,
                   message, title, permalink, _transactional=True)
def check_for_news(self):
    """Scrape the press index page and defer creation of unseen news items.

    Every "readmore" link on the page is turned into an absolute permalink.
    Permalinks already recorded in the scraper settings are skipped. Details
    for all remaining permalinks are fetched first; only afterwards is one
    ``create_news`` task deferred per item.
    """
    page = self.get_page(u'%s/nl/press/index.html' % self.BASE_URL)

    settings = SolutionNewsScraperSettings.get(self.scraper_settings_key)
    if settings:
        already_scraped = settings.urls
    else:
        already_scraped = []

    hrefs = page.xpath('//a[@class="readmore"]/@href')
    permalinks = [u'%s%s' % (self.BASE_URL, href) for href in hrefs]
    new_permalinks = [link for link in permalinks
                      if link not in already_scraped]

    # Phase 1: collect all details before anything is deferred.
    pending = []
    for link in new_permalinks:
        title, message = self.get_details(link)
        pending.append((title, message, link))

    # Phase 2: hand every collected item over to the task queue.
    for title, message, link in pending:
        deferred.defer(self.create_news, self.broadcast_type, title, message,
                       link)
def check_for_news(self):
    """Scrape the ``/nieuws/`` overview page and defer creation of unseen items.

    For every row in the overview table a permalink is built from the
    "node-links" anchors. Permalinks already recorded in the scraper
    settings are skipped. Details for the remaining permalinks are fetched
    first; afterwards one ``create_news`` task is deferred per item.
    """
    tree = self.get_page(u'%s/nieuws/' % self.BASE_URL)
    scraper_settings = SolutionNewsScraperSettings.get(
        self.scraper_settings_key)
    # The settings entity may not exist yet (``get`` then returns a falsy
    # value); treat that as "nothing scraped so far" instead of failing with
    # an AttributeError on ``None.urls``.
    scraped_urls = scraper_settings.urls if scraper_settings else []

    container = tree.xpath(
        '//div[@class="table fixed"]//div[@class="col"]/div')
    urls_to_get = []
    for i, row in enumerate(container):
        # NOTE(review): the leading '//' makes this XPath match anchors in
        # the whole document, not just below ``row``; the i-th match is
        # assumed to belong to the i-th row - confirm against the page.
        permalink = u'%s%s' % (
            self.BASE_URL,
            row.xpath('//div[@class="node-links"]//a/@href')[i])
        if permalink not in scraped_urls:
            urls_to_get.append(permalink)

    # Fetch all details before deferring anything, so a failing detail page
    # aborts the run before any task has been enqueued.
    news = []
    for permalink in urls_to_get:
        title, message = self.get_details(permalink)
        news.append((title, message, permalink))

    for title, message, permalink in news:
        deferred.defer(self.create_news, self.broadcast_type, title, message,
                       permalink)
def _check_for_news(service_user):
    """Scrape the Laarne news overview and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing, when the overview page does not answer with HTTP 200,
    or when the news ``<ul>`` cannot be found. Each new URL is stored in the
    scraper settings and its news item creation is deferred, both inside one
    datastore transaction.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_laarne failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    overview_url = u"http://www.laarne.be/website/437-www"
    response = urlfetch.fetch(overview_url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_laarne.\n%s" % response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))
    settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def load_known_urls():
        # URLs recorded by earlier runs, or an empty list without settings.
        settings = SolutionNewsScraperSettings.get(settings_key)
        return settings.urls if settings else []

    known_urls = db.run_in_transaction(load_known_urls)

    news_lists = tree.xpath('//div[@id="nieuws"]//ul')
    if not news_lists:
        logging.error("Could not find ul for news in be_laarne")
        return

    # Group the children of the list: every <li> opens a new group and a
    # following <a> is attached to the most recently opened group.
    groups = []
    for node in news_lists[0].getchildren():
        if node.tag == "li":
            groups.append([node])
        elif node.tag == "a":
            groups[-1].append(node)

    for group in groups:
        if len(group) != 2:
            continue
        title_node, link_node = group

        url = link_node.xpath("@href")[0]
        if not url.startswith(("http://", "https://")):
            url = u"http://www.laarne.be%s" % url
        if not isinstance(url, unicode):
            url = unicode(url)
        if url in known_urls:
            continue

        title = u'%s' % title_node.text.replace('\n', ' ')
        message = _get_news_details(url)

        def store_and_defer():
            # Record the URL and enqueue the news item atomically.
            settings = SolutionNewsScraperSettings.get(settings_key)
            if not settings:
                settings = SolutionNewsScraperSettings(key=settings_key)
                settings.urls = []
            if url in settings.urls:
                return
            settings.urls.append(url)
            settings.put()
            deferred.defer(create_news_item, sln_settings, broadcast_type,
                           message, title, url, _transactional=True)

        db.run_in_transaction(store_and_defer)
def _check_for_news(service_user):
    """Read the Sint-Lievens-Houtem RSS feed and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing or the feed does not answer with HTTP 200. Each new URL
    is stored in the scraper settings and its news item creation is
    deferred, both inside one datastore transaction.

    Raises:
        Exception: any error raised while parsing a feed item is re-raised
            after the item has been logged at debug level.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_sint_lievens_houtem failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"https://www.sint-lievens-houtem.be/rss.xml"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error(
            "Could not check for news in be_sint_lievens_houtem.\n%s" % response.content)
        return

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(
        service_user)

    def trans():
        # URLs recorded by earlier runs, or an empty list without settings.
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(
            sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []
        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(trans)

    doc = minidom.parseString(response.content)
    for item in doc.getElementsByTagName('item'):
        # Reset per item so the except block below never reads an unbound
        # name (first item) or the title of the previous item.
        title = None
        try:
            title = item.getElementsByTagName("title")[0].firstChild.nodeValue
            url = u'%s' % item.getElementsByTagName(
                "link")[0].firstChild.nodeValue
            description_html = item.getElementsByTagName(
                "description")[0].firstChild.nodeValue
            message, _, _ = parse_html_content(description_html)
        except Exception:
            # Log which item broke the parser, then let the error propagate.
            logging.debug("title: %s", title)
            logging.debug(item.childNodes)
            raise

        if url in urls:
            continue

        def trans():
            # Record the URL and enqueue the news item atomically.
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(
                sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(
                    key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item, sln_settings, broadcast_type,
                               message, title, url, _transactional=True)

        db.run_in_transaction(trans)
def _check_for_news(service_user):
    """Scrape the Lokeren news page and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing, when the page does not answer with HTTP 200, or when
    the news container cannot be found. Each new URL is stored in the
    scraper settings and its news item creation is deferred, both inside
    one datastore transaction.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error("check_for_news_in_be_lokeren failed no broadcast type found with name '%s'",
                      BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"https://lokeren.be/nieuws"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_lokeren.\n%s" % response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))
    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def trans():
        # URLs recorded by earlier runs, or an empty list without settings.
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []
        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(trans)

    containers = tree.xpath(
        '//div[@id="content_container"]//div[@class="content"]//div[contains(@class, "newspost")]')
    if not containers:
        # A changed page layout is reported instead of raising IndexError.
        logging.error("Could not find news container in be_lokeren")
        return
    master_div = containers[0]

    # Group the children: every <h2> opens a new item, following <p>
    # elements are attached to the most recently opened item.
    news_items = []
    for item in master_div.getchildren():
        if item.tag == "h2":
            news_items.append([item])
        elif item.tag == "p" and news_items:
            # Paragraphs in front of the first heading belong to no item.
            news_items[-1].append(item)

    for news_item in news_items:
        # An item needs a heading, at least one text paragraph and a final
        # paragraph holding the link.
        if len(news_item) <= 2:
            continue

        url = news_item[-1].getchildren()[0].xpath("@href")[0]
        if not (url.startswith("http://") or url.startswith("https://")):
            url = u"https://lokeren.be%s" % url
        url = unicode(url) if not isinstance(url, unicode) else url
        if url in urls:
            continue

        title = u'%s' % news_item[0].getchildren()[0].text

        message = u''
        for paragraph in news_item[1:-1]:
            # Walk down the first-child chain until some text is found.
            child = paragraph
            while True:
                text = child.text and child.text.strip()
                if text:
                    break
                news_short_children = child.getchildren()
                if not news_short_children:
                    break
                child = news_short_children[0]
            # Only replace the message when this paragraph produced text, so
            # an empty trailing paragraph cannot wipe out earlier text or
            # leave the message set to None.
            if text:
                message = u'%s' % text

        def trans():
            # Record the URL and enqueue the news item atomically.
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item, sln_settings, broadcast_type, message, title, url,
                               _transactional=True)

        db.run_in_transaction(trans)
def _check_for_news(service_user):
    """Read the Dendermonde RSS feed and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing or the feed does not answer with HTTP 200. For every
    feed item whose link is not recorded yet, the linked page is fetched and
    the text of its "short box" div is used as the message. Each new URL is
    stored in the scraper settings and its news item creation is deferred,
    both inside one datastore transaction.

    Raises:
        Exception: any error raised while handling a feed item is re-raised
            after the item has been logged at debug level.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error("check_for_news_in_be_dendermonde failed no broadcast type found with name '%s'",
                      BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"http://www.dendermonde.be/rssout.aspx?cat=N"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_dendermonde.\n%s" % response.content)
        return

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def trans():
        # URLs recorded by earlier runs, or an empty list without settings.
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []
        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(trans)

    doc = minidom.parseString(response.content)
    for item in doc.getElementsByTagName('item'):
        # Reset per item so the except block below never reads an unbound
        # name (first item) or the title of the previous item.
        title = None
        try:
            title = item.getElementsByTagName("title")[0].firstChild.nodeValue
            url = item.getElementsByTagName("link")[0].firstChild.nodeValue
            url = unicode(url) if not isinstance(url, unicode) else url
            if url in urls:
                continue

            item_response = urlfetch.fetch(url, deadline=60)
            if item_response.status_code != 200:
                # The format string needs a placeholder for every argument;
                # otherwise the logging module fails to render the record.
                logging.warning('Received status code %d from %s with content:\n%s',
                                item_response.status_code, url, item_response.content)
                continue

            tree = html.fromstring(item_response.content.decode("utf8"))
            div = tree.xpath('//div[@class="short box"]')
            if not div:
                logging.error('News scraper for dendermonde needs to be updated')
                continue
            # ``text`` is None for an element without leading text; avoid
            # turning that into the literal string u'None'.
            message = u'%s' % (div[0].text or u'')
        except Exception:
            # Log which item broke the scraper, then let the error propagate.
            logging.debug("title: %s", title)
            logging.debug(item.childNodes)
            raise

        def trans():
            # Record the URL and enqueue the news item atomically.
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item, sln_settings, broadcast_type, message, title, url,
                               _transactional=True)

        db.run_in_transaction(trans)
def _check_for_news(service_user):
    """Scrape the Lede news overview and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing or the overview page does not answer with HTTP 200.
    Every anchor found in the overview list is treated as one news item.
    Each new URL is stored in the scraper settings and its news item
    creation is deferred, both inside one datastore transaction.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_lede failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    overview_url = u"http://www.lede.be/nieuwsoverzicht.Aspx"
    response = urlfetch.fetch(overview_url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_lede.\n%s" % response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))
    settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def load_known_urls():
        # URLs recorded by earlier runs, or an empty list without settings.
        settings = SolutionNewsScraperSettings.get(settings_key)
        return settings.urls if settings else []

    known_urls = db.run_in_transaction(load_known_urls)

    # The list entries are the children of the fourth child of the overview
    # div.
    overview = tree.xpath('//div[@class="nwsOverview"]')[0]
    entries = overview.getchildren()[3].getchildren()

    for entry in entries:
        for node in entry.getchildren():
            if node.tag != "a":
                continue

            url = node.xpath("@href")[0]
            if not url.startswith(("http://", "https://")):
                url = u"http://www.lede.be%s" % url
            if not isinstance(url, unicode):
                url = unicode(url)
            if url in known_urls:
                continue

            message = _get_news_details(url)
            title = u'%s' % node.text

            def store_and_defer():
                # Record the URL and enqueue the news item atomically.
                settings = SolutionNewsScraperSettings.get(settings_key)
                if not settings:
                    settings = SolutionNewsScraperSettings(key=settings_key)
                    settings.urls = []
                if url in settings.urls:
                    return
                settings.urls.append(url)
                settings.put()
                deferred.defer(create_news_item, sln_settings, broadcast_type,
                               message, title, url, _transactional=True)

            db.run_in_transaction(store_and_defer)
def _check_for_news(service_user):
    """Scrape the Zele news page and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing or the page does not answer with HTTP 200. For every
    list entry the title is read from the ``<h3>`` and the message from the
    ``<div class="short">`` inside the entry's first child. Entries without
    a title are skipped. Each new URL is stored in the scraper settings and
    its news item creation is deferred, both inside one datastore
    transaction.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_zele failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"https://www.zele.be/nieuws"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_zele.\n%s" % response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))
    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(
        service_user)

    def trans():
        # URLs recorded by earlier runs, or an empty list without settings.
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(
            sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []
        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(trans)

    news_items = tree.xpath(
        '//div[@class="news-partial "]//div[@class="partial-container"]//ul//li'
    )
    for news_item in news_items:
        a = news_item.getchildren()[0]
        url = u'%s' % a.xpath("@href")[0]
        if not (url.startswith("http://") or url.startswith("https://")):
            url = u"https://www.zele.be%s" % url
        if url in urls:
            continue

        title = None
        message = u''
        for item in a.getchildren():
            if item.tag == "h3":
                title = u'%s' % item.text
            # ``get`` returns None for a div without a class attribute,
            # where indexing the XPath result would raise IndexError.
            if item.tag == "div" and item.get("class") == "short":
                # ``text`` is None for a div without leading text; keep the
                # message a string in that case.
                message = item.text or u''

        if title:
            def trans():
                # Record the URL and enqueue the news item atomically.
                sln_news_scraper_settings = SolutionNewsScraperSettings.get(
                    sln_news_scraper_settings_key)
                if not sln_news_scraper_settings:
                    sln_news_scraper_settings = SolutionNewsScraperSettings(
                        key=sln_news_scraper_settings_key)
                    sln_news_scraper_settings.urls = []
                if url not in sln_news_scraper_settings.urls:
                    sln_news_scraper_settings.urls.append(url)
                    sln_news_scraper_settings.put()
                    deferred.defer(create_news_item, sln_settings,
                                   broadcast_type, message, title, url,
                                   _transactional=True)

            db.run_in_transaction(trans)
def _check_for_news(service_user):
    """Scrape the Wetteren news overview and defer creation of unseen items.

    The function logs an error and returns early when the news broadcast
    type is missing or the overview page does not answer with HTTP 200.
    Every "normal" list entry must have exactly four children: the second
    one supplies title and link, the fourth one the message text. Each new
    URL is stored in the scraper settings and its news item creation is
    deferred, both inside one datastore transaction.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_wetteren failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return
    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    overview_url = u"http://www.wetteren.be/nieuwsoverzicht/505/default.aspx?_vs=0_N"
    response = urlfetch.fetch(overview_url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_wetteren.\n%s" % response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))
    settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def load_known_urls():
        # URLs recorded by earlier runs, or an empty list without settings.
        settings = SolutionNewsScraperSettings.get(settings_key)
        return settings.urls if settings else []

    known_urls = db.run_in_transaction(load_known_urls)

    entries = tree.xpath(
        '//div[@id="overzicht"]//ul[@class="nieuws"]//li[@class="normal"]')
    for entry in entries:
        _, link_node, _, detail_node = entry.getchildren()

        title = u'%s' % link_node.text
        url = u'%s' % link_node.xpath("@href")[0]
        if not url.startswith(("http://", "https://")):
            url = u"http://www.wetteren.be%s" % url
        if url in known_urls:
            continue

        # Walk down the first-child chain until a node with text is found.
        node = detail_node
        message = node.text and node.text.strip()
        while not message:
            descendants = node.getchildren()
            if not descendants:
                break
            node = descendants[0]
            message = node.text and node.text.strip()
        if message:
            message = u'%s' % message

        def store_and_defer():
            # Record the URL and enqueue the news item atomically.
            settings = SolutionNewsScraperSettings.get(settings_key)
            if not settings:
                settings = SolutionNewsScraperSettings(key=settings_key)
                settings.urls = []
            if url in settings.urls:
                return
            settings.urls.append(url)
            settings.put()
            deferred.defer(create_news_item, sln_settings, broadcast_type,
                           message, title, url, _transactional=True)

        db.run_in_transaction(store_and_defer)