示例#1
0
    def trans():
        """Return the ``urls`` of the stored scraper settings.

        Yields an empty list when the lookup by
        ``sln_news_scraper_settings_key`` returns nothing.
        """
        stored = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
        return stored.urls if stored else []
示例#2
0
    def __init__(self, service_user=None):
        """Prepare the scraper state for one service user.

        Sets ``scraper_settings_key``, ``sln_settings`` and ``broadcast_type``.

        :param service_user: when truthy it replaces ``self.service_user``;
            otherwise the value ``self.service_user`` already resolves to is used.
        :raises NewsScraperException: when ``BROADCAST_TYPE_NEWS`` is not one of
            the broadcast types of the solution settings.
        """
        if service_user:
            self.service_user = service_user

        self.scraper_settings_key = SolutionNewsScraperSettings.create_key(self.service_user)
        settings = get_solution_settings(self.service_user)
        self.sln_settings = settings

        if BROADCAST_TYPE_NEWS in settings.broadcast_types:
            self.broadcast_type = transl(BROADCAST_TYPE_NEWS, settings.main_language)
        else:
            # NOTE(review): the format arguments are handed to the exception as
            # separate positional args, not interpolated here - confirm that
            # NewsScraperException formats them itself.
            raise NewsScraperException(
                'Cannot check for news in %s because no broadcast type with name \'%s\' is found',
                settings.name, BROADCAST_TYPE_NEWS)
示例#3
0
        def trans():
            """Record ``permalink`` as scraped and defer creation of its news item.

            Does nothing when the permalink is already present in the stored
            ``urls``. A settings entity with an empty ``urls`` list is
            instantiated when the lookup returns nothing.
            """
            settings = SolutionNewsScraperSettings.get(self.scraper_settings_key)
            if not settings:
                settings = SolutionNewsScraperSettings(key=self.scraper_settings_key)
                settings.urls = []

            if permalink in settings.urls:
                return

            settings.urls.append(permalink)
            settings.put()
            # _transactional=True is passed so the deferred task is enqueued
            # as part of the surrounding transaction.
            deferred.defer(create_news_item, self.sln_settings, broadcast_type, message, title, permalink,
                           _transactional=True)
示例#4
0
 def check_for_news(self):
     """Defer ``create_news`` for every "readmore" link not yet scraped.

     Reads ``<BASE_URL>/nl/press/index.html``, prefixes every ``readmore``
     href with ``BASE_URL`` and skips the ones already present in the stored
     scraper settings. Details of all new articles are fetched before the
     first task is deferred.
     """
     page = self.get_page(u'%s/nl/press/index.html' % self.BASE_URL)
     settings = SolutionNewsScraperSettings.get(self.scraper_settings_key)
     already_scraped = settings.urls if settings else []

     new_permalinks = []
     for href in page.xpath('//a[@class="readmore"]/@href'):
         absolute_url = u'%s%s' % (self.BASE_URL, href)
         if absolute_url not in already_scraped:
             new_permalinks.append(absolute_url)

     # Phase 1: fetch the details of every new article.
     collected = []
     for permalink in new_permalinks:
         title, message = self.get_details(permalink)
         collected.append((title, message, permalink))

     # Phase 2: defer the creation of each collected item.
     for title, message, permalink in collected:
         deferred.defer(self.create_news, self.broadcast_type, title,
                        message, permalink)
示例#5
0
 def check_for_news(self):
     """Defer ``create_news`` for every news permalink not yet scraped.

     Reads ``<BASE_URL>/nieuws/``, builds one permalink per row of the news
     table and skips the ones already present in the stored scraper
     settings. Details of all new articles are fetched before the first
     task is deferred.
     """
     tree = self.get_page(u'%s/nieuws/' % self.BASE_URL)
     scraper_settings = SolutionNewsScraperSettings.get(
         self.scraper_settings_key)
     # The lookup can come back empty; treat that as "nothing scraped yet"
     # instead of failing on `.urls` of None.
     scraped_urls = scraper_settings.urls if scraper_settings else []
     container = tree.xpath(
         '//div[@class="table fixed"]//div[@class="col"]/div')
     urls_to_get = []
     for i, row in enumerate(container):
         # NOTE(review): the expression starts with '//', so it selects
         # from the whole document rather than from `row`; the i-th match
         # is paired with the i-th row. Confirm this is intended.
         permalink = u'%s%s' % (
             self.BASE_URL,
             row.xpath('//div[@class="node-links"]//a/@href')[i])
         if permalink not in scraped_urls:
             urls_to_get.append(permalink)
     news = []
     for permalink in urls_to_get:
         title, message = self.get_details(permalink)
         news.append((title, message, permalink))
     for title, message, permalink in news:
         deferred.defer(self.create_news, self.broadcast_type, title,
                        message, permalink)
示例#6
0
def _check_for_news(service_user):
    """Scrape the Laarne news overview and create a news item per unseen article.

    Every ``li`` child of the news ``ul`` is paired with the ``a`` element
    that follows it. For each pair whose link is not yet stored in the scraper
    settings, the link is recorded and ``create_news_item`` is deferred inside
    the same datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing,
    the overview page does not answer with status 200, or the news list is not
    found. Entries without a link or without title text are skipped.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_laarne failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"http://www.laarne.be/website/437-www"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_laarne.\n%s",
                      response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(
        service_user)

    def get_scraped_urls():
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(
            sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []

        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(get_scraped_urls)

    ul = tree.xpath('//div[@id="nieuws"]//ul')
    if not ul:
        logging.error("Could not find ul for news in be_laarne")
        return

    # Group the children as [li, a, ...]: every "li" opens a new group and
    # the "a" elements that follow are attached to it.
    news_items = []
    for item in ul[0].getchildren():
        if item.tag == "li":
            news_items.append([item])
        elif item.tag == "a" and news_items:
            # An "a" that precedes the first "li" has no group to join and
            # is ignored instead of raising IndexError.
            news_items[-1].append(item)

    for news_item in news_items:
        if len(news_item) != 2:
            continue

        hrefs = news_item[1].xpath("@href")
        if not hrefs:
            logging.warning("Skipping news entry without link in be_laarne")
            continue

        url = hrefs[0]
        if not (url.startswith("http://") or url.startswith("https://")):
            url = u"http://www.laarne.be%s" % url

        url = unicode(url) if not isinstance(url, unicode) else url
        if url in urls:
            continue

        title_text = news_item[0].text
        if title_text is None:
            # The "li" has no direct text, so there is nothing to use as title.
            logging.warning("Skipping news entry without title in be_laarne: %s", url)
            continue
        title = u'%s' % title_text.replace('\n', ' ')
        message = _get_news_details(url)

        def trans():
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(
                sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(
                    key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []

            # Re-check inside the transaction before recording the url.
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item,
                               sln_settings,
                               broadcast_type,
                               message,
                               title,
                               url,
                               _transactional=True)

        db.run_in_transaction(trans)
示例#7
0
def _check_for_news(service_user):
    """Read the Sint-Lievens-Houtem RSS feed and create a news item per unseen entry.

    For every ``item`` element the title, link and description are read; the
    description is run through ``parse_html_content`` and its first return
    value is used as message. Links not yet stored in the scraper settings
    are recorded, and ``create_news_item`` is deferred inside the same
    datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing
    or the feed does not answer with status 200. A failure while reading an
    item is logged at debug level and re-raised.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_sint_lievens_houtem failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"https://www.sint-lievens-houtem.be/rss.xml"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error(
            "Could not check for news in be_sint_lievens_houtem.\n%s",
            response.content)
        return

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(
        service_user)

    def get_scraped_urls():
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(
            sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []

        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(get_scraped_urls)

    doc = minidom.parseString(response.content)
    for item in doc.getElementsByTagName('item'):
        # Reset per item so the handler below never hits an unbound name or
        # logs the title of a previous item.
        title = None
        try:
            title = item.getElementsByTagName("title")[0].firstChild.nodeValue
            url = u'%s' % item.getElementsByTagName(
                "link")[0].firstChild.nodeValue
            description_html = item.getElementsByTagName(
                "description")[0].firstChild.nodeValue

            message, _, _ = parse_html_content(description_html)

        except Exception:
            logging.debug("title: %s", title)
            logging.debug(item.childNodes)
            raise

        if url in urls:
            continue

        def trans():
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(
                sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(
                    key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []

            # Re-check inside the transaction before recording the url.
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item,
                               sln_settings,
                               broadcast_type,
                               message,
                               title,
                               url,
                               _transactional=True)

        db.run_in_transaction(trans)
示例#8
0
def _check_for_news(service_user):
    """Scrape the Lokeren news page and create a news item per unseen article.

    The children of the first "newspost" container are grouped per ``h2``:
    the heading, the ``p`` elements that follow it, and a last ``p`` whose
    first child carries the article link. For each group whose link is not
    yet stored in the scraper settings, the link is recorded and
    ``create_news_item`` is deferred inside the same datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing,
    the page does not answer with status 200, or the news container is not
    found.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error("check_for_news_in_be_lokeren failed no broadcast type found with name '%s'", BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"https://lokeren.be/nieuws"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_lokeren.\n%s", response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def get_scraped_urls():
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []

        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(get_scraped_urls)

    master_divs = tree.xpath('//div[@id="content_container"]//div[@class="content"]//div[contains(@class, "newspost")]')
    if not master_divs:
        logging.error("Could not find the news container in be_lokeren")
        return

    # Group the children as [h2, p, p, ...]: every "h2" opens a new group.
    news_items = []
    for item in master_divs[0].getchildren():
        if item.tag == "h2":
            news_items.append([item])
        elif item.tag == "p" and news_items:
            # A "p" that precedes the first "h2" has no group to join and is
            # ignored instead of raising IndexError.
            news_items[-1].append(item)

    for news_item in news_items:
        if len(news_item) <= 2:
            continue

        url = news_item[-1].getchildren()[0].xpath("@href")[0]
        if not (url.startswith("http://") or url.startswith("https://")):
            url = u"https://lokeren.be%s" % url
        url = unicode(url) if not isinstance(url, unicode) else url
        if url in urls:
            continue
        title = u'%s' % news_item[0].getchildren()[0].text
        message = u''
        # Every paragraph between the heading and the link paragraph
        # overwrites `message`, so the value found for the last one wins.
        for child in news_item[1:-1]:
            # Follow the chain of first children until some text is found.
            while True:
                message = child.text and child.text.strip()
                if message:
                    break

                news_short_children = child.getchildren()
                if not news_short_children:
                    break
                child = news_short_children[0]

        def trans():
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []

            # Re-check inside the transaction before recording the url.
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item, sln_settings, broadcast_type, message, title, url,
                               _transactional=True)

        db.run_in_transaction(trans)
示例#9
0
def _check_for_news(service_user):
    """Read the Dendermonde RSS feed and create a news item per unseen entry.

    For every ``item`` element the title and link are read. Links already
    stored in the scraper settings are skipped. For the others the linked
    page is fetched and the text of its first ``div`` with class
    ``short box`` becomes the message. The link is then recorded and
    ``create_news_item`` is deferred inside the same datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing
    or the feed does not answer with status 200. Entries whose detail page
    cannot be fetched or parsed are skipped with a log line. Any exception
    while handling an item is logged at debug level and re-raised.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error("check_for_news_in_be_dendermonde failed no broadcast type found with name '%s'", BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"http://www.dendermonde.be/rssout.aspx?cat=N"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_dendermonde.\n%s", response.content)
        return

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def get_scraped_urls():
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []

        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(get_scraped_urls)

    doc = minidom.parseString(response.content)
    for item in doc.getElementsByTagName('item'):
        # Reset per item so the handler below never hits an unbound name or
        # logs the title of a previous item.
        title = None
        try:
            title = item.getElementsByTagName("title")[0].firstChild.nodeValue
            url = item.getElementsByTagName("link")[0].firstChild.nodeValue
            url = unicode(url) if not isinstance(url, unicode) else url
            if url in urls:
                continue

            response = urlfetch.fetch(url, deadline=60)
            if response.status_code != 200:
                # The original format string had no placeholder for the
                # content argument, which made the logging call itself fail.
                logging.warning('Received status code %d from %s with content:\n%s', response.status_code, url,
                                response.content)
                continue

            tree = html.fromstring(response.content.decode("utf8"))
            div = tree.xpath('//div[@class="short box"]')
            if not div:
                logging.error('News scraper for dendermonde needs to be updated')
                continue
            message = u'%s' % div[0].text
        except Exception:
            logging.debug("title: %s", title)
            logging.debug(item.childNodes)
            raise

        def trans():
            sln_news_scraper_settings = SolutionNewsScraperSettings.get(sln_news_scraper_settings_key)
            if not sln_news_scraper_settings:
                sln_news_scraper_settings = SolutionNewsScraperSettings(key=sln_news_scraper_settings_key)
                sln_news_scraper_settings.urls = []

            # Re-check inside the transaction before recording the url.
            if url not in sln_news_scraper_settings.urls:
                sln_news_scraper_settings.urls.append(url)
                sln_news_scraper_settings.put()
                deferred.defer(create_news_item, sln_settings, broadcast_type, message, title, url,
                               _transactional=True)

        db.run_in_transaction(trans)
示例#10
0
def _check_for_news(service_user):
    """Scrape the Lede news overview and create a news item per unseen article.

    The news entries are the children of the fourth child of the first
    ``div`` with class ``nwsOverview``. Every ``a`` element directly inside
    an entry is treated as an article link. For links not yet stored in the
    scraper settings, the details are fetched with ``_get_news_details``, the
    link is recorded, and ``create_news_item`` is deferred inside the same
    datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing,
    the page does not answer with status 200, or the news list is not found.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_lede failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"http://www.lede.be/nieuwsoverzicht.Aspx"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_lede.\n%s",
                      response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(
        service_user)

    def get_scraped_urls():
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(
            sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []

        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(get_scraped_urls)

    overview = tree.xpath('//div[@class="nwsOverview"]')
    overview_children = overview[0].getchildren() if overview else []
    if len(overview_children) < 4:
        # The page layout no longer matches what this scraper expects.
        logging.error("Could not find the news list in be_lede")
        return
    news_items = overview_children[3].getchildren()

    for li in news_items:
        for item in li.getchildren():
            if item.tag != "a":
                continue

            url = item.get("href")
            if url is None:
                # Anchor without href: nothing to link to.
                continue
            if not (url.startswith("http://")
                    or url.startswith("https://")):
                url = u"http://www.lede.be%s" % url

            url = unicode(url) if not isinstance(url, unicode) else url
            if url in urls:
                continue
            message = _get_news_details(url)
            title = u'%s' % item.text

            def trans():
                sln_news_scraper_settings = SolutionNewsScraperSettings.get(
                    sln_news_scraper_settings_key)
                if not sln_news_scraper_settings:
                    sln_news_scraper_settings = SolutionNewsScraperSettings(
                        key=sln_news_scraper_settings_key)
                    sln_news_scraper_settings.urls = []

                # Re-check inside the transaction before recording the url.
                if url not in sln_news_scraper_settings.urls:
                    sln_news_scraper_settings.urls.append(url)
                    sln_news_scraper_settings.put()
                    deferred.defer(create_news_item,
                                   sln_settings,
                                   broadcast_type,
                                   message,
                                   title,
                                   url,
                                   _transactional=True)

            db.run_in_transaction(trans)
示例#11
0
def _check_for_news(service_user):
    """Scrape the Zele news page and create a news item per unseen article.

    Every ``li`` of the news list is expected to start with a link element
    whose ``h3`` child holds the title and whose ``div`` child with class
    ``short`` holds the message. For links not yet stored in the scraper
    settings, and only when a title was found, the link is recorded and
    ``create_news_item`` is deferred inside the same datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing
    or the page does not answer with status 200. Entries without children or
    without an href are skipped.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_zele failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    url = u"https://www.zele.be/nieuws"
    response = urlfetch.fetch(url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_zele.\n%s",
                      response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))

    sln_news_scraper_settings_key = SolutionNewsScraperSettings.create_key(
        service_user)

    def get_scraped_urls():
        sln_news_scraper_settings = SolutionNewsScraperSettings.get(
            sln_news_scraper_settings_key)
        if not sln_news_scraper_settings:
            return []

        return sln_news_scraper_settings.urls

    urls = db.run_in_transaction(get_scraped_urls)

    news_items = tree.xpath(
        '//div[@class="news-partial "]//div[@class="partial-container"]//ul//li'
    )

    for news_item in news_items:
        children = news_item.getchildren()
        if not children:
            continue
        a = children[0]

        href = a.get("href")
        if href is None:
            continue
        url = u'%s' % href

        if not (url.startswith("http://") or url.startswith("https://")):
            url = u"https://www.zele.be%s" % url

        if url in urls:
            continue

        title = None
        message = u''
        for item in a.getchildren():
            if item.tag == "h3":
                title = u'%s' % item.text
            # .get() returns None for a div without class attribute, where
            # indexing the xpath result raised IndexError.
            if item.tag == "div" and item.get("class") == "short":
                message = item.text

        if title:

            def trans():
                sln_news_scraper_settings = SolutionNewsScraperSettings.get(
                    sln_news_scraper_settings_key)
                if not sln_news_scraper_settings:
                    sln_news_scraper_settings = SolutionNewsScraperSettings(
                        key=sln_news_scraper_settings_key)
                    sln_news_scraper_settings.urls = []

                # Re-check inside the transaction before recording the url.
                if url not in sln_news_scraper_settings.urls:
                    sln_news_scraper_settings.urls.append(url)
                    sln_news_scraper_settings.put()
                    deferred.defer(create_news_item,
                                   sln_settings,
                                   broadcast_type,
                                   message,
                                   title,
                                   url,
                                   _transactional=True)

            db.run_in_transaction(trans)
示例#12
0
def _check_for_news(service_user):
    """Scrape the Wetteren news overview and create a news item per unseen article.

    Each ``li.normal`` of the news list must have exactly four children: the
    second one carries the title and link, the fourth one the teaser. The
    teaser text is the first non-blank ``text`` found while descending along
    the chain of first children. Articles with a teaser whose link is not yet
    stored in the scraper settings get their link recorded and
    ``create_news_item`` deferred inside the same datastore transaction.

    Returns early (after logging an error) when the broadcast type is missing
    or the page does not answer with status 200.

    :param service_user: passed to ``get_solution_settings`` and used to build
        the scraper settings key.
    """
    sln_settings = get_solution_settings(service_user)
    if BROADCAST_TYPE_NEWS not in sln_settings.broadcast_types:
        logging.error(
            "check_for_news_in_be_wetteren failed no broadcast type found with name '%s'",
            BROADCAST_TYPE_NEWS)
        return

    broadcast_type = transl(BROADCAST_TYPE_NEWS, sln_settings.main_language)

    overview_url = u"http://www.wetteren.be/nieuwsoverzicht/505/default.aspx?_vs=0_N"
    response = urlfetch.fetch(overview_url, deadline=60)
    if response.status_code != 200:
        logging.error("Could not check for news in be_wetteren.\n%s" %
                      response.content)
        return

    tree = html.fromstring(response.content.decode("utf8"))
    settings_key = SolutionNewsScraperSettings.create_key(service_user)

    def load_scraped_urls():
        stored = SolutionNewsScraperSettings.get(settings_key)
        return stored.urls if stored else []

    known_urls = db.run_in_transaction(load_scraped_urls)

    entries = tree.xpath(
        '//div[@id="overzicht"]//ul[@class="nieuws"]//li[@class="normal"]')
    for entry in entries:
        _, anchor, _, detail = entry.getchildren()
        title = u'%s' % anchor.text
        url = u'%s' % anchor.xpath("@href")[0]
        if not url.startswith(("http://", "https://")):
            url = u"http://www.wetteren.be%s" % url
        if url in known_urls:
            continue

        # Descend along the first children until a node with text shows up.
        node = detail
        message = node.text and node.text.strip()
        while not message:
            descendants = node.getchildren()
            if not descendants:
                break
            node = descendants[0]
            message = node.text and node.text.strip()

        if not message:
            continue
        message = u'%s' % message

        def store_and_create():
            stored = SolutionNewsScraperSettings.get(settings_key)
            if not stored:
                stored = SolutionNewsScraperSettings(key=settings_key)
                stored.urls = []

            if url in stored.urls:
                return

            stored.urls.append(url)
            stored.put()
            deferred.defer(create_news_item,
                           sln_settings,
                           broadcast_type,
                           message,
                           title,
                           url,
                           _transactional=True)

        db.run_in_transaction(store_and_create)