Example No. 1
def crawl_chg(crawler, out):
    def _chg_content(page):
        return page.split('<div class="container" id="article">')[1].split(
            '<!-- /.right columns -->')[0]

    sitemap = 'https://www.chg.gov.ie/ga/help/sitemap/'
    res = crawler.fetch(sitemap)
    if res.status != 200:
        return
    links = set()
    html = res.content.decode('utf-8')
    body = _chg_content(html)
    for pagelink in re.findall('<a href="([^"]*)">', body):
        if pagelink.startswith('https://www.chg.gov.ie/ga/'):
            links.add(pagelink)
    for link in links:
        pres = crawler.fetch(link)
        if pres.status != 200:
            continue
        phtml = pres.content.decode('utf-8')
        ptext = _chg_content(phtml)
        title = re.search(r'<title>(.+?)</title>', phtml)
        if title: title = striptags(title.group(1).split('|')[0]).strip()
        pubdate = pres.headers.get('Last-Modified')
        out.write('# Location: %s\n' % link)
        out.write('# Genre: Government\n')
        if pubdate: out.write('# Publication-Date: %s\n' % pubdate)
        for paragraph in re.findall(r'<p>(.+?)</p>', ptext):
            cleaned = cleantext(paragraph)
            out.write(cleaned + '\n')
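
The snippet above, like most of the examples that follow, relies on the crawler utility helpers striptags and cleantext, whose definitions are not shown here. A minimal sketch of what such helpers might look like (assumed behaviour: drop markup, decode entities, collapse whitespace; the project's real implementations may differ):

import html
import re

def striptags(text):
    # Assumed helper: remove anything that looks like an HTML/XML tag.
    return re.sub(r'<[^>]*>', '', text)

def cleantext(text):
    # Assumed helper: strip tags, decode entities such as &amp; or &#8217;,
    # and collapse runs of whitespace into single spaces.
    return ' '.join(html.unescape(striptags(text)).split())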
Example No. 2
def crawl_nuachtrte(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.rte.ie/sitemap.xml',
        subsitemap_filter=_check_rte_sitemap)
    pubdate_regex = re.compile(
        r'name="DC.date" (?:scheme="DCTERMS.URI" )?content="([0-9T:+\-]{19,25})"'
    )
    for url in sorted(sitemap.keys()):
        if not _rtenuacht_path(url):
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None: pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None: pubdate = sitemap[url]
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate: out.write('# Publication-Date: %s\n' % pubdate)
        title = re.search(r'<title>(.+?)</title>', html)
        if title: title = striptags(title.group(1).split('- RTÉ')[0]).strip()
        if title: out.write(cleantext(title) + '\n')
        for paragraph in re.findall(r'<p>(.+?)</p>', html):
            cleaned = cleantext(paragraph)
            if _rte_writable_paragraph(cleaned):
                out.write(cleaned + '\n')
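
This crawler delegates URL filtering to _check_rte_sitemap and _rtenuacht_path, and paragraph filtering to _rte_writable_paragraph, all defined elsewhere in the module. Their exact logic is not shown; a purely illustrative sketch (the real predicates may differ) could look like this:

def _rtenuacht_path(url):
    # Hypothetical filter: keep only articles from RTÉ's Irish-language
    # news section; the real predicate may differ.
    return '/news/nuacht/' in url

def _rte_writable_paragraph(text):
    # Hypothetical filter: skip empty paragraphs; the real check presumably
    # also drops recurring site boilerplate.
    return len(text) > 0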
Example No. 3
def crawl_gsw_wettiger_nochrichte(crawler):
    urls = crawler.fetch_sitemap(
        'https://wettiger-nochrichte.net/sitemap.xml').keys()
    out = crawler.get_output('gsw-u-sd-chag')
    for url in sorted(urls):
        if url.find('//wettiger-nochrichte.net/20') < 0:
            continue
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(r'<time class="entry-date" datetime="(.+?)"', html)
        html = html.split('class="post-content">')
        html = html[1].split('<style')[0]
        paragraphs = []
        for p in re.split(r'</?(p|h1|h2).+?>', html):
            p = ' '.join(replace_html_entities(striptags(p)).split())
            # re.split() with a capturing group also yields the captured tag
            # names ('p', 'h1', 'h2'), so drop those along with empty strings,
            # bare URLs, leftover markup and credit lines.
            if ((p not in ('', 'p', 'h1', 'h2', 'h3'))
                    and (not p.startswith('http'))
                    and ('<' not in p)
                    and (not p.endswith('by Wettiger Nochrichte'))
                    and (not p.endswith('by LuFiLa'))
                    and (not p.endswith('by Wettiger'))):
                paragraphs.append(p)
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate.group(1))
            for p in paragraphs:
                out.write(p + '\n')
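
Example No. 3 (and several later examples) also calls replace_html_entities, another helper that is not shown. In Python 3 the standard library already covers this; a minimal stand-in could be:

import html

def replace_html_entities(text):
    # Assumed behaviour: decode named and numeric character references,
    # e.g. '&amp;' -> '&' and '&#228;' -> 'ä'.
    return html.unescape(text)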
Example No. 4
def crawl_gsw_seislerblog(crawler):
    urls = set()
    for i in range(1, 16):
        indexurl = ('http://www.freiburger-nachrichten.ch/blogs/seislerblog'
                    '?page=%d' % i)
        html = crawler.fetch(indexurl).content.decode('utf-8')
        for url in re.findall(r'<a href="(/blogs/seislerblog/.+?)[\s"]', html):
            urls.add(urljoin(indexurl, url))
    out = crawler.get_output('gsw-u-sd-chfr')
    for url in sorted(urls):
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<span class="date-created">([0-9]{1,2})\.([0-9]{2})\.'
            '(20[0-9]{2})</span>', text)
        if pubdate is not None:
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
            out.write('# Publication-Date: %s\n' % pubdate)
        text = text.split('<h1>', 1)[-1].split('<section')[0]
        text = text.replace('\n', ' ')
        for tag in ('</p>', '</h1>', '</div>'):
            text = text.replace(tag, '\n')
        for p in [
                ' '.join(striptags(t).strip().split())
                for t in text.splitlines()
        ]:
            if p and p != 'Kommentare':
                out.write(p + '\n')
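
For reference, urljoin (from urllib.parse) resolves the root-relative links found on the index pages against the page URL, which is why the ?page=%d query string never leaks into the article URLs. A quick check with a made-up slug:

from urllib.parse import urljoin

indexurl = 'http://www.freiburger-nachrichten.ch/blogs/seislerblog?page=3'
print(urljoin(indexurl, '/blogs/seislerblog/some-post'))
# -> http://www.freiburger-nachrichten.ch/blogs/seislerblog/some-post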
Example No. 5
def crawl_gsw_derbund(crawler):
    urls = set()
    for i in range(1, 200):
        url = ('https://www.derbund.ch/ajax/tags.html?'
               'action=moreDossierStories&section_id=11127&page=%d'
               '&dossier_id=3069' % i)
        items = json.loads(crawler.fetch(url).content)['items']
        for path in re.findall(r'<a href="(.+?)"', ''.join(items)):
            if not path.startswith('/stichwort/autor/'):
                urls.add(urljoin('https://www.derbund.ch/', path))
        if len(items) == 0:
            break
    out = crawler.get_output('gsw-u-sd-chbe')
    for url in sorted(urls):
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(r'Erstellt: ([0-9]{1,2})\.([0-9]{2})\.([0-9]{4})',
                            text)
        if pubdate is not None:
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        if pubdate is not None:
            out.write('# Publication-Date: %s\n' % pubdate)
        text = text.split('<div id="mainContent">')[1]
        text = text.split('<span class="idcode"')[0].split('(Der Bund)')[0]
        text = text.replace('***', ' ')
        if text.find('var badwordserch = 1;') >= 0:
            text = text.split('var badwordserch = 1;', 1)[1]
        paras = [' '.join(striptags(p).split()) for p in text.split('</p>')]
        for p in paras:
            if p:
                out.write(p + '\n')
Example No. 6
def crawl_than_lwin_times(crawler, out):
    sitemap = crawler.fetch_sitemap('http://thanlwintimes.com/sitemap.xml')
    for url in sorted(sitemap.keys()):
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(r'<meta itemprop="datePublished" content="(.+?)"',
                            html)
        if pubdate is None:
            continue
        # prepare for split; some texts use different tags
        html = html.replace('</div><pre>', '</div><p>')
        html = html.replace('</div><div class="td-post-content"><p>',
                            '</div><p>')
        if html.find('</div><p>') < 0:
            continue
        text = html.split('</div><p>')[1]
        text = text.split('<div class=\'sfsi_Sicons ')[0]
        text = text.split('</noscript>')[0]
        text = text.replace('\n', ' ')
        text = text.replace('</p>', '\n').replace('</div>', '\n')
        paragraphs = []
        for p in text.splitlines():
            p = ' '.join(striptags(replace_html_entities(p)).split())
            if p and ('>' not in p) and (p.find('"caption":') < 0):
                paragraphs.append(p)
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s\n' % pubdate.group(1))
            for p in paragraphs:
                out.write(p + '\n')
Example No. 7
def crawl_panglong(crawler, out):
    urls = set()
    extract_urls = lambda h: re.findall(r'http://panglong.org/\?p=[0-9]+', h)
    for cat in range(1, 20):
        caturl = 'http://panglong.org/?cat=%d' % cat
        page = crawler.fetch(caturl)
        if page.status != 200:
            continue
        # Decode once so the str regexes below operate on text, not bytes.
        cathtml = page.content.decode('utf-8')
        urls.update(extract_urls(cathtml))
        pageids = re.findall(r';paged=([0-9]+)', cathtml)
        if len(pageids) > 0:
            for pageid in range(2, max([int(p) for p in pageids]) + 1):
                cpurl = 'http://panglong.org/?cat=%d&paged=%d' % (cat, pageid)
                page = crawler.fetch(cpurl)
                if page.status == 200:
                    urls.update(extract_urls(page.content.decode('utf-8')))
    for url in urls:
        try:
            html = crawler.fetch(url).content.decode('utf-8')
        except UnicodeDecodeError:  # a handful of documents are invalid utf8
            continue
        pubdate = re.search(r'<meta itemprop="datePublished" content="(.+?)"',
                            html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip()
        title = re.search(r'<meta property="og:title" content="(.+?)"', html)
        paras = []
        if title is not None:
            paras.append(title.group(1).strip())
        if html.find('class="entry-content">') > 0:
            text = html.split('class="entry-content">')[1]
            text = text.split('<div')[0]
            for p in text.split('</p>'):
                p = ' '.join(striptags(replace_html_entities(p)).split())
                if p:
                    paras.append(p)
        if len(paras) == 0:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
Example No. 8
def crawl_mon_news(crawler, out):
    urls = set()
    for year in range(2009, datetime.today().year + 1):
        first_page = crawler.fetch('http://mon.monnews.org/%d/' % year)
        html = first_page.content.decode('utf-8')
        urls.update(extract_mon_news_urls(html))
        num_pages = re.search(
            r'<a href="http://mon.monnews.org/\d+/page/(\d+)/" class="last"',
            html)
        if num_pages is not None:
            num_pages = int(num_pages.group(1))
            for page in range(2, num_pages + 1):
                next_page = crawler.fetch(
                    'http://mon.monnews.org/%d/page/%d/' % (year, page))
                if next_page.status != 200:
                    continue
                html = next_page.content.decode('utf-8')
                urls.update(extract_mon_news_urls(html))
    for url in sorted(urls):
        html = crawler.fetch(url.encode('utf-8')).content.decode('utf-8')
        pubdate = re.search(
            r'<meta property="article:published_time" content="(.+?)"', html)
        if pubdate is None:
            continue
        pubdate = pubdate.group(1)
        text = html.split('</section>')[1].split('<div class="sharedaddy')[0]
        text = text.split('Share this:')[0]
        text = text.replace('\n', ' ')
        text = text.replace('</p>', '\n').replace('</div>', '\n')
        paragraphs = []
        for p in text.splitlines():
            p = ' '.join(striptags(replace_html_entities(p)).split())
            if p and '>' not in p:
                paragraphs.append(p)
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')
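
The archive pages here are parsed by extract_mon_news_urls, which is defined elsewhere. Purely as an illustration (the real helper may use a different pattern), a sketch could collect the dated article links from a listing page:

import re

def extract_mon_news_urls(html):
    # Hypothetical sketch: WordPress-style archives typically link articles
    # as http://mon.monnews.org/<year>/<month>/<slug>/.
    return set(re.findall(r'href="(http://mon\.monnews\.org/\d{4}/[^"]+)"',
                          html))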
Example No. 9
def _scrape_paiperatapu(crawler, out):
    books = crawler.fetch(
        'http://www.paiperatapu.maori.nz/paipera-tapu-online')
    assert books.status == 200, books.status
    bookshtml = books.content.decode('utf-8')
    bookshtmlinner = bookshtml.split('<div class="bible-book-list">')[1].split(
        '<li class="first bible-search">')[0]
    for bookslink in re.findall(r'<a href="(/bible/[0-9]*/[^"]*)">',
                                bookshtmlinner):
        bookurl = 'http://www.paiperatapu.maori.nz' + bookslink
        book = crawler.fetch(bookurl)
        assert book.status == 200, book.status
        bookhtml = book.content.decode('utf-8')
        bookhtmlinner = bookhtml.split('<ul class="bible-chapter-list">')[
            1].split('<div class="bible-links">')[0]
        for chapterlink in re.findall(
                r'<a href="(/bible/[0-9]*/[^/]*/[^"]*)">', bookhtmlinner):
            url = 'http://www.paiperatapu.maori.nz' + chapterlink
            chapter = crawler.fetch(url)
            assert chapter.status == 200, chapter.status
            chapterhtml = chapter.content.decode('utf-8')
            if '<dl class="bible-chapter-content">' not in chapterhtml:
                continue
            out.write('# Location: %s\n' % url)
            title = re.search(r'<title>(.+?)</title>', chapterhtml)
            if title:
                title = striptags(title.group(1).split('| Te')[0]).strip()
            # Title is in English
            if title: out.write('# Title: %s\n' % cleantext(title))
            out.write('# Genre: Religion\n')
            chapterhtmlinner = chapterhtml.split(
                '<dl class="bible-chapter-content">')[1].split(
                    '<div class="bible-chapter-seek">')[0]
            for verse in re.finditer(
                    r'<dt><a name="[^"]*"></a>([^<]*)</dt><dd class="[^"]*">([^<]*)</dd>',
                    chapterhtmlinner):
                out.write('%s %s\n' %
                          (verse.group(1), cleantext(verse.group(2))))
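
The verse regex above expects each verse as a <dt>/<dd> pair. A small self-contained check with made-up markup shows how the two captured groups map onto the output line:

import re

sample = ('<dt><a name="v1"></a>1</dt>'
          '<dd class="verse">verse text goes here</dd>')
m = re.search(r'<dt><a name="[^"]*"></a>([^<]*)</dt>'
              r'<dd class="[^"]*">([^<]*)</dd>', sample)
print(m.group(1), m.group(2))  # -> 1 verse text goes here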
Example No. 10
def crawl_manxradio(crawler, out):
    urls = set()
    for i in range(1, 100):
        url = 'http://www.manxradio.com/news/manx-gaelic/archive/?page=%d' % i
        r = crawler.fetch(url)
        if r.status != 200:
            break
        # Decode before searching so the str regex below operates on text.
        html = r.content.decode('utf-8')
        if 'No stories to show.' in html:
            break
        for p in re.findall(r'<a href="/(news/manx-gaelic/[^"]+)"', html):
            url = 'http://www.manxradio.com/' + p
            if url.find('?') < 0:
                urls.add(url)
    for url in urls:
        r = crawler.fetch(url)
        assert r.status == 200, r.status
        html = r.content.decode('utf-8')
        pubdate = _extract_manxradio_timestamp(html)
        text = html.split('<p class="news-abstract">')
        if len(text) < 2:
            continue
        text = text[1].split('<STRONG>')[0].split('<strong>')[0]
        text = text.split('<p><span lang=""><b>')[0]
        text = text.replace('<p>', '\n').replace('</p>', '\n')
        text = text.replace('<P>', '\n').replace('</P>', '\n')
        text = striptags(replace_html_entities(text))
        text = text.replace(' - ', ' – ').replace("'", '’')
        if text.find('Listen to this audio') >= 0:
            continue
        paras = [' '.join(s.split()) for s in text.splitlines()]
        paras = [p for p in paras if p]
        if len(paras) == 0:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
Example No. 11
def _scrape_maoritelevision(crawler, out):
    articlelist = set()
    articlelist.add('http://www.maoritelevision.com/mi/purongo/purongo-hou')
    articlelist.add('http://www.maoritelevision.com/mi/purongo/hakinakina')
    for i in range(1, 101):
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/purongo-hou?page=%d' %
            i)
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/hakinakina?page=%d' % i)
    links = set()
    pubdate_regex = re.compile(r'<time datetime="([0-9T:+\-]{25})"')
    for url in articlelist:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        for articlepiece in content.split('<article')[1:]:
            for artlink in re.findall('<a href="(/mi/purongo/[^"]*)"',
                                      articlepiece):
                if not artlink.startswith('/mi/purongo/purongo-hou'):
                    links.add('http://www.maoritelevision.com%s' % artlink)
    for url in links:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        if 'a-motu/rereatea-midday-news' in url:
            continue
        html = doc.content.decode('utf-8')
        if 'lang="mi"' not in html:
            continue
        if 'itemprop="articleBody"' not in html:
            continue
        genre = 'Sport' if '/hakinakina/' in url else 'News'
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None: pubdate = doc.headers.get('Last-Modified')
        # These news stories form a parallel (or at least comparable) corpus,
        # so keep the link to the English version of the article.
        english = re.search(
            r'<a href="(/news/[^"]*)" class="language-link" lang="en">', html)
        if english:
            english = 'http://www.maoritelevision.com%s' % english.group(1)
        tags = set()
        if '<ul class="tags">' in html:
            tagshtml = html.split('<ul class="tags">')[1].split('</ul>')[0]
            for tag in re.findall(r'<a href="(?:[^"]*)">([^<]*)</a>',
                                  tagshtml):
                tags.add(cleantext(tag))
        paras = []
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            paras.append(
                cleantext(striptags(title.group(1).split('| Māori')[0])))
        articlehtml = html.split('class="field-body"')[1].split('</div>')[0]
        paras.extend(
            [cleantext(p) for p in re.findall(r'<p>(.+?)</p>', articlehtml)])
        paras = [p for p in paras
                 if p and p.find(' the ') < 0]  # filter out English
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: %s\n' % genre)
        if pubdate: out.write('# Publication-Date: %s\n' % pubdate)
        if english: out.write('# Translation.en: %s\n' % english)
        if tags: out.write('# Tags: %s\n' % ', '.join(tags))
        out.write('\n'.join(paras) + '\n')
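
All of these examples assume a crawler object whose fetch() returns a response exposing status, content (bytes) and headers, plus fetch_sitemap() and get_output(). A minimal fake, useful for exercising a crawl function offline against canned HTML, might look like this (the attribute names follow the usage above; the real crawler class may differ):

import io
from collections import namedtuple

FetchResult = namedtuple('FetchResult', 'status content headers')

class FakeCrawler(object):
    def __init__(self, pages):
        self.pages = pages  # dict mapping URL -> HTML string

    def fetch(self, url):
        html = self.pages.get(url)
        if html is None:
            return FetchResult(404, b'', {})
        return FetchResult(200, html.encode('utf-8'),
                           {'Last-Modified': 'Mon, 01 Jan 2018 00:00:00 GMT'})

out = io.StringIO()
# e.g. crawl_chg(FakeCrawler({...: ...}), out); print(out.getvalue())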