def crawl_chg(crawler, out):
    def _chg_content(page):
        return page.split('<div class="container" id="article">')[1].split(
            '<!-- /.right columns -->')[0]

    sitemap = 'https://www.chg.gov.ie/ga/help/sitemap/'
    res = crawler.fetch(sitemap)
    if res.status != 200:
        return
    links = set()
    html = res.content.decode('utf-8')
    body = _chg_content(html)
    for pagelink in re.findall('<a href="([^"]*)">', body):
        if pagelink.startswith('https://www.chg.gov.ie/ga/'):
            links.add(pagelink)
    for link in links:
        pres = crawler.fetch(link)
        if pres.status != 200:
            continue
        phtml = pres.content.decode('utf-8')
        ptext = _chg_content(phtml)
        title = re.search(r'<title>(.+?)</title>', phtml)
        if title:
            title = striptags(title.group(1).split('|')[0]).strip()
        pubdate = pres.headers.get('Last-Modified')
        out.write('# Location: %s\n' % link)
        out.write('# Genre: Government\n')
        if title:
            out.write('# Title: %s\n' % title)
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for paragraph in re.findall(r'<p>(.+?)</p>', ptext):
            out.write(cleantext(paragraph) + '\n')

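# `striptags` and `cleantext` are assumed to be shared helpers defined
# elsewhere in this module; they are not part of this section. A minimal
# sketch of the behavior the crawlers rely on, namely tag removal, entity
# decoding, and whitespace normalization (the real helpers may do more):
import html
import re

def striptags(s):
    # Drop anything that looks like an HTML/XML tag.
    return re.sub(r'<[^>]*>', '', s)

def cleantext(s):
    # Strip tags, decode entities such as &amp;, and collapse whitespace.
    return ' '.join(html.unescape(striptags(s)).split())
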
def crawl_nuachtrte(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.rte.ie/sitemap.xml', subsitemap_filter=_check_rte_sitemap)
    pubdate_regex = re.compile(
        r'name="DC.date" (?:scheme="DCTERMS.URI" )?'
        r'content="([0-9T:+\-]{19,25})"')
    for url in sorted(sitemap.keys()):
        if not _rtenuacht_path(url):
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[url]  # fall back to the sitemap's <lastmod>
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            title = striptags(title.group(1).split('- RTÉ')[0]).strip()
        if title:
            out.write(cleantext(title) + '\n')
        for paragraph in re.findall(r'<p>(.+?)</p>', html):
            cleaned = cleantext(paragraph)
            if _rte_writable_paragraph(cleaned):
                out.write(cleaned + '\n')

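# The three RTÉ helpers used above (_check_rte_sitemap, _rtenuacht_path,
# _rte_writable_paragraph) are defined elsewhere in the module. Hedged
# sketches of their likely behavior follow; the URL patterns and the
# blocklist are guesses, not taken from the real implementation:
import re

def _check_rte_sitemap(sitemap_url):
    # Keep only news sub-sitemaps from the sitemap index.
    return 'news' in sitemap_url

def _rtenuacht_path(url):
    # Keep only Irish-language (nuacht) article URLs.
    return re.match(r'https?://(?:www\.)?rte\.ie/news/nuacht/',
                    url) is not None

def _rte_writable_paragraph(text):
    # Reject empty strings and site furniture that survives tag stripping.
    if not text:
        return False
    blocklist = ('Follow @', 'RTÉ News', 'By using this website, you consent')
    return not any(text.startswith(b) for b in blocklist)
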
def crawl_gsw_wettiger_nochrichte(crawler):
    urls = crawler.fetch_sitemap(
        'https://wettiger-nochrichte.net/sitemap.xml').keys()
    out = crawler.get_output('gsw-u-sd-chag')
    for url in sorted(urls):
        if url.find('//wettiger-nochrichte.net/20') < 0:
            continue  # only dated article URLs (/20xx/...) carry text
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(r'<time class="entry-date" datetime="(.+?)"', html)
        html = html.split('class="post-content">')[1].split('<style')[0]
        paragraphs = []
        # re.split keeps the captured tag names ('p', 'h1', 'h2') in the
        # result list, so they are filtered out along with empty strings.
        for p in re.split(r'</?(p|h1|h2)[^>]*>', html):
            p = ' '.join(replace_html_entities(striptags(p)).split())
            if ((p not in ('', 'p', 'h1', 'h2')) and
                    (not p.startswith('http')) and ('<' not in p) and
                    (not p.endswith('by Wettiger Nochrichte')) and
                    (not p.endswith('by LuFiLa')) and
                    (not p.endswith('by Wettiger'))):
                paragraphs.append(p)
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate.group(1))
            for p in paragraphs:
                out.write(p + '\n')

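# `replace_html_entities` is another shared helper assumed by several of
# these crawlers. A minimal sketch, assuming it only needs to decode
# entities such as &auml; and &#228; (which the stdlib already covers):
import html

def replace_html_entities(s):
    return html.unescape(s)
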
def crawl_gsw_seislerblog(crawler):
    urls = set()
    for i in range(1, 16):
        indexurl = ('http://www.freiburger-nachrichten.ch/blogs/seislerblog'
                    '?page=%d' % i)
        html = crawler.fetch(indexurl).content.decode('utf-8')
        for url in re.findall(r'<a href="(/blogs/seislerblog/.+?)[\s"]', html):
            urls.add(urljoin(indexurl, url))
    out = crawler.get_output('gsw-u-sd-chfr')
    for url in sorted(urls):
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<span class="date-created">([0-9]{1,2})\.([0-9]{2})\.'
            r'(20[0-9]{2})</span>', text)
        if pubdate is not None:
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
            out.write('# Publication-Date: %s\n' % pubdate)
        text = text.split('<h1>', 1)[-1].split('<section')[0]
        text = text.replace('\n', ' ')
        for tag in ('</p>', '</h1>', '</div>'):
            text = text.replace(tag, '\n')
        for p in [' '.join(striptags(t).strip().split())
                  for t in text.splitlines()]:
            if p and p != 'Kommentare':
                out.write(p + '\n')

def crawl_gsw_derbund(crawler):
    urls = set()
    for i in range(1, 200):
        url = ('https://www.derbund.ch/ajax/tags.html?'
               'action=moreDossierStories&section_id=11127&page=%d'
               '&dossier_id=3069' % i)
        items = json.loads(crawler.fetch(url).content)['items']
        for path in re.findall(r'<a href="(.+?)"', ''.join(items)):
            if not path.startswith('/stichwort/autor/'):
                urls.add(urljoin('https://www.derbund.ch/', path))
        if len(items) == 0:
            break  # past the last page of the dossier
    out = crawler.get_output('gsw-u-sd-chbe')
    for url in sorted(urls):
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'Erstellt: ([0-9]{1,2})\.([0-9]{2})\.([0-9]{4})', text)
        if pubdate is not None:
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        if pubdate is not None:
            out.write('# Publication-Date: %s\n' % pubdate)
        text = text.split('<div id="mainContent">')[1]
        text = text.split('<span class="idcode"')[0].split('(Der Bund)')[0]
        text = text.replace('***', ' ')
        if text.find('var badwordserch = 1;') >= 0:  # sic, as in the page JS
            text = text.split('var badwordserch = 1;', 1)[1]
        paras = [' '.join(striptags(p).split()) for p in text.split('</p>')]
        for p in paras:
            if p:
                out.write(p + '\n')

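# The derbund.ch AJAX endpoint above is assumed to return JSON of the form
# {"items": [...]}, where each item is an HTML snippet containing article
# links; crawl_gsw_derbund joins the snippets and regexes the hrefs out.
# A hypothetical response illustrating what the parsing expects:
import json
import re

_example = '{"items": ["<li><a href=\\"/dossiers/123\\">Mundart</a></li>"]}'
assert re.findall(r'<a href="(.+?)"',
                  ''.join(json.loads(_example)['items'])) == ['/dossiers/123']
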
def crawl_than_lwin_times(crawler, out):
    sitemap = crawler.fetch_sitemap('http://thanlwintimes.com/sitemap.xml')
    for url in sorted(sitemap.keys()):
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', html)
        if pubdate is None:
            continue
        # Normalize the markup before splitting; some articles wrap the
        # body in <pre> or in a td-post-content div instead of a plain <p>.
        html = html.replace('</div><pre>', '</div><p>')
        html = html.replace('</div><div class="td-post-content"><p>',
                            '</div><p>')
        if html.find('</div><p>') < 0:
            continue
        text = html.split('</div><p>')[1]
        text = text.split("<div class='sfsi_Sicons ")[0]
        text = text.split('</noscript>')[0]
        text = text.replace('\n', ' ')
        text = text.replace('</p>', '\n').replace('</div>', '\n')
        paragraphs = []
        for p in text.splitlines():
            p = ' '.join(striptags(replace_html_entities(p)).split())
            if p and ('>' not in p) and (p.find('"caption":') < 0):
                paragraphs.append(p)
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s\n' % pubdate.group(1))
            for p in paragraphs:
                out.write(p + '\n')

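# `crawler.fetch_sitemap` is assumed to download a sitemap.xml and return a
# dict mapping each page URL to its <lastmod> timestamp, which is why
# `sitemap[url]` can serve as a publication-date fallback in crawl_nuachtrte
# above. A rough standalone sketch that ignores sub-sitemap recursion and
# takes a hypothetical `fetch_text` callable returning the XML as a string:
import re

def fetch_sitemap_sketch(fetch_text, sitemap_url):
    xml = fetch_text(sitemap_url)
    result = {}
    for entry in re.findall(r'<url>(.*?)</url>', xml, re.DOTALL):
        loc = re.search(r'<loc>(.*?)</loc>', entry)
        lastmod = re.search(r'<lastmod>(.*?)</lastmod>', entry)
        if loc:
            result[loc.group(1).strip()] = (
                lastmod.group(1).strip() if lastmod else None)
    return result
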
def crawl_panglong(crawler, out):
    def extract_urls(html):
        return re.findall(r'http://panglong\.org/\?p=[0-9]+', html)

    urls = set()
    for cat in range(1, 20):
        caturl = 'http://panglong.org/?cat=%d' % cat
        page = crawler.fetch(caturl)
        if page.status != 200:
            continue
        content = page.content.decode('utf-8')
        urls.update(extract_urls(content))
        pageids = re.findall(r';paged=([0-9]+)', content)
        if pageids:
            for pageid in range(2, max(int(p) for p in pageids) + 1):
                cpurl = 'http://panglong.org/?cat=%d&paged=%d' % (cat, pageid)
                page = crawler.fetch(cpurl)
                if page.status == 200:
                    urls.update(extract_urls(page.content.decode('utf-8')))
    for url in sorted(urls):
        try:
            html = crawler.fetch(url).content.decode('utf-8')
        except UnicodeDecodeError:
            continue  # a handful of documents are invalid UTF-8
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip()
        title = re.search(r'<meta property="og:title" content="(.+?)"', html)
        paras = []
        if title is not None:
            paras.append(title.group(1).strip())
        if html.find('class="entry-content">') > 0:
            text = html.split('class="entry-content">')[1]
            text = text.split('<div')[0]
            for p in text.split('</p>'):
                p = ' '.join(striptags(replace_html_entities(p)).split())
                if p:
                    paras.append(p)
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

def crawl_mon_news(crawler, out):
    urls = set()
    for year in range(2009, datetime.today().year + 1):
        first_page = crawler.fetch('http://mon.monnews.org/%d/' % year)
        html = first_page.content.decode('utf-8')
        urls.update(extract_mon_news_urls(html))
        num_pages = re.search(
            r'<a href="http://mon.monnews.org/\d+/page/(\d+)/" class="last"',
            html)
        if num_pages is not None:
            num_pages = int(num_pages.group(1))
            for page in range(2, num_pages + 1):
                next_page = crawler.fetch(
                    'http://mon.monnews.org/%d/page/%d/' % (year, page))
                if next_page.status != 200:
                    continue
                html = next_page.content.decode('utf-8')
                urls.update(extract_mon_news_urls(html))
    for url in sorted(urls):
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<meta property="article:published_time" content="(.+?)"', html)
        if pubdate is None:
            continue
        pubdate = pubdate.group(1)
        text = html.split('</section>')[1].split('<div class="sharedaddy')[0]
        text = text.split('Share this:')[0]
        text = text.replace('\n', ' ')
        text = text.replace('</p>', '\n').replace('</div>', '\n')
        paragraphs = []
        for p in text.splitlines():
            p = ' '.join(striptags(replace_html_entities(p)).split())
            if p and '>' not in p:
                paragraphs.append(p)
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')

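# `extract_mon_news_urls` is not shown in this section. Judging from the
# call sites, it plucks article links out of a yearly archive page; a
# plausible sketch, with the link pattern being a guess rather than a
# verified detail of the mon.monnews.org markup:
import re

def extract_mon_news_urls(html):
    return set(re.findall(
        r'<a href="(http://mon\.monnews\.org/\d{4}/[^"]+)"', html))
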
def _scrape_paiperatapu(crawler, out):
    books = crawler.fetch(
        'http://www.paiperatapu.maori.nz/paipera-tapu-online')
    assert books.status == 200, books.status
    bookshtml = books.content.decode('utf-8')
    bookshtmlinner = bookshtml.split(
        '<div class="bible-book-list">')[1].split(
            '<li class="first bible-search">')[0]
    for bookslink in re.findall(r'<a href="(/bible/[0-9]*/[^"]*)">',
                                bookshtmlinner):
        bookurl = 'http://www.paiperatapu.maori.nz' + bookslink
        book = crawler.fetch(bookurl)
        assert book.status == 200, book.status
        bookhtml = book.content.decode('utf-8')
        bookhtmlinner = bookhtml.split(
            '<ul class="bible-chapter-list">')[1].split(
                '<div class="bible-links">')[0]
        for chapterlink in re.findall(
                r'<a href="(/bible/[0-9]*/[^/]*/[^"]*)">', bookhtmlinner):
            url = 'http://www.paiperatapu.maori.nz' + chapterlink
            chapter = crawler.fetch(url)
            assert chapter.status == 200, chapter.status
            chapterhtml = chapter.content.decode('utf-8')
            if '<dl class="bible-chapter-content">' not in chapterhtml:
                continue
            out.write('# Location: %s\n' % url)
            title = re.search(r'<title>(.+?)</title>', chapterhtml)
            if title:
                title = striptags(title.group(1).split('| Te')[0]).strip()
            if title:  # the title is in English
                out.write('# Title: %s\n' % cleantext(title))
            out.write('# Genre: Religion\n')
            chapterhtmlinner = chapterhtml.split(
                '<dl class="bible-chapter-content">')[1].split(
                    '<div class="bible-chapter-seek">')[0]
            for verse in re.finditer(
                    r'<dt><a name="[^"]*"></a>([^<]*)</dt>'
                    r'<dd class="[^"]*">([^<]*)</dd>',
                    chapterhtmlinner):
                out.write('%s %s\n' %
                          (verse.group(1), cleantext(verse.group(2))))

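# For reference, the verse regex in _scrape_paiperatapu expects chapter
# markup of roughly this shape (reconstructed from the regex itself, not
# re-verified against the live site), with group(1) the verse number and
# group(2) the verse text:
import re

_sample = '<dt><a name="v1"></a>1</dt><dd class="verse">Ka timata</dd>'
assert re.findall(
    r'<dt><a name="[^"]*"></a>([^<]*)</dt><dd class="[^"]*">([^<]*)</dd>',
    _sample) == [('1', 'Ka timata')]
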
def crawl_manxradio(crawler, out):
    urls = set()
    for i in range(1, 100):
        url = 'http://www.manxradio.com/news/manx-gaelic/archive/?page=%d' % i
        r = crawler.fetch(url)
        if r.status != 200 or r.content.find(b'No stories to show.') > 0:
            break
        html = r.content.decode('utf-8')
        for p in re.findall(r'<a href="/(news/manx-gaelic/[^"]+)"', html):
            article = 'http://www.manxradio.com/' + p
            if article.find('?') < 0:
                urls.add(article)
    for url in sorted(urls):
        r = crawler.fetch(url)
        assert r.status == 200, r.status
        html = r.content.decode('utf-8')
        pubdate = _extract_manxradio_timestamp(html)
        text = html.split('<p class="news-abstract">')
        if len(text) < 2:
            continue
        text = text[1].split('<STRONG>')[0].split('<strong>')[0]
        text = text.split('<p><span lang=""><b>')[0]
        text = text.replace('<p>', '\n').replace('</p>', '\n')
        text = text.replace('<P>', '\n').replace('</P>', '\n')
        text = striptags(replace_html_entities(text))
        text = text.replace(' - ', ' – ').replace("'", '’')
        if text.find('Listen to this audio') >= 0:
            continue
        paras = [' '.join(s.split()) for s in text.splitlines()]
        paras = [p for p in paras if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

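# `_extract_manxradio_timestamp` is defined elsewhere. A hedged sketch of
# what it plausibly does: pull a publication timestamp out of the article
# HTML. The meta tag used here is an assumption, not confirmed against
# manxradio.com markup:
import re

def _extract_manxradio_timestamp_sketch(html):
    m = re.search(
        r'<meta property="article:published_time" content="(.+?)"', html)
    return m.group(1) if m else None
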
def _scrape_maoritelevision(crawler, out):
    articlelist = set()
    articlelist.add('http://www.maoritelevision.com/mi/purongo/purongo-hou')
    articlelist.add('http://www.maoritelevision.com/mi/purongo/hakinakina')
    for i in range(1, 101):
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/purongo-hou?page=%d'
            % i)
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/hakinakina?page=%d'
            % i)
    links = set()
    pubdate_regex = re.compile(r'<time datetime="([0-9T:+\-]{25})"')
    for url in articlelist:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        for articlepiece in content.split('<article')[1:]:
            for artlink in re.findall('<a href="(/mi/purongo/[^"]*)"',
                                      articlepiece):
                if not artlink.startswith('/mi/purongo/purongo-hou'):
                    links.add('http://www.maoritelevision.com%s' % artlink)
    for url in sorted(links):
        if 'a-motu/rereatea-midday-news' in url:
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        if 'lang="mi"' not in html:
            continue
        if 'itemprop="articleBody"' not in html:
            continue
        genre = 'Sport' if '/hakinakina/' in url else 'News'
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = doc.headers.get('Last-Modified')
        # These news stories are a parallel (or at least comparable) corpus,
        # so keep the link to the English article.
        english = re.search(
            r'<a href="(/news/[^"]*)" class="language-link" lang="en">', html)
        if english:
            english = 'http://www.maoritelevision.com%s' % english.group(1)
        tags = set()
        if '<ul class="tags">' in html:
            tagshtml = html.split('<ul class="tags">')[1].split('</ul>')[0]
            for tag in re.findall(r'<a href="[^"]*">([^<]*)</a>', tagshtml):
                tags.add(cleantext(tag))
        paras = []
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            paras.append(
                cleantext(striptags(title.group(1).split('| Māori')[0])))
        articlehtml = html.split('class="field-body"')[1].split('</div>')[0]
        paras.extend(
            [cleantext(p) for p in re.findall(r'<p>(.+?)</p>', articlehtml)])
        # Crude English filter: drop paragraphs containing ' the '.
        paras = [p for p in paras if p and p.find(' the ') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: %s\n' % genre)
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        if english:
            out.write('# Translation.en: %s\n' % english)
        if tags:
            out.write('# Tags: %s\n' % ', '.join(tags))
        out.write('\n'.join(paras) + '\n')

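# Taken together, the crawlers in this section all emit records in the same
# plain-text corpus format: a block of '# Key: value' headers followed by
# one extracted paragraph per line. Reconstructed from the out.write calls
# above (all fields except Location are optional):
#
#   # Location: <source URL>
#   # Genre: News | Sport | Blog | Government | Religion
#   # Title: <page title>
#   # Publication-Date: <ISO timestamp or HTTP Last-Modified>
#   # Translation.en: <URL of the parallel English article>
#   # Tags: <comma-separated tags>
#   First extracted paragraph
#   Second extracted paragraph
#   ...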